# We import the usual suspects plus PCA from sklearn, and load up daily Treasury yield data between 2011 and 2020
import pandas as pd
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
data = pd.read_csv('http://erwan.marginalq.com/index_files/tea_files/ycrates1120.csv')
data['Date'] = pd.to_datetime(data['Date'])
data.sort_values(by='Date',inplace=True)
data.set_index('Date',inplace=True)
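# A quick sanity check (not in the original notebook): confirm the sample size, the date range, and that no yields are missing
print(data.shape)
print(data.index.min(), data.index.max())
print(data.isna().sum().sum())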
# A quick visualization before normalizing
data.plot.line().legend(bbox_to_anchor=(1.05, 1.0),loc='upper left')
plt.title('Lots of variance to explain')
# Next we run PCA, keeping all 11 components; whiten=True rescales the resulting component scores to unit variance
pca=PCA(n_components=11, whiten=True)
pca.fit_transform(data)
data
| Date | 1 Mo | 3 Mo | 6 Mo | 1 Yr | 2 Yr | 3 Yr | 5 Yr | 7 Yr | 10 Yr | 20 Yr | 30 Yr |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 2011-01-03 | 0.11 | 0.15 | 0.19 | 0.29 | 0.61 | 1.03 | 2.02 | 2.74 | 3.36 | 4.18 | 4.39 |
| 2011-01-04 | 0.12 | 0.14 | 0.19 | 0.28 | 0.63 | 1.04 | 2.01 | 2.72 | 3.36 | 4.21 | 4.44 |
| 2011-01-05 | 0.13 | 0.14 | 0.19 | 0.31 | 0.71 | 1.16 | 2.14 | 2.86 | 3.50 | 4.34 | 4.55 |
| 2011-01-06 | 0.13 | 0.15 | 0.18 | 0.30 | 0.68 | 1.11 | 2.09 | 2.80 | 3.44 | 4.31 | 4.53 |
| 2011-01-07 | 0.13 | 0.14 | 0.18 | 0.29 | 0.60 | 1.02 | 1.96 | 2.69 | 3.34 | 4.25 | 4.48 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2020-12-24 | 0.09 | 0.09 | 0.09 | 0.10 | 0.13 | 0.17 | 0.37 | 0.66 | 0.94 | 1.46 | 1.66 |
| 2020-12-28 | 0.09 | 0.11 | 0.11 | 0.11 | 0.13 | 0.17 | 0.38 | 0.65 | 0.94 | 1.46 | 1.67 |
| 2020-12-29 | 0.08 | 0.10 | 0.12 | 0.11 | 0.12 | 0.17 | 0.37 | 0.66 | 0.94 | 1.47 | 1.67 |
| 2020-12-30 | 0.06 | 0.08 | 0.09 | 0.12 | 0.12 | 0.17 | 0.37 | 0.66 | 0.93 | 1.46 | 1.66 |
| 2020-12-31 | 0.08 | 0.09 | 0.09 | 0.10 | 0.13 | 0.17 | 0.36 | 0.65 | 0.93 | 1.45 | 1.65 |

2501 rows × 11 columns
# visualization borrowed from Prasad Ostwal (https://ostwalprasad.github.io/machine-learning/PCA-using-python.html)
plt.bar(range(1, len(pca.explained_variance_) + 1), pca.explained_variance_)
plt.ylabel('Explained variance')
plt.xlabel('Components')
plt.plot(range(1, len(pca.explained_variance_) + 1),
         np.cumsum(pca.explained_variance_),
         c='red',
         label="Cumulative Explained Variance")
# plt.legend(loc='upper right')
plt.title('Two factors do all the work, third adds a little')
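# A numeric complement to the bar chart (not in the original notebook): cumulative shares of variance captured by the leading components
print(np.cumsum(pca.explained_variance_ratio_).round(3))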
#Now we compute the (unnormalized) loadings of each principal component
#And see where they earned their name
# loadings=pca.components_.T*np.sqrt(pca.explained_variance_)
loadings=pca.components_[:3].T
loadings_df = pd.DataFrame(loadings, columns=['Level', 'Slope','Curvature'], index=data.columns)
loadings_df.plot.line()
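# A hedged sanity check (not in the original notebook): the first component's score should track the average yield across maturities almost one-for-one, up to PCA's sign ambiguity, which is why it is called 'Level'
scores = pd.DataFrame(pca.transform(data)[:, :3],
                      columns=['Level', 'Slope', 'Curvature'], index=data.index)
print(scores['Level'].corr(data.mean(axis=1)))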
#Let's get to the same place via old-school SVD
from scipy.linalg import svd
#First we normalize the data for consistency with what we did with PCA
data_normalized=(data-data.mean())/(data.std())
U, singular, V_transpose = svd(data_normalized)
x=['EV'+ str(i+1) for i in range(np.size(singular))]
plt.bar(x,singular)
plt.title('First two eigenfactors still do most of the work')
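# A small numeric check (not in the original notebook): variance shares go with the squared singular values, so we cumulate those
print(np.cumsum(singular**2 / np.sum(singular**2)).round(3))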
# Now we plot the first three factors to see what they look like
plt.plot(data.columns,V_transpose[0],label='Level')
plt.plot(data.columns,-V_transpose[1],label='Slope') #Note that I'm forcefully undoing the sign ambiguity here so things look most comparable
plt.plot(data.columns,V_transpose[2],label='Curvature')
plt.legend()
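# A hedged cross-check (not in the original notebook): fitting sklearn's PCA on the same standardized data should reproduce the SVD's right singular vectors up to sign, assuming well-separated singular values
pca_std = PCA(n_components=3)
pca_std.fit(data_normalized)
for i in range(3):
    # allow for an overall sign flip on each component
    match = np.allclose(pca_std.components_[i], V_transpose[i]) or \
            np.allclose(pca_std.components_[i], -V_transpose[i])
    print(f'Component {i+1} matches the SVD vector up to sign: {match}')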