# We import the usual suspects plus PCA from sklearn, and load up daily Treasury yield data between 2011 and 2020
import pandas as pd
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
data = pd.read_csv('http://erwan.marginalq.com/index_files/tea_files/ycrates1120.csv')
data['Date'] = pd.to_datetime(data['Date'])
data.sort_values(by='Date',inplace=True)
data.set_index('Date',inplace=True)
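# A quick sanity check (not in the original notebook): confirm the sample size, the date range, and that no yields are missing
print(data.shape)
print(data.index.min(), data.index.max())
print(data.isna().sum().sum())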
# A quick visualization before normalizing
data.plot.line().legend(bbox_to_anchor=(1.05, 1.0),loc='upper left')
plt.title('Lots of variance to explain')
# Next we run PCA, keeping all 11 components; whiten=True rescales the resulting component scores to unit variance
pca=PCA(n_components=11, whiten=True)
pca.fit_transform(data)
data
| Date | 1 Mo | 3 Mo | 6 Mo | 1 Yr | 2 Yr | 3 Yr | 5 Yr | 7 Yr | 10 Yr | 20 Yr | 30 Yr |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 2011-01-03 | 0.11 | 0.15 | 0.19 | 0.29 | 0.61 | 1.03 | 2.02 | 2.74 | 3.36 | 4.18 | 4.39 |
| 2011-01-04 | 0.12 | 0.14 | 0.19 | 0.28 | 0.63 | 1.04 | 2.01 | 2.72 | 3.36 | 4.21 | 4.44 |
| 2011-01-05 | 0.13 | 0.14 | 0.19 | 0.31 | 0.71 | 1.16 | 2.14 | 2.86 | 3.50 | 4.34 | 4.55 |
| 2011-01-06 | 0.13 | 0.15 | 0.18 | 0.30 | 0.68 | 1.11 | 2.09 | 2.80 | 3.44 | 4.31 | 4.53 |
| 2011-01-07 | 0.13 | 0.14 | 0.18 | 0.29 | 0.60 | 1.02 | 1.96 | 2.69 | 3.34 | 4.25 | 4.48 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2020-12-24 | 0.09 | 0.09 | 0.09 | 0.10 | 0.13 | 0.17 | 0.37 | 0.66 | 0.94 | 1.46 | 1.66 |
| 2020-12-28 | 0.09 | 0.11 | 0.11 | 0.11 | 0.13 | 0.17 | 0.38 | 0.65 | 0.94 | 1.46 | 1.67 |
| 2020-12-29 | 0.08 | 0.10 | 0.12 | 0.11 | 0.12 | 0.17 | 0.37 | 0.66 | 0.94 | 1.47 | 1.67 |
| 2020-12-30 | 0.06 | 0.08 | 0.09 | 0.12 | 0.12 | 0.17 | 0.37 | 0.66 | 0.93 | 1.46 | 1.66 |
| 2020-12-31 | 0.08 | 0.09 | 0.09 | 0.10 | 0.13 | 0.17 | 0.36 | 0.65 | 0.93 | 1.45 | 1.65 |

2501 rows × 11 columns
# visualization borrowed from Prasad Ostwal (https://ostwalprasad.github.io/machine-learning/PCA-using-python.html)
plt.bar(range(1, len(pca.explained_variance_) + 1), pca.explained_variance_)
plt.ylabel('Explained variance')
plt.xlabel('Components')
plt.plot(range(1, len(pca.explained_variance_) + 1),
         np.cumsum(pca.explained_variance_),
         c='red',
         label="Cumulative Explained Variance")
# plt.legend(loc='upper right')
plt.title('Two factors do all the work, third adds a little')
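# A numeric complement to the bar chart (not in the original notebook): cumulative shares of variance captured by the leading components
print(np.cumsum(pca.explained_variance_ratio_).round(3))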
#Now we compute the (unnormalized) loadings of each principal component
#And see where they earned their name
# loadings=pca.components_.T*np.sqrt(pca.explained_variance_)
loadings=pca.components_[:3].T
loadings_df = pd.DataFrame(loadings, columns=['Level', 'Slope','Curvature'], index=data.columns)
loadings_df.plot.line()
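# A hedged sanity check (not in the original notebook): the first component's score should track the average yield across maturities almost one-for-one, up to PCA's sign ambiguity, which is why it is called 'Level'
scores = pd.DataFrame(pca.transform(data)[:, :3],
                      columns=['Level', 'Slope', 'Curvature'], index=data.index)
print(scores['Level'].corr(data.mean(axis=1)))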
#Let's get to the same place via old-school SVD
from scipy.linalg import svd
#First we normalize the data for consistency with what we did with PCA
data_normalized=(data-data.mean())/(data.std())
U, singular, V_transpose = svd(data_normalized)
x=['EV'+ str(i+1) for i in range(np.size(singular))]
plt.bar(x,singular)
plt.title('First two eigenfactors still do most of the work')
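# A small numeric check (not in the original notebook): variance shares go with the squared singular values, so we cumulate those
print(np.cumsum(singular**2 / np.sum(singular**2)).round(3))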
# Now we plot the first three factors to see what they look like
plt.plot(data.columns,V_transpose[0],label='Level')
plt.plot(data.columns,-V_transpose[1],label='Slope') #Note that I'm forcefully undoing the sign ambiguity here so things look most comparable
plt.plot(data.columns,V_transpose[2],label='Curvature')
plt.legend()
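# A hedged cross-check (not in the original notebook): fitting sklearn's PCA on the same standardized data should reproduce the SVD's right singular vectors up to sign, assuming well-separated singular values
pca_std = PCA(n_components=3)
pca_std.fit(data_normalized)
for i in range(3):
    # allow for an overall sign flip on each component
    match = np.allclose(pca_std.components_[i], V_transpose[i]) or \
            np.allclose(pca_std.components_[i], -V_transpose[i])
    print(f'Component {i+1} matches the SVD vector up to sign: {match}')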