Here we investigate whether 5-year generic KO bonds and 5-year generic PEP bonds make a good candidate for pairs trading. Intuition suggests that (some) shocks to the spread between the two yields should be temporary, and hence potentially exploitable. The first step quantitative shops typically take in evaluating that potential is to ask whether the yields are co-integrated, i.e. whether some linear combination of them is stationary.
We will investigate this in two ways. First, we will perform a standard co-integration test along the lines of Engle and Granger (1987). Second, we will ask more directly whether spreads $\{S_t\}$ tend to mean-revert by estimating:
$$S_{t}=\gamma S^* + (1-\gamma) S_{t-1} + \epsilon_t$$

where $S^*$ is the long-run level of the spread, $\gamma$ is the speed of convergence towards it, and $\epsilon_t$ is noise. Rejecting $H_0: \gamma=0$ (a unit root) via an augmented Dickey-Fuller test would be evidence that spreads do in fact mean-revert.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import gaussian_kde
df = pd.read_csv('https://tinyurl.com/2p8hr3f5') # daily KO/PEP yields and spread, most recent observation first
df.set_index('Date',inplace=True)
fig, axs = plt.subplots(1,3,figsize=(12, 8)) # let's emulate Bloomberg's HS visual
axs[0].plot(df['KO'],label='KO')
axs[0].plot(df['PEP'],label='PEP')
axs[0].set_title("Paired yields")
axs[0].invert_xaxis()
axs[0].legend()
axs[0].set_xticks([])
axs[0].set_xlabel('Time in days, ttm')
axs[1].plot(df['KOminusPEP'])
axs[1].set_title("Spread: KO minus PEP")
axs[1].axhline(y = 0,color='black')
axs[1].fill_between(df.index,df['KOminusPEP'], where=df['KOminusPEP']>0, facecolor='g')
axs[1].fill_between(df.index,df['KOminusPEP'], where=df['KOminusPEP']<0, facecolor='r')
axs[1].invert_xaxis()
axs[1].set_xticks([])
axs[1].set_xlabel('Time in days, ttm')
axs[2].hist(df['KOminusPEP'],bins=20,density=True,edgecolor='b')
density = gaussian_kde(df['KOminusPEP'])
ind=np.linspace(-15,15,101)
kdepdf=density.evaluate(ind)
axs[2].plot(ind, kdepdf, label='kernel density', color="r")
axs[2].legend()
axs[2].set_title("Spread distribution")
# summary stats like in Bloomberg
# Last doesn't quite match the screen shot because I downloaded the data a few hours after taking the screen shot
print('Spread Summary')
print('')
print('Last: %.3f' %df['KOminusPEP'].iloc[0])
print('Mean: %.3f' %df['KOminusPEP'].mean())
print('Standard deviation: %.3f' %df['KOminusPEP'].std())
print('SE of mean: %.3f' %df['KOminusPEP'].sem())
print('Median: %.3f' %df['KOminusPEP'].median())
print('High: %.3f' %df['KOminusPEP'].max())
print('Low: %.3f' %df['KOminusPEP'].min())
Spread Summary

Last: -1.042
Mean: -2.761
Standard deviation: 5.215
SE of mean: 0.329
Median: -2.773
High: 11.427
Low: -14.366
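For a quick sense of how stretched the spread currently is, one can standardize the last observation against the sample moments printed above:

```python
# z-score of the latest spread against its sample distribution,
# using the summary numbers printed above.
last, mean, std = -1.042, -2.761, 5.215
z = (last - mean) / std
print('Spread z-score: %.2f' % z)  # about 0.33
```

At roughly a third of a standard deviation above its sample mean, the current spread is not obviously stretched in either direction.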
# Let us now run our AR(1) regression
# (the data run most recent first, so shift(-1) fetches the previous day's spread)
df['LagS']=df['KOminusPEP'].shift(-1)
df.dropna(subset=['LagS'], inplace=True)
x=sm.add_constant(df['LagS']) # we run a standard OLS with a constant
mod=sm.OLS(df['KOminusPEP'],x)
res=mod.fit()
res.summary()
| Dep. Variable: | KOminusPEP | R-squared: | 0.825 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.824 |
| Method: | Least Squares | F-statistic: | 1170. |
| Date: | Thu, 07 Apr 2022 | Prob (F-statistic): | 4.41e-96 |
| Time: | 16:23:00 | Log-Likelihood: | -551.86 |
| No. Observations: | 251 | AIC: | 1108. |
| Df Residuals: | 249 | BIC: | 1115. |
| Df Model: | 1 | | |
| Covariance Type: | nonrobust | | |

| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| const | -0.2324 | 0.156 | -1.485 | 0.139 | -0.541 | 0.076 |
| LagS | 0.9067 | 0.027 | 34.207 | 0.000 | 0.855 | 0.959 |

| Omnibus: | 5.779 | Durbin-Watson: | 2.631 |
|---|---|---|---|
| Prob(Omnibus): | 0.056 | Jarque-Bera (JB): | 6.746 |
| Skew: | -0.202 | Prob(JB): | 0.0343 |
| Kurtosis: | 3.694 | Cond. No. | 6.73 |
# Now we use the regression output above to estimate S* and the speed of convergence
gamma=1-res.params['LagS']
Sstar=res.params['const']/gamma
print('gamma estimates to %.3f or so' %gamma)
print('S* estimates to %.3f or so' %Sstar)
gamma estimates to 0.093 or so
S* estimates to -2.491 or so
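A convenient way to read $\gamma$ is through the implied half-life of a shock: under the AR(1) model a deviation from $S^*$ decays by a factor $(1-\gamma)$ per day, so it halves after $\ln 2 / -\ln(1-\gamma)$ days. A back-of-the-envelope calculation with the point estimate above:

```python
import numpy as np

# Half-life of a shock under S_t = gamma*S* + (1-gamma)*S_{t-1} + eps_t:
# a deviation decays by a factor (1-gamma) per day, so it halves after
# ln(2)/-ln(1-gamma) days. Using the point estimate gamma ~= 0.093:
gamma = 0.093
half_life = np.log(2) / -np.log(1 - gamma)
print('Implied half-life: %.1f days' % half_life)  # roughly a week
```

Of course, this calculation takes the point estimate at face value; the unit-root test below asks whether $\gamma > 0$ is statistically defensible at all.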
# Now we test H0: gamma=0
from statsmodels.tsa.stattools import adfuller
adf_test = adfuller(df['KOminusPEP'])
# print(adf_test[0])
print('The p-value of H0: gamma=0 -- is %.5f' %adf_test[1])
The p-value of H0: gamma=0 -- is 0.21391
# maybe a more general co-integration test (which estimates the hedge ratio rather than imposing 1:1) will rescue this pair trade
import statsmodels.tsa.stattools as ts
x=df['KO']
y=df['PEP']
result=ts.coint(x, y)
print('The p-value of H0 -- no cointegration -- is %.5f' %result[1])
The p-value of H0 -- no cointegration -- is 0.10151
df['KOminusPEP'].describe()
count    251.000000
mean      -2.742278
std        5.216673
min      -14.365900
25%       -6.331850
50%       -2.762900
75%        1.369750
max       11.427400
Name: KOminusPEP, dtype: float64
All told, then, we end up with weak evidence in favor of co-integration at best: the ADF test on the raw spread fails to reject a unit root (p ≈ 0.21), and the Engle-Granger test only gets to p ≈ 0.10.