Censor-Fix Example Notebook
[1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from censorfix import censorfix
[2]:
# %load -s create_data test_censor.py
def create_data(c=0.5, n=3, size=100, seed=None):
    """
    Draw correlated Gaussian samples and return them as two identical frames.

    The samples come from a zero-mean multivariate normal whose covariance
    matrix has ones on the diagonal and ``c`` in every off-diagonal entry.

    Parameters
    ----------
    c : float, default 0.5
        Covariance shared by every pair of distinct columns.
    n : int, default 3
        Number of columns (dimensions) to generate.
    size : int, default 100
        Number of rows (samples) to generate.
    seed : int or None, default None
        When given, the draw uses a private ``RandomState(seed)`` so the
        result is reproducible.  When None the global ``np.random`` state is
        used, which is the behaviour this function always had.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Two independent frames holding the same values, so one can be
        modified while the other keeps the original draw.
    """
    # Adding the scalar c fills every entry; the identity term lifts the
    # diagonal from c up to 1.
    cov = c + np.identity(n) * (1 - c)
    sampler = np.random if seed is None else np.random.RandomState(seed)
    full_data = sampler.multivariate_normal(np.zeros(n), cov, size=size)
    df = pd.DataFrame(full_data)
    return df, df.copy()
[3]:
# %load -s single_dim_test test_censor.py
def single_dim_test():
    """
    Demonstrate a single imputation of one censored column.

    Column 0 of a synthetic frame is clamped to [-0.5, 1.5], the frame is
    sorted by that column, and ``impute_once`` is called with column 0, the
    remaining columns 1 and 2, and the two censoring limits.  The untouched
    copy and the processed frame are then drawn on one scatter plot
    (column 0 against column 2).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``df`` (censored, sorted, passed through ``impute_once``) and ``df2``
        (the untouched original values).
    """
    df, df2 = create_data()
    censor_high = 1.5
    censor_low = -0.5
    # Clamp column 0 so values outside the limits sit exactly on them.
    df.loc[df[0] > censor_high, 0] = censor_high
    df.loc[df[0] < censor_low, 0] = censor_low
    imp = censorfix.censorImputer(
        debug=False, no_columns=2, sample_posterior=True)
    df = df.sort_values(by=0, ascending=True)
    # The return value is discarded, so this demo relies on impute_once
    # updating df[0] in place -- NOTE(review): confirm against censorfix.
    imp.impute_once(df[0], df[[1, 2]], censor_high, censor_low)
    fig, ax = plt.subplots(1, 1)
    # df2 was never censored, so it holds the true values; df is the frame
    # that was censored and handed to the imputer.  The labels used to be
    # the other way round.
    df2.plot(kind='scatter', x=0, y=2, ax=ax, color='pink', label='true')
    df.plot(kind='scatter', x=0, y=2, ax=ax, label='imputed')
    plt.title('single imputation of censored values')
    plt.show()
    return df, df2
[4]:
# Run the one-column demo; the trailing ";" keeps the returned (df, df2)
# tuple out of the cell output.
single_dim_test();
[5]:
# %load -s multi_imp_test test_censor.py
def multi_imp_test(plot=True):
    """
    Produce several imputations of a frame censored in two columns.

    Columns 0 and 1 of a synthetic frame are clamped to fixed limits and the
    frame is passed to an imputer configured with ``number_imputations=3``.
    With ``plot=True`` each returned data set is drawn in its own colour on
    top of a shared axis, together with the uncensored original.

    Returns
    -------
    (pandas.DataFrame, object)
        The uncensored original frame and whatever ``impute`` returned
        (iterated here as a sequence of frames).
    """
    censored, original = create_data()

    # Censoring limits per column as (column, lower, upper).
    limits = ((0, -0.6, 0.8), (1, -2, 1))
    for column, lower, upper in limits:
        censored.loc[censored[column] > upper, column] = upper
        censored.loc[censored[column] < lower, column] = lower

    imputer = censorfix.censorImputer(
        debug=False, sample_posterior=True, number_imputations=3)
    # Column 2 is not censored and gets the 'NA' placeholder in both lists.
    U = [upper for _, _, upper in limits] + ['NA']  # the upper censor values
    L = [lower for _, lower, _ in limits] + ['NA']  # the lower censor values
    data_mi = imputer.impute(censored, U, L, iter_val=2)

    if plot:
        fig, ax = plt.subplots(1, 1)
        palette = ['red', 'yellow', 'green']
        for idx, imputed in enumerate(data_mi):
            imputed.plot(
                kind='scatter',
                x=0,
                y=1,
                color=palette[idx],
                label='imputation {}'.format(idx),
                ax=ax)
        original.plot(
            kind='scatter', x=0, y=1, color='blue', label='original', ax=ax)
        plt.title('Multiple imputations comparison')
        plt.legend()
        plt.show()
    return original, data_mi
[6]:
# Run the multiple-imputation demo with plotting enabled (the default); the
# trailing ";" keeps the returned tuple out of the cell output.
multi_imp_test();
100%|██████████| 1/1 [00:16<00:00, 16.10s/it]
[7]:
# %load -s multi_dim_test test_censor.py
def multi_dim_test():
    """
    Demonstrate imputation when two columns are censored at once.

    Columns 0 and 1 of a synthetic frame are clamped to fixed limits.  The
    censored frame is plotted, passed through ``impute``, and then the
    uncensored original and the imputed result are added to the same axis
    (column 0 against column 1).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``df`` (the value returned by ``impute``) and ``df2`` (the untouched
        original values).
    """
    df, df2 = create_data()
    # censor the first dataframe
    censor_high_1 = 0.8
    censor_high_2 = 0.5
    censor_low_1 = -0.3
    censor_low_2 = -0.7
    df.loc[df[0] > censor_high_1, 0] = censor_high_1
    df.loc[df[0] < censor_low_1, 0] = censor_low_1
    df.loc[df[1] > censor_high_2, 1] = censor_high_2
    df.loc[df[1] < censor_low_2, 1] = censor_low_2
    imp = censorfix.censorImputer(
        debug=False, sample_posterior=True)
    U = [censor_high_1, censor_high_2, 'NA']  # the upper censor values
    L = [censor_low_1, censor_low_2, 'NA']  # the lower censor values
    fig, ax = plt.subplots(1, 1)
    # Draw the censored points before df is replaced by the imputed frame.
    df.plot(kind='scatter', x=0, y=1, ax=ax, color='yellow', label='censored')
    df = imp.impute(df, U, L, iter_val=2)
    # df2 was never censored, so it is the actual data; df now holds the
    # imputer's output.  The labels used to be the other way round.
    df2.plot(
        kind='scatter',
        x=0,
        y=1,
        ax=ax,
        color='pink',
        label='actual')
    df.plot(kind='scatter', x=0, y=1, ax=ax, label='imputed_values')
    plt.legend()
    plt.title('Multivariate Censor Imputation')
    plt.show()
    return df, df2
[8]:
# Run the two-column demo; the trailing ";" keeps the returned (df, df2)
# tuple out of the cell output.
multi_dim_test();
100%|██████████| 2/2 [00:44<00:00, 22.52s/it]
[9]:
def censor_ex():
    """
    Plot a synthetic data set before and after right-censoring.

    ``x`` is uniform on [0, 1) and ``y`` is ``x**3 + 2 * x**0.1`` plus uniform
    noise scaled by 0.5.  The left panel shows the data as generated; ``y`` is
    then clamped in place at 2.3 and the right panel shows the result.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``x`` and the censored ``y``.
    """
    plt.rc('font', size=16)

    def draw_panel(position, heading):
        # Scatter the current x/y into one slot of the 1x2 grid.
        plt.subplot(1, 2, position)
        plt.scatter(x, y)
        plt.title(heading)
        plt.xlabel('BSEP')
        plt.ylabel('SPHER')
        plt.ylim(1, 3.5)

    x = np.random.rand(100)
    trend = x**3 + 2 * x**0.1
    y = trend + 0.5 * np.random.rand(100)

    plt.figure(figsize=(15, 5))
    draw_panel(1, 'Drug Data True Values')

    # Right-censor: everything above 2.3 is recorded as exactly 2.3.
    above_limit = y > 2.3
    y[above_limit] = 2.3
    draw_panel(2, 'Drug Data Censored')
    return x, y
[10]:
# Keep the generated arrays as notebook-level variables: censor_fix_ex()
# reads x and y from the global scope rather than taking them as arguments.
x,y=censor_ex();
[11]:
# %load -s censor_fix_ex pressy.py
def censor_fix_ex():
    """
    Plot two single imputations of the censored drug data side by side.

    Reads the notebook-level arrays ``x`` and ``y`` (as returned by
    ``censor_ex()``), so that cell must have been run first.  For each panel
    a frame with ``y`` in column 0 and ``x`` in column 1 is built, sorted by
    column 0, and passed to ``impute_once`` with an upper limit of 2.3 and
    'NA' as the lower limit.  The left panel uses ``sample_posterior=False``
    and the right panel ``sample_posterior=True``.

    Uses the module-level ``censorfix`` name (``from censorfix import
    censorfix``) like every other cell.  The previous function-local
    ``import censorfix`` rebound the name to the package inside this
    function only.
    """
    censor_limit = 2.3  # same value censor_ex() clamps y to
    plt.rc('font', size=12)
    plt.figure(figsize=(15, 5))

    # (subplot position, sample_posterior flag, panel title)
    panels = (
        (1, False, 'Drug Data Best Imputation'),
        (2, True, 'Drug Data Imputation from the Bayesian posterior '),
    )
    for position, sample_posterior, heading in panels:
        imp = censorfix.censorImputer(
            debug=False, no_columns=1, sample_posterior=sample_posterior)
        # Rebuild the frame from the globals each time so both panels start
        # from the same censored input.
        df = pd.DataFrame([y, x]).T
        df = df.sort_values(by=0, ascending=True)
        # The return value is discarded; the plot relies on df[0] being
        # updated in place -- NOTE(review): confirm against censorfix.
        imp.impute_once(df[0], df[[1]], censor_limit, 'NA')
        plt.subplot(1, 2, position)
        plt.scatter(df.iloc[:, 1], df.iloc[:, 0])
        plt.title(heading)
        plt.xlabel('BSEP')
        plt.ylabel('SPHER')
        plt.ylim(1, 3.5)
[12]:
# Draw the two imputation panels; relies on x and y set by the censor_ex()
# cell above.
censor_fix_ex()
[13]:
# Fresh synthetic data: df is censored below, df2 keeps the original values.
df, df2 = create_data()

# Only column 0 is censored in this comparison.
censor_high, censor_low = 0.5, -2
df.loc[df[0] > censor_high, 0] = censor_high
df.loc[df[0] < censor_low, 0] = censor_low

# Two snapshots of the censored frame: df3 is plotted as-is later on, df4 is
# the input for the second imputer.
df3, df4 = df.copy(), df.copy()

# Per-column limits; columns 1 and 2 are uncensored and get 'NA'.
U = [censor_high, 'NA', 'NA']
L = [censor_low, 'NA', 'NA']

# Same configuration twice, differing only in sample_posterior.
imp = censorfix.censorImputer(
    debug=False,
    no_columns='all',
    sample_posterior=False)
imp2 = censorfix.censorImputer(
    debug=False,
    no_columns='all',
    sample_posterior=True)
[14]:
# Impute the censored frame with each imputer.  df is overwritten with the
# sample_posterior=False result and df4 with the sample_posterior=True one;
# df3 still holds the censored values for comparison.
df = imp.impute(df, U, L, iter_val=2)
df4 = imp2.impute(df4, U, L, iter_val=2)
100%|██████████| 2/2 [00:18<00:00, 9.28s/it]
100%|██████████| 2/2 [00:18<00:00, 9.42s/it]
[15]:
# 2x2 comparison of column 0 against column 2 for the original, censored and
# imputed frames.  A loop replaces four copy-pasted plot calls and, because
# the cell no longer ends in a bare expression, no AxesSubplot repr is echoed.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 10), sharey=True, sharex=True)
plt.rc('font', size=16)
fig.suptitle('Comparison of Imputation Coverage')

# (frame, target axes, panel title)
panels = [
    (df3, axs[0][1], 'Censored Data Points'),
    (df2, axs[0][0], 'Original Data Points'),
    (df, axs[1][0], 'Best Imputations'),
    (df4, axs[1][1], 'Imputations Taking Into Account \n Bayesian Uncertainty'),
]
for frame, ax, title in panels:
    frame.plot(kind='scatter', ax=ax, x=2, y=0, title=title)
[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b4080566588>
[16]:
# Sort every frame by its own column 0 so rows can be compared by position.
df = df.sort_values(by=0)
df2 = df2.sort_values(by=0)
df3 = df3.sort_values(by=0)
df4 = df4.sort_values(by=0)

# Positional boolean mask of the rows at or above the upper censoring limit,
# taken from the imputed frame df.  censor_high (set in the setup cell)
# replaces a second hard-coded 0.5 so the mask follows the limit.
# NOTE(review): the mask is applied by position to all four frames, which
# only lines up if each frame has the same number of rows at or above the
# limit after sorting -- confirm that imputed values never fall below it.
# Rows censored at the lower limit are not part of this mask.
cens = (df[0] >= censor_high).values
[17]:
# 2x2 figure showing, for each frame, the rows outside the mask in the
# default colour and the rows selected by `cens` in pink.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 10), sharey=True, sharex=True)
fig.suptitle('Censor-Fix Application', fontsize=30)

# (frame, target axes, panel title).  The lower-left title is the one that
# was actually shown: the earlier 'Best Imputations' title was overwritten by
# the second plot call on the same axes.
panels = [
    (df3, axs[0][1], 'Censored Data Points'),
    (df2, axs[0][0], 'Original Data Points'),
    (df, axs[1][0], 'Single Imputations'),
    (df4, axs[1][1], 'Imputations Taking Into Account \n Bayesian Uncertainty'),
]
for frame, ax, title in panels:
    # Unmasked rows first, masked rows on top, as before.
    frame[~cens].plot(kind='scatter', ax=ax, x=2, y=0, title=title)
    frame[cens].plot(kind='scatter', ax=ax, x=2, y=0, title=title, color='pink')

# Blank the labels exactly where the original cell did: y labels on the left
# column and x labels on the bottom row.  This runs after plotting because
# each pandas plot call sets the axis labels again.
for ax in (axs[0][0], axs[1][0]):
    ax.set_ylabel('')
for ax in (axs[1][0], axs[1][1]):
    ax.set_xlabel('')
[17]:
Text(0.5, 0, '')
[ ]: