Censor-Fix Example Notebook¶

[1]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from censorfix import censorfix

[2]:

# %load -s create_data test_censor.py
def create_data():
    """
    returns two dataframes a copy of one another
    """
    c = 0.5
    n = 3
    cov = c + np.identity(n) * (1 - c)
    size = 100
    full_data = np.random.multivariate_normal(
        [0 for i in range(n)], cov, size=size)
    df = pd.DataFrame(full_data)
    df2 = df.copy()
    return df, df2

[3]:

# %load -s single_dim_test test_censor.py
def single_dim_test():
    """
    Test censorfix in one d example
    """
    df, df2 = create_data()
    censor_high = 1.5
    censor_low =- 0.5
    df.loc[df[0] > censor_high, 0] = censor_high
    df.loc[df[0] < censor_low, 0] = censor_low
    imp = censorfix.censorImputer(
        debug=False, no_columns=2, sample_posterior=True)
    df = df.sort_values(by=0, ascending=True)
    imp.impute_once(df[0], df[[1, 2]], censor_high, censor_low)
    fig, ax = plt.subplots(1, 1)
    df2.plot(kind='scatter', x=0, y=2, ax=ax, color='pink',label='imputed')
    df.plot(kind='scatter', x=0, y=2, ax=ax,label='true')
    plt.title('single imputation of censored values')
    plt.show()
    return df,df2

[4]:

single_dim_test();

[5]:

# %load -s multi_imp_test test_censor.py
def multi_imp_test(plot=True):
    """
    Tests the creation of multiple imputations
    plots results or returns dataframe and the imputed data
    """
    df, df2 = create_data()
    # censor the first dataframe
    censor_high_1=0.8
    censor_high_2=1
    censor_low_1=-0.6
    censor_low_2=-2
    df.loc[df[0] > censor_high_1, 0] = censor_high_1
    df.loc[df[0] < censor_low_1, 0] = censor_low_1
    df.loc[df[1] > censor_high_2, 1] = censor_high_2
    df.loc[df[1] < censor_low_2, 1] = censor_low_2

    imp = censorfix.censorImputer(
        debug=False, sample_posterior=True,number_imputations=3)
    U = [censor_high_1, censor_high_2, 'NA']  # the upper censor values
    L = [censor_low_1, censor_low_2, 'NA'] # the lower censor values

    data_mi = imp.impute(df, U, L, iter_val=2)

    if plot:
        fig, ax = plt.subplots(1, 1)
        colours=['red','yellow','green']
        for i,data in enumerate(data_mi):
            data.plot(kind='scatter',x=0,y=1,color=colours[i],label='imputation {}'.format(i),ax=ax)
        df2.plot(kind='scatter',x=0,y=1,color='blue',label='original',ax=ax)
        plt.title('Multiple imputations comparison')
        plt.legend()
        plt.show()
    return df2, data_mi

[6]:

multi_imp_test();

100%|██████████| 1/1 [00:16<00:00, 16.10s/it]

[7]:

# %load -s multi_dim_test test_censor.py
def multi_dim_test():
    """
    Test censorfix for multiple imputation of multiple dimensions
    """
    df, df2 = create_data()

    # censor the first dataframe
    censor_high_1=0.8
    censor_high_2=0.5
    censor_low_1=-0.3
    censor_low_2=-0.7
    df.loc[df[0] > censor_high_1, 0] = censor_high_1
    df.loc[df[0] < censor_low_1, 0] = censor_low_1
    df.loc[df[1] > censor_high_2, 1] = censor_high_2
    df.loc[df[1] < censor_low_2, 1] = censor_low_2

    imp = censorfix.censorImputer(
        debug=False, sample_posterior=True)
    U = [censor_high_1, censor_high_2, 'NA']  # the upper censor values
    L = [censor_low_1, censor_low_2, 'NA'] # the lower censor values

    fig, ax = plt.subplots(1, 1)
    df.plot(kind='scatter', x=0, y=1, ax=ax, color='yellow', label='censored')
    df = imp.impute(df, U, L, iter_val=2)
    df2.plot(
        kind='scatter',
        x=0,
        y=1,
        ax=ax,
        color='pink',
        label='imputed_values')
    df.plot(kind='scatter', x=0, y=1, ax=ax, label='actual')
    plt.legend()
    plt.title('Multivariate Censor Imputation')
    plt.show()
    return df,df2

[8]:

multi_dim_test();

100%|██████████| 2/2 [00:44<00:00, 22.52s/it]

[9]:

def censor_ex():
    plt.rc('font', size=16)
    x = np.random.rand(100)
    y = x**3 + 2 * x**0.1 + 0.5 * np.random.rand(100)
    fig = plt.figure(figsize=(15, 5))
    plt.subplot(1, 2, 1)
    plt.scatter(x, y)
    plt.title('Drug Data True Values')
    plt.xlabel('BSEP')
    plt.ylabel('SPHER')
    plt.ylim(1, 3.5)

    y[y > 2.3] = 2.3
    plt.subplot(1, 2, 2)
    plt.scatter(x, y)
    plt.title('Drug Data Censored')
    plt.xlabel('BSEP')
    plt.ylabel('SPHER')
    plt.ylim(1, 3.5)
    return x,y

[10]:

x,y=censor_ex();

[11]:

# %load -s censor_fix_ex  pressy.py
def censor_fix_ex():
    import censorfix
    plt.rc('font', size=12)
    fig = plt.figure(figsize=(15, 5))
    imp = censorfix.censorImputer(
        debug=False, no_columns=1, sample_posterior=False)
    df = pd.DataFrame([y, x]).T
    df = df.sort_values(by=0, ascending=True)
    imp.impute_once(df[0], df[[1]], 2.3, 'NA')

    plt.subplot(1, 2, 1)
    plt.scatter(df.iloc[:, 1], df.iloc[:, 0])
    plt.title('Drug Data Best Imputation')
    plt.xlabel('BSEP')
    plt.ylabel('SPHER')
    plt.ylim(1, 3.5)

    imp = censorfix.censorImputer(
        debug=False,
        no_columns=1,
        sample_posterior=True)
    df = pd.DataFrame([y, x]).T
    df = df.sort_values(by=0, ascending=True)
    imp.impute_once(df[0], df[[1]], 2.3, 'NA')
    plt.subplot(1, 2, 2)

    plt.scatter(df.iloc[:, 1], df.iloc[:, 0])
    plt.title('Drug Data Imputation from the Bayesian posterior ')
    plt.xlabel('BSEP')
    plt.ylabel('SPHER')
    plt.ylim(1, 3.5)

[12]:

censor_fix_ex()

[13]:

df, df2 = create_data()
censor_high = 0.5
censor_low =- 2
df.loc[df[0] > censor_high, 0] = censor_high
df.loc[df[0] < censor_low, 0] = censor_low

df3=df.copy()
df4=df.copy()
imp = censorfix.censorImputer(
    debug=False, no_columns='all', sample_posterior=False)
imp2 = censorfix.censorImputer(
    debug=False, no_columns='all', sample_posterior=True)
U=[censor_high,'NA','NA']
L=[censor_low,'NA','NA']

[14]:

df = imp.impute(df, U, L, iter_val=2);
df4 = imp2.impute(df4, U, L, iter_val=2);

100%|██████████| 2/2 [00:18<00:00,  9.28s/it]
100%|██████████| 2/2 [00:18<00:00,  9.42s/it]

[15]:

fig,axs=plt.subplots(nrows=2, ncols=2,figsize=(10,10),sharey=True,sharex=True)
plt.rc('font', size=16)
fig.suptitle('Comparison of Imputation Coverage')
df3.plot(kind='scatter', ax=axs[0][1], x=2, y=0,title='Censored Data Points')
df2.plot(kind='scatter', ax=axs[0][0], x=2, y=0,title='Original Data Points')
df.plot(kind='scatter', ax=axs[1][0], x=2, y=0, title='Best Imputations')
df4.plot(kind='scatter', ax=axs[1][1], x=2, y=0, title='Imputations Taking Into Account \n Bayesian Uncertainty')

[15]:

<matplotlib.axes._subplots.AxesSubplot at 0x2b4080566588>

[16]:

df=df.sort_values(by=0)
df2=df2.sort_values(by=0)
df3=df3.sort_values(by=0)
df4=df4.sort_values(by=0)
cens=df[0]>=0.5
cens=cens.values

[17]:

fig,axs=plt.subplots(nrows=2, ncols=2,figsize=(10,10),sharey=True,sharex=True)

fig.suptitle('Censor-Fix Application',fontsize=30)
#plt.rc('font', size=16)
df3[~cens].plot(kind='scatter', ax=axs[0][1], x=2, y=0,title='Censored Data Points',)
df2[~cens].plot(kind='scatter', ax=axs[0][0], x=2, y=0,title='Original Data Points')
df[~cens].plot(kind='scatter', ax=axs[1][0], x=2, y=0, title='Best Imputations')
df4[~cens].plot(kind='scatter', ax=axs[1][1], x=2, y=0, title='Imputations Taking Into Account \n Bayesian Uncertainty')
plt.xlabel("")

df3[cens].plot(kind='scatter', ax=axs[0][1], x=2, y=0, title='Censored Data Points',color='pink')
df2[cens].plot(kind='scatter', ax=axs[0][0], x=2, y=0, title='Original Data Points',color='pink')
df[cens].plot(kind='scatter', ax=axs[1][0], x=2, y=0, title='Single Imputations',color='pink')
df4[cens].plot(kind='scatter', ax=axs[1][1], x=2, y=0, title='Imputations Taking Into Account \n Bayesian Uncertainty',color='pink')

axs[0][0].set_ylabel('')
axs[1][0].set_ylabel('')
axs[1][0].set_xlabel('')
axs[1][1].set_xlabel('')

[17]:

Text(0.5, 0, '')

[ ]: