# Import packages
import numpy as np
import pandas as pd
import time as time
import seaborn as sns
import missingno as msno
import os
import glob
from time import time
from pathlib import Path
import inspect as insp
import matplotlib.pyplot as plt


# Set filepath
file_path = Path('PMAP_Meds.ipynb').resolve()
data_path = file_path.parent.parent.parent.parent.parent.joinpath('Data/jbergma8/IRB_271579_Faraday/IRB-271579-v3-DEID-220607-no-ptsd')


# Import data
med_admin          = pd.read_csv(data_path / 'med_admin.csv',
                                 usecols = ['osler_sid','pat_enc_csn_sid','medication_name','taken_time','mar_action', 'route', 'sig','dose_unit'],
                                 parse_dates = ['taken_time'])
#med_admin_updated  = pd.read_csv(data_path / 'med_admin_updated.csv',
#                                 usecols = ['osler_sid','pat_enc_csn_sid','medication_name','taken_time','mar_action', 'route', 'sig','dose_unit'],
#                                 parse_dates = ['taken_time'])
#med_orders         = pd.read_csv(data_path / 'med_orders.csv')
weights            = pd.read_csv('weights.csv')
CHF_hosp_icu_stays = pd.read_csv('CHF_hosp_icu_stays.csv',parse_dates=['hosp_admsn_time','hosp_disch_time'])


# Remove admins where it wasn't a real admin
keep_actions = ['Given','Rate Verify','New Bag','Handoff','Rate/Dose Verify','Rate Change','Stopped']
med_admin = med_admin[med_admin.mar_action.isin(keep_actions)]

# Extract medication data for our cohort
CHF_hosp_med_admin = pd.merge(left = CHF_hosp_icu_stays[['osler_sid','pat_enc_csn_sid','hosp_admsn_time','hosp_disch_time']],
                              right = med_admin,
                              how = 'left',
                              on = ['osler_sid','pat_enc_csn_sid'])

# Taken medication time needs to be within hospital admission period
CHF_hosp_med_admin = CHF_hosp_med_admin[CHF_hosp_med_admin.taken_time.between(CHF_hosp_med_admin.hosp_admsn_time, CHF_hosp_med_admin.hosp_disch_time)]
CHF_hosp_med_admin['medication_name'] = CHF_hosp_med_admin['medication_name'].str.lower()
CHF_hosp_med_admin = CHF_hosp_med_admin.sort_values(['osler_sid','hosp_admsn_time','taken_time'])

# Get list of filenames in specified folder
all_drug_classes = sorted(glob.glob("./medication_class_name/*"))
all_drug_doses = sorted(glob.glob("./medication_names_dose/*"))


len(all_drug_doses)

42


msno.matrix(CHF_hosp_med_admin, figsize = (5,5))
plt.show()


med_dict = {}
for i in range(len(all_drug_doses)):
    drug_name = all_drug_doses[i].split('/')[-1]
    drug_name = drug_name.split('.')[0]
    drug = pd.read_csv('./medication_names_dose/' + drug_name + '.csv', squeeze = True)
    druglist = drug.str.lower().str.strip().to_list()
    # print(drug_name)
    admins = CHF_hosp_med_admin[CHF_hosp_med_admin['medication_name'].str.contains('|'.join(druglist))]
    med_dict[drug_name] = [len(admins[admins.route.isnull()]), len(admins)]


df = pd.DataFrame.from_dict(med_dict, orient = 'index', columns = ['missing count', 'total count'])
df['prop (%)'] = df['missing count'] / df['total count']*100
df.sort_values('prop (%)', ascending = False)


top_3_drugs = ['mannitol','etomidate','phenylephrine']
for i in range(len(top_3_drugs)):
    med_dict = {}
    drug_name = top_3_drugs[i]
    print(drug_name + '-----------------')
    drug = pd.read_csv('./medication_names_dose/' + drug_name + '.csv', squeeze = True)
    druglist = drug.str.lower().str.strip().to_list()
    admins = CHF_hosp_med_admin[CHF_hosp_med_admin['medication_name'].str.contains('|'.join(druglist))]
    for x in admins.route.unique():
        med_dict[x] = list(admins[admins.route == x]['medication_name'].unique())
    print(med_dict)
    print('med names w missing route values:')
    print(list(admins[admins.route.isnull()]['medication_name'].unique()))

mannitol-----------------
{'Intravenous': ['mannitol 25 % intravenous solution', 'zoledronic acid 4 mg/100 ml-mannitol-0.9 % nacl intravenous piggyback', 'mannitol 20 % intravenous solution'], 'Extracorporeal': ['mannitol 25 % intravenous solution'], nan: []}
med names w missing route values:
['mannitol 25 % intravenous solution']
etomidate-----------------
{'Intravenous': ['etomidate 2 mg/ml intravenous solution'], nan: [], 'Intraosseous': ['etomidate 2 mg/ml intravenous solution']}
med names w missing route values:
['etomidate 2 mg/ml intravenous solution']
phenylephrine-----------------
{'Intravenous': ['phenylephrine 1 mg/10 ml (100 mcg/ml) in 0.9 % sod.chloride iv syringe', 'phenylephrine 1 mg/10 ml (100 mcg/ml) swfi syringe', 'phenylephrine 10 mg/ml injection solution', 'phenylephrine 40 mcg/ml in ns (jhh ped)', 'phenylephrine infusion 20 mg/250 ml ns (jhh)'], nan: [], 'Continuous IV Infusion': ['phenylephrine infusion 20 mg/250 ml ns (jhh)', 'phenylephrine infusion 40 mg/250 ml ns (jhbmc-jhh)', 'phenylephrine infusion 200 mg/250 ml ns (bmc-jhh)'], 'Extracorporeal': ['phenylephrine 10 mg/ml injection solution'], 'Intravenous (Continuous Infusion)': ['phenylephrine infusion 20 mg/250 ml ns (jhh-bmc)', 'phenylephrine infusion 40 mg/250 ml ns (jhbmc-jhh)', 'phenylephrine infusion 20 mg/250 ml ns (jhh adult ed pyxis)'], 'Both Eyes': ['phenylephrine 2.5 % eye drops (per drop)', 'phenylephrine 2.5 % eye drops'], 'Injection': ['phenylephrine 1 mg/10 ml (100 mcg/ml) in 0.9 % sod.chloride iv syringe', 'phenylephrine 10 mg/ml injection solution'], 'intravenous push': ['phenylephrine 1 mg/10 ml (100 mcg/ml) in 0.9 % sod.chloride iv syringe'], 'Right Eye': ['phenylephrine 2.5 % eye drops'], 'Left Eye': ['phenylephrine 2.5 % eye drops', 'phenylephrine 2.5 % eye drops (per drop)'], 'Intracavernosal': ['phenylephrine 1 mg/10 ml (100 mcg/ml) syringe intracavernosal']}
med names w missing route values:
['phenylephrine 10 mg/ml injection solution', 'phenylephrine 1 mg/10 ml (100 mcg/ml) in 0.9 % sod.chloride iv syringe', 'phenylephrine 1 mg/10 ml (100 mcg/ml) swfi syringe', 'phenylephrine infusion 20 mg/250 ml ns (jhh)']


def convert_units(convs, val, unit, weight, minutes):
    '''
    This function takes in unit conversion dictionary, value amount, dose unit, closest weight in kg, and time length of admin
    '''
    conv_factor = convs.get('conv_factor').get(unit)
    multiply_time = convs.get('multiply_min').get(unit)

    # If desired unit divides by weight.
    divide_kg = convs.get('divide_kg')
    if divide_kg == None:
        divide_kg = 0
    else :
        divide_kg = divide_kg.get(unit)

    # If desired unit is not divided by weight. 
    multiply_kg = convs.get('multiply_kg')
    if multiply_kg == None:
        multiply_kg = 0
    else :
        multiply_kg = multiply_kg.get(unit) 

    # Apply conversion factors and adjustments.
    try:
        val = val * conv_factor
    except:
        print("Unit: ", unit)
        print(type(val))
        print(val)
        print(type(conv_factor))
        print(conv_factor)

    if divide_kg == 1: # Needs to divide by patient weight
        val = val / weight
    if multiply_kg == 1:
        val = val * weight # Needs to multiply by patient weight
    if multiply_time == 1:
        val = val * multiply_time
    return val


def get_min_to_next(curr_pat_id, next_pat_id, curr_enc_id, next_enc_id, taken_time, next_taken_time, hosp_disch_time):
    '''
    This function finds the time to the next admin, or the end of the ICU stay if it's the last admin of the encounter. 
    '''
    # Check if next admin is same encounter or not. 
    if (curr_pat_id == next_pat_id) & (curr_enc_id == next_enc_id):
        return (next_taken_time - taken_time).total_seconds()/60
    else:
        return(hosp_disch_time - taken_time).total_seconds()/60


def get_feat(df_hosp_orig, df_meds):
    '''
    This function adds columns to the original dataframe with binary flags for each medication class for each hospital stay.
    '''
    df_hosp = df_hosp_orig.copy()
    for i in range(len(all_drug_classes)):
        drug = pd.read_csv(all_drug_classes[i])
        drug_name = drug.columns.values[0]
        drug_list = list(drug[drug_name].str.lower())

        # Only keep rows where sugars are not being used as a solvent
        if drug_name == 'Sugars':
            df_meds = df_meds[~df_meds['medication_name'].str.contains('piggyback|in dextrose|in 5% dextrose')]

        # Subset dataframe for specific medications in the current medication class
        temp = df_meds[df_meds['medication_name'].str.contains('|'.join(drug_list), na = False)]
        df_hosp[drug_name] = df_hosp['pat_enc_csn_sid'].isin(temp.pat_enc_csn_sid).astype(int)
    return df_hosp


def get_feat_wRoute(df_hosp_orig, df_meds):
    '''
    This function adds columns to the original dataframe with binary flags for each medication class and categorized to different routes for each hospital stay.
    '''
    df_hosp = df_hosp_orig.copy()
    for i in range(len(all_drug_classes)):
        drug = pd.read_csv(all_drug_classes[i])
        drug_name = drug.columns.values[0]
        drug_list = list(drug[drug_name].str.lower())

        # Only keep rows where sugars are not being used as a solvent
        if drug_name == 'Sugars':
            df_meds = df_meds[~df_meds['medication_name'].str.contains('piggyback|in dextrose|in 5% dextrose')]

        # Subset dataframe for specific medications in the current medication class
        temp = df_meds[df_meds['medication_name'].str.contains('|'.join(drug_list), na = False)]

        for routes in temp.route.unique():
            feat = temp[temp.route == routes]['pat_enc_csn_sid'].unique()
            df_hosp[drug_name + '_{}'.format(routes)] = df_hosp['pat_enc_csn_sid'].isin(feat).astype(int)
    return df_hosp


def get_drug_dosage_feat(drug_name, df_cohort):
    
    print(drug_name)
    inp = df_cohort.copy()
    result = CHF_hosp_icu_stays[['osler_sid','pat_enc_csn_sid']].copy()

    #admins = CHF_hosp_med_admin.copy()
    
    drug = pd.read_csv('./medication_names_dose/' + drug_name + '.csv', squeeze = True)
    druglist = drug.str.lower().str.strip().to_list()
    
    # Get hosp length of stay (LOS) in minutes.
    inp['LOS'] = (inp['hosp_disch_time'] - inp['hosp_admsn_time']).dt.total_seconds()/60 # in minutes

    admins = CHF_hosp_med_admin[CHF_hosp_med_admin['medication_name'].str.contains('|'.join(druglist))]

    if admins.shape[0] == 0:
        print(drug_name + ' has no admins')
        return 'no admins'

    admins = admins.dropna(subset=['sig','dose_unit']).copy()
    # Drop admins that have "mL" as dose unit. 
    admins = admins[~(admins['dose_unit']=='mL')].copy()

    # Get minutes between admins.
    admins = admins.merge(inp[['osler_sid','pat_enc_csn_sid','LOS']], on=['osler_sid','pat_enc_csn_sid'], how='left')

    if(drug_name in(['ketamine'])):
        admins['dose_unit'] = admins['dose_unit'].str.replace(' PCA', '')

    # Find correct dose units. 
    med_unit = pd.read_csv('./medication_dose_convs/med_units.csv')
    med_unit.set_index('drugname', inplace=True)

    #Get dose units string.
    target_unit = med_unit.loc[drug_name,'unit']
    target_unit_str = ''.join(target_unit.split('/')[0:-1]) #mcg

    unit_convs = pd.read_csv('./medication_dose_convs/all_med_doses_units_'+ target_unit_str + '.csv')
    unit_convs.set_index('index', inplace=True)
    unit_convs = unit_convs.to_dict()

    # Do the above for medications but not specified by given routes
    admins['next_taken_time'] = admins['taken_time'].shift(periods = -1)
    admins['next_osler_sid'] = admins['osler_sid'].shift(periods = -1)
    admins['next_pat_enc_csn_sid'] = admins['pat_enc_csn_sid'].shift(periods = -1)

    admins['min_to_next'] = admins.apply(lambda row: get_min_to_next(row['osler_sid'], row['next_osler_sid'], row['pat_enc_csn_sid'], row['next_pat_enc_csn_sid'],
                                                                     row['taken_time'], row['next_taken_time'], row['hosp_disch_time']), axis = 1)
    # Get weights (already extracted from most recent)
    admins = pd.merge(left = admins, right = weights , on = ['osler_sid','pat_enc_csn_sid'])

    admins['dose'] = admins.apply(lambda row: convert_units(unit_convs, row['sig'], row['dose_unit'],row['weight'], row['min_to_next']), axis = 1)
    admins['dose_by_time'] = admins['dose'] * admins['min_to_next']
    admins = admins.groupby(['osler_sid', 'pat_enc_csn_sid'], as_index = False).sum()
    admins['average_dose_' + drug_name ] = admins['dose_by_time'] / admins['LOS']
    result = result.merge(admins[['osler_sid','pat_enc_csn_sid','average_dose_' + drug_name]], on = ['osler_sid','pat_enc_csn_sid'], how = 'left')
    
    return result


def get_drug_dosage_featwRoute(drug_name, df_cohort):
    
    print(drug_name)
    inp = df_cohort.copy()
    result = CHF_hosp_icu_stays[['osler_sid','pat_enc_csn_sid']].copy()

    #admins = CHF_hosp_med_admin.copy()
    
    drug = pd.read_csv('./medication_names_dose/' + drug_name + '.csv', squeeze = True)
    druglist = drug.str.lower().str.strip().to_list()
    
    # Get hosp length of stay (LOS) in minutes.
    inp['LOS'] = (inp['hosp_disch_time'] - inp['hosp_admsn_time']).dt.total_seconds()/60 # in minutes

    admins = CHF_hosp_med_admin[CHF_hosp_med_admin['medication_name'].str.contains('|'.join(druglist))]

    if admins.shape[0] == 0:
        print(drug_name + ' has no admins')
        return 'no admins'

    admins = admins.dropna(subset=['sig','dose_unit','route']).copy()
    # Drop admins that have "mL" as dose unit. 
    admins = admins[~(admins['dose_unit']=='mL')].copy()

    # Get minutes between admins.
    admins = admins.merge(inp[['osler_sid','pat_enc_csn_sid','LOS']], on=['osler_sid','pat_enc_csn_sid'], how='left')

    if(drug_name in(['ketamine'])):
        admins['dose_unit'] = admins['dose_unit'].str.replace(' PCA', '')

    # Find correct dose units. 
    med_unit = pd.read_csv('./medication_dose_convs/med_units.csv')

    # Set index
    med_unit.set_index('drugname', inplace=True)

    #Get dose units string.
    target_unit = med_unit.loc[drug_name,'unit']
    target_unit_str = ''.join(target_unit.split('/')[0:-1]) #mcg

    unit_convs = pd.read_csv('./medication_dose_convs/all_med_doses_units_'+ target_unit_str + '.csv')
    unit_convs.set_index('index', inplace=True)
    unit_convs = unit_convs.to_dict()
    
    for rte in admins['route'].unique():
        # For each route, get the subset of admins
        admins_rte = admins[admins['route'] == rte].copy()
        
        # Perform shift operations to get all med administrations in chronological order
        admins_rte['next_taken_time'] = admins_rte['taken_time'].shift(periods = -1)
        admins_rte['next_osler_sid'] = admins_rte['osler_sid'].shift(periods = -1)
        admins_rte['next_pat_enc_csn_sid'] = admins_rte['pat_enc_csn_sid'].shift(periods=-1)

        # check if the route has any administrations in the timeframe
        if admins_rte.shape[0] == 0:
            #print(drug_name)
            #print(rte)
            print(drug_name + ' ' + rte + ' has no admins in this timeframe')
            continue

        # Find mins to next medication administration
        admins_rte['min_to_next'] = admins_rte.apply(lambda row: get_min_to_next(row['osler_sid'], row['next_osler_sid'], row['pat_enc_csn_sid'], row['next_pat_enc_csn_sid'],
                                                                                 row['taken_time'], row['next_taken_time'], row['hosp_disch_time']), axis = 1)

        admins_rte = pd.merge(left = admins_rte, right = weights , on = ['osler_sid','pat_enc_csn_sid'])

        # Convert units of medication (sig -> dose)
        admins_rte['dose'] = admins_rte.apply(lambda row: convert_units(unit_convs, row['sig'], row['dose_unit'],row['weight'], row['min_to_next']), axis = 1)

        # For each admin, multiply the dose per weight by time. 
        admins_rte['dose_by_time'] = admins_rte['dose'] * admins_rte['min_to_next']

        # Add up all doses of those for the whole ICU stay, then divide by length of ICU stay to get average dose/weight over whole stay.
        admins_rte = admins_rte.groupby(['osler_sid', 'pat_enc_csn_sid'], as_index = False).sum()
        admins_rte['average_dose_' + drug_name + '_' + rte] = admins_rte['dose_by_time'] / admins_rte['LOS']
        result = result.merge(admins_rte[['osler_sid','pat_enc_csn_sid','average_dose_' + drug_name + '_' + rte]], on = ['osler_sid','pat_enc_csn_sid'], how = 'outer')
    
    return result


# Generate dosage features
merged = CHF_hosp_icu_stays.copy()
for i in range(len(all_drug_doses)):
    # Get just drug name.
    drug_name = all_drug_doses[i].split('/')[-1]
    drug_name = drug_name.split('.')[0]
    temp = get_drug_dosage_feat(drug_name, CHF_hosp_icu_stays)
    #temp = get_drug_dosage_featwRoute(drug_name, CHF_hosp_icu_stays)
    if isinstance(temp, pd.DataFrame) == False:
            continue
    merged = pd.merge(left = merged, right = temp, on = ['osler_sid','pat_enc_csn_sid'], how = 'left')

betamethasone
cortisone
dexamethasone
dexmedetomidine
diazepam
dobutamine
dopamine
epinephrine
Unit:  mcg/mL
<class 'float'>
2.0
<class 'NoneType'>
None
Unit:  mcg/mL
<class 'float'>
0.05
<class 'NoneType'>
None
Unit:  mcg/mL
<class 'float'>
0.13
<class 'NoneType'>
None
etomidate
fentanyl
fosphenytoin
gabapentin
hydralazine
hydromorphone
isoproterenol
ketamine
labetalol
lacosamide
levetiracetam
lorazepam
mannitol
methylprednisolone
midazolam
milrinone
morphine
nesiritide
nicardipine
nitroglycerin
nitroprusside
norepinephrine
oxycodone
phenobarbital
phenylephrine
Unit:  mcg/mL
<class 'float'>
0.3
<class 'NoneType'>
None
Unit:  mcg/mL
<class 'float'>
0.1
<class 'NoneType'>
None
phenytoin
prednisone
pregabalin
propofol
remifentanil
riociguat
triamcinolone
valproate
vasopressin


temp = merged.drop(['osler_sid','pat_enc_csn_sid','hosp_admsn_time','hosp_disch_time','labels'], axis = 1)
temp = pd.concat([temp.mean(), temp.std()], axis = 1).reset_index()
temp.columns = ['medication','mean','std']
temp['mean ± std'] = round(temp['mean'], 4).astype(str) + ' ± '+ round(temp['std'], 4).astype(str)
temp['medication'] = temp['medication'].str.replace('average_dose_','')
medstable = pd.merge(left = temp, right = med_units, how = 'left', left_on = 'medication', right_on = 'drugname')
medstable = medstable.drop(['mean','std','drugname'], axis = 1)
medstable.to_csv('medstable.csv', index = False)
#medstable.to_csv('medstablewRoutes.csv', index = False)


import scipy.stats as stats

df0 = merged[merged.labels == 0].drop(['osler_sid','pat_enc_csn_sid','hosp_admsn_time','hosp_disch_time','labels'], axis=1)
df1 = merged[merged.labels == 1].drop(['osler_sid','pat_enc_csn_sid','hosp_admsn_time','hosp_disch_time','labels'], axis=1)

pvalues = []
meds = []

for col in df0.columns:
    try:
        _, pvalue = stats.mannwhitneyu(x=df0[col].dropna(), y=df1[col].dropna(), alternative = 'two-sided')
        pvalues.append(pvalue)
        meds.append(col)
    except ValueError:
        pvalues.append('nan') 

med_stat_table = pd.concat([df0.median(), df0.count(), df1.median(), df1.count()], axis = 1)
med_stat_table['p-value'] = pvalues
#med_stat_table = med_stat_table.applymap(lambda x: round(x, 3)).reset_index()
med_stat_table = med_stat_table.reset_index()
med_stat_table.columns = ['medication','label 0 median','label 0 counts','label 1 median','label 1 counts','p-value']
med_stat_table['medication'] = med_stat_table['medication'].str.replace('average_dose_','')

med_stat_table.to_csv('med_stat_table.csv', index = False)
med_stat_table.head()

/home/idies/miniconda3/lib/python3.8/site-packages/scipy/stats/stats.py:7028: RuntimeWarning: divide by zero encountered in scalar divide
  z = (bigu - meanrank) / sd


df0[col].dropna()

0       0.001003
4       0.014419
11      0.001893
16      0.005221
18      0.000082
          ...   
4454    0.000609
4455    0.000152
4456    0.000077
4461    0.001681
4464    0.000712
Name: average_dose_vasopressin, Length: 605, dtype: float64


merged.head()


medication_rte_features = get_feat_wRoute(CHF_hosp_icu_stays, CHF_hosp_med_admin)

/home/idies/miniconda3/lib/python3.8/site-packages/pandas/core/strings.py:2001: UserWarning: This pattern has match groups. To actually get the groups, use str.extract.
  return func(self, *args, **kwargs)


medication_features = get_feat(CHF_hosp_icu_stays, CHF_hosp_med_admin)


medication_rte_features.to_csv('medication_rte_features.csv', index = False)
medication_features.to_csv('medication_features.csv', index = False)


medication_rte_features


df = pd.DataFrame()
df['meds'] = medication_features.columns[7:]
df['count'] = medication_features.iloc[:, 7:].sum().values/4488*100
df = df.sort_values('count')

sns.set_theme()
plt.figure(figsize = (18,18))
plt.barh(df['meds'],df['count'])
plt.title('Medication Class Prevalence in CHF Hospital Stays (N = {})'.format(len(CHF_hosp_icu_stays)), fontsize = 18)
plt.xlim([0, 102])
plt.xticks(fontsize = 14)
plt.yticks(fontsize = 14)
plt.xlabel('prevalence (%)',fontsize = 16)
plt.show()


NameErrorTraceback (most recent call last)
<ipython-input-368-4514580d180f> in <module>
      1 df = pd.DataFrame()
----> 2 df['meds'] = medication_features.columns[7:]
      3 df['count'] = medication_features.iloc[:, 7:].sum().values/4488*100
      4 df = df.sort_values('count')
      5 

NameError: name 'medication_features' is not defined


df = df[df['count'] > 80]

plt.figure(figsize = (7,3))
plt.barh(df['meds'],df['count'])
plt.title('Medication Class Prevalence in CHF Hospital Stays (zoomed in on prev > 80 %) (N = 4,488)', fontsize = 14)
plt.xlim([0, 102])
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
plt.xlabel('prevalence (%)',fontsize = 12)
plt.show()


temp0 = medication_features[medication_features.labels == 0]
temp1 = medication_features[medication_features.labels == 1]
df0 = pd.DataFrame()
df1 = pd.DataFrame()
df0['meds'] = temp0.columns[7:]
df0['count'] = temp0.iloc[:, 7:].sum().values/temp0.shape[0]*100
df1['meds'] = temp1.columns[7:]
df1['count'] = temp1.iloc[:, 7:].sum().values/temp1.shape[0]*100
df0 = df0.sort_values('count')
df1 = df1.sort_values('count')

#plt.figure(figsize = (18,18))
plt.barh(df0.iloc[-5:, :]['meds'],df0.iloc[-5:, :]['count'])
plt.title('Label 0: CHF Hospital Stays w/o Readmissions (N = 3,801)', fontsize = 14)
plt.xlim([0, 102])
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
plt.xlabel('prevalence (%)',fontsize = 12)
plt.show()

#plt.figure(figsize = (18,18))
plt.barh(df1.iloc[-5:, :]['meds'],df1.iloc[-5:, :]['count'])
plt.title('Label 1: CHF Hospital Stays w/ Readmissions (N = 687)', fontsize = 14)
plt.xlim([0, 102])
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
plt.xlabel('prevalence (%)',fontsize = 12)
plt.show()


df1.iloc[-5:, :]


# Drop admins missing dose info. 
med_admin = med_admin.dropna(subset=['dose_unit'])

# Save off all dose units. 
admins = med_admin.dose_unit.value_counts().reset_index()
admins['prop'] = admins['dose_unit'] / admins['dose_unit'].sum()

admins.to_csv('all_med_doses_units_raw.csv', index=False)


msno.matrix(med_admin, figsize = (5,5))

<AxesSubplot:>


# Get med names for each medication


def iqr(x):
    try:
        x = x.dropna()
        q75, q25 = np.percentile(x, [75 ,25])
        return q75 - q25
    except:
        return np.nan
    

med_stat = pd.DataFrame()
temp = merged.iloc[:, 5:]

# All nan values were skipped
med_stat['mean'] = temp.mean()  
med_stat['std'] = temp.std()
med_stat['min'] = temp.min()
med_stat['median'] = temp.median()
med_stat['max'] = temp.max()
med_stat = med_stat.reset_index()
med_stat['IQR'] = temp.apply(iqr, axis = 1)
med_stat['index'] = med_stat['index'].str.replace('average_dose_','')
med_stat = med_stat.rename(columns = {'index':'medication'})
med_stat


all_drug_doses
df_med = pd.DataFrame()
for i in range(len(all_drug_doses)):
    drug_class = pd.read_csv(all_drug_doses[i])
    #med_dict[all_drug_classes[i].split('/')[-1].split('.')[0]] = drug_class.iloc[:, 0].to_list()
    df_med = pd.concat([df_med, drug_class], axis = 1)


meds = pd.melt(df_med).dropna()
meds.columns = ['medication','medication name']
meds
#meds = meds.set_index('drug class')


temp = med_stat.merge(med_units, how = 'left', left_on='medication', right_on='drugname').drop('drugname', axis=1)
merged = temp.merge(meds, how='left', on='medication')


# #merged.groupby(['medication','medication name'])['mean','std','min','max','IQR','unit'].last()
# grouped = merged.groupby(['medication','mean','std','min','max','IQR','unit'])['medication name'].apply(lambda x: x.join(','))
# grouped = pd.DataFrame(grouped).reset_index()
# grouped


grouped.to_csv('med_dose.csv')

	missing count	total count	prop (%)
mannitol	12	90	13.333333
etomidate	29	269	10.780669
phenylephrine	259	5154	5.025223
midazolam	198	10065	1.967213
isoproterenol	4	270	1.481481
nitroprusside	182	13528	1.345358
nitroglycerin	182	13528	1.345358
lorazepam	84	6631	1.266777
dopamine	90	7450	1.208054
vasopressin	167	15212	1.097818
propofol	250	24267	1.030206
epinephrine	947	149494	0.633470
fentanyl	393	62245	0.631376
labetalol	36	5820	0.618557
morphine	20	3258	0.613874
norepinephrine	656	117984	0.556008
nicardipine	54	10300	0.524272
fosphenytoin	2	433	0.461894
methylprednisolone	10	3203	0.312207
triamcinolone	3	1185	0.253165
hydralazine	65	26251	0.247610
diazepam	4	2115	0.189125
phenytoin	2	1063	0.188147
dobutamine	26	14430	0.180180
ketamine	7	3936	0.177846
dexamethasone	7	4785	0.146290
oxycodone	56	51375	0.109002
hydromorphone	33	31616	0.104378
cortisone	6	9043	0.066350
levetiracetam	6	12687	0.047293
milrinone	6	13771	0.043570
gabapentin	4	28046	0.014262
lacosamide	0	3841	0.000000
phenobarbital	0	902	0.000000
dexmedetomidine	0	32766	0.000000
prednisone	0	11536	0.000000
pregabalin	0	4024	0.000000
nesiritide	0	46	0.000000
remifentanil	0	32	0.000000
riociguat	0	13	0.000000
valproate	0	1155	0.000000
betamethasone	0	258	0.000000

	medication	label 0 median	label 0 counts	label 1 median	label 1 counts	p-value
0	betamethasone	2443.768997	4	8343.149212	2	0.10021
1	cortisone	1440.934743	374	848.796527	54	0.0467538
2	dexamethasone	6.764266	273	7.467784	55	0.269819
3	dexmedetomidine	0.000025	1110	0.000016	225	9.71769e-05
4	diazepam	1597.359365	222	1022.481078	52	0.152724

	osler_sid	pat_enc_csn_sid	hosp_admsn_time	hosp_disch_time	labels	Acetaminophen_Oral	Acetaminophen_Per NG Tube	Acetaminophen_nan	Acetaminophen_Rectal	Acetaminophen_Nasogastric	...	Vasopressors_intravenous push	Vasopressors_Right Eye	Vasopressors_Intramuscular	Vasopressors_Topical	Vasopressors_Intraosseous	Vasopressors_Intracameral	Vasopressors_Left Eye	Vasopressors_central venous line infusion	Vasopressors_Intravitreal	Vasopressors_Intracavernosal
0	000432FF-214F-460C-9AB9-DB40A7265A82	1000414574	2017-12-10 17:12:00	2018-01-01 13:10:00	0.0	1	1	0	0	0	...	0	0	0	0	0	0	0	0	0	0
1	00081C8C-F7E4-4352-8786-54D8DCDC2AF9	1000468132	2016-09-01 10:49:00	2016-09-23 17:03:00	1.0	1	1	0	0	0	...	0	0	0	0	0	0	0	0	0	0
2	000EE0D9-9115-4281-9F96-D15AC0CBAF4E	1000375948	2019-07-16 17:06:00	2019-07-27 17:01:00	1.0	1	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
3	000F988E-763F-47E7-A92F-980028A69795	1000202595	2018-03-22 00:39:00	2018-03-31 19:23:00	0.0	1	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4	000FC796-7639-402E-BA61-CEC8AD52C5E2	1000010641	2016-09-25 10:04:00	2016-10-13 14:19:00	0.0	1	1	0	0	0	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
4483	FFD7AAF8-E048-43CF-BA9C-7E41BB86AA1E	1000447878	2017-02-04 00:58:00	2017-02-05 16:15:00	1.0	1	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4484	FFE0640C-3AFF-485F-8DA8-EFB313567864	1000360263	2018-09-15 15:35:00	2018-10-03 16:09:00	0.0	1	1	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4485	FFE68A1A-2EB6-4687-87E4-9D2495B14134	1000328439	2016-10-19 06:33:00	2016-10-24 11:28:00	0.0	1	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4486	FFEDBC8A-4706-41DA-8C1B-F7F03715ABBB	1000367783	2020-06-19 21:44:00	2020-09-03 14:17:00	1.0	1	1	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4487	FFFA2A47-2BC5-4371-B0F9-E89DB30F186A	1000543690	2019-01-04 08:56:00	2019-01-11 17:08:00	0.0	1	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	meds	count
0	Acetaminophen	84.570597
5	Anticoagulants	90.393013
35	Opioids	91.848617
27	Laxatives	94.468705
19	CrystalloidFluids	98.835517

	medication	mean	std	min	median	max	IQR
0	betamethasone	4578.35	4087.01	0.00	5391.91	10732.27	121.88
1	cortisone	40941.70	629273.20	0.00	1354.81	12984738.77	36.67
2	dexamethasone	20.58	36.61	0.00	7.23	292.45	92.16
3	dexmedetomidine	0.00	0.01	0.00	0.00	0.50	24761.21
4	diazepam	2353.79	2140.29	12.01	1514.07	9610.09	218.52
5	dobutamine	0.50	3.26	0.00	0.04	44.65	nan
6	dopamine	7.40	141.13	0.00	0.02	2765.79	2633.51
7	epinephrine	0.12	2.65	0.00	0.00	104.48	nan
8	etomidate	164.15	164.91	0.22	123.91	1649.43	1448.10
9	fentanyl	0.11	0.63	0.00	0.03	15.14	483.71
10	fosphenytoin	0.00	0.00	0.00	0.00	0.00	216.62
11	gabapentin	70170.98	139540.80	40.61	17939.13	1191163.03	0.03
12	hydralazine	3070.74	3714.80	25.45	1915.75	49069.42	75871.36
13	hydromorphone	282.58	1664.38	0.02	94.03	37710.51	17.63
14	isoproterenol	1.46	4.80	0.00	0.01	29.43	0.00
15	ketamine	98.37	378.74	0.00	0.07	3372.39	55.27
16	labetalol	6530.43	15221.57	2.42	2888.69	194604.18	0.29
17	lacosamide	11024.32	22797.40	409.19	4182.32	177035.40	1376.44
18	levetiracetam	120199.76	285055.79	1524.17	42141.05	3993069.07	74.37
19	lorazepam	277.87	375.27	0.06	160.16	3999.02	399.99
20	mannitol	252149.75	305406.73	8.37	137638.44	1095231.50	124.82
21	methylprednisolone	606.60	2037.34	0.52	113.18	29156.47	226.84
22	midazolam	274.55	429.10	0.00	137.18	4998.83	333.16
23	milrinone	0.01	0.01	0.00	0.00	0.16	231.45
24	morphine	1851.89	8802.39	0.00	571.60	160099.79	1426.39
25	nesiritide	0.00	0.00	0.00	0.00	0.00	0.69
26	nicardipine	1798.72	23445.61	0.00	5.67	398497.08	0.03
27	nitroglycerin	136.07	2876.34	0.00	0.60	86532.47	0.05
28	nitroprusside	1.80	39.15	0.00	0.01	1177.87	476.97
29	norepinephrine	0.20	4.39	0.00	0.00	134.04	8669.32
30	oxycodone	955.01	1370.15	6.32	432.35	13292.48	199.57
31	phenobarbital	517.43	1491.42	2.22	30.92	5650.02	1339.15
32	phenylephrine	36.97	692.08	0.00	0.56	18927.16	81.31
33	phenytoin	67.78	176.08	0.00	0.00	1159.71	245.24
34	prednisone	48.74	108.34	0.29	13.20	969.63	318.60
35	pregabalin	7699.11	11609.82	94.01	4245.61	90412.48	270.69
36	propofol	61.63	521.73	0.00	0.14	16575.43	107.86
37	remifentanil	0.00	nan	0.00	0.00	0.00	86.31
38	riociguat	267.70	315.38	44.69	267.70	490.71	54723.07
39	triamcinolone	10144.81	18509.90	0.00	0.00	75629.91	3685.62
40	valproate	535.08	1159.17	8.95	128.80	5973.81	598.12
41	vasopressin	0.15	1.81	0.00	0.00	39.21	682.82

PMAP Medication Info Feature Engineering¶

Data Cleaning¶

Investigate relationship between medication names and routes to fill in missing route values¶

Functions¶

Main¶

Statistical Analysis¶

Data Quality Checks¶

1. Check if there are irregular dosage values - verified w/ Dr. F on 02/09/23¶

Data Visualization¶

Medication Class Prevalence in CHF Hospital Stays¶

Medication Class Prevalence in CHF Hospital Stays (Zoomed in on prevalence > 80 %)¶

Medication Class Prevalence in CHF Hospital Stays for both labels¶

Save off all medication dose units in our dataset¶

Statistical Analysis¶

1. Summary statistics for each medication¶