Created: January 17, 2023
Last modified: February 15, 2023
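This notebook generates two kinds of features from medication data: binary medication-class flags (overall and per route) and average-dose features for individual drugs.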
Author: Vina Ro
# Import packages
import numpy as np
import pandas as pd
import time as time
import seaborn as sns
import missingno as msno
import os
import glob
from time import time
from pathlib import Path
import inspect as insp
import matplotlib.pyplot as plt
# Resolve the data directory: climb five directory levels from the notebook
# location, then descend into the study data folder.
file_path = Path('PMAP_Meds.ipynb').resolve()
data_root = file_path
for _ in range(5):
    data_root = data_root.parent
data_path = data_root.joinpath('Data/jbergma8/IRB_271579_Faraday/IRB-271579-v3-DEID-220607-no-ptsd')

# Medication administrations: load only the columns used in this notebook.
med_admin_columns = ['osler_sid', 'pat_enc_csn_sid', 'medication_name', 'taken_time',
                     'mar_action', 'route', 'sig', 'dose_unit']
med_admin = pd.read_csv(
    data_path / 'med_admin.csv',
    usecols=med_admin_columns,
    parse_dates=['taken_time'],
)
#med_admin_updated = pd.read_csv(data_path / 'med_admin_updated.csv',
# usecols = ['osler_sid','pat_enc_csn_sid','medication_name','taken_time','mar_action', 'route', 'sig','dose_unit'],
# parse_dates = ['taken_time'])
#med_orders = pd.read_csv(data_path / 'med_orders.csv')

# Per-encounter weights and the cohort of hospital stays (read from the working directory).
weights = pd.read_csv('weights.csv')
CHF_hosp_icu_stays = pd.read_csv(
    'CHF_hosp_icu_stays.csv',
    parse_dates=['hosp_admsn_time', 'hosp_disch_time'],
)
# Keep only MAR actions that count as a real administration.
keep_actions = ['Given','Rate Verify','New Bag','Handoff','Rate/Dose Verify','Rate Change','Stopped']
is_real_admin = med_admin['mar_action'].isin(keep_actions)
med_admin = med_admin[is_real_admin]

# Attach administrations to the cohort stays (left join on patient + encounter).
cohort_keys = ['osler_sid', 'pat_enc_csn_sid']
CHF_hosp_med_admin = CHF_hosp_icu_stays[cohort_keys + ['hosp_admsn_time', 'hosp_disch_time']].merge(
    med_admin,
    how='left',
    on=cohort_keys,
)

# Keep administrations whose taken_time falls between admission and discharge.
taken_during_stay = CHF_hosp_med_admin['taken_time'].between(
    CHF_hosp_med_admin['hosp_admsn_time'],
    CHF_hosp_med_admin['hosp_disch_time'],
)
CHF_hosp_med_admin = CHF_hosp_med_admin[taken_during_stay]
CHF_hosp_med_admin['medication_name'] = CHF_hosp_med_admin['medication_name'].str.lower()
# Chronological order within each patient/stay; the dosage functions rely on
# this order when they shift rows to find the next administration.
CHF_hosp_med_admin = CHF_hosp_med_admin.sort_values(['osler_sid','hosp_admsn_time','taken_time'])

# Sorted lists of the medication-class files and the per-drug dose files.
all_drug_classes = sorted(glob.glob("./medication_class_name/*"))
all_drug_doses = sorted(glob.glob("./medication_names_dose/*"))
len(all_drug_doses)
42
# Missing-value matrix for the cohort's medication administrations.
missing_matrix_size = (5, 5)
msno.matrix(CHF_hosp_med_admin, figsize=missing_matrix_size)
plt.show()
# For every drug with a dose file, count the matching cohort administrations
# and how many of them have no route recorded.
med_dict = {}
for dose_file in all_drug_doses:
    # Drug name = file name without directory or extension.
    drug_name = dose_file.split('/')[-1].split('.')[0]
    # One-column CSV -> Series. DataFrame.squeeze replaces read_csv(squeeze=True),
    # which was removed in pandas 2.0.
    drug = pd.read_csv('./medication_names_dose/' + drug_name + '.csv').squeeze('columns')
    druglist = drug.str.lower().str.strip().to_list()
    # na=False: rows without a medication name simply do not match.
    name_matches = CHF_hosp_med_admin['medication_name'].str.contains('|'.join(druglist), na=False)
    admins = CHF_hosp_med_admin[name_matches]
    med_dict[drug_name] = [int(admins['route'].isnull().sum()), len(admins)]

df = pd.DataFrame.from_dict(med_dict, orient='index', columns=['missing count', 'total count'])
df['prop (%)'] = df['missing count'] / df['total count'] * 100
df.sort_values('prop (%)', ascending=False)
missing count | total count | prop (%) | |
---|---|---|---|
mannitol | 12 | 90 | 13.333333 |
etomidate | 29 | 269 | 10.780669 |
phenylephrine | 259 | 5154 | 5.025223 |
midazolam | 198 | 10065 | 1.967213 |
isoproterenol | 4 | 270 | 1.481481 |
nitroprusside | 182 | 13528 | 1.345358 |
nitroglycerin | 182 | 13528 | 1.345358 |
lorazepam | 84 | 6631 | 1.266777 |
dopamine | 90 | 7450 | 1.208054 |
vasopressin | 167 | 15212 | 1.097818 |
propofol | 250 | 24267 | 1.030206 |
epinephrine | 947 | 149494 | 0.633470 |
fentanyl | 393 | 62245 | 0.631376 |
labetalol | 36 | 5820 | 0.618557 |
morphine | 20 | 3258 | 0.613874 |
norepinephrine | 656 | 117984 | 0.556008 |
nicardipine | 54 | 10300 | 0.524272 |
fosphenytoin | 2 | 433 | 0.461894 |
methylprednisolone | 10 | 3203 | 0.312207 |
triamcinolone | 3 | 1185 | 0.253165 |
hydralazine | 65 | 26251 | 0.247610 |
diazepam | 4 | 2115 | 0.189125 |
phenytoin | 2 | 1063 | 0.188147 |
dobutamine | 26 | 14430 | 0.180180 |
ketamine | 7 | 3936 | 0.177846 |
dexamethasone | 7 | 4785 | 0.146290 |
oxycodone | 56 | 51375 | 0.109002 |
hydromorphone | 33 | 31616 | 0.104378 |
cortisone | 6 | 9043 | 0.066350 |
levetiracetam | 6 | 12687 | 0.047293 |
milrinone | 6 | 13771 | 0.043570 |
gabapentin | 4 | 28046 | 0.014262 |
lacosamide | 0 | 3841 | 0.000000 |
phenobarbital | 0 | 902 | 0.000000 |
dexmedetomidine | 0 | 32766 | 0.000000 |
prednisone | 0 | 11536 | 0.000000 |
pregabalin | 0 | 4024 | 0.000000 |
nesiritide | 0 | 46 | 0.000000 |
remifentanil | 0 | 32 | 0.000000 |
riociguat | 0 | 13 | 0.000000 |
valproate | 0 | 1155 | 0.000000 |
betamethasone | 0 | 258 | 0.000000 |
We can see from the above that the three medications with the highest proportion of missing route values are mannitol, etomidate, and phenylephrine. Hence, we'll start by filling in the missing values for these three medications in the function, using a dictionary that maps each medication name to its route.
# For the three drugs with the most missing routes, list which medication
# names occur under each recorded route and which names have a missing route.
top_3_drugs = ['mannitol','etomidate','phenylephrine']
for drug_name in top_3_drugs:
    print(drug_name + '-----------------')
    # One-column CSV -> Series. DataFrame.squeeze replaces read_csv(squeeze=True),
    # which was removed in pandas 2.0.
    drug = pd.read_csv('./medication_names_dose/' + drug_name + '.csv').squeeze('columns')
    druglist = drug.str.lower().str.strip().to_list()
    # na=False: rows without a medication name simply do not match.
    name_matches = CHF_hosp_med_admin['medication_name'].str.contains('|'.join(druglist), na=False)
    admins = CHF_hosp_med_admin[name_matches]
    # A NaN route never equals itself, so its entry is an empty list; the
    # names with a missing route are printed separately below.
    med_dict = {
        x: list(admins[admins.route == x]['medication_name'].unique())
        for x in admins.route.unique()
    }
    print(med_dict)
    print('med names w missing route values:')
    print(list(admins[admins.route.isnull()]['medication_name'].unique()))
mannitol----------------- {'Intravenous': ['mannitol 25 % intravenous solution', 'zoledronic acid 4 mg/100 ml-mannitol-0.9 % nacl intravenous piggyback', 'mannitol 20 % intravenous solution'], 'Extracorporeal': ['mannitol 25 % intravenous solution'], nan: []} med names w missing route values: ['mannitol 25 % intravenous solution'] etomidate----------------- {'Intravenous': ['etomidate 2 mg/ml intravenous solution'], nan: [], 'Intraosseous': ['etomidate 2 mg/ml intravenous solution']} med names w missing route values: ['etomidate 2 mg/ml intravenous solution'] phenylephrine----------------- {'Intravenous': ['phenylephrine 1 mg/10 ml (100 mcg/ml) in 0.9 % sod.chloride iv syringe', 'phenylephrine 1 mg/10 ml (100 mcg/ml) swfi syringe', 'phenylephrine 10 mg/ml injection solution', 'phenylephrine 40 mcg/ml in ns (jhh ped)', 'phenylephrine infusion 20 mg/250 ml ns (jhh)'], nan: [], 'Continuous IV Infusion': ['phenylephrine infusion 20 mg/250 ml ns (jhh)', 'phenylephrine infusion 40 mg/250 ml ns (jhbmc-jhh)', 'phenylephrine infusion 200 mg/250 ml ns (bmc-jhh)'], 'Extracorporeal': ['phenylephrine 10 mg/ml injection solution'], 'Intravenous (Continuous Infusion)': ['phenylephrine infusion 20 mg/250 ml ns (jhh-bmc)', 'phenylephrine infusion 40 mg/250 ml ns (jhbmc-jhh)', 'phenylephrine infusion 20 mg/250 ml ns (jhh adult ed pyxis)'], 'Both Eyes': ['phenylephrine 2.5 % eye drops (per drop)', 'phenylephrine 2.5 % eye drops'], 'Injection': ['phenylephrine 1 mg/10 ml (100 mcg/ml) in 0.9 % sod.chloride iv syringe', 'phenylephrine 10 mg/ml injection solution'], 'intravenous push': ['phenylephrine 1 mg/10 ml (100 mcg/ml) in 0.9 % sod.chloride iv syringe'], 'Right Eye': ['phenylephrine 2.5 % eye drops'], 'Left Eye': ['phenylephrine 2.5 % eye drops', 'phenylephrine 2.5 % eye drops (per drop)'], 'Intracavernosal': ['phenylephrine 1 mg/10 ml (100 mcg/ml) syringe intracavernosal']} med names w missing route values: ['phenylephrine 10 mg/ml injection solution', 'phenylephrine 1 mg/10 ml 
(100 mcg/ml) in 0.9 % sod.chloride iv syringe', 'phenylephrine 1 mg/10 ml (100 mcg/ml) swfi syringe', 'phenylephrine infusion 20 mg/250 ml ns (jhh)']
By printing out all the medication names whose route values are missing, we can see that each of them maps to specific routes in other entries with an identical medication name.
def convert_units(convs, val, unit, weight, minutes):
    '''
    Convert one administered amount into the target unit of its drug.

    Parameters
    ----------
    convs : dict of dict
        Conversion table keyed by setting name, then by source dose unit.
        'conv_factor' and 'multiply_min' must be present; 'divide_kg' and
        'multiply_kg' are optional and treated as "off" when absent.
    val : number
        Amount administered, expressed in ``unit``.
    unit : str
        Dose unit of ``val``.
    weight : number
        Closest patient weight in kg.
    minutes : number
        Time length of the administration. Currently not used in the
        calculation (see the review note at the end of the function).

    Returns
    -------
    The converted value. When the table holds no conversion factor for
    ``unit``, a diagnostic is printed and ``val`` is passed on without the
    factor applied.
    '''
    conv_factor = convs.get('conv_factor').get(unit)
    multiply_time = convs.get('multiply_min').get(unit)

    # Optional flag: the desired unit divides by weight.
    divide_table = convs.get('divide_kg')
    divide_kg = 0 if divide_table is None else divide_table.get(unit)

    # Optional flag: the desired unit is not divided by weight.
    multiply_table = convs.get('multiply_kg')
    multiply_kg = 0 if multiply_table is None else multiply_table.get(unit)

    # Apply the conversion factor. A missing factor (None) raises TypeError;
    # report it and keep going with the unconverted value.
    try:
        val = val * conv_factor
    except TypeError:
        print("Unit: ", unit)
        print(type(val))
        print(val)
        print(type(conv_factor))
        print(conv_factor)

    if divide_kg == 1:  # Needs to divide by patient weight
        val = val / weight
    if multiply_kg == 1:  # Needs to multiply by patient weight
        val = val * weight
    # NOTE(review): this multiplies by the flag itself (1), so it never changes
    # val, and `minutes` is unused. It may have been meant as `val * minutes`,
    # but the callers already multiply the returned dose by the minutes to the
    # next administration, so the behaviour is left unchanged pending review.
    if multiply_time == 1:
        val = val * multiply_time
    return val
def get_min_to_next(curr_pat_id, next_pat_id, curr_enc_id, next_enc_id, taken_time, next_taken_time, hosp_disch_time):
    '''
    Return the minutes from this administration to the end of its interval.

    The interval ends at the next administration when that one belongs to the
    same patient and encounter; otherwise it ends at hosp_disch_time.
    '''
    same_patient = curr_pat_id == next_pat_id
    same_encounter = curr_enc_id == next_enc_id
    # Pick the interval end lazily so next_taken_time is only touched when
    # the following row really belongs to this encounter.
    interval_end = next_taken_time if (same_patient & same_encounter) else hosp_disch_time
    elapsed = interval_end - taken_time
    return elapsed.total_seconds()/60
def get_feat(df_hosp_orig, df_meds):
    '''
    Add one binary column per medication class to a copy of the stay table.

    For every file in the module-level ``all_drug_classes`` list, the CSV
    header is the class name and its values are the medication names of that
    class. A stay is flagged 1 when any row of ``df_meds`` for the same
    ``pat_enc_csn_sid`` has a ``medication_name`` matching one of those names.

    Parameters
    ----------
    df_hosp_orig : DataFrame
        Hospital stays; must contain ``pat_enc_csn_sid``. Not modified.
    df_meds : DataFrame
        Medication administrations with ``pat_enc_csn_sid`` and a lower-cased
        ``medication_name``. Not modified.

    Returns
    -------
    DataFrame
        Copy of ``df_hosp_orig`` with one 0/1 column per medication class.
    '''
    df_hosp = df_hosp_orig.copy()
    for class_file in all_drug_classes:
        drug = pd.read_csv(class_file)
        drug_name = drug.columns.values[0]
        drug_list = list(drug[drug_name].str.lower())

        # For the sugar class only, ignore rows where the sugar is a solvent.
        # The filter is applied to a per-class view so that it does not leak
        # into the classes processed after 'Sugars'.
        class_meds = df_meds
        if drug_name == 'Sugars':
            is_solvent = df_meds['medication_name'].str.contains(
                'piggyback|in dextrose|in 5% dextrose', na=False)
            class_meds = df_meds[~is_solvent]

        # Administrations whose name matches any medication of this class.
        temp = class_meds[class_meds['medication_name'].str.contains('|'.join(drug_list), na=False)]
        df_hosp[drug_name] = df_hosp['pat_enc_csn_sid'].isin(temp.pat_enc_csn_sid).astype(int)
    return df_hosp
def get_feat_wRoute(df_hosp_orig, df_meds):
    '''
    Add one binary column per (medication class, route) to a copy of the stay table.

    For every file in the module-level ``all_drug_classes`` list, the CSV
    header is the class name and its values are the medication names of that
    class. For each route seen among the matching administrations, a column
    named ``<class>_<route>`` flags the stays that received the class by that
    route. Administrations with a missing route are collected under
    ``<class>_nan``.

    Parameters
    ----------
    df_hosp_orig : DataFrame
        Hospital stays; must contain ``pat_enc_csn_sid``. Not modified.
    df_meds : DataFrame
        Medication administrations with ``pat_enc_csn_sid``, ``route`` and a
        lower-cased ``medication_name``. Not modified.

    Returns
    -------
    DataFrame
        Copy of ``df_hosp_orig`` with the added 0/1 columns.
    '''
    df_hosp = df_hosp_orig.copy()
    for class_file in all_drug_classes:
        drug = pd.read_csv(class_file)
        drug_name = drug.columns.values[0]
        drug_list = list(drug[drug_name].str.lower())

        # For the sugar class only, ignore rows where the sugar is a solvent.
        # The filter is applied to a per-class view so that it does not leak
        # into the classes processed after 'Sugars'.
        class_meds = df_meds
        if drug_name == 'Sugars':
            is_solvent = df_meds['medication_name'].str.contains(
                'piggyback|in dextrose|in 5% dextrose', na=False)
            class_meds = df_meds[~is_solvent]

        # Administrations whose name matches any medication of this class.
        temp = class_meds[class_meds['medication_name'].str.contains('|'.join(drug_list), na=False)]
        for route in temp.route.unique():
            # NaN never compares equal to itself, so a missing route needs
            # isna(); with `==` the <class>_nan column would always be zero.
            if pd.isna(route):
                on_route = temp['route'].isna()
            else:
                on_route = temp['route'] == route
            feat = temp.loc[on_route, 'pat_enc_csn_sid'].unique()
            df_hosp[drug_name + '_{}'.format(route)] = df_hosp['pat_enc_csn_sid'].isin(feat).astype(int)
    return df_hosp
def get_drug_dosage_feat(drug_name, df_cohort):
    '''
    Compute the time-weighted average dose of one drug for every hospital stay.

    Each administration's converted dose is multiplied by the minutes until
    the next administration of the same encounter (or until discharge for the
    last one), summed per stay, and divided by the stay's length in minutes.

    Reads the module-level ``CHF_hosp_icu_stays``, ``CHF_hosp_med_admin`` and
    ``weights`` frames, and the CSV files under ``./medication_names_dose``
    and ``./medication_dose_convs``. The "next administration" is found by
    shifting rows, so it depends on the row order of ``CHF_hosp_med_admin``.

    Parameters
    ----------
    drug_name : str
        Drug to process; selects ``./medication_names_dose/<drug_name>.csv``
        and the drug's row in ``med_units.csv``.
    df_cohort : DataFrame
        Stays with ``osler_sid``, ``pat_enc_csn_sid``, ``hosp_admsn_time`` and
        ``hosp_disch_time``; used to derive the length of stay.

    Returns
    -------
    DataFrame or str
        ``osler_sid``, ``pat_enc_csn_sid`` and ``average_dose_<drug_name>``
        for every stay in ``CHF_hosp_icu_stays`` (NaN where the stay has no
        usable administration), or the string ``'no admins'`` when there is
        nothing to aggregate.
    '''
    print(drug_name)
    keys = ['osler_sid', 'pat_enc_csn_sid']
    feat_col = 'average_dose_' + drug_name
    inp = df_cohort.copy()
    result = CHF_hosp_icu_stays[keys].copy()

    # One-column CSV -> Series. DataFrame.squeeze replaces read_csv(squeeze=True),
    # which was removed in pandas 2.0.
    drug = pd.read_csv('./medication_names_dose/' + drug_name + '.csv').squeeze('columns')
    druglist = drug.str.lower().str.strip().to_list()

    # Hospital length of stay (LOS) in minutes.
    inp['LOS'] = (inp['hosp_disch_time'] - inp['hosp_admsn_time']).dt.total_seconds() / 60

    name_matches = CHF_hosp_med_admin['medication_name'].str.contains('|'.join(druglist), na=False)
    admins = CHF_hosp_med_admin[name_matches]
    if admins.shape[0] == 0:
        print(drug_name + ' has no admins')
        return 'no admins'

    # Drop admins without dose information and admins dosed in "mL".
    admins = admins.dropna(subset=['sig', 'dose_unit']).copy()
    admins = admins[~(admins['dose_unit'] == 'mL')].copy()
    if admins.empty:
        # Nothing left to convert; row-wise apply on an empty frame would fail.
        print(drug_name + ' has no admins with a usable dose')
        return 'no admins'

    admins = admins.merge(inp[keys + ['LOS']], on=keys, how='left')
    if drug_name in ['ketamine']:
        admins['dose_unit'] = admins['dose_unit'].str.replace(' PCA', '', regex=False)

    # Look up the drug's target unit and load the matching conversion table.
    med_unit = pd.read_csv('./medication_dose_convs/med_units.csv')
    med_unit.set_index('drugname', inplace=True)
    target_unit = med_unit.loc[drug_name, 'unit']
    # Everything before the last '/' with the slashes removed
    # (e.g. 'mcg/min' -> 'mcg').
    target_unit_str = ''.join(target_unit.split('/')[0:-1])
    unit_convs = pd.read_csv('./medication_dose_convs/all_med_doses_units_' + target_unit_str + '.csv')
    unit_convs.set_index('index', inplace=True)
    unit_convs = unit_convs.to_dict()

    # Minutes from each admin to the next one (all routes together).
    admins['next_taken_time'] = admins['taken_time'].shift(periods=-1)
    admins['next_osler_sid'] = admins['osler_sid'].shift(periods=-1)
    admins['next_pat_enc_csn_sid'] = admins['pat_enc_csn_sid'].shift(periods=-1)
    admins['min_to_next'] = admins.apply(
        lambda row: get_min_to_next(row['osler_sid'], row['next_osler_sid'],
                                    row['pat_enc_csn_sid'], row['next_pat_enc_csn_sid'],
                                    row['taken_time'], row['next_taken_time'],
                                    row['hosp_disch_time']),
        axis=1)

    # Attach weights (inner join: admins of stays without a weight are dropped).
    admins = pd.merge(left=admins, right=weights, on=keys)
    if admins.empty:
        print(drug_name + ' has no admins with a recorded weight')
        return 'no admins'

    admins['dose'] = admins.apply(
        lambda row: convert_units(unit_convs, row['sig'], row['dose_unit'],
                                  row['weight'], row['min_to_next']),
        axis=1)
    admins['dose_by_time'] = admins['dose'] * admins['min_to_next']

    # Sum the weighted doses per stay. LOS is repeated on every admin row of a
    # stay, so take it once instead of summing it (summing would divide the
    # average by the number of admins as well).
    per_stay = admins.groupby(keys, as_index=False).agg({'dose_by_time': 'sum', 'LOS': 'first'})
    per_stay[feat_col] = per_stay['dose_by_time'] / per_stay['LOS']

    result = result.merge(per_stay[keys + [feat_col]], on=keys, how='left')
    return result
def get_drug_dosage_featwRoute(drug_name, df_cohort):
    '''
    Compute, per route, the time-weighted average dose of one drug for every hospital stay.

    Within each route, an administration's converted dose is multiplied by the
    minutes until the next administration of the same encounter and route (or
    until discharge for the last one), summed per stay, and divided by the
    stay's length in minutes.

    Reads the module-level ``CHF_hosp_icu_stays``, ``CHF_hosp_med_admin`` and
    ``weights`` frames, and the CSV files under ``./medication_names_dose``
    and ``./medication_dose_convs``. The "next administration" is found by
    shifting rows, so it depends on the row order of ``CHF_hosp_med_admin``.

    Parameters
    ----------
    drug_name : str
        Drug to process; selects ``./medication_names_dose/<drug_name>.csv``
        and the drug's row in ``med_units.csv``.
    df_cohort : DataFrame
        Stays with ``osler_sid``, ``pat_enc_csn_sid``, ``hosp_admsn_time`` and
        ``hosp_disch_time``; used to derive the length of stay.

    Returns
    -------
    DataFrame or str
        ``osler_sid``, ``pat_enc_csn_sid`` and one
        ``average_dose_<drug_name>_<route>`` column per route, or the string
        ``'no admins'`` when no administration matches the drug.
    '''
    print(drug_name)
    keys = ['osler_sid', 'pat_enc_csn_sid']
    inp = df_cohort.copy()
    result = CHF_hosp_icu_stays[keys].copy()

    # One-column CSV -> Series. DataFrame.squeeze replaces read_csv(squeeze=True),
    # which was removed in pandas 2.0.
    drug = pd.read_csv('./medication_names_dose/' + drug_name + '.csv').squeeze('columns')
    druglist = drug.str.lower().str.strip().to_list()

    # Hospital length of stay (LOS) in minutes.
    inp['LOS'] = (inp['hosp_disch_time'] - inp['hosp_admsn_time']).dt.total_seconds() / 60

    name_matches = CHF_hosp_med_admin['medication_name'].str.contains('|'.join(druglist), na=False)
    admins = CHF_hosp_med_admin[name_matches]
    if admins.shape[0] == 0:
        print(drug_name + ' has no admins')
        return 'no admins'

    # Drop admins without dose or route information and admins dosed in "mL".
    admins = admins.dropna(subset=['sig', 'dose_unit', 'route']).copy()
    admins = admins[~(admins['dose_unit'] == 'mL')].copy()
    admins = admins.merge(inp[keys + ['LOS']], on=keys, how='left')
    if drug_name in ['ketamine']:
        admins['dose_unit'] = admins['dose_unit'].str.replace(' PCA', '', regex=False)

    # Look up the drug's target unit and load the matching conversion table.
    med_unit = pd.read_csv('./medication_dose_convs/med_units.csv')
    med_unit.set_index('drugname', inplace=True)
    target_unit = med_unit.loc[drug_name, 'unit']
    # Everything before the last '/' with the slashes removed
    # (e.g. 'mcg/min' -> 'mcg').
    target_unit_str = ''.join(target_unit.split('/')[0:-1])
    unit_convs = pd.read_csv('./medication_dose_convs/all_med_doses_units_' + target_unit_str + '.csv')
    unit_convs.set_index('index', inplace=True)
    unit_convs = unit_convs.to_dict()

    for rte in admins['route'].unique():
        feat_col = 'average_dose_' + drug_name + '_' + rte
        # Admins given by this route only.
        admins_rte = admins[admins['route'] == rte].copy()
        if admins_rte.shape[0] == 0:
            print(drug_name + ' ' + rte + ' has no admins in this timeframe')
            continue

        # Minutes from each admin to the next one on the same route.
        admins_rte['next_taken_time'] = admins_rte['taken_time'].shift(periods=-1)
        admins_rte['next_osler_sid'] = admins_rte['osler_sid'].shift(periods=-1)
        admins_rte['next_pat_enc_csn_sid'] = admins_rte['pat_enc_csn_sid'].shift(periods=-1)
        admins_rte['min_to_next'] = admins_rte.apply(
            lambda row: get_min_to_next(row['osler_sid'], row['next_osler_sid'],
                                        row['pat_enc_csn_sid'], row['next_pat_enc_csn_sid'],
                                        row['taken_time'], row['next_taken_time'],
                                        row['hosp_disch_time']),
            axis=1)

        # Attach weights (inner join: admins of stays without a weight are dropped).
        admins_rte = pd.merge(left=admins_rte, right=weights, on=keys)
        if admins_rte.empty:
            # Row-wise apply on an empty frame would fail; skip this route.
            print(drug_name + ' ' + rte + ' has no admins with a recorded weight')
            continue

        # Convert units of medication (sig -> dose), then weight by time.
        admins_rte['dose'] = admins_rte.apply(
            lambda row: convert_units(unit_convs, row['sig'], row['dose_unit'],
                                      row['weight'], row['min_to_next']),
            axis=1)
        admins_rte['dose_by_time'] = admins_rte['dose'] * admins_rte['min_to_next']

        # Sum the weighted doses per stay. LOS is repeated on every admin row
        # of a stay, so take it once instead of summing it (summing would
        # divide the average by the number of admins as well).
        per_stay = admins_rte.groupby(keys, as_index=False).agg({'dose_by_time': 'sum', 'LOS': 'first'})
        per_stay[feat_col] = per_stay['dose_by_time'] / per_stay['LOS']

        result = result.merge(per_stay[keys + [feat_col]], on=keys, how='outer')
    return result
# Build the dosage feature table: start from the cohort and left-join one
# average-dose column per drug file.
merged = CHF_hosp_icu_stays.copy()
for dose_path in all_drug_doses:
    # Strip directory and extension to get just the drug name.
    drug_name = dose_path.split('/')[-1].split('.')[0]
    temp = get_drug_dosage_feat(drug_name, CHF_hosp_icu_stays)
    #temp = get_drug_dosage_featwRoute(drug_name, CHF_hosp_icu_stays)
    # The feature function returns a non-DataFrame marker when there is
    # nothing to add for this drug.
    if isinstance(temp, pd.DataFrame):
        merged = merged.merge(temp, on=['osler_sid', 'pat_enc_csn_sid'], how='left')
betamethasone cortisone dexamethasone dexmedetomidine diazepam dobutamine dopamine epinephrine Unit: mcg/mL <class 'float'> 2.0 <class 'NoneType'> None Unit: mcg/mL <class 'float'> 0.05 <class 'NoneType'> None Unit: mcg/mL <class 'float'> 0.13 <class 'NoneType'> None etomidate fentanyl fosphenytoin gabapentin hydralazine hydromorphone isoproterenol ketamine labetalol lacosamide levetiracetam lorazepam mannitol methylprednisolone midazolam milrinone morphine nesiritide nicardipine nitroglycerin nitroprusside norepinephrine oxycodone phenobarbital phenylephrine Unit: mcg/mL <class 'float'> 0.3 <class 'NoneType'> None Unit: mcg/mL <class 'float'> 0.1 <class 'NoneType'> None phenytoin prednisone pregabalin propofol remifentanil riociguat triamcinolone valproate vasopressin
# Summary table: "mean ± std" of every average-dose feature with its unit.
# med_units was referenced here without being defined; load the unit table
# that the dosage functions read (columns include 'drugname' and 'unit').
med_units = pd.read_csv('./medication_dose_convs/med_units.csv')

temp = merged.drop(['osler_sid','pat_enc_csn_sid','hosp_admsn_time','hosp_disch_time','labels'], axis = 1)
temp = pd.concat([temp.mean(), temp.std()], axis = 1).reset_index()
temp.columns = ['medication','mean','std']
temp['mean ± std'] = round(temp['mean'], 4).astype(str) + ' ± '+ round(temp['std'], 4).astype(str)
# Feature columns are named average_dose_<drug>; keep only the drug name.
temp['medication'] = temp['medication'].str.replace('average_dose_','')
medstable = pd.merge(left = temp, right = med_units, how = 'left', left_on = 'medication', right_on = 'drugname')
medstable = medstable.drop(['mean','std','drugname'], axis = 1)
medstable.to_csv('medstable.csv', index = False)
#medstable.to_csv('medstablewRoutes.csv', index = False)
import scipy.stats as stats

# Compare every dosage feature between label 0 and label 1 stays with a
# two-sided Mann-Whitney U test.
non_feature_cols = ['osler_sid','pat_enc_csn_sid','hosp_admsn_time','hosp_disch_time','labels']
df0 = merged[merged.labels == 0].drop(non_feature_cols, axis=1)
df1 = merged[merged.labels == 1].drop(non_feature_cols, axis=1)

pvalues = []
meds = []  # features for which the test could be computed
for col in df0.columns:
    try:
        _, pvalue = stats.mannwhitneyu(x=df0[col].dropna(), y=df1[col].dropna(), alternative = 'two-sided')
    except ValueError:
        # Test not computable for this feature: record a real NaN (not the
        # string 'nan') so the p-value column stays numeric.
        pvalue = np.nan
    else:
        meds.append(col)
    pvalues.append(pvalue)

med_stat_table = pd.concat([df0.median(), df0.count(), df1.median(), df1.count()], axis = 1)
med_stat_table['p-value'] = pvalues
#med_stat_table = med_stat_table.applymap(lambda x: round(x, 3)).reset_index()
med_stat_table = med_stat_table.reset_index()
med_stat_table.columns = ['medication','label 0 median','label 0 counts','label 1 median','label 1 counts','p-value']
med_stat_table['medication'] = med_stat_table['medication'].str.replace('average_dose_','')
med_stat_table.to_csv('med_stat_table.csv', index = False)
med_stat_table.head()
/home/idies/miniconda3/lib/python3.8/site-packages/scipy/stats/stats.py:7028: RuntimeWarning: divide by zero encountered in scalar divide z = (bigu - meanrank) / sd
medication | label 0 median | label 0 counts | label 1 median | label 1 counts | p-value | |
---|---|---|---|---|---|---|
0 | betamethasone | 2443.768997 | 4 | 8343.149212 | 2 | 0.10021 |
1 | cortisone | 1440.934743 | 374 | 848.796527 | 54 | 0.0467538 |
2 | dexamethasone | 6.764266 | 273 | 7.467784 | 55 | 0.269819 |
3 | dexmedetomidine | 0.000025 | 1110 | 0.000016 | 225 | 9.71769e-05 |
4 | diazepam | 1597.359365 | 222 | 1022.481078 | 52 | 0.152724 |
# Non-missing label-0 values of the last feature visited by the loop above.
df0.loc[:, col].dropna()
0 0.001003 4 0.014419 11 0.001893 16 0.005221 18 0.000082 ... 4454 0.000609 4455 0.000152 4456 0.000077 4461 0.001681 4464 0.000712 Name: average_dose_vasopressin, Length: 605, dtype: float64
# Preview the first five rows of the dosage feature table.
merged.head(n=5)
osler_sid | pat_enc_csn_sid | hosp_admsn_time | hosp_disch_time | labels | average_dose_betamethasone | average_dose_cortisone | average_dose_dexamethasone | average_dose_dexmedetomidine | average_dose_diazepam | ... | average_dose_phenylephrine | average_dose_phenytoin | average_dose_prednisone | average_dose_pregabalin | average_dose_propofol | average_dose_remifentanil | average_dose_riociguat | average_dose_triamcinolone | average_dose_valproate | average_dose_vasopressin | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000432FF-214F-460C-9AB9-DB40A7265A82 | 1000414574 | 2017-12-10 17:12:00 | 2018-01-01 13:10:00 | 0.00 | nan | nan | nan | 0.00 | nan | ... | nan | nan | nan | nan | 0.05 | nan | nan | nan | nan | 0.00 |
1 | 00081C8C-F7E4-4352-8786-54D8DCDC2AF9 | 1000468132 | 2016-09-01 10:49:00 | 2016-09-23 17:03:00 | 1.00 | nan | nan | nan | nan | nan | ... | 1.21 | nan | nan | nan | nan | nan | nan | nan | nan | nan |
2 | 000EE0D9-9115-4281-9F96-D15AC0CBAF4E | 1000375948 | 2019-07-16 17:06:00 | 2019-07-27 17:01:00 | 1.00 | nan | nan | nan | 0.00 | nan | ... | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
3 | 000F988E-763F-47E7-A92F-980028A69795 | 1000202595 | 2018-03-22 00:39:00 | 2018-03-31 19:23:00 | 0.00 | nan | nan | 1.86 | nan | nan | ... | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
4 | 000FC796-7639-402E-BA61-CEC8AD52C5E2 | 1000010641 | 2016-09-25 10:04:00 | 2016-10-13 14:19:00 | 0.00 | nan | nan | nan | nan | nan | ... | nan | nan | 18.80 | nan | 0.14 | nan | nan | nan | nan | 0.01 |
5 rows × 47 columns
# Binary medication-class flags split by administration route.
medication_rte_features = get_feat_wRoute(df_hosp_orig=CHF_hosp_icu_stays, df_meds=CHF_hosp_med_admin)
/home/idies/miniconda3/lib/python3.8/site-packages/pandas/core/strings.py:2001: UserWarning: This pattern has match groups. To actually get the groups, use str.extract. return func(self, *args, **kwargs)
# Binary medication-class flags (all routes combined).
medication_features = get_feat(df_hosp_orig=CHF_hosp_icu_stays, df_meds=CHF_hosp_med_admin)

# Save both feature tables without the index column.
for feature_table, csv_name in (
    (medication_rte_features, 'medication_rte_features.csv'),
    (medication_features, 'medication_features.csv'),
):
    feature_table.to_csv(csv_name, index=False)
medication_rte_features
osler_sid | pat_enc_csn_sid | hosp_admsn_time | hosp_disch_time | labels | Acetaminophen_Oral | Acetaminophen_Per NG Tube | Acetaminophen_nan | Acetaminophen_Rectal | Acetaminophen_Nasogastric | ... | Vasopressors_intravenous push | Vasopressors_Right Eye | Vasopressors_Intramuscular | Vasopressors_Topical | Vasopressors_Intraosseous | Vasopressors_Intracameral | Vasopressors_Left Eye | Vasopressors_central venous line infusion | Vasopressors_Intravitreal | Vasopressors_Intracavernosal | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000432FF-214F-460C-9AB9-DB40A7265A82 | 1000414574 | 2017-12-10 17:12:00 | 2018-01-01 13:10:00 | 0.0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 00081C8C-F7E4-4352-8786-54D8DCDC2AF9 | 1000468132 | 2016-09-01 10:49:00 | 2016-09-23 17:03:00 | 1.0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 000EE0D9-9115-4281-9F96-D15AC0CBAF4E | 1000375948 | 2019-07-16 17:06:00 | 2019-07-27 17:01:00 | 1.0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 000F988E-763F-47E7-A92F-980028A69795 | 1000202595 | 2018-03-22 00:39:00 | 2018-03-31 19:23:00 | 0.0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 000FC796-7639-402E-BA61-CEC8AD52C5E2 | 1000010641 | 2016-09-25 10:04:00 | 2016-10-13 14:19:00 | 0.0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4483 | FFD7AAF8-E048-43CF-BA9C-7E41BB86AA1E | 1000447878 | 2017-02-04 00:58:00 | 2017-02-05 16:15:00 | 1.0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4484 | FFE0640C-3AFF-485F-8DA8-EFB313567864 | 1000360263 | 2018-09-15 15:35:00 | 2018-10-03 16:09:00 | 0.0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4485 | FFE68A1A-2EB6-4687-87E4-9D2495B14134 | 1000328439 | 2016-10-19 06:33:00 | 2016-10-24 11:28:00 | 0.0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4486 | FFEDBC8A-4706-41DA-8C1B-F7F03715ABBB | 1000367783 | 2020-06-19 21:44:00 | 2020-09-03 14:17:00 | 1.0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4487 | FFFA2A47-2BC5-4371-B0F9-E89DB30F186A | 1000543690 | 2019-01-04 08:56:00 | 2019-01-11 17:08:00 | 0.0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4488 rows × 608 columns
# Prevalence (%) of every medication class across all cohort stays.
# The class columns are the ones get_feat added on top of the cohort columns;
# selecting them by name avoids a hard-coded column offset and cohort size.
med_class_cols = [c for c in medication_features.columns if c not in CHF_hosp_icu_stays.columns]
df = pd.DataFrame()
df['meds'] = med_class_cols
df['count'] = medication_features[med_class_cols].sum().values / len(medication_features) * 100
df = df.sort_values('count')

sns.set_theme()
plt.figure(figsize = (18,18))
plt.barh(df['meds'],df['count'])
plt.title('Medication Class Prevalence in CHF Hospital Stays (N = {})'.format(len(CHF_hosp_icu_stays)), fontsize = 18)
plt.xlim([0, 102])
plt.xticks(fontsize = 14)
plt.yticks(fontsize = 14)
plt.xlabel('prevalence (%)',fontsize = 16)
plt.show()
NameErrorTraceback (most recent call last) <ipython-input-368-4514580d180f> in <module> 1 df = pd.DataFrame() ----> 2 df['meds'] = medication_features.columns[7:] 3 df['count'] = medication_features.iloc[:, 7:].sum().values/4488*100 4 df = df.sort_values('count') 5 NameError: name 'medication_features' is not defined
# Zoom in on the medication classes given in more than 80 % of stays.
df = df[df['count'] > 80]
plt.figure(figsize = (7,3))
plt.barh(df['meds'],df['count'])
# N comes from the cohort table instead of a hard-coded figure.
plt.title('Medication Class Prevalence in CHF Hospital Stays (zoomed in on prev > 80 %) (N = {:,})'.format(len(CHF_hosp_icu_stays)), fontsize = 14)
plt.xlim([0, 102])
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
plt.xlabel('prevalence (%)',fontsize = 12)
plt.show()
# Prevalence (%) of every medication class, separately for label 0 and label 1.
# Class columns are selected by name (the ones get_feat added to the cohort
# columns) rather than by a hard-coded column offset.
med_class_cols = [c for c in medication_features.columns if c not in CHF_hosp_icu_stays.columns]
temp0 = medication_features[medication_features.labels == 0]
temp1 = medication_features[medication_features.labels == 1]

df0 = pd.DataFrame()
df1 = pd.DataFrame()
df0['meds'] = med_class_cols
df0['count'] = temp0[med_class_cols].sum().values/temp0.shape[0]*100
df1['meds'] = med_class_cols
df1['count'] = temp1[med_class_cols].sum().values/temp1.shape[0]*100
df0 = df0.sort_values('count')
df1 = df1.sort_values('count')

# Five most prevalent classes among label 0 stays; N is taken from the data.
#plt.figure(figsize = (18,18))
plt.barh(df0.iloc[-5:, :]['meds'],df0.iloc[-5:, :]['count'])
plt.title('Label 0: CHF Hospital Stays w/o Readmissions (N = {:,})'.format(temp0.shape[0]), fontsize = 14)
plt.xlim([0, 102])
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
plt.xlabel('prevalence (%)',fontsize = 12)
plt.show()

# Five most prevalent classes among label 1 stays.
#plt.figure(figsize = (18,18))
plt.barh(df1.iloc[-5:, :]['meds'],df1.iloc[-5:, :]['count'])
plt.title('Label 1: CHF Hospital Stays w/ Readmissions (N = {:,})'.format(temp1.shape[0]), fontsize = 14)
plt.xlim([0, 102])
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
plt.xlabel('prevalence (%)',fontsize = 12)
plt.show()
df1.iloc[-5:, :]
meds | count | |
---|---|---|
0 | Acetaminophen | 84.570597 |
5 | Anticoagulants | 90.393013 |
35 | Opioids | 91.848617 |
27 | Laxatives | 94.468705 |
19 | CrystalloidFluids | 98.835517 |
# Drop admins missing dose info.
med_admin = med_admin.dropna(subset=['dose_unit'])

# Save off all dose units with their counts and proportions.
# Name the output columns explicitly ('index' = unit, 'dose_unit' = count):
# the default names produced by value_counts().reset_index() differ between
# pandas 1.x and 2.x, and relying on them breaks the division below on 2.x.
unit_counts = med_admin['dose_unit'].value_counts()
admins = unit_counts.rename_axis('index').reset_index(name='dose_unit')
admins['prop'] = admins['dose_unit'] / admins['dose_unit'].sum()
admins.to_csv('all_med_doses_units_raw.csv', index=False)

# Missing-value matrix for the remaining administrations.
msno.matrix(med_admin, figsize = (5,5))
<AxesSubplot:>
# Interquartile-range helper used for the dose summary statistics.
def iqr(x):
    '''
    Return the interquartile range (75th minus 25th percentile) of ``x``.

    Missing values are dropped first. Returns NaN when nothing is left or
    when the range cannot be computed (for example, ``x`` has no ``dropna``
    method or holds non-numeric data).
    '''
    try:
        x = x.dropna()
        if len(x) == 0:
            return np.nan
        q75, q25 = np.percentile(x, [75, 25])
        return q75 - q25
    except Exception:
        # Best effort: any computation failure yields NaN, but interpreter
        # signals such as KeyboardInterrupt are no longer swallowed.
        return np.nan
# Descriptive statistics of every average-dose feature (one row per drug).
# Select the feature columns by name rather than by position.
temp = merged.filter(regex='^average_dose_')

med_stat = pd.DataFrame()
# pandas skips NaN values in these reductions.
med_stat['mean'] = temp.mean()
med_stat['std'] = temp.std()
med_stat['min'] = temp.min()
med_stat['median'] = temp.median()
med_stat['max'] = temp.max()
# IQR per feature column. Applying iqr along axis=1 would compute it per
# stay (row) and then misalign those values onto the medications.
med_stat['IQR'] = temp.apply(iqr)

med_stat = med_stat.reset_index()
med_stat['index'] = med_stat['index'].str.replace('average_dose_','')
med_stat = med_stat.rename(columns = {'index':'medication'})
med_stat
medication | mean | std | min | median | max | IQR | |
---|---|---|---|---|---|---|---|
0 | betamethasone | 4578.35 | 4087.01 | 0.00 | 5391.91 | 10732.27 | 121.88 |
1 | cortisone | 40941.70 | 629273.20 | 0.00 | 1354.81 | 12984738.77 | 36.67 |
2 | dexamethasone | 20.58 | 36.61 | 0.00 | 7.23 | 292.45 | 92.16 |
3 | dexmedetomidine | 0.00 | 0.01 | 0.00 | 0.00 | 0.50 | 24761.21 |
4 | diazepam | 2353.79 | 2140.29 | 12.01 | 1514.07 | 9610.09 | 218.52 |
5 | dobutamine | 0.50 | 3.26 | 0.00 | 0.04 | 44.65 | nan |
6 | dopamine | 7.40 | 141.13 | 0.00 | 0.02 | 2765.79 | 2633.51 |
7 | epinephrine | 0.12 | 2.65 | 0.00 | 0.00 | 104.48 | nan |
8 | etomidate | 164.15 | 164.91 | 0.22 | 123.91 | 1649.43 | 1448.10 |
9 | fentanyl | 0.11 | 0.63 | 0.00 | 0.03 | 15.14 | 483.71 |
10 | fosphenytoin | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 216.62 |
11 | gabapentin | 70170.98 | 139540.80 | 40.61 | 17939.13 | 1191163.03 | 0.03 |
12 | hydralazine | 3070.74 | 3714.80 | 25.45 | 1915.75 | 49069.42 | 75871.36 |
13 | hydromorphone | 282.58 | 1664.38 | 0.02 | 94.03 | 37710.51 | 17.63 |
14 | isoproterenol | 1.46 | 4.80 | 0.00 | 0.01 | 29.43 | 0.00 |
15 | ketamine | 98.37 | 378.74 | 0.00 | 0.07 | 3372.39 | 55.27 |
16 | labetalol | 6530.43 | 15221.57 | 2.42 | 2888.69 | 194604.18 | 0.29 |
17 | lacosamide | 11024.32 | 22797.40 | 409.19 | 4182.32 | 177035.40 | 1376.44 |
18 | levetiracetam | 120199.76 | 285055.79 | 1524.17 | 42141.05 | 3993069.07 | 74.37 |
19 | lorazepam | 277.87 | 375.27 | 0.06 | 160.16 | 3999.02 | 399.99 |
20 | mannitol | 252149.75 | 305406.73 | 8.37 | 137638.44 | 1095231.50 | 124.82 |
21 | methylprednisolone | 606.60 | 2037.34 | 0.52 | 113.18 | 29156.47 | 226.84 |
22 | midazolam | 274.55 | 429.10 | 0.00 | 137.18 | 4998.83 | 333.16 |
23 | milrinone | 0.01 | 0.01 | 0.00 | 0.00 | 0.16 | 231.45 |
24 | morphine | 1851.89 | 8802.39 | 0.00 | 571.60 | 160099.79 | 1426.39 |
25 | nesiritide | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.69 |
26 | nicardipine | 1798.72 | 23445.61 | 0.00 | 5.67 | 398497.08 | 0.03 |
27 | nitroglycerin | 136.07 | 2876.34 | 0.00 | 0.60 | 86532.47 | 0.05 |
28 | nitroprusside | 1.80 | 39.15 | 0.00 | 0.01 | 1177.87 | 476.97 |
29 | norepinephrine | 0.20 | 4.39 | 0.00 | 0.00 | 134.04 | 8669.32 |
30 | oxycodone | 955.01 | 1370.15 | 6.32 | 432.35 | 13292.48 | 199.57 |
31 | phenobarbital | 517.43 | 1491.42 | 2.22 | 30.92 | 5650.02 | 1339.15 |
32 | phenylephrine | 36.97 | 692.08 | 0.00 | 0.56 | 18927.16 | 81.31 |
33 | phenytoin | 67.78 | 176.08 | 0.00 | 0.00 | 1159.71 | 245.24 |
34 | prednisone | 48.74 | 108.34 | 0.29 | 13.20 | 969.63 | 318.60 |
35 | pregabalin | 7699.11 | 11609.82 | 94.01 | 4245.61 | 90412.48 | 270.69 |
36 | propofol | 61.63 | 521.73 | 0.00 | 0.14 | 16575.43 | 107.86 |
37 | remifentanil | 0.00 | nan | 0.00 | 0.00 | 0.00 | 86.31 |
38 | riociguat | 267.70 | 315.38 | 44.69 | 267.70 | 490.71 | 54723.07 |
39 | triamcinolone | 10144.81 | 18509.90 | 0.00 | 0.00 | 75629.91 | 3685.62 |
40 | valproate | 535.08 | 1159.17 | 8.95 | 128.80 | 5973.81 | 598.12 |
41 | vasopressin | 0.15 | 1.81 | 0.00 | 0.00 | 39.21 | 682.82 |
all_drug_doses
# Put every per-drug name list side by side (one column per dose file), then
# melt into long form: one row per (medication, medication name) pair.
df_med = pd.DataFrame()
for dose_csv in all_drug_doses:
    drug_class = pd.read_csv(dose_csv)
    #med_dict[all_drug_classes[i].split('/')[-1].split('.')[0]] = drug_class.iloc[:, 0].to_list()
    df_med = pd.concat([df_med, drug_class], axis=1)

# Columns have different lengths, so the padding NaNs are dropped after melting.
meds = df_med.melt().dropna()
meds.columns = ['medication', 'medication name']
meds
#meds = meds.set_index('drug class')
medication | medication name | |
---|---|---|
0 | betamethasone | betamethasone |
1 | betamethasone | BSP 0820 |
2 | betamethasone | Celestone |
3 | betamethasone | Selestoject |
16 | cortisone | cortisone |
... | ... | ... |
643 | valproate | Stavzor |
644 | valproate | Depacon |
656 | vasopressin | vasopressin |
657 | vasopressin | Vasostrict |
658 | vasopressin | Pitressin |
178 rows × 2 columns
# med_units was referenced here without being defined; load the unit table
# that the dosage functions read (columns include 'drugname' and 'unit').
med_units = pd.read_csv('./medication_dose_convs/med_units.csv')

# Dose statistics with units, one row per medication.
temp = med_stat.merge(med_units, how = 'left', left_on='medication', right_on='drugname').drop('drugname', axis=1)
# Long form: one row per (medication, medication name) pair.
merged = temp.merge(meds, how='left', on='medication')

# `grouped` was written to disk while the code that built it was commented
# out (and called x.join(',') instead of ','.join(x)). Rebuild it as one row
# per medication with its names collapsed into a comma-separated string.
# Grouping on 'medication' alone keeps drugs whose statistics contain NaN.
name_lists = (
    meds.groupby('medication')['medication name']
    .apply(lambda names: ','.join(names.astype(str)))
    .reset_index()
)
grouped = temp.merge(name_lists, how='left', on='medication')
grouped.to_csv('med_dose.csv')