Module qbiome.qutil
Expand source code
import statsmodels.api as sm
lowess = sm.nonparametric.lowess
import warnings
import pylab as plt
import pandas as pd
def saveFIG(filename='tmp.pdf',
            axis=False,
            transparent=True):
    """Write the current pylab figure to disk with tight framing.

    Args:
        filename (str, optional): output path handed to ``savefig``. (Default value = 'tmp.pdf')
        axis (bool, optional): when False, the major tick locators of both the
            x and y axis of the current axes are replaced by ``NullLocator``.
            (Default value = False)
        transparent (bool, optional): forwarded to ``savefig``. (Default value = True)

    Returns:
        None
    """
    import pylab as plt

    # Let the subplot area fill the whole canvas and drop the data margins.
    plt.subplots_adjust(left=0, right=1, bottom=0, top=1,
                        wspace=0, hspace=0)
    plt.margins(0, 0)

    if not axis:
        current_axes = plt.gca()
        for axis_obj in (current_axes.xaxis, current_axes.yaxis):
            axis_obj.set_major_locator(plt.NullLocator())

    plt.savefig(filename,
                dpi=300,
                bbox_inches='tight',
                pad_inches=.1,
                transparent=transparent)
def qsmooth(df,
            index,
            columns,
            var=None,
            interpolate=False,
            alpha=.9,
            lowess_fraction=0.6,
            normalize=True):
    """Pivot a long-format dataframe and smooth the selected columns.

    Every selected column is passed through an exponentially weighted mean
    (``ewm(alpha=alpha).mean()``) and then through LOWESS. The per-column
    results are merged on the pivot index.

    Args:
        df (pandas.DataFrame): dataframe in long format
        index (str): pivot index; also the name of the index of the result
        columns (str): pivot columns
        var (list[str], optional): columns to smooth. A non-list value triggers
            a warning and is wrapped in a list. If None, all pivoted columns
            are used (Default value = None)
        interpolate (bool, optional): fill NaNs with an order-2 spline fit
            before smoothing (Default value = False)
        alpha (float, optional): parameter passed to exponential smoothing (Default value = .9)
        lowess_fraction (float): ``frac`` argument passed to LOWESS (Default value = 0.6)
        normalize (bool): if True, min-max scale each smoothed column (Default value = True)

    Returns:
        pandas.DataFrame: smoothed columns, indexed by ``index``
    """
    timeunit = index
    df = df.pivot(index=index, columns=columns)
    # Column labels are expected to be (value, column) pairs after the pivot;
    # only the second element is kept.
    df.columns = [x[1] for x in df.columns]
    if interpolate:
        df = df.interpolate(method='spline', order=2, limit_direction='both')

    if var is not None:
        if not isinstance(var, list):
            # Previously called the undefined name ``warning`` and crashed
            # with NameError instead of warning.
            warnings.warn('var needs to be a list')
            var = [var]
        df_ = df[var]
        biomes = var
    else:
        df_ = df.copy()
        biomes = df.columns

    DF = None
    for i in biomes:
        df__ = df_[i].ewm(alpha=alpha).mean()
        # lowess is called as lowess(y, x); its output is loaded below as
        # two columns: the time axis and the smoothed values.
        w = lowess(df__.values, df__.index.values, frac=lowess_fraction)
        df__ = pd.DataFrame(w, columns=[timeunit, i]).set_index(timeunit)
        if normalize:
            df__ = (df__ - df__.min()) / (df__.max() - df__.min())
        if DF is None:
            DF = df__.reset_index()
        else:
            DF = DF.merge(df__.reset_index(), on=timeunit)
    return DF.set_index(timeunit)
def qplot(df,
          preindex,
          index,
          columns,
          timeunit=None,
          var=None,
          interpolate=False,
          alpha=.9,
          lowess_fraction=0.6,
          normalize=True,
          ax=None,
          xlim0=None,
          xlim1=None,
          legend_label=None,
          filename=None,
          transparent=True,
          save=True,
          fontsize=18,
          title=None):
    """Pivot a long-format dataframe, smooth the selected columns and plot them.

    Every selected column is passed through an exponentially weighted mean
    (``ewm(alpha=alpha).mean()``) and then through LOWESS, optionally min-max
    scaled, and drawn on ``ax``.

    Args:
        df (pandas.DataFrame): dataframe in long format
        preindex (str): if not None, set index to preindex before pivoting
        index (str): pivot index
        columns (str): pivot columns
        timeunit (str, optional): label for unit of time. If None, set to index (Default value = None)
        var (list[str], optional): list of variables to plot. A non-list value
            triggers a warning and is wrapped in a list (Default value = None)
        interpolate (bool, optional): remove Nans by spline fit (Default value = False)
        alpha (float, optional): parameter passed to exponential smoothing (Default value = .9)
        lowess_fraction (float): smoothing coefficient for LOWESS (Default value = 0.6)
        normalize (bool): if True normalize (Default value = True)
        ax (axis handle): If None generate figure and axes (Default value = None)
        xlim0 (float, optional): left limit of x axis (Default value = None)
        xlim1 (float, optional): right limit of x axis (Default value = None)
        legend_label (str, optional): optional suffix added to legend (Default value = None)
        filename (str, optional): output filename including extension. If None,
            the legend entries joined by '_' plus '.png' (Default value = None)
        transparent (bool, optional): if True background is transparent (Default value = True)
        save (bool, optional): if True save file (Default value = True)
        fontsize (int, optional): fontsize (Default value = 18)
        title (str, optional): title string (Default value = None)

    Returns:
        pandas.DataFrame: concatenated dataframe plotted
    """
    if timeunit is None:
        timeunit = index

    # The documented contract is "if not None"; calling set_index(None)
    # unconditionally raised for preindex=None.
    if preindex is not None:
        df = df.set_index(preindex)
    df = df.pivot(index=index, columns=columns)
    # Column labels are expected to be (value, column) pairs after the pivot;
    # only the second element is kept.
    df.columns = [x[1] for x in df.columns]
    if interpolate:
        df = df.interpolate(method='spline', order=2, limit_direction='both')

    if ax is None:
        plt.figure(figsize=[8, 4])
        ax = plt.gca()

    if var is not None:
        if not isinstance(var, list):
            # Previously called the undefined name ``warning`` and crashed
            # with NameError instead of warning.
            warnings.warn('var needs to be a list')
            var = [var]
        df_ = df[var]
        biomes = var
    else:
        df_ = df.copy()
        biomes = df.columns

    DF = None
    for i in biomes:
        df__ = df_[i].ewm(alpha=alpha).mean()
        # lowess is called as lowess(y, x); its output is loaded below as
        # two columns: the time axis and the smoothed values.
        w = lowess(df__.values, df__.index.values, frac=lowess_fraction)
        df__ = pd.DataFrame(w, columns=[timeunit, i]).set_index(timeunit)
        if normalize:
            df__ = (df__ - df__.min()) / (df__.max() - df__.min())
        df__.plot(ax=ax, label=i, style='-', lw=4, ms=8, alpha=.75)
        if DF is None:
            DF = df__.reset_index()
        else:
            DF = DF.merge(df__.reset_index(), on=timeunit)

    if legend_label is not None:
        biomes = [x + legend_label for x in biomes]
    ax.legend(biomes, loc='center left', bbox_to_anchor=(1, 0.5))
    ax.set_xlim(xlim0, xlim1)

    if normalize:
        ax.set_ylim(-.1, 1.1)
        tg = 'normalized '
    else:
        tg = ''
    ax.set_ylabel(tg + 'concentration', fontsize=fontsize, labelpad=10, color='.5')
    ax.set_xlabel('[' + timeunit + ']', fontsize=fontsize, labelpad=10, color='.5')
    if title is not None:
        ax.set_title(title, y=1.03, fontsize=fontsize + 2, fontweight='demi')
    ax.tick_params(axis='x', labelsize=fontsize, labelcolor='.5')
    ax.tick_params(axis='y', labelsize=fontsize, labelcolor='.5')

    if save:
        # The default name is only needed (and only built) when saving.
        if filename is None:
            filename = '_'.join(biomes) + '.png'
        saveFIG(filename, axis=True, transparent=transparent)
    return DF.set_index(timeunit)
def customDataFormatter(datafile,
                        metafile,
                        META_PROP,
                        COL_SELECT,
                        BIOMES=None,
                        sample_id_col_names=('Samples', 'sequence_barcode')):
    """Merge a data CSV with a metadata CSV and return it in long format.

    Both files are joined on their sample id column, the columns are renamed
    with ``COL_SELECT``, and the biome columns are melted into
    ``variable``/``value`` pairs. After renaming, the merged table must
    contain the columns 'sample_id', 'subject_id' and 'week'.

    Args:
        datafile (str): path to datafile
        metafile (str): path to metadatafile
        META_PROP (list[str]): list of meta properties in metadata file
        COL_SELECT (dict[str,str]): dict to rename columns for sample_id,
            subject_id, timeunit. The merged sample id column is called
            'Samples' before renaming.
        BIOMES (list[str]): list of biomes of interest. If None, every data
            column after the first one is used (Default value = None)
        sample_id_col_names: names of the sample id column in the data file and
            in the metadata file, in that order
            (Default value = ('Samples', 'sequence_barcode'))

    Returns:
        pandas.DataFrame,pandas.DataFrame: data and subject_id vs meta properties
    """
    data_id_col, meta_id_col = sample_id_col_names[0], sample_id_col_names[1]

    df_ = pd.read_csv(datafile).set_index(data_id_col)
    mf = pd.read_csv(metafile).set_index(meta_id_col)

    # Name the sample id 'Samples' on BOTH sides. Previously only the metadata
    # side was renamed, so any data id column not literally called 'Samples'
    # broke the merge below.
    df_.index.name = 'Samples'
    mf.index.name = 'Samples'
    df = df_.reset_index().merge(mf.reset_index(), on='Samples')
    df = df.rename(columns=COL_SELECT)

    if BIOMES is None:
        # NOTE(review): the first non-id data column is skipped here;
        # presumably it is not a biome column - confirm against the data files.
        BIOMES = list(df_.columns.values[1:])
    else:
        BIOMES = list(BIOMES)

    df1 = df[['sample_id'] + BIOMES]
    df2 = df[['sample_id', 'subject_id', 'week']]
    data = df1.melt(id_vars='sample_id', value_vars=BIOMES).merge(df2, on='sample_id')
    property_map = df[['subject_id'] + list(META_PROP)]
    return data, property_map
Functions
def customDataFormatter(datafile, metafile, META_PROP, COL_SELECT, BIOMES=None, sample_id_col_names=['Samples', 'sequence_barcode'])
-
custom data formatter. See example.
Args
datafile
:str
- path to datafile
metafile
:str
- path to metadatafile
META_PROP
:list[str]
- list of meta properties in metadata file
COL_SELECT
:dict[str,str]
- dict to rename columns for sample_id, subject_id, timeunit
BIOMES
:list[str]
- list of biomes of interest (Default value = None)
sample_id_col_names
- names of the sample id column in the data file and in the metadata file, in that order (Default value = ['Samples','sequence_barcode'])
Returns
pandas.DataFrame,pandas.DataFrame
- data and subject_id vs meta properties
Expand source code
def customDataFormatter(datafile,
                        metafile,
                        META_PROP,
                        COL_SELECT,
                        BIOMES=None,
                        sample_id_col_names=('Samples', 'sequence_barcode')):
    """Merge a data CSV with a metadata CSV and return it in long format.

    Both files are joined on their sample id column, the columns are renamed
    with ``COL_SELECT``, and the biome columns are melted into
    ``variable``/``value`` pairs. After renaming, the merged table must
    contain the columns 'sample_id', 'subject_id' and 'week'.

    Args:
        datafile (str): path to datafile
        metafile (str): path to metadatafile
        META_PROP (list[str]): list of meta properties in metadata file
        COL_SELECT (dict[str,str]): dict to rename columns for sample_id,
            subject_id, timeunit. The merged sample id column is called
            'Samples' before renaming.
        BIOMES (list[str]): list of biomes of interest. If None, every data
            column after the first one is used (Default value = None)
        sample_id_col_names: names of the sample id column in the data file and
            in the metadata file, in that order
            (Default value = ('Samples', 'sequence_barcode'))

    Returns:
        pandas.DataFrame,pandas.DataFrame: data and subject_id vs meta properties
    """
    data_id_col, meta_id_col = sample_id_col_names[0], sample_id_col_names[1]

    df_ = pd.read_csv(datafile).set_index(data_id_col)
    mf = pd.read_csv(metafile).set_index(meta_id_col)

    # Name the sample id 'Samples' on BOTH sides. Previously only the metadata
    # side was renamed, so any data id column not literally called 'Samples'
    # broke the merge below.
    df_.index.name = 'Samples'
    mf.index.name = 'Samples'
    df = df_.reset_index().merge(mf.reset_index(), on='Samples')
    df = df.rename(columns=COL_SELECT)

    if BIOMES is None:
        # NOTE(review): the first non-id data column is skipped here;
        # presumably it is not a biome column - confirm against the data files.
        BIOMES = list(df_.columns.values[1:])
    else:
        BIOMES = list(BIOMES)

    df1 = df[['sample_id'] + BIOMES]
    df2 = df[['sample_id', 'subject_id', 'week']]
    data = df1.melt(id_vars='sample_id', value_vars=BIOMES).merge(df2, on='sample_id')
    property_map = df[['subject_id'] + list(META_PROP)]
    return data, property_map
def qplot(df, preindex, index, columns, timeunit=None, var=None, interpolate=False, alpha=0.9, lowess_fraction=0.6, normalize=True, ax=None, xlim0=None, xlim1=None, legend_label=None, filename=None, transparent=True, save=True, fontsize=18, title=None)
-
plot dataframes after pivoting and slicing
Args
df
:pandas.DataFrame
- dataframe in long format
preindex
:str
- if not None, set index to preindex
index
:str
- pivot index
columns
:str
- pivot columns
timeunit
:str
, optional- label for unit of time. If None, set to index (Default value = None)
var
:list[str]
, optional- list of variables to plot (Default value = None)
interpolate
:bool
, optional- remove Nans by spline fit (Default value = False)
alpha
:float
, optional- parameter passed to exponential smoothing (Default value = .9)
lowess_fraction
:float
- smoothing coefficient for LOWESS (Default value = 0.6)
normalize
:bool
- if True normalize (Default value = True)
ax
:axis handle
- If None generate figure and axes (Default value = None)
xlim0
:float
, optional- left limit of x axis (Default value = None)
xlim1
:float
, optional- right limit of x axis (Default value = None)
legend_label
:str
, optional- optional suffix added to legend (Default value = None)
filename
:str
, optional- output filename including extension (Default value = None)
transparent
:bool
, optional- if True background is transparent (Default value = True)
save
:bool
, optional- if True save file (Default value = True)
fontsize
:int
, optional- fontsize (Default value = 18)
title
:str
, optional- title string (Default value = None)
Returns
pandas.DataFrame
- concatenated dataframe plotted
Expand source code
def qplot(df,
          preindex,
          index,
          columns,
          timeunit=None,
          var=None,
          interpolate=False,
          alpha=.9,
          lowess_fraction=0.6,
          normalize=True,
          ax=None,
          xlim0=None,
          xlim1=None,
          legend_label=None,
          filename=None,
          transparent=True,
          save=True,
          fontsize=18,
          title=None):
    """Pivot a long-format dataframe, smooth the selected columns and plot them.

    Every selected column is passed through an exponentially weighted mean
    (``ewm(alpha=alpha).mean()``) and then through LOWESS, optionally min-max
    scaled, and drawn on ``ax``.

    Args:
        df (pandas.DataFrame): dataframe in long format
        preindex (str): if not None, set index to preindex before pivoting
        index (str): pivot index
        columns (str): pivot columns
        timeunit (str, optional): label for unit of time. If None, set to index (Default value = None)
        var (list[str], optional): list of variables to plot. A non-list value
            triggers a warning and is wrapped in a list (Default value = None)
        interpolate (bool, optional): remove Nans by spline fit (Default value = False)
        alpha (float, optional): parameter passed to exponential smoothing (Default value = .9)
        lowess_fraction (float): smoothing coefficient for LOWESS (Default value = 0.6)
        normalize (bool): if True normalize (Default value = True)
        ax (axis handle): If None generate figure and axes (Default value = None)
        xlim0 (float, optional): left limit of x axis (Default value = None)
        xlim1 (float, optional): right limit of x axis (Default value = None)
        legend_label (str, optional): optional suffix added to legend (Default value = None)
        filename (str, optional): output filename including extension. If None,
            the legend entries joined by '_' plus '.png' (Default value = None)
        transparent (bool, optional): if True background is transparent (Default value = True)
        save (bool, optional): if True save file (Default value = True)
        fontsize (int, optional): fontsize (Default value = 18)
        title (str, optional): title string (Default value = None)

    Returns:
        pandas.DataFrame: concatenated dataframe plotted
    """
    if timeunit is None:
        timeunit = index

    # The documented contract is "if not None"; calling set_index(None)
    # unconditionally raised for preindex=None.
    if preindex is not None:
        df = df.set_index(preindex)
    df = df.pivot(index=index, columns=columns)
    # Column labels are expected to be (value, column) pairs after the pivot;
    # only the second element is kept.
    df.columns = [x[1] for x in df.columns]
    if interpolate:
        df = df.interpolate(method='spline', order=2, limit_direction='both')

    if ax is None:
        plt.figure(figsize=[8, 4])
        ax = plt.gca()

    if var is not None:
        if not isinstance(var, list):
            # Previously called the undefined name ``warning`` and crashed
            # with NameError instead of warning.
            warnings.warn('var needs to be a list')
            var = [var]
        df_ = df[var]
        biomes = var
    else:
        df_ = df.copy()
        biomes = df.columns

    DF = None
    for i in biomes:
        df__ = df_[i].ewm(alpha=alpha).mean()
        # lowess is called as lowess(y, x); its output is loaded below as
        # two columns: the time axis and the smoothed values.
        w = lowess(df__.values, df__.index.values, frac=lowess_fraction)
        df__ = pd.DataFrame(w, columns=[timeunit, i]).set_index(timeunit)
        if normalize:
            df__ = (df__ - df__.min()) / (df__.max() - df__.min())
        df__.plot(ax=ax, label=i, style='-', lw=4, ms=8, alpha=.75)
        if DF is None:
            DF = df__.reset_index()
        else:
            DF = DF.merge(df__.reset_index(), on=timeunit)

    if legend_label is not None:
        biomes = [x + legend_label for x in biomes]
    ax.legend(biomes, loc='center left', bbox_to_anchor=(1, 0.5))
    ax.set_xlim(xlim0, xlim1)

    if normalize:
        ax.set_ylim(-.1, 1.1)
        tg = 'normalized '
    else:
        tg = ''
    ax.set_ylabel(tg + 'concentration', fontsize=fontsize, labelpad=10, color='.5')
    ax.set_xlabel('[' + timeunit + ']', fontsize=fontsize, labelpad=10, color='.5')
    if title is not None:
        ax.set_title(title, y=1.03, fontsize=fontsize + 2, fontweight='demi')
    ax.tick_params(axis='x', labelsize=fontsize, labelcolor='.5')
    ax.tick_params(axis='y', labelsize=fontsize, labelcolor='.5')

    if save:
        # The default name is only needed (and only built) when saving.
        if filename is None:
            filename = '_'.join(biomes) + '.png'
        saveFIG(filename, axis=True, transparent=transparent)
    return DF.set_index(timeunit)
def qsmooth(df, index, columns, var=None, interpolate=False, alpha=0.9, lowess_fraction=0.6, normalize=True)
-
smooth dataframes after pivoting and slicing
Args
df
:pandas.DataFrame
- dataframe in long format
index
:str
- pivot index
columns
:str
- pivot columns
var
:list[str]
, optional- list of variables to plot (Default value = None)
interpolate
:bool
, optional- remove Nans by spline fit (Default value = False)
alpha
:float
, optional- parameter passed to exponential smoothing (Default value = .9)
lowess_fraction
:float
- smoothing coefficient for LOWESS (Default value = 0.6)
normalize
:bool
- if True normalize (Default value = True)
Returns
pandas.DataFrame
- smooth dataframe
Expand source code
def qsmooth(df,
            index,
            columns,
            var=None,
            interpolate=False,
            alpha=.9,
            lowess_fraction=0.6,
            normalize=True):
    """Pivot a long-format dataframe and smooth the selected columns.

    Every selected column is passed through an exponentially weighted mean
    (``ewm(alpha=alpha).mean()``) and then through LOWESS. The per-column
    results are merged on the pivot index.

    Args:
        df (pandas.DataFrame): dataframe in long format
        index (str): pivot index; also the name of the index of the result
        columns (str): pivot columns
        var (list[str], optional): columns to smooth. A non-list value triggers
            a warning and is wrapped in a list. If None, all pivoted columns
            are used (Default value = None)
        interpolate (bool, optional): fill NaNs with an order-2 spline fit
            before smoothing (Default value = False)
        alpha (float, optional): parameter passed to exponential smoothing (Default value = .9)
        lowess_fraction (float): ``frac`` argument passed to LOWESS (Default value = 0.6)
        normalize (bool): if True, min-max scale each smoothed column (Default value = True)

    Returns:
        pandas.DataFrame: smoothed columns, indexed by ``index``
    """
    timeunit = index
    df = df.pivot(index=index, columns=columns)
    # Column labels are expected to be (value, column) pairs after the pivot;
    # only the second element is kept.
    df.columns = [x[1] for x in df.columns]
    if interpolate:
        df = df.interpolate(method='spline', order=2, limit_direction='both')

    if var is not None:
        if not isinstance(var, list):
            # Previously called the undefined name ``warning`` and crashed
            # with NameError instead of warning.
            warnings.warn('var needs to be a list')
            var = [var]
        df_ = df[var]
        biomes = var
    else:
        df_ = df.copy()
        biomes = df.columns

    DF = None
    for i in biomes:
        df__ = df_[i].ewm(alpha=alpha).mean()
        # lowess is called as lowess(y, x); its output is loaded below as
        # two columns: the time axis and the smoothed values.
        w = lowess(df__.values, df__.index.values, frac=lowess_fraction)
        df__ = pd.DataFrame(w, columns=[timeunit, i]).set_index(timeunit)
        if normalize:
            df__ = (df__ - df__.min()) / (df__.max() - df__.min())
        if DF is None:
            DF = df__.reset_index()
        else:
            DF = DF.merge(df__.reset_index(), on=timeunit)
    return DF.set_index(timeunit)
def saveFIG(filename='tmp.pdf', axis=False, transparent=True)
-
save fig for publication
Args
filename
:str
, optional- filename to save figure. (Default value = 'tmp.pdf')
axis
:bool
, optional- if True then show axis. (Default value = False)
transparent
:bool
, optional- if True background is transparent. (Default value = True)
Returns:
Expand source code
def saveFIG(filename='tmp.pdf',
            axis=False,
            transparent=True):
    """Write the current pylab figure to disk with tight framing.

    Args:
        filename (str, optional): output path handed to ``savefig``. (Default value = 'tmp.pdf')
        axis (bool, optional): when False, the major tick locators of both the
            x and y axis of the current axes are replaced by ``NullLocator``.
            (Default value = False)
        transparent (bool, optional): forwarded to ``savefig``. (Default value = True)

    Returns:
        None
    """
    import pylab as plt

    # Let the subplot area fill the whole canvas and drop the data margins.
    plt.subplots_adjust(left=0, right=1, bottom=0, top=1,
                        wspace=0, hspace=0)
    plt.margins(0, 0)

    if not axis:
        current_axes = plt.gca()
        for axis_obj in (current_axes.xaxis, current_axes.yaxis):
            axis_obj.set_major_locator(plt.NullLocator())

    plt.savefig(filename,
                dpi=300,
                bbox_inches='tight',
                pad_inches=.1,
                transparent=transparent)