Module qbiome.quantizer
Expand source code
import re
import string
import pickle
import numpy as np
import pandas as pd
from scipy import interpolate
from sklearn.ensemble import RandomForestRegressor
# helper functions for sorting
# https://stackoverflow.com/questions/5967500/how-to-correctly-sort-a-string-with-a-number-inside
def _atof(text):
try:
retval = float(text)
except ValueError:
retval = text
return retval
def _natural_keys(text):
"""
alist.sort(key=_natural_keys) sorts in human order
http://nedbatchelder.com/blog/200712/human_sorting.html
(See Toothy's implementation in the comments)
float regex comes from https://stackoverflow.com/a/12643073/190597
"""
return [ _atof(c) for c in re.split(r'[+-]?([0-9]+(?:[.][0-9]*)?|[.][0-9]+)', text) ]
class Quantizer:
    """Handles quantization and dequantization of data.

    Numeric biome abundances are binned into ``num_levels`` labels
    ('A', 'B', ...) with `pandas.cut`; dequantization maps a label back to the
    midpoint of its bin. Optional per-biome random-forest regressors reduce
    the distortion of the quantize/dequantize round trip.
    """

    def __init__(self, num_levels=5):
        """Initialization

        Args:
            num_levels (int, optional): Number of quantization levels. Defaults to 5.
        """
        self.num_levels = num_levels
        """number of quantization levels"""
        # use tuple for immutability
        labels = tuple(string.ascii_uppercase[:num_levels])
        self.labels = {label: idx for idx, label in enumerate(labels)}
        """ex. `{A: 0, B: 1, ...}`"""
        self.variable_bin_map = {}
        """key-value pairs `{biome_name: quantization map}`"""
        self.column_names = None
        """a list of columns in the format `{biome}_{week}`"""
        self.subject_id_column = None
        """cache this column to add back to the label matrix with `self.add_meta_to_matrix`"""
        self.random_forest_dict = {}
        """key-value pairs `{biome_name: sklearn.ensemble.RandomForestRegressor}`"""

    def save_quantizer_states(self, out_fname):
        """Save `self.column_names, self.subject_id_column, self.variable_bin_map,
        self.random_forest_dict`. Call this after calling `self.quantize_df`

        Args:
            out_fname (str): output file name
        """
        states = {
            'column_names': self.column_names,
            'subject_id_column': self.subject_id_column,
            'variable_bin_map': self.variable_bin_map,
            'random_forest_dict': self.random_forest_dict,
        }
        with open(out_fname, 'wb') as f:
            pickle.dump(states, f, protocol=pickle.HIGHEST_PROTOCOL)

    def load_quantizer_states(self, in_fname):
        """Load in `self.column_names, self.subject_id_column, self.variable_bin_map,
        self.random_forest_dict` from file

        Args:
            in_fname (str): input file name
        """
        # SECURITY NOTE: pickle.load can execute arbitrary code on malicious
        # input -- only load files produced by save_quantizer_states from a
        # trusted source.
        with open(in_fname, 'rb') as f:
            states = pickle.load(f)
        self.column_names = states['column_names']
        self.subject_id_column = states['subject_id_column']
        self.variable_bin_map = states['variable_bin_map']
        self.random_forest_dict = states['random_forest_dict']

    def pivot_into_quantize_format(self, data):
        """Pivot the data into a format the quantizer can quantize.

        The input is long-format data produced by `DataFormatter.load_data`
        with columns `sample_id, subject_id, variable, week, value`. The
        output is wide-format with one row per subject and one column per
        `{variable}_{week}` pair (NaN where a subject has no measurement):

        | subject_id | Acidobacteriota_35 | Actinobacteriota_1 | ... |
        |-----------:|-------------------:|-------------------:|-----|
        |          1 |                nan |            0.36665 | ... |

        Args:
            data (pandas.DataFrame): long-format data, see above

        Returns:
            pandas.DataFrame: wide-format data, see above
        """
        # some hacky intermediate format used by quantizer only
        # so this probably shouldn't go into DataFormatter
        melted = pd.concat([
            data.subject_id,
            data.variable + '_' + data.week.astype(str),
            data.value
        ], axis=1).rename(columns={0: 'variable'})
        to_quantize = melted.pivot_table(
            index='subject_id', columns='variable', dropna=False)['value'].reset_index()
        return to_quantize

    def quantize_df(self, data):
        """Quantize long-format data into wide-format label columns.

        This function must be called before calling any of the dequantization
        procedures. It populates `self.column_names, self.subject_id_column,
        self.variable_bin_map`.

        The input is long-format data produced by `DataFormatter.load_data`
        (columns `sample_id, subject_id, variable, week, value`); the output
        is wide-format with one `{variable}_{week}` label column per biome:

        | subject_id | Acidobacteriota_35 | Actinobacteriota_1 | ... |
        |-----------:|-------------------:|-------------------:|-----|
        |          1 |                nan |                  A | ... |

        Args:
            data (pandas.DataFrame): long-format data, see above

        Returns:
            pandas.DataFrame: wide-format label data, see above
        """
        to_quantize = self.pivot_into_quantize_format(data)
        self.column_names = to_quantize.columns[1:]  # skip subject_id, only biome names
        # cache the subject_id column to add back to a dequantized matrix
        self.subject_id_column = to_quantize.subject_id
        return self._quantize_df(to_quantize)

    def _quantize_df(self, to_quantize):
        """Quantize a data frame in quantizable (wide) format.

        Numeric `{biome}_{week}` columns become label columns ('A', 'B', ...).
        On the first call the bin edges are computed per column with
        `pandas.cut` and cached in `self.variable_bin_map`; subsequent calls
        reuse the cached bins so new data is quantized consistently (values
        outside the cached bin range become NaN, per `pandas.cut` semantics).

        Args:
            to_quantize (pd.DataFrame): data frame in quantizable format

        Returns:
            pandas.DataFrame: wide-format label data
        """
        quantized = pd.DataFrame()  # return df
        label_names = list(self.labels.keys())
        if not self.variable_bin_map:
            # first call: compute and cache the bin edges per column
            for col in self.column_names:
                cut, bins = pd.cut(to_quantize[col], self.num_levels,
                                   labels=label_names, retbins=True)
                quantized[col] = cut
                self.variable_bin_map[col] = bins
        else:  # use existing bins
            for col in self.column_names:
                cut = pd.cut(to_quantize[col], self.variable_bin_map[col],
                             labels=label_names)
                quantized[col] = cut
        # sort the columns by name in a natural order
        quantized = quantized.reindex(sorted(quantized.columns, key=_natural_keys), axis=1)
        quantized.insert(0, 'subject_id', to_quantize.subject_id)
        return quantized

    def get_qnet_inputs(self, quantized_df):
        """Retrieve the feature names and data matrix from a quantized data
        frame produced by `self.quantize_df`

        Args:
            quantized_df (pandas.DataFrame): a quantized data frame produced by `self.quantize_df`

        Returns:
            list: a list of feature names, ex. `['Acidobacteriota_35', 'Actinobacteriota_1', 'Actinobacteriota_2']`
            numpy.ndarray: a 2D data array of quantized labels (`'A', 'B', ...,` or empty string `''` for NaN)
        """
        # skip subject_id column
        df = quantized_df.drop(columns='subject_id')
        # NaN cells (stringified as 'nan' or left as NaN) become empty strings
        matrix = df.astype(str).replace('nan', '').fillna('').to_numpy(dtype=str)
        # sanity-check matrix contains only empty strings and label strings
        valid_labels = list(self.labels.keys()) + ['']
        is_valid = np.isin(np.unique(matrix), valid_labels).all()
        if not is_valid:
            import warnings
            warnings.warn('The label matrix contains strings that are neither the empty string nor the label strings')
        return df.columns, matrix

    def quantize_new_subject(self, subject_data, subject_id=None):
        """Construct and quantize a new subject with missing data.

        The input is long-format data (columns `subject_id, variable, week,
        value`, or just `variable, week, value` when `subject_id` is given);
        the output is a wide-format label row with every column in
        `self.column_names`, NaN where the subject has no measurement.

        Args:
            subject_data (pandas.DataFrame): subject data frame with some but maybe not all the timestamps
            subject_id (str, optional): if not None, add the subject_id as a column; if None, assume that the input has a column named subject_id. Defaults to None.

        Raises:
            Exception: no subject_id argument and no subject_id column

        Returns:
            pd.DataFrame: quantized subject data frame with complete timestamps, see format above
        """
        if subject_id is None and 'subject_id' not in subject_data.columns:
            raise Exception('You must provide a subject_id if there is none in the input data frame')
        if subject_id is not None:
            # work on a copy so the caller's data frame is not mutated
            subject_data = subject_data.copy()
            subject_data['subject_id'] = subject_id
        new_subject = self.pivot_into_quantize_format(subject_data)
        # add columns that are in self.column_names but not in pivoted as np.nan
        for column in self.column_names:
            if column not in new_subject.columns:
                new_subject[column] = np.nan
        return self._quantize_df(new_subject)

    def get_bin_array_of_index(self, idx):
        """Return the `pandas.cut` bin array corresponding to the sequence index
        by looking up `self.variable_bin_map[self.column_names[idx]]`

        Args:
            idx (int): index into `self.column_names`

        Returns:
            numpy.ndarray: bins
        """
        col = self.column_names[idx]
        bin_arr = self.variable_bin_map[col]
        return bin_arr

    def quantize_value(self, val, bin_arr):
        """Quantize a numeric value into a label. This function is the inverse
        of `self.dequantize_label`

        Args:
            val (float): number to quantize
            bin_arr (numpy.ndarray): bins produced by `pandas.cut` or retrieved using `self.get_bin_array_of_index`

        Returns:
            str: quantized label string
        """
        label = pd.cut([val], bin_arr, labels=list(self.labels.keys()))[0]
        return label

    # procedures and helpers for dequantization follows

    def _fit_random_forest_one_biome(self, x, y):
        """Fit a regressor mapping dequantized values x to original values y.

        Both series are upsampled 100x by linear interpolation over their
        index so the forest sees a dense x->y mapping.
        """
        idx_old = np.arange(len(x))
        fx = interpolate.interp1d(idx_old, x, fill_value='extrapolate')
        fy = interpolate.interp1d(idx_old, y, fill_value='extrapolate')
        idx = np.arange(0, len(x), 0.01)
        X = fx(idx)[:, np.newaxis]
        Y = fy(idx)
        model = RandomForestRegressor()
        model.fit(X, Y)
        return model

    def compute_average_df(self, df):
        """Take the average over the input data frame by grouping by `variable, week`.

        The input has at least the columns `variable, week, value` (extra
        columns such as `sample_id, subject_id` are dropped); the output has
        exactly `variable, week, value`, where `value` is the group mean.

        Args:
            df (pandas.DataFrame): see format above

        Returns:
            pandas.DataFrame: the average data frame, see format above
        """
        avg = df[['variable', 'week', 'value']].groupby(
            by=['variable', 'week']).mean().reset_index()
        return avg

    def fit_random_forest(self, data, dequantized_data):
        """Fit a random forest regressor for each of the biome. Use as input
        the average of the quantized data. Fit regressor to the average of the
        original data as output. Populate `self.random_forest_dict`.

        Both data frames are long-format with columns
        `subject_id, variable, week, value`.

        Args:
            data (pandas.DataFrame): original data, see format above
            dequantized_data (pandas.DataFrame): dequantized data, see format above
        """
        if self.random_forest_dict:  # already populated
            return
        # take avg of data and dequantized_data, grouped by week and biome
        # want to map dequantized to original, hence dequantized is input
        inputs = self.compute_average_df(dequantized_data)
        outputs = self.compute_average_df(data)
        for biome in inputs.variable.unique():
            x = inputs[inputs.variable == biome].value
            y = outputs[outputs.variable == biome].value
            model = self._fit_random_forest_one_biome(x, y)
            self.random_forest_dict[biome] = model

    def dequantize_label(self, label, bin_arr):
        """Dequantize a label string into a numeric value. This function is the
        inverse of `self.quantize_value`. If the input is an empty string (or
        any non-label value such as NaN), the return value will be `numpy.nan`

        Args:
            label (str): label string
            bin_arr (numpy.ndarray): bins produced by `pandas.cut` or retrieved using `self.get_bin_array_of_index`

        Returns:
            float: the dequantized numeric value (midpoint of the label's bin)
        """
        # isinstance guard also covers float('nan'), which is not the np.nan
        # singleton and would previously crash on label.lower()
        if not isinstance(label, str) or label not in self.labels:
            return np.nan
        low = self.labels[label]
        high = low + 1
        val = (bin_arr[low] + bin_arr[high]) / 2
        return val

    def dequantize_sequence(self, label_seq):
        """Dequantize an entire label sequence

        Args:
            label_seq (numpy.ndarray): 1D array of label strings, ordered as `self.column_names`

        Returns:
            numpy.ndarray: 1D array of floats
        """
        numeric_seq = np.empty(label_seq.shape)
        for idx, label in enumerate(label_seq):
            bin_arr = self.get_bin_array_of_index(idx)
            numeric_seq[idx] = self.dequantize_label(label, bin_arr)
        return numeric_seq

    def dequantize_to_df(self, matrix):
        """Dequantize a label matrix (with no column names, just the qnet input
        matrix) into a data frame with numeric values. To make the output data
        frame into a format `seaborn` can easily plot, apply
        `self.melt_into_plot_format`

        Args:
            matrix (numpy.ndarray): 2D matrix of label strings

        Returns:
            pandas.DataFrame: wide-format numeric data with `subject_id` plus `self.column_names` columns
        """
        numeric_matrix = np.empty(matrix.shape)
        for idx, seq in enumerate(matrix):
            numeric_matrix[idx] = self.dequantize_sequence(seq)
        df = self.add_meta_to_matrix(numeric_matrix)
        return df

    def add_meta_to_matrix(self, matrix, add_subject_id=True):
        """Add back `self.subject_id_column` and `self.column_names` to the
        data matrix to convert it into a data frame

        Args:
            matrix (np.ndarray): 2D matrix of either label strings or numeric values
            add_subject_id (bool, optional): whether to add back the cached subject_id column. Defaults to True.

        Returns:
            pandas.DataFrame: wide-format data with `subject_id` (optional) plus `self.column_names` columns
        """
        df = pd.DataFrame(matrix, columns=self.column_names)
        if add_subject_id:
            df = pd.concat([self.subject_id_column, df], axis=1)
        return df

    def melt_into_plot_format(self, data):
        """Melt data into a format that `seaborn` can easily plot.

        The input is wide-format numeric data (`subject_id` plus
        `{variable}_{week}` columns); the output is long-format with columns
        `subject_id, variable, week, value`.

        Args:
            data (pandas.DataFrame): numeric data, see format above

        Returns:
            pandas.DataFrame: long-format data, see format above
        """
        # pivot into plottable format
        melted = data.melt(id_vars='subject_id')
        # split '{variable}_{week}' at the last underscore before the digits
        splitted = melted.variable.str.extract(r'(.+)_(\d+)', expand=True)
        splitted.rename(columns={0: 'variable', 1: 'week'}, inplace=True)
        splitted.week = splitted.week.astype(int)
        plot_df = pd.concat([
            melted.subject_id, splitted, melted.value
        ], axis=1)
        return plot_df

    def apply_random_forest_regressor(self, data):
        """Apply the trained biome regressor on the data to reduce the
        conversion distortion resulted from quantization-dequantization. If the
        data frame has columns other than `variable, week, value`, the extra
        columns will be averaged over using `self.compute_average_df`

        Data formats for the input and the output are the same: columns
        `variable, week, value`.

        Args:
            data (pandas.DataFrame): see format above

        Raises:
            Exception: `self.random_forest_dict` hasn't been populated. You need to call `self.fit_random_forest` first
            Exception: the (averaged) input contains NaN values

        Returns:
            pandas.DataFrame: with columns `variable, week, value`
        """
        if not self.random_forest_dict:
            raise Exception('No random forest models. First train with fit_random_forest')
        avg_data = self.compute_average_df(data)
        dataframes = []
        for biome in avg_data.variable.unique():
            # filter once per biome instead of twice
            biome_rows = avg_data[avg_data.variable == biome]
            x = biome_rows.value
            # check if there is NaN
            if x.isnull().any():
                raise Exception('There are NaNs in the inputs. Please run the forecaster to fill in all the NaNs first')
            model = self.random_forest_dict[biome]
            pred = model.predict(x.to_numpy()[:, np.newaxis])
            df = pd.DataFrame({
                'variable': biome,
                'week': biome_rows.week,
                'value': pred
            })
            dataframes.append(df)
        ret = pd.concat(dataframes)
        return ret
Classes
class Quantizer (num_levels=5)
-
Handles quantization and dequantization of data
Initialization
Args
num_levels
:int
, optional- Number of quantization levels. Defaults to 5.
Expand source code
class Quantizer: """Handles quantization and dequantization of data """ def __init__(self, num_levels=5): """Initalization Args: num_levels (int, optional): Number of quantization levels. Defaults to 5. """ # use tuple for immutability self.num_levels = num_levels """number of quantization levels""" labels = tuple(string.ascii_uppercase[:num_levels]) self.labels = {label: idx for idx, label in enumerate(labels)} """ex. `{A: 0, B: 1, ...}`""" self.variable_bin_map = {} """key-value pairs `{biome_name: quantization map}`""" self.column_names = None """a list of columns in the format `{biome}_{week}`""" self.subject_id_column = None """cache this column to add back to the label matrix with `self.add_meta_to_matrix`""" self.random_forest_dict = {} """key-value pairs `{biome_name: sklearn.ensemble.RandomForestRegressor}`""" def save_quantizer_states(self, out_fname): """Save `self.column_names, self.subject_id_column, self.variable_bin_map, self.random_forest_dict`. Call this after calling `self.quantize_df` Args: out_fname (str): output file name """ states = { 'column_names': self.column_names, 'subject_id_column': self.subject_id_column, 'variable_bin_map': self.variable_bin_map, 'random_forest_dict': self.random_forest_dict } with open(out_fname, 'wb') as f: pickle.dump(states, f, protocol=pickle.HIGHEST_PROTOCOL) def load_quantizer_states(self, in_fname): """Load in `self.column_names, self.variable_bin_map, self.random_forest_dict` from file Args: in_fname (str): input file name """ with open(in_fname, 'rb') as f: states = pickle.load(f) self.column_names = states['column_names'] self.subject_id_column = states['subject_id_column'] self.variable_bin_map = states['variable_bin_map'] self.random_forest_dict = states['random_forest_dict'] def pivot_into_quantize_format(self, data): """Pivot the data into a format the quantizer can quantize Input data format, produced by `DataFormatter.load_data`: | sample_id | subject_id | variable | week | value | 
|:----------------|-------------:|:-----------------|-------:|---------:| | MBSMPL0020-6-10 | 1 | Actinobacteriota | 27 | 0.36665 | | MBSMPL0020-6-10 | 1 | Bacteroidota | 27 | 0.507248 | | MBSMPL0020-6-10 | 1 | Campilobacterota | 27 | 0.002032 | | MBSMPL0020-6-10 | 1 | Desulfobacterota | 27 | 0.005058 | | MBSMPL0020-6-10 | 1 | Firmicutes | 27 | 0.057767 | Output format: | subject_id | Acidobacteriota_35 | Actinobacteriota_1 | Actinobacteriota_2 | |-------------:|---------------------:|:---------------------|---------------------:| | 1 | nan | 0.36665 | nan | | 10 | nan | 0.36665 | nan | | 11 | nan | 0.36665 | nan | Args: data (pandas.DataFrame): see format above Returns: pandas.DataFrame: see format above """ # some hacky intermediate format used by quantizer only # so this probably shouldn't go into DataFormatter melted = pd.concat([ data.subject_id, data.variable + '_' + data.week.astype(str), data.value ], axis=1).rename(columns={0: 'variable'}) to_quantize = melted.pivot_table( index='subject_id', columns='variable', dropna=False)['value'].reset_index() return to_quantize def quantize_df(self, data): """This function must be called before calling any of the dequantization procedures. 
It populates `self.column_names, self.subject_id_column, self.variable_bin_map` Input data format, produced by `DataFormatter.load_data`: | sample_id | subject_id | variable | week | value | |:----------------|-------------:|:-----------------|-------:|---------:| | MBSMPL0020-6-10 | 1 | Actinobacteriota | 27 | 0.36665 | | MBSMPL0020-6-10 | 1 | Bacteroidota | 27 | 0.507248 | | MBSMPL0020-6-10 | 1 | Campilobacterota | 27 | 0.002032 | | MBSMPL0020-6-10 | 1 | Desulfobacterota | 27 | 0.005058 | | MBSMPL0020-6-10 | 1 | Firmicutes | 27 | 0.057767 | Output data format: | subject_id | Acidobacteriota_35 | Actinobacteriota_1 | Actinobacteriota_2 | |-------------:|---------------------:|:---------------------|---------------------:| | 1 | nan | A | nan | | 10 | nan | A | nan | | 11 | nan | A | nan | | 12 | nan | D | nan | | 14 | nan | A | nan | Args: data (pandas.DataFrame): see format above Returns: pandas.DataFrame: see format above """ to_quantize = self.pivot_into_quantize_format(data) self.column_names = to_quantize.columns[1:] # skip subject_id, only biome names # cache the subject_id column to add back to a dequantized matrix self.subject_id_column = to_quantize.subject_id return self._quantize_df(to_quantize) def _quantize_df(self, to_quantize): """Quantize a data frame in quantizable format Input data format: | subject_id | Acidobacteriota_35 | Actinobacteriota_1 | Actinobacteriota_2 | |-------------:|---------------------:|:---------------------|---------------------:| | 1 | nan | 0.36665 | nan | | 10 | nan | 0.36665 | nan | | 11 | nan | 0.36665 | nan | Output data format: | subject_id | Acidobacteriota_35 | Actinobacteriota_1 | Actinobacteriota_2 | |-------------:|---------------------:|:---------------------|---------------------:| | 1 | nan | A | nan | | 10 | nan | A | nan | | 11 | nan | A | nan | Args: to_quantize (pd.DataFrame): data frame in quantizable format Returns: pandas.DataFrame: see format above """ quantized = pd.DataFrame() # return df if not 
self.variable_bin_map: for col in self.column_names: cut, bins = pd.cut(to_quantize[col], self.num_levels, labels=list(self.labels.keys()), retbins=True) quantized[col] = cut self.variable_bin_map[col] = bins else: # use existing bins for col in self.column_names: cut = pd.cut(to_quantize[col], self.variable_bin_map[col], labels=list(self.labels.keys())) quantized[col] = cut # sort the columns by name in a natural order quantized = quantized.reindex(sorted(quantized.columns, key=_natural_keys), axis=1) quantized.insert(0, 'subject_id', to_quantize.subject_id) return quantized def get_qnet_inputs(self, quantized_df): """Retrieve the feature names and data matrix from a quantized data frame produced by `self.quantize_df` Args: quantized_df (pandas.DataFrame): a quantized data frame produced by `self.quantize_df` Returns: list: a list of feature names, ex. `['Acidobacteriota_35', 'Actinobacteriota_1', 'Actinobacteriota_2']` numpy.ndarray: a 2D data array of quantized labels (`'A', 'B', ...,` or empty string `''` for NaN) """ # skip subject_id column df = quantized_df.drop(columns='subject_id') # matrix = df.astype(str).replace('nan', '').to_numpy(dtype=str) # matrix = df.astype(str).fillna('').to_numpy(dtype=str) matrix = df.astype(str).replace('nan', '').fillna('').to_numpy(dtype=str) # sanity-check matrix contains only empty strings and label strings valid_labels = list(self.labels.keys()) + [''] is_valid = np.isin(np.unique(matrix), valid_labels).all() if not is_valid: import warnings #raise Exception('The label matrix contains strings that are neither the empty string nor the label strings') warnings.warn('The label matrix contains strings that are neither the empty string nor the label strings') return df.columns, matrix def quantize_new_subject(self, subject_data, subject_id=None): """Construct and quantize a new subject with missing data Input format: | subject_id | variable | week | value | |-------------:|:-----------------|-------:|---------:| | 1 | 
Actinobacteriota | 1 | 0.36665 | | 1 | Bacteroidota | 1 | 0.507248 | | 1 | Campilobacterota | 1 | 0.002032 | | 1 | Desulfobacterota | 1 | 0.005058 | | 1 | Firmicutes | 1 | 0.057767 | Output format: | subject_id | Acidobacteriota_35 | Actinobacteriota_1 | Actinobacteriota_2 | |-------------:|---------------------:|:---------------------|---------------------:| | 1 | nan | A | nan | | 1 | nan | A | nan | | 1 | nan | A | nan | | 1 | nan | D | nan | Args: subject_data ([type]): subject data frame with some but maybe not all the timestamps subject_id (str, optional): if not None, add the subject_id as a column; if None, assume that the input has a column named subject_id. Defaults to None. Returns: pd.DataFrame: quantized subject data frame with complete timestamps, see format above """ if subject_id is None and not 'subject_id' in subject_data.columns: raise Exception('You must provide a subject_id if there is none in the input data frame') if subject_id is not None: subject_data['subject_id'] = subject_id new_subject = self.pivot_into_quantize_format(subject_data) # add columns that are in self.column_names but not in pivoted as np.nan for column in self.column_names: if column not in new_subject.columns: new_subject[column] = np.nan return self._quantize_df(new_subject) def get_bin_array_of_index(self, idx): """Return the `pandas.cut` bin array corresponding to the sequence index by looking up `self.variable_bin_map[self.column_names[idx]]` Args: idx (int): index into `self.column_names` Returns: numpy.ndarray: bins """ col = self.column_names[idx] bin_arr = self.variable_bin_map[col] return bin_arr def quantize_value(self, val, bin_arr): """Quantize a numeric value into a label. 
This function is the inverse of `self.dequantize_label` Args: val (float): number to quantize bin_arr (numpy.ndarray): bins produced by `pandas.cut` or retrieved using `self.get_bin_array_of_index` Returns: str: quantized label string """ label = pd.cut([val], bin_arr, labels=list(self.labels.keys()))[0] return label # procedures and helpers for dequantization follows def _fit_random_forest_one_biome(self, x, y): idx_old = np.arange(len(x)) fx = interpolate.interp1d(idx_old, x, fill_value='extrapolate') fy = interpolate.interp1d(idx_old, y, fill_value='extrapolate') idx = np.arange(0, len(x), 0.01) X = fx(idx)[:, np.newaxis] Y = fy(idx) model = RandomForestRegressor() model.fit(X, Y) return model def compute_average_df(self, df): """Take the average over the input data frame by grouping by `variable, week` Input data format: | sample_id | subject_id | variable | week | value | |:----------------|-------------:|:-----------------|-------:|---------:| | MBSMPL0020-6-10 | 1 | Actinobacteriota | 27 | 0.36665 | | MBSMPL0020-6-10 | 1 | Bacteroidota | 27 | 0.507248 | | MBSMPL0020-6-10 | 1 | Campilobacterota | 27 | 0.002032 | | MBSMPL0020-6-10 | 1 | Desulfobacterota | 27 | 0.005058 | | MBSMPL0020-6-10 | 1 | Firmicutes | 27 | 0.057767 | Output data format: | variable | week | value | |:-----------------|-------:|---------:| | Actinobacteriota | 27 | 0.36665 | | Bacteroidota | 27 | 0.507248 | | Campilobacterota | 27 | 0.002032 | Args: df (pandas.DataFrame): see format above Returns: pandas.DataFrame: the average data frame, see format above """ avg = df[['variable', 'week', 'value']].groupby( by=['variable', 'week']).mean().reset_index() return avg def fit_random_forest(self, data, dequantized_data): """Fit a random forest regressor for each of the biome. Use as input the average of the quantized data. Fit regressor to the average of the original data as output. Populate `self.random_forest_dict`. 
Input format for both data frames: | subject_id | variable | week | value | |-------------:|:-----------------|-------:|---------:| | 1 | Actinobacteriota | 27 | 0.36665 | | 1 | Bacteroidota | 27 | 0.507248 | | 1 | Campilobacterota | 27 | 0.002032 | Args: data (pandas.DataFrame): see format above dequantized_data (pandas.DataFrame): see format above """ if self.random_forest_dict: # already populated return # take avg of data and dequantized_data, grouped by week and biome # want to map dequantized to original, hence dequantized is input inputs = self.compute_average_df(dequantized_data) outputs = self.compute_average_df(data) for biome in inputs.variable.unique(): x = inputs[inputs.variable == biome].value y = outputs[outputs.variable == biome].value model = self._fit_random_forest_one_biome(x, y) self.random_forest_dict[biome] = model def dequantize_label(self, label, bin_arr): """Dequantize a label string into a numeric value. This function is the inverse of `self.quantize_value`. If the input is an empty string, the return value will be `numpy.nan` Args: label (str): label string bin_arr (numpy.ndarray): bins produced by `pandas.cut` or retrieved using `self.get_bin_array_of_index` Returns: float: the dequantized numeric value """ if label is np.nan or label.lower() == 'nan' or label not in self.labels: return np.nan low = self.labels[label] high = low + 1 val = (bin_arr[low] + bin_arr[high]) / 2 return val def dequantize_sequence(self, label_seq): """Dequantize an entire label sequence Args: label_seq (numpy.ndarray): 1D array of label strings Returns: numpy.ndarray: 1D array of floats """ numeric_seq = np.empty(label_seq.shape) for idx, label in enumerate(label_seq): bin_arr = self.get_bin_array_of_index(idx) numeric_seq[idx] = self.dequantize_label(label, bin_arr) return numeric_seq def dequantize_to_df(self, matrix): """Dequantize a label matrix (with no column names, just the qnet input matrix) into a data frame with numeric values. 
To make the output data frame into a format `seaborn` can easily plot, apply `self.melt_into_plot_format` Output format: | subject_id | Acidobacteriota_35 | Actinobacteriota_1 | Actinobacteriota_2 | |-------------:|---------------------:|:---------------------|---------------------:| | 1 | nan | 0.36665 | nan | | 10 | nan | 0.36665 | nan | | 11 | nan | 0.36665 | nan | Args: matrix (numpy.ndarray): 2D matrix of label strings Returns: pandas.DataFrame: see format above """ numeric_matrix = np.empty(matrix.shape) for idx, seq in enumerate(matrix): numeric_matrix[idx] = self.dequantize_sequence(seq) df = self.add_meta_to_matrix(numeric_matrix) return df def add_meta_to_matrix(self, matrix, add_subject_id=True): """Add back `self.subject_ud` and `self.column_names` to the data matrix to convert it into a data frame Output format: | subject_id | Acidobacteriota_35 | Actinobacteriota_1 | Actinobacteriota_2 | |-------------:|---------------------:|:---------------------|---------------------:| | 1 | nan | A | nan | | 10 | nan | A | nan | | 11 | nan | A | nan | | 12 | nan | D | nan | | 14 | nan | A | nan | Args: matrix (np.ndarray): 2D matrix of either label strings or numeric values add_subject_id (bool, optional): whether to add back the cached subject_id column. Defaults to True. 
Returns: pandas.DataFrame: see format above """ df = pd.DataFrame(matrix, columns=self.column_names) if add_subject_id: df = pd.concat([self.subject_id_column, df], axis=1) return df def melt_into_plot_format(self, data): """Melt data into a format that `seaborn` can easily plot Input format: | subject_id | Acidobacteriota_35 | Actinobacteriota_1 | Actinobacteriota_2 | |-------------:|---------------------:|:---------------------|---------------------:| | 1 | nan | 0.36665 | nan | | 10 | nan | 0.36665 | nan | | 11 | nan | 0.36665 | nan | Output format: | subject_id | variable | week | value | |-------------:|:-----------------|-------:|---------:| | 1 | Actinobacteriota | 27 | 0.36665 | | 1 | Bacteroidota | 27 | 0.507248 | | 1 | Campilobacterota | 27 | 0.002032 | Args: data (pandas.DataFrame): numeric data, see format above Returns: pandas.DataFrame: see format above """ # pivot into plottable format melted = data.melt(id_vars='subject_id') # split variable names splitted = melted.variable.str.extract(r'([\D|\d]+)_(\d+)', expand=True) splitted.rename(columns={0: 'variable', 1: 'week'}, inplace=True) splitted.week = splitted.week.astype(int) plot_df = pd.concat([ melted.subject_id, splitted, melted.value ], axis=1) return plot_df def apply_random_forest_regressor(self, data): """Apply the trained biome regressor on the data to reduce the conversion distortion resulted from quantization-dequantization. If the data frame has columns other than `variable, week, value`, the extra columns will be averaged over using `self.compute_average_df` Data formats for the input and the output are the same: | variable | week | value | |:-----------------|-------:|---------:| | Actinobacteriota | 27 | 0.36665 | | Bacteroidota | 27 | 0.507248 | | Campilobacterota | 27 | 0.002032 | Args: data (pandas.DataFrame): see format above Raises: Exception: `self.random_forest_dict` hasn't been populated. 
You need to call `self.fit_random_forest` first Returns: pandas.DataFrame: with columns `variable, week, value` """ if not self.random_forest_dict: raise Exception('No random forest models. First train with fit_random_forest') avg_data = self.compute_average_df(data) dataframes = [] for biome in avg_data.variable.unique(): x = avg_data[avg_data.variable == biome].value # check if there is NaN if x.isnull().any(): raise Exception('There are NaNs in the inputs. Please run the forecaster to fill in all the NaNs first') x = x.to_numpy()[:, np.newaxis] model = self.random_forest_dict[biome] pred = model.predict(x) df = pd.DataFrame({ 'variable': biome, 'week': avg_data[avg_data.variable == biome].week, 'value': pred }) dataframes.append(df) ret = pd.concat(dataframes) return ret
Instance variables
var column_names
-
a list of columns in the format
{biome}_{week}
var labels
-
ex.
{A: 0, B: 1, ...}
var num_levels
-
number of quantization levels
var random_forest_dict
-
key-value pairs
{biome_name: sklearn.ensemble.RandomForestRegressor}
var subject_id_column
-
cache this column to add back to the label matrix with
self.add_meta_to_matrix
var variable_bin_map
-
key-value pairs
{biome_name: quantization map}
Methods
def add_meta_to_matrix(self, matrix, add_subject_id=True)
-
Add back
self.subject_id
and self.column_names
to the data matrix to convert it into a data frame. Output format:
subject_id Acidobacteriota_35 Actinobacteriota_1 Actinobacteriota_2 1 nan A nan 10 nan A nan 11 nan A nan 12 nan D nan 14 nan A nan Args
matrix
:np.ndarray
- 2D matrix of either label strings or numeric values
add_subject_id
:bool
, optional- whether to add back the cached subject_id column. Defaults to True.
Returns
pandas.DataFrame
- see format above
Expand source code
def add_meta_to_matrix(self, matrix, add_subject_id=True):
    """Add back `self.subject_id` and `self.column_names` to the data matrix
    to convert it into a data frame

    Output format:

    | subject_id | Acidobacteriota_35 | Actinobacteriota_1 | Actinobacteriota_2 |
    |-------------:|---------------------:|:---------------------|---------------------:|
    | 1 | nan | A | nan |
    | 10 | nan | A | nan |
    | 11 | nan | A | nan |
    | 12 | nan | D | nan |
    | 14 | nan | A | nan |

    Args:
        matrix (np.ndarray): 2D matrix of either label strings or numeric values
        add_subject_id (bool, optional): whether to add back the cached
            subject_id column. Defaults to True.

    Returns:
        pandas.DataFrame: see format above
    """
    # column_names was cached by quantize_df; it excludes subject_id
    df = pd.DataFrame(matrix, columns=self.column_names)
    if add_subject_id:
        # prepend the cached subject_id column (aligned by row order)
        df = pd.concat([self.subject_id_column, df], axis=1)
    return df
def apply_random_forest_regressor(self, data)
-
Apply the trained biome regressor on the data to reduce the conversion distortion resulted from quantization-dequantization. If the data frame has columns other than
variable, week, value
, the extra columns will be averaged over usingself.compute_average_df
Data formats for the input and the output are the same:
variable week value Actinobacteriota 27 0.36665 Bacteroidota 27 0.507248 Campilobacterota 27 0.002032 Args
data
:pandas.DataFrame
- see format above
Raises
Exception
self.random_forest_dict
hasn't been populated. You need to callself.fit_random_forest
first
Returns
pandas.DataFrame
- with columns
variable, week, value
Expand source code
def apply_random_forest_regressor(self, data):
    """Apply the trained per-biome regressors to reduce the distortion
    introduced by the quantize/dequantize round trip. Extra columns beyond
    `variable, week, value` are averaged away via `self.compute_average_df`.

    Input and output share the same format:

    | variable | week | value |
    |:-----------------|-------:|---------:|
    | Actinobacteriota | 27 | 0.36665 |
    | Bacteroidota | 27 | 0.507248 |
    | Campilobacterota | 27 | 0.002032 |

    Args:
        data (pandas.DataFrame): see format above

    Raises:
        Exception: `self.random_forest_dict` hasn't been populated. You need
            to call `self.fit_random_forest` first

    Returns:
        pandas.DataFrame: with columns `variable, week, value`
    """
    if not self.random_forest_dict:
        raise Exception('No random forest models. First train with fit_random_forest')
    avg_data = self.compute_average_df(data)
    per_biome = []
    for biome in avg_data.variable.unique():
        subset = avg_data[avg_data.variable == biome]
        values = subset.value
        # the regressors cannot handle missing inputs
        if values.isnull().any():
            raise Exception('There are NaNs in the inputs. Please run the forecaster to fill in all the NaNs first')
        # sklearn expects a 2D feature matrix: one sample per row
        features = values.to_numpy().reshape(-1, 1)
        predictions = self.random_forest_dict[biome].predict(features)
        per_biome.append(pd.DataFrame({
            'variable': biome,
            'week': subset.week,
            'value': predictions,
        }))
    return pd.concat(per_biome)
def compute_average_df(self, df)
-
Take the average over the input data frame by grouping by
variable, week
Input data format:
sample_id subject_id variable week value MBSMPL0020-6-10 1 Actinobacteriota 27 0.36665 MBSMPL0020-6-10 1 Bacteroidota 27 0.507248 MBSMPL0020-6-10 1 Campilobacterota 27 0.002032 MBSMPL0020-6-10 1 Desulfobacterota 27 0.005058 MBSMPL0020-6-10 1 Firmicutes 27 0.057767 Output data format: | variable | week | value | |:-----------------|-------:|---------:| | Actinobacteriota | 27 | 0.36665 | | Bacteroidota | 27 | 0.507248 | | Campilobacterota | 27 | 0.002032 |
Args
df
:pandas.DataFrame
- see format above
Returns
pandas.DataFrame
- the average data frame, see format above
Expand source code
def compute_average_df(self, df):
    """Average the `value` column over all samples, grouped by
    (`variable`, `week`).

    Input data format:

    | sample_id | subject_id | variable | week | value |
    |:----------------|-------------:|:-----------------|-------:|---------:|
    | MBSMPL0020-6-10 | 1 | Actinobacteriota | 27 | 0.36665 |

    Output data format:

    | variable | week | value |
    |:-----------------|-------:|---------:|
    | Actinobacteriota | 27 | 0.36665 |

    Args:
        df (pandas.DataFrame): see format above

    Returns:
        pandas.DataFrame: the average data frame, see format above
    """
    # any columns besides variable/week/value (sample_id, subject_id, ...)
    # are dropped before grouping
    grouped = df[['variable', 'week', 'value']].groupby(by=['variable', 'week'])
    return grouped.mean().reset_index()
def dequantize_label(self, label, bin_arr)
-
Dequantize a label string into a numeric value. This function is the inverse of
self.quantize_value
. If the input is an empty string, the return value will be numpy.nan
Args
label
:str
- label string
bin_arr
:numpy.ndarray
- bins produced by
pandas.cut
or retrieved usingself.get_bin_array_of_index
Returns
float
- the dequantized numeric value
Expand source code
def dequantize_label(self, label, bin_arr):
    """Dequantize a label string into a numeric value. This function is the
    inverse of `self.quantize_value`. Missing inputs (NaN, the string 'nan',
    the empty string, or any string not in `self.labels`) map to `numpy.nan`.

    Args:
        label (str): label string
        bin_arr (numpy.ndarray): bins produced by `pandas.cut` or retrieved
            using `self.get_bin_array_of_index`

    Returns:
        float: the dequantized numeric value
    """
    # Robustness fix: the original tested `label is np.nan`, an identity
    # check that misses float('nan') objects which are not the np.nan
    # singleton and then crashes on `label.lower()`. Treat every
    # non-string (including any NaN float) as missing.
    if not isinstance(label, str) or label.lower() == 'nan' or label not in self.labels:
        return np.nan
    low = self.labels[label]
    high = low + 1
    # midpoint of the label's bin
    return (bin_arr[low] + bin_arr[high]) / 2
def dequantize_sequence(self, label_seq)
-
Dequantize an entire label sequence
Args
label_seq
:numpy.ndarray
- 1D array of label strings
Returns
numpy.ndarray
- 1D array of floats
Expand source code
def dequantize_sequence(self, label_seq):
    """Dequantize an entire label sequence

    Args:
        label_seq (numpy.ndarray): 1D array of label strings

    Returns:
        numpy.ndarray: 1D array of floats
    """
    # each position has its own bin array, looked up by sequence index
    values = [
        self.dequantize_label(label, self.get_bin_array_of_index(pos))
        for pos, label in enumerate(label_seq)
    ]
    return np.array(values, dtype=float).reshape(label_seq.shape)
def dequantize_to_df(self, matrix)
-
Dequantize a label matrix (with no column names, just the qnet input matrix) into a data frame with numeric values. To make the output data frame into a format
seaborn
can easily plot, applyself.melt_into_plot_format
Output format:
subject_id Acidobacteriota_35 Actinobacteriota_1 Actinobacteriota_2 1 nan 0.36665 nan 10 nan 0.36665 nan 11 nan 0.36665 nan Args
matrix
:numpy.ndarray
- 2D matrix of label strings
Returns
pandas.DataFrame
- see format above
Expand source code
def dequantize_to_df(self, matrix):
    """Dequantize a label matrix (with no column names, just the qnet input
    matrix) into a data frame with numeric values. To make the output data
    frame into a format `seaborn` can easily plot, apply
    `self.melt_into_plot_format`.

    Output format:

    | subject_id | Acidobacteriota_35 | Actinobacteriota_1 | Actinobacteriota_2 |
    |-------------:|---------------------:|:---------------------|---------------------:|
    | 1 | nan | 0.36665 | nan |
    | 10 | nan | 0.36665 | nan |
    | 11 | nan | 0.36665 | nan |

    Args:
        matrix (numpy.ndarray): 2D matrix of label strings

    Returns:
        pandas.DataFrame: see format above
    """
    # dequantize row by row, then re-attach subject_id and column names
    rows = [self.dequantize_sequence(row) for row in matrix]
    numeric_matrix = np.array(rows, dtype=float).reshape(matrix.shape)
    return self.add_meta_to_matrix(numeric_matrix)
def fit_random_forest(self, data, dequantized_data)
-
Fit a random forest regressor for each of the biome. Use as input the average of the quantized data. Fit regressor to the average of the original data as output. Populate
self.random_forest_dict
.Input format for both data frames:
subject_id variable week value 1 Actinobacteriota 27 0.36665 1 Bacteroidota 27 0.507248 1 Campilobacterota 27 0.002032 Args
data
:pandas.DataFrame
- see format above
dequantized_data
:pandas.DataFrame
- see format above
Expand source code
def fit_random_forest(self, data, dequantized_data):
    """Fit one random forest regressor per biome and populate
    `self.random_forest_dict`. The regressor maps the average of the
    dequantized (distorted) data back onto the average of the original data.

    Input format for both data frames:

    | subject_id | variable | week | value |
    |-------------:|:-----------------|-------:|---------:|
    | 1 | Actinobacteriota | 27 | 0.36665 |
    | 1 | Bacteroidota | 27 | 0.507248 |

    Args:
        data (pandas.DataFrame): see format above
        dequantized_data (pandas.DataFrame): see format above
    """
    # no-op if the models were already trained (or loaded from disk)
    if self.random_forest_dict:
        return
    # dequantized values are the regressor input, originals the target
    inputs = self.compute_average_df(dequantized_data)
    outputs = self.compute_average_df(data)
    for biome in inputs.variable.unique():
        x = inputs.loc[inputs.variable == biome, 'value']
        y = outputs.loc[outputs.variable == biome, 'value']
        self.random_forest_dict[biome] = self._fit_random_forest_one_biome(x, y)
def get_bin_array_of_index(self, idx)
-
Return the
pandas.cut
bin array corresponding to the sequence index by looking upself.variable_bin_map[self.column_names[idx]]
Args
idx
:int
- index into
self.column_names
Returns
numpy.ndarray
- bins
Expand source code
def get_bin_array_of_index(self, idx):
    """Return the `pandas.cut` bin array for a sequence index, i.e.
    `self.variable_bin_map[self.column_names[idx]]`.

    Args:
        idx (int): index into `self.column_names`

    Returns:
        numpy.ndarray: bins
    """
    # sequence position -> '{biome}_{week}' column name -> its bin edges
    return self.variable_bin_map[self.column_names[idx]]
def get_qnet_inputs(self, quantized_df)
-
Retrieve the feature names and data matrix from a quantized data frame produced by
self.quantize_df
Args
quantized_df
:pandas.DataFrame
- a quantized data frame produced by
self.quantize_df
Returns
list
- a list of feature names, ex.
['Acidobacteriota_35', 'Actinobacteriota_1', 'Actinobacteriota_2']
numpy.ndarray
- a 2D data array of quantized labels (
'A', 'B', ...,
or empty string''
for NaN)
Expand source code
def get_qnet_inputs(self, quantized_df):
    """Retrieve the feature names and data matrix from a quantized data frame
    produced by `self.quantize_df`

    Args:
        quantized_df (pandas.DataFrame): a quantized data frame produced by
            `self.quantize_df`

    Returns:
        list: a list of feature names, ex.
            `['Acidobacteriota_35', 'Actinobacteriota_1', 'Actinobacteriota_2']`
        numpy.ndarray: a 2D data array of quantized labels (`'A', 'B', ...`
            or empty string `''` for NaN)
    """
    # hoisted from mid-function; stdlib, cheap, and clearer at the top
    import warnings

    # skip subject_id column
    df = quantized_df.drop(columns='subject_id')
    # NaNs become '' whether they were stringified to 'nan' by astype(str)
    # or survived as missing values (removed two stale commented-out
    # variants of this line)
    matrix = df.astype(str).replace('nan', '').fillna('').to_numpy(dtype=str)
    # sanity-check: matrix should contain only label strings and ''
    valid_labels = list(self.labels.keys()) + ['']
    if not np.isin(np.unique(matrix), valid_labels).all():
        # warn rather than raise: downstream can often still proceed
        warnings.warn('The label matrix contains strings that are neither the empty string nor the label strings')
    return df.columns, matrix
def load_quantizer_states(self, in_fname)
-
Load in
self.column_names, self.subject_id_column, self.variable_bin_map, self.random_forest_dict
from file. Args
in_fname
:str
- input file name
Expand source code
def load_quantizer_states(self, in_fname):
    """Load in `self.column_names, self.subject_id_column,
    self.variable_bin_map, self.random_forest_dict` from file

    Args:
        in_fname (str): input file name
    """
    # counterpart of save_quantizer_states; expects the same pickled dict
    with open(in_fname, 'rb') as f:
        states = pickle.load(f)
    self.column_names = states['column_names']
    self.subject_id_column = states['subject_id_column']
    self.variable_bin_map = states['variable_bin_map']
    self.random_forest_dict = states['random_forest_dict']
def melt_into_plot_format(self, data)
-
Melt data into a format that
seaborn
can easily plotInput format:
subject_id Acidobacteriota_35 Actinobacteriota_1 Actinobacteriota_2 1 nan 0.36665 nan 10 nan 0.36665 nan 11 nan 0.36665 nan Output format:
subject_id variable week value 1 Actinobacteriota 27 0.36665 1 Bacteroidota 27 0.507248 1 Campilobacterota 27 0.002032 Args
data
:pandas.DataFrame
- numeric data, see format above
Returns
pandas.DataFrame
- see format above
Expand source code
def melt_into_plot_format(self, data):
    """Melt data into a format that `seaborn` can easily plot

    Input format:

    | subject_id | Acidobacteriota_35 | Actinobacteriota_1 | Actinobacteriota_2 |
    |-------------:|---------------------:|:---------------------|---------------------:|
    | 1 | nan | 0.36665 | nan |
    | 10 | nan | 0.36665 | nan |

    Output format:

    | subject_id | variable | week | value |
    |-------------:|:-----------------|-------:|---------:|
    | 1 | Actinobacteriota | 27 | 0.36665 |
    | 1 | Bacteroidota | 27 | 0.507248 |

    Args:
        data (pandas.DataFrame): numeric data, see format above

    Returns:
        pandas.DataFrame: see format above
    """
    # pivot into plottable format
    melted = data.melt(id_vars='subject_id')
    # Split '{biome}_{week}' column names. The original pattern used the
    # character class `[\D|\d]+`, where `|` is a literal and \D|\d matches
    # any character -- i.e. an obfuscated `.+`. Greedy `.+` keeps any
    # underscores inside the biome name; the trailing _<digits> is the week.
    splitted = melted.variable.str.extract(r'(.+)_(\d+)', expand=True)
    splitted.rename(columns={0: 'variable', 1: 'week'}, inplace=True)
    splitted.week = splitted.week.astype(int)
    plot_df = pd.concat([
        melted.subject_id, splitted, melted.value
    ], axis=1)
    return plot_df
def pivot_into_quantize_format(self, data)
-
Pivot the data into a format the quantizer can quantize
Input data format, produced by
DataFormatter.load_data
:sample_id subject_id variable week value MBSMPL0020-6-10 1 Actinobacteriota 27 0.36665 MBSMPL0020-6-10 1 Bacteroidota 27 0.507248 MBSMPL0020-6-10 1 Campilobacterota 27 0.002032 MBSMPL0020-6-10 1 Desulfobacterota 27 0.005058 MBSMPL0020-6-10 1 Firmicutes 27 0.057767 Output format:
subject_id Acidobacteriota_35 Actinobacteriota_1 Actinobacteriota_2 1 nan 0.36665 nan 10 nan 0.36665 nan 11 nan 0.36665 nan Args
data
:pandas.DataFrame
- see format above
Returns
pandas.DataFrame
- see format above
Expand source code
def pivot_into_quantize_format(self, data):
    """Pivot the data into a format the quantizer can quantize

    Input data format, produced by `DataFormatter.load_data`:

    | sample_id | subject_id | variable | week | value |
    |:----------------|-------------:|:-----------------|-------:|---------:|
    | MBSMPL0020-6-10 | 1 | Actinobacteriota | 27 | 0.36665 |

    Output format:

    | subject_id | Acidobacteriota_35 | Actinobacteriota_1 | Actinobacteriota_2 |
    |-------------:|---------------------:|:---------------------|---------------------:|
    | 1 | nan | 0.36665 | nan |

    Args:
        data (pandas.DataFrame): see format above

    Returns:
        pandas.DataFrame: see format above
    """
    # hacky intermediate wide format used by the quantizer only, so it
    # does not belong in DataFormatter
    combined_names = (data.variable + '_' + data.week.astype(str)).rename('variable')
    melted = pd.concat([data.subject_id, combined_names, data.value], axis=1)
    wide = melted.pivot_table(index='subject_id', columns='variable', dropna=False)
    return wide['value'].reset_index()
def quantize_df(self, data)
-
This function must be called before calling any of the dequantization procedures. It populates
self.column_names, self.subject_id_column, self.variable_bin_map
Input data format, produced by
DataFormatter.load_data
:sample_id subject_id variable week value MBSMPL0020-6-10 1 Actinobacteriota 27 0.36665 MBSMPL0020-6-10 1 Bacteroidota 27 0.507248 MBSMPL0020-6-10 1 Campilobacterota 27 0.002032 MBSMPL0020-6-10 1 Desulfobacterota 27 0.005058 MBSMPL0020-6-10 1 Firmicutes 27 0.057767 Output data format:
subject_id Acidobacteriota_35 Actinobacteriota_1 Actinobacteriota_2 1 nan A nan 10 nan A nan 11 nan A nan 12 nan D nan 14 nan A nan Args
data
:pandas.DataFrame
- see format above
Returns
pandas.DataFrame
- see format above
Expand source code
def quantize_df(self, data):
    """Quantize a long-format data frame. This function must be called before
    any of the dequantization procedures: it populates `self.column_names,
    self.subject_id_column, self.variable_bin_map`.

    Input data format, produced by `DataFormatter.load_data`:

    | sample_id | subject_id | variable | week | value |
    |:----------------|-------------:|:-----------------|-------:|---------:|
    | MBSMPL0020-6-10 | 1 | Actinobacteriota | 27 | 0.36665 |

    Output data format:

    | subject_id | Acidobacteriota_35 | Actinobacteriota_1 | Actinobacteriota_2 |
    |-------------:|---------------------:|:---------------------|---------------------:|
    | 1 | nan | A | nan |

    Args:
        data (pandas.DataFrame): see format above

    Returns:
        pandas.DataFrame: see format above
    """
    wide = self.pivot_into_quantize_format(data)
    # first column is subject_id; the rest are '{biome}_{week}' features
    self.column_names = wide.columns[1:]
    # cached so a dequantized matrix can get its subject_id column back
    self.subject_id_column = wide.subject_id
    return self._quantize_df(wide)
def quantize_new_subject(self, subject_data, subject_id=None)
-
Construct and quantize a new subject with missing data
Input format:
subject_id variable week value 1 Actinobacteriota 1 0.36665 1 Bacteroidota 1 0.507248 1 Campilobacterota 1 0.002032 1 Desulfobacterota 1 0.005058 1 Firmicutes 1 0.057767 Output format:
subject_id Acidobacteriota_35 Actinobacteriota_1 Actinobacteriota_2 1 nan A nan 1 nan A nan 1 nan A nan 1 nan D nan Args
subject_data
:pandas.DataFrame
- subject data frame with some but maybe not all the timestamps
subject_id
:str
, optional- if not None, add the subject_id as a column; if None, assume that the input has a column named subject_id. Defaults to None.
Returns
pd.DataFrame
- quantized subject data frame with complete timestamps, see format above
Expand source code
def quantize_new_subject(self, subject_data, subject_id=None):
    """Construct and quantize a new subject with missing data

    Input format:

    | subject_id | variable | week | value |
    |-------------:|:-----------------|-------:|---------:|
    | 1 | Actinobacteriota | 1 | 0.36665 |
    | 1 | Bacteroidota | 1 | 0.507248 |

    Output format:

    | subject_id | Acidobacteriota_35 | Actinobacteriota_1 | Actinobacteriota_2 |
    |-------------:|---------------------:|:---------------------|---------------------:|
    | 1 | nan | A | nan |

    Args:
        subject_data (pandas.DataFrame): subject data frame with some but
            maybe not all the timestamps
        subject_id (str, optional): if not None, add the subject_id as a
            column; if None, assume that the input has a column named
            subject_id. Defaults to None.

    Raises:
        Exception: neither a subject_id argument nor a subject_id column
            was provided

    Returns:
        pd.DataFrame: quantized subject data frame with complete timestamps,
            see format above
    """
    if subject_id is None and 'subject_id' not in subject_data.columns:
        raise Exception('You must provide a subject_id if there is none in the input data frame')
    if subject_id is not None:
        # copy first: the original wrote the subject_id column into the
        # caller's frame in place, mutating the argument
        subject_data = subject_data.copy()
        subject_data['subject_id'] = subject_id
    new_subject = self.pivot_into_quantize_format(subject_data)
    # add columns that are in self.column_names but not in pivoted as np.nan
    for column in self.column_names:
        if column not in new_subject.columns:
            new_subject[column] = np.nan
    return self._quantize_df(new_subject)
def quantize_value(self, val, bin_arr)
-
Quantize a numeric value into a label. This function is the inverse of
self.dequantize_label
Args
val
:float
- number to quantize
bin_arr
:numpy.ndarray
- bins produced by
pandas.cut
or retrieved usingself.get_bin_array_of_index
Returns
str
- quantized label string
Expand source code
def quantize_value(self, val, bin_arr):
    """Quantize a numeric value into a label. This function is the inverse
    of `self.dequantize_label`.

    Args:
        val (float): number to quantize
        bin_arr (numpy.ndarray): bins produced by `pandas.cut` or retrieved
            using `self.get_bin_array_of_index`

    Returns:
        str: quantized label string
    """
    # bin the single value and name the bins after the quantization labels
    label_names = list(self.labels)
    binned = pd.cut([val], bin_arr, labels=label_names)
    return binned[0]
def save_quantizer_states(self, out_fname)
-
Save
self.column_names, self.subject_id_column, self.variable_bin_map, self.random_forest_dict. Call this after calling self.quantize_df
Args
out_fname
:str
- output file name
Expand source code
def save_quantizer_states(self, out_fname):
    """Save `self.column_names, self.subject_id_column, self.variable_bin_map,
    self.random_forest_dict`. Call this after calling `self.quantize_df`

    Args:
        out_fname (str): output file name
    """
    # bundle everything load_quantizer_states expects to read back
    states = dict(
        column_names=self.column_names,
        subject_id_column=self.subject_id_column,
        variable_bin_map=self.variable_bin_map,
        random_forest_dict=self.random_forest_dict,
    )
    with open(out_fname, 'wb') as f:
        pickle.dump(states, f, protocol=pickle.HIGHEST_PROTOCOL)