Module qbiome.quantizer

Expand source code
import re
import string
import pickle
import numpy as np
import pandas as pd
from scipy import interpolate
from sklearn.ensemble import RandomForestRegressor

# helper functions for sorting
# https://stackoverflow.com/questions/5967500/how-to-correctly-sort-a-string-with-a-number-inside
def _atof(text):
    try:
        retval = float(text)
    except ValueError:
        retval = text
    return retval

def _natural_keys(text):
    """
    alist.sort(key=_natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    float regex comes from https://stackoverflow.com/a/12643073/190597
    """
    return [ _atof(c) for c in re.split(r'[+-]?([0-9]+(?:[.][0-9]*)?|[.][0-9]+)', text) ]

class Quantizer:
    """Handles quantization and dequantization of data
    """

    def __init__(self, num_levels=5):
        """Initalization

        Args:
            num_levels (int, optional): Number of quantization levels. Defaults to 5.
        """
        # use tuple for immutability
        self.num_levels = num_levels
        """number of quantization levels"""

        labels = tuple(string.ascii_uppercase[:num_levels])
        self.labels = {label: idx for idx, label in enumerate(labels)}
        """ex. `{A: 0, B: 1, ...}`"""

        self.variable_bin_map = {}
        """key-value pairs `{biome_name: quantization map}`"""

        self.column_names = None
        """a list of columns in the format `{biome}_{week}`"""

        self.subject_id_column = None
        """cache this column to add back to the label matrix with `self.add_meta_to_matrix`"""

        self.random_forest_dict = {}
        """key-value pairs `{biome_name: sklearn.ensemble.RandomForestRegressor}`"""

    def save_quantizer_states(self, out_fname):
        """Save `self.column_names, self.subject_id_column, self.variable_bin_map,
        self.random_forest_dict`. Call this after calling `self.quantize_df`

        Args:
            out_fname (str): output file name
        """
        states = {
            'column_names': self.column_names,
            'subject_id_column': self.subject_id_column,
            'variable_bin_map': self.variable_bin_map,
            'random_forest_dict': self.random_forest_dict
        }
        with open(out_fname, 'wb') as f:
            pickle.dump(states, f, protocol=pickle.HIGHEST_PROTOCOL)

    def load_quantizer_states(self, in_fname):
        """Load in `self.column_names, self.variable_bin_map, self.random_forest_dict` from file

        Args:
            in_fname (str): input file name
        """
        with open(in_fname, 'rb') as f:
            states = pickle.load(f)
        self.column_names = states['column_names']
        self.subject_id_column = states['subject_id_column']
        self.variable_bin_map = states['variable_bin_map']
        self.random_forest_dict = states['random_forest_dict']

    def pivot_into_quantize_format(self, data):
        """Reshape long-format data into one row per subject and one column per
        `{variable}_{week}` pair, the layout the quantizer works on

        Input data format, produced by `DataFormatter.load_data`:

        | sample_id       |   subject_id | variable         |   week |    value |
        |:----------------|-------------:|:-----------------|-------:|---------:|
        | MBSMPL0020-6-10 |            1 | Actinobacteriota |     27 | 0.36665  |
        | MBSMPL0020-6-10 |            1 | Bacteroidota     |     27 | 0.507248 |
        | MBSMPL0020-6-10 |            1 | Campilobacterota |     27 | 0.002032 |

        Output format:

        |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
        |-------------:|---------------------:|:---------------------|---------------------:|
        |            1 |                  nan | 0.36665              |                  nan |
        |           10 |                  nan | 0.36665              |                  nan |
        |           11 |                  nan | 0.36665              |                  nan |

        Args:
            data (pandas.DataFrame): see format above

        Returns:
            pandas.DataFrame: see format above
        """
        # intermediate layout used by the quantizer only, hence it lives here
        # rather than in DataFormatter
        feature_name = data.variable + '_' + data.week.astype(str)
        long_form = pd.concat(
            [data.subject_id, feature_name.rename('variable'), data.value],
            axis=1)

        # dropna=False keeps columns whose entries are all NaN
        pivoted = long_form.pivot_table(
            index='subject_id', columns='variable', dropna=False)
        return pivoted['value'].reset_index()

    def quantize_df(self, data):
        """This function must be called before calling any of the dequantization procedures. It populates `self.column_names, self.subject_id_column, self.variable_bin_map`

        `self.column_names` is cached in the column order of the returned data
        frame, so that index `i` into `self.column_names` refers to column `i` of
        the label matrix built from the returned data frame.

        Input data format, produced by `DataFormatter.load_data`:

        | sample_id       |   subject_id | variable         |   week |    value |
        |:----------------|-------------:|:-----------------|-------:|---------:|
        | MBSMPL0020-6-10 |            1 | Actinobacteriota |     27 | 0.36665  |
        | MBSMPL0020-6-10 |            1 | Bacteroidota     |     27 | 0.507248 |
        | MBSMPL0020-6-10 |            1 | Campilobacterota |     27 | 0.002032 |
        | MBSMPL0020-6-10 |            1 | Desulfobacterota |     27 | 0.005058 |
        | MBSMPL0020-6-10 |            1 | Firmicutes       |     27 | 0.057767 |

        Output data format:

        |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
        |-------------:|---------------------:|:---------------------|---------------------:|
        |            1 |                  nan | A                    |                  nan |
        |           10 |                  nan | A                    |                  nan |
        |           11 |                  nan | A                    |                  nan |
        |           12 |                  nan | D                    |                  nan |
        |           14 |                  nan | A                    |                  nan |

        Args:
            data (pandas.DataFrame): see format above

        Returns:
            pandas.DataFrame: see format above
        """
        to_quantize = self.pivot_into_quantize_format(data)
        # `_quantize_df` iterates over `self.column_names`, so it has to be set
        # before the call; skip subject_id, only biome names
        self.column_names = to_quantize.columns[1:]
        # cache the subject_id column to add back to a dequantized matrix
        self.subject_id_column = to_quantize.subject_id

        quantized = self._quantize_df(to_quantize)
        # `_quantize_df` reorders the columns into natural order, which differs
        # from the pivot's order as soon as a week has two digits
        # (`X_10` sorts before `X_2` lexicographically). Re-cache the names in
        # the returned order so index-based lookups line up with the matrix.
        self.column_names = quantized.columns[1:]
        return quantized

    def _quantize_df(self, to_quantize):
        """Turn every column listed in `self.column_names` into label strings

        When `self.variable_bin_map` is empty, the bins are computed from the
        data with `pandas.cut` and stored in `self.variable_bin_map`; otherwise
        the stored bins are reused.

        Input data format:

        |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
        |-------------:|---------------------:|:---------------------|---------------------:|
        |            1 |                  nan | 0.36665              |                  nan |
        |           10 |                  nan | 0.36665              |                  nan |
        |           11 |                  nan | 0.36665              |                  nan |

        Output data format:

        |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
        |-------------:|---------------------:|:---------------------|---------------------:|
        |            1 |                  nan | A                    |                  nan |
        |           10 |                  nan | A                    |                  nan |
        |           11 |                  nan | A                    |                  nan |

        Args:
            to_quantize (pd.DataFrame): data frame in quantizable format

        Returns:
            pandas.DataFrame: see format above
        """
        label_names = list(self.labels)
        # decided once up front: the map fills up while the loop is running
        learn_bins = not self.variable_bin_map

        quantized = pd.DataFrame()  # return df
        for col in self.column_names:
            if learn_bins:
                codes, edges = pd.cut(
                    to_quantize[col], self.num_levels,
                    labels=label_names, retbins=True)
                self.variable_bin_map[col] = edges
            else:  # use existing bins
                codes = pd.cut(
                    to_quantize[col], self.variable_bin_map[col],
                    labels=label_names)
            quantized[col] = codes

        # sort the columns by name in a natural order
        natural_order = sorted(quantized.columns, key=_natural_keys)
        quantized = quantized.reindex(natural_order, axis=1)
        quantized.insert(0, 'subject_id', to_quantize.subject_id)
        return quantized

    def get_qnet_inputs(self, quantized_df):
        """Retrieve the feature names and data matrix from a quantized data frame produced by `self.quantize_df`

        Args:
            quantized_df (pandas.DataFrame): a quantized data frame produced by `self.quantize_df`

        Returns:
            list: a list of feature names, ex. `['Acidobacteriota_35', 'Actinobacteriota_1', 'Actinobacteriota_2']`
            numpy.ndarray: a 2D data array of quantized labels (`'A', 'B', ...,` or empty string `''` for NaN)
        """
        # skip subject_id column
        df = quantized_df.drop(columns='subject_id')
        # matrix = df.astype(str).replace('nan', '').to_numpy(dtype=str)
        # matrix = df.astype(str).fillna('').to_numpy(dtype=str)
        matrix = df.astype(str).replace('nan', '').fillna('').to_numpy(dtype=str)
        # sanity-check matrix contains only empty strings and label strings
        valid_labels = list(self.labels.keys()) + ['']
        is_valid = np.isin(np.unique(matrix), valid_labels).all()
        if not is_valid:
            import warnings
            #raise Exception('The label matrix contains strings that are neither the empty string nor the label strings')
            warnings.warn('The label matrix contains strings that are neither the empty string nor the label strings')
        return df.columns, matrix

    def quantize_new_subject(self, subject_data, subject_id=None):
        """Construct and quantize a new subject with missing data

        Input format:

        |   subject_id | variable         |   week |    value |
        |-------------:|:-----------------|-------:|---------:|
        |            1 | Actinobacteriota |      1 | 0.36665  |
        |            1 | Bacteroidota     |      1 | 0.507248 |
        |            1 | Campilobacterota |      1 | 0.002032 |
        |            1 | Desulfobacterota |      1 | 0.005058 |
        |            1 | Firmicutes       |      1 | 0.057767 |

        Output format:

        |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
        |-------------:|---------------------:|:---------------------|---------------------:|
        |            1 |                  nan | A                    |                  nan |
        |            1 |                  nan | A                    |                  nan |
        |            1 |                  nan | A                    |                  nan |
        |            1 |                  nan | D                    |                  nan |

        Args:
            subject_data ([type]): subject data frame with some but maybe not all the timestamps
            subject_id (str, optional): if not None, add the subject_id as a column; if None, assume that the input has a column named subject_id. Defaults to None.

        Returns:
            pd.DataFrame: quantized subject data frame with complete timestamps, see format above
        """
        if subject_id is None and not 'subject_id' in subject_data.columns:
            raise Exception('You must provide a subject_id if there is none in the input data frame')

        if subject_id is not None:
            subject_data['subject_id'] = subject_id

        new_subject = self.pivot_into_quantize_format(subject_data)
        # add columns that are in self.column_names but not in pivoted as np.nan
        for column in self.column_names:
            if column not in new_subject.columns:
                new_subject[column] = np.nan

        return self._quantize_df(new_subject)

    def get_bin_array_of_index(self, idx):
        """Return the `pandas.cut` bin array corresponding to the sequence index by looking up `self.variable_bin_map[self.column_names[idx]]`

        Args:
            idx (int): index into `self.column_names`

        Returns:
            numpy.ndarray: bins
        """
        col = self.column_names[idx]
        bin_arr = self.variable_bin_map[col]
        return bin_arr

    def quantize_value(self, val, bin_arr):
        """Quantize a numeric value into a label. This function is the inverse of `self.dequantize_label`

        Args:
            val (float): number to quantize
            bin_arr (numpy.ndarray): bins produced by `pandas.cut` or retrieved using `self.get_bin_array_of_index`

        Returns:
            str: quantized label string
        """
        label = pd.cut([val], bin_arr, labels=list(self.labels.keys()))[0]
        return label

    # procedures and helpers for dequantization follows

    def _fit_random_forest_one_biome(self, x, y):
        """Fit one `RandomForestRegressor` that maps `x` to `y`

        Both sequences are linearly interpolated over their positional index on
        a grid with step 0.01 before fitting. The grid runs up to (but not
        including) `len(x)`, so points past the last sample are extrapolated.

        Args:
            x (array-like): 1D input values
            y (array-like): 1D target values, same length as `x`

        Returns:
            sklearn.ensemble.RandomForestRegressor: the fitted model
        """
        sample_positions = np.arange(len(x))
        dense_positions = np.arange(0, len(x), 0.01)

        interpolate_x = interpolate.interp1d(
            sample_positions, x, fill_value='extrapolate')
        interpolate_y = interpolate.interp1d(
            sample_positions, y, fill_value='extrapolate')

        # scikit-learn expects a 2D feature matrix, hence the extra axis
        features = interpolate_x(dense_positions)[:, np.newaxis]
        targets = interpolate_y(dense_positions)

        regressor = RandomForestRegressor()
        regressor.fit(features, targets)
        return regressor

    def compute_average_df(self, df):
        """Average `value` over every `(variable, week)` group of the input data frame

        Input data format:

        | sample_id       |   subject_id | variable         |   week |    value |
        |:----------------|-------------:|:-----------------|-------:|---------:|
        | MBSMPL0020-6-10 |            1 | Actinobacteriota |     27 | 0.36665  |
        | MBSMPL0020-6-10 |            1 | Bacteroidota     |     27 | 0.507248 |
        | MBSMPL0020-6-10 |            1 | Campilobacterota |     27 | 0.002032 |

        Output data format:

        | variable         |   week |    value |
        |:-----------------|-------:|---------:|
        | Actinobacteriota |     27 | 0.36665  |
        | Bacteroidota     |     27 | 0.507248 |
        | Campilobacterota |     27 | 0.002032 |

        Args:
            df (pandas.DataFrame): see format above

        Returns:
            pandas.DataFrame: the average data frame, see format above
        """
        # any other column (sample_id, subject_id, ...) is dropped before averaging
        relevant = df[['variable', 'week', 'value']]
        grouped = relevant.groupby(by=['variable', 'week'])
        return grouped.mean().reset_index()

    def fit_random_forest(self, data, dequantized_data):
        """Fit a random forest regressor for each of the biome. Use as input the average of the quantized data. Fit regressor to the average of the original data as output. Populate `self.random_forest_dict`.

        Input format for both data frames:

        |   subject_id | variable         |   week |    value |
        |-------------:|:-----------------|-------:|---------:|
        |            1 | Actinobacteriota |     27 | 0.36665  |
        |            1 | Bacteroidota     |     27 | 0.507248 |
        |            1 | Campilobacterota |     27 | 0.002032 |

        Args:
            data (pandas.DataFrame): see format above
            dequantized_data (pandas.DataFrame): see format above
        """
        if self.random_forest_dict: # already populated
            return

        # take avg of data and dequantized_data, grouped by week and biome
        # want to map dequantized to original, hence dequantized is input
        inputs = self.compute_average_df(dequantized_data)
        outputs = self.compute_average_df(data)

        for biome in inputs.variable.unique():
            x = inputs[inputs.variable == biome].value
            y = outputs[outputs.variable == biome].value
            model = self._fit_random_forest_one_biome(x, y)
            self.random_forest_dict[biome] = model

    def dequantize_label(self, label, bin_arr):
        """Dequantize a label string into a numeric value. This function is the inverse of `self.quantize_value`. If the input is an empty string, the return value will be `numpy.nan`

        Args:
            label (str): label string
            bin_arr (numpy.ndarray): bins produced by `pandas.cut` or retrieved using `self.get_bin_array_of_index`

        Returns:
            float: the dequantized numeric value
        """
        if label is np.nan or label.lower() == 'nan' or label not in self.labels:
            return np.nan
        low = self.labels[label]
        high = low + 1
        val = (bin_arr[low] + bin_arr[high]) / 2
        return val

    def dequantize_sequence(self, label_seq):
        """Dequantize an entire label sequence

        Args:
            label_seq (numpy.ndarray): 1D array of label strings

        Returns:
            numpy.ndarray: 1D array of floats
        """
        numeric_seq = np.empty(label_seq.shape)
        for idx, label in enumerate(label_seq):
            bin_arr = self.get_bin_array_of_index(idx)
            numeric_seq[idx] = self.dequantize_label(label, bin_arr)
        return numeric_seq

    def dequantize_to_df(self, matrix):
        """Dequantize a label matrix (with no column names, just the qnet input matrix) into a data frame with numeric values. To make the output data frame into a format `seaborn` can easily plot, apply `self.melt_into_plot_format`

        Output format:

        |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
        |-------------:|---------------------:|:---------------------|---------------------:|
        |            1 |                  nan | 0.36665              |                  nan |
        |           10 |                  nan | 0.36665              |                  nan |
        |           11 |                  nan | 0.36665              |                  nan |

        Args:
            matrix (numpy.ndarray): 2D matrix of label strings

        Returns:
            pandas.DataFrame: see format above
        """
        numeric_matrix = np.empty(matrix.shape)
        for idx, seq in enumerate(matrix):
            numeric_matrix[idx] = self.dequantize_sequence(seq)

        df = self.add_meta_to_matrix(numeric_matrix)
        return df

    def add_meta_to_matrix(self, matrix, add_subject_id=True):
        """Add back `self.subject_ud` and `self.column_names` to the data matrix to convert it into a data frame

        Output format:

        |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
        |-------------:|---------------------:|:---------------------|---------------------:|
        |            1 |                  nan | A                    |                  nan |
        |           10 |                  nan | A                    |                  nan |
        |           11 |                  nan | A                    |                  nan |
        |           12 |                  nan | D                    |                  nan |
        |           14 |                  nan | A                    |                  nan |


        Args:
            matrix (np.ndarray): 2D matrix of either label strings or numeric values
            add_subject_id (bool, optional): whether to add back the cached subject_id column. Defaults to True.

        Returns:
            pandas.DataFrame: see format above
        """
        df = pd.DataFrame(matrix, columns=self.column_names)
        if add_subject_id:
            df = pd.concat([self.subject_id_column, df], axis=1)
        return df

    def melt_into_plot_format(self, data):
        """Unpivot a wide `{biome}_{week}` data frame into long format, which `seaborn` can easily plot

        Input format:

        |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
        |-------------:|---------------------:|:---------------------|---------------------:|
        |            1 |                  nan | 0.36665              |                  nan |
        |           10 |                  nan | 0.36665              |                  nan |
        |           11 |                  nan | 0.36665              |                  nan |

        Output format:

        |   subject_id | variable         |   week |    value |
        |-------------:|:-----------------|-------:|---------:|
        |            1 | Actinobacteriota |     27 | 0.36665  |
        |            1 | Bacteroidota     |     27 | 0.507248 |
        |            1 | Campilobacterota |     27 | 0.002032 |

        Args:
            data (pandas.DataFrame): numeric data, see format above

        Returns:
            pandas.DataFrame: see format above
        """
        # pivot into plottable format
        long_form = data.melt(id_vars='subject_id')

        # split `{biome}_{week}` at the last underscore that is followed by digits
        name_parts = long_form['variable'].str.extract(
            r'([\D|\d]+)_(\d+)', expand=True)
        name_parts = name_parts.rename(
            columns={0: 'variable', 1: 'week'}).astype({'week': int})

        return pd.concat(
            [long_form['subject_id'], name_parts, long_form['value']], axis=1)

    def apply_random_forest_regressor(self, data):
        """Apply the trained biome regressor on the data to reduce the conversion distortion resulted from quantization-dequantization. If the data frame has columns other than `variable, week, value`, the extra columns will be averaged over using `self.compute_average_df`

        Data formats for the input and the output are the same:

        | variable         |   week |    value |
        |:-----------------|-------:|---------:|
        | Actinobacteriota |     27 | 0.36665  |
        | Bacteroidota     |     27 | 0.507248 |
        | Campilobacterota |     27 | 0.002032 |

        Args:
            data (pandas.DataFrame): see format above

        Raises:
            Exception: `self.random_forest_dict` hasn't been populated. You need to call `self.fit_random_forest` first

        Returns:
            pandas.DataFrame: with columns `variable, week, value`
        """
        if not self.random_forest_dict:
            raise Exception('No random forest models. First train with fit_random_forest')
        avg_data = self.compute_average_df(data)
        dataframes = []
        for biome in avg_data.variable.unique():
            x = avg_data[avg_data.variable == biome].value
            # check if there is NaN
            if x.isnull().any():
                raise Exception('There are NaNs in the inputs. Please run the forecaster to fill in all the NaNs first')
            x = x.to_numpy()[:, np.newaxis]
            model = self.random_forest_dict[biome]
            pred = model.predict(x)
            df = pd.DataFrame({
                'variable': biome,
                'week': avg_data[avg_data.variable == biome].week,
                'value': pred
            })
            dataframes.append(df)
        ret = pd.concat(dataframes)
        return ret

Classes

class Quantizer (num_levels=5)

Handles quantization and dequantization of data

Initialization

Args

num_levels : int, optional
Number of quantization levels. Defaults to 5.
Expand source code
class Quantizer:
    """Handles quantization and dequantization of data
    """

    def __init__(self, num_levels=5):
        """Initalization

        Args:
            num_levels (int, optional): Number of quantization levels. Defaults to 5.
        """
        # use tuple for immutability
        self.num_levels = num_levels
        """number of quantization levels"""

        labels = tuple(string.ascii_uppercase[:num_levels])
        self.labels = {label: idx for idx, label in enumerate(labels)}
        """ex. `{A: 0, B: 1, ...}`"""

        self.variable_bin_map = {}
        """key-value pairs `{biome_name: quantization map}`"""

        self.column_names = None
        """a list of columns in the format `{biome}_{week}`"""

        self.subject_id_column = None
        """cache this column to add back to the label matrix with `self.add_meta_to_matrix`"""

        self.random_forest_dict = {}
        """key-value pairs `{biome_name: sklearn.ensemble.RandomForestRegressor}`"""

    def save_quantizer_states(self, out_fname):
        """Save `self.column_names, self.subject_id_column, self.variable_bin_map,
        self.random_forest_dict`. Call this after calling `self.quantize_df`

        Args:
            out_fname (str): output file name
        """
        states = {
            'column_names': self.column_names,
            'subject_id_column': self.subject_id_column,
            'variable_bin_map': self.variable_bin_map,
            'random_forest_dict': self.random_forest_dict
        }
        with open(out_fname, 'wb') as f:
            pickle.dump(states, f, protocol=pickle.HIGHEST_PROTOCOL)

    def load_quantizer_states(self, in_fname):
        """Load in `self.column_names, self.variable_bin_map, self.random_forest_dict` from file

        Args:
            in_fname (str): input file name
        """
        with open(in_fname, 'rb') as f:
            states = pickle.load(f)
        self.column_names = states['column_names']
        self.subject_id_column = states['subject_id_column']
        self.variable_bin_map = states['variable_bin_map']
        self.random_forest_dict = states['random_forest_dict']

    def pivot_into_quantize_format(self, data):
        """Pivot the data into a format the quantizer can quantize

        Input data format, produced by `DataFormatter.load_data`:

        | sample_id       |   subject_id | variable         |   week |    value |
        |:----------------|-------------:|:-----------------|-------:|---------:|
        | MBSMPL0020-6-10 |            1 | Actinobacteriota |     27 | 0.36665  |
        | MBSMPL0020-6-10 |            1 | Bacteroidota     |     27 | 0.507248 |
        | MBSMPL0020-6-10 |            1 | Campilobacterota |     27 | 0.002032 |
        | MBSMPL0020-6-10 |            1 | Desulfobacterota |     27 | 0.005058 |
        | MBSMPL0020-6-10 |            1 | Firmicutes       |     27 | 0.057767 |

        Output format:

        |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
        |-------------:|---------------------:|:---------------------|---------------------:|
        |            1 |                  nan | 0.36665              |                  nan |
        |           10 |                  nan | 0.36665              |                  nan |
        |           11 |                  nan | 0.36665              |                  nan |

        Args:
            data (pandas.DataFrame): see format above

        Returns:
            pandas.DataFrame: see format above
        """
        # some hacky intermediate format used by quantizer only
        # so this probably shouldn't go into DataFormatter
        melted = pd.concat([
            data.subject_id,
            data.variable + '_' + data.week.astype(str),
            data.value
        ], axis=1).rename(columns={0: 'variable'})

        to_quantize = melted.pivot_table(
            index='subject_id', columns='variable', dropna=False)['value'].reset_index()
        return to_quantize

    def quantize_df(self, data):
        """This function must be called before calling any of the dequantization procedures. It populates `self.column_names, self.subject_id_column, self.variable_bin_map`

        Input data format, produced by `DataFormatter.load_data`:

        | sample_id       |   subject_id | variable         |   week |    value |
        |:----------------|-------------:|:-----------------|-------:|---------:|
        | MBSMPL0020-6-10 |            1 | Actinobacteriota |     27 | 0.36665  |
        | MBSMPL0020-6-10 |            1 | Bacteroidota     |     27 | 0.507248 |
        | MBSMPL0020-6-10 |            1 | Campilobacterota |     27 | 0.002032 |
        | MBSMPL0020-6-10 |            1 | Desulfobacterota |     27 | 0.005058 |
        | MBSMPL0020-6-10 |            1 | Firmicutes       |     27 | 0.057767 |

        Output data format:

        |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
        |-------------:|---------------------:|:---------------------|---------------------:|
        |            1 |                  nan | A                    |                  nan |
        |           10 |                  nan | A                    |                  nan |
        |           11 |                  nan | A                    |                  nan |
        |           12 |                  nan | D                    |                  nan |
        |           14 |                  nan | A                    |                  nan |

        Args:
            data (pandas.DataFrame): see format above

        Returns:
            pandas.DataFrame: see format above
        """
        to_quantize = self.pivot_into_quantize_format(data)
        self.column_names = to_quantize.columns[1:] # skip subject_id, only biome names
        # cache the subject_id column to add back to a dequantized matrix
        self.subject_id_column = to_quantize.subject_id

        return self._quantize_df(to_quantize)

    def _quantize_df(self, to_quantize):
        """Quantize a data frame in quantizable format

        Input data format:

        |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
        |-------------:|---------------------:|:---------------------|---------------------:|
        |            1 |                  nan | 0.36665              |                  nan |
        |           10 |                  nan | 0.36665              |                  nan |
        |           11 |                  nan | 0.36665              |                  nan |

        Output data format:

        |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
        |-------------:|---------------------:|:---------------------|---------------------:|
        |            1 |                  nan | A                    |                  nan |
        |           10 |                  nan | A                    |                  nan |
        |           11 |                  nan | A                    |                  nan |

        Args:
            to_quantize (pd.DataFrame): data frame in quantizable format

        Returns:
            pandas.DataFrame: see format above
        """
        quantized = pd.DataFrame() # return df
        if not self.variable_bin_map:
            for col in self.column_names:
                cut, bins = pd.cut(to_quantize[col], self.num_levels,
                labels=list(self.labels.keys()), retbins=True)
                quantized[col] = cut
                self.variable_bin_map[col] = bins
        else: # use existing bins
            for col in self.column_names:
                cut = pd.cut(to_quantize[col], self.variable_bin_map[col],
                labels=list(self.labels.keys()))
                quantized[col] = cut

        # sort the columns by name in a natural order
        quantized = quantized.reindex(sorted(quantized.columns, key=_natural_keys), axis=1)
        quantized.insert(0, 'subject_id', to_quantize.subject_id)
        return quantized

    def get_qnet_inputs(self, quantized_df):
        """Retrieve the feature names and data matrix from a quantized data frame produced by `self.quantize_df`

        Args:
            quantized_df (pandas.DataFrame): a quantized data frame produced by `self.quantize_df`

        Returns:
            list: a list of feature names, ex. `['Acidobacteriota_35', 'Actinobacteriota_1', 'Actinobacteriota_2']`
            numpy.ndarray: a 2D data array of quantized labels (`'A', 'B', ...,` or empty string `''` for NaN)
        """
        # skip subject_id column
        df = quantized_df.drop(columns='subject_id')
        # matrix = df.astype(str).replace('nan', '').to_numpy(dtype=str)
        # matrix = df.astype(str).fillna('').to_numpy(dtype=str)
        matrix = df.astype(str).replace('nan', '').fillna('').to_numpy(dtype=str)
        # sanity-check matrix contains only empty strings and label strings
        valid_labels = list(self.labels.keys()) + ['']
        is_valid = np.isin(np.unique(matrix), valid_labels).all()
        if not is_valid:
            import warnings
            #raise Exception('The label matrix contains strings that are neither the empty string nor the label strings')
            warnings.warn('The label matrix contains strings that are neither the empty string nor the label strings')
        return df.columns, matrix

    def quantize_new_subject(self, subject_data, subject_id=None):
        """Construct and quantize a new subject with missing data

        Input format:

        |   subject_id | variable         |   week |    value |
        |-------------:|:-----------------|-------:|---------:|
        |            1 | Actinobacteriota |      1 | 0.36665  |
        |            1 | Bacteroidota     |      1 | 0.507248 |
        |            1 | Campilobacterota |      1 | 0.002032 |
        |            1 | Desulfobacterota |      1 | 0.005058 |
        |            1 | Firmicutes       |      1 | 0.057767 |

        Output format:

        |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
        |-------------:|---------------------:|:---------------------|---------------------:|
        |            1 |                  nan | A                    |                  nan |
        |            1 |                  nan | A                    |                  nan |
        |            1 |                  nan | A                    |                  nan |
        |            1 |                  nan | D                    |                  nan |

        Args:
            subject_data ([type]): subject data frame with some but maybe not all the timestamps
            subject_id (str, optional): if not None, add the subject_id as a column; if None, assume that the input has a column named subject_id. Defaults to None.

        Returns:
            pd.DataFrame: quantized subject data frame with complete timestamps, see format above
        """
        if subject_id is None and not 'subject_id' in subject_data.columns:
            raise Exception('You must provide a subject_id if there is none in the input data frame')

        if subject_id is not None:
            subject_data['subject_id'] = subject_id

        new_subject = self.pivot_into_quantize_format(subject_data)
        # add columns that are in self.column_names but not in pivoted as np.nan
        for column in self.column_names:
            if column not in new_subject.columns:
                new_subject[column] = np.nan

        return self._quantize_df(new_subject)

    def get_bin_array_of_index(self, idx):
        """Return the `pandas.cut` bin array corresponding to the sequence index by looking up `self.variable_bin_map[self.column_names[idx]]`

        Args:
            idx (int): index into `self.column_names`

        Returns:
            numpy.ndarray: bins
        """
        col = self.column_names[idx]
        bin_arr = self.variable_bin_map[col]
        return bin_arr

    def quantize_value(self, val, bin_arr):
        """Quantize a numeric value into a label. This function is the inverse of `self.dequantize_label`

        Args:
            val (float): number to quantize
            bin_arr (numpy.ndarray): bins produced by `pandas.cut` or retrieved using `self.get_bin_array_of_index`

        Returns:
            str: quantized label string
        """
        label = pd.cut([val], bin_arr, labels=list(self.labels.keys()))[0]
        return label

    # procedures and helpers for dequantization follows

    def _fit_random_forest_one_biome(self, x, y):
        """Fit one random forest regressor from `x` to `y` on a linearly interpolated copy of both sequences

        Args:
            x: sequence of regressor inputs
            y: sequence of regression targets, paired with `x` by position

        Returns:
            sklearn.ensemble.RandomForestRegressor: the fitted model
        """
        n_points = len(x)
        positions = np.arange(n_points)
        # sample both sequences every 0.01 along their positional index; points
        # past the last position are linearly extrapolated
        dense_positions = np.arange(0, n_points, 0.01)
        interp_x = interpolate.interp1d(positions, x, fill_value='extrapolate')
        interp_y = interpolate.interp1d(positions, y, fill_value='extrapolate')
        features = interp_x(dense_positions)[:, np.newaxis]
        targets = interp_y(dense_positions)
        regressor = RandomForestRegressor()
        regressor.fit(features, targets)
        return regressor

    def compute_average_df(self, df):
        """Take the average over the input data frame by grouping by `variable, week`

        Input data format:

        | sample_id       |   subject_id | variable         |   week |    value |
        |:----------------|-------------:|:-----------------|-------:|---------:|
        | MBSMPL0020-6-10 |            1 | Actinobacteriota |     27 | 0.36665  |
        | MBSMPL0020-6-10 |            1 | Bacteroidota     |     27 | 0.507248 |
        | MBSMPL0020-6-10 |            1 | Campilobacterota |     27 | 0.002032 |
        | MBSMPL0020-6-10 |            1 | Desulfobacterota |     27 | 0.005058 |
        | MBSMPL0020-6-10 |            1 | Firmicutes       |     27 | 0.057767 |

        Output data format:
        | variable         |   week |    value |
        |:-----------------|-------:|---------:|
        | Actinobacteriota |     27 | 0.36665  |
        | Bacteroidota     |     27 | 0.507248 |
        | Campilobacterota |     27 | 0.002032 |

        Args:
            df (pandas.DataFrame): see format above

        Returns:
            pandas.DataFrame: the average data frame, see format above
        """
        avg = df[['variable', 'week', 'value']].groupby(
            by=['variable', 'week']).mean().reset_index()
        return avg

    def fit_random_forest(self, data, dequantized_data):
        """Fit a random forest regressor for each of the biome. Use as input the average of the quantized data. Fit regressor to the average of the original data as output. Populate `self.random_forest_dict`.

        Input format for both data frames:

        |   subject_id | variable         |   week |    value |
        |-------------:|:-----------------|-------:|---------:|
        |            1 | Actinobacteriota |     27 | 0.36665  |
        |            1 | Bacteroidota     |     27 | 0.507248 |
        |            1 | Campilobacterota |     27 | 0.002032 |

        Args:
            data (pandas.DataFrame): see format above
            dequantized_data (pandas.DataFrame): see format above
        """
        if self.random_forest_dict: # already populated
            return

        # take avg of data and dequantized_data, grouped by week and biome
        # want to map dequantized to original, hence dequantized is input
        inputs = self.compute_average_df(dequantized_data)
        outputs = self.compute_average_df(data)

        for biome in inputs.variable.unique():
            x = inputs[inputs.variable == biome].value
            y = outputs[outputs.variable == biome].value
            model = self._fit_random_forest_one_biome(x, y)
            self.random_forest_dict[biome] = model

    def dequantize_label(self, label, bin_arr):
        """Dequantize a label string into a numeric value. This function is the inverse of `self.quantize_value`. If the input is an empty string, the return value will be `numpy.nan`

        Args:
            label (str): label string
            bin_arr (numpy.ndarray): bins produced by `pandas.cut` or retrieved using `self.get_bin_array_of_index`

        Returns:
            float: the dequantized numeric value
        """
        if label is np.nan or label.lower() == 'nan' or label not in self.labels:
            return np.nan
        low = self.labels[label]
        high = low + 1
        val = (bin_arr[low] + bin_arr[high]) / 2
        return val

    def dequantize_sequence(self, label_seq):
        """Dequantize an entire label sequence

        Args:
            label_seq (numpy.ndarray): 1D array of label strings

        Returns:
            numpy.ndarray: 1D array of floats
        """
        numeric_seq = np.empty(label_seq.shape)
        for idx, label in enumerate(label_seq):
            bin_arr = self.get_bin_array_of_index(idx)
            numeric_seq[idx] = self.dequantize_label(label, bin_arr)
        return numeric_seq

    def dequantize_to_df(self, matrix):
        """Dequantize a label matrix (with no column names, just the qnet input matrix) into a data frame with numeric values. To make the output data frame into a format `seaborn` can easily plot, apply `self.melt_into_plot_format`

        Output format:

        |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
        |-------------:|---------------------:|:---------------------|---------------------:|
        |            1 |                  nan | 0.36665              |                  nan |
        |           10 |                  nan | 0.36665              |                  nan |
        |           11 |                  nan | 0.36665              |                  nan |

        Args:
            matrix (numpy.ndarray): 2D matrix of label strings

        Returns:
            pandas.DataFrame: see format above
        """
        numeric_matrix = np.empty(matrix.shape)
        for idx, seq in enumerate(matrix):
            numeric_matrix[idx] = self.dequantize_sequence(seq)

        df = self.add_meta_to_matrix(numeric_matrix)
        return df

    def add_meta_to_matrix(self, matrix, add_subject_id=True):
        """Add back `self.subject_ud` and `self.column_names` to the data matrix to convert it into a data frame

        Output format:

        |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
        |-------------:|---------------------:|:---------------------|---------------------:|
        |            1 |                  nan | A                    |                  nan |
        |           10 |                  nan | A                    |                  nan |
        |           11 |                  nan | A                    |                  nan |
        |           12 |                  nan | D                    |                  nan |
        |           14 |                  nan | A                    |                  nan |


        Args:
            matrix (np.ndarray): 2D matrix of either label strings or numeric values
            add_subject_id (bool, optional): whether to add back the cached subject_id column. Defaults to True.

        Returns:
            pandas.DataFrame: see format above
        """
        df = pd.DataFrame(matrix, columns=self.column_names)
        if add_subject_id:
            df = pd.concat([self.subject_id_column, df], axis=1)
        return df

    def melt_into_plot_format(self, data):
        """Melt data into a format that `seaborn` can easily plot

        Input format:

        |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
        |-------------:|---------------------:|:---------------------|---------------------:|
        |            1 |                  nan | 0.36665              |                  nan |
        |           10 |                  nan | 0.36665              |                  nan |
        |           11 |                  nan | 0.36665              |                  nan |

        Output format:

        |   subject_id | variable         |   week |    value |
        |-------------:|:-----------------|-------:|---------:|
        |            1 | Actinobacteriota |     27 | 0.36665  |
        |            1 | Bacteroidota     |     27 | 0.507248 |
        |            1 | Campilobacterota |     27 | 0.002032 |

        Args:
            data (pandas.DataFrame): numeric data, see format above

        Returns:
            pandas.DataFrame: see format above
        """
        # pivot into plottable format
        melted = data.melt(id_vars='subject_id')
        # split variable names
        splitted = melted.variable.str.extract(r'([\D|\d]+)_(\d+)', expand=True)
        splitted.rename(columns={0: 'variable', 1: 'week'}, inplace=True)
        splitted.week = splitted.week.astype(int)
        plot_df = pd.concat([
            melted.subject_id, splitted, melted.value
        ], axis=1)
        return plot_df

    def apply_random_forest_regressor(self, data):
        """Apply the trained biome regressor on the data to reduce the conversion distortion resulted from quantization-dequantization. If the data frame has columns other than `variable, week, value`, the extra columns will be averaged over using `self.compute_average_df`

        Data formats for the input and the output are the same:

        | variable         |   week |    value |
        |:-----------------|-------:|---------:|
        | Actinobacteriota |     27 | 0.36665  |
        | Bacteroidota     |     27 | 0.507248 |
        | Campilobacterota |     27 | 0.002032 |

        Args:
            data (pandas.DataFrame): see format above

        Raises:
            Exception: `self.random_forest_dict` hasn't been populated. You need to call `self.fit_random_forest` first

        Returns:
            pandas.DataFrame: with columns `variable, week, value`
        """
        if not self.random_forest_dict:
            raise Exception('No random forest models. First train with fit_random_forest')
        avg_data = self.compute_average_df(data)
        dataframes = []
        for biome in avg_data.variable.unique():
            x = avg_data[avg_data.variable == biome].value
            # check if there is NaN
            if x.isnull().any():
                raise Exception('There are NaNs in the inputs. Please run the forecaster to fill in all the NaNs first')
            x = x.to_numpy()[:, np.newaxis]
            model = self.random_forest_dict[biome]
            pred = model.predict(x)
            df = pd.DataFrame({
                'variable': biome,
                'week': avg_data[avg_data.variable == biome].week,
                'value': pred
            })
            dataframes.append(df)
        ret = pd.concat(dataframes)
        return ret

Instance variables

var column_names

a list of columns in the format {biome}_{week}

var labels

ex. {A: 0, B: 1, ...}

var num_levels

number of quantization levels

var random_forest_dict

key-value pairs {biome_name: sklearn.ensemble.RandomForestRegressor}

var subject_id_column

cache this column to add back to the label matrix with self.add_meta_to_matrix

var variable_bin_map

key-value pairs {column_name: bins returned by pandas.cut}, where column_name has the format {biome}_{week}

Methods

def add_meta_to_matrix(self, matrix, add_subject_id=True)

Add back self.subject_id_column and self.column_names to the data matrix to convert it into a data frame

Output format:

subject_id Acidobacteriota_35 Actinobacteriota_1 Actinobacteriota_2
1 nan A nan
10 nan A nan
11 nan A nan
12 nan D nan
14 nan A nan

Args

matrix : np.ndarray
2D matrix of either label strings or numeric values
add_subject_id : bool, optional
whether to add back the cached subject_id column. Defaults to True.

Returns

pandas.DataFrame
see format above
Expand source code
def add_meta_to_matrix(self, matrix, add_subject_id=True):
    """Wrap a data matrix into a data frame by adding `self.column_names` and, optionally, `self.subject_id_column`

    Output format:

    |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
    |-------------:|---------------------:|:---------------------|---------------------:|
    |            1 |                  nan | A                    |                  nan |
    |           10 |                  nan | A                    |                  nan |
    |           11 |                  nan | A                    |                  nan |
    |           12 |                  nan | D                    |                  nan |
    |           14 |                  nan | A                    |                  nan |

    Args:
        matrix (np.ndarray): 2D matrix of either label strings or numeric values
        add_subject_id (bool, optional): whether to add back the cached subject_id column. Defaults to True.

    Returns:
        pandas.DataFrame: see format above
    """
    frame = pd.DataFrame(matrix, columns=self.column_names)
    if not add_subject_id:
        return frame
    # the cached subject ids go first; concat pairs rows by index label
    return pd.concat([self.subject_id_column, frame], axis=1)
def apply_random_forest_regressor(self, data)

Apply the trained biome regressor on the data to reduce the conversion distortion resulting from quantization-dequantization. If the data frame has columns other than variable, week, value, the extra columns will be averaged over using self.compute_average_df

Data formats for the input and the output are the same:

variable week value
Actinobacteriota 27 0.36665
Bacteroidota 27 0.507248
Campilobacterota 27 0.002032

Args

data : pandas.DataFrame
see format above

Raises

Exception
self.random_forest_dict hasn't been populated. You need to call self.fit_random_forest first

Returns

pandas.DataFrame
with columns variable, week, value
Expand source code
def apply_random_forest_regressor(self, data):
    """Apply the trained biome regressors to the data to reduce the conversion distortion resulting from quantization-dequantization. The data is first averaged per `variable, week` with `self.compute_average_df`, so any other columns are dropped

    Data formats for the input and the output are the same:

    | variable         |   week |    value |
    |:-----------------|-------:|---------:|
    | Actinobacteriota |     27 | 0.36665  |
    | Bacteroidota     |     27 | 0.507248 |
    | Campilobacterota |     27 | 0.002032 |

    Args:
        data (pandas.DataFrame): see format above

    Raises:
        Exception: `self.random_forest_dict` hasn't been populated (call `self.fit_random_forest` first), or the averaged values of a biome contain NaN
        KeyError: a biome in the data has no entry in `self.random_forest_dict`

    Returns:
        pandas.DataFrame: with columns `variable, week, value`; empty (same columns) when the input has no rows
    """
    if not self.random_forest_dict:
        raise Exception('No random forest models. First train with fit_random_forest')
    avg_data = self.compute_average_df(data)
    dataframes = []
    for biome in avg_data.variable.unique():
        # select this biome's rows once and reuse them for values and weeks
        biome_rows = avg_data[avg_data.variable == biome]
        x = biome_rows.value
        if x.isnull().any():
            raise Exception('There are NaNs in the inputs. Please run the forecaster to fill in all the NaNs first')
        model = self.random_forest_dict[biome]
        # the regressors take a single feature column
        pred = model.predict(x.to_numpy()[:, np.newaxis])
        dataframes.append(pd.DataFrame({
            'variable': biome,
            'week': biome_rows.week,
            'value': pred
        }))
    if not dataframes:
        # pd.concat rejects an empty list; return an empty, well-formed frame
        return pd.DataFrame(columns=['variable', 'week', 'value'])
    return pd.concat(dataframes)
def compute_average_df(self, df)

Take the average over the input data frame by grouping by variable, week

Input data format:

sample_id subject_id variable week value
MBSMPL0020-6-10 1 Actinobacteriota 27 0.36665
MBSMPL0020-6-10 1 Bacteroidota 27 0.507248
MBSMPL0020-6-10 1 Campilobacterota 27 0.002032
MBSMPL0020-6-10 1 Desulfobacterota 27 0.005058
MBSMPL0020-6-10 1 Firmicutes 27 0.057767

Output data format:

variable week value
Actinobacteriota 27 0.36665
Bacteroidota 27 0.507248
Campilobacterota 27 0.002032

Args

df : pandas.DataFrame
see format above

Returns

pandas.DataFrame
the average data frame, see format above
Expand source code
def compute_average_df(self, df):
    """Average `value` over every `variable, week` group of the input data frame

    Input data format:

    | sample_id       |   subject_id | variable         |   week |    value |
    |:----------------|-------------:|:-----------------|-------:|---------:|
    | MBSMPL0020-6-10 |            1 | Actinobacteriota |     27 | 0.36665  |
    | MBSMPL0020-6-10 |            1 | Bacteroidota     |     27 | 0.507248 |
    | MBSMPL0020-6-10 |            1 | Campilobacterota |     27 | 0.002032 |
    | MBSMPL0020-6-10 |            1 | Desulfobacterota |     27 | 0.005058 |
    | MBSMPL0020-6-10 |            1 | Firmicutes       |     27 | 0.057767 |

    Output data format:

    | variable         |   week |    value |
    |:-----------------|-------:|---------:|
    | Actinobacteriota |     27 | 0.36665  |
    | Bacteroidota     |     27 | 0.507248 |
    | Campilobacterota |     27 | 0.002032 |

    Args:
        df (pandas.DataFrame): see format above

    Returns:
        pandas.DataFrame: the average data frame, see format above
    """
    group_keys = ['variable', 'week']
    # all other columns (sample_id, subject_id, ...) are dropped before grouping
    relevant = df[group_keys + ['value']]
    grouped = relevant.groupby(by=group_keys)
    return grouped.mean().reset_index()
def dequantize_label(self, label, bin_arr)

Dequantize a label string into a numeric value. This function is the inverse of self.quantize_value. If the input is an empty string, the return value will be numpy.nan

Args

label : str
label string
bin_arr : numpy.ndarray
bins produced by pandas.cut or retrieved using self.get_bin_array_of_index

Returns

float
the dequantized numeric value
Expand source code
def dequantize_label(self, label, bin_arr):
    """Dequantize a label string into a numeric value. This function is the inverse of `self.quantize_value`.

    Any input that is not one of the keys of `self.labels` (the empty
    string, the text `'nan'`, any float NaN, `None`, an unknown string)
    yields `numpy.nan`.

    Args:
        label (str): label string
        bin_arr (numpy.ndarray): bins produced by `pandas.cut` or retrieved using `self.get_bin_array_of_index`

    Returns:
        float: the midpoint of the bin the label stands for, or `numpy.nan`
    """
    # the type check comes first: an identity test against np.nan misses NaN
    # objects that are not that singleton, and non-strings have no bin anyway
    if not isinstance(label, str) or label not in self.labels:
        return np.nan
    low = self.labels[label]
    # label i covers the interval between bin edges i and i + 1
    return (bin_arr[low] + bin_arr[low + 1]) / 2
def dequantize_sequence(self, label_seq)

Dequantize an entire label sequence

Args

label_seq : numpy.ndarray
1D array of label strings

Returns

numpy.ndarray
1D array of floats
Expand source code
def dequantize_sequence(self, label_seq):
    """Dequantize a whole label sequence, using the bins of the column at each position

    Args:
        label_seq (numpy.ndarray): 1D array of label strings

    Returns:
        numpy.ndarray: 1D array of floats
    """
    numeric_seq = np.empty(label_seq.shape)
    # position i in the sequence uses the bins of self.column_names[i]
    numeric_seq[:] = [
        self.dequantize_label(label, self.get_bin_array_of_index(position))
        for position, label in enumerate(label_seq)
    ]
    return numeric_seq
def dequantize_to_df(self, matrix)

Dequantize a label matrix (with no column names, just the qnet input matrix) into a data frame with numeric values. To make the output data frame into a format seaborn can easily plot, apply self.melt_into_plot_format

Output format:

subject_id Acidobacteriota_35 Actinobacteriota_1 Actinobacteriota_2
1 nan 0.36665 nan
10 nan 0.36665 nan
11 nan 0.36665 nan

Args

matrix : numpy.ndarray
2D matrix of label strings

Returns

pandas.DataFrame
see format above
Expand source code
def dequantize_to_df(self, matrix):
    """Turn a label matrix (the bare qnet input matrix, no column names) into a numeric data frame. Apply `self.melt_into_plot_format` to the result to get a format `seaborn` can easily plot

    Output format:

    |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
    |-------------:|---------------------:|:---------------------|---------------------:|
    |            1 |                  nan | 0.36665              |                  nan |
    |           10 |                  nan | 0.36665              |                  nan |
    |           11 |                  nan | 0.36665              |                  nan |

    Args:
        matrix (numpy.ndarray): 2D matrix of label strings

    Returns:
        pandas.DataFrame: see format above
    """
    numeric_matrix = np.empty(matrix.shape)
    # each row is one label sequence; write its numeric values in place
    for numeric_row, label_row in zip(numeric_matrix, matrix):
        numeric_row[:] = self.dequantize_sequence(label_row)
    return self.add_meta_to_matrix(numeric_matrix)
def fit_random_forest(self, data, dequantized_data)

Fit a random forest regressor for each biome. The average of the dequantized data is used as the input, and the regressor is fit to the average of the original data as the output. Populates self.random_forest_dict.

Input format for both data frames:

subject_id variable week value
1 Actinobacteriota 27 0.36665
1 Bacteroidota 27 0.507248
1 Campilobacterota 27 0.002032

Args

data : pandas.DataFrame
see format above
dequantized_data : pandas.DataFrame
see format above
Expand source code
def fit_random_forest(self, data, dequantized_data):
    """Fit one random forest regressor per biome and store them in `self.random_forest_dict`

    The regressor input is the `variable, week` average of the dequantized
    data; the target is the same average of the original data. Nothing is
    done when `self.random_forest_dict` is already populated.

    Input format for both data frames:

    |   subject_id | variable         |   week |    value |
    |-------------:|:-----------------|-------:|---------:|
    |            1 | Actinobacteriota |     27 | 0.36665  |
    |            1 | Bacteroidota     |     27 | 0.507248 |
    |            1 | Campilobacterota |     27 | 0.002032 |

    Args:
        data (pandas.DataFrame): see format above
        dequantized_data (pandas.DataFrame): see format above
    """
    if self.random_forest_dict:
        # models are already present; keep them
        return

    # the models map dequantized values back to original values,
    # so the dequantized averages are the inputs
    averaged_inputs = self.compute_average_df(dequantized_data)
    averaged_targets = self.compute_average_df(data)

    for biome in averaged_inputs.variable.unique():
        input_rows = averaged_inputs.variable == biome
        target_rows = averaged_targets.variable == biome
        self.random_forest_dict[biome] = self._fit_random_forest_one_biome(
            averaged_inputs[input_rows].value,
            averaged_targets[target_rows].value)
def get_bin_array_of_index(self, idx)

Return the pandas.cut bin array corresponding to the sequence index by looking up self.variable_bin_map[self.column_names[idx]]

Args

idx : int
index into self.column_names

Returns

numpy.ndarray
bins
Expand source code
def get_bin_array_of_index(self, idx):
    """Look up the bin array of the column at position `idx`, i.e. `self.variable_bin_map[self.column_names[idx]]`

    Args:
        idx (int): position in `self.column_names`

    Returns:
        numpy.ndarray: the bins stored for that column
    """
    return self.variable_bin_map[self.column_names[idx]]
def get_qnet_inputs(self, quantized_df)

Retrieve the feature names and data matrix from a quantized data frame produced by self.quantize_df

Args

quantized_df : pandas.DataFrame
a quantized data frame produced by self.quantize_df

Returns

list
a list of feature names, ex. ['Acidobacteriota_35', 'Actinobacteriota_1', 'Actinobacteriota_2']
numpy.ndarray
a 2D data array of quantized labels ('A', 'B', ..., or empty string '' for NaN)
Expand source code
def get_qnet_inputs(self, quantized_df):
    """Split a quantized data frame into its feature names and its label matrix

    Args:
        quantized_df (pandas.DataFrame): a quantized data frame produced by `self.quantize_df`; its `subject_id` column is dropped here

    Returns:
        pandas.Index: the feature names, ex. `['Acidobacteriota_35', 'Actinobacteriota_1', 'Actinobacteriota_2']`
        numpy.ndarray: a 2D string array of quantized labels (`'A', 'B', ...,` or empty string `''` for NaN)
    """
    features = quantized_df.drop(columns='subject_id')

    # after the string cast a missing entry may show up as the text 'nan'
    # or may still be a real missing value; both are mapped to ''
    as_text = features.astype(str)
    as_text = as_text.replace('nan', '')
    as_text = as_text.fillna('')
    matrix = as_text.to_numpy(dtype=str)

    # sanity check only: anything other than '' or a label string is
    # reported with a warning, it does not abort the call
    allowed = [*self.labels.keys(), '']
    distinct = np.unique(matrix)
    if not np.isin(distinct, allowed).all():
        import warnings
        warnings.warn('The label matrix contains strings that are neither the empty string nor the label strings')
    return features.columns, matrix
def load_quantizer_states(self, in_fname)

Load in self.column_names, self.subject_id_column, self.variable_bin_map, self.random_forest_dict from file

Args

in_fname : str
input file name
Expand source code
def load_quantizer_states(self, in_fname):
    """Restore `self.column_names, self.subject_id_column, self.variable_bin_map, self.random_forest_dict` from a pickle file

    Args:
        in_fname (str): input file name
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust
    with open(in_fname, 'rb') as handle:
        saved = pickle.load(handle)
    restored_attributes = (
        'column_names',
        'subject_id_column',
        'variable_bin_map',
        'random_forest_dict',
    )
    for attribute in restored_attributes:
        setattr(self, attribute, saved[attribute])
def melt_into_plot_format(self, data)

Melt data into a format that seaborn can easily plot

Input format:

subject_id Acidobacteriota_35 Actinobacteriota_1 Actinobacteriota_2
1 nan 0.36665 nan
10 nan 0.36665 nan
11 nan 0.36665 nan

Output format:

subject_id variable week value
1 Actinobacteriota 27 0.36665
1 Bacteroidota 27 0.507248
1 Campilobacterota 27 0.002032

Args

data : pandas.DataFrame
numeric data, see format above

Returns

pandas.DataFrame
see format above
Expand source code
def melt_into_plot_format(self, data):
    """Reshape wide `{biome}_{week}` columns into long rows that `seaborn` can easily plot

    Input format:

    |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
    |-------------:|---------------------:|:---------------------|---------------------:|
    |            1 |                  nan | 0.36665              |                  nan |
    |           10 |                  nan | 0.36665              |                  nan |
    |           11 |                  nan | 0.36665              |                  nan |

    Output format:

    |   subject_id | variable         |   week |    value |
    |-------------:|:-----------------|-------:|---------:|
    |            1 | Actinobacteriota |     27 | 0.36665  |
    |            1 | Bacteroidota     |     27 | 0.507248 |
    |            1 | Campilobacterota |     27 | 0.002032 |

    Args:
        data (pandas.DataFrame): numeric data, see format above

    Returns:
        pandas.DataFrame: see format above
    """
    long_form = data.melt(id_vars='subject_id')
    # the first group is greedy, so a column name is split at its LAST
    # `_<digits>` into the biome name and the week
    name_parts = long_form.variable.str.extract(r'([\D|\d]+)_(\d+)', expand=True)
    name_parts = name_parts.rename(columns={0: 'variable', 1: 'week'})
    name_parts['week'] = name_parts['week'].astype(int)
    return pd.concat(
        [long_form.subject_id, name_parts, long_form.value], axis=1)
def pivot_into_quantize_format(self, data)

Pivot the data into a format the quantizer can quantize

Input data format, produced by DataFormatter.load_data:

sample_id subject_id variable week value
MBSMPL0020-6-10 1 Actinobacteriota 27 0.36665
MBSMPL0020-6-10 1 Bacteroidota 27 0.507248
MBSMPL0020-6-10 1 Campilobacterota 27 0.002032
MBSMPL0020-6-10 1 Desulfobacterota 27 0.005058
MBSMPL0020-6-10 1 Firmicutes 27 0.057767

Output format:

subject_id Acidobacteriota_35 Actinobacteriota_1 Actinobacteriota_2
1 nan 0.36665 nan
10 nan 0.36665 nan
11 nan 0.36665 nan

Args

data : pandas.DataFrame
see format above

Returns

pandas.DataFrame
see format above
Expand source code
def pivot_into_quantize_format(self, data):
    """Reshape long-format records into the wide layout the quantizer works on

    Each `(variable, week)` pair becomes one `{variable}_{week}` column and
    each subject becomes one row. Repeated measurements of the same cell are
    averaged (the `pivot_table` default aggregation), and combinations with no
    measurement are left as nan.

    Input (long format, as produced by `DataFormatter.load_data`):

    | sample_id       |   subject_id | variable         |   week |    value |
    |:----------------|-------------:|:-----------------|-------:|---------:|
    | MBSMPL0020-6-10 |            1 | Actinobacteriota |     27 | 0.36665  |
    | MBSMPL0020-6-10 |            1 | Bacteroidota     |     27 | 0.507248 |
    | MBSMPL0020-6-10 |            1 | Campilobacterota |     27 | 0.002032 |

    Output (wide format):

    |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
    |-------------:|---------------------:|:---------------------|---------------------:|
    |            1 |                  nan | 0.36665              |                  nan |
    |           10 |                  nan | 0.36665              |                  nan |
    |           11 |                  nan | 0.36665              |                  nan |

    Args:
        data (pandas.DataFrame): long-format frame with at least the
            `subject_id`, `variable`, `week` and `value` columns

    Returns:
        pandas.DataFrame: wide-format frame, one row per subject
    """
    # this layout is private to the quantizer, which is why it is built here
    # rather than in DataFormatter
    column_key = (data.variable + '_' + data.week.astype(str)).rename('variable')
    long_form = pd.concat([data.subject_id, column_key, data.value], axis=1)

    # dropna=False keeps columns even when every entry in them is nan
    wide = long_form.pivot_table(
        index='subject_id', columns='variable', dropna=False)
    return wide['value'].reset_index()
def quantize_df(self, data)

This function must be called before calling any of the dequantization procedures. It populates self.column_names, self.subject_id_column, self.variable_bin_map

Input data format, produced by DataFormatter.load_data:

sample_id subject_id variable week value
MBSMPL0020-6-10 1 Actinobacteriota 27 0.36665
MBSMPL0020-6-10 1 Bacteroidota 27 0.507248
MBSMPL0020-6-10 1 Campilobacterota 27 0.002032
MBSMPL0020-6-10 1 Desulfobacterota 27 0.005058
MBSMPL0020-6-10 1 Firmicutes 27 0.057767

Output data format:

subject_id Acidobacteriota_35 Actinobacteriota_1 Actinobacteriota_2
1 nan A nan
10 nan A nan
11 nan A nan
12 nan D nan
14 nan A nan

Args

data : pandas.DataFrame
see format above

Returns

pandas.DataFrame
see format above
Expand source code
def quantize_df(self, data):
    """Quantize a long-format data frame and remember its layout

    Call this before any of the dequantization procedures: it sets
    `self.column_names` and `self.subject_id_column`, and hands the pivoted
    frame to `self._quantize_df`, whose result is returned.

    Input data format, produced by `DataFormatter.load_data`:

    | sample_id       |   subject_id | variable         |   week |    value |
    |:----------------|-------------:|:-----------------|-------:|---------:|
    | MBSMPL0020-6-10 |            1 | Actinobacteriota |     27 | 0.36665  |
    | MBSMPL0020-6-10 |            1 | Bacteroidota     |     27 | 0.507248 |
    | MBSMPL0020-6-10 |            1 | Campilobacterota |     27 | 0.002032 |

    Output data format:

    |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
    |-------------:|---------------------:|:---------------------|---------------------:|
    |            1 |                  nan | A                    |                  nan |
    |           10 |                  nan | A                    |                  nan |
    |           12 |                  nan | D                    |                  nan |

    Args:
        data (pandas.DataFrame): long-format frame, see above

    Returns:
        pandas.DataFrame: quantized wide-format frame, see above
    """
    wide = self.pivot_into_quantize_format(data)

    # the first column is subject_id; everything after it is a
    # `{biome}_{week}` column
    self.column_names = wide.columns[1:]
    # kept so the subject ids can be re-attached to a dequantized matrix
    self.subject_id_column = wide['subject_id']

    return self._quantize_df(wide)
def quantize_new_subject(self, subject_data, subject_id=None)

Construct and quantize a new subject with missing data

Input format:

subject_id variable week value
1 Actinobacteriota 1 0.36665
1 Bacteroidota 1 0.507248
1 Campilobacterota 1 0.002032
1 Desulfobacterota 1 0.005058
1 Firmicutes 1 0.057767

Output format:

subject_id Acidobacteriota_35 Actinobacteriota_1 Actinobacteriota_2
1 nan A nan
1 nan A nan
1 nan A nan
1 nan D nan

Args

subject_data : pandas.DataFrame
subject data frame with some but maybe not all the timestamps
subject_id : str, optional
if not None, add the subject_id as a column; if None, assume that the input has a column named subject_id. Defaults to None.

Returns

pd.DataFrame
quantized subject data frame with complete timestamps, see format above
Expand source code
def quantize_new_subject(self, subject_data, subject_id=None):
    """Construct and quantize a new subject with missing data

    The subject's records are pivoted with `self.pivot_into_quantize_format`,
    every column listed in `self.column_names` that the pivot did not produce
    is added and filled with nan, and the result is passed to
    `self._quantize_df`. The caller's `subject_data` is never modified.

    Input format:

    |   subject_id | variable         |   week |    value |
    |-------------:|:-----------------|-------:|---------:|
    |            1 | Actinobacteriota |      1 | 0.36665  |
    |            1 | Bacteroidota     |      1 | 0.507248 |
    |            1 | Campilobacterota |      1 | 0.002032 |
    |            1 | Desulfobacterota |      1 | 0.005058 |
    |            1 | Firmicutes       |      1 | 0.057767 |

    Output format:

    |   subject_id |   Acidobacteriota_35 | Actinobacteriota_1   |   Actinobacteriota_2 |
    |-------------:|---------------------:|:---------------------|---------------------:|
    |            1 |                  nan | A                    |                  nan |
    |            1 |                  nan | A                    |                  nan |
    |            1 |                  nan | A                    |                  nan |
    |            1 |                  nan | D                    |                  nan |

    Args:
        subject_data (pandas.DataFrame): subject data frame with some but maybe not all the timestamps
        subject_id (str, optional): if not None, use it as the `subject_id` column (replacing any existing one); if None, assume that the input has a column named subject_id. Defaults to None.

    Returns:
        pd.DataFrame: quantized subject data frame with complete timestamps, see format above

    Raises:
        ValueError: if `subject_id` is None and `subject_data` has no
            `subject_id` column
    """
    if subject_id is None and 'subject_id' not in subject_data.columns:
        # ValueError is a subclass of Exception, so callers that catch
        # Exception keep working
        raise ValueError('You must provide a subject_id if there is none in the input data frame')

    if subject_id is not None:
        # assign() returns a new frame, so the caller's frame is left
        # untouched and no SettingWithCopyWarning is raised when a slice of a
        # larger frame is passed in
        subject_data = subject_data.assign(subject_id=subject_id)

    new_subject = self.pivot_into_quantize_format(subject_data)
    # add columns that are in self.column_names but not in pivoted as np.nan
    for column in self.column_names:
        if column not in new_subject.columns:
            new_subject[column] = np.nan

    return self._quantize_df(new_subject)
def quantize_value(self, val, bin_arr)

Quantize a numeric value into a label. This function is the inverse of self.dequantize_label

Args

val : float
number to quantize
bin_arr : numpy.ndarray
bins produced by pandas.cut or retrieved using self.get_bin_array_of_index

Returns

str
quantized label string
Expand source code
def quantize_value(self, val, bin_arr):
    """Map a single number to its quantization label. Inverse of `self.dequantize_label`

    Args:
        val (float): the number to place into a bin
        bin_arr (numpy.ndarray): bin edges, as produced by `pandas.cut` or retrieved using `self.get_bin_array_of_index`

    Returns:
        str: label of the bin that contains `val`
    """
    label_names = list(self.labels)
    # pandas.cut works on array-likes, so wrap the scalar and unwrap the result
    binned = pd.cut([val], bin_arr, labels=label_names)
    return binned[0]
def save_quantizer_states(self, out_fname)

Save self.column_names, self.subject_id_column, self.variable_bin_map, self.random_forest_dict. Call this after calling self.quantize_df

Args

out_fname : str
output file name
Expand source code
def save_quantizer_states(self, out_fname):
    """Pickle the quantizer state to a file. Call this after calling `self.quantize_df`

    The file holds a dict with the keys `column_names`, `subject_id_column`,
    `variable_bin_map` and `random_forest_dict`, each mapped to the attribute
    of the same name on `self`.

    Args:
        out_fname (str): output file name
    """
    persisted_attrs = (
        'column_names',
        'subject_id_column',
        'variable_bin_map',
        'random_forest_dict',
    )
    snapshot = {attr: getattr(self, attr) for attr in persisted_attrs}
    with open(out_fname, 'wb') as sink:
        pickle.dump(snapshot, sink, protocol=pickle.HIGHEST_PROTOCOL)