Module qbiome.data_formatter

import pandas as pd
import numpy as np

class DataFormatter:
    """Parse raw data into usable format by the Quasinet
    """

    def __init__(self):
        pass

    def load_data(self, fpath_data, fpath_meta, taxon_name='Phylum', tax_dict=None,
                  time_column_name='Age (days)', time_column_name_out='day',
                  k_years=2, k_biomes=15):
        """Parse and join the data CSV and the metadata CSV

        Output format:

        | sample_id       |   subject_id | variable         |   week |    value |
        |:----------------|-------------:|:-----------------|-------:|---------:|
        | MBSMPL0020-6-10 |            1 | Actinobacteriota |     27 | 0.36665  |
        | MBSMPL0020-6-10 |            1 | Bacteroidota     |     27 | 0.507248 |
        | MBSMPL0020-6-10 |            1 | Campilobacterota |     27 | 0.002032 |
        | MBSMPL0020-6-10 |            1 | Desulfobacterota |     27 | 0.005058 |
        | MBSMPL0020-6-10 |            1 | Firmicutes       |     27 | 0.057767 |

        Args:
            fpath_data (str): file path for the data CSV
            fpath_meta (str): file path for the metadata CSV
            taxon_name (str, optional): name of the taxon column exactly as in the data CSV. This is the base taxonomic level for qnet construction. Defaults to 'Phylum'.
            tax_dict (dict, optional): maps a taxonomic-level column name (e.g. 'Class') to a list of entity names to resolve at that level instead of taxon_name. Caution: no validation is performed. Defaults to None (no deviations).
            time_column_name (str, optional): name of the timestamp column exactly as in the metadata CSV. Defaults to 'Age (days)'.
            time_column_name_out (str, optional): name of the timestamp column in the return data frame. Defaults to 'day'.
            k_years (int, optional): in the return data frame, we keep timestamps up to the number of years specified. Defaults to 2.
            k_biomes (int, optional): in the return data frame, we keep the k biomes with the most measurements; the rest are relabeled 'unclassified_Bacteria'. Defaults to 15.

        Returns:
            pandas.DataFrame: parsed, cleaned data frame, see format above
        """
        taxa_raw = pd.read_csv(fpath_data)
        meta_raw = pd.read_csv(fpath_meta)
        if tax_dict is None:
            tax_dict = {}
        taxa_sum = self._sum_taxon(taxa_raw, taxon_name, tax_dict)
        meta = self._parse_meta(meta_raw, time_column_name, time_column_name_out)
        data = self._join_data_meta(taxa_sum, meta, time_column_name_out)

        # depending on the unit of the timestamp in the original data,
        # it may be necessary to cut out days beyond 2 or more years
        # and to convert days to weeks
        if k_years is not None:
            data = self._cut_after_k_years(data, k_years)
        data = self._convert_days_to_weeks(data)

        if k_biomes is not None:
            data = self._use_top_k_biomes(data, k_biomes)

        return data

    def load_meta(self, fpath_meta, property_name='Antibiotic exposure',
    property_column_name_out='antibiotic'):
        """Return a mapping between sample_id, subject_id and meta data (ex. use antibiotics or not) in a data frame

        Output format:

        | sample_id        | antibiotic   |   subject_id |
        |:-----------------|:-------------|-------------:|
        | MBSMPL0020-6-1   | No           |            1 |
        | MBSMPL0020-6-10  | Yes          |            1 |
        | MBSMPL0020-6-100 | No           |            5 |
        | MBSMPL0020-6-101 | No           |            5 |
        | MBSMPL0020-6-102 | No           |            5 |

        Args:
            fpath_meta (str): file path for the metadata CSV
            property_name (str, optional): name of the metadata property exactly as in the metadata CSV's Property column. Defaults to 'Antibiotic exposure'.
            property_column_name_out (str, optional): name of the metadata column in the return data frame. Defaults to 'antibiotic'.

        Returns:
            pandas.DataFrame: sample_id and subject_id to metadata mapping, see format above
        """
        meta_raw = pd.read_csv(fpath_meta)
        meta = meta_raw[['Sample ID', 'Property', 'Value']]

        meta_property = meta[meta['Property'] == property_name].drop(columns='Property')
        meta_property.columns = ['sample_id', property_column_name_out]

        meta_subject_id = meta[meta['Property'] == 'Subject ID'].drop(columns='Property')
        meta_subject_id.columns = ['sample_id', 'subject_id']

        sample_id_property = pd.merge(meta_property, meta_subject_id, on='sample_id', how='outer')
        # make sure subject_id are strings
        sample_id_property.subject_id = sample_id_property.subject_id.astype(str)

        return sample_id_property

    def pivot_into_column_format(self, data):
        """Pivot the input data frame from this format:

        | sample_id       |   subject_id | variable         |   week |    value |
        |:----------------|-------------:|:-----------------|-------:|---------:|
        | MBSMPL0020-6-10 |            1 | Actinobacteriota |     27 | 0.36665  |
        | MBSMPL0020-6-10 |            1 | Bacteroidota     |     27 | 0.507248 |
        | MBSMPL0020-6-10 |            1 | Campilobacterota |     27 | 0.002032 |
        | MBSMPL0020-6-10 |            1 | Desulfobacterota |     27 | 0.005058 |
        | MBSMPL0020-6-10 |            1 | Firmicutes       |     27 | 0.057767 |

        Into this format where each column is a biome:

        | sample_id         |   week |   Acidobacteriota |   Actinobacteriota |   Bacteroidota |
        |:------------------|-------:|------------------:|-------------------:|---------------:|
        | MBSMPL0020-6-421  |      1 |               nan |           0.011904 |       0.043808 |
        | MBSMPL0020-6-777  |      1 |               nan |           9.8e-05  |       0.000686 |
        | MBSMPL0020-6-1123 |      1 |               nan |           0.005603 |       0.201417 |
        | MBSMPL0020-6-1191 |      1 |               nan |           0.002578 |       0.368164 |
        | MBSMPL0020-6-263  |      1 |               nan |           0.004344 |       0.000381 |

        Args:
            data (pandas.DataFrame): see format above

        Returns:
            pandas.DataFrame: see format above
        """
        # keep sample_id in here for later cohort identification
        pivoted = data.pivot_table(
            index=['sample_id', 'week'], columns=['variable'])['value'].reset_index()
        pivoted.sort_values(by=['week'], inplace=True)
        pivoted.reset_index(drop=True, inplace=True)
        return pivoted

    def melt_into_plot_format(self, data):
        """Melt the data into a format `seaborn` can plot easily
        From format:

        | sample_id         |   week |   Acidobacteriota |   Actinobacteriota |   Bacteroidota |
        |:------------------|-------:|------------------:|-------------------:|---------------:|
        | MBSMPL0020-6-421  |      1 |               nan |           0.011904 |       0.043808 |
        | MBSMPL0020-6-777  |      1 |               nan |           9.8e-05  |       0.000686 |
        | MBSMPL0020-6-1123 |      1 |               nan |           0.005603 |       0.201417 |
        | MBSMPL0020-6-1191 |      1 |               nan |           0.002578 |       0.368164 |
        | MBSMPL0020-6-263  |      1 |               nan |           0.004344 |       0.000381 |

        Into format:

        | sample_id         |   week | variable        |   value |
        |:------------------|-------:|:----------------|--------:|
        | MBSMPL0020-6-421  |      1 | Acidobacteriota |     nan |
        | MBSMPL0020-6-777  |      1 | Acidobacteriota |     nan |
        | MBSMPL0020-6-1123 |      1 | Acidobacteriota |     nan |
        | MBSMPL0020-6-1191 |      1 | Acidobacteriota |     nan |
        | MBSMPL0020-6-263  |      1 | Acidobacteriota |     nan |

        Args:
            data (pandas.DataFrame): see format above

        Returns:
            pandas.DataFrame: see format above
        """
        melted = data.melt(id_vars=['sample_id', 'week'])
        return melted

    def _sum_taxon(self, taxa_raw, taxon_name, tax_dict):
        # start from the base taxonomic level, then override selected
        # entities with the level requested in tax_dict
        taxa_raw['tmp'] = taxa_raw[taxon_name]
        for level, entities in tax_dict.items():
            taxa_raw['tmp'] = np.where(
                taxa_raw[level].isin(entities), taxa_raw[level], taxa_raw['tmp'])
        taxa = taxa_raw[['Sample ID', 'tmp', 'Relative Abundance']]
        # sum relative abundances within each (sample, taxon) group
        taxa_sum = taxa.groupby(by=['Sample ID', 'tmp']).sum()

        taxa_sum.reset_index(inplace=True)
        taxa_sum.columns = ['sample_id', 'variable', 'value']
        print('There are {} unique biomes and {} unique samples'.format(
            len(taxa_sum.variable.unique()), len(taxa_sum.sample_id.unique())))
        return taxa_sum

    def _parse_meta(self, meta_raw, time_column_name, time_column_name_out):
        meta = meta_raw[['Sample ID', 'Property', 'Value']]

        meta_timestamp = meta[meta['Property'] == time_column_name].drop(columns='Property')
        meta_timestamp.columns = ['sample_id', time_column_name_out]

        meta_subject_id = meta[meta['Property'] == 'Subject ID'].drop(columns='Property')
        meta_subject_id.columns = ['sample_id', 'subject_id']

        meta = pd.merge(meta_timestamp, meta_subject_id, on='sample_id')
        return meta

    def _join_data_meta(self, data, meta, time_column_name):
        merged = pd.merge(data, meta, how='outer', on='sample_id')
        merged.columns = ['sample_id', 'variable', 'value', time_column_name, 'subject_id']
        merged.dropna(inplace=True)
        # coerce timestamps to numeric first so unparseable values become
        # NaN and can be dropped before the integer cast
        merged[time_column_name] = pd.to_numeric(
            merged[time_column_name], downcast='integer', errors='coerce')
        merged.dropna(subset=[time_column_name], inplace=True)
        merged[time_column_name] = merged[time_column_name].astype(int)
        # keep strictly positive days only
        merged = merged[merged[time_column_name] > 0]

        print('There are {} unique {}s'.format(
            len(merged[time_column_name].unique()), time_column_name))
        return merged

    def _cut_after_k_years(self, data, k_years):
        # keep only timestamps within the first k_years (365 days per year)
        return data[data.day < 365 * k_years]

    def _convert_days_to_weeks(self, data):
        # 7-day bin edges spanning the observed range; the first edge sits
        # just below the earliest day so every observation falls in a bin
        weeks = range(data.day.min() - 1, data.day.max() + 8, 7)
        print('There are {} unique weeks'.format(len(weeks) - 1))
        data = pd.concat([
            data.sample_id,
            data.subject_id,
            data.variable,
            pd.cut(pd.Series(data.day), bins=weeks, labels=range(1, len(weeks))),
            data.value
        ], axis=1)
        data.columns = ['sample_id', 'subject_id', 'variable', 'week', 'value']
        data.week = data.week.astype(int)
        return data

    def _use_top_k_biomes(self, data, k_biomes):
        """
        Everything except top k is labeled 'unclassified_Bacteria'
        """
        # "top k" is by number of measurements, not by total abundance
        biome_measurement_counts = data.variable.value_counts()
        top_k = biome_measurement_counts.nlargest(k_biomes).index
        data.loc[~data.variable.isin(top_k), 'variable'] = 'unclassified_Bacteria'
        return data

Classes

class DataFormatter

Parse raw data into a format usable by the Quasinet
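
A minimal end-to-end sketch (the CSV paths are hypothetical placeholders):

from qbiome.data_formatter import DataFormatter

formatter = DataFormatter()
# parse and join the data and metadata CSVs
data = formatter.load_data('data/taxa.csv', 'data/meta.csv')
# wide format (one column per biome) and long format for plotting
wide = formatter.pivot_into_column_format(data)
melted = formatter.melt_into_plot_format(wide)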


Methods

def load_data(self, fpath_data, fpath_meta, taxon_name='Phylum', tax_dict=None, time_column_name='Age (days)', time_column_name_out='day', k_years=2, k_biomes=15)

Parse and join the data CSV and the metadata CSV

Output format:

| sample_id       |   subject_id | variable         |   week |    value |
|:----------------|-------------:|:-----------------|-------:|---------:|
| MBSMPL0020-6-10 |            1 | Actinobacteriota |     27 | 0.36665  |
| MBSMPL0020-6-10 |            1 | Bacteroidota     |     27 | 0.507248 |
| MBSMPL0020-6-10 |            1 | Campilobacterota |     27 | 0.002032 |
| MBSMPL0020-6-10 |            1 | Desulfobacterota |     27 | 0.005058 |
| MBSMPL0020-6-10 |            1 | Firmicutes       |     27 | 0.057767 |

Args

fpath_data : str
file path for the data CSV
fpath_meta : str
file path for the metadata CSV
taxon_name : str, optional
name of the taxon column exactly as in the data CSV. This is the base taxonomic level for qnet construction. Defaults to 'Phylum'.
tax_dict : dict, optional
maps a taxonomic-level column name (e.g. 'Class') to a list of entity names to resolve at that level instead of taxon_name. Caution: no validation is performed. Defaults to None (no deviations).
time_column_name : str, optional
name of the timestamp column exactly as in the metadata CSV. Defaults to 'Age (days)'.
time_column_name_out : str, optional
name of the timestamp column in the return data frame. Defaults to 'day'.
k_years : int, optional
in the return data frame, we keep timestamps up to the number of years specified. Defaults to 2.
k_biomes : int, optional
in the return data frame, we keep the k biomes with the most measurements; the rest are relabeled 'unclassified_Bacteria'. Defaults to 15.

Returns

pandas.DataFrame
parsed, cleaned data frame, see format above
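
A hedged usage sketch; the paths and the 'Lactobacillaceae' entry are hypothetical. Note that tax_dict values must be list-like, since they are matched with pandas' isin:

formatter = DataFormatter()
# base level is Phylum, but resolve one family at the Family level
data = formatter.load_data(
    'data/taxa.csv', 'data/meta.csv',
    taxon_name='Phylum',
    tax_dict={'Family': ['Lactobacillaceae']},  # hypothetical entry
    k_years=2, k_biomes=15)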
def load_meta(self, fpath_meta, property_name='Antibiotic exposure', property_column_name_out='antibiotic')

Return a data frame mapping sample_id and subject_id to a metadata property (e.g. antibiotic exposure)

Output format:

| sample_id        | antibiotic   |   subject_id |
|:-----------------|:-------------|-------------:|
| MBSMPL0020-6-1   | No           |            1 |
| MBSMPL0020-6-10  | Yes          |            1 |
| MBSMPL0020-6-100 | No           |            5 |
| MBSMPL0020-6-101 | No           |            5 |
| MBSMPL0020-6-102 | No           |            5 |

Args

fpath_meta : str
file path for the metadata CSV
property_name : str, optional
name of the metadata property exactly as in the metadata CSV's Property column. Defaults to 'Antibiotic exposure'.
property_column_name_out : str, optional
name of the metadata column in the return data frame. Defaults to 'antibiotic'.

Returns

pandas.DataFrame
sample_id and subject_id to metadata mapping, see format above
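
A short sketch, assuming the same hypothetical metadata CSV as above:

formatter = DataFormatter()
meta = formatter.load_meta('data/meta.csv')
# e.g. a quick sample_id -> antibiotic lookup
antibiotic_by_sample = dict(zip(meta.sample_id, meta.antibiotic))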
def melt_into_plot_format(self, data)

Melt the data into a format seaborn can plot easily.

From format:

| sample_id         |   week |   Acidobacteriota |   Actinobacteriota |   Bacteroidota |
|:------------------|-------:|------------------:|-------------------:|---------------:|
| MBSMPL0020-6-421  |      1 |               nan |           0.011904 |       0.043808 |
| MBSMPL0020-6-777  |      1 |               nan |           9.8e-05  |       0.000686 |
| MBSMPL0020-6-1123 |      1 |               nan |           0.005603 |       0.201417 |
| MBSMPL0020-6-1191 |      1 |               nan |           0.002578 |       0.368164 |
| MBSMPL0020-6-263  |      1 |               nan |           0.004344 |       0.000381 |

Into format:

| sample_id         |   week | variable        |   value |
|:------------------|-------:|:----------------|--------:|
| MBSMPL0020-6-421  |      1 | Acidobacteriota |     nan |
| MBSMPL0020-6-777  |      1 | Acidobacteriota |     nan |
| MBSMPL0020-6-1123 |      1 | Acidobacteriota |     nan |
| MBSMPL0020-6-1191 |      1 | Acidobacteriota |     nan |
| MBSMPL0020-6-263  |      1 | Acidobacteriota |     nan |

Args

data : pandas.DataFrame
see format above

Returns

pandas.DataFrame
see format above
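
A plotting sketch, assuming seaborn and matplotlib are installed and wide holds the column-format frame from pivot_into_column_format:

import seaborn as sns
import matplotlib.pyplot as plt

melted = formatter.melt_into_plot_format(wide)
# one line per biome: relative abundance over time
sns.lineplot(data=melted, x='week', y='value', hue='variable')
plt.show()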
def pivot_into_column_format(self, data)

Pivot the input data frame from this format:

| sample_id       |   subject_id | variable         |   week |    value |
|:----------------|-------------:|:-----------------|-------:|---------:|
| MBSMPL0020-6-10 |            1 | Actinobacteriota |     27 | 0.36665  |
| MBSMPL0020-6-10 |            1 | Bacteroidota     |     27 | 0.507248 |
| MBSMPL0020-6-10 |            1 | Campilobacterota |     27 | 0.002032 |
| MBSMPL0020-6-10 |            1 | Desulfobacterota |     27 | 0.005058 |
| MBSMPL0020-6-10 |            1 | Firmicutes       |     27 | 0.057767 |

Into this format where each column is a biome:

| sample_id         |   week |   Acidobacteriota |   Actinobacteriota |   Bacteroidota |
|:------------------|-------:|------------------:|-------------------:|---------------:|
| MBSMPL0020-6-421  |      1 |               nan |           0.011904 |       0.043808 |
| MBSMPL0020-6-777  |      1 |               nan |           9.8e-05  |       0.000686 |
| MBSMPL0020-6-1123 |      1 |               nan |           0.005603 |       0.201417 |
| MBSMPL0020-6-1191 |      1 |               nan |           0.002578 |       0.368164 |
| MBSMPL0020-6-263  |      1 |               nan |           0.004344 |       0.000381 |

Args

data : pandas.DataFrame
see format above

Returns

pandas.DataFrame
see format above
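
A small sketch, assuming data is the long-format frame returned by load_data:

wide = formatter.pivot_into_column_format(data)
# biome columns only, e.g. as input to downstream qnet construction
biome_columns = wide.drop(columns=['sample_id', 'week'])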