Module emergenet.utils

Expand source code
import pandas as pd
from Bio import SeqIO
from quasinet.qnet import Qnet, save_qnet, load_qnet


def parse_fasta(filepath:str) -> pd.DataFrame:
    ''' Parses a GISAID fasta file into a dataframe. Metadata should be in the format: 
    
    `Isolate name|Type|Gene name|Collection date|Protein Accession no.|Isolate ID|Lineage|Clade`

    A record is kept only if its `Type` field starts with `A_/_` and the subtype
    that follows (e.g. `H1N1`) is at least 4 characters long. Records whose header
    has fewer than the five `|`-separated fields read here are skipped instead of
    raising an `IndexError`.

    Parameters
    ----------
    filepath: File path of fasta file
    
    Returns
    -------
    df: Dataframe with columns: `[name, subtype, segment, date, accession, sequence, HA, NA]`.
        `HA` and `NA` look like `H1` and `N1`; they are left missing (NaN) when
        the subtype does not contain an `H<digits>N<digits>` pattern.
    '''
    columns = ['name', 'subtype', 'segment', 'date', 'accession', 'sequence']
    rows = []
    for record in SeqIO.parse(filepath, 'fasta'):
        metadata = record.id.split('|')
        # Fields 0..4 (name, type, segment, date, accession) are read below
        if len(metadata) < 5:
            continue
        if not metadata[1].startswith('A_/_'):
            continue
        # 'A_/_H1N1' -> ['A', '/', 'H1N1']; index 2 exists thanks to the prefix check
        subtype = metadata[1].split('_')[2]
        if len(subtype) < 4:
            continue
        rows.append((metadata[0],
                     subtype,
                     metadata[2],
                     metadata[3],
                     metadata[4],
                     str(record.seq.upper())))
    df = pd.DataFrame(rows, columns=columns)
    # Capture groups hold only the digits; unmatched subtypes give NaN in both columns
    df[['HA', 'NA']] = df['subtype'].str.extract(r'H(\d+)N(\d+)')
    # na_action='ignore' keeps NaN as NaN instead of producing 'Hnan' / 'Nnan'
    df['HA'] = df['HA'].map(lambda x: 'H' + str(x), na_action='ignore')
    df['NA'] = df['NA'].map(lambda x: 'N' + str(x), na_action='ignore')
    return df


def filter_by_date_range(df:pd.DataFrame, date_column:str, 
                         start_date:str, end_date:str) -> pd.DataFrame:
    ''' Filters a DataFrame by a date range (both bounds inclusive).

    The input DataFrame is not modified; the date conversion is done on a copy.

    Parameters
    ----------
    df - DataFrame to filter

    date_column - Name of date column in df, entries must be in format 'YYYY-MM-DD'

    start_date - Start date in format 'YYYY-MM-DD'

    end_date - End date in format 'YYYY-MM-DD'
    
    Returns
    -------
    filtered_df - Filtered DataFrame sorted by date in descending order,
        with `date_column` converted to datetime
    '''
    # assign() returns a new frame, so the caller's column is left untouched
    dated = df.assign(**{date_column: pd.to_datetime(df[date_column])})
    in_range = (dated[date_column] >= start_date) & (dated[date_column] <= end_date)
    # Non-inplace sort avoids mutating a slice (SettingWithCopyWarning)
    filtered_df = dated[in_range].sort_values(by=[date_column], ascending=False)
    return filtered_df


def save_model(enet:Qnet, outfile:str, low_mem:bool=False, gz=True):
    ''' Writes an Emergenet model to disk by delegating to `save_qnet`.

    Note that `gz` defaults to True here, while `load_model` defaults to `gz=False`.

    Parameters
    ----------
    enet - The Emergenet instance to save

    outfile - Destination file name ('.joblib')

    low_mem - If true, save the Emergenet with low memory by deleting all data attributes

    gz - If true, save the Emergenet gzipped
    '''
    save_options = {'low_mem': low_mem, 'gz': gz}
    save_qnet(enet, outfile, **save_options)

    
def load_model(filepath:str, gz=False) -> Qnet:
    ''' Reads an Emergenet model from disk by delegating to `load_qnet`.

    Parameters
    ----------
    filepath - File name of the saved model

    gz - If true, the file is treated as a gzipped Emergenet

    Returns
    -------
    enet - The loaded Emergenet instance
    '''
    return load_qnet(filepath, gz=gz)

Functions

def filter_by_date_range(df: pandas.core.frame.DataFrame, date_column: str, start_date: str, end_date: str) ‑> pandas.core.frame.DataFrame

Filters a DataFrame by a date range.

Parameters

df - DataFrame to filter

date_column - Name of date column in df, entries must be in format 'YYYY-MM-DD'

start_date - Start date in format 'YYYY-MM-DD'

end_date - End date in format 'YYYY-MM-DD'

Returns

filtered_df - Filtered DataFrame sorted by date in descending order
 
Expand source code
def filter_by_date_range(df:pd.DataFrame, date_column:str, 
                         start_date:str, end_date:str) -> pd.DataFrame:
    ''' Filters a DataFrame by a date range (both bounds inclusive).

    The input DataFrame is not modified; the date conversion is done on a copy.

    Parameters
    ----------
    df - DataFrame to filter

    date_column - Name of date column in df, entries must be in format 'YYYY-MM-DD'

    start_date - Start date in format 'YYYY-MM-DD'

    end_date - End date in format 'YYYY-MM-DD'
    
    Returns
    -------
    filtered_df - Filtered DataFrame sorted by date in descending order,
        with `date_column` converted to datetime
    '''
    # assign() returns a new frame, so the caller's column is left untouched
    dated = df.assign(**{date_column: pd.to_datetime(df[date_column])})
    in_range = (dated[date_column] >= start_date) & (dated[date_column] <= end_date)
    # Non-inplace sort avoids mutating a slice (SettingWithCopyWarning)
    filtered_df = dated[in_range].sort_values(by=[date_column], ascending=False)
    return filtered_df
def load_model(filepath: str, gz=False) ‑> quasinet.qnet.Qnet

Loads an Emergenet model.

Parameters

filepath - File name

gz - If true, load a gzipped Emergenet

Returns

enet - An Emergenet instance
 
Expand source code
def load_model(filepath:str, gz=False) -> Qnet:
    ''' Reads an Emergenet model from disk by delegating to `load_qnet`.

    Parameters
    ----------
    filepath - File name of the saved model

    gz - If true, the file is treated as a gzipped Emergenet

    Returns
    -------
    enet - The loaded Emergenet instance
    '''
    return load_qnet(filepath, gz=gz)
def parse_fasta(filepath: str) ‑> pandas.core.frame.DataFrame

Parses a GISAID fasta file into a dataframe. Metadata should be in the format:

Isolate name|Type|Gene name|Collection date|Protein Accession no.|Isolate ID|Lineage|Clade

Parameters

filepath : File path of fasta file
 

Returns

df : Dataframe with columns: `[name, subtype, segment, date, accession, sequence, HA, NA]`
 
Expand source code
def parse_fasta(filepath:str) -> pd.DataFrame:
    ''' Parses a GISAID fasta file into a dataframe. Metadata should be in the format: 
    
    `Isolate name|Type|Gene name|Collection date|Protein Accession no.|Isolate ID|Lineage|Clade`

    A record is kept only if its `Type` field starts with `A_/_` and the subtype
    that follows (e.g. `H1N1`) is at least 4 characters long. Records whose header
    has fewer than the five `|`-separated fields read here are skipped instead of
    raising an `IndexError`.

    Parameters
    ----------
    filepath: File path of fasta file
    
    Returns
    -------
    df: Dataframe with columns: `[name, subtype, segment, date, accession, sequence, HA, NA]`.
        `HA` and `NA` look like `H1` and `N1`; they are left missing (NaN) when
        the subtype does not contain an `H<digits>N<digits>` pattern.
    '''
    columns = ['name', 'subtype', 'segment', 'date', 'accession', 'sequence']
    rows = []
    for record in SeqIO.parse(filepath, 'fasta'):
        metadata = record.id.split('|')
        # Fields 0..4 (name, type, segment, date, accession) are read below
        if len(metadata) < 5:
            continue
        if not metadata[1].startswith('A_/_'):
            continue
        # 'A_/_H1N1' -> ['A', '/', 'H1N1']; index 2 exists thanks to the prefix check
        subtype = metadata[1].split('_')[2]
        if len(subtype) < 4:
            continue
        rows.append((metadata[0],
                     subtype,
                     metadata[2],
                     metadata[3],
                     metadata[4],
                     str(record.seq.upper())))
    df = pd.DataFrame(rows, columns=columns)
    # Capture groups hold only the digits; unmatched subtypes give NaN in both columns
    df[['HA', 'NA']] = df['subtype'].str.extract(r'H(\d+)N(\d+)')
    # na_action='ignore' keeps NaN as NaN instead of producing 'Hnan' / 'Nnan'
    df['HA'] = df['HA'].map(lambda x: 'H' + str(x), na_action='ignore')
    df['NA'] = df['NA'].map(lambda x: 'N' + str(x), na_action='ignore')
    return df
def save_model(enet: quasinet.qnet.Qnet, outfile: str, low_mem: bool = False, gz=True)

Saves an Emergenet model.

Parameters

enet - An Emergenet instance

outfile - File name to save to ('.joblib')

low_mem - If true, save the Emergenet with low memory by deleting all data attributes

gz - If true, save the gzipped Emergenet

Expand source code
def save_model(enet:Qnet, outfile:str, low_mem:bool=False, gz=True):
    ''' Writes an Emergenet model to disk by delegating to `save_qnet`.

    Note that `gz` defaults to True here, while `load_model` defaults to `gz=False`.

    Parameters
    ----------
    enet - The Emergenet instance to save

    outfile - Destination file name ('.joblib')

    low_mem - If true, save the Emergenet with low memory by deleting all data attributes

    gz - If true, save the Emergenet gzipped
    '''
    save_options = {'low_mem': low_mem, 'gz': gz}
    save_qnet(enet, outfile, **save_options)