Module qbiome.mask_checker
Expand source code
import numpy as np
from itertools import product
from quasinet import qnet
class MaskChecker:
    """Sanity-check a trained Quasinet model by masking and re-predicting data.

    Entries of the original label matrix are randomly masked and the qnet is
    asked to fill in predictions. If the qnet model is good, we expect a
    minimal amount of difference between the original data and the predicted
    data.
    """

    def __init__(self, qnet_orchestrator):
        """Initialization

        Args:
            qnet_orchestrator (qbiome.QnetOrchestrator): an instance with a
                trained qnet model; its `quantizer` attribute is reused here
        """
        self.qnet_orchestrator = qnet_orchestrator
        self.quantizer = qnet_orchestrator.quantizer

    def mask_and_predict(self, data, mask_percent, n_samples=100):
        """Mask the data matrix and let qnet fill in the predictions

        Output format:

        |   subject_id | variable         |   week |    value |
        |-------------:|:-----------------|-------:|---------:|
        |            1 | Actinobacteriota |     27 | 0.36665  |
        |            1 | Bacteroidota     |     27 | 0.507248 |
        |            1 | Campilobacterota |     27 | 0.002032 |

        Args:
            data (numpy.ndarray): 2D array of label strings (the original
                documentation refers to `get_qnet_inputs` as the producer)
            mask_percent (int): between 0 and 100, the percent of the data
                matrix to mask
            n_samples (int, optional): the number of times to sample from qnet
                predictions for one masked entry. Defaults to 100.
                NOTE(review): accepted for API compatibility but not forwarded
                to the predictor here; confirm whether `predict_sequence`
                should receive it.

        Raises:
            ValueError: mask percent is not between 0 and 100

        Returns:
            pandas.DataFrame: see format above
        """
        masked = self.apply_random_mask(data, mask_percent)
        predicted_matrix = np.empty(data.shape)
        # Predict from the *masked* rows: feeding the untouched `data` would
        # leave nothing for the model to fill in and defeat the sanity check.
        for idx, seq in enumerate(masked):
            # numeric prediction
            predicted_matrix[idx] = self.qnet_orchestrator.predict_sequence(seq)

        df = self.quantizer.add_meta_to_matrix(predicted_matrix)
        # convert to plottable format
        return self.quantizer.melt_into_plot_format(df)

    def apply_random_mask(self, data, mask_percent):
        """Apply random mask to the data matrix by swapping out entries with empty string `''`

        The input is never modified; a masked copy is returned.

        Args:
            data (numpy.ndarray): 2D label matrix
            mask_percent (int): between 0 and 100

        Raises:
            ValueError: mask percent is not between 0 and 100

        Returns:
            numpy.ndarray: 2D label matrix with some entries masked to the empty string
        """
        # Validate before doing any work so bad input never costs a copy.
        if not 0 <= mask_percent <= 100:
            raise ValueError(
                f'Mask percent {mask_percent} is not between 0 and 100'
            )

        masked = data.copy()
        # int() keeps the sample count integral even for a float percentage.
        num_mask = int(masked.size * mask_percent // 100)
        # Draw distinct flat (row-major) positions, then map them back to
        # array coordinates without materialising every (row, col) pair.
        idx_to_mask = np.random.choice(masked.size, num_mask, replace=False)
        masked[np.unravel_index(idx_to_mask, masked.shape)] = ''
        return masked
Classes
class MaskChecker (qnet_orchestrator)
-
For sanity-checking the Quasinet model, randomly mask entries in the original data frame and let the qnet fill in predictions. If the qnet model is good, we expect a minimal amount of difference between the original data and the predicted data
Initialization
Args
qnet_orchestrator
:qbiome.QnetOrchestrator
- an instance with a trained qnet model
Expand source code
class MaskChecker:
    """Sanity-check a trained Quasinet model by masking and re-predicting data.

    Entries of the original label matrix are randomly masked and the qnet is
    asked to fill in predictions. If the qnet model is good, we expect a
    minimal amount of difference between the original data and the predicted
    data.
    """

    def __init__(self, qnet_orchestrator):
        """Initialization

        Args:
            qnet_orchestrator (qbiome.QnetOrchestrator): an instance with a
                trained qnet model; its `quantizer` attribute is reused here
        """
        self.qnet_orchestrator = qnet_orchestrator
        self.quantizer = qnet_orchestrator.quantizer

    def mask_and_predict(self, data, mask_percent, n_samples=100):
        """Mask the data matrix and let qnet fill in the predictions

        Output format:

        |   subject_id | variable         |   week |    value |
        |-------------:|:-----------------|-------:|---------:|
        |            1 | Actinobacteriota |     27 | 0.36665  |
        |            1 | Bacteroidota     |     27 | 0.507248 |
        |            1 | Campilobacterota |     27 | 0.002032 |

        Args:
            data (numpy.ndarray): 2D array of label strings (the original
                documentation refers to `get_qnet_inputs` as the producer)
            mask_percent (int): between 0 and 100, the percent of the data
                matrix to mask
            n_samples (int, optional): the number of times to sample from qnet
                predictions for one masked entry. Defaults to 100.
                NOTE(review): accepted for API compatibility but not forwarded
                to the predictor here; confirm whether `predict_sequence`
                should receive it.

        Raises:
            ValueError: mask percent is not between 0 and 100

        Returns:
            pandas.DataFrame: see format above
        """
        masked = self.apply_random_mask(data, mask_percent)
        predicted_matrix = np.empty(data.shape)
        # Predict from the *masked* rows: feeding the untouched `data` would
        # leave nothing for the model to fill in and defeat the sanity check.
        for idx, seq in enumerate(masked):
            # numeric prediction
            predicted_matrix[idx] = self.qnet_orchestrator.predict_sequence(seq)

        df = self.quantizer.add_meta_to_matrix(predicted_matrix)
        # convert to plottable format
        return self.quantizer.melt_into_plot_format(df)

    def apply_random_mask(self, data, mask_percent):
        """Apply random mask to the data matrix by swapping out entries with empty string `''`

        The input is never modified; a masked copy is returned.

        Args:
            data (numpy.ndarray): 2D label matrix
            mask_percent (int): between 0 and 100

        Raises:
            ValueError: mask percent is not between 0 and 100

        Returns:
            numpy.ndarray: 2D label matrix with some entries masked to the empty string
        """
        # Validate before doing any work so bad input never costs a copy.
        if not 0 <= mask_percent <= 100:
            raise ValueError(
                f'Mask percent {mask_percent} is not between 0 and 100'
            )

        masked = data.copy()
        # int() keeps the sample count integral even for a float percentage.
        num_mask = int(masked.size * mask_percent // 100)
        # Draw distinct flat (row-major) positions, then map them back to
        # array coordinates without materialising every (row, col) pair.
        idx_to_mask = np.random.choice(masked.size, num_mask, replace=False)
        masked[np.unravel_index(idx_to_mask, masked.shape)] = ''
        return masked
Methods
def apply_random_mask(self, data, mask_percent)
-
Apply random mask to the data matrix by swapping out entries with empty string
''
Args
data
:numpy.ndarray
- 2D label matrix
mask_percent
:int
- between 0 and 100
Raises
Exception
- mask percent is not between 0 and 100
Returns
numpy.ndarray
- 2D label matrix with some entries masked to the empty string
Expand source code
def apply_random_mask(self, data, mask_percent):
    """Apply random mask to the data matrix by swapping out entries with empty string `''`

    The input is never modified; a masked copy is returned.

    Args:
        data (numpy.ndarray): 2D label matrix
        mask_percent (int): between 0 and 100

    Raises:
        ValueError: mask percent is not between 0 and 100

    Returns:
        numpy.ndarray: 2D label matrix with some entries masked to the empty string
    """
    # Validate before doing any work so bad input never costs a copy.
    if not 0 <= mask_percent <= 100:
        raise ValueError(
            f'Mask percent {mask_percent} is not between 0 and 100'
        )

    masked = data.copy()
    # int() keeps the sample count integral even for a float percentage.
    num_mask = int(masked.size * mask_percent // 100)
    # Draw distinct flat (row-major) positions, then map them back to array
    # coordinates without materialising every (row, col) pair.
    idx_to_mask = np.random.choice(masked.size, num_mask, replace=False)
    masked[np.unravel_index(idx_to_mask, masked.shape)] = ''
    return masked
def mask_and_predict(self, data, mask_percent, n_samples=100)
-
Mask the data matrix and let qnet fill in the predictions
Output format:
| subject_id | variable | week | value |
|-------------:|:-----------------|-------:|---------:|
| 1 | Actinobacteriota | 27 | 0.36665 |
| 1 | Bacteroidota | 27 | 0.507248 |
| 1 | Campilobacterota | 27 | 0.002032 |
Args
data
:numpy.ndarray
- 2D array of label strings, produced by
self.get_qnet_inputs
mask_percent
:int
- between 0 and 100, the percent of the data matrix to mask
n_samples
:int, optional
- the number of times to sample from qnet predictions for one masked entry. Defaults to 100.
Returns
pandas.DataFrame
- see format above
Expand source code
def mask_and_predict(self, data, mask_percent, n_samples=100):
    """Mask the data matrix and let qnet fill in the predictions

    Output format:

    |   subject_id | variable         |   week |    value |
    |-------------:|:-----------------|-------:|---------:|
    |            1 | Actinobacteriota |     27 | 0.36665  |
    |            1 | Bacteroidota     |     27 | 0.507248 |
    |            1 | Campilobacterota |     27 | 0.002032 |

    Args:
        data (numpy.ndarray): 2D array of label strings (the original
            documentation refers to `get_qnet_inputs` as the producer)
        mask_percent (int): between 0 and 100, the percent of the data matrix
            to mask
        n_samples (int, optional): the number of times to sample from qnet
            predictions for one masked entry. Defaults to 100.
            NOTE(review): accepted for API compatibility but not forwarded to
            the predictor here; confirm whether `predict_sequence` should
            receive it.

    Returns:
        pandas.DataFrame: see format above
    """
    masked = self.apply_random_mask(data, mask_percent)
    predicted_matrix = np.empty(data.shape)
    # Predict from the *masked* rows: feeding the untouched `data` would leave
    # nothing for the model to fill in and defeat the sanity check.
    for idx, seq in enumerate(masked):
        # numeric prediction
        predicted_matrix[idx] = self.qnet_orchestrator.predict_sequence(seq)

    df = self.quantizer.add_meta_to_matrix(predicted_matrix)
    # convert to plottable format
    return self.quantizer.melt_into_plot_format(df)