GeneSet Analysis¶
This notebook gives some simple examples of:
- Comparing GeneSets to each other via a machine-learning model score.
- Ranking genes within a GeneSet based on a machine-learning model.
Setting up the notebook
import os
import numpy as np
import matplotlib.pyplot as plt
import holoviews as hv
hv.extension("bokeh")
%matplotlib inline
import GSForge as gsf
Declare paths used
# OS-independent path management.
from os import fspath, environ
from pathlib import Path
OSF_PATH = Path(environ.get("GSFORGE_DEMO_DATA", default="~/GSForge_demo_data")).expanduser()
AGEM_PATH = OSF_PATH.joinpath("osfstorage", "rice.nc")
BORUTA_COLL_PATH = OSF_PATH.joinpath("osfstorage", "boruta_gene_sets")
assert AGEM_PATH.exists()
Load an AnnotatedGEM
agem = gsf.AnnotatedGEM(AGEM_PATH)
agem
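The AnnotatedGEM wraps an xarray.Dataset of counts and sample annotations; assuming it exposes the same .data accessor used on GeneSets later in this notebook, you can inspect it directly:
# Underlying xarray.Dataset of counts and sample annotations.
agem.data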
gsc = gsf.GeneSetCollection.from_folder(gem=agem,
target_dir=BORUTA_COLL_PATH,
name="Boruta Results")
gsc
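Before scoring, it can help to see what was loaded. A small sketch using the gene_sets mapping and the gene_support() accessor that this notebook relies on below:
# List each GeneSet in the collection with its supported-gene count.
for name, gene_set in gsc.gene_sets.items():
    print(name, len(gene_set.gene_support()))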
Scoring and Judging GeneSets¶
Comparing Genes within a GeneSet¶
By feature importance
from sklearn.ensemble import RandomForestClassifier
gene_rank_mdl = RandomForestClassifier(class_weight='balanced',
n_estimators=1000, n_jobs=-1)
%%time
treatment_feature_importance = gsf.operations.rank_genes_by_model(
gsc,
selected_gene_sets=["Boruta_Treatment"],
annotation_variables="Treatment",
model=gene_rank_mdl,
n_iterations=3
)
treatment_feature_importance
Then you can optionally store such results in a GeneSet using xarray's update().
gsc.gene_sets["Boruta_Treatment"].data = gsc.gene_sets["Boruta_Treatment"].data.update(treatment_feature_importance)
gsc.gene_sets["Boruta_Treatment"].data
Estimate False Discovery Rates¶
Both estimators below are permutation-based: over n_iterations they re-rank genes against randomized data and measure how often noise competes with the real signal.
nFDR
%%time
treatment_nFDR = gsf.operations.analytics.nFDR(
gsc,
selected_gene_sets=["Boruta_Treatment"],
annotation_variables="Treatment",
model=gene_rank_mdl,
n_iterations=3
)
treatment_nFDR
gsc.gene_sets["Boruta_Treatment"].data = gsc.gene_sets["Boruta_Treatment"].data.update(treatment_nFDR)
mProbes
%%time
treatment_mProbes = gsf.operations.analytics.mProbes(
gsc,
selected_gene_sets=["Boruta_Treatment"],
annotation_variables="Treatment",
model=gene_rank_mdl,
n_iterations=3
)
treatment_mProbes
gsc.gene_sets["Boruta_Treatment"].data = gsc.gene_sets["Boruta_Treatment"].data.update(treatment_mProbes)
Examine Rankings¶
# View the xarray dataset of a GeneSet:
# gsc.gene_sets["Boruta_Treatment"].data
# Count the 'supported' genes.
len(gsc.gene_sets["Boruta_Treatment"].gene_support())
Subset by supported genes
ds = gsc.gene_sets["Boruta_Treatment"].data.sel(Gene=gsc.gene_sets["Boruta_Treatment"].gene_support())
ds
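With the FDR estimates merged in, you can also filter the supported genes with xarray's where; a small sketch (the 0.05 cutoff is arbitrary, chosen only for illustration):
# Keep genes whose mean estimated FDR falls below the cutoff.
low_fdr = ds.where(ds["nFDR_mean"] < 0.05, drop=True)
low_fdr.Gene.values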
# Alternative: build the scatter from a holoviews Dataset directly.
# hv.Scatter(hv.Dataset(ds), ["feature_importance_mean", "nFDR_mean"])
df = ds[["feature_importance_mean", "nFDR_mean"]].to_dataframe()
plot = hv.Scatter(df, [("feature_importance_mean", "Mean Feature Importance Rank"),
("nFDR_mean", "False Discovery Rate")]
).opts(padding=0.1, width=400, height=400)
plot
# hv.save(plot, "fdr_vs_importance.png")
df = ds[["feature_importance_mean", "mProbes_mean"]].to_dataframe()
hv.Scatter(df).opts(padding=0.1, width=400, height=400)
Receiver Operating Characteristic (ROC)¶
Another way to evaluate a classification model is the ROC curve.
Viewing ROC curves for a multi-class model is a bit indirect, since we must fit a binary (one-vs-rest) classifier for each unique target label, so we provide this walkthrough. See also the scikit-learn ROC example.
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn import preprocessing, model_selection
Get the count and annotation data from GSForge.
counts, treatment = gsf.get_data(gsc,
                                 annotation_variables="Treatment",
                                 selected_gene_sets=["Boruta_Treatment"])
classes = treatment.to_series().unique()
classes
Encode the annotation labels with a one-hot encoder.
enc = preprocessing.OneHotEncoder().fit(treatment.values.reshape(-1, 1))
treatment_onehot = enc.transform(treatment.values.reshape(-1, 1)).toarray()
treatment_onehot
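For reference, the same indicator matrix can be produced in one call with scikit-learn's label_binarize; a hedged alternative, not needed if the encoder above worked:
# Passing the sorted classes makes the column order match the
# OneHotEncoder's sorted categories above.
treatment_onehot_alt = preprocessing.label_binarize(
    treatment.values, classes=np.sort(classes))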
Split the data and encoded annotations into a train and test set.
x_train, x_test, y_train, y_test = model_selection.train_test_split(counts, treatment_onehot)
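train_test_split also accepts a stratify argument; an optional variant (not used in the original) that keeps the class proportions similar across the two splits:
# Optional: stratify on the original labels; random_state fixes the split.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    counts, treatment_onehot, stratify=treatment.values, random_state=0)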
Fit the model with the training data.
%%time
roc_rf_model = OneVsRestClassifier(RandomForestClassifier(class_weight='balanced', max_depth=3,
n_estimators=1000, n_jobs=-1))
roc_rf_model = roc_rf_model.fit(x_train, y_train)
Now predict class probabilities for the test count data.
y_score = roc_rf_model.predict_proba(x_test)
y_score[:5]
fpr = dict()
tpr = dict()
roc_auc = dict()
# Iterate in the encoder's (sorted) category order, which determines
# the column order of y_test and y_score.
for i, class_ in enumerate(enc.categories_[0]):
    fpr[class_], tpr[class_], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[class_] = auc(fpr[class_], tpr[class_])
roc_curves = {class_: hv.Curve((fpr[class_], tpr[class_]))
for class_ in classes}
hv.NdOverlay(roc_curves).opts(padding=0.05, legend_position="top", width=400, height=450)
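The per-class AUC values are already in roc_auc; the roc_auc_score imported above can also summarize performance in a single number. A short sketch (y_test is in multilabel-indicator form, which roc_auc_score averages per class):
# Macro-average AUC over the one-vs-rest classifiers.
macro_auc = roc_auc_score(y_test, y_score, average="macro")
print(f"Macro-average AUC: {macro_auc:.3f}")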
View a Decision Tree¶
from sklearn import tree
import graphviz
Extract a single tree from the fitted one-vs-rest estimators.
# The first one-vs-rest estimator is the binary classifier for the
# first encoded class (here CONTROL, per the class_names below).
control_rfc = roc_rf_model.estimators_[0]
print(f"There are {len(control_rfc.estimators_)} trees.")
selected_tree = control_rfc.estimators_[0]
graph = graphviz.Source(tree.export_graphviz(
selected_tree,
feature_names=x_train.Gene.values,
class_names=["Not CONTROL", "CONTROL"],
filled=True,
rounded=True,
special_characters=True))
graph
# graph.render('decision_tree', format="svg")
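If graphviz is unavailable, scikit-learn's own tree.plot_tree renders the same tree with matplotlib only; a minimal alternative sketch (max_depth here merely limits the rendered depth):
# Matplotlib-only rendering; no graphviz required.
fig, ax = plt.subplots(figsize=(12, 6))
tree.plot_tree(selected_tree,
               feature_names=list(x_train.Gene.values),
               class_names=["Not CONTROL", "CONTROL"],
               filled=True, rounded=True, max_depth=2, ax=ax)
plt.show()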
View Genes Used in a Given Tree¶
# Genes the selected tree actually uses (non-zero importance).
counts.Gene.isel(Gene=np.argwhere(selected_tree.feature_importances_).flatten()).values
tree_subset = counts.isel(Gene=np.argwhere(selected_tree.feature_importances_).flatten())
# Display the counts of those genes as a log-scaled raster.
hv.Raster(tree_subset.values).opts(logz=True)