Examining Geneset Overlaps¶
# OS-independent path management.
from os import environ
from pathlib import Path
import holoviews as hv
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
import umap
import umap.plot
import GSForge as gsf
import matplotlib.pyplot as plt
# Load the Bokeh plotting backend for HoloViews before any plots are built.
hv.extension("bokeh")
# Root of the demo data for Oryza sativa. The base directory comes from the
# GSFORGE_DEMO_DATA environment variable, falling back to ~/GSForge_demo_data/.
OSF_PATH = Path(environ.get("GSFORGE_DEMO_DATA", default="~/GSForge_demo_data/")).expanduser().joinpath("osfstorage", "oryza_sativa")
# Annotated gene expression matrix used throughout this notebook.
GEM_PATH = OSF_PATH.joinpath("AnnotatedGEMs", "oryza_sativa_hisat2_raw.nc")
# Folders holding the gene set collections loaded below.
LIT_DGE_GSC_PATH = OSF_PATH.joinpath("GeneSetCollections", "literature", "DGE")
LIT_TF_PATH = OSF_PATH.joinpath("GeneSetCollections", "literature", "TF")
# This must be defined: the cell that builds `boruta_gsc` reads it, and with the
# assignment commented out that cell fails with a NameError.
BORUTA_GSC_PATH = OSF_PATH.joinpath("GeneSetCollections", "workflow_boruta")
# Construct the AnnotatedGEM from the netCDF file at GEM_PATH.
agem = gsf.AnnotatedGEM(GEM_PATH)
# Bare expression so the notebook displays the object's summary.
agem
<GSForge.AnnotatedGEM>
Name: Oryza sativa
Selected GEM Variable: 'counts'
Gene 55986
Sample 475
%%time
# Load each on-disk gene set folder as a collection bound to the same AnnotatedGEM.
lit_dge_coll = gsf.GeneSetCollection.from_folder(
    gem=agem,
    target_dir=LIT_DGE_GSC_PATH,
    name="Literature DGE",
)
lit_tf_coll = gsf.GeneSetCollection.from_folder(
    gem=agem,
    target_dir=LIT_TF_PATH,
    name="Literature TF",
)
boruta_gsc = gsf.GeneSetCollection.from_folder(
    gem=agem,
    target_dir=BORUTA_GSC_PATH,
    name="Boruta Results",
)
# Build one GeneSet from every gene set in the literature TF collection.
tf_geneset = gsf.GeneSet.from_GeneSets(
    *lit_tf_coll.gene_sets.values(),
    name='transcription factors',
)
CPU times: user 25.4 s, sys: 168 ms, total: 25.5 s
Wall time: 25.6 s
Create a Merged Collection¶
Since there are so many sets within the transcription factor collection, we will combine them into a single set.
# Merge the literature DGE sets, the Boruta sets and the single combined
# transcription-factor set into one mapping. On a name collision the later
# entry wins (standard dict-unpacking behaviour).
gene_sets = {
    **lit_dge_coll.gene_sets,
    **boruta_gsc.gene_sets,
    'transcription factors': tf_geneset,
}
# Pass the mapping built above instead of repeating the literal, so the
# variable and the collection contents cannot drift apart.
combined_gsc = gsf.GeneSetCollection(gem=agem, gene_sets=gene_sets)
# Bare expression so the notebook displays the per-set summary.
combined_gsc.summarize_gene_sets()
{'DROUGHT_UP': 1175,
'boruta_treatment': 1155,
'boruta_genotype': 853,
'HEAT_UP': 592,
'RECOV_DROUGHT_UP': 446,
'transcription factors': 276,
'DROUGHT_DOWN': 170,
'HEAT_DOWN': 106,
'RECOV_HEAT_UP': 76,
'RECOV_DROUGHT_DOWN': 58,
'RECOV_HEAT_DOWN': 43}
View Collection Overlaps¶
We will quickly notice how difficult it can be to view overlaps of many sets. Combining or selecting gene sets can help with this.
# Two heatmaps of the overlaps within the combined collection: one in the
# default mode and one with mode='percent'.
overlap_heatmap = gsf.plots.collections.WithinCollectionOverlapHeatMap(combined_gsc)
percent_overlap_heatmap = gsf.plots.collections.WithinCollectionOverlapHeatMap(
    combined_gsc,
    mode='percent',
)
# Apply the same HeatMap options to both plots in the combined layout.
heatmap_style = hv.opts.HeatMap(width=550, height=500, cmap='blues')
(overlap_heatmap + percent_overlap_heatmap).opts(heatmap_style)
More detailed views of overlaps are provided by upset plots.
# Build an UpSet plot of the combined collection; the cell output shows that an
# upsetplot UpSet object is returned.
gsf.plots.collections.UpsetPlotInterface(combined_gsc)
<upsetplot.plotting.UpSet at 0x7feddc775910>