Examining Geneset Overlaps

# OS-independent path management.
from os import  environ
from pathlib import Path
import holoviews as hv
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
import umap
import umap.plot
import GSForge as gsf
import matplotlib.pyplot as plt
# Load the bokeh extension for holoviews.
hv.extension("bokeh")

# Demo data root: read from the GSFORGE_DEMO_DATA environment variable when it
# is set, otherwise fall back to ~/GSForge_demo_data/ ("~" is expanded to the
# user's home directory in either case).
OSF_PATH = (
    Path(environ.get("GSFORGE_DEMO_DATA", default="~/GSForge_demo_data/"))
    .expanduser()
    .joinpath("osfstorage", "oryza_sativa")
)
GEM_PATH = OSF_PATH.joinpath("AnnotatedGEMs", "oryza_sativa_hisat2_raw.nc")
LIT_DGE_GSC_PATH = OSF_PATH.joinpath("GeneSetCollections", "literature", "DGE")
LIT_TF_PATH = OSF_PATH.joinpath("GeneSetCollections", "literature", "TF")
# This path must be defined: the collection-loading cell below reads it when
# building the "Boruta Results" collection, and raises NameError otherwise.
BORUTA_GSC_PATH = OSF_PATH.joinpath("GeneSetCollections", "workflow_boruta")

# Open the annotated gene expression matrix; the bare name on the last line
# displays its summary in the notebook.
agem = gsf.AnnotatedGEM(GEM_PATH)
agem
<GSForge.AnnotatedGEM>
Name: Oryza sativa
Selected GEM Variable: 'counts'
    Gene   55986
    Sample 475
%%time
# Load one GeneSetCollection per source directory, all tied to the same AnnotatedGEM.
# NOTE: BORUTA_GSC_PATH must be defined in the path-setup cell for this cell to run.
lit_dge_coll = gsf.GeneSetCollection.from_folder(gem=agem, target_dir=LIT_DGE_GSC_PATH, name="Literature DGE")
lit_tf_coll = gsf.GeneSetCollection.from_folder(gem=agem, target_dir=LIT_TF_PATH, name="Literature TF")
boruta_gsc = gsf.GeneSetCollection.from_folder(gem=agem, target_dir=BORUTA_GSC_PATH, name="Boruta Results")

# Combine every gene set of the transcription-factor collection into a single
# GeneSet. The values view is unpacked directly; no intermediate list is needed.
tf_geneset = gsf.GeneSet.from_GeneSets(*lit_tf_coll.gene_sets.values(), name='transcription factors')
CPU times: user 25.4 s, sys: 168 ms, total: 25.5 s
Wall time: 25.6 s

Create a Merged Collection

Since there are so many sets within the transcription factor collection, we have combined them into a single set (`tf_geneset`, created above), which is merged here with the literature DGE and Boruta gene sets.

# Merge the literature DGE sets, the Boruta sets and the single combined
# transcription-factor set into one mapping of name -> gene set.
# With dict unpacking, a later entry replaces an earlier one of the same name.
gene_sets = {
    **lit_dge_coll.gene_sets,
    **boruta_gsc.gene_sets,
    'transcription factors': tf_geneset,
}

# Build the merged collection from the mapping above (rather than repeating
# the literal), then display a summary of its gene sets.
combined_gsc = gsf.GeneSetCollection(gem=agem, gene_sets=gene_sets)
combined_gsc.summarize_gene_sets()
{'DROUGHT_UP': 1175,
 'boruta_treatment': 1155,
 'boruta_genotype': 853,
 'HEAT_UP': 592,
 'RECOV_DROUGHT_UP': 446,
 'transcription factors': 276,
 'DROUGHT_DOWN': 170,
 'HEAT_DOWN': 106,
 'RECOV_HEAT_UP': 76,
 'RECOV_DROUGHT_DOWN': 58,
 'RECOV_HEAT_DOWN': 43}

View Collection Overlaps

It quickly becomes difficult to view the overlaps among many sets. Combining or selecting gene sets can help with this.

# Two heatmaps of the overlaps within the merged collection: one using the
# default mode and one using mode='percent'.
overlap_heatmap = gsf.plots.collections.WithinCollectionOverlapHeatMap(combined_gsc)
percent_overlap_heatmap = gsf.plots.collections.WithinCollectionOverlapHeatMap(
    combined_gsc,
    mode='percent',
)

# Apply one set of HeatMap options to the combined layout of both plots.
heatmap_options = hv.opts.HeatMap(width=550, height=500, cmap='blues')
(overlap_heatmap + percent_overlap_heatmap).opts(heatmap_options)

More detailed views of overlaps are provided by upset plots.

# Draw an UpSet plot of the gene sets in the merged collection; the call's
# return value is the cell's displayed result.
gsf.plots.collections.UpsetPlotInterface(combined_gsc)
<upsetplot.plotting.UpSet at 0x7feddc775910>
../../_images/07-Examaining_Geneset_Overlaps_11_1.png