Creating GeneSets from Literature Sources
Recall that the minimum requirement for GeneSet creation is simply a list of genes.
Setting up the notebook
import holoviews as hv
import numpy as np
import pandas as pd
# Load the bokeh plotting backend for HoloViews.
hv.extension("bokeh")
import GSForge as gsf
Declaring used paths
# OS-independent path management.
from os import environ
from pathlib import Path
# Root of the Oryza sativa demo data. The base directory is read from the
# GSFORGE_DEMO_DATA environment variable, falling back to ~/GSForge_demo_data/.
OSF_PATH = (
    Path(environ.get("GSFORGE_DEMO_DATA", default="~/GSForge_demo_data/")).expanduser()
    / "osfstorage"
    / "oryza_sativa"
)

# Supplemental files from the literature source (raw annotation data).
SI_FILE_1_PATH = (
    OSF_PATH
    / "GEMmakerGEMs"
    / "raw_annotation_data"
    / "TPC2016-00158-LSBR2_Supplemental_File_1.csv"
)
SI_FILE_5_PATH = (
    OSF_PATH
    / "GEMmakerGEMs"
    / "raw_annotation_data"
    / "TPC2016-00158-LSBR2_Supplemental_File_5.txt"
)

# Input AnnotatedGEM file.
GEM_PATH = OSF_PATH / "AnnotatedGEMs" / "oryza_sativa_hisat2_raw.nc"

# Output directory for the literature-derived gene set collections.
LITERATURE_COLL_PATH = OSF_PATH / "GeneSetCollections" / "literature"
Load an AnnotatedGEM
# Load the AnnotatedGEM stored at GEM_PATH.
agem = gsf.AnnotatedGEM(GEM_PATH)
# Bare expression: displays the object's summary when run as a notebook cell.
agem
<GSForge.AnnotatedGEM>
Name: Oryza sativa
Selected GEM Variable: 'counts'
Gene 55986
Sample 475
Load Data
# Preview the first lines of supplemental file 1 to see where its header row is.
from itertools import islice

with open(SI_FILE_1_PATH) as myfile:
    # islice simply stops at end-of-file, whereas calling next() six times
    # raises StopIteration when the file holds fewer than six lines.
    head = ''.join(islice(myfile, 6))
print(head)
#Supplemental Data. Wilkins et al. Plant Cell (2016) 10.1105/tpc.16.00158.,,,,,,,,
#Supplemental Data Set 1: Differentially expressed genes,,,,,,,,
,,,,,,,,
gene,HEAT_UP,HEAT_DOWN,RECOV_HEAT_UP,RECOV_HEAT_DOWN,DROUGHT_UP,DROUGHT_DOWN,RECOV_DROUGHT_UP,RECOV_DROUGHT_DOWN
ChrSy.fgenesh.gene.37,0,0,0,0,8,0,1,0
LOC_Os01g01610,6,0,0,0,0,0,0,0
# Skip the three preamble lines (two '#' title lines and one row of empty
# fields) so that the 'gene,HEAT_UP,...' line becomes the header; the first
# column (gene) is used as the index.
si1_df = pd.read_csv(SI_FILE_1_PATH, skiprows=3, index_col=0)
si1_df.head()
HEAT_UP | HEAT_DOWN | RECOV_HEAT_UP | RECOV_HEAT_DOWN | DROUGHT_UP | DROUGHT_DOWN | RECOV_DROUGHT_UP | RECOV_DROUGHT_DOWN | |
---|---|---|---|---|---|---|---|---|
gene | ||||||||
ChrSy.fgenesh.gene.37 | 0 | 0 | 0 | 0 | 8 | 0 | 1 | 0 |
LOC_Os01g01610 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
LOC_Os01g01620 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 0 |
LOC_Os01g02060 | 0 | 0 | 0 | 0 | 2 | 0 | 2 | 0 |
LOC_Os01g02080 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 1 |
# Show the gene identifiers stored in the GEM; the literature gene lists are
# compared against these when the collections are built.
agem.data.Gene
<xarray.DataArray 'Gene' (Gene: 55986)> array(['LOC_Os06g05820', 'LOC_Os10g27460', 'LOC_Os02g35980', ..., 'LOC_Os03g50190', 'LOC_Os03g20020', 'LOC_Os07g03418'], dtype=object) Coordinates: * Gene (Gene) object 'LOC_Os06g05820' ... 'LOC_Os07g03418'
xarray.DataArray
'Gene'
- Gene: 55986
- 'LOC_Os06g05820' 'LOC_Os10g27460' ... 'LOC_Os07g03418'
array(['LOC_Os06g05820', 'LOC_Os10g27460', 'LOC_Os02g35980', ..., 'LOC_Os03g50190', 'LOC_Os03g20020', 'LOC_Os07g03418'], dtype=object)
- Gene(Gene)object'LOC_Os06g05820' ... 'LOC_Os07g0...
array(['LOC_Os06g05820', 'LOC_Os10g27460', 'LOC_Os02g35980', ..., 'LOC_Os03g50190', 'LOC_Os03g20020', 'LOC_Os07g03418'], dtype=object)
# Preview the first lines of supplemental file 5 to see where its header row is.
from itertools import islice

with open(SI_FILE_5_PATH) as myfile:
    # islice simply stops at end-of-file, whereas calling next() six times
    # raises StopIteration when the file holds fewer than six lines.
    head = ''.join(islice(myfile, 6))
print(head)
#Supplemental Data Set 5: TF predictor groups
group.name members
pred.group.1 LOC_Os01g01430,LOC_Os01g60020,LOC_Os01g66120,LOC_Os03g21060,LOC_Os05g34830,LOC_Os07g37920,LOC_Os07g48450,LOC_Os11g03300,LOC_Os11g08210,LOC_Os12g03040
pred.group.2 LOC_Os01g01470,LOC_Os03g21030,LOC_Os03g42630,LOC_Os04g38720,LOC_Os06g23650,LOC_Os07g48550,LOC_Os08g40030,LOC_Os09g32260
pred.group.3 LOC_Os01g04750,LOC_Os01g04800,LOC_Os01g49830
# The file is tab-separated; the 'group.name' column is used as the index and
# 'members' holds a comma-separated list of gene identifiers.
# NOTE(review): skiprows=2 implies two lines precede the header row (the '#'
# title line plus, presumably, a blank line) -- confirm against the file.
si5_df = pd.read_csv(SI_FILE_5_PATH, skiprows=2, index_col=0, sep='\t')
si5_df.head()
members | |
---|---|
group.name | |
pred.group.1 | LOC_Os01g01430,LOC_Os01g60020,LOC_Os01g66120,L... |
pred.group.2 | LOC_Os01g01470,LOC_Os03g21030,LOC_Os03g42630,L... |
pred.group.3 | LOC_Os01g04750,LOC_Os01g04800,LOC_Os01g49830 |
pred.group.4 | LOC_Os01g07120,LOC_Os02g45450,LOC_Os04g48350,L... |
pred.group.5 | LOC_Os01g09640,LOC_Os05g10690 |
Wrangle Data and Create Collections
At the very least we need a name and a list of genes.
lit_dge_coll = gsf.GeneSetCollection(gem=agem, name='Literature DGE')

# Gene identifiers present in the GEM; looked up once instead of per column.
gem_genes = agem.data.Gene.values

# Build one GeneSet per column from the genes with a positive entry in it.
for col in si1_df.columns:
    genes = si1_df[si1_df[col] > 0].index.values
    # Report literature genes that do not occur in the GEM, labelled with the
    # column they came from.
    diff = np.setdiff1d(genes, gem_genes)
    if diff.size > 0:
        print(f"{col}: genes missing from the GEM: {diff}")
    lit_dge_coll[col] = gsf.GeneSet.from_gene_array(genes, name=col)

# Bare expression: displays the collection summary when run as a notebook cell.
lit_dge_coll
# genes
<GSForge.GeneSetCollection>
Literature DGE
GeneSets (8 total): Support Count
DROUGHT_UP: 1175
HEAT_UP: 592
RECOV_DROUGHT_UP: 446
DROUGHT_DOWN: 170
HEAT_DOWN: 106
... and 3 more.
lit_tf_coll = gsf.GeneSetCollection(gem=agem, name='Literature Transcription Factors')

# Gene identifiers present in the GEM; looked up once instead of per group.
gem_genes = agem.data.Gene.values

# Iterate the first (and only) data column directly, which holds the
# comma-separated members of each group; this avoids building a row Series
# per group just to read its first value.
for name, members in si5_df.iloc[:, 0].items():
    genes = np.asarray(members.split(','))
    # Report literature genes that do not occur in the GEM, labelled with the
    # group they came from.
    diff = np.setdiff1d(genes, gem_genes)
    if diff.size > 0:
        print(f"{name}: genes missing from the GEM: {diff}")
    lit_tf_coll[name] = gsf.GeneSet.from_gene_array(genes, name=name)

# Bare expression: displays the collection summary when run as a notebook cell.
lit_tf_coll
<GSForge.GeneSetCollection>
Literature Transcription Factors
GeneSets (62 total): Support Count
pred.group.8: 21
pred.group.6: 13
pred.group.16: 13
pred.group.46: 11
pred.group.1: 10
... and 57 more.
# Inspect the gene identifiers returned as the support of one predictor group.
lit_tf_coll['pred.group.8'].get_support()
array(['LOC_Os01g39020', 'LOC_Os01g43590', 'LOC_Os01g53220',
'LOC_Os01g54550', 'LOC_Os02g13800', 'LOC_Os02g32590',
'LOC_Os03g06630', 'LOC_Os03g12370', 'LOC_Os03g25120',
'LOC_Os03g53340', 'LOC_Os04g48030', 'LOC_Os05g45410',
'LOC_Os06g35960', 'LOC_Os06g36930', 'LOC_Os07g08140',
'LOC_Os07g44690', 'LOC_Os08g43334', 'LOC_Os09g28200',
'LOC_Os09g28354', 'LOC_Os09g35790', 'LOC_Os10g28340'], dtype='<U14')
# Save both collections under the literature collection path, using the
# 'DGE' and 'TF' sub-paths respectively.
lit_dge_coll.save(LITERATURE_COLL_PATH.joinpath('DGE'))
lit_tf_coll.save(LITERATURE_COLL_PATH.joinpath('TF'))