Creating GeneSets from Literature Sources

Recall that the minimum requirement for GeneSet creation is simply a list of genes.

Setting up the notebook

import holoviews as hv
import numpy as np
import pandas as pd
hv.extension("bokeh")

import GSForge as gsf

Declaring used paths

# OS-independent path management.
# The demo-data root comes from the GSFORGE_DEMO_DATA environment variable when
# it is set, and falls back to ~/GSForge_demo_data/ otherwise.
from os import environ
from pathlib import Path

OSF_PATH = (
    Path(environ.get("GSFORGE_DEMO_DATA", default="~/GSForge_demo_data/")).expanduser()
    / "osfstorage"
    / "oryza_sativa"
)

# Supplemental files from the literature source.
SI_FILE_1_PATH = OSF_PATH / 'GEMmakerGEMs' / 'raw_annotation_data' / 'TPC2016-00158-LSBR2_Supplemental_File_1.csv'
SI_FILE_5_PATH = OSF_PATH / 'GEMmakerGEMs' / 'raw_annotation_data' / 'TPC2016-00158-LSBR2_Supplemental_File_5.txt'

# Input AnnotatedGEM file and the output directory for the literature collections.
GEM_PATH = OSF_PATH / "AnnotatedGEMs" / "oryza_sativa_hisat2_raw.nc"
LITERATURE_COLL_PATH = OSF_PATH / 'GeneSetCollections' / 'literature'

Load an AnnotatedGEM

# Construct the AnnotatedGEM from the file at GEM_PATH; the bare name on the
# last line displays its summary.
agem = gsf.AnnotatedGEM(GEM_PATH)
agem
<GSForge.AnnotatedGEM>
Name: Oryza sativa
Selected GEM Variable: 'counts'
    Gene   55986
    Sample 475

Load Data

# Preview the first six raw lines of supplemental file 1 to see how many
# title/comment rows precede the actual table header.
# zip() against range(6) stops at whichever runs out first, so a file with
# fewer than six lines is shown in full instead of raising StopIteration.
with open(SI_FILE_1_PATH) as myfile:
    head = ''.join(line for _, line in zip(range(6), myfile))
print(head)
#Supplemental Data. Wilkins et al. Plant Cell (2016) 10.1105/tpc.16.00158.,,,,,,,,
#Supplemental Data Set 1: Differentially expressed genes,,,,,,,,
,,,,,,,,
gene,HEAT_UP,HEAT_DOWN,RECOV_HEAT_UP,RECOV_HEAT_DOWN,DROUGHT_UP,DROUGHT_DOWN,RECOV_DROUGHT_UP,RECOV_DROUGHT_DOWN
ChrSy.fgenesh.gene.37,0,0,0,0,8,0,1,0
LOC_Os01g01610,6,0,0,0,0,0,0,0
# The file opens with two '#' title lines and one row of empty fields; skipping
# those three lines makes the 'gene,HEAT_UP,...' row the header. The first
# column (gene identifiers) becomes the index.
si1_df = pd.read_csv(
    SI_FILE_1_PATH,
    index_col=0,
    skiprows=3,
)
si1_df.head()
HEAT_UP HEAT_DOWN RECOV_HEAT_UP RECOV_HEAT_DOWN DROUGHT_UP DROUGHT_DOWN RECOV_DROUGHT_UP RECOV_DROUGHT_DOWN
gene
ChrSy.fgenesh.gene.37 0 0 0 0 8 0 1 0
LOC_Os01g01610 6 0 0 0 0 0 0 0
LOC_Os01g01620 0 0 0 0 1 0 2 0
LOC_Os01g02060 0 0 0 0 2 0 2 0
LOC_Os01g02080 0 0 0 0 0 2 0 1
# Display the Gene coordinate of the GEM.
# NOTE(review): presumably shown to confirm these identifiers use the same
# naming scheme as the supplemental file's gene index — confirm intent.
agem.data.Gene
<xarray.DataArray 'Gene' (Gene: 55986)>
array(['LOC_Os06g05820', 'LOC_Os10g27460', 'LOC_Os02g35980', ...,
       'LOC_Os03g50190', 'LOC_Os03g20020', 'LOC_Os07g03418'], dtype=object)
Coordinates:
  * Gene     (Gene) object 'LOC_Os06g05820' ... 'LOC_Os07g03418'
# Preview the first six raw lines of supplemental file 5 to see its layout
# (title rows, header row, delimiter).
# zip() against range(6) stops at whichever runs out first, so a file with
# fewer than six lines is shown in full instead of raising StopIteration.
with open(SI_FILE_5_PATH) as myfile:
    head = ''.join(line for _, line in zip(range(6), myfile))
print(head)
#Supplemental Data Set 5: TF predictor groups

group.name	members
pred.group.1	LOC_Os01g01430,LOC_Os01g60020,LOC_Os01g66120,LOC_Os03g21060,LOC_Os05g34830,LOC_Os07g37920,LOC_Os07g48450,LOC_Os11g03300,LOC_Os11g08210,LOC_Os12g03040
pred.group.2	LOC_Os01g01470,LOC_Os03g21030,LOC_Os03g42630,LOC_Os04g38720,LOC_Os06g23650,LOC_Os07g48550,LOC_Os08g40030,LOC_Os09g32260
pred.group.3	LOC_Os01g04750,LOC_Os01g04800,LOC_Os01g49830
# The file is tab-separated. Its first two lines (a '#' title line and a blank
# line) are skipped so that the 'group.name<TAB>members' row is the header.
# The first column (group names) becomes the index.
si5_df = pd.read_csv(
    SI_FILE_5_PATH,
    sep='\t',
    index_col=0,
    skiprows=2,
)
si5_df.head()
members
group.name
pred.group.1 LOC_Os01g01430,LOC_Os01g60020,LOC_Os01g66120,L...
pred.group.2 LOC_Os01g01470,LOC_Os03g21030,LOC_Os03g42630,L...
pred.group.3 LOC_Os01g04750,LOC_Os01g04800,LOC_Os01g49830
pred.group.4 LOC_Os01g07120,LOC_Os02g45450,LOC_Os04g48350,L...
pred.group.5 LOC_Os01g09640,LOC_Os05g10690

Wrangle Data and Create Collections

At the very least we need a name and a list of genes.

# Build one GeneSet per column of the differential-expression table and gather
# them into a single collection tied to the GEM.
lit_dge_coll = gsf.GeneSetCollection(gem=agem, name='Literature DGE')

# Gene identifiers present in the GEM. This does not change between
# iterations, so it is extracted once instead of once per column.
gem_genes = agem.data.Gene.values

for col in si1_df.columns:
    # Genes with a positive entry in this column form the set named after it.
    genes = si1_df.loc[si1_df[col] > 0].index.values

    # Report literature genes that are absent from the GEM, labelled with the
    # set they belong to. They are reported only, not removed.
    diff = np.setdiff1d(genes, gem_genes)
    if diff.size > 0:
        print("{}: {} gene(s) not found in the GEM: {}".format(col, diff.size, diff))

    lit_dge_coll[col] = gsf.GeneSet.from_gene_array(genes, name=col)

lit_dge_coll
# genes
<GSForge.GeneSetCollection>
Literature DGE
GeneSets (8 total): Support Count
    DROUGHT_UP: 1175
    HEAT_UP: 592
    RECOV_DROUGHT_UP: 446
    DROUGHT_DOWN: 170
    HEAT_DOWN: 106
    ... and 3 more.
# Build one GeneSet per transcription-factor predictor group and gather them
# into a single collection tied to the GEM.
lit_tf_coll = gsf.GeneSetCollection(gem=agem, name='Literature Transcription Factors')

# Gene identifiers present in the GEM. This does not change between
# iterations, so it is extracted once instead of once per group.
gem_genes = agem.data.Gene.values

# Each 'members' cell is a comma-separated string of gene identifiers; the
# index label is the group name.
for name, members in si5_df['members'].items():
    genes = np.asarray(members.split(','))

    # Report literature genes that are absent from the GEM, labelled with the
    # group they belong to. They are reported only, not removed. The check runs
    # before the set is added so the message appears even if adding fails.
    diff = np.setdiff1d(genes, gem_genes)
    if diff.size > 0:
        print("{}: {} gene(s) not found in the GEM: {}".format(name, diff.size, diff))

    lit_tf_coll[name] = gsf.GeneSet.from_gene_array(genes, name=name)

lit_tf_coll
<GSForge.GeneSetCollection>
Literature Transcription Factors
GeneSets (62 total): Support Count
    pred.group.8: 21
    pred.group.6: 13
    pred.group.16: 13
    pred.group.46: 11
    pred.group.1: 10
    ... and 57 more.
# Look up a single GeneSet in the collection by name and display its support
# (the array of gene identifiers shown below).
lit_tf_coll['pred.group.8'].get_support()
array(['LOC_Os01g39020', 'LOC_Os01g43590', 'LOC_Os01g53220',
       'LOC_Os01g54550', 'LOC_Os02g13800', 'LOC_Os02g32590',
       'LOC_Os03g06630', 'LOC_Os03g12370', 'LOC_Os03g25120',
       'LOC_Os03g53340', 'LOC_Os04g48030', 'LOC_Os05g45410',
       'LOC_Os06g35960', 'LOC_Os06g36930', 'LOC_Os07g08140',
       'LOC_Os07g44690', 'LOC_Os08g43334', 'LOC_Os09g28200',
       'LOC_Os09g28354', 'LOC_Os09g35790', 'LOC_Os10g28340'], dtype='<U14')
# Save each collection to its own location beneath LITERATURE_COLL_PATH
# ('DGE' and 'TF').
# NOTE(review): assumes save() creates the target directory when it does not
# exist — confirm against the GSForge documentation.
lit_dge_coll.save(LITERATURE_COLL_PATH.joinpath('DGE'))
lit_tf_coll.save(LITERATURE_COLL_PATH.joinpath('TF'))