UMAP Embeddings

Uniform Manifold Approximation and Projection (UMAP) is a dimensinoal reduction technique. See the full documentation here.

from os import  environ
from pathlib import Path
import numpy as np
import umap
import umap.plot
import GSForge as gsf
import holoviews as hv
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
hv.extension("bokeh")

OSF_PATH = Path(environ.get("GSFORGE_DEMO_DATA", default="~/GSForge_demo_data/osfstorage/oryza_sativa")).expanduser()
NORMED_GEM_PATH = OSF_PATH.joinpath("AnnotatedGEMs", "oryza_sativa_hisat2_normed.nc")
agem = gsf.AnnotatedGEM(NORMED_GEM_PATH)
agem
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/virtualenv/python3.7.1/lib/python3.7/site-packages/xarray/backends/file_manager.py in _acquire_with_cache_info(self, needs_lock)
    198             try:
--> 199                 file = self._cache[self._key]
    200             except KeyError:

~/virtualenv/python3.7.1/lib/python3.7/site-packages/xarray/backends/lru_cache.py in __getitem__(self, key)
     52         with self._lock:
---> 53             value = self._cache[key]
     54             self._cache.move_to_end(key)

KeyError: [<class 'netCDF4._netCDF4.Dataset'>, ('/home/travis/GSForge_demo_data/AnnotatedGEMs/oryza_sativa_hisat2_normed.nc',), 'r', (('clobber', True), ('diskless', False), ('format', 'NETCDF4'), ('persist', False))]

During handling of the above exception, another exception occurred:

FileNotFoundError                         Traceback (most recent call last)
/tmp/ipykernel_5750/3720900272.py in <module>
----> 1 agem = gsf.AnnotatedGEM(NORMED_GEM_PATH)
      2 agem

~/virtualenv/python3.7.1/lib/python3.7/site-packages/GSForge/models/_AnnotatedGEM.py in __init__(self, *args, **params)
     65         logger.debug('Initializing a new gsforge.AnnotatedGEM object...')
     66         if args:
---> 67             params = self.__annotated_gem_dispatch(*args, **params)
     68         super().__init__(**params)
     69         logger.debug('AnnotatedGEM initialization complete.')

~/virtualenv/python3.7.1/lib/python3.7/site-packages/GSForge/_singledispatchmethod.py in _method(*args, **kwargs)
     71         def _method(*args, **kwargs):
     72             method = self.dispatcher.dispatch(args[0].__class__)
---> 73             return method.__get__(obj, cls)(*args, **kwargs)
     74 
     75         _method.__isabstractmethod__ = self.__isabstractmethod__

~/virtualenv/python3.7.1/lib/python3.7/site-packages/GSForge/models/_AnnotatedGEM.py in _parse_netcdf_path(cls, netcdf_path, **params)
    172             A parsed parameter dictionary.
    173         """
--> 174         params = cls._parse_xarray_dataset(xr.open_dataset(netcdf_path), **params)
    175         return {"data": xr.open_dataset(netcdf_path), **params}
    176 

~/virtualenv/python3.7.1/lib/python3.7/site-packages/xarray/backends/api.py in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, backend_kwargs, *args, **kwargs)
    499         drop_variables=drop_variables,
    500         **decoders,
--> 501         **kwargs,
    502     )
    503     ds = _dataset_from_backend_dataset(

~/virtualenv/python3.7.1/lib/python3.7/site-packages/xarray/backends/netCDF4_.py in open_dataset(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, group, mode, format, clobber, diskless, persist, lock, autoclose)
    558             persist=persist,
    559             lock=lock,
--> 560             autoclose=autoclose,
    561         )
    562 

~/virtualenv/python3.7.1/lib/python3.7/site-packages/xarray/backends/netCDF4_.py in open(cls, filename, mode, format, group, clobber, diskless, persist, lock, lock_maker, autoclose)
    378             netCDF4.Dataset, filename, mode=mode, kwargs=kwargs
    379         )
--> 380         return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose)
    381 
    382     def _acquire(self, needs_lock=True):

~/virtualenv/python3.7.1/lib/python3.7/site-packages/xarray/backends/netCDF4_.py in __init__(self, manager, group, mode, lock, autoclose)
    326         self._group = group
    327         self._mode = mode
--> 328         self.format = self.ds.data_model
    329         self._filename = self.ds.filepath()
    330         self.is_remote = is_remote_uri(self._filename)

~/virtualenv/python3.7.1/lib/python3.7/site-packages/xarray/backends/netCDF4_.py in ds(self)
    387     @property
    388     def ds(self):
--> 389         return self._acquire()
    390 
    391     def open_store_variable(self, name, var):

~/virtualenv/python3.7.1/lib/python3.7/site-packages/xarray/backends/netCDF4_.py in _acquire(self, needs_lock)
    381 
    382     def _acquire(self, needs_lock=True):
--> 383         with self._manager.acquire_context(needs_lock) as root:
    384             ds = _nc4_require_group(root, self._group, self._mode)
    385         return ds

/opt/python/3.7.1/lib/python3.7/contextlib.py in __enter__(self)
    110         del self.args, self.kwds, self.func
    111         try:
--> 112             return next(self.gen)
    113         except StopIteration:
    114             raise RuntimeError("generator didn't yield") from None

~/virtualenv/python3.7.1/lib/python3.7/site-packages/xarray/backends/file_manager.py in acquire_context(self, needs_lock)
    185     def acquire_context(self, needs_lock=True):
    186         """Context manager for acquiring a file."""
--> 187         file, cached = self._acquire_with_cache_info(needs_lock)
    188         try:
    189             yield file

~/virtualenv/python3.7.1/lib/python3.7/site-packages/xarray/backends/file_manager.py in _acquire_with_cache_info(self, needs_lock)
    203                     kwargs = kwargs.copy()
    204                     kwargs["mode"] = self._mode
--> 205                 file = self._opener(*self._args, **kwargs)
    206                 if self._mode == "w":
    207                     # ensure file doesn't get overriden when opened again

src/netCDF4/_netCDF4.pyx in netCDF4._netCDF4.Dataset.__init__()

src/netCDF4/_netCDF4.pyx in netCDF4._netCDF4._ensure_nc_success()

FileNotFoundError: [Errno 2] No such file or directory: b'/home/travis/GSForge_demo_data/AnnotatedGEMs/oryza_sativa_hisat2_normed.nc'

Plotting with umap.plot

By far the easiest method.

counts, labels = gsf.get_gem_data(agem, annotation_variables=['treatment', 'genotype', 'time'], 
                                  count_transform=lambda c: np.log2(c + 0.25))
mapper = umap.UMAP(random_state=42, metric='manhattan', n_neighbors=50).fit(counts.values)
fig, axes = plt.subplots(1, 3, figsize=(21, 7))
umap.plot.points(mapper, labels=labels['treatment'], background='black', ax=axes[0], color_key_cmap='Set1');
umap.plot.points(mapper, labels=labels['genotype'], background='black', ax=axes[1], color_key_cmap='Set2');
umap.plot.points(mapper, labels=labels['time'], background='black', ax=axes[2], color_key_cmap='plasma');
../../_images/UMAP_Embeddings_4_0.png

Plotting Manually

For posters, papers and other occasions that may have specific formatting requirements you will want to access the embedding data directly and construct your plot to said requirements.

# Set up custom colors.
treatment_cmap = {
    'CONTROL': '#1f78b4',
    'HEAT': '#e31a1c',
    'RECOV_HEAT': '#fb9a99',
    'DROUGHT': '#6a3d9a',
    'RECOV_DROUGHT': '#cab2d6',
}
treatment_colors = labels['treatment'].to_series().map(treatment_cmap).values

# Declare fonts and sizes per your requirements.
plt.rcParams.update({'font.size': 10, 'font.family': 'serif'})

# Create the figure at the right size and resolution.
fig_inches = 3.5
fig_dpi = 300  # 300 is a common DPI requirement.

# Construct the figure.
fig, ax = plt.subplots(figsize=(fig_inches, fig_inches), dpi=fig_dpi)

# Fuss with the ticks. The numerical output of UMAP is not so important here.
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)

for spine in ax.spines.values():
    spine.set_edgecolor('grey')
    spine.set_linewidth(0.5)
    
    
# Add the scatter points.
ax.scatter(mapper.embedding_[:, 0], mapper.embedding_[:, 1], s=10, marker='.', linewidths=0, c=treatment_colors);
../../_images/UMAP_Embeddings_6_0.png
handles = [mpatches.Patch(color=c, label=k) for k, c in treatment_cmap.items()]
fig, axes = plt.subplots(dpi=fig_dpi, figsize=(3, 0.75))
axes.axis('off')
axes.legend(handles=handles, ncol=2);
../../_images/UMAP_Embeddings_7_0.png