# Source code for stdpopsim.annotations

"""
Infrastructure for defining information about genome annotation.
"""

import logging
import os
import attr
import numpy as np
import stdpopsim

logger = logging.getLogger(__name__)


@attr.s(kw_only=True)
class Annotation:
    """
    Class representing an annotation track.

    :ivar str ~.id: String that uniquely identifies the annotation.
    :ivar species: The species to which this annotation applies.
    :vartype species: :class:`.Species`
    :ivar str url: The URL where the packed and compressed GFF3 can be found.
    :ivar str gff_sha256: Checksum associated with the GFF3 file (presumably
        the SHA256 of the file at ``url``). Stored only; not used by the
        methods of this class.
    :ivar str intervals_url: The URL of the intervals cache of the annotations.
    :ivar str intervals_sha256: The SHA256 checksum of the annotations cache.
    :ivar str ~.description: One line description of the annotation.
    :ivar citations: List of citations for the annotation.
    :vartype citations: list of :class:`.Citation`
    :ivar file_pattern: The pattern used to map individual chromosome id
        strings to files. It is expanded with ``file_pattern.format(id=...)``
        and joined onto the cache directory.
    :ivar annotation_source: Stored only; not used by the methods of this
        class (presumably names the origin of the annotation).
    :ivar annotation_type: Stored only; not used by the methods of this
        class (presumably names the kind of feature annotated).
    """

    id = attr.ib()
    species = attr.ib()
    url = attr.ib()
    gff_sha256 = attr.ib()
    intervals_url = attr.ib()
    intervals_sha256 = attr.ib()
    description = attr.ib()
    citations = attr.ib(factory=list)
    # Mandatory attributes may follow a defaulted one because the class is
    # declared with kw_only=True.
    file_pattern = attr.ib()
    annotation_source = attr.ib()
    annotation_type = attr.ib()

    def __attrs_post_init__(self):
        # The cache handle is built from the intervals URL/checksum (not the
        # raw GFF3 URL) and is namespaced per species and annotation id.
        self._cache = stdpopsim.CachedData(
            namespace=f"annotations/{self.species.id}/{self.id}",
            url=self.intervals_url,
            sha256=self.intervals_sha256,
            extract=True,
        )

    @property
    def cache_path(self):
        """
        The cache path reported by the underlying cached-data object.
        """
        return self._cache.cache_path

    def __str__(self):
        lines = [
            "GTF Annotation:",
            f"\tspecies = {self.species.name}",
            f"\tid = {self.id}",
            f"\turl = {self.url}",
            f"\tintervals url = {self.intervals_url}",
            f"\tcached = {self.is_cached()}",
            f"\tcache_path = {self.cache_path}",
        ]
        # Every line, including the last, is newline-terminated.
        return "\n".join(lines) + "\n"

    def is_cached(self):
        """
        Returns True if this annotation is cached locally.
        """
        return self._cache.is_valid()

    def download(self):
        """
        Downloads the intervals URL and stores it in the cache directory.
        """
        self._cache.download()

    def get_chromosome_annotations(self, id):
        """
        Returns the numpy interval array for the chromosome with
        the specified id.

        The annotation is downloaded first if it is not already cached.

        :param id: Chromosome identifier, resolved through
            ``self.species.genome.get_chromosome``.
        :return: A two-dimensional ``int32`` array with one row per line of
            the cached interval file.
        :raises ValueError: If the interval file contains no data.
        """
        chrom = self.species.genome.get_chromosome(id)
        if not self.is_cached():
            self.download()
        file_name = self.file_pattern.format(id=chrom.id)
        file_path = os.path.join(self.cache_path, file_name)
        # ndmin=2 keeps the result two-dimensional even when the file holds a
        # single line; without it loadtxt squeezes that case to a 1-D array.
        ret = np.loadtxt(file_path, dtype="int32", ndmin=2)
        # Check .size rather than len() so the test does not depend on the
        # shape numpy gives an empty file.
        if ret.size == 0:
            raise ValueError(f"No annotations found for {id}")
        return ret