genome

Genome Objects

class Genome()

__init__

def __init__(genome_fasta,
             gtf,
             name,
             db_conn,
             description=None,
             transcriptome_fasta=None,
             standalone=False,
             proteome_fasta=None,
             create=True)

Arguments:

  • gtf: Path to the GTF file
  • genome_fasta: Path to the genome fasta file
  • transcriptome_fasta: Path to the transcriptome fasta file
  • proteome_fasta: Path to the proteome fasta file
  • db_conn: database connection object; this is a SQLAlchemy engine
  • taxon_id: taxon id of the genome

genes

def genes(gene_ids=None, range=None, ignore_strand=True)

Return genes by gene id or by range. If a range is provided, the genes in that range are returned, depending on the overlap type.

Arguments:

  • gene_ids: gene ids, as used in the GTF file
  • range: A GenomicRange object
  • ignore_strand: whether to ignore strand; if true, all the genes in the range are returned regardless of strand

Returns:

a GenomicRangesDict object with the genes in it, each key is the gene name and the value is a GenomicRange object

transcripts

def transcripts(gene_ids=None,
                ids=None,
                range=None,
                ignore_strand=True,
                group_by_gene=True)

return transcripts by gene id, transcript id or range

Arguments:

  • gene_ids: return transcripts for these gene ids
  • ids: return transcripts with these transcript ids
  • range: return transcripts in this range
  • ignore_strand: ignore strand when searching by range
  • group_by_gene: whether to group the returned transcripts by gene id. If true, the returned object will have gene ids as keys and GenomicRangesList objects as values; if false, it will have transcript ids as keys and GenomicRange objects as values

Returns:

a genomic ranges dict object

exons

def exons(transcript_ids=None,
          range=None,
          group_by_transcript=True,
          ignore_strand=True)

Same as genes, but the search is by transcript rather than by gene. If you do not know the transcript id, search for it with transcripts first.

Arguments:

  • transcript_ids: return all exons for these transcript ids
  • id: return exon with this id
  • range: return exons in this range
  • group_by_transcript: whether to group the returned exons by transcript id; if true, the returned object will have transcript ids as keys, otherwise the keys will be exon ids
  • ignore_strand: whether to ignore strand when searching by range

Returns:

a genomic ranges dict object, if not grouped by transcript the keys will be exon ids otherwise the keys will be transcript ids

coding

def coding(transcript_ids=None,
           range=None,
           group_by_transcript=True,
           ignore_strand=True)

Same as exons, but returns all the coding sequences for a transcript or a list of transcripts.

Arguments:

  • transcript_ids: return all coding sequences for these transcript ids
  • id: return coding sequence with this id
  • range: return coding sequences in this range
  • group_by_transcript: whether to group the returned coding sequences by transcript id; if true, the returned object will have transcript ids as keys, otherwise the keys will be coding sequence ids

Returns:

a genomic ranges dict object, if not grouped by transcript the keys will be coding sequence ids otherwise the keys will be transcript ids

three_utr

def three_utr(transcript_ids=None, range=None, ignore_strand=True)

return all the 3’ utrs for a transcript or a list of transcripts

Arguments:

  • transcript_ids: return 3’ utrs for these transcript ids
  • range: return 3’ utrs in this range
  • ignore_strand: whether to ignore strand when searching by range

Returns:

a genomic ranges dict object with transcript ids as keys and GenomicRangesList objects as values. The UTRs are not described as separate exons; instead, the exons are merged into one if the UTR spans multiple exons. Additionally, if the UTR ends in the middle of an exon, the UTR will end there.

five_utr

def five_utr(transcript_ids=None, ids=None, range=None, ignore_strand=True)

return all the 5’ utrs for a transcript or a list of transcripts

Arguments:

  • transcript_ids: return 5’ utrs for these transcript ids
  • range: return 5’ utrs in this range
  • ignore_strand: whether to ignore strand when searching by range

Returns:

a genomic ranges dict object with transcript ids as keys and GenomicRangesList objects as values. The UTRs are not described as separate exons; instead, the exons are merged into one if the UTR spans multiple exons. Additionally, if the UTR ends in the middle of an exon, the UTR will end there.

introns

def introns(transcript_ids=None,
            ids=None,
            range=None,
            group_by_transcript=True,
            ignore_strand=True)

return all the introns for a transcript or a list of transcripts

Arguments:

  • transcript_ids: return introns for these transcript ids
  • id: return intron with this id (introns usually are not described in a GTF file, so this id may not be very useful since it is an auto-incremented id)
  • range: return introns in this range
  • group_by_transcript: return introns grouped by transcript
  • ignore_strand: whether to ignore strand when searching by range

Returns:

a genomic ranges dict object, if not grouped by transcript the keys will be intron ids otherwise the keys will be transcript ids

get_sequence

def get_sequence(genomic_range, type='genome')

Get the sequence of a genomic range. This takes a single genomic range; to get multiple sequences, you can iterate over a GenomicRangesList or GenomicRangesDict and call this method on each range.

Arguments:

  • genomic_range: GenomicRange object

Returns:

sequence as string

add_annotation

def add_annotation(table, row_id, annots)

add arbitrary annotations as a dictionary to a specific row in a specific table

Arguments:

  • table: which table to add the annotations to
  • row_id: which row id to add the annotations to; this is the database internal id, not the gene_id or transcript_id (those ids can be found in the annotations of each row)
  • annots: a dictionary of annotations to add

Returns:

None but the database will be updated