Source code for bio2bel_chebi.manager

# -*- coding: utf-8 -*-

"""Manager for Bio2BEL ChEBI."""

import datetime
import logging
import time
from typing import Iterable, List, Mapping, Optional, Tuple

import pandas as pd
from networkx import relabel_nodes
from pybel import BELGraph
from pybel.constants import IDENTIFIER, NAME, NAMESPACE
from pybel.dsl import BaseEntity
from pybel.manager.models import Namespace, NamespaceEntry
from tqdm import tqdm

from bio2bel import AbstractManager
from bio2bel.manager.flask_manager import FlaskMixin
from bio2bel.manager.namespace_manager import BELNamespaceManagerMixin
from .constants import MODULE_NAME
from .models import Accession, Base, Chemical, Relation, Synonym
from .parser.accession import get_accession_df
from .parser.compounds import get_compounds_df
from .parser.inchis import get_inchis_df
from .parser.names import get_names_df
from .parser.relation import get_relations_df

__all__ = ['Manager']

log = logging.getLogger(__name__)

_chebi_bel_name = 'ChEBI Ontology'
_chebi_bel_version = datetime.datetime.utcnow().strftime('%Y%m%d%H%M')
_chebi_description = 'Relations between chemicals of biological interest'


[docs]class Manager(AbstractManager, FlaskMixin, BELNamespaceManagerMixin): """Chemical multi-hierarchy.""" _base = Base module_name = MODULE_NAME namespace_model = Chemical identifiers_recommended = 'ChEBI' identifiers_pattern = r'^CHEBI:\d+$' identifiers_miriam = 'MIR:00000002' identifiers_namespace = 'chebi' identifiers_url = 'http://identifiers.org/chebi/' flask_admin_models = [Chemical, Relation, Synonym, Accession] def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # a dictionary from CHEBI identifier (string CHEBI:\d+) to the model self.id_chemical = {} self.chebi_id_to_chemical = {} self.chebi_id_to_inchi = {}
[docs] def is_populated(self) -> bool: """Check if the database is already populated.""" return 0 < self.count_chemicals()
[docs] def count_chemicals(self) -> int: """Count the number of chemicals stored.""" return self.session.query(Chemical).count()
[docs] def count_parent_chemicals(self) -> int: """Count the number of parent chemicals stored.""" return self.session.query(Chemical).filter(Chemical.parent_id.is_(None)).count()
[docs] def count_child_chemicals(self) -> int: """Count the number of child chemicals stored.""" return self.session.query(Chemical).filter(Chemical.parent_id.isnot(None)).count()
[docs] def count_xrefs(self) -> int: """Count the number of cross-references stored.""" return self.session.query(Accession).count()
[docs] def count_synonyms(self) -> int: """Count the number of synonyms stored.""" return self.session.query(Synonym).count()
[docs] def count_inchis(self) -> int: """Count the number of inchis stored.""" return self.session.query(Chemical).filter(Chemical.inchi.isnot(None)).count()
[docs] def count_relations(self) -> int: """Count the relations in the database.""" return self._count_model(Relation)
[docs] def list_relations(self) -> List[Relation]: """List the relations in the database.""" return self.session.query(Relation).all()
[docs] def summarize(self) -> Mapping[str, int]: """Return a summary dictionary over the content of the database.""" return dict( chemicals=self.count_chemicals(), xrefs=self.count_xrefs(), relations=self.count_relations(), synonyms=self.count_synonyms(), )
[docs] def get_or_create_chemical(self, chebi_id: str, **kwargs) -> Chemical: """Get a chemical from the database by ChEBI.""" chemical = self.chebi_id_to_chemical.get(chebi_id) if chemical is not None: return chemical chemical = self.get_chemical_by_chebi_id(chebi_id) if chemical is None: chemical = Chemical(chebi_id=chebi_id, **kwargs) self.chebi_id_to_chemical[chebi_id] = chemical return chemical
[docs] def get_chemical_by_chebi_id(self, chebi_id: str) -> Optional[Chemical]: """Get a chemical from the database.""" chemical = self.session.query(Chemical).filter(Chemical.chebi_id == chebi_id).one_or_none() if not chemical: return None if chemical.parent: return chemical.parent return chemical
[docs] def get_chemical_by_chebi_name(self, name: str) -> Optional[Chemical]: """Get a chemical from the database.""" return self.session.query(Chemical).filter(Chemical.name == name).one_or_none()
[docs] def build_chebi_id_name_mapping(self) -> Mapping[str, str]: """Build a mapping from ChEBI identifier to ChEBI name.""" # FIXME handle secondary id to correct name mappings, since the name isn't stored with the secondary id entry return dict(self.session.query(Chemical.chebi_id, Chemical.name).all())
[docs] def build_chebi_name_id_mapping(self) -> Mapping[str, str]: """Build a mapping from ChEBI name to ChEBI identifier.""" return dict(self.session.query(Chemical.name, Chemical.chebi_id).all())
def _load_inchis(self, url: Optional[str] = None) -> None: """Download and insert the InChI strings. :param url: The URL (or file path) to download. Defaults to the ChEBI data. """ df = get_inchis_df(url=url) for _, (chebi_id, inchi) in tqdm(df.iterrows(), desc='InChIs', total=len(df.index)): self.chebi_id_to_inchi[str(chebi_id)] = inchi def _populate_compounds(self, url: Optional[str] = None) -> None: """Download and populate the compounds. :param url: The URL (or file path) to download. Defaults to the ChEBI data. """ df = get_compounds_df(url=url) df = df.where((pd.notnull(df)), None) log.info('preparing Compounds') parents = [] it = tqdm(df.iterrows(), desc='Compounds', total=len(df.index)) for _, (pk, status, chebi_id, source, parent_pk, name, definition, _, _, _) in it: chebi_id = chebi_id.split(':')[1] chemical = self.id_chemical[pk] = self.chebi_id_to_chemical[chebi_id] = Chemical( id=pk, # ChEBI already sends out their data in relational format status=status, chebi_id=chebi_id, parent_id=parent_pk or None, name=name, source=source, definition=definition, inchi=self.chebi_id_to_inchi.get(chebi_id) ) self.session.add(chemical) if parent_pk: parents.append((pk, parent_pk)) for child_id, parent_pk in tqdm(parents, desc='Secondaries'): child = self.chebi_id_to_chemical.get(child_id) parent = self.chebi_id_to_chemical.get(parent_pk) if child is not None and parent is not None: child.parent = parent log.info('committing Compounds') self.session.commit() def _populate_names(self, url: Optional[str] = None) -> None: """Download and insert the synonyms. :param url: The URL (or file path) to download. Defaults to the ChEBI data. """ df = get_names_df(url=url) log.info('preparing Synonyms') grouped_df = df.groupby('COMPOUND_ID') for chebi_id, sub_df in tqdm(grouped_df, desc='Synonyms', total=len(grouped_df)): chebi_id = str(int(chebi_id)) chemical = self.get_or_create_chemical(chebi_id=chebi_id) for _, (pk, _, type_, source, name, adapted, language) in sub_df.iterrows(): if isinstance(name, float) or not name: continue synonym = Synonym( id=pk, chemical=chemical, type=type_, source=source, name=name, language=language ) self.session.add(synonym) log.info('committing Synonyms') self.session.commit() def _populate_accession(self, url: Optional[str] = None) -> None: """Download and inserts the database cross references and accession numbers :param url: The URL (or file path) to download. Defaults to the ChEBI data. """ df = get_accession_df(url=url) df = df.where((pd.notnull(df)), None) log.info('preparing Accessions') grouped_df = df.groupby('COMPOUND_ID') for chebi_id, sub_df in tqdm(grouped_df, desc='Xrefs', total=len(grouped_df)): chebi_id = str(int(chebi_id)) chemical = self.get_or_create_chemical(chebi_id=chebi_id) for _, (pk, _, source, type_, accession) in sub_df.iterrows(): acc = Accession( id=pk, chemical=chemical, source=source, type=type_, accession=accession ) self.session.add(acc) log.info('committing Accessions') self.session.commit() def _populate_relations(self, url: Optional[str] = None) -> None: df = get_relations_df(url=url) for _, (pk, relation_type, source_id, target_id, status) in tqdm(df.iterrows(), total=len(df.index)): source = self.id_chemical.get(f'CHEBI:{source_id}') if source is None: continue target = self.id_chemical.get(f'CHEBI:{target_id}') if target is None: continue relation = Relation( id=pk, type=relation_type, source=source, target=target, status=status, ) self.session.add(relation) log.info('committing Relations') self.session.commit()
[docs] def populate( self, inchis_url: Optional[str] = None, compounds_url: Optional[str] = None, relations_url: Optional[str] = None, names_url: Optional[str] = None, accessions_url: Optional[str] = None, ) -> None: """Populate all tables.""" t = time.time() self._load_inchis(url=inchis_url) self._populate_compounds(url=compounds_url) # self._populate_relations(url=relations_url) # self._populate_names(url=names_url) # self._populate_accession(url=accessions_url) log.info('populated in %.2f seconds', time.time() - t)
def normalize_chemicals(self, graph: BELGraph, use_tqdm: bool = False) -> None: mapping = { node: chemical.to_bel() for node, chemical in list(self.iter_chemicals(graph, use_tqdm=use_tqdm)) } relabel_nodes(graph, mapping, copy=False)
[docs] def iter_chemicals(self, graph: BELGraph, use_tqdm: bool = False) -> Iterable[Tuple[BaseEntity, Chemical]]: """Iterate over pairs of BEL nodes and ChEBI chemicals.""" it = ( tqdm(graph, desc='ChEBI chemicals') if use_tqdm else graph ) for node in it: chemical = self.get_chemical_from_data(node) if chemical is not None: yield node, chemical
def get_chemical_from_data(self, node: BaseEntity) -> Optional[Chemical]: namespace = node.get(NAMESPACE) if not namespace or namespace.lower() not in {'chebi', 'chebiid'}: return identifier = node.get(IDENTIFIER) name = node.get(NAME) if identifier is None and name is None: raise ValueError if namespace.lower() == 'chebiid': return self.get_chemical_by_chebi_id(name) elif namespace.lower() == 'chebi': if identifier is not None: return self.get_chemical_by_chebi_id(identifier) else: # elif name is not None: return self.get_chemical_by_chebi_name(name) log.warning('Could not find ChEBI node: %r', node)
[docs] def enrich_chemical_hierarchy(self, graph: BELGraph) -> None: """Enrich the parents for all ChEBI chemicals in the graph.""" for _, data in graph.nodes(data=True): chemical = self.get_chemical_from_data(data) if chemical is None: continue parent = chemical.parent while parent is not None: graph.add_is_a(chemical.as_bel(), parent.as_bel()) chemical, parent = parent, parent.parent
def _list_equivalencies(self) -> List[Chemical]: return self.session.query(Chemical).filter(Chemical.parent_id.isnot(None)) def _iterate_relations(self): # return self.session.query(Relation).limit(100) return tqdm(self.list_relations(), total=self.count_relations(), desc='Relation')
[docs] def to_bel(self) -> BELGraph: """Export BEL.""" graph = BELGraph( name=_chebi_bel_name, version=_chebi_bel_version, description=_chebi_description, ) self.add_namespace_to_graph(graph) for relation in self._iterate_relations(): relation.add_to_graph(graph) return graph
def _create_namespace_entry_from_model(self, chemical: Chemical, namespace: Namespace) -> NamespaceEntry: """Create a namespace entry from a chemical model.""" if chemical.name: return NamespaceEntry( encoding=chemical.bel_encoding, name=chemical.name, identifier=chemical.chebi_id, namespace=namespace, ) @staticmethod def _get_identifier(chemical: Chemical) -> str: """Get the identifier from the chemical model.""" return chemical.chebi_id @staticmethod def _get_name(chemical: Chemical) -> str: """Get the name of the chemical.""" return chemical.safe_name