# Source code for bio2bel_chebi.manager

# -*- coding: utf-8 -*-

"""Manager for Bio2BEL ChEBI."""

import datetime
import logging
import time
from typing import Iterable, List, Mapping, Optional, Tuple

import pandas as pd
from networkx import relabel_nodes
from pybel import BELGraph
from pybel.constants import IDENTIFIER, NAME, NAMESPACE
from pybel.dsl import BaseEntity
from pybel.manager.models import Namespace, NamespaceEntry
from tqdm import tqdm

from bio2bel import AbstractManager
from bio2bel.manager.flask_manager import FlaskMixin
from bio2bel.manager.namespace_manager import BELNamespaceManagerMixin
from .constants import MODULE_NAME
from .models import Accession, Base, Chemical, Relation, Synonym
from .parser.accession import get_accession_df
from .parser.compounds import get_compounds_df
from .parser.inchis import get_inchis_df
from .parser.names import get_names_df
from .parser.relation import get_relations_df

# Only the manager class is exported from this module.
__all__ = ['Manager']

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Metadata passed to the BELGraph constructor when the relations are exported as BEL.
_chebi_bel_name = 'ChEBI Ontology'
# Minute-resolution UTC timestamp, computed once at import time. An aware "now" is used
# because ``datetime.utcnow()`` is deprecated since Python 3.12; the formatted text is the same.
_chebi_bel_version = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%d%H%M')
_chebi_description = 'Relations between chemicals of biological interest'


class Manager(AbstractManager, FlaskMixin, BELNamespaceManagerMixin):
    """Chemical multi-hierarchy.

    Manager over the ChEBI models (chemicals, relations, synonyms and accessions).
    """

    # Model base and module name of this package.
    # NOTE(review): presumably consumed by AbstractManager - confirm.
    _base = Base
    module_name = MODULE_NAME

    # Namespace settings: the model backing the namespace and its identifiers.org metadata.
    # NOTE(review): presumably consumed by BELNamespaceManagerMixin - confirm.
    namespace_model = Chemical
    identifiers_recommended = 'ChEBI'
    identifiers_pattern = r'^CHEBI:\d+$'
    identifiers_miriam = 'MIR:00000002'
    identifiers_namespace = 'chebi'
    identifiers_url = 'http://identifiers.org/chebi/'

    # Models listed for the Flask admin.
    flask_admin_models = [
        Chemical,
        Relation,
        Synonym,
        Accession,
    ]

    def __init__(self, *args, **kwargs):
        """Initialize the manager and its empty in-memory caches.

        All arguments are passed through unchanged to the parent initializer.
        """
        super().__init__(*args, **kwargs)

        # Cache from primary key to Chemical model (filled in _populate_compounds)
        self.id_chemical = {}
        # Cache from ChEBI identifier to Chemical model; _populate_compounds stores the part
        # of the accession after the colon as the key
        self.chebi_id_to_chemical = {}
        # Cache from ChEBI identifier (as a string) to InChI (filled in _load_inchis)
        self.chebi_id_to_inchi = {}

[docs]    def is_populated(self) -> bool:
        """Check if the database is already populated."""
        return 0 < self.count_chemicals()

[docs]    def count_chemicals(self) -> int:
        """Count the number of chemicals stored."""
        return self.session.query(Chemical).count()

[docs]    def count_parent_chemicals(self) -> int:
        """Count the number of parent chemicals stored."""
        return self.session.query(Chemical).filter(Chemical.parent_id.is_(None)).count()

[docs]    def count_child_chemicals(self) -> int:
        """Count the number of child chemicals stored."""
        return self.session.query(Chemical).filter(Chemical.parent_id.isnot(None)).count()

[docs]    def count_xrefs(self) -> int:
        """Count the number of cross-references stored."""
        return self.session.query(Accession).count()

[docs]    def count_synonyms(self) -> int:
        """Count the number of synonyms stored."""
        return self.session.query(Synonym).count()

[docs]    def count_inchis(self) -> int:
        """Count the number of inchis stored."""
        return self.session.query(Chemical).filter(Chemical.inchi.isnot(None)).count()

[docs]    def count_relations(self) -> int:
        """Count the relations in the database."""
        return self._count_model(Relation)

[docs]    def list_relations(self) -> List[Relation]:
        """List the relations in the database."""
        return self.session.query(Relation).all()

[docs]    def summarize(self) -> Mapping[str, int]:
        """Return a summary dictionary over the content of the database."""
        return dict(
            chemicals=self.count_chemicals(),
            xrefs=self.count_xrefs(),
            relations=self.count_relations(),
            synonyms=self.count_synonyms(),
        )

[docs]    def get_or_create_chemical(self, chebi_id: str, **kwargs) -> Chemical:
        """Get a chemical from the database by ChEBI."""
        chemical = self.chebi_id_to_chemical.get(chebi_id)

        if chemical is not None:
            return chemical

        chemical = self.get_chemical_by_chebi_id(chebi_id)

        if chemical is None:
            chemical = Chemical(chebi_id=chebi_id, **kwargs)

        self.chebi_id_to_chemical[chebi_id] = chemical
        return chemical

[docs]    def get_chemical_by_chebi_id(self, chebi_id: str) -> Optional[Chemical]:
        """Get a chemical from the database."""
        chemical = self.session.query(Chemical).filter(Chemical.chebi_id == chebi_id).one_or_none()

        if not chemical:
            return None

        if chemical.parent:
            return chemical.parent

        return chemical

[docs]    def get_chemical_by_chebi_name(self, name: str) -> Optional[Chemical]:
        """Get a chemical from the database."""
        return self.session.query(Chemical).filter(Chemical.name == name).one_or_none()

[docs]    def build_chebi_id_name_mapping(self) -> Mapping[str, str]:
        """Build a mapping from ChEBI identifier to ChEBI name."""
        # FIXME handle secondary id to correct name mappings, since the name isn't stored with the secondary id entry
        return dict(self.session.query(Chemical.chebi_id, Chemical.name).all())

[docs]    def build_chebi_name_id_mapping(self) -> Mapping[str, str]:
        """Build a mapping from ChEBI name to ChEBI identifier."""
        return dict(self.session.query(Chemical.name, Chemical.chebi_id).all())

    def _load_inchis(self, url: Optional[str] = None) -> None:
        """Load the InChI strings into the in-memory ``chebi_id_to_inchi`` cache.

        Nothing is written to the database here; ``_populate_compounds`` reads the cache.

        :param url: The URL (or file path) to download. Defaults to the ChEBI data.
        """
        inchis_df = get_inchis_df(url=url)
        rows = tqdm(inchis_df.iterrows(), desc='InChIs', total=len(inchis_df.index))

        for _, (chebi_id, inchi) in rows:
            key = str(chebi_id)
            self.chebi_id_to_inchi[key] = inchi

    def _populate_compounds(self, url: Optional[str] = None) -> None:
        """Download and populate the compounds.

        Every row becomes a ``Chemical`` that is added to the session and cached twice: in
        ``id_chemical`` under its primary key and in ``chebi_id_to_chemical`` under the part of
        its accession after the colon. All rows are committed together at the end.

        :param url: The URL (or file path) to download. Defaults to the ChEBI data.
        """
        df = get_compounds_df(url=url)
        df = df.where((pd.notnull(df)), None)

        log.info('preparing Compounds')

        parents = []
        it = tqdm(df.iterrows(), desc='Compounds', total=len(df.index))
        for _, (pk, status, chebi_id, source, parent_pk, name, definition, _, _, _) in it:
            # keep only the part of the accession after the colon
            chebi_id = chebi_id.split(':')[1]

            chemical = self.id_chemical[pk] = self.chebi_id_to_chemical[chebi_id] = Chemical(
                id=pk,  # ChEBI already sends out their data in relational format
                status=status,
                chebi_id=chebi_id,
                parent_id=parent_pk or None,
                name=name,
                source=source,
                definition=definition,
                inchi=self.chebi_id_to_inchi.get(chebi_id),
            )
            self.session.add(chemical)

            if parent_pk:
                parents.append((pk, parent_pk))

        # Both members of each pair are primary keys, so they have to be resolved through the
        # primary-key cache. The ChEBI-identifier cache is keyed by strings and never matched.
        for child_pk, parent_pk in tqdm(parents, desc='Secondaries'):
            child = self.id_chemical.get(child_pk)
            parent = self.id_chemical.get(parent_pk)

            if child is not None and parent is not None:
                child.parent = parent

        log.info('committing Compounds')
        self.session.commit()

    def _populate_names(self, url: Optional[str] = None) -> None:
        """Download the names and add them to the database as synonyms.

        Rows are grouped by ``COMPOUND_ID``; each group is attached to the chemical returned by
        ``get_or_create_chemical``. Everything is committed together at the end.

        :param url: The URL (or file path) to download. Defaults to the ChEBI data.
        """
        names_df = get_names_df(url=url)

        log.info('preparing Synonyms')
        groups = names_df.groupby('COMPOUND_ID')
        for compound_id, rows in tqdm(groups, desc='Synonyms', total=len(groups)):
            chemical = self.get_or_create_chemical(chebi_id=str(int(compound_id)))

            for _, (pk, _, type_, source, name, _adapted, language) in rows.iterrows():
                # only keep names that are non-empty and not floats (presumably NaN for missing values)
                if not isinstance(name, float) and name:
                    self.session.add(Synonym(
                        id=pk,
                        chemical=chemical,
                        type=type_,
                        source=source,
                        name=name,
                        language=language,
                    ))

        log.info('committing Synonyms')
        self.session.commit()

    def _populate_accession(self, url: Optional[str] = None) -> None:
        """Download and insert the database cross-references and accession numbers.

        Rows are grouped by ``COMPOUND_ID``; each group is attached to the chemical returned by
        ``get_or_create_chemical``. Everything is committed together at the end.

        :param url: The URL (or file path) to download. Defaults to the ChEBI data.
        """
        accession_df = get_accession_df(url=url)
        accession_df = accession_df.where((pd.notnull(accession_df)), None)

        log.info('preparing Accessions')

        groups = accession_df.groupby('COMPOUND_ID')
        for compound_id, rows in tqdm(groups, desc='Xrefs', total=len(groups)):
            chemical = self.get_or_create_chemical(chebi_id=str(int(compound_id)))

            for _, (pk, _, source, type_, accession) in rows.iterrows():
                self.session.add(Accession(
                    id=pk,
                    chemical=chemical,
                    source=source,
                    type=type_,
                    accession=accession,
                ))

        log.info('committing Accessions')
        self.session.commit()

    def _populate_relations(self, url: Optional[str] = None) -> None:
        """Download and insert the relations between chemicals.

        Source and target are resolved through the in-memory caches only, so the compounds
        must have been populated by this manager instance beforehand. Rows whose source or
        target cannot be resolved are skipped. Everything is committed together at the end.

        :param url: The URL (or file path) to download, passed on to ``get_relations_df``.
        """
        df = get_relations_df(url=url)

        log.info('preparing Relations')
        it = tqdm(df.iterrows(), desc='Relations', total=len(df.index))
        for _, (pk, relation_type, source_id, target_id, status) in it:
            source = self._get_cached_chemical(source_id)
            if source is None:
                continue

            target = self._get_cached_chemical(target_id)
            if target is None:
                continue

            relation = Relation(
                id=pk,
                type=relation_type,
                source=source,
                target=target,
                status=status,
            )
            self.session.add(relation)

        log.info('committing Relations')
        self.session.commit()

    def _get_cached_chemical(self, compound_id) -> Optional[Chemical]:
        """Resolve a compound identifier from a relation row through the in-memory caches.

        The previous lookup used ``f'CHEBI:{compound_id}'`` as the key, a format that neither
        cache is filled with, so every relation was skipped.
        """
        # NOTE(review): assumes relation rows reference compounds by the value used as the
        # compound primary key - confirm against get_relations_df.
        chemical = self.id_chemical.get(compound_id)
        if chemical is None:
            # fall back to the cache keyed by the ChEBI identifier string
            chemical = self.chebi_id_to_chemical.get(str(compound_id))
        return chemical

[docs]    def populate(
            self,
            inchis_url: Optional[str] = None,
            compounds_url: Optional[str] = None,
            relations_url: Optional[str] = None,
            names_url: Optional[str] = None,
            accessions_url: Optional[str] = None,
    ) -> None:
        """Populate all tables."""
        t = time.time()

        self._load_inchis(url=inchis_url)
        self._populate_compounds(url=compounds_url)
        # self._populate_relations(url=relations_url)
        # self._populate_names(url=names_url)
        # self._populate_accession(url=accessions_url)

        log.info('populated in %.2f seconds', time.time() - t)

    def normalize_chemicals(self, graph: BELGraph, use_tqdm: bool = False) -> None:
        """Relabel, in place, every node that resolves to a stored ChEBI chemical.

        :param graph: The BEL graph whose nodes are relabeled
        :param use_tqdm: Passed on to ``iter_chemicals`` to show a progress bar
        """
        # NOTE(review): this calls ``to_bel()`` while enrich_chemical_hierarchy calls
        # ``as_bel()`` on the same model - confirm which one the model defines.
        mapping = {}
        for node, chemical in self.iter_chemicals(graph, use_tqdm=use_tqdm):
            mapping[node] = chemical.to_bel()

        relabel_nodes(graph, mapping, copy=False)

[docs]    def iter_chemicals(self, graph: BELGraph, use_tqdm: bool = False) -> Iterable[Tuple[BaseEntity, Chemical]]:
        """Iterate over pairs of BEL nodes and ChEBI chemicals."""
        it = (
            tqdm(graph, desc='ChEBI chemicals')
            if use_tqdm else
            graph
        )
        for node in it:
            chemical = self.get_chemical_from_data(node)
            if chemical is not None:
                yield node, chemical

    def get_chemical_from_data(self, node: BaseEntity) -> Optional[Chemical]:
        """Look up the ChEBI chemical that a BEL node refers to.

        :param node: A BEL node, read through its ``NAMESPACE``, ``IDENTIFIER`` and ``NAME`` entries
        :return: The matching chemical, or ``None`` if the node's namespace is neither ``chebi``
         nor ``chebiid`` (case-insensitive) or nothing matching is stored
        :raises ValueError: If a ChEBI node has neither an identifier nor a name
        """
        namespace = node.get(NAMESPACE)
        if not namespace:
            return None

        namespace = namespace.lower()
        if namespace not in {'chebi', 'chebiid'}:
            return None

        identifier = node.get(IDENTIFIER)
        name = node.get(NAME)

        if identifier is None and name is None:
            raise ValueError(f'ChEBI node has neither an identifier nor a name: {node!r}')

        if namespace == 'chebiid':
            # In this namespace the name is handed to the identifier lookup. Fall back to the
            # identifier entry when no name is given, rather than querying with None.
            chemical = self.get_chemical_by_chebi_id(name if name is not None else identifier)
        elif identifier is not None:
            chemical = self.get_chemical_by_chebi_id(identifier)
        else:
            chemical = self.get_chemical_by_chebi_name(name)

        # This warning used to sit after branches that always returned, so it never ran
        if chemical is None:
            log.warning('Could not find ChEBI node: %r', node)

        return chemical

[docs]    def enrich_chemical_hierarchy(self, graph: BELGraph) -> None:
        """Enrich the parents for all ChEBI chemicals in the graph."""
        for _, data in graph.nodes(data=True):
            chemical = self.get_chemical_from_data(data)

            if chemical is None:
                continue

            parent = chemical.parent
            while parent is not None:
                graph.add_is_a(chemical.as_bel(), parent.as_bel())
                chemical, parent = parent, parent.parent

    def _list_equivalencies(self) -> Iterable[Chemical]:
        """Return the chemicals that have a parent (``parent_id`` is set).

        The query object is returned as-is, without calling ``.all()`` on it.
        """
        return self.session.query(Chemical).filter(Chemical.parent_id.isnot(None))

    def _iterate_relations(self):
        """Return all relations wrapped in a progress bar labeled 'Relation'."""
        relations = self.list_relations()
        relation_count = self.count_relations()
        return tqdm(relations, total=relation_count, desc='Relation')

[docs]    def to_bel(self) -> BELGraph:
        """Export BEL."""
        graph = BELGraph(
            name=_chebi_bel_name,
            version=_chebi_bel_version,
            description=_chebi_description,
        )

        self.add_namespace_to_graph(graph)

        for relation in self._iterate_relations():
            relation.add_to_graph(graph)

        return graph

    def _create_namespace_entry_from_model(self, chemical: Chemical, namespace: Namespace) -> NamespaceEntry:
        """Create a namespace entry from a chemical model.

        :param chemical: The chemical supplying the encoding, name and identifier
        :param namespace: The namespace the entry belongs to
        :return: The new entry, or ``None`` when the chemical has no (or an empty) name
        """
        if not chemical.name:
            return None

        return NamespaceEntry(
            encoding=chemical.bel_encoding,
            name=chemical.name,
            identifier=chemical.chebi_id,
            namespace=namespace,
        )

    @staticmethod
    def _get_identifier(chemical: Chemical) -> str:
        """Get the identifier from the chemical model.

        :param chemical: The chemical model
        :return: The model's ``chebi_id`` attribute
        """
        return chemical.chebi_id

    @staticmethod
    def _get_name(chemical: Chemical) -> str:
        """Get the name of the chemical.

        :param chemical: The chemical model
        :return: The model's ``safe_name`` attribute
        """
        return chemical.safe_name