| | """Utilities for interacting with InterPro.""" |
| |
|
| | import itertools |
| | import re |
| | from dataclasses import dataclass |
| | from enum import IntEnum, auto |
| | from functools import cached_property |
| |
|
| | import networkx as nx |
| | import pandas as pd |
| | from cloudpathlib import AnyPath |
| |
|
| | from src.data.esm.utils.constants import esm3 as C |
| | from src.data.esm.utils.types import PathLike |
| |
|
| |
|
def parse_go_terms(text: str) -> list[str]:
    """Extracts every GO term accession mentioned in a string.

    A match is the literal prefix "GO:" immediately followed by a run of at
    least seven digits. Well-formed GO accessions have exactly 7 digits, but
    a longer digit run is still returned as a single match.

    Args:
        text: Free text that may mention GO terms.
            Example: "GO:0008309, GO:1902267"
    Returns:
        The matched accessions, in order of appearance; empty if none match.
        Example: ['GO:0008309', 'GO:1902267']
    """
    go_term_pattern = re.compile(r"GO:(?:\d{7,})")
    return [match.group(0) for match in go_term_pattern.finditer(text)]
| |
|
| |
|
def _parse_interpro2go(path: PathLike) -> dict[str, list[str]]:
    """Builds an InterPro -> GO term lookup from an InterPro2GO file.

    NOTE: the file is not in a standard tabular format, so each line is
    scanned with regular expressions instead of being parsed as columns.

    Args:
        path: path to InterPro2GO file from: https://www.ebi.ac.uk/GOA/InterPro2GO
    Returns:
        Mapping from InterPro id to the GO terms found on its lines. Terms
        from several lines with the same InterPro id are concatenated in file
        order; duplicates are not removed.
    """
    with AnyPath(path).open("r") as handle:
        contents = handle.read()

    table = pd.DataFrame({"line": contents.split("\n")})

    # Lines starting with "!" are dropped before any matching is attempted.
    is_comment = table["line"].str.startswith("!")
    table = table[~is_comment]

    table["interpro_id"] = table["line"].apply(
        lambda line: re.findall(r"IPR\d+", line)
    )
    table["go_ids"] = table["line"].apply(parse_go_terms)

    # Keep only lines that name exactly one InterPro id and at least one GO term.
    has_go_terms = table["go_ids"].apply(len).gt(0)
    has_single_accession = table["interpro_id"].apply(len).eq(1)
    table = table[has_go_terms & has_single_accession]
    table["interpro_id"] = table["interpro_id"].apply(lambda matches: matches[0])

    # Flatten the per-line GO term lists of each InterPro id into one list.
    go_terms_by_interpro = table.groupby("interpro_id")["go_ids"].apply(
        lambda per_line_terms: list(itertools.chain.from_iterable(per_line_terms))
    )
    return go_terms_by_interpro.to_dict()
| |
|
| |
|
class InterProEntryType(IntEnum):
    """Integer codes for the kinds of InterPro entry.

    Member names are the upper-cased InterPro type names. UNKNOWN is not an
    InterPro type; it is the value returned when an accession cannot be
    looked up.

    InterPro types and representation counts:

    Family                    21,942
    Domain                    14,053
    Homologous_superfamily     3,446
    Conserved_site               728
    Repeat                       374
    Active_site                  133
    Binding_site                  75
    PTM                           17
    """

    # Values are consecutive integers in alphabetical member order,
    # with UNKNOWN last.
    ACTIVE_SITE = 0
    BINDING_SITE = 1
    CONSERVED_SITE = 2
    DOMAIN = 3
    FAMILY = 4
    HOMOLOGOUS_SUPERFAMILY = 5
    PTM = 6
    REPEAT = 7
    UNKNOWN = 8
| |
|
| |
|
@dataclass
class InterProEntry:
    """Represents an InterPro entry.

    Attributes:
        id: InterPro accession of the entry.
        type: Kind of annotation the entry represents.
        name: Short name of the entry.
        description: Optional longer description; defaults to None.
    """

    id: str
    type: InterProEntryType
    name: str
    description: str | None = None
| |
|
| |
|
class InterPro:
    """Convenience class interacting with InterPro ontology/data.

    Nothing is read at construction time. ``interpro2go``, ``entries_frame``,
    ``entries`` and ``graph`` each load their file on first access and cache
    the result on the instance.
    """

    def __init__(
        self,
        entries_path: PathLike | None = None,
        hierarchy_path: PathLike | None = None,
        interpro2go_path: PathLike | None = None,
    ):
        """Constructs interface to query InterPro entries.

        Args:
            entries_path: Tab-separated entry list with ``ENTRY_AC``,
                ``ENTRY_TYPE`` and ``ENTRY_NAME`` columns. Defaults to
                ``C.INTERPRO_ENTRY``.
            hierarchy_path: InterPro hierarchy file, one entry per line, with
                nesting depth encoded as leading ``--`` pairs. Defaults to
                ``C.INTERPRO_HIERARCHY``.
            interpro2go_path: InterPro2GO mapping file. Defaults to
                ``C.INTERPRO2GO``.
        """

        def default(x, d):
            # Only None triggers the fallback; other falsy values are kept.
            return x if x is not None else d

        self.entries_path = default(entries_path, C.INTERPRO_ENTRY)
        self.hierarchy_graph_path = default(hierarchy_path, C.INTERPRO_HIERARCHY)
        self.interpro2go_path = default(interpro2go_path, C.INTERPRO2GO)

    @cached_property
    def interpro2go(self) -> dict[str, list[str]]:
        """Reads the InterPro to GO term mapping."""
        assert self.interpro2go_path is not None
        return _parse_interpro2go(self.interpro2go_path)

    @cached_property
    def entries_frame(self) -> pd.DataFrame:
        """Loads full InterPro entry set as a DataFrame.

        Columns are
        - "id": str InterPro accession / id.
        - "type": InterProEntryType representing the type of annotation.
        - "name": Short name of the entry.

        Raises:
            AssertionError: if a required column is missing from the file.
            KeyError: if an entry type has no InterProEntryType member.
        """
        with AnyPath(self.entries_path).open("r") as f:
            df = pd.read_csv(f, sep="\t")

        required_columns = ["ENTRY_AC", "ENTRY_TYPE", "ENTRY_NAME"]
        assert all(
            col in df.columns for col in required_columns
        ), f"InterPro entries file must contain columns {required_columns}"

        df = df.rename(
            columns={"ENTRY_AC": "id", "ENTRY_TYPE": "type", "ENTRY_NAME": "name"},
        )
        # Enum members are the upper-cased type names, e.g.
        # "Active_site" -> InterProEntryType.ACTIVE_SITE.
        df["type"] = (
            df["type"]
            .str.upper()
            .apply(lambda type_name: InterProEntryType[type_name])
        )
        return df

    @cached_property
    def entries(self) -> dict[str, InterProEntry]:
        """Returns all InterPro entries, keyed by InterPro id.

        ``description`` is left at its default (None); only id, type and name
        are populated from ``entries_frame``.
        """
        return {
            row.id: InterProEntry(
                id=row.id,
                type=row.type,
                name=row.name,
            )
            for row in self.entries_frame.itertuples(index=False)
        }

    def lookup_name(self, interpro_id: str) -> str | None:
        """Short name / title for an interpro id, or None if the id is unknown."""
        entry = self.entries.get(interpro_id)
        return None if entry is None else entry.name

    def lookup_entry_type(self, interpro_id: str) -> InterProEntryType:
        """Looks up entry-type for an interpro id.

        Returns InterProEntryType.UNKNOWN when the id is not a known entry.
        """
        entry = self.entries.get(interpro_id)
        return InterProEntryType.UNKNOWN if entry is None else entry.type

    @cached_property
    def graph(self) -> nx.DiGraph:
        """Reads the InterPro hierarchy into a directed graph.

        Each line of the hierarchy file starts with an InterPro id, optionally
        prefixed by dashes; every two leading dashes mean one level of
        nesting. The id ends at the first "::". Edges point from child to
        parent. Blank lines are ignored.
        """
        graph = nx.DiGraph()
        with AnyPath(self.hierarchy_graph_path).open("r") as f:
            # Stack of ancestor ids for the line being processed;
            # parents[i] is the most recent entry seen at depth i.
            parents: list[str] = []
            for line in f:
                # Strip the newline so a line without "::" cannot leak "\n"
                # into the node id, and skip blank lines so they neither add
                # a bogus node nor reset the ancestor stack.
                line = line.strip()
                if not line:
                    continue
                ipr = line.split("::", maxsplit=1)[0]
                ipr_strip = ipr.lstrip("-")
                level = (len(ipr) - len(ipr_strip)) // 2
                # Drop ancestors at this depth or deeper; what remains is the
                # chain of ancestors of the current entry.
                parents = parents[:level]
                graph.add_node(ipr_strip)
                if parents:
                    graph.add_edge(ipr_strip, parents[-1])
                parents.append(ipr_strip)
        return graph
| |
|