File size: 7,681 Bytes
7c15d15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import ast
import os
import re

from anthropic import Anthropic
from openai import AzureOpenAI
from tqdm import tqdm


def process_texts_with_api(sentences):
    """
    Extract GO terms from sentences using the Anthropic API.

    Each sentence is sent as a separate request. GO IDs are pulled out of the
    reply with a regex, so any extra text around the IDs is ignored.

    Args:
        sentences (list of str): List of sentences to process.

    Environment Variables:
        ANTHROPIC_API_KEY: Anthropic API key.

    Returns:
        list of list: One list of unique GO IDs (e.g. 'GO:0005524') per input
        sentence, in order of first appearance in the reply. A sentence whose
        request or parsing raises yields an empty list (a warning is printed).

    Raises:
        ValueError: If ANTHROPIC_API_KEY is unset or empty.
    """
    subscription_key = os.environ.get('ANTHROPIC_API_KEY', '')
    if not subscription_key:
        raise ValueError("ANTHROPIC_API_KEY environment variable is not set")

    client = Anthropic(api_key=subscription_key)

    # Loop-invariant: built once. Adjacent literals are concatenated verbatim,
    # so every fragment must carry its own trailing space or newline.
    system_prompt = (
        "You are a cautious bioinformatics curator for Gene Ontology (GO). "
        "Goal: map protein descriptions to GO terms ONLY when the description provides clear support. "
        "Prefer high precision over recall.\n\n"
        "Rules:\n"
        "1) Only output GO IDs you are highly confident (>=0.8) are correct matches.\n"
        "2) If the description is too vague or ambiguous, return an empty string: ''.\n"
        "3) Output MUST be a semicolon-separated list of GO IDs only: "
        "'GO:XXXXXXX; GO:XXXXXXX' or ''. No other text.\n"
        "4) Before finalizing, verify that each GO term is directly supported by words/phrases "
        "in the description; remove any that are not."
    )
    # Exactly seven digits: the lookahead rejects longer digit runs instead of
    # silently truncating them to a different, valid-looking ID.
    go_id_pattern = re.compile(r'GO:\d{7}(?!\d)')

    terms_list = []
    with tqdm(total=len(sentences), desc="Processing", unit="sentence") as pbar:
        for sentence in sentences:
            try:
                message = client.messages.create(
                    max_tokens=1024,
                    system=system_prompt,
                    messages=[
                        {"role": "user", "content": (f"description:\n\"{sentence}\"\n\n"
                                                     "return only the GO IDs that are directly supported by this description")}
                    ],
                    model="claude-opus-4-6",
                )

                extracted_terms = message.content[0].text
                # dict.fromkeys de-duplicates while keeping first-seen order,
                # so repeated runs on the same reply give the same list.
                terms = list(dict.fromkeys(go_id_pattern.findall(extracted_terms)))

            except Exception as e:
                # Best-effort per sentence: one failed request must not abort
                # the batch, and the output stays aligned with the input.
                print(f"[Warning]: {e}")
                terms = []

            terms_list.append(terms)
            print('go_terms', terms)
            pbar.update(1)

    return terms_list


# Dash look-alikes (en dash, em dash) that may appear in place of ASCII '-'.
_EC_DASH_TABLE = str.maketrans({'–': '-', '—': '-'})

# Four dot-separated levels, each an integer or a single dash. Lookarounds are
# used instead of \b because \b can never match after a trailing '-', which
# would drop partial numbers such as '3.4.21.-'.
_EC_FALLBACK_RE = re.compile(r'(?<!\w)(?:\d+|[-–—])(?:\.(?:\d+|[-–—])){3}(?!\w)')


def _parse_ec_numbers(reply_text):
    """Return the EC numbers found in a model reply, de-duplicated in order."""
    try:
        parsed = ast.literal_eval(reply_text)
    except (ValueError, SyntaxError):
        # Not a clean Python literal (extra prose, code fences, ...): scan the
        # raw text for EC-shaped tokens instead.
        candidates = _EC_FALLBACK_RE.findall(reply_text)
    else:
        if isinstance(parsed, list):
            candidates = [ec for ec in parsed if isinstance(ec, str)]
        else:
            candidates = []

    normalized = (ec.translate(_EC_DASH_TABLE) for ec in candidates)
    return list(dict.fromkeys(normalized))


def process_texts_for_ec_with_api(sentences, model_type):
    """
    Extract EC numbers from sentences using the Azure OpenAI API.

    Each sentence is sent as a separate chat-completion request. The reply is
    parsed as a Python list literal; if that fails, EC-shaped tokens are
    extracted from the raw reply text with a regex.

    Args:
        sentences (list of str): List of sentences to process.
        model_type (str): Unused; kept for backwards compatibility.

    Environment Variables:
        AZURE_OPENAI_ENDPOINT: Azure OpenAI endpoint URL.
        AZURE_OPENAI_KEY: Azure OpenAI API key.
        AZURE_OPENAI_DEPLOYMENT: Azure OpenAI deployment name (default: 'o4-mini').
        AZURE_OPENAI_API_VERSION: API version (default: '2024-12-01-preview').

    Returns:
        list of list: List of EC number lists for each sentence
        (e.g., [['2.7.11.1'], ['3.4.21.-']]). Entries are unique, in
        first-seen order, with en/em dashes normalized to '-'. A sentence
        whose request or parsing raises yields an empty list (a warning is
        printed).

    Raises:
        ValueError: If AZURE_OPENAI_KEY or AZURE_OPENAI_ENDPOINT is unset or
            empty.
    """
    endpoint = os.environ.get('AZURE_OPENAI_ENDPOINT', '')
    subscription_key = os.environ.get('AZURE_OPENAI_KEY', '')
    deployment = os.environ.get('AZURE_OPENAI_DEPLOYMENT', 'o4-mini')
    api_version = os.environ.get('AZURE_OPENAI_API_VERSION', '2024-12-01-preview')

    if not subscription_key:
        raise ValueError("AZURE_OPENAI_KEY environment variable is not set")
    if not endpoint:
        raise ValueError("AZURE_OPENAI_ENDPOINT environment variable is not set")

    client = AzureOpenAI(
        api_version=api_version,
        azure_endpoint=endpoint,
        api_key=subscription_key,
    )

    system_message = {
        "role": "system",
        "content": (
            "You are a bioinformatics expert specializing in enzyme function annotation "
            "and EC (Enzyme Commission) number assignment based on protein function descriptions."
        ),
    }

    ec_list = []
    with tqdm(total=len(sentences), desc="Processing EC extraction", unit="sentence") as pbar:
        for sentence in sentences:
            try:
                # NOTE(review): the default deployment is 'o4-mini'; reasoning
                # deployments may reject max_tokens/temperature/top_p. If so,
                # every request fails and shows up only as warnings plus empty
                # results. Confirm against the Azure reasoning-model docs.
                response = client.chat.completions.create(
                    messages=[
                        system_message,
                        {
                            "role": "user",
                            "content": (
                                f"Protein description:\n\"{sentence}\"\n\n"
                                "Task:\n"
                                "Determine whether the described protein has enzymatic activity, "
                                "and assign the most appropriate EC number(s) if applicable.\n\n"
                                "Instructions:\n"
                                "1. First, decide whether the protein is an enzyme.\n"
                                "   - If the description does NOT indicate catalytic activity, return exactly:\n"
                                "     ['-.-.-.-']\n"
                                "   - In this case, do NOT output any other EC numbers.\n"
                                "2. If the protein IS an enzyme, infer and assign EC number(s) based on the described catalytic activity, "
                                "even if no explicit 'EC x.x.x.x' pattern appears in the text.\n"
                                "3. When the protein is an enzyme, do NOT output ['-.-.-.-'].\n"
                                "4. Output EC numbers in canonical format a.b.c.d, where each level is either an integer or '-'.\n"
                                "5. If only partial EC information can be inferred, use '-' for unknown levels "
                                "(e.g., '2.7.-.-', '3.4.21.-').\n"
                                "6. Do NOT hallucinate EC numbers beyond what can be reasonably inferred from the description.\n"
                                "7. Output must be a Python-style list of strings, for example:\n"
                                "   ['3.5.1.13', '3.5.1.14', '3.5.1.-']\n"
                                "8. Do not include any explanation or additional text outside the list.\n"
                            ),
                        }
                    ],
                    max_tokens=1024,
                    temperature=0.3,
                    top_p=1.0,
                    model=deployment,
                )

                extracted_content = response.choices[0].message.content.strip()
                ec_numbers = _parse_ec_numbers(extracted_content)

            except Exception as e:
                # Best-effort per sentence: one failed request must not abort
                # the batch, and the output stays aligned with the input.
                print(f"[Warning]: {e}")
                ec_numbers = []

            ec_list.append(ec_numbers)
            print('ec_numbers', ec_numbers)
            pbar.update(1)

    return ec_list