Spaces:
Sleeping
Sleeping
File size: 7,681 Bytes
import os
import re
from anthropic import Anthropic
from openai import AzureOpenAI
from tqdm import tqdm
def process_texts_with_api(sentences):
    """
    Extract GO terms from sentences using the Anthropic API.

    Each sentence is sent as a separate request. GO IDs are pulled out of the
    reply text with a regex, so any extra wording in the reply is ignored.

    Args:
        sentences (list of str): List of sentences to process.

    Environment Variables:
        ANTHROPIC_API_KEY: Anthropic API key.

    Returns:
        list of list: List of GO term lists for each sentence, in input order.
            Each inner list holds unique IDs in order of first appearance.
            A sentence whose request fails, or whose reply contains no GO ID,
            yields an empty list.

    Raises:
        ValueError: If ANTHROPIC_API_KEY is unset or empty.
    """
    subscription_key = os.environ.get('ANTHROPIC_API_KEY', '')
    if not subscription_key:
        raise ValueError("ANTHROPIC_API_KEY environment variable is not set")
    client = Anthropic(api_key=subscription_key)

    # Loop-invariant pieces are built once instead of per sentence.
    go_id_pattern = re.compile(r'GO:\d{7}')
    system_prompt = (
        "You are a cautious bioinformatics curator for Gene Ontology (GO). "
        # The trailing space below keeps this sentence from running into the next one.
        "Goal: map protein descriptions to GO terms ONLY when the description provides clear support. "
        "Prefer high precision over recall.\n\n"
        "Rules:\n"
        "1) Only output GO IDs you are highly confident (>=0.8) are correct matches.\n"
        "2) If the description is too vague or ambiguous, return an empty string: ''.\n"
        "3) Output MUST be a semicolon-separated list of GO IDs only: "
        "'GO:XXXXXXX; GO:XXXXXXX' or ''. No other text.\n"
        "4) Before finalizing, verify that each GO term is directly supported by words/phrases "
        "in the description; remove any that are not."
    )

    terms_list = []
    with tqdm(total=len(sentences), desc="Processing", unit="sentence") as pbar:
        for sentence in sentences:
            try:
                message = client.messages.create(
                    max_tokens=1024,
                    system=system_prompt,
                    messages=[
                        {
                            "role": "user",
                            "content": (
                                f"description:\n\"{sentence}\"\n\n"
                                "return only the GO IDs that are directly supported by this description"
                            ),
                        }
                    ],
                    model="claude-opus-4-6",
                )
                extracted_terms = message.content[0].text
                # dict.fromkeys de-duplicates while keeping first-seen order, so the
                # result is reproducible between runs (set ordering of str is not).
                terms = list(dict.fromkeys(go_id_pattern.findall(extracted_terms)))
            except Exception as e:
                # Best effort: one failed request must not abort the whole batch.
                print(f"[Warning]: {e}")
                terms = []
            terms_list.append(terms)
            print('go_terms', terms)
            pbar.update(1)
    return terms_list
# Maps en/em dashes, which models sometimes emit, to the ASCII hyphen used in EC numbers.
_EC_DASH_TABLE = str.maketrans({'–': '-', '—': '-'})

# Four dot-separated levels, each an integer or '-'. Lookarounds are used instead
# of \b because \b can never succeed after a trailing '-' (e.g. '3.4.21.-').
_EC_PATTERN = re.compile(r'(?<!\w)(?:\d+|-)(?:\.(?:\d+|-)){3}(?!\w)')


def _parse_ec_numbers(raw_text):
    """Turn a model reply into an ordered, de-duplicated list of EC number strings."""
    import ast  # function-scope import: the block does not rely on a file-level import

    normalized = raw_text.strip().translate(_EC_DASH_TABLE)
    try:
        parsed = ast.literal_eval(normalized)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Reply is not a clean Python literal (extra prose, code fences, ...).
        parsed = None

    if isinstance(parsed, list):
        candidates = [item for item in parsed if isinstance(item, str)]
    else:
        # Not a list (or unparsable): recover EC-shaped tokens from the raw text.
        candidates = _EC_PATTERN.findall(normalized)
    return list(dict.fromkeys(candidates))


def process_texts_for_ec_with_api(sentences, model_type):
    """
    Extract EC numbers from sentences using the Azure OpenAI API.

    Each sentence is sent as a separate chat completion. The reply is parsed as
    a Python list literal; if that fails, EC-shaped tokens are extracted from
    the reply text with a regex.

    Args:
        sentences (list of str): List of sentences to process.
        model_type (str): Unused; kept for backwards compatibility.

    Environment Variables:
        AZURE_OPENAI_ENDPOINT: Azure OpenAI endpoint URL.
        AZURE_OPENAI_KEY: Azure OpenAI API key.
        AZURE_OPENAI_DEPLOYMENT: Azure OpenAI deployment name (default: 'o4-mini').
        AZURE_OPENAI_API_VERSION: API version (default: '2024-12-01-preview').

    Returns:
        list of list: List of EC number lists for each sentence, in input order
            (e.g., [['2.7.11.1'], ['3.4.21.-']]). Each inner list is
            de-duplicated in order of first appearance. A sentence whose
            request fails yields an empty list.

    Raises:
        ValueError: If AZURE_OPENAI_KEY or AZURE_OPENAI_ENDPOINT is unset or empty.
    """
    endpoint = os.environ.get('AZURE_OPENAI_ENDPOINT', '')
    subscription_key = os.environ.get('AZURE_OPENAI_KEY', '')
    deployment = os.environ.get('AZURE_OPENAI_DEPLOYMENT', 'o4-mini')
    api_version = os.environ.get('AZURE_OPENAI_API_VERSION', '2024-12-01-preview')
    if not subscription_key:
        raise ValueError("AZURE_OPENAI_KEY environment variable is not set")
    if not endpoint:
        raise ValueError("AZURE_OPENAI_ENDPOINT environment variable is not set")
    client = AzureOpenAI(
        api_version=api_version,
        azure_endpoint=endpoint,
        api_key=subscription_key,
    )

    # Loop-invariant prompt text is built once instead of per sentence.
    system_prompt = (
        "You are a bioinformatics expert specializing in enzyme function annotation "
        "and EC (Enzyme Commission) number assignment based on protein function descriptions."
    )
    task_instructions = (
        "Task:\n"
        "Determine whether the described protein has enzymatic activity, "
        "and assign the most appropriate EC number(s) if applicable.\n\n"
        "Instructions:\n"
        "1. First, decide whether the protein is an enzyme.\n"
        " - If the description does NOT indicate catalytic activity, return exactly:\n"
        " ['-.-.-.-']\n"
        " - In this case, do NOT output any other EC numbers.\n"
        "2. If the protein IS an enzyme, infer and assign EC number(s) based on the described catalytic activity, "
        "even if no explicit 'EC x.x.x.x' pattern appears in the text.\n"
        "3. When the protein is an enzyme, do NOT output ['-.-.-.-'].\n"
        "4. Output EC numbers in canonical format a.b.c.d, where each level is either an integer or '-'.\n"
        "5. If only partial EC information can be inferred, use '-' for unknown levels "
        "(e.g., '2.7.-.-', '3.4.21.-').\n"
        "6. Do NOT hallucinate EC numbers beyond what can be reasonably inferred from the description.\n"
        "7. Output must be a Python-style list of strings, for example:\n"
        " ['3.5.1.13', '3.5.1.14', '3.5.1.-']\n"
        "8. Do not include any explanation or additional text outside the list.\n"
    )

    ec_list = []
    with tqdm(total=len(sentences), desc="Processing EC extraction", unit="sentence") as pbar:
        for sentence in sentences:
            try:
                # NOTE(review): the default deployment name is 'o4-mini'. Azure's
                # reasoning-model documentation lists max_tokens / temperature /
                # top_p as unsupported for o-series models -- confirm the deployed
                # model accepts them, otherwise every request ends in the except
                # branch below and yields [].
                response = client.chat.completions.create(
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {
                            "role": "user",
                            "content": (
                                f"Protein description:\n\"{sentence}\"\n\n"
                                + task_instructions
                            ),
                        },
                    ],
                    max_tokens=1024,
                    temperature=0.3,
                    top_p=1.0,
                    model=deployment,
                )
                extracted_content = response.choices[0].message.content.strip()
                ec_numbers = _parse_ec_numbers(extracted_content)
            except Exception as e:
                # Best effort: one failed request must not abort the whole batch.
                print(f"[Warning]: {e}")
                ec_numbers = []
            ec_list.append(ec_numbers)
            print('ec_numbers', ec_numbers)
            pbar.update(1)
    return ec_list
|