Spaces:
Running
Running
| import os | |
| import re | |
| from anthropic import Anthropic | |
| from openai import AzureOpenAI | |
| from tqdm import tqdm | |
def process_texts_with_api(sentences):
    """
    Extract GO terms from sentences using the Anthropic API.

    One request is sent per sentence (model "claude-opus-4-6", at most 1024
    output tokens). Every substring of the reply that looks like a GO ID
    ("GO:" followed by seven digits) is collected; duplicates are dropped
    while keeping the order in which the IDs first appear.

    Args:
        sentences (list of str): List of sentences to process.

    Environment Variables:
        ANTHROPIC_API_KEY: Anthropic API key.

    Returns:
        list of list: List of GO term lists for each sentence. A sentence
        whose request or parsing fails contributes an empty list.

    Raises:
        ValueError: If ANTHROPIC_API_KEY is unset or empty.
    """
    subscription_key = os.environ.get('ANTHROPIC_API_KEY', '')
    if not subscription_key:
        raise ValueError("ANTHROPIC_API_KEY environment variable is not set")

    client = Anthropic(api_key=subscription_key)

    # Loop invariants: compile the GO ID pattern and build the prompt once.
    go_id_pattern = re.compile(r'GO:\d{7}')
    system_prompt = (
        "You are a cautious bioinformatics curator for Gene Ontology (GO). "
        # The trailing space matters: adjacent literals are concatenated
        # verbatim, and without it the prompt reads "support.Prefer".
        "Goal: map protein descriptions to GO terms ONLY when the description provides clear support. "
        "Prefer high precision over recall.\n\n"
        "Rules:\n"
        "1) Only output GO IDs you are highly confident (>=0.8) are correct matches.\n"
        "2) If the description is too vague or ambiguous, return an empty string: ''.\n"
        "3) Output MUST be a semicolon-separated list of GO IDs only: "
        "'GO:XXXXXXX; GO:XXXXXXX' or ''. No other text.\n"
        "4) Before finalizing, verify that each GO term is directly supported by words/phrases "
        "in the description; remove any that are not."
    )

    terms_list = []
    with tqdm(total=len(sentences), desc="Processing", unit="sentence") as pbar:
        for sentence in sentences:
            try:
                message = client.messages.create(
                    max_tokens=1024,
                    system=system_prompt,
                    messages=[
                        {
                            "role": "user",
                            "content": (
                                f"description:\n\"{sentence}\"\n\n"
                                "return only the GO IDs that are directly supported by this description"
                            ),
                        }
                    ],
                    model="claude-opus-4-6",
                )
                extracted_terms = message.content[0].text
                # dict.fromkeys de-duplicates while preserving first-seen
                # order, so results are reproducible across runs (a set
                # would return the IDs in arbitrary order).
                terms = list(dict.fromkeys(go_id_pattern.findall(extracted_terms)))
            except Exception as e:
                # Deliberate best-effort: a failure on one sentence must not
                # abort the batch, so report it and record no terms.
                print(f"[Warning]: {e}")
                terms = []
            terms_list.append(terms)
            print('go_terms', terms)
            pbar.update(1)
    return terms_list
# Fallback matcher for EC numbers embedded in free text: a numeric first level
# followed by three levels that are each an integer or '-'. The trailing guard
# is a lookahead rather than '\b' because '\b' can never match after a final
# '-', which would drop partial EC numbers such as '3.4.21.-'.
_EC_NUMBER_RE = re.compile(r'\b\d+(?:\.(?:\d+|-)){3}(?!\w)')


def _parse_ec_numbers(raw_text):
    """Return the EC numbers found in a raw model reply, de-duplicated in first-seen order."""
    import ast  # function-scope import, as in the original code path

    # Replies may contain typographic dashes (en dash U+2013, em dash U+2014);
    # normalise them first so both parsing strategies see canonical '-'.
    text = raw_text.replace('\u2013', '-').replace('\u2014', '-')

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        parsed = None  # not a clean Python literal; scan the text instead

    if isinstance(parsed, (list, tuple)):
        # Expected reply shape: a list of strings. Non-string items are ignored.
        candidates = [item for item in parsed if isinstance(item, str)]
    else:
        # Extra prose, code fences, a bare string, etc.: pull EC-shaped
        # tokens out of the text rather than discarding the whole reply.
        candidates = _EC_NUMBER_RE.findall(text)

    return list(dict.fromkeys(candidates))


def process_texts_for_ec_with_api(sentences, model_type):
    """
    Extract EC numbers from sentences using the Azure OpenAI API.

    One chat-completion request is sent per sentence. The reply is expected
    to be a Python-style list of strings; if it is not, EC-shaped tokens are
    extracted from the reply text instead. Duplicates are removed while
    keeping first-seen order.

    Args:
        sentences (list of str): List of sentences to process.
        model_type (str): Unused; kept for backwards compatibility.

    Environment Variables:
        AZURE_OPENAI_ENDPOINT: Azure OpenAI endpoint URL.
        AZURE_OPENAI_KEY: Azure OpenAI API key.
        AZURE_OPENAI_DEPLOYMENT: Azure OpenAI deployment name (default: 'o4-mini').
        AZURE_OPENAI_API_VERSION: API version (default: '2024-12-01-preview').

    Returns:
        list of list: List of EC number lists for each sentence
        (e.g., [['2.7.11.1'], ['3.4.21.-']]). A sentence whose request or
        parsing fails contributes an empty list. When the reply is the
        non-enzyme sentinel list, ['-.-.-.-'] is returned as-is.

    Raises:
        ValueError: If AZURE_OPENAI_KEY or AZURE_OPENAI_ENDPOINT is unset or empty.
    """
    endpoint = os.environ.get('AZURE_OPENAI_ENDPOINT', '')
    subscription_key = os.environ.get('AZURE_OPENAI_KEY', '')
    deployment = os.environ.get('AZURE_OPENAI_DEPLOYMENT', 'o4-mini')
    api_version = os.environ.get('AZURE_OPENAI_API_VERSION', '2024-12-01-preview')

    if not subscription_key:
        raise ValueError("AZURE_OPENAI_KEY environment variable is not set")
    if not endpoint:
        raise ValueError("AZURE_OPENAI_ENDPOINT environment variable is not set")

    client = AzureOpenAI(
        api_version=api_version,
        azure_endpoint=endpoint,
        api_key=subscription_key,
    )

    # Prompt parts that do not depend on the sentence are built once.
    system_prompt = (
        "You are a bioinformatics expert specializing in enzyme function annotation "
        "and EC (Enzyme Commission) number assignment based on protein function descriptions."
    )
    task_instructions = (
        "Task:\n"
        "Determine whether the described protein has enzymatic activity, "
        "and assign the most appropriate EC number(s) if applicable.\n\n"
        "Instructions:\n"
        "1. First, decide whether the protein is an enzyme.\n"
        " - If the description does NOT indicate catalytic activity, return exactly:\n"
        " ['-.-.-.-']\n"
        " - In this case, do NOT output any other EC numbers.\n"
        "2. If the protein IS an enzyme, infer and assign EC number(s) based on the described catalytic activity, "
        "even if no explicit 'EC x.x.x.x' pattern appears in the text.\n"
        "3. When the protein is an enzyme, do NOT output ['-.-.-.-'].\n"
        "4. Output EC numbers in canonical format a.b.c.d, where each level is either an integer or '-'.\n"
        "5. If only partial EC information can be inferred, use '-' for unknown levels "
        "(e.g., '2.7.-.-', '3.4.21.-').\n"
        "6. Do NOT hallucinate EC numbers beyond what can be reasonably inferred from the description.\n"
        "7. Output must be a Python-style list of strings, for example:\n"
        " ['3.5.1.13', '3.5.1.14', '3.5.1.-']\n"
        "8. Do not include any explanation or additional text outside the list.\n"
    )

    ec_list = []
    with tqdm(total=len(sentences), desc="Processing EC extraction", unit="sentence") as pbar:
        for sentence in sentences:
            try:
                # NOTE(review): reasoning-model deployments such as the default
                # 'o4-mini' may reject `max_tokens` and a non-default
                # `temperature`; if every request fails, check whether
                # `max_completion_tokens` is required - confirm against the
                # Azure OpenAI documentation before changing these arguments.
                response = client.chat.completions.create(
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {
                            "role": "user",
                            "content": f"Protein description:\n\"{sentence}\"\n\n" + task_instructions,
                        },
                    ],
                    max_tokens=1024,
                    temperature=0.3,
                    top_p=1.0,
                    model=deployment,
                )
                # Treat a missing (None) content field as an empty reply
                # instead of failing on .strip().
                extracted_content = (response.choices[0].message.content or "").strip()
                ec_numbers = _parse_ec_numbers(extracted_content)
            except Exception as e:
                # Deliberate best-effort: a failure on one sentence must not
                # abort the batch, so report it and record no EC numbers.
                print(f"[Warning]: {e}")
                ec_numbers = []
            ec_list.append(ec_numbers)
            print('ec_numbers', ec_numbers)
            pbar.update(1)
    return ec_list