Spaces:
Runtime error
Runtime error
| import json | |
| import sys | |
| import os | |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| from collections import defaultdict | |
# Module-level caches. Both start out empty (None) and are filled on demand
# by the loader functions below, which rebind them via ``global``.
_go_data = None            # parsed GO ontology JSON document
_protein_go_dict = None    # protein_id -> GO_id value read from the mapping file
def _load_go_data(go_info_path):
    """Lazily load the GO ontology JSON into the module-level ``_go_data`` cache.

    The file is parsed only while ``_go_data`` is still ``None``. Once a load
    has succeeded, later calls return immediately and ``go_info_path`` is
    ignored. A failed load prints an error message and leaves the cache as
    ``None``, so the next call tries again.

    Args:
        go_info_path: Path of the GO JSON file to parse.

    Returns:
        None. The parsed document is stored in ``_go_data``.
    """
    global _go_data
    if _go_data is not None:
        return
    try:
        # JSON text is UTF-8; open()'s default encoding follows the platform
        # locale, so name the encoding explicitly to get the same result
        # everywhere.
        with open(go_info_path, 'r', encoding='utf-8') as f:
            _go_data = json.load(f)
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and keep the
        # cache empty so callers can detect the failure via ``None``.
        print(f"加载GO数据文件时发生错误: {str(e)}")
        _go_data = None
def _load_protein_go_dict(protein2gopath):
    """Lazily load the protein -> GO mapping into ``_protein_go_dict``.

    The file is read as JSON Lines: each non-blank line is parsed as one JSON
    object, and its ``'GO_id'`` value is stored under its ``'protein_id'``
    value. If a protein id occurs on several lines, the last line wins.

    The file is read only while ``_protein_go_dict`` is still ``None``. Any
    error while reading or parsing prints a message and leaves the cache as
    ``None``, so a partially read file is never exposed.

    Args:
        protein2gopath: Path of the JSON Lines mapping file.

    Returns:
        None. The mapping is stored in ``_protein_go_dict``.
    """
    global _protein_go_dict
    if _protein_go_dict is not None:
        return

    mapping = {}
    try:
        # Explicit UTF-8 instead of the locale-dependent default encoding.
        with open(protein2gopath, 'r', encoding='utf-8') as f:
            for line in f:
                # A blank line (typically a trailing one) is not a record;
                # feeding it to json.loads would raise and throw away the
                # whole mapping.
                if not line.strip():
                    continue
                record = json.loads(line)
                mapping[record['protein_id']] = record['GO_id']
    except Exception as e:
        # Deliberate best-effort boundary: report and keep the cache empty.
        print(f"加载蛋白质-GO映射数据时发生错误: {str(e)}")
        _protein_go_dict = None
        return

    # Publish only after the whole file was parsed successfully.
    _protein_go_dict = mapping
def get_go_definition(go_id, go_info_path):
    """Return the definition text of a GO term, or ``None`` if unavailable.

    The identifier is normalised to the ``GO_<id>`` form and turned into the
    node id ``http://purl.obolibrary.org/obo/GO_<id>``. The nodes of the first
    graph in the loaded GO document are then scanned linearly for a node with
    that id which carries ``meta.definition``.

    Args:
        go_id: GO identifier as a string. Accepted spellings are a bare id
            (``"0008150"``), the underscore form (``"GO_0008150"``) and the
            CURIE form (``"GO:0008150"``).
        go_info_path: Path of the GO JSON file; used only if the GO data has
            not been loaded yet.

    Returns:
        The ``val`` of the matching node's definition, or ``None`` when the GO
        data could not be loaded, no node matches, or the matching node has
        no definition.
    """
    _load_go_data(go_info_path)
    if _go_data is None:
        return None

    # Normalise the identifier. Without the explicit "GO:" case a CURIE such
    # as "GO:0008150" would become "GO_GO:0008150" and never match any node.
    if go_id.startswith('GO:'):
        go_id = f"GO_{go_id[3:]}"
    elif not go_id.startswith('GO_'):
        go_id = f"GO_{go_id}"
    full_id = f"http://purl.obolibrary.org/obo/{go_id}"

    for node in _go_data['graphs'][0]['nodes']:
        if node['id'] != full_id:
            continue
        if 'meta' in node and 'definition' in node['meta']:
            return node['meta']['definition']['val']
    return None
def analyze_protein_go(protein_id, protein2gopath, go_info_path):
    """Collect the GO annotations of one protein together with their definitions.

    Args:
        protein_id: Identifier of the protein to look up.
        protein2gopath: Path of the protein -> GO mapping file.
        go_info_path: Path of the GO JSON file used to resolve definitions.

    Returns:
        A dict. On failure it holds ``"status": "error"`` and a ``"message"``
        (mapping could not be loaded, or the protein is not in the mapping).
        On success it holds ``"status": "success"``, the ``"protein_id"``,
        ``"go_annotations"`` (one ``{"go_id": ...}`` entry per annotated id)
        and ``"all_related_definitions"`` (id -> definition, only for ids
        whose definition lookup returned a non-empty value).
    """
    _load_protein_go_dict(protein2gopath)
    mapping = _protein_go_dict

    # Guard clauses: bail out early on either failure mode.
    if mapping is None:
        return {"status": "error", "message": "GO数据加载失败"}
    if protein_id not in mapping:
        return {"status": "error", "message": f"未找到蛋白质 {protein_id} 的GO注释"}

    annotated_ids = mapping[protein_id]

    # Resolve each id; ids without a definition are simply left out.
    definitions = {}
    for term in annotated_ids:
        text = get_go_definition(term, go_info_path)
        if text:
            definitions[term] = text

    return {
        "status": "success",
        "protein_id": protein_id,
        "go_annotations": [{"go_id": term} for term in annotated_ids],
        "all_related_definitions": definitions,
    }
# Usage example: look up the GO annotations of one protein from the command line.
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(description='Analyze protein GO annotations')
    cli.add_argument('--protein_id', type=str, default='A8CF74')
    cli.add_argument('--protein2gopath', type=str, default='data/processed_data/go_integration_final_topk2.json')
    cli.add_argument('--go_info_path', type=str, default='data/raw_data/go.json')
    opts = cli.parse_args()

    report = analyze_protein_go(opts.protein_id, opts.protein2gopath, opts.go_info_path)

    if report["status"] != "success":
        # Error path: only the message is available.
        print(report["message"])
    else:
        print(f"\nProtein {report['protein_id']} GO annotations:")
        for entry in report["go_annotations"]:
            print(f"\nGO ID: {entry['go_id']}")

        print("\nAll related GO ID definitions:")
        for term, text in report["all_related_definitions"].items():
            print(f"\nGO:{term}")
            print(f"Definition: {text}")