"""
Gradio app that looks up a TCGA project and asks an LLM to describe its cancer type.

Author: 曾浩龙 (independent development)
Created: 25 October 2024
Third-party dependencies: Gradio (https://www.gradio.app/), the OpenAI Python API
library (https://github.com/openai/openai-python) and Levenshtein.
Note: this project is declared to be for learning and research use only.
"""

import os
import platform

import Levenshtein
import gradio as gr
from openai import OpenAI

print(platform.python_version())


def find_closest_string(user_input, valid_strings):
    """Return the entry of ``valid_strings`` closest to ``user_input``.

    Closeness is the exact Levenshtein edit distance; the entry with the
    smallest distance wins.

    Args:
        user_input: The string to match.
        valid_strings: Iterable of candidate strings.

    Returns:
        The closest candidate, or ``None`` when ``valid_strings`` is empty.
    """
    # Visit candidates in sorted order: `min` keeps the first minimal item it
    # sees, so ties are resolved the same way on every run instead of
    # depending on the iteration order of a set of strings.
    return min(
        sorted(valid_strings),
        key=lambda candidate: Levenshtein.distance(user_input, candidate),
        default=None,
    )


def process_input(user_input):
    """Map a user-supplied project name onto a key of ``project_name_TCGA``.

    The input is stripped of surrounding whitespace and upper-cased before
    matching, because every valid project id is upper-case; without this a
    lower-case id such as "tcga-read" is many edits away from every key and
    may resolve to an unrelated project.

    1 - If the normalized input is a valid project id, it is returned as is.
    2 - Otherwise the valid id with the smallest edit distance is returned.

    Args:
        user_input: Text from the project dropdown. ``None`` is treated as an
            empty string (presumably what the UI sends for a cleared field -
            TODO confirm against the Gradio version in use).

    Returns:
        A key of ``project_name_TCGA``.
    """
    raw_text = "" if user_input is None else str(user_input)
    normalized = raw_text.strip().upper()
    if normalized in valid_strings:
        return normalized
    return find_closest_string(normalized, valid_strings)


def _build_prompts(project_TCGA, name_English, name_Chinese, output_language):
    """Build the summary text and the LLM prompts for one project.

    Returns:
        A ``(summary, system_instruction, user_prompt)`` tuple written in
        Chinese when ``output_language`` is "Chinese", otherwise in English.
    """
    tcga_link = f"https://portal.gdc.cancer.gov/projects/{project_TCGA}"

    if output_language == "Chinese":
        summary = f"✍️ 简称:{project_TCGA}\n❤️ 中文全称:{name_Chinese}\n💛 英文全称:{name_English}\n🔗 链接:{tcga_link}"
        system_instruction = f"您是公共卫生、流行病学、癌症研究和精准医学领域的专家,对{name_Chinese}有着深入的理解。"
        user_prompt = (
            f"您的任务是深入分析并撰写关于{name_Chinese}这种复杂疾病的摘要,内容必须准确、详实、逻辑清晰、可读性强,这对普通公众了解这种复杂疾病非常重要。"
            " "
            f"具体内容需要包括:1 - {name_Chinese}的基本定义和概述,临床病理特征;"
            f"2 - {name_Chinese}的病因和风险因素;"
            f"3 - {name_Chinese}的流行病学调查结果,患病率和死亡率;"
            f"4 - {name_Chinese}的临床症状与早期识别;"
            f"5 - {name_Chinese}的疾病进展与转移及其密切相关的生物标志物和异常基因改变;"
            f"6 - {name_Chinese}的生存率与预后;"
            f"7 - {name_Chinese}的诊断、治疗方法和未来研究。"
        )
    else:
        summary = f"✍️ Abbreviation: {project_TCGA}\n❤️ Full name in Chinese: {name_Chinese}\n💛 Full Name in English: {name_English}\n🔗 Link: {tcga_link}"
        system_instruction = (
            "You are an expert in the fields of public health, epidemiology, cancer "
            f"research, and precision medicine, with a deep comprehension of {name_English}."
        )
        user_prompt = (
            f"Your task is to analyze and write an in-depth summary about the complex disease of {name_English} "
            "that must be accurate, informative, logical, and readable, "
            "which is very important for the general public to understand this complex disease. "
            f"Specific content needs to include: 1 - Basic definition and overview of {name_English}, clinicopathologic features; "
            f"2 - Etiology and risk factors of {name_English}; "
            f"3 - Epidemiologic findings, prevalence, and mortality rates of {name_English}; "
            f"4 - Clinical signs and early recognition of {name_English}; "
            f"5 - Disease progression and metastasis of {name_English} and its closely related biomarkers and aberrant gene alterations; "
            f"6 - Survival and prognosis of {name_English}; "
            f"and 7 - Diagnostics, therapeutic approaches, and future research of {name_English}."
        )
    return summary, system_instruction, user_prompt


def demo(project_TCGA, output_language="Chinese"):
    """Gradio callback: describe the cancer type behind a TCGA project.

    Args:
        project_TCGA: Project name chosen or typed by the user; it is resolved
            to the nearest valid key of ``project_name_TCGA``.
        output_language: "Chinese" selects Chinese output; any other value
            selects English.

    Returns:
        A ``(summary, description)`` pair of strings. ``summary`` lists the
        abbreviation, both full names and the GDC portal link. ``description``
        is the model's reply, or the literal "Response Error" when building
        the client or calling the API raises.
    """
    project_TCGA = process_input(project_TCGA)
    name_English, name_Chinese = project_name_TCGA[project_TCGA]
    output1, system_instruction, prompt_template = _build_prompts(
        project_TCGA, name_English, name_Chinese, output_language
    )

    try:
        # The client needs the API key, the base URL, a retry limit and a
        # timeout. Both environment variables are read inside the `try` so a
        # missing one is reported as "Response Error" rather than crashing.
        client = OpenAI(
            api_key=os.environ["OPENAI_API_KEY"],
            base_url=os.environ["API_BASE"],
            max_retries=3,
            timeout=60,
        )
        chat_completion = client.chat.completions.create(
            model="gpt-4o-mini-2024-07-18",  # alternative: gpt-4-turbo
            messages=[
                {"role": "system", "content": system_instruction},
                {"role": "user", "content": prompt_template},
            ],
            n=1,
            seed=42,
            temperature=0.50,
            max_tokens=3600 if output_language == "Chinese" else 2048,
            logprobs=False,
            presence_penalty=0.20,
            frequency_penalty=0.20,
        )
        # NOTE: the reply is shown in a plain Textbox, which does not render
        # Markdown, so markers such as "**" in the reply appear literally.
        output2 = chat_completion.choices[0].message.content.strip()
    except Exception as e:
        # UI boundary: log the failure and show a fixed message to the user.
        print(str(e), "Response Error")
        return output1, "Response Error"
    return output1, output2


# TCGA covers 33 cancer types: project id -> [English full name, Chinese full name].
project_name_TCGA = {
    "TCGA-ACC": ["adrenocortical carcinoma", "肾上腺皮质癌"],
    "TCGA-BLCA": ["bladder urothelial carcinoma", "膀胱尿路上皮癌"],
    "TCGA-BRCA": ["breast invasive carcinoma", "浸润性乳腺癌"],
    "TCGA-CESC": [
        "cervical squamous cell carcinoma and endocervical adenocarcinoma",
        "宫颈鳞状细胞癌与宫颈内膜腺癌",
    ],
    "TCGA-CHOL": ["cholangiocarcinoma", "胆管癌"],
    "TCGA-COAD": ["colon adenocarcinoma", "结肠腺癌"],
    "TCGA-DLBC": [
        "lymphoid neoplasm diffuse large B-cell lymphoma",
        "弥漫性大 B 细胞淋巴瘤",
    ],
    "TCGA-ESCA": ["esophageal carcinoma", "食道癌"],
    "TCGA-GBM": ["glioblastoma multiforme", "多形性胶质母细胞瘤"],
    "TCGA-HNSC": ["head and neck squamous cell carcinoma", "头颈部鳞状细胞癌"],
    "TCGA-KICH": ["kidney chromophobe", "肾嫌色细胞癌"],
    "TCGA-KIRC": ["kidney renal clear cell carcinoma", "肾透明细胞癌"],
    "TCGA-KIRP": ["kidney renal papillary cell carcinoma", "乳头状肾细胞癌"],
    "TCGA-LAML": ["acute myeloid leukemia", "急性髓系白血病"],
    "TCGA-LGG": ["brain lower grade glioma", "低级别脑胶质瘤"],
    "TCGA-LIHC": ["liver hepatocellular carcinoma", "肝细胞癌"],
    "TCGA-LUAD": ["lung adenocarcinoma", "肺腺癌"],
    "TCGA-LUSC": ["lung squamous cell carcinoma", "肺鳞状细胞癌"],
    "TCGA-MESO": ["mesothelioma", "间皮瘤"],
    "TCGA-OV": ["ovarian serous cystadenocarcinoma", "卵巢浆液性囊腺癌"],
    "TCGA-PAAD": ["pancreatic adenocarcinoma", "胰腺腺癌"],
    "TCGA-PCPG": ["pheochromocytoma and paraganglioma", "嗜铬细胞瘤和副神经节瘤"],
    "TCGA-PRAD": ["prostate adenocarcinoma", "前列腺腺癌"],
    "TCGA-READ": ["rectum adenocarcinoma", "直肠腺癌"],
    "TCGA-SARC": ["sarcoma", "肉瘤"],
    "TCGA-SKCM": ["skin cutaneous melanoma", "皮肤黑色素瘤"],
    "TCGA-STAD": ["stomach adenocarcinoma", "胃腺癌"],
    "TCGA-TGCT": ["testicular germ cell tumors", "睾丸生殖细胞肿瘤"],
    "TCGA-THCA": ["thyroid carcinoma", "甲状腺癌"],
    "TCGA-THYM": ["thymoma", "胸腺瘤"],
    "TCGA-UCEC": ["uterine corpus endometrial carcinoma", "子宫体子宫内膜癌"],
    "TCGA-UCS": ["uterine carcinosarcoma", "子宫癌肉瘤"],
    "TCGA-UVM": ["uveal melanoma", "眼内(葡萄膜)黑色素瘤"],
}

# The set of valid project ids, used for membership tests and fuzzy matching.
valid_strings = set(project_name_TCGA)

# The interface description supports Markdown and HTML content.
# English alternative for the description text:
# 🎉 Abbreviations, Full Names and Descriptions of All Cancer Types Covered by TCGA Project 🧬
desc = """

🎉 TCGA 项目涉及的所有癌症类型的缩写、中英文全称和描述 🧬

"""

outputs = [
    # 1. The full name of the cancer type queried.
    gr.Textbox(
        label="🔎 1. 您查询的 TCGA 项目的癌症类型", show_copy_button=True
    ),
    # 2. A quick look at the cancer type being queried.
    gr.Textbox(
        label="👩‍⚕️ 2. 迅速了解这种癌症类型的信息",
        show_copy_button=True,
    ),
]

my_demo = gr.Interface(
    fn=demo,
    inputs=[
        # Label: "Please enter the name of the TCGA project you want to
        # query, such as TCGA-READ."
        gr.Dropdown(
            choices=list(project_name_TCGA),
            value="TCGA-READ",
            allow_custom_value=True,
            filterable=True,
            interactive=True,
            label="⌨️ 请输入您要查询的 TCGA 项目名称,如 TCGA-READ",
        ),
        # Label: "The output language currently supports only Chinese and
        # English."
        gr.Dropdown(
            choices=["Chinese", "English"],
            value="Chinese",
            allow_custom_value=False,
            label="👨‍💻 输出语言目前仅支持中文和英文",
        ),
    ],
    outputs=outputs,
    submit_btn=gr.Button("提交", variant="primary"),
    clear_btn=gr.Button("清除", variant="secondary"),
    cache_examples=True,
    examples=[["TCGA-READ", "Chinese"], ["TCGA-COAD", "English"]],
    description=desc,
    theme="JohnSmith9982/small_and_pretty",
)

my_demo.launch(show_api=False, show_error=True)