Spaces:
Sleeping
Sleeping
| """ | |
| 作者:曾浩龙(独立开发) | |
| 创建时间:2024 年 10 月 25 日 | |
| 第三方依赖库:Gradio (https://www.gradio.app/) 与 OpenAI Python API library (https://github.com/openai/openai-python) | |
| 其他说明:本项目声明仅供学习和研究使用。 | |
| """ | |
| import os | |
| import platform | |
| print(platform.python_version()) | |
| import Levenshtein | |
| import gradio as gr | |
| from openai import OpenAI | |
def find_closest_string(user_input, valid_strings):
    """Return the candidate closest to ``user_input`` by Levenshtein distance.

    Args:
        user_input: The string to match.
        valid_strings: Iterable of candidate strings.

    Returns:
        The first candidate, in iteration order, whose edit distance to
        ``user_input`` is the smallest; ``None`` when there are no candidates.
    """
    # ``min`` keeps the first minimal element it meets (same tie-breaking as a
    # strict ``<`` scan) and ``default`` covers the empty case, so no sentinel
    # "very large distance" value is needed.
    return min(
        valid_strings,
        key=lambda candidate: Levenshtein.distance(user_input, candidate),
        default=None,
    )
def process_input(user_input):
    """Map user input onto one of the known project identifiers.

    Resolution order:
      1. the input exactly as given, when it is already in ``valid_strings``;
      2. the input with surrounding whitespace removed and upper-cased, when
         that form is in ``valid_strings``;
      3. otherwise the entry of ``valid_strings`` with the smallest edit
         distance to the normalized input.

    Args:
        user_input: The value received from the UI.

    Returns:
        A member of ``valid_strings`` (whatever ``find_closest_string``
        returns in the fuzzy case).
    """
    if user_input in valid_strings:
        return user_input  # exact hit: hand it back untouched

    query = user_input
    if isinstance(user_input, str):
        # The project identifiers in this file are all upper-case, so a
        # lower-case or padded entry would otherwise sit at the same edit
        # distance from several identifiers and resolve arbitrarily.
        query = user_input.strip().upper()
        if query in valid_strings:
            return query

    # No direct hit: fall back to the nearest identifier by edit distance.
    return find_closest_string(query, valid_strings)
def demo(project_TCGA, output_language="Chinese"):
    """Describe a TCGA project and ask the chat model for a disease summary.

    Args:
        project_TCGA: Project identifier coming from the UI. It is passed
            through ``process_input`` first, so an unknown value is replaced
            by the closest known identifier.
        output_language: ``"Chinese"`` selects the Chinese texts and prompt;
            any other value selects the English ones.

    Returns:
        A tuple ``(output1, output2)``. ``output1`` lists the abbreviation,
        the Chinese and English full names and the GDC portal link.
        ``output2`` is the model reply with surrounding whitespace trimmed,
        or the literal string ``"Response Error"`` when the request fails or
        the reply carries no text.
    """
    project_TCGA = process_input(project_TCGA)
    name_English, name_Chinese = project_name_TCGA[project_TCGA]
    tcga_link = f"https://portal.gdc.cancer.gov/projects/{project_TCGA}"
    use_chinese = output_language == "Chinese"

    if use_chinese:
        output1 = f"✍️ 简称:{project_TCGA}\n❤️ 中文全称:{name_Chinese}\n💛 英文全称:{name_English}\n🔗 链接:{tcga_link}"
        system_instruction = f"您是公共卫生、流行病学、癌症研究和精准医学领域的专家,对{name_Chinese}有着深入的理解。"
        prompt_template = (
            f"您的任务是深入分析并撰写关于{name_Chinese}这种复杂疾病的摘要,内容必须准确、详实、逻辑清晰、可读性强,这对普通公众了解这种复杂疾病非常重要。\n"
            f"具体内容需要包括:1 - {name_Chinese}的基本定义和概述,临床病理特征;2 - {name_Chinese}的病因和风险因素;3 - {name_Chinese}的流行病学调查结果,患病率和死亡率;4 - {name_Chinese}的临床症状与早期识别;5 - {name_Chinese}的疾病进展与转移及其密切相关的生物标志物和异常基因改变;6 - {name_Chinese}的生存率与预后;7 - {name_Chinese}的诊断、治疗方法和未来研究。"
        )
    else:
        output1 = f"✍️ Abbreviation: {project_TCGA}\n❤️ Full name in Chinese: {name_Chinese}\n💛 Full Name in English: {name_English}\n🔗 Link: {tcga_link}"
        system_instruction = f"You are an expert in the fields of public health, epidemiology, cancer research, and precision medicine, with a deep comprehension of {name_English}."
        prompt_template = (
            f"Your task is to analyze and write an in-depth summary about the complex disease of {name_English} that must be accurate, informative, logical, and readable, which is very important for the general public to understand this complex disease.\n"
            f"Specific content needs to include: 1 - Basic definition and overview of {name_English}, clinicopathologic features; 2 - Etiology and risk factors of {name_English}; 3 - Epidemiologic findings, prevalence, and mortality rates of {name_English}; 4 - Clinical signs and early recognition of {name_English}; 5 - Disease progression and metastasis of {name_English} and its closely related biomarkers and aberrant gene alterations; 6 - Survival and prognosis of {name_English}; and 7 - Diagnostics, therapeutic approaches, and future research of {name_English}."
        )

    try:
        # The client is configured from the environment: API key, base URL,
        # plus a retry limit and a timeout. A missing variable raises
        # KeyError, which is handled below like any other request failure.
        client = OpenAI(
            api_key=os.environ["OPENAI_API_KEY"],
            base_url=os.environ["API_BASE"],
            max_retries=3,
            timeout=60,
        )
        chat_completion = client.chat.completions.create(
            model="gpt-4o-mini-2024-07-18",
            messages=[
                {"role": "system", "content": system_instruction},
                {"role": "user", "content": prompt_template},
            ],
            n=1,
            seed=42,
            temperature=0.50,
            # The Chinese answer is given a larger token budget than the English one.
            max_tokens=3600 if use_chinese else 2048,
            logprobs=False,
            presence_penalty=0.20,
            frequency_penalty=0.20,
        )
        resp_text = chat_completion.choices[0].message.content
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        print(str(e), "Response Error")
        return output1, "Response Error"

    if resp_text is None:
        # A reply without text content is reported the same way as a failed request.
        print("the model reply contains no text content", "Response Error")
        return output1, "Response Error"

    # The reply is passed on as-is apart from trimming surrounding whitespace;
    # Markdown markers such as "**" or "#" are not removed.
    return output1, resp_text.strip()
# The 33 TCGA cancer types.
# Each project identifier maps to [English full name, Chinese full name].
project_name_TCGA = {
    "TCGA-ACC": ["adrenocortical carcinoma", "肾上腺皮质癌"],
    "TCGA-BLCA": ["bladder urothelial carcinoma", "膀胱尿路上皮癌"],
    "TCGA-BRCA": ["breast invasive carcinoma", "浸润性乳腺癌"],
    "TCGA-CESC": [
        "cervical squamous cell carcinoma and endocervical adenocarcinoma",
        "宫颈鳞状细胞癌与宫颈内膜腺癌",
    ],
    "TCGA-CHOL": ["cholangiocarcinoma", "胆管癌"],
    "TCGA-COAD": ["colon adenocarcinoma", "结肠腺癌"],
    "TCGA-DLBC": [
        "lymphoid neoplasm diffuse large B-cell lymphoma",
        "弥漫性大 B 细胞淋巴瘤",
    ],
    "TCGA-ESCA": ["esophageal carcinoma", "食道癌"],
    "TCGA-GBM": ["glioblastoma multiforme", "多形性胶质母细胞瘤"],
    "TCGA-HNSC": ["head and neck squamous cell carcinoma", "头颈部鳞状细胞癌"],
    "TCGA-KICH": ["kidney chromophobe", "肾嫌色细胞癌"],
    "TCGA-KIRC": ["kidney renal clear cell carcinoma", "肾透明细胞癌"],
    "TCGA-KIRP": ["kidney renal papillary cell carcinoma", "乳头状肾细胞癌"],
    "TCGA-LAML": ["acute myeloid leukemia", "急性髓系白血病"],
    "TCGA-LGG": ["brain lower grade glioma", "低级别脑胶质瘤"],
    "TCGA-LIHC": ["liver hepatocellular carcinoma", "肝细胞癌"],
    "TCGA-LUAD": ["lung adenocarcinoma", "肺腺癌"],
    "TCGA-LUSC": ["lung squamous cell carcinoma", "肺鳞状细胞癌"],
    "TCGA-MESO": ["mesothelioma", "间皮瘤"],
    "TCGA-OV": ["ovarian serous cystadenocarcinoma", "卵巢浆液性囊腺癌"],
    "TCGA-PAAD": ["pancreatic adenocarcinoma", "胰腺腺癌"],
    "TCGA-PCPG": ["pheochromocytoma and paraganglioma", "嗜铬细胞瘤和副神经节瘤"],
    "TCGA-PRAD": ["prostate adenocarcinoma", "前列腺腺癌"],
    "TCGA-READ": ["rectum adenocarcinoma", "直肠腺癌"],
    "TCGA-SARC": ["sarcoma", "肉瘤"],
    "TCGA-SKCM": ["skin cutaneous melanoma", "皮肤黑色素瘤"],
    "TCGA-STAD": ["stomach adenocarcinoma", "胃腺癌"],
    "TCGA-TGCT": ["testicular germ cell tumors", "睾丸生殖细胞肿瘤"],
    "TCGA-THCA": ["thyroid carcinoma", "甲状腺癌"],
    "TCGA-THYM": ["thymoma", "胸腺瘤"],
    "TCGA-UCEC": ["uterine corpus endometrial carcinoma", "子宫体子宫内膜癌"],
    "TCGA-UCS": ["uterine carcinosarcoma", "子宫癌肉瘤"],
    "TCGA-UVM": ["uveal melanoma", "眼内(葡萄膜)黑色素瘤"],
}

# Set of accepted project identifiers, used for membership tests and as the
# candidate pool for fuzzy matching. Iterating a dict yields its keys.
valid_strings = set(project_name_TCGA)
# Page heading; Markdown and HTML content are both supported here.
# English rendering of the title: "Abbreviations, Full Names and Descriptions
# of All Cancer Types Covered by TCGA Project."
desc = (
    '<h1 align="center" style="font-family: KaiTi, sans-serif; font-size: 22px; color: #00FF7F;">'
    "🎉 TCGA 项目涉及的所有癌症类型的缩写、中英文全称和描述 🧬"
    "</h1>"
)
# Two result boxes, each with a copy button:
#   1. the full name of the cancer type that was queried;
#   2. a quick look at that cancer type.
outputs = [
    gr.Textbox(label=box_label, show_copy_button=True)
    for box_label in (
        "🔎 1. 您查询的 TCGA 项目的癌症类型",
        "👩⚕️ 2. 迅速了解这种癌症类型的信息",
    )
]
# Project selector. Free text is accepted as well as the listed choices.
# Label, in English: "Please enter the name of the TCGA project you want to
# query, such as TCGA-READ."
_project_input = gr.Dropdown(
    choices=list(project_name_TCGA),
    value="TCGA-READ",
    allow_custom_value=True,
    filterable=True,
    interactive=True,
    label="⌨️ 请输入您要查询的 TCGA 项目名称,如 TCGA-READ",
)

# Output-language selector, restricted to the two listed values.
_language_input = gr.Dropdown(
    choices=["Chinese", "English"],
    value="Chinese",
    allow_custom_value=False,
    label="👨💻 输出语言目前仅支持中文和英文",
)

# Wire the inputs, the ``demo`` callback and the output boxes into one interface.
my_demo = gr.Interface(
    fn=demo,
    inputs=[_project_input, _language_input],
    outputs=outputs,
    submit_btn=gr.Button("提交", variant="primary"),
    clear_btn=gr.Button("清除", variant="secondary"),
    cache_examples=True,
    examples=[["TCGA-READ", "Chinese"], ["TCGA-COAD", "English"]],
    description=desc,
    theme="JohnSmith9982/small_and_pretty",
)

# Start the app at import time, with the API page hidden and errors shown.
my_demo.launch(show_api=False, show_error=True)