"""
作者:曾浩龙(独立开发)
创建时间:2024 年 10 月 25 日
第三方依赖库:Gradio (https://www.gradio.app/) 与 OpenAI Python API library (https://github.com/openai/openai-python)
其他说明:本项目声明仅供学习和研究使用。
"""
import os
import platform
print(platform.python_version())
import Levenshtein
import gradio as gr
from openai import OpenAI
def find_closest_string(user_input, valid_strings):
    """Return the candidate with the smallest Levenshtein distance to the input.

    Args:
        user_input: The string to match.
        valid_strings: Iterable of candidate strings.

    Returns:
        The candidate closest to ``user_input``, or ``None`` when
        ``valid_strings`` is empty. Candidates at the same distance are ranked
        by ordinary string comparison, so the answer does not depend on the
        order in which ``valid_strings`` happens to be iterated.
    """
    return min(
        valid_strings,
        # Rank by (distance, text). The text is only a tie-breaker: without it
        # the winner among equally distant candidates would be whichever one a
        # set yields first, and that order can differ between interpreter runs.
        key=lambda candidate: (
            Levenshtein.distance(user_input, candidate),
            candidate,
        ),
        default=None,
    )
def process_input(user_input):
    """Resolve the user's text to one of the entries of ``valid_strings``.

    Resolution order:
      1. the input itself, if it is already a valid string;
      2. the valid string that equals the input once surrounding whitespace
         and letter case are ignored;
      3. otherwise the valid string with the smallest edit distance to the
         input, again compared without regard to letter case.

    Args:
        user_input: Text typed or selected by the user.

    Returns:
        The matching entry of ``valid_strings`` in its original spelling, or
        ``None`` if ``valid_strings`` is empty.
    """
    if user_input in valid_strings:
        return user_input  # Exact hit: nothing to normalise.

    # Map each case-folded spelling back to the canonical one so that, e.g.,
    # " tcga-read " resolves to "TCGA-READ". Compared case-sensitively, every
    # letter of a lower-case query counts as an edit and unrelated names tie.
    canonical = {candidate.casefold(): candidate for candidate in valid_strings}
    query = user_input
    if isinstance(query, str):
        query = query.strip().casefold()
    if query in canonical:
        return canonical[query]

    # Fuzzy match on the case-folded spellings, then translate back.
    closest = find_closest_string(query, canonical)
    return canonical.get(closest)
def demo(project_TCGA, output_language="Chinese"):
    """Describe a TCGA project and ask the chat model for a disease summary.

    The project name is resolved through ``process_input`` and its English and
    Chinese full names are read from ``project_name_TCGA``.

    Args:
        project_TCGA: Project name as entered by the user.
        output_language: ``"Chinese"`` selects the Chinese texts and prompt;
            any other value selects the English ones.

    Returns:
        A pair ``(output1, output2)``. ``output1`` lists the abbreviation, both
        full names and the GDC portal link. ``output2`` is the model's reply
        with surrounding whitespace removed, or the literal
        ``"Response Error"`` when creating the client, calling the API or
        reading the reply raises an exception.
    """
    project_TCGA = process_input(project_TCGA)
    name_English, name_Chinese = project_name_TCGA[project_TCGA]
    tcga_link = f"https://portal.gdc.cancer.gov/projects/{project_TCGA}"
    in_chinese = output_language == "Chinese"

    if in_chinese:
        output1 = f"✍️ 简称:{project_TCGA}\n❤️ 中文全称:{name_Chinese}\n💛 英文全称:{name_English}\n🔗 链接:{tcga_link}"
        system_instruction = f"您是公共卫生、流行病学、癌症研究和精准医学领域的专家,对{name_Chinese}有着深入的理解。"
        prompt_lines = (
            f"您的任务是深入分析并撰写关于{name_Chinese}这种复杂疾病的摘要,内容必须准确、详实、逻辑清晰、可读性强,这对普通公众了解这种复杂疾病非常重要。",
            f"具体内容需要包括:1 - {name_Chinese}的基本定义和概述,临床病理特征;2 - {name_Chinese}的病因和风险因素;3 - {name_Chinese}的流行病学调查结果,患病率和死亡率;4 - {name_Chinese}的临床症状与早期识别;5 - {name_Chinese}的疾病进展与转移及其密切相关的生物标志物和异常基因改变;6 - {name_Chinese}的生存率与预后;7 - {name_Chinese}的诊断、治疗方法和未来研究。",
        )
    else:
        output1 = f"✍️ Abbreviation: {project_TCGA}\n❤️ Full name in Chinese: {name_Chinese}\n💛 Full Name in English: {name_English}\n🔗 Link: {tcga_link}"
        system_instruction = f"You are an expert in the fields of public health, epidemiology, cancer research, and precision medicine, with a deep comprehension of {name_English}."
        prompt_lines = (
            f"Your task is to analyze and write an in-depth summary about the complex disease of {name_English} that must be accurate, informative, logical, and readable, which is very important for the general public to understand this complex disease.",
            f"Specific content needs to include: 1 - Basic definition and overview of {name_English}, clinicopathologic features; 2 - Etiology and risk factors of {name_English}; 3 - Epidemiologic findings, prevalence, and mortality rates of {name_English}; 4 - Clinical signs and early recognition of {name_English}; 5 - Disease progression and metastasis of {name_English} and its closely related biomarkers and aberrant gene alterations; 6 - Survival and prognosis of {name_English}; and 7 - Diagnostics, therapeutic approaches, and future research of {name_English}.",
        )
    # The user message is the two prompt lines separated by a single newline.
    prompt_template = "\n".join(prompt_lines)

    try:
        # API key and base URL are read from the environment; a missing
        # variable raises inside this try and is reported like any API failure.
        client = OpenAI(
            api_key=os.environ["OPENAI_API_KEY"],
            base_url=os.environ["API_BASE"],
            max_retries=3,
            timeout=60,
        )
        chat_completion = client.chat.completions.create(
            model="gpt-4o-mini-2024-07-18",
            messages=[
                {"role": "system", "content": system_instruction},
                {"role": "user", "content": prompt_template},
            ],
            n=1,
            seed=42,
            temperature=0.50,
            # The Chinese answer is given a larger token budget than the English one.
            max_tokens=3600 if in_chinese else 2048,
            logprobs=False,
            presence_penalty=0.20,
            frequency_penalty=0.20,
        )
        # The reply is returned as received: Markdown markers such as "**" or
        # "#" are not removed, so a plain text box shows them literally.
        output2 = chat_completion.choices[0].message.content.strip()
    except Exception as e:
        print(str(e), "Response Error")
        return output1, "Response Error"
    return output1, output2
# The 33 TCGA cancer types: project id -> [English full name, Chinese full name].
project_name_TCGA = {
    project_id: [english_name, chinese_name]
    for project_id, english_name, chinese_name in (
        ("TCGA-ACC", "adrenocortical carcinoma", "肾上腺皮质癌"),
        ("TCGA-BLCA", "bladder urothelial carcinoma", "膀胱尿路上皮癌"),
        ("TCGA-BRCA", "breast invasive carcinoma", "浸润性乳腺癌"),
        (
            "TCGA-CESC",
            "cervical squamous cell carcinoma and endocervical adenocarcinoma",
            "宫颈鳞状细胞癌与宫颈内膜腺癌",
        ),
        ("TCGA-CHOL", "cholangiocarcinoma", "胆管癌"),
        ("TCGA-COAD", "colon adenocarcinoma", "结肠腺癌"),
        (
            "TCGA-DLBC",
            "lymphoid neoplasm diffuse large B-cell lymphoma",
            "弥漫性大 B 细胞淋巴瘤",
        ),
        ("TCGA-ESCA", "esophageal carcinoma", "食道癌"),
        ("TCGA-GBM", "glioblastoma multiforme", "多形性胶质母细胞瘤"),
        ("TCGA-HNSC", "head and neck squamous cell carcinoma", "头颈部鳞状细胞癌"),
        ("TCGA-KICH", "kidney chromophobe", "肾嫌色细胞癌"),
        ("TCGA-KIRC", "kidney renal clear cell carcinoma", "肾透明细胞癌"),
        ("TCGA-KIRP", "kidney renal papillary cell carcinoma", "乳头状肾细胞癌"),
        ("TCGA-LAML", "acute myeloid leukemia", "急性髓系白血病"),
        ("TCGA-LGG", "brain lower grade glioma", "低级别脑胶质瘤"),
        ("TCGA-LIHC", "liver hepatocellular carcinoma", "肝细胞癌"),
        ("TCGA-LUAD", "lung adenocarcinoma", "肺腺癌"),
        ("TCGA-LUSC", "lung squamous cell carcinoma", "肺鳞状细胞癌"),
        ("TCGA-MESO", "mesothelioma", "间皮瘤"),
        ("TCGA-OV", "ovarian serous cystadenocarcinoma", "卵巢浆液性囊腺癌"),
        ("TCGA-PAAD", "pancreatic adenocarcinoma", "胰腺腺癌"),
        ("TCGA-PCPG", "pheochromocytoma and paraganglioma", "嗜铬细胞瘤和副神经节瘤"),
        ("TCGA-PRAD", "prostate adenocarcinoma", "前列腺腺癌"),
        ("TCGA-READ", "rectum adenocarcinoma", "直肠腺癌"),
        ("TCGA-SARC", "sarcoma", "肉瘤"),
        ("TCGA-SKCM", "skin cutaneous melanoma", "皮肤黑色素瘤"),
        ("TCGA-STAD", "stomach adenocarcinoma", "胃腺癌"),
        ("TCGA-TGCT", "testicular germ cell tumors", "睾丸生殖细胞肿瘤"),
        ("TCGA-THCA", "thyroid carcinoma", "甲状腺癌"),
        ("TCGA-THYM", "thymoma", "胸腺瘤"),
        ("TCGA-UCEC", "uterine corpus endometrial carcinoma", "子宫体子宫内膜癌"),
        ("TCGA-UCS", "uterine carcinosarcoma", "子宫癌肉瘤"),
        ("TCGA-UVM", "uveal melanoma", "眼内(葡萄膜)黑色素瘤"),
    )
}
# The accepted project names: a set holding every key of project_name_TCGA.
valid_strings = set(project_name_TCGA)
# Heading passed to the interface as its description; the original note says
# this field supports Markdown and HTML content.
# English wording of the same heading: "Abbreviations, Full Names and
# Descriptions of All Cancer Types Covered by TCGA Project."
desc = (
    '<h1 align="center" '
    'style="font-family: KaiTi, sans-serif; font-size: 22px; color: #00FF7F;">'
    "🎉 TCGA 项目涉及的所有癌症类型的缩写、中英文全称和描述 🧬"
    "</h1>"
)
# Two copyable text boxes, in the order of demo()'s return values:
#   1. the full name of the cancer type that was queried;
#   2. a quick look at that cancer type.
outputs = [
    gr.Textbox(label=box_label, show_copy_button=True)
    for box_label in (
        "🔎 1. 您查询的 TCGA 项目的癌症类型",
        "👩⚕️ 2. 迅速了解这种癌症类型的信息",
    )
]
# Build the web interface around demo(): two dropdown inputs (project name and
# output language) feeding the two text boxes defined in `outputs`.
my_demo = gr.Interface(
    fn=demo,
    inputs=[
        # Project name. Free text is allowed in addition to the listed choices.
        # Label: "Please enter the name of the TCGA project you want to query,
        # such as TCGA-READ."
        gr.Dropdown(
            choices=list(project_name_TCGA),
            value="TCGA-READ",
            allow_custom_value=True,
            filterable=True,
            interactive=True,
            label="⌨️ 请输入您要查询的 TCGA 项目名称,如 TCGA-READ",
        ),
        # Output language, restricted to the two listed values.
        # Label: "Only Chinese and English are currently supported."
        gr.Dropdown(
            choices=["Chinese", "English"],
            value="Chinese",
            allow_custom_value=False,
            label="👨💻 输出语言目前仅支持中文和英文",
        ),
    ],
    outputs=outputs,
    submit_btn=gr.Button("提交", variant="primary"),  # "Submit"
    clear_btn=gr.Button("清除", variant="secondary"),  # "Clear"
    cache_examples=True,
    examples=[["TCGA-READ", "Chinese"], ["TCGA-COAD", "English"]],
    description=desc,
    theme="JohnSmith9982/small_and_pretty",
)

my_demo.launch(show_api=False, show_error=True)