Spaces:

happyGPT
/

TCGA-Name-Query

Sleeping

File size: 10,578 Bytes

42728db
 
 
 
 
 
8e003ee
79e49b9
 
 
9a9ff20
8e003ee
 
 
 
9a9ff20
 
9a6b9b4
9a9ff20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e003ee
9a9ff20
8e003ee
1df5c30
8e003ee
 
 
1df5c30
bc36546
8e003ee
bc36546
5420e72
8e003ee
069fa2a
8e003ee
069fa2a
bc36546
8e003ee
9a9ff20
 
8e003ee
 
 
 
 
 
 
 
 
 
1df5c30
8e003ee
 
eb66e62
8e003ee
 
 
 
eb66e62
 
538dac0
bc36546
a25159d
 
 
 
8e003ee
 
bc36546
82752ee
 
 
 
 
 
 
a75646b
1df5c30
7c29744
 
1df5c30
7c29744
 
8e003ee
 
 
a25159d
8e003ee
 
 
 
b7e3bed
8e003ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9a9ff20
 
8e003ee
 
 
a7fd36f
8e003ee
 
 
170def9
82752ee
a25159d
8e003ee
 
eb66e62
8e003ee
b7e3bed
eb66e62
8e003ee
eb66e62
8e003ee
 
 
 
 
 
eb66e62
9a9ff20
 
 
eb66e62
 
8e003ee
 
 
 
eb66e62
8e003ee
 
 
eb66e62
 
5d84604
 
8e003ee

"""
作者：曾浩龙（独立开发）
创建时间：2024 年 10 月 25 日
第三方依赖库：Gradio (https://www.gradio.app/) 与 OpenAI Python API library (https://github.com/openai/openai-python)
其他说明：本项目声明仅供学习和研究使用。
"""
import os
import platform
print(platform.python_version())

import Levenshtein
import gradio as gr
from openai import OpenAI


def find_closest_string(user_input, valid_strings):
    """Return the candidate with the smallest Levenshtein distance to the input.

    Args:
        user_input: The string to match against the candidates.
        valid_strings: Iterable of candidate strings.

    Returns:
        The candidate whose edit distance to ``user_input`` is smallest. When
        several candidates share the minimum distance, the first one
        encountered while iterating ``valid_strings`` wins. ``None`` is
        returned when ``valid_strings`` is empty.
    """
    # min() already keeps the first minimal element, so no "very large number"
    # sentinel is needed to seed the search.
    return min(
        valid_strings,
        key=lambda candidate: Levenshtein.distance(user_input, candidate),
        default=None,
    )


def process_input(user_input):
    """Resolve free-form user input to a valid TCGA project identifier.

    Resolution order:
      1. If the input is already a member of ``valid_strings``, return it as is.
      2. Otherwise strip surrounding whitespace and upper-case it; if that
         normalized form is a member, return the normalized form.
      3. Otherwise return the member with the smallest Levenshtein distance to
         the normalized form.

    Args:
        user_input: Text typed or selected by the user. ``None`` is treated as
            an empty string (presumably what a cleared input field delivers;
            verify against the UI framework).

    Returns:
        A member of ``valid_strings``, or ``None`` if ``valid_strings`` is empty.
    """
    if user_input in valid_strings:
        return user_input

    # The identifiers in this file are all upper-case. Without normalization a
    # lower-case entry pays one edit per letter, which makes it no closer to
    # the intended project than to unrelated ones (e.g. "tcga-read" is 8 edits
    # away from "TCGA-READ" and also 8 edits away from "TCGA-GBM").
    normalized = str(user_input or "").strip().upper()
    if normalized in valid_strings:
        return normalized

    return find_closest_string(normalized, valid_strings)


def demo(project_TCGA, output_language="Chinese"):
    """Look up a TCGA project and ask the chat model for a disease summary.

    Args:
        project_TCGA: Project identifier entered by the user; it is first
            resolved to a known identifier via ``process_input``.
        output_language: ``"Chinese"`` selects the Chinese card and prompt;
            any other value selects the English ones.

    Returns:
        A 2-tuple ``(card, summary)``. ``card`` lists the abbreviation, both
        full names and the GDC portal link. ``summary`` is the stripped model
        reply, or the literal ``"Response Error"`` when anything inside the
        API section raises (the exception text is printed).
    """
    project_id = process_input(project_TCGA)
    name_English, name_Chinese = project_name_TCGA[project_id]
    tcga_link = f"https://portal.gdc.cancer.gov/projects/{project_id}"

    if output_language == "Chinese":
        token_limit = 3600
        card = f"✍️ 简称：{project_id}\n❤️ 中文全称：{name_Chinese}\n💛 英文全称：{name_English}\n🔗 链接：{tcga_link}"
        system_instruction = f"您是公共卫生、流行病学、癌症研究和精准医学领域的专家，对{name_Chinese}有着深入的理解。"
        prompt_template = f"""
您的任务是深入分析并撰写关于{name_Chinese}这种复杂疾病的摘要，内容必须准确、详实、逻辑清晰、可读性强，这对普通公众了解这种复杂疾病非常重要。
具体内容需要包括：1 - {name_Chinese}的基本定义和概述，临床病理特征；2 - {name_Chinese}的病因和风险因素；3 - {name_Chinese}的流行病学调查结果，患病率和死亡率；4 - {name_Chinese}的临床症状与早期识别；5 - {name_Chinese}的疾病进展与转移及其密切相关的生物标志物和异常基因改变；6 - {name_Chinese}的生存率与预后；7 - {name_Chinese}的诊断、治疗方法和未来研究。
""".strip()
    else:
        token_limit = 2048
        card = f"✍️ Abbreviation: {project_id}\n❤️ Full name in Chinese: {name_Chinese}\n💛 Full Name in English: {name_English}\n🔗 Link: {tcga_link}"
        system_instruction = f"You are an expert in the fields of public health, epidemiology, cancer research, and precision medicine, with a deep comprehension of {name_English}."
        prompt_template = f"""
Your task is to analyze and write an in-depth summary about the complex disease of {name_English} that must be accurate, informative, logical, and readable, which is very important for the general public to understand this complex disease.
Specific content needs to include: 1 - Basic definition and overview of {name_English}, clinicopathologic features; 2 - Etiology and risk factors of {name_English}; 3 - Epidemiologic findings, prevalence, and mortality rates of {name_English}; 4 - Clinical signs and early recognition of {name_English}; 5 - Disease progression and metastasis of {name_English} and its closely related biomarkers and aberrant gene alterations; 6 - Survival and prognosis of {name_English}; and 7 - Diagnostics, therapeutic approaches, and future research of {name_English}.
""".strip()

    conversation = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": prompt_template},
    ]

    try:
        # Credentials and endpoint come from the environment; a missing
        # variable raises here and is reported like any other API failure.
        client = OpenAI(
            api_key=os.environ["OPENAI_API_KEY"],
            base_url=os.environ["API_BASE"],
            max_retries=3,
            timeout=60,
        )
        chat_completion = client.chat.completions.create(
            model="gpt-4o-mini-2024-07-18",
            messages=conversation,
            n=1,
            seed=42,
            temperature=0.50,
            max_tokens=token_limit,
            logprobs=False,
            presence_penalty=0.20,
            frequency_penalty=0.20,
        )
        # The reply is returned unmodified apart from stripping, Markdown
        # markers included, in both languages.
        summary = "" + chat_completion.choices[0].message.content.strip()
    except Exception as err:
        print(str(err), "Response Error")
        return card, "Response Error"

    return card, summary


# TCGA covers 33 cancer types.
# Each project identifier maps to [English full name, Chinese full name].
project_name_TCGA = {
    "TCGA-ACC": ["adrenocortical carcinoma", "肾上腺皮质癌"],
    "TCGA-BLCA": ["bladder urothelial carcinoma", "膀胱尿路上皮癌"],
    "TCGA-BRCA": ["breast invasive carcinoma", "浸润性乳腺癌"],
    "TCGA-CESC": [
        "cervical squamous cell carcinoma and endocervical adenocarcinoma",
        "宫颈鳞状细胞癌与宫颈内膜腺癌",
    ],
    "TCGA-CHOL": ["cholangiocarcinoma", "胆管癌"],
    "TCGA-COAD": ["colon adenocarcinoma", "结肠腺癌"],
    "TCGA-DLBC": [
        "lymphoid neoplasm diffuse large B-cell lymphoma",
        "弥漫性大 B 细胞淋巴瘤",
    ],
    "TCGA-ESCA": ["esophageal carcinoma", "食道癌"],
    "TCGA-GBM": ["glioblastoma multiforme", "多形性胶质母细胞瘤"],
    "TCGA-HNSC": ["head and neck squamous cell carcinoma", "头颈部鳞状细胞癌"],
    "TCGA-KICH": ["kidney chromophobe", "肾嫌色细胞癌"],
    "TCGA-KIRC": ["kidney renal clear cell carcinoma", "肾透明细胞癌"],
    "TCGA-KIRP": ["kidney renal papillary cell carcinoma", "乳头状肾细胞癌"],
    "TCGA-LAML": ["acute myeloid leukemia", "急性髓系白血病"],
    "TCGA-LGG": ["brain lower grade glioma", "低级别脑胶质瘤"],
    "TCGA-LIHC": ["liver hepatocellular carcinoma", "肝细胞癌"],
    "TCGA-LUAD": ["lung adenocarcinoma", "肺腺癌"],
    "TCGA-LUSC": ["lung squamous cell carcinoma", "肺鳞状细胞癌"],
    "TCGA-MESO": ["mesothelioma", "间皮瘤"],
    "TCGA-OV": ["ovarian serous cystadenocarcinoma", "卵巢浆液性囊腺癌"],
    "TCGA-PAAD": ["pancreatic adenocarcinoma", "胰腺腺癌"],
    "TCGA-PCPG": ["pheochromocytoma and paraganglioma", "嗜铬细胞瘤和副神经节瘤"],
    "TCGA-PRAD": ["prostate adenocarcinoma", "前列腺腺癌"],
    "TCGA-READ": ["rectum adenocarcinoma", "直肠腺癌"],
    "TCGA-SARC": ["sarcoma", "肉瘤"],
    "TCGA-SKCM": ["skin cutaneous melanoma", "皮肤黑色素瘤"],
    "TCGA-STAD": ["stomach adenocarcinoma", "胃腺癌"],
    "TCGA-TGCT": ["testicular germ cell tumors", "睾丸生殖细胞肿瘤"],
    "TCGA-THCA": ["thyroid carcinoma", "甲状腺癌"],
    "TCGA-THYM": ["thymoma", "胸腺瘤"],
    "TCGA-UCEC": ["uterine corpus endometrial carcinoma", "子宫体子宫内膜癌"],
    "TCGA-UCS": ["uterine carcinosarcoma", "子宫癌肉瘤"],
    "TCGA-UVM": ["uveal melanoma", "眼内（葡萄膜）黑色素瘤"],
}

# Set of accepted project identifiers, used for membership tests and as the
# candidate pool for closest-match lookup.
valid_strings = set(project_name_TCGA)

# The interface description accepts Markdown and HTML.
# English rendering of the heading below: "Abbreviations, Full Names and
# Descriptions of All Cancer Types Covered by TCGA Project."
desc = """<h1 align="center" style="font-family: KaiTi, sans-serif; font-size: 22px; color: #00FF7F;">🎉 TCGA 项目涉及的所有癌症类型的缩写、中英文全称和描述 🧬</h1>"""

# Result panes, in the order `demo` returns its values:
#   1. the full name of the queried cancer type;
#   2. a quick overview of that cancer type.
outputs = [
    gr.Textbox(label="🔎 1. 您查询的 TCGA 项目的癌症类型", show_copy_button=True),
    gr.Textbox(label="👩‍⚕️ 2. 迅速了解这种癌症类型的信息", show_copy_button=True),
]

# Input widgets, in the order of `demo`'s parameters.
inputs = [
    # "Please enter the name of the TCGA project you want to query, such as
    # TCGA-READ." Custom values are allowed; `demo` resolves them itself.
    gr.Dropdown(
        label="⌨️ 请输入您要查询的 TCGA 项目名称，如 TCGA-READ",
        choices=list(project_name_TCGA),
        value="TCGA-READ",
        allow_custom_value=True,
        filterable=True,
        interactive=True,
    ),
    # "Output language currently supports only Chinese and English."
    gr.Dropdown(
        label="👨‍💻 输出语言目前仅支持中文和英文",
        choices=["Chinese", "English"],
        value="Chinese",
        allow_custom_value=False,
    ),
]

my_demo = gr.Interface(
    fn=demo,
    inputs=inputs,
    outputs=outputs,
    submit_btn=gr.Button("提交", variant="primary"),  # "Submit"
    clear_btn=gr.Button("清除", variant="secondary"),  # "Clear"
    examples=[["TCGA-READ", "Chinese"], ["TCGA-COAD", "English"]],
    cache_examples=True,
    description=desc,
    theme="JohnSmith9982/small_and_pretty",
)
my_demo.launch(show_api=False, show_error=True)