Spaces:

leonsimon23
/

pharmextract

Sleeping

App Files Files Community

leonsimon23 committed on Nov 3, 2025

Commit

90e3caa

verified ·

1 Parent(s): 01ab923

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -56

app.py CHANGED Viewed

@@ -32,18 +32,15 @@ def get_extraction_config():
     ]
     return prompt_description, examples
-# --- 2. Gradio 的核心处理函数 (已更新) ---
-# 新增了对 PDF 文件处理的支持
 def process_input_and_visualize(input_text, input_file):
     """
-    接收文本或文件输入，解析内容，调用 LangExtract 处理，并返回结果给 Gradio。
     """
     source_text = ""
     # 优先处理上传的文件
     if input_file is not None:
         try:
-            # 使用 fitz (PyMuPDF) 打开 PDF 文件
-            # input_file.name 会提供 Gradio 保存的临时文件路径
             with fitz.open(input_file.name) as doc:
                 for page in doc:
                     source_text += page.get_text()
@@ -59,40 +56,41 @@ def process_input_and_visualize(input_text, input_file):
     # --- LangExtract 核心提取逻辑 ---
     prompt, examples = get_extraction_config()
-    # --- 模拟结果 (用于演示) ---
-    from langextract.data import AnnotatedDocument, Extraction, CharInterval
-    # 这里的模拟数据的位置信息可能与实际PDF文本不符，仅作演示
-    result = AnnotatedDocument(
-        text=source_text,
-        extractions=[
-            # 注意：这些硬编码的位置仅适用于特定的示例文本
-            Extraction(extraction_class='药物', extraction_text='Lisinopril', attributes={'medication_group': 'Lisinopril'}, char_interval=CharInterval(start_pos=28, end_pos=38)),
-            Extraction(extraction_class='药物', extraction_text='Metformin', attributes={'medication_group': 'Metformin'}, char_interval=CharInterval(start_pos=43, end_pos=52)),
-        ]
-    )
-    # --- 模拟结束 ---
-    # --- 真实的 LangExtract 调用 (部署时请取消注释) ---
-    # api_key = os.environ.get("LANGEXTRACT_API_KEY")
-    # if not api_key:
-    #     raise gr.Error("错误：未设置 LANGEXTRACT_API_KEY。")
-    # result = lx.extract(
-    #     text_or_documents=source_text,
-    #     prompt_description=prompt,
-    #     examples=examples,
-    #     model_id="gemini-2.5-pro",
-    #     api_key=api_key
-    # )
     # 1. 准备命名实体识别 (NER) 的高亮文本输出
     highlighted_text = []
     last_pos = 0
     sorted_extractions = sorted(result.extractions, key=lambda e: e.char_interval.start_pos)
     for entity in sorted_extractions:
         start, end = entity.char_interval.start_pos, entity.char_interval.end_pos
-        highlighted_text.append((source_text[last_pos:start], None))
-        highlighted_text.append((entity.extraction_text, entity.extraction_class))
-        last_pos = end
     highlighted_text.append((source_text[last_pos:], None))
     # 2. 准备关系提取 (RE) 的结构化 Markdown 输出
@@ -102,12 +100,15 @@ def process_input_and_visualize(input_text, input_file):
         medication_groups.setdefault(group_name, []).append(extraction)
     structured_output = "### 结构化提取结果\n\n"
-    for med_name, extractions in medication_groups.items():
-        structured_output += f"#### 药物组: {med_name}\n"
-        for extraction in sorted(extractions, key=lambda e: e.char_interval.start_pos):
-            pos_info = f" (位置: {extraction.char_interval.start_pos}-{extraction.char_interval.end_pos})"
-            structured_output += f"- **{extraction.extraction_class}**: {extraction.extraction_text}{pos_info}\n"
-        structured_output += "\n"
     # 3. 生成并保存交互式可视化文件
     session_id = str(uuid.uuid4())
@@ -127,26 +128,18 @@ def process_input_and_visualize(input_text, input_file):
     return highlighted_text, structured_output, html_path, html_path, jsonl_path
-# --- 3. 创建 Gradio 应用界面 (已更新) ---
 with gr.Blocks(theme=gr.themes.Soft(), title="药物信息提取器") as demo:
-    gr.Markdown("# ⚕️ LangExtract 药物信息提取器")
     gr.Markdown("一个基于大型语言模型的智能工具，可从**临床文本**或 **PDF 文件**中自动提取药物、剂量等信息，并进行结构化关联。")
     with gr.Row():
-        # 左侧为输入区，使用 Tab 切换
         with gr.Column(scale=1):
             with gr.Tabs():
                 with gr.TabItem("📄 临床文本输入"):
-                    input_textbox = gr.Textbox(
-                        lines=15,
-                        label="粘贴临床笔记",
-                        placeholder=textwrap.dedent("""请在此处粘贴文本...""")
-                    )
                 with gr.TabItem("📁 上传PDF文件"):
-                    input_file_uploader = gr.File(
-                        label="选择一个 PDF 文件进行分析",
-                        file_types=['.pdf']
-                    )
             submit_btn = gr.Button("🧠 提取信息", variant="primary")
             gr.Examples(
@@ -154,35 +147,29 @@ with gr.Blocks(theme=gr.themes.Soft(), title="药物信息提取器") as demo:
                     "The patient was prescribed Lisinopril and Metformin last month.\nHe takes the Lisinopril 10mg daily for hypertension, but often misses his Metformin 500mg dose which should be taken twice daily for diabetes.",
                     "Patient took 400 mg PO Ibuprofen q4h for two days for a headache.",
                 ],
-                inputs=input_textbox, # 示例仅与文本框关联
                 label="示例文本 (点击自动填充到上方文本框)"
             )
-        # 右侧为输出区
         with gr.Column(scale=2):
             with gr.Tabs():
                 with gr.TabItem("📊 总览 (NER & RE)"):
                     gr.Markdown("### 命名实体识别 (NER) - 文本高亮")
                     output_highlight = gr.HighlightedText(label="实体高亮显示", color_map={"药物": "#FF6347", "剂量": "#FFA500", "频率": "#32CD32", "病症": "#4169E1"})
                     output_structured = gr.Markdown(label="结构化关系")
                 with gr.TabItem("🌐 交互式可视化"):
                     gr.Markdown("### 交互式可视化图表")
                     output_html_viewer = gr.HTML(label="交互式图表 (可缩放和筛选)")
                 with gr.TabItem("📁 文件下载"):
                     gr.Markdown("### 下载提取结果")
                     download_html = gr.File(label="下载交互式 HTML 文件")
                     download_jsonl = gr.File(label="下载 JSONL 数据文件")
-    # 设置按钮的点击事件
-    # 输入现在是文本框和文件上传器
     submit_btn.click(
         fn=process_input_and_visualize,
         inputs=[input_textbox, input_file_uploader],
         outputs=[output_highlight, output_structured, output_html_viewer, download_html, download_jsonl]
     ).then(
-        # 任务完成后，清空输入框和文件上传器，准备下一次输入
         lambda: (None, None),
         inputs=None,
         outputs=[input_textbox, input_file_uploader]

     ]
     return prompt_description, examples
+# --- 2. Gradio 的核心处理函数 (生产级版本) ---
 def process_input_and_visualize(input_text, input_file):
     """
+    接收文本或文件输入，解析内容，调用 LangExtract 的高级功能进行处理，并返回结果。
     """
     source_text = ""
     # 优先处理上传的文件
     if input_file is not None:
         try:
             with fitz.open(input_file.name) as doc:
                 for page in doc:
                     source_text += page.get_text()
     # --- LangExtract 核心提取逻辑 ---
     prompt, examples = get_extraction_config()
+    # 从环境变量中获取 API Key (在 Hugging Face 中必须设置为 Secret)
+    api_key = os.environ.get("LANGEXTRACT_API_KEY")
+    if not api_key:
+        raise gr.Error("错误：服务器未配置 LANGEXTRACT_API_KEY。请联系管理员在 Hugging Face Space 的 Secrets 中添加它。")
+    # --- 真实的、带高级参数的 LangExtract 调用 ---
+    # 这里我们移除了所有模拟代码，直接调用 API
+    try:
+        result = lx.extract(
+            text_or_documents=source_text,
+            prompt_description=prompt,
+            examples=examples,
+            model_id="gemini-2.5-pro", # 使用强大的模型以获得最佳效果
+            api_key=api_key,
+            # --- 智能处理长文本的关键参数 ---
+            max_workers=10,          # 启用并行处理以加速，设置10个工作线程
+            extraction_passes=2,     # 执行两次提取以提高召回率 (可以设为3以获得更高精度，但会更慢)
+            max_char_buffer=1500     # 设置文本块的最大字符数，用于智能分块
+        )
+    except Exception as e:
+        # 捕获API调用或其他处理中可能发生的错误
+        raise gr.Error(f"信息提取过程中发生错误: {e}")
     # 1. 准备命名实体识别 (NER) 的高亮文本输出
     highlighted_text = []
     last_pos = 0
+    # 确保实体按位置排序，以便正确高亮显示
     sorted_extractions = sorted(result.extractions, key=lambda e: e.char_interval.start_pos)
     for entity in sorted_extractions:
         start, end = entity.char_interval.start_pos, entity.char_interval.end_pos
+        # 检查位置是否有效，防止解析错误
+        if start >= last_pos and end <= len(source_text):
+            highlighted_text.append((source_text[last_pos:start], None))
+            highlighted_text.append((entity.extraction_text, entity.extraction_class))
+            last_pos = end
     highlighted_text.append((source_text[last_pos:], None))
     # 2. 准备关系提取 (RE) 的结构化 Markdown 输出
         medication_groups.setdefault(group_name, []).append(extraction)
     structured_output = "### 结构化提取结果\n\n"
+    if not medication_groups:
+        structured_output += "未提取到任何药物信息。"
+    else:
+        for med_name, extractions in medication_groups.items():
+            structured_output += f"#### 药物组: {med_name}\n"
+            for extraction in sorted(extractions, key=lambda e: e.char_interval.start_pos):
+                pos_info = f" (位置: {extraction.char_interval.start_pos}-{extraction.char_interval.end_pos})"
+                structured_output += f"- **{extraction.extraction_class}**: {extraction.extraction_text}{pos_info}\n"
+            structured_output += "\n"
     # 3. 生成并保存交互式可视化文件
     session_id = str(uuid.uuid4())
     return highlighted_text, structured_output, html_path, html_path, jsonl_path
+# --- 3. 创建 Gradio 应用界面 (保持不变) ---
 with gr.Blocks(theme=gr.themes.Soft(), title="药物信息提取器") as demo:
+    gr.Markdown("# ⚕️ 基于大模型的药物信息提取器")
     gr.Markdown("一个基于大型语言模型的智能工具，可从**临床文本**或 **PDF 文件**中自动提取药物、剂量等信息，并进行结构化关联。")
     with gr.Row():
         with gr.Column(scale=1):
             with gr.Tabs():
                 with gr.TabItem("📄 临床文本输入"):
+                    input_textbox = gr.Textbox(lines=15, label="粘贴临床笔记", placeholder="请在此处粘贴文本...")
                 with gr.TabItem("📁 上传PDF文件"):
+                    input_file_uploader = gr.File(label="选择一个 PDF 文件进行分析", file_types=['.pdf'])
             submit_btn = gr.Button("🧠 提取信息", variant="primary")
             gr.Examples(
                     "The patient was prescribed Lisinopril and Metformin last month.\nHe takes the Lisinopril 10mg daily for hypertension, but often misses his Metformin 500mg dose which should be taken twice daily for diabetes.",
                     "Patient took 400 mg PO Ibuprofen q4h for two days for a headache.",
                 ],
+                inputs=input_textbox,
                 label="示例文本 (点击自动填充到上方文本框)"
             )
         with gr.Column(scale=2):
             with gr.Tabs():
                 with gr.TabItem("📊 总览 (NER & RE)"):
                     gr.Markdown("### 命名实体识别 (NER) - 文本高亮")
                     output_highlight = gr.HighlightedText(label="实体高亮显示", color_map={"药物": "#FF6347", "剂量": "#FFA500", "频率": "#32CD32", "病症": "#4169E1"})
                     output_structured = gr.Markdown(label="结构化关系")
                 with gr.TabItem("🌐 交互式可视化"):
                     gr.Markdown("### 交互式可视化图表")
                     output_html_viewer = gr.HTML(label="交互式图表 (可缩放和筛选)")
                 with gr.TabItem("📁 文件下载"):
                     gr.Markdown("### 下载提取结果")
                     download_html = gr.File(label="下载交互式 HTML 文件")
                     download_jsonl = gr.File(label="下载 JSONL 数据文件")
     submit_btn.click(
         fn=process_input_and_visualize,
         inputs=[input_textbox, input_file_uploader],
         outputs=[output_highlight, output_structured, output_html_viewer, download_html, download_jsonl]
     ).then(
         lambda: (None, None),
         inputs=None,
         outputs=[input_textbox, input_file_uploader]