Update app.py

app.py (CHANGED)
@@ -32,13 +32,12 @@ def get_extraction_config():
     ]
     return prompt_description, examples
 
-# --- 2. Gradio core handler (
+# --- 2. Gradio core handler (fixed) ---
 def process_input_and_visualize(input_text, input_file):
     """
     Accept text or file input, parse it, process it with LangExtract's advanced features, and return the results.
     """
     source_text = ""
-    # Handle an uploaded file first
     if input_file is not None:
         try:
             with fitz.open(input_file.name) as doc:
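Note: the `for page in doc:` loop that feeds `page.get_text()` sits in the elided context between this hunk and the next. For reference, a minimal standalone sketch of the same PDF branch, assuming PyMuPDF (the `fitz` import); the helper name is illustrative and not part of the app:

    import fitz  # PyMuPDF

    def pdf_to_text(path: str) -> str:
        """Concatenate the plain text of every page in a PDF."""
        text = ""
        with fitz.open(path) as doc:   # raises on unreadable/corrupt files
            for page in doc:           # pages iterate in document order
                text += page.get_text()
        return text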
@@ -46,47 +45,40 @@ def process_input_and_visualize(input_text, input_file):
                 source_text += page.get_text()
         except Exception as e:
             raise gr.Error(f"Failed to parse the PDF file: {e}")
-    # Otherwise, use the textbox content
     elif input_text and input_text.strip():
         source_text = input_text
-    # If both are empty
     else:
         return None, "Please enter text or upload a PDF file...", None, None, None
 
-    # --- Core LangExtract extraction logic ---
     prompt, examples = get_extraction_config()
-
-    # Get the API key from the environment (it must be set as a Secret on Hugging Face)
     api_key = os.environ.get("LANGEXTRACT_API_KEY")
     if not api_key:
-        raise gr.Error("Error: LANGEXTRACT_API_KEY is not configured on the server.
+        raise gr.Error("Error: LANGEXTRACT_API_KEY is not configured on the server.")
 
-    # --- The real LangExtract call, with advanced parameters ---
-    # All mock code has been removed; the API is called directly
     try:
+        # --- Bug fix and optimization ---
+        # Original code: max_workers=10, which exceeds the free API limit of 2 requests per minute.
+        # Fix: lower max_workers to 2 to match the free-tier quota.
+        # Optimization: use a Gemini Flash model, which is faster, cheaper, and usually has a more generous free quota.
         result = lx.extract(
             text_or_documents=source_text,
             prompt_description=prompt,
             examples=examples,
-            model_id="gemini-2.5-
+            model_id="gemini-2.5-flash",  # the Flash model is recommended
             api_key=api_key,
-            #
-            max_workers=10,
-
-            max_char_buffer=1500  # maximum characters per text chunk, used for smart chunking
+            max_workers=2,  # **key fix**: reduce parallel workers from 10 to 2
+            extraction_passes=2,
+            max_char_buffer=1500
         )
     except Exception as e:
-        # Catch errors from the API call or any other processing step
        raise gr.Error(f"An error occurred during information extraction: {e}")
 
     # 1. Prepare the highlighted-text output for named entity recognition (NER)
     highlighted_text = []
     last_pos = 0
-    # Ensure entities are sorted by position so highlighting is applied correctly
     sorted_extractions = sorted(result.extractions, key=lambda e: e.char_interval.start_pos)
     for entity in sorted_extractions:
         start, end = entity.char_interval.start_pos, entity.char_interval.end_pos
-        # Check that positions are valid, to guard against parsing errors
         if start >= last_pos and end <= len(source_text):
             highlighted_text.append((source_text[last_pos:start], None))
             highlighted_text.append((entity.extraction_text, entity.extraction_class))
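Note: the fix above lowers `max_workers` to stay under the free-tier request quota. If quota errors still surface (`extraction_passes=2` roughly doubles the request count), a retry with exponential backoff is a complementary option. A sketch under that assumption; the helper and its parameters are hypothetical, not part of the diff:

    import time

    def call_with_backoff(call, retries=3, base_delay=30.0):
        """Retry a callable, sleeping 30s, then 60s, between attempts."""
        for attempt in range(retries):
            try:
                return call()
            except Exception:  # ideally narrow this to the quota/429 error type
                if attempt == retries - 1:
                    raise
                time.sleep(base_delay * (2 ** attempt))

    # usage: result = call_with_backoff(lambda: lx.extract(...))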
@@ -114,14 +106,11 @@ def process_input_and_visualize(input_text, input_file):
     session_id = str(uuid.uuid4())
     output_dir = "/tmp"
     os.makedirs(output_dir, exist_ok=True)
-
     jsonl_filename = f"extraction_{session_id}.jsonl"
     jsonl_path = os.path.join(output_dir, jsonl_filename)
     html_path = os.path.join(output_dir, f"visualization_{session_id}.html")
-
     lx.io.save_annotated_documents([result], output_name=jsonl_filename, output_dir=output_dir)
     html_content = lx.visualize(jsonl_path)
-
     with open(html_path, "w", encoding="utf-8") as f:
         f.write(html_content)
 
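Note: `lx.io.save_annotated_documents` and `lx.visualize` follow LangExtract's documented save-then-visualize pattern. Depending on version and environment, `lx.visualize` may return either a plain HTML string or an object that wraps it (exposing `.data`), so a defensive write is a reasonable guard; a sketch:

    html_content = lx.visualize(jsonl_path)
    with open(html_path, "w", encoding="utf-8") as f:
        # some environments wrap the HTML in an object exposing .data
        f.write(html_content.data if hasattr(html_content, "data") else html_content)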
@@ -130,7 +119,8 @@
 
 # --- 3. Build the Gradio UI (unchanged) ---
 with gr.Blocks(theme=gr.themes.Soft(), title="Medication Information Extractor") as demo:
-
+    # ... (the UI code needs no changes; omitted here for brevity) ...
+    gr.Markdown("# ⚕️ LangExtract Medication Information Extractor")
     gr.Markdown("An intelligent tool based on large language models that automatically extracts medications, dosages, and other information from **clinical text** or **PDF files** and links them into structured records.")
 
     with gr.Row():
@@ -175,5 +165,6 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Medication Information Extractor") as demo:
         outputs=[input_textbox, input_file_uploader]
     )
 
+
 if __name__ == "__main__":
     demo.launch()
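Note: the handler above depends on `get_extraction_config()`, whose body lies above the first hunk and is not shown. For orientation, a minimal sketch of such a config in LangExtract's documented style: the class names (`lx.data.ExampleData`, `lx.data.Extraction`) follow the library, while the prompt wording and example text here are made up:

    import langextract as lx

    def get_extraction_config():
        prompt_description = "Extract medications and dosages in order of appearance."
        examples = [
            lx.data.ExampleData(
                text="Patient was given 250 mg IV Cefazolin TID.",
                extractions=[
                    lx.data.Extraction(extraction_class="medication", extraction_text="Cefazolin"),
                    lx.data.Extraction(extraction_class="dosage", extraction_text="250 mg"),
                ],
            )
        ]
        return prompt_description, examples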
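Note on the highlighting loop in the second hunk: Gradio's `HighlightedText` component consumes a list of `(text, label)` tuples, where `label=None` marks an unhighlighted span. The hunk cuts off mid-loop; assuming the elided lines advance `last_pos` and flush the trailing text, the standard pattern completes like this (a sketch, not necessarily the author's exact code):

    highlighted_text = []
    last_pos = 0
    for entity in sorted_extractions:
        start, end = entity.char_interval.start_pos, entity.char_interval.end_pos
        if start >= last_pos and end <= len(source_text):
            highlighted_text.append((source_text[last_pos:start], None))  # plain gap
            highlighted_text.append((entity.extraction_text, entity.extraction_class))
            last_pos = end  # continue after this entity
    highlighted_text.append((source_text[last_pos:], None))  # trailing plain text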