Spaces:

leonsimon23
/

pharmextract

Sleeping

App Files Files Community

leonsimon23 committed on Nov 3, 2025

Commit

69c4dc7

verified ·

1 Parent(s): b4e60d5

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -26

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import textwrap
 import os
 import uuid  # 用于生成唯一的文件名
 import fitz  # 导入 PyMuPDF 库
 # --- 1. 定义 LangExtract 的提取逻辑 (保持不变) ---
 def get_extraction_config():
@@ -32,11 +33,8 @@ def get_extraction_config():
     ]
     return prompt_description, examples
-# --- 2. Gradio 的核心处理函数 (已修复) ---
 def process_input_and_visualize(input_text, input_file):
-    """
-    接收文本或文件输入，解析内容，调用 LangExtract 的高级功能进行处理，并返回结果。
-    """
     source_text = ""
     if input_file is not None:
         try:
@@ -55,23 +53,44 @@ def process_input_and_visualize(input_text, input_file):
     if not api_key:
         raise gr.Error("错误：服务器未配置 LANGEXTRACT_API_KEY。")
-    try:
-        result = lx.extract(
-            text_or_documents=source_text,
-            prompt_description=prompt,
-            examples=examples,
-            model_id="gemini-2.5-flash",
-            api_key=api_key,
-            max_workers=2,
-            extraction_passes=2,
-            max_char_buffer=1500
-        )
-    except Exception as e:
-        raise gr.Error(f"信息提取过程中发生错误: {e}")
-    grounded_extractions = [e for e in result.extractions if e.char_interval]
-    # 1. 准备命名实体识别 (NER) 的高亮文本输出
     highlighted_text = []
     last_pos = 0
     sorted_extractions = sorted(grounded_extractions, key=lambda e: e.char_interval.start_pos)
@@ -83,7 +102,6 @@ def process_input_and_visualize(input_text, input_file):
             last_pos = end
     highlighted_text.append((source_text[last_pos:], None))
-    # 2. 准备关系提取 (RE) 的结构化 Markdown 输出
     medication_groups = {}
     for extraction in result.extractions:
         group_name = extraction.attributes.get("medication_group", "未分组")
@@ -102,19 +120,13 @@ def process_input_and_visualize(input_text, input_file):
                 structured_output += f"- **{extraction.extraction_class}**: {extraction.extraction_text}{pos_info}\n"
             structured_output += "\n"
-    # 3. 生成并保存交互式可视化文件
     session_id = str(uuid.uuid4())
     output_dir = "/tmp"
     os.makedirs(output_dir, exist_ok=True)
     jsonl_filename = f"extraction_{session_id}.jsonl"
     jsonl_path = os.path.join(output_dir, jsonl_filename)
     html_path = os.path.join(output_dir, f"visualization_{session_id}.html")
-    # --- 关键修复：修正函数名的拼写错误 ---
-    # 错误: lx.io.save_annot_ated_documents(...)
-    # 正确: lx.io.save_annotated_documents(...)
     lx.io.save_annotated_documents([result], output_name=jsonl_filename, output_dir=output_dir)
     html_content = lx.visualize(jsonl_path)
     with open(html_path, "w", encoding="utf-8") as f:
         f.write(html_content)
@@ -124,6 +136,7 @@ def process_input_and_visualize(input_text, input_file):
 # --- 3. 创建 Gradio 应用界面 (保持不变) ---
 with gr.Blocks(theme=gr.themes.Soft(), title="药物信息提取器") as demo:
     gr.Markdown("# ⚕️ LangExtract 药物信息提取器")
     gr.Markdown("一个基于大型语言模型的智能工具，可从**临床文本**或 **PDF 文件**中自动提取药物、剂量等信息，并进行结构化关联。")

 import os
 import uuid  # 用于生成唯一的文件名
 import fitz  # 导入 PyMuPDF 库
+import time  # 导入 time 模块用于等待
 # --- 1. 定义 LangExtract 的提取逻辑 (保持不变) ---
 def get_extraction_config():
     ]
     return prompt_description, examples
+# --- 2. Gradio 的核心处理函数 (已增加自动重试功能) ---
 def process_input_and_visualize(input_text, input_file):
     source_text = ""
     if input_file is not None:
         try:
     if not api_key:
         raise gr.Error("错误：服务器未配置 LANGEXTRACT_API_KEY。")
+    # --- 新增：自动重试逻辑 ---
+    max_retries = 3
+    retry_delay = 5  # 秒
+    result = None
+    for attempt in range(max_retries):
+        try:
+            print(f"Attempting to call Gemini API, attempt #{attempt + 1}")
+            result = lx.extract(
+                text_or_documents=source_text,
+                prompt_description=prompt,
+                examples=examples,
+                model_id="gemini-2.5-flash",
+                api_key=api_key,
+                max_workers=2,
+                extraction_passes=2,
+                max_char_buffer=1500
+            )
+            # 如果成功，跳出循环
+            print("API call successful.")
+            break
+        except Exception as e:
+            # Catches every Exception raised by the call (not only langextract-specific errors)
+            print(f"Attempt #{attempt + 1} failed with error: {e}")
+            if "503" in str(e) and attempt < max_retries - 1:
+                # 如果是 503 错误，并且还有重试机会，则等待后重试
+                print(f"Server overloaded (503). Retrying in {retry_delay} seconds...")
+                time.sleep(retry_delay)
+            else:
+                # 如果是其他错误，或已达到最大重试次数，则直接抛出异常
+                raise gr.Error(f"信息提取过程中发生错误: {e}")
+    # Defensive check: a failed final attempt already raises inside the loop, so result is None here only if lx.extract itself returned None
+    if result is None:
+        raise gr.Error("所有重试均失败，无法从API获取结果。请稍后再试。")
+    # --- 后续处理代码保持不变 ---
+    grounded_extractions = [e for e in result.extractions if e.char_interval]
     highlighted_text = []
     last_pos = 0
     sorted_extractions = sorted(grounded_extractions, key=lambda e: e.char_interval.start_pos)
             last_pos = end
     highlighted_text.append((source_text[last_pos:], None))
     medication_groups = {}
     for extraction in result.extractions:
         group_name = extraction.attributes.get("medication_group", "未分组")
                 structured_output += f"- **{extraction.extraction_class}**: {extraction.extraction_text}{pos_info}\n"
             structured_output += "\n"
     session_id = str(uuid.uuid4())
     output_dir = "/tmp"
     os.makedirs(output_dir, exist_ok=True)
     jsonl_filename = f"extraction_{session_id}.jsonl"
     jsonl_path = os.path.join(output_dir, jsonl_filename)
     html_path = os.path.join(output_dir, f"visualization_{session_id}.html")
     lx.io.save_annotated_documents([result], output_name=jsonl_filename, output_dir=output_dir)
     html_content = lx.visualize(jsonl_path)
     with open(html_path, "w", encoding="utf-8") as f:
         f.write(html_content)
 # --- 3. 创建 Gradio 应用界面 (保持不变) ---
 with gr.Blocks(theme=gr.themes.Soft(), title="药物信息提取器") as demo:
+    # ... (界面部分代码无需修改) ...
     gr.Markdown("# ⚕️ LangExtract 药物信息提取器")
     gr.Markdown("一个基于大型语言模型的智能工具，可从**临床文本**或 **PDF 文件**中自动提取药物、剂量等信息，并进行结构化关联。")