Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,6 +4,7 @@ import textwrap
|
|
| 4 |
import os
|
| 5 |
import uuid # 用于生成唯一的文件名
|
| 6 |
import fitz # 导入 PyMuPDF 库
|
|
|
|
| 7 |
|
| 8 |
# --- 1. 定义 LangExtract 的提取逻辑 (保持不变) ---
|
| 9 |
def get_extraction_config():
|
|
@@ -32,11 +33,8 @@ def get_extraction_config():
|
|
| 32 |
]
|
| 33 |
return prompt_description, examples
|
| 34 |
|
| 35 |
-
# --- 2. Gradio 的核心处理函数 (
|
| 36 |
def process_input_and_visualize(input_text, input_file):
|
| 37 |
-
"""
|
| 38 |
-
接收文本或文件输入,解析内容,调用 LangExtract 的高级功能进行处理,并返回结果。
|
| 39 |
-
"""
|
| 40 |
source_text = ""
|
| 41 |
if input_file is not None:
|
| 42 |
try:
|
|
@@ -55,23 +53,44 @@ def process_input_and_visualize(input_text, input_file):
|
|
| 55 |
if not api_key:
|
| 56 |
raise gr.Error("错误:服务器未配置 LANGEXTRACT_API_KEY。")
|
| 57 |
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
examples=examples,
|
| 63 |
-
model_id="gemini-2.5-flash",
|
| 64 |
-
api_key=api_key,
|
| 65 |
-
max_workers=2,
|
| 66 |
-
extraction_passes=2,
|
| 67 |
-
max_char_buffer=1500
|
| 68 |
-
)
|
| 69 |
-
except Exception as e:
|
| 70 |
-
raise gr.Error(f"信息提取过程中发生错误: {e}")
|
| 71 |
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
-
#
|
|
|
|
| 75 |
highlighted_text = []
|
| 76 |
last_pos = 0
|
| 77 |
sorted_extractions = sorted(grounded_extractions, key=lambda e: e.char_interval.start_pos)
|
|
@@ -83,7 +102,6 @@ def process_input_and_visualize(input_text, input_file):
|
|
| 83 |
last_pos = end
|
| 84 |
highlighted_text.append((source_text[last_pos:], None))
|
| 85 |
|
| 86 |
-
# 2. 准备关系提取 (RE) 的结构化 Markdown 输出
|
| 87 |
medication_groups = {}
|
| 88 |
for extraction in result.extractions:
|
| 89 |
group_name = extraction.attributes.get("medication_group", "未分组")
|
|
@@ -102,19 +120,13 @@ def process_input_and_visualize(input_text, input_file):
|
|
| 102 |
structured_output += f"- **{extraction.extraction_class}**: {extraction.extraction_text}{pos_info}\n"
|
| 103 |
structured_output += "\n"
|
| 104 |
|
| 105 |
-
# 3. 生成并保存交互式可视化文件
|
| 106 |
session_id = str(uuid.uuid4())
|
| 107 |
output_dir = "/tmp"
|
| 108 |
os.makedirs(output_dir, exist_ok=True)
|
| 109 |
jsonl_filename = f"extraction_{session_id}.jsonl"
|
| 110 |
jsonl_path = os.path.join(output_dir, jsonl_filename)
|
| 111 |
html_path = os.path.join(output_dir, f"visualization_{session_id}.html")
|
| 112 |
-
|
| 113 |
-
# --- 关键修复:修正函数名的拼写错误 ---
|
| 114 |
-
# 错误: lx.io.save_annot_ated_documents(...)
|
| 115 |
-
# 正确: lx.io.save_annotated_documents(...)
|
| 116 |
lx.io.save_annotated_documents([result], output_name=jsonl_filename, output_dir=output_dir)
|
| 117 |
-
|
| 118 |
html_content = lx.visualize(jsonl_path)
|
| 119 |
with open(html_path, "w", encoding="utf-8") as f:
|
| 120 |
f.write(html_content)
|
|
@@ -124,6 +136,7 @@ def process_input_and_visualize(input_text, input_file):
|
|
| 124 |
|
| 125 |
# --- 3. 创建 Gradio 应用界面 (保持不变) ---
|
| 126 |
with gr.Blocks(theme=gr.themes.Soft(), title="药物信息提取器") as demo:
|
|
|
|
| 127 |
gr.Markdown("# ⚕️ LangExtract 药物信息提取器")
|
| 128 |
gr.Markdown("一个基于大型语言模型的智能工具,可从**临床文本**或 **PDF 文件**中自动提取药物、剂量等信息,并进行结构化关联。")
|
| 129 |
|
|
|
|
| 4 |
import os
|
| 5 |
import uuid # 用于生成唯一的文件名
|
| 6 |
import fitz # 导入 PyMuPDF 库
|
| 7 |
+
import time # 导入 time 模块用于等待
|
| 8 |
|
| 9 |
# --- 1. 定义 LangExtract 的提取逻辑 (保持不变) ---
|
| 10 |
def get_extraction_config():
|
|
|
|
| 33 |
]
|
| 34 |
return prompt_description, examples
|
| 35 |
|
| 36 |
+
# --- 2. Gradio 的核心处理函数 (已增加自动重试功能) ---
|
| 37 |
def process_input_and_visualize(input_text, input_file):
|
|
|
|
|
|
|
|
|
|
| 38 |
source_text = ""
|
| 39 |
if input_file is not None:
|
| 40 |
try:
|
|
|
|
| 53 |
if not api_key:
|
| 54 |
raise gr.Error("错误:服务器未配置 LANGEXTRACT_API_KEY。")
|
| 55 |
|
| 56 |
+
# --- 新增:自动重试逻辑 ---
|
| 57 |
+
max_retries = 3
|
| 58 |
+
retry_delay = 5 # 秒
|
| 59 |
+
result = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
+
for attempt in range(max_retries):
|
| 62 |
+
try:
|
| 63 |
+
print(f"Attempting to call Gemini API, attempt #{attempt + 1}")
|
| 64 |
+
result = lx.extract(
|
| 65 |
+
text_or_documents=source_text,
|
| 66 |
+
prompt_description=prompt,
|
| 67 |
+
examples=examples,
|
| 68 |
+
model_id="gemini-2.5-flash",
|
| 69 |
+
api_key=api_key,
|
| 70 |
+
max_workers=2,
|
| 71 |
+
extraction_passes=2,
|
| 72 |
+
max_char_buffer=1500
|
| 73 |
+
)
|
| 74 |
+
# 如果成功,跳出循环
|
| 75 |
+
print("API call successful.")
|
| 76 |
+
break
|
| 77 |
+
except Exception as e:
|
| 78 |
+
# 捕获调用过程中抛出的所有异常(except Exception 并不只限于 langextract 相关的错误)
|
| 79 |
+
print(f"Attempt #{attempt + 1} failed with error: {e}")
|
| 80 |
+
if "503" in str(e) and attempt < max_retries - 1:
|
| 81 |
+
# 如果是 503 错误,并且还有重试机会,则等待后重试
|
| 82 |
+
print(f"Server overloaded (503). Retrying in {retry_delay} seconds...")
|
| 83 |
+
time.sleep(retry_delay)
|
| 84 |
+
else:
|
| 85 |
+
# 如果是其他错误,或已达到最大重试次数,则直接抛出异常
|
| 86 |
+
raise gr.Error(f"信息提取过程中发生错误: {e}")
|
| 87 |
+
|
| 88 |
+
# 防御性检查:最后一次重试失败时,上面的 else 分支已直接抛出异常;只有循环一次都未执行(例如 max_retries 为 0)时,result 才会仍为 None
|
| 89 |
+
if result is None:
|
| 90 |
+
raise gr.Error("所有重试均失败,无法从API获取结果。请稍后再试。")
|
| 91 |
|
| 92 |
+
# --- 后续处理代码保持不变 ---
|
| 93 |
+
grounded_extractions = [e for e in result.extractions if e.char_interval]
|
| 94 |
highlighted_text = []
|
| 95 |
last_pos = 0
|
| 96 |
sorted_extractions = sorted(grounded_extractions, key=lambda e: e.char_interval.start_pos)
|
|
|
|
| 102 |
last_pos = end
|
| 103 |
highlighted_text.append((source_text[last_pos:], None))
|
| 104 |
|
|
|
|
| 105 |
medication_groups = {}
|
| 106 |
for extraction in result.extractions:
|
| 107 |
group_name = extraction.attributes.get("medication_group", "未分组")
|
|
|
|
| 120 |
structured_output += f"- **{extraction.extraction_class}**: {extraction.extraction_text}{pos_info}\n"
|
| 121 |
structured_output += "\n"
|
| 122 |
|
|
|
|
| 123 |
session_id = str(uuid.uuid4())
|
| 124 |
output_dir = "/tmp"
|
| 125 |
os.makedirs(output_dir, exist_ok=True)
|
| 126 |
jsonl_filename = f"extraction_{session_id}.jsonl"
|
| 127 |
jsonl_path = os.path.join(output_dir, jsonl_filename)
|
| 128 |
html_path = os.path.join(output_dir, f"visualization_{session_id}.html")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
lx.io.save_annotated_documents([result], output_name=jsonl_filename, output_dir=output_dir)
|
|
|
|
| 130 |
html_content = lx.visualize(jsonl_path)
|
| 131 |
with open(html_path, "w", encoding="utf-8") as f:
|
| 132 |
f.write(html_content)
|
|
|
|
| 136 |
|
| 137 |
# --- 3. 创建 Gradio 应用界面 (保持不变) ---
|
| 138 |
with gr.Blocks(theme=gr.themes.Soft(), title="药物信息提取器") as demo:
|
| 139 |
+
# ... (界面部分代码无需修改) ...
|
| 140 |
gr.Markdown("# ⚕️ LangExtract 药物信息提取器")
|
| 141 |
gr.Markdown("一个基于大型语言模型的智能工具,可从**临床文本**或 **PDF 文件**中自动提取药物、剂量等信息,并进行结构化关联。")
|
| 142 |
|