leonsimon23 committed on
Commit
69c4dc7
·
verified ·
1 Parent(s): b4e60d5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -26
app.py CHANGED
@@ -4,6 +4,7 @@ import textwrap
4
  import os
5
  import uuid # 用于生成唯一的文件名
6
  import fitz # 导入 PyMuPDF 库
 
7
 
8
  # --- 1. 定义 LangExtract 的提取逻辑 (保持不变) ---
9
  def get_extraction_config():
@@ -32,11 +33,8 @@ def get_extraction_config():
32
  ]
33
  return prompt_description, examples
34
 
35
- # --- 2. Gradio 的核心处理函数 (已修复) ---
36
  def process_input_and_visualize(input_text, input_file):
37
- """
38
- 接收文本或文件输入,解析内容,调用 LangExtract 的高级功能进行处理,并返回结果。
39
- """
40
  source_text = ""
41
  if input_file is not None:
42
  try:
@@ -55,23 +53,44 @@ def process_input_and_visualize(input_text, input_file):
55
  if not api_key:
56
  raise gr.Error("错误:服务器未配置 LANGEXTRACT_API_KEY。")
57
 
58
- try:
59
- result = lx.extract(
60
- text_or_documents=source_text,
61
- prompt_description=prompt,
62
- examples=examples,
63
- model_id="gemini-2.5-flash",
64
- api_key=api_key,
65
- max_workers=2,
66
- extraction_passes=2,
67
- max_char_buffer=1500
68
- )
69
- except Exception as e:
70
- raise gr.Error(f"信息提取过程中发生错误: {e}")
71
 
72
- grounded_extractions = [e for e in result.extractions if e.char_interval]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
- # 1. 准备命名实体识别 (NER) 的高亮文本输出
 
75
  highlighted_text = []
76
  last_pos = 0
77
  sorted_extractions = sorted(grounded_extractions, key=lambda e: e.char_interval.start_pos)
@@ -83,7 +102,6 @@ def process_input_and_visualize(input_text, input_file):
83
  last_pos = end
84
  highlighted_text.append((source_text[last_pos:], None))
85
 
86
- # 2. 准备关系提取 (RE) 的结构化 Markdown 输出
87
  medication_groups = {}
88
  for extraction in result.extractions:
89
  group_name = extraction.attributes.get("medication_group", "未分组")
@@ -102,19 +120,13 @@ def process_input_and_visualize(input_text, input_file):
102
  structured_output += f"- **{extraction.extraction_class}**: {extraction.extraction_text}{pos_info}\n"
103
  structured_output += "\n"
104
 
105
- # 3. 生成并保存交互式可视化文件
106
  session_id = str(uuid.uuid4())
107
  output_dir = "/tmp"
108
  os.makedirs(output_dir, exist_ok=True)
109
  jsonl_filename = f"extraction_{session_id}.jsonl"
110
  jsonl_path = os.path.join(output_dir, jsonl_filename)
111
  html_path = os.path.join(output_dir, f"visualization_{session_id}.html")
112
-
113
- # --- 关键修复:修正函数名的拼写错误 ---
114
- # 错误: lx.io.save_annot_ated_documents(...)
115
- # 正确: lx.io.save_annotated_documents(...)
116
  lx.io.save_annotated_documents([result], output_name=jsonl_filename, output_dir=output_dir)
117
-
118
  html_content = lx.visualize(jsonl_path)
119
  with open(html_path, "w", encoding="utf-8") as f:
120
  f.write(html_content)
@@ -124,6 +136,7 @@ def process_input_and_visualize(input_text, input_file):
124
 
125
  # --- 3. 创建 Gradio 应用界面 (保持不变) ---
126
  with gr.Blocks(theme=gr.themes.Soft(), title="药物信息提取器") as demo:
 
127
  gr.Markdown("# ⚕️ LangExtract 药物信息提取器")
128
  gr.Markdown("一个基于大型语言模型的智能工具,可从**临床文本**或 **PDF 文件**中自动提取药物、剂量等信息,并进行结构化关联。")
129
 
 
4
  import os
5
  import uuid # 用于生成唯一的文件名
6
  import fitz # 导入 PyMuPDF 库
7
+ import time # 导入 time 模块用于等待
8
 
9
  # --- 1. 定义 LangExtract 的提取逻辑 (保持不变) ---
10
  def get_extraction_config():
 
33
  ]
34
  return prompt_description, examples
35
 
36
+ # --- 2. Gradio 的核心处理函数 (已增加自动重试功能) ---
37
  def process_input_and_visualize(input_text, input_file):
 
 
 
38
  source_text = ""
39
  if input_file is not None:
40
  try:
 
53
  if not api_key:
54
  raise gr.Error("错误:服务器未配置 LANGEXTRACT_API_KEY。")
55
 
56
+ # --- 新增:自动重试逻辑 ---
57
+ max_retries = 3
58
+ retry_delay = 5 # 秒
59
+ result = None
 
 
 
 
 
 
 
 
 
60
 
61
+ for attempt in range(max_retries):
62
+ try:
63
+ print(f"Attempting to call Gemini API, attempt #{attempt + 1}")
64
+ result = lx.extract(
65
+ text_or_documents=source_text,
66
+ prompt_description=prompt,
67
+ examples=examples,
68
+ model_id="gemini-2.5-flash",
69
+ api_key=api_key,
70
+ max_workers=2,
71
+ extraction_passes=2,
72
+ max_char_buffer=1500
73
+ )
74
+ # 如果成功,跳出循环
75
+ print("API call successful.")
76
+ break
77
+ except Exception as e:
78
+ # 捕获所有 langextract 相关的运行时错误
79
+ print(f"Attempt #{attempt + 1} failed with error: {e}")
80
+ if "503" in str(e) and attempt < max_retries - 1:
81
+ # 如果是 503 错误,并且还有重试机会,则等待后重试
82
+ print(f"Server overloaded (503). Retrying in {retry_delay} seconds...")
83
+ time.sleep(retry_delay)
84
+ else:
85
+ # 如果是其他错误,或已达到最大重试次数,则直接抛出异常
86
+ raise gr.Error(f"信息提取过程中发生错误: {e}")
87
+
88
+ # 如果所有重试都失败了,result 仍然会是 None,这里可以加一个检查
89
+ if result is None:
90
+ raise gr.Error("所有重试均失败,无法从API获取结果。请稍后再试。")
91
 
92
+ # --- 后续处理代码保持不变 ---
93
+ grounded_extractions = [e for e in result.extractions if e.char_interval]
94
  highlighted_text = []
95
  last_pos = 0
96
  sorted_extractions = sorted(grounded_extractions, key=lambda e: e.char_interval.start_pos)
 
102
  last_pos = end
103
  highlighted_text.append((source_text[last_pos:], None))
104
 
 
105
  medication_groups = {}
106
  for extraction in result.extractions:
107
  group_name = extraction.attributes.get("medication_group", "未分组")
 
120
  structured_output += f"- **{extraction.extraction_class}**: {extraction.extraction_text}{pos_info}\n"
121
  structured_output += "\n"
122
 
 
123
  session_id = str(uuid.uuid4())
124
  output_dir = "/tmp"
125
  os.makedirs(output_dir, exist_ok=True)
126
  jsonl_filename = f"extraction_{session_id}.jsonl"
127
  jsonl_path = os.path.join(output_dir, jsonl_filename)
128
  html_path = os.path.join(output_dir, f"visualization_{session_id}.html")
 
 
 
 
129
  lx.io.save_annotated_documents([result], output_name=jsonl_filename, output_dir=output_dir)
 
130
  html_content = lx.visualize(jsonl_path)
131
  with open(html_path, "w", encoding="utf-8") as f:
132
  f.write(html_content)
 
136
 
137
  # --- 3. 创建 Gradio 应用界面 (保持不变) ---
138
  with gr.Blocks(theme=gr.themes.Soft(), title="药物信息提取器") as demo:
139
+ # ... (界面部分代码无需修改) ...
140
  gr.Markdown("# ⚕️ LangExtract 药物信息提取器")
141
  gr.Markdown("一个基于大型语言模型的智能工具,可从**临床文本**或 **PDF 文件**中自动提取药物、剂量等信息,并进行结构化关联。")
142