Update app.py

app.py (CHANGED)
@@ -32,13 +32,12 @@ def get_extraction_config():
     ]
     return prompt_description, examples
 
-# --- 2. Gradio core handler (
+# --- 2. Gradio core handler (fixed) ---
 def process_input_and_visualize(input_text, input_file):
     """
     Accept text or file input, parse it, process it with LangExtract's advanced features, and return the results.
     """
     source_text = ""
-    # Handle an uploaded file first
     if input_file is not None:
         try:
             with fitz.open(input_file.name) as doc:
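Note: the `for page in doc:` loop that feeds `page.get_text()` sits in the elided context between this hunk and the next. For reference, a minimal standalone sketch of the same PDF branch, assuming PyMuPDF (the `fitz` import); the helper name is illustrative and not part of the app:

    import fitz  # PyMuPDF

    def pdf_to_text(path: str) -> str:
        """Concatenate the plain text of every page in a PDF."""
        text = ""
        with fitz.open(path) as doc:   # raises on unreadable/corrupt files
            for page in doc:           # pages iterate in document order
                text += page.get_text()
        return text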
@@ -46,47 +45,40 @@ def process_input_and_visualize(input_text, input_file):
                 source_text += page.get_text()
         except Exception as e:
             raise gr.Error(f"Failed to parse the PDF file: {e}")
-    # Otherwise, use the textbox content
     elif input_text and input_text.strip():
         source_text = input_text
-    # If both are empty
     else:
         return None, "Please enter text or upload a PDF file...", None, None, None
 
-    # --- Core LangExtract extraction logic ---
     prompt, examples = get_extraction_config()
-
-    # Get the API key from the environment (it must be set as a Secret on Hugging Face)
     api_key = os.environ.get("LANGEXTRACT_API_KEY")
     if not api_key:
-        raise gr.Error("Error: LANGEXTRACT_API_KEY is not configured on the server.
+        raise gr.Error("Error: LANGEXTRACT_API_KEY is not configured on the server.")
 
-    # --- The real LangExtract call, with advanced parameters ---
-    # All mock code has been removed; the API is called directly
     try:
+        # --- Bug fix and optimization ---
+        # Original code: max_workers=10, which exceeds the free API limit of 2 requests per minute.
+        # Fix: lower max_workers to 2 to match the free-tier quota.
+        # Optimization: use a Gemini Flash model, which is faster, cheaper, and usually has a more generous free quota.
         result = lx.extract(
             text_or_documents=source_text,
             prompt_description=prompt,
             examples=examples,
-            model_id="gemini-2.5-
+            model_id="gemini-2.5-flash",  # the Flash model is recommended
             api_key=api_key,
-            #
-            max_workers=10,
-
-            max_char_buffer=1500  # maximum characters per text chunk, used for smart chunking
+            max_workers=2,  # **key fix**: reduce parallel workers from 10 to 2
+            extraction_passes=2,
+            max_char_buffer=1500
         )
     except Exception as e:
-        # Catch errors from the API call or any other processing step
        raise gr.Error(f"An error occurred during information extraction: {e}")
 
     # 1. Prepare the highlighted-text output for named entity recognition (NER)
     highlighted_text = []
     last_pos = 0
-    # Ensure entities are sorted by position so highlighting is applied correctly
     sorted_extractions = sorted(result.extractions, key=lambda e: e.char_interval.start_pos)
     for entity in sorted_extractions:
         start, end = entity.char_interval.start_pos, entity.char_interval.end_pos
-        # Check that positions are valid, to guard against parsing errors
         if start >= last_pos and end <= len(source_text):
             highlighted_text.append((source_text[last_pos:start], None))
             highlighted_text.append((entity.extraction_text, entity.extraction_class))
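Note: the fix above lowers `max_workers` to stay under the free-tier request quota. If quota errors still surface (`extraction_passes=2` roughly doubles the request count), a retry with exponential backoff is a complementary option. A sketch under that assumption; the helper and its parameters are hypothetical, not part of the diff:

    import time

    def call_with_backoff(call, retries=3, base_delay=30.0):
        """Retry a callable, sleeping 30s, then 60s, between attempts."""
        for attempt in range(retries):
            try:
                return call()
            except Exception:  # ideally narrow this to the quota/429 error type
                if attempt == retries - 1:
                    raise
                time.sleep(base_delay * (2 ** attempt))

    # usage: result = call_with_backoff(lambda: lx.extract(...))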
@@ -114,14 +106,11 @@ def process_input_and_visualize(input_text, input_file):
     session_id = str(uuid.uuid4())
     output_dir = "/tmp"
     os.makedirs(output_dir, exist_ok=True)
-
     jsonl_filename = f"extraction_{session_id}.jsonl"
     jsonl_path = os.path.join(output_dir, jsonl_filename)
     html_path = os.path.join(output_dir, f"visualization_{session_id}.html")
-
     lx.io.save_annotated_documents([result], output_name=jsonl_filename, output_dir=output_dir)
     html_content = lx.visualize(jsonl_path)
-
     with open(html_path, "w", encoding="utf-8") as f:
         f.write(html_content)
 
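Note: `lx.io.save_annotated_documents` and `lx.visualize` follow LangExtract's documented save-then-visualize pattern. Depending on version and environment, `lx.visualize` may return either a plain HTML string or an object that wraps it (exposing `.data`), so a defensive write is a reasonable guard; a sketch:

    html_content = lx.visualize(jsonl_path)
    with open(html_path, "w", encoding="utf-8") as f:
        # some environments wrap the HTML in an object exposing .data
        f.write(html_content.data if hasattr(html_content, "data") else html_content)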
@@ -130,7 +119,8 @@
 
 # --- 3. Build the Gradio UI (unchanged) ---
 with gr.Blocks(theme=gr.themes.Soft(), title="Medication Information Extractor") as demo:
-
+    # ... (the UI code needs no changes; omitted here for brevity) ...
+    gr.Markdown("# ⚕️ LangExtract Medication Information Extractor")
     gr.Markdown("An intelligent tool based on large language models that automatically extracts medications, dosages, and other information from **clinical text** or **PDF files** and links them into structured records.")
 
     with gr.Row():
@@ -175,5 +165,6 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Medication Information Extractor") as demo:
         outputs=[input_textbox, input_file_uploader]
     )
 
+
 if __name__ == "__main__":
     demo.launch()
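Note: the handler above depends on `get_extraction_config()`, whose body lies above the first hunk and is not shown. For orientation, a minimal sketch of such a config in LangExtract's documented style: the class names (`lx.data.ExampleData`, `lx.data.Extraction`) follow the library, while the prompt wording and example text here are made up:

    import langextract as lx

    def get_extraction_config():
        prompt_description = "Extract medications and dosages in order of appearance."
        examples = [
            lx.data.ExampleData(
                text="Patient was given 250 mg IV Cefazolin TID.",
                extractions=[
                    lx.data.Extraction(extraction_class="medication", extraction_text="Cefazolin"),
                    lx.data.Extraction(extraction_class="dosage", extraction_text="250 mg"),
                ],
            )
        ]
        return prompt_description, examples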
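Note on the highlighting loop in the second hunk: Gradio's `HighlightedText` component consumes a list of `(text, label)` tuples, where `label=None` marks an unhighlighted span. The hunk cuts off mid-loop; assuming the elided lines advance `last_pos` and flush the trailing text, the standard pattern completes like this (a sketch, not necessarily the author's exact code):

    highlighted_text = []
    last_pos = 0
    for entity in sorted_extractions:
        start, end = entity.char_interval.start_pos, entity.char_interval.end_pos
        if start >= last_pos and end <= len(source_text):
            highlighted_text.append((source_text[last_pos:start], None))  # plain gap
            highlighted_text.append((entity.extraction_text, entity.extraction_class))
            last_pos = end  # continue after this entity
    highlighted_text.append((source_text[last_pos:], None))  # trailing plain text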