Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -32,7 +32,7 @@ def get_extraction_config():
|
|
| 32 |
]
|
| 33 |
return prompt_description, examples
|
| 34 |
|
| 35 |
-
# --- 2. Gradio 的核心处理函数 (
|
| 36 |
def process_input_and_visualize(input_text, input_file):
|
| 37 |
"""
|
| 38 |
接收文本或文件输入,解析内容,调用 LangExtract 的高级功能进行处理,并返回结果。
|
|
@@ -69,14 +69,11 @@ def process_input_and_visualize(input_text, input_file):
|
|
| 69 |
except Exception as e:
|
| 70 |
raise gr.Error(f"信息提取过程中发生错误: {e}")
|
| 71 |
|
| 72 |
-
# --- 关键修复:过滤掉无法定位的实体 ---
|
| 73 |
-
# 只有那些 char_interval 不为 None 的实体才是有位置信息的,才能被排序和高亮
|
| 74 |
grounded_extractions = [e for e in result.extractions if e.char_interval]
|
| 75 |
|
| 76 |
# 1. 准备命名实体识别 (NER) 的高亮文本输出
|
| 77 |
highlighted_text = []
|
| 78 |
last_pos = 0
|
| 79 |
-
# 现在我们对过滤后的、保证有位置信息的列表进行排序
|
| 80 |
sorted_extractions = sorted(grounded_extractions, key=lambda e: e.char_interval.start_pos)
|
| 81 |
for entity in sorted_extractions:
|
| 82 |
start, end = entity.char_interval.start_pos, entity.char_interval.end_pos
|
|
@@ -88,7 +85,6 @@ def process_input_and_visualize(input_text, input_file):
|
|
| 88 |
|
| 89 |
# 2. 准备关系提取 (RE) 的结构化 Markdown 输出
|
| 90 |
medication_groups = {}
|
| 91 |
-
# 注意:这里我们仍然遍历所有实体(包括未定位的),因为它们可能仍有有用的属性信息
|
| 92 |
for extraction in result.extractions:
|
| 93 |
group_name = extraction.attributes.get("medication_group", "未分组")
|
| 94 |
medication_groups.setdefault(group_name, []).append(extraction)
|
|
@@ -99,12 +95,10 @@ def process_input_and_visualize(input_text, input_file):
|
|
| 99 |
else:
|
| 100 |
for med_name, extractions in medication_groups.items():
|
| 101 |
structured_output += f"#### 药物组: {med_name}\n"
|
| 102 |
-
# 我们在显示时检查 char_interval 是否存在
|
| 103 |
for extraction in sorted(extractions, key=lambda e: e.char_interval.start_pos if e.char_interval else -1):
|
| 104 |
pos_info = ""
|
| 105 |
if extraction.char_interval:
|
| 106 |
pos_info = f" (位置: {extraction.char_interval.start_pos}-{extraction.char_interval.end_pos})"
|
| 107 |
-
# 即使没有位置信息,我们仍然显示实体本身
|
| 108 |
structured_output += f"- **{extraction.extraction_class}**: {extraction.extraction_text}{pos_info}\n"
|
| 109 |
structured_output += "\n"
|
| 110 |
|
|
@@ -115,7 +109,7 @@ def process_input_and_visualize(input_text, input_file):
|
|
| 115 |
jsonl_filename = f"extraction_{session_id}.jsonl"
|
| 116 |
jsonl_path = os.path.join(output_dir, jsonl_filename)
|
| 117 |
html_path = os.path.join(output_dir, f"visualization_{session_id}.html")
|
| 118 |
-
lx.io.
|
| 119 |
html_content = lx.visualize(jsonl_path)
|
| 120 |
with open(html_path, "w", encoding="utf-8") as f:
|
| 121 |
f.write(html_content)
|
|
@@ -123,9 +117,8 @@ def process_input_and_visualize(input_text, input_file):
|
|
| 123 |
return highlighted_text, structured_output, html_path, html_path, jsonl_path
|
| 124 |
|
| 125 |
|
| 126 |
-
# --- 3. 创建 Gradio 应用界面 (
|
| 127 |
with gr.Blocks(theme=gr.themes.Soft(), title="药物信息提取器") as demo:
|
| 128 |
-
# ... (界面部分代码无需修改) ...
|
| 129 |
gr.Markdown("# ⚕️ LangExtract 药物信息提取器")
|
| 130 |
gr.Markdown("一个基于大型语言模型的智能工具,可从**临床文本**或 **PDF 文件**中自动提取药物、剂量等信息,并进行结构化关联。")
|
| 131 |
|
|
@@ -159,12 +152,17 @@ with gr.Blocks(theme=gr.themes.Soft(), title="药物信息提取器") as demo:
|
|
| 159 |
with gr.TabItem("📁 文件下载"):
|
| 160 |
gr.Markdown("### 下载提取结果")
|
| 161 |
download_html = gr.File(label="下载交互式 HTML 文件")
|
|
|
|
| 162 |
download_jsonl = gr.File(label="下载 JSONL 数据文件")
|
| 163 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
submit_btn.click(
|
| 165 |
fn=process_input_and_visualize,
|
| 166 |
inputs=[input_textbox, input_file_uploader],
|
| 167 |
-
outputs=[output_highlight, output_structured, output_html_viewer, download_html, jsonl_path]
|
| 168 |
).then(
|
| 169 |
lambda: (None, None),
|
| 170 |
inputs=None,
|
|
|
|
| 32 |
]
|
| 33 |
return prompt_description, examples
|
| 34 |
|
| 35 |
+
# --- 2. Gradio 的核心处理函数 (保持不变) ---
|
| 36 |
def process_input_and_visualize(input_text, input_file):
|
| 37 |
"""
|
| 38 |
接收文本或文件输入,解析内容,调用 LangExtract 的高级功能进行处理,并返回结果。
|
|
|
|
| 69 |
except Exception as e:
|
| 70 |
raise gr.Error(f"信息提取过程中发生错误: {e}")
|
| 71 |
|
|
|
|
|
|
|
| 72 |
grounded_extractions = [e for e in result.extractions if e.char_interval]
|
| 73 |
|
| 74 |
# 1. 准备命名实体识别 (NER) 的高亮文本输出
|
| 75 |
highlighted_text = []
|
| 76 |
last_pos = 0
|
|
|
|
| 77 |
sorted_extractions = sorted(grounded_extractions, key=lambda e: e.char_interval.start_pos)
|
| 78 |
for entity in sorted_extractions:
|
| 79 |
start, end = entity.char_interval.start_pos, entity.char_interval.end_pos
|
|
|
|
| 85 |
|
| 86 |
# 2. 准备关系提取 (RE) 的结构化 Markdown 输出
|
| 87 |
medication_groups = {}
|
|
|
|
| 88 |
for extraction in result.extractions:
|
| 89 |
group_name = extraction.attributes.get("medication_group", "未分组")
|
| 90 |
medication_groups.setdefault(group_name, []).append(extraction)
|
|
|
|
| 95 |
else:
|
| 96 |
for med_name, extractions in medication_groups.items():
|
| 97 |
structured_output += f"#### 药物组: {med_name}\n"
|
|
|
|
| 98 |
for extraction in sorted(extractions, key=lambda e: e.char_interval.start_pos if e.char_interval else -1):
|
| 99 |
pos_info = ""
|
| 100 |
if extraction.char_interval:
|
| 101 |
pos_info = f" (位置: {extraction.char_interval.start_pos}-{extraction.char_interval.end_pos})"
|
|
|
|
| 102 |
structured_output += f"- **{extraction.extraction_class}**: {extraction.extraction_text}{pos_info}\n"
|
| 103 |
structured_output += "\n"
|
| 104 |
|
|
|
|
| 109 |
jsonl_filename = f"extraction_{session_id}.jsonl"
|
| 110 |
jsonl_path = os.path.join(output_dir, jsonl_filename)
|
| 111 |
html_path = os.path.join(output_dir, f"visualization_{session_id}.html")
|
| 112 |
+
lx.io.save_annotated_documents([result], output_name=jsonl_filename, output_dir=output_dir)
|
| 113 |
html_content = lx.visualize(jsonl_path)
|
| 114 |
with open(html_path, "w", encoding="utf-8") as f:
|
| 115 |
f.write(html_content)
|
|
|
|
| 117 |
return highlighted_text, structured_output, html_path, html_path, jsonl_path
|
| 118 |
|
| 119 |
|
| 120 |
+
# --- 3. 创建 Gradio 应用界面 (已修复) ---
|
| 121 |
with gr.Blocks(theme=gr.themes.Soft(), title="药物信息提取器") as demo:
|
|
|
|
| 122 |
gr.Markdown("# ⚕️ LangExtract 药物信息提取器")
|
| 123 |
gr.Markdown("一个基于大型语言模型的智能工具,可从**临床文本**或 **PDF 文件**中自动提取药物、剂量等信息,并进行结构化关联。")
|
| 124 |
|
|
|
|
| 152 |
with gr.TabItem("📁 文件下载"):
|
| 153 |
gr.Markdown("### 下载提取结果")
|
| 154 |
download_html = gr.File(label="下载交互式 HTML 文件")
|
| 155 |
+
# 这是我们定义的用于接收 JSONL 文件的 Gradio 组件
|
| 156 |
download_jsonl = gr.File(label="下载 JSONL 数据文件")
|
| 157 |
|
| 158 |
+
# --- 关键修复 ---
|
| 159 |
+
# 原代码错误: ... outputs=[..., jsonl_path]
|
| 160 |
+
# 正确代码: ... outputs=[..., download_jsonl]
|
| 161 |
+
# 必须引用在上面 Blocks 布局中定义的 Gradio 组件变量。
|
| 162 |
submit_btn.click(
|
| 163 |
fn=process_input_and_visualize,
|
| 164 |
inputs=[input_textbox, input_file_uploader],
|
| 165 |
+
outputs=[output_highlight, output_structured, output_html_viewer, download_html, download_jsonl]
|
| 166 |
).then(
|
| 167 |
lambda: (None, None),
|
| 168 |
inputs=None,
|