Update app.py
Browse files — switched to the RashidNLP/NER-Deberta version
app.py
CHANGED
|
@@ -1,38 +1,62 @@
|
|
| 1 |
# app.py
|
| 2 |
-
#
|
| 3 |
-
# Model: ficsort/deberta-v3-base-conll2003-ner
|
| 4 |
#
|
| 5 |
-
#
|
| 6 |
-
#
|
| 7 |
-
#
|
| 8 |
-
#
|
| 9 |
-
#
|
| 10 |
|
| 11 |
import gradio as gr
|
| 12 |
import torch
|
| 13 |
-
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
| 14 |
|
| 15 |
-
MODEL_ID = "
|
| 16 |
|
| 17 |
-
# Load once at startup
|
| 18 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
|
| 19 |
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
def run_ner(text: str, max_length: int, show_tokens: bool):
|
| 31 |
text = (text or "").strip()
|
| 32 |
if not text:
|
| 33 |
return [], ""
|
| 34 |
|
| 35 |
-
#
|
| 36 |
enc = tokenizer(
|
| 37 |
text,
|
| 38 |
return_tensors="pt",
|
|
@@ -41,24 +65,23 @@ def run_ner(text: str, max_length: int, show_tokens: bool):
|
|
| 41 |
return_offsets_mapping=True,
|
| 42 |
)
|
| 43 |
|
| 44 |
-
offsets = enc.pop("offset_mapping")[0].tolist()
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
-
# 2) forward
|
| 48 |
with torch.no_grad():
|
| 49 |
out = model(**enc)
|
|
|
|
| 50 |
logits = out.logits[0] # (seq_len, num_labels)
|
| 51 |
pred_ids = logits.argmax(dim=-1).tolist()
|
| 52 |
-
|
| 53 |
id2label = model.config.id2label
|
| 54 |
|
| 55 |
-
#
|
| 56 |
-
tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"][0].tolist())
|
| 57 |
-
|
| 58 |
per_token = []
|
| 59 |
for tok, pid, (st, ed) in zip(tokens, pred_ids, offsets):
|
| 60 |
-
|
| 61 |
-
if st == 0 and ed == 0 and tok in tokenizer.all_special_tokens:
|
| 62 |
continue
|
| 63 |
if st == ed:
|
| 64 |
continue
|
|
@@ -67,62 +90,35 @@ def run_ner(text: str, max_length: int, show_tokens: bool):
|
|
| 67 |
"label": id2label[pid],
|
| 68 |
"start": int(st),
|
| 69 |
"end": int(ed),
|
| 70 |
-
"score": float(torch.softmax(logits, dim=-1)[per_token.__len__() if False else 0][pid]) if False else None
|
| 71 |
})
|
| 72 |
|
| 73 |
-
|
| 74 |
-
spans = []
|
| 75 |
-
cur = None
|
| 76 |
-
|
| 77 |
-
def tok_text(st, ed):
|
| 78 |
-
return text[st:ed]
|
| 79 |
-
|
| 80 |
-
for t in per_token:
|
| 81 |
-
lab = t["label"]
|
| 82 |
-
st, ed = t["start"], t["end"]
|
| 83 |
|
| 84 |
-
|
| 85 |
-
if cur:
|
| 86 |
-
spans.append(cur)
|
| 87 |
-
cur = {"entity": lab[2:], "start": st, "end": ed}
|
| 88 |
-
elif lab.startswith("I-") and cur and cur["entity"] == lab[2:]:
|
| 89 |
-
cur["end"] = ed
|
| 90 |
-
else:
|
| 91 |
-
if cur:
|
| 92 |
-
spans.append(cur)
|
| 93 |
-
cur = None
|
| 94 |
-
|
| 95 |
-
if cur:
|
| 96 |
-
spans.append(cur)
|
| 97 |
-
|
| 98 |
-
# 5) 輸出表格 rows
|
| 99 |
table_rows = []
|
| 100 |
for s in spans:
|
| 101 |
table_rows.append([
|
| 102 |
s["entity"],
|
| 103 |
text[s["start"]:s["end"]],
|
| 104 |
-
0.0, # score 先用 0
|
| 105 |
s["start"],
|
| 106 |
s["end"],
|
| 107 |
])
|
| 108 |
|
| 109 |
debug = ""
|
| 110 |
if show_tokens:
|
| 111 |
-
|
| 112 |
for t in per_token:
|
| 113 |
-
|
| 114 |
-
debug = "\n".join(
|
| 115 |
|
| 116 |
return table_rows, debug
|
| 117 |
|
| 118 |
-
with gr.Blocks(title="
|
| 119 |
-
gr.Markdown(
|
| 120 |
-
|
| 121 |
-
# DeBERTa NER Demo (CoNLL-2003)
|
| 122 |
Model: **{MODEL_ID}**
|
| 123 |
-
|
| 124 |
-
"""
|
| 125 |
-
)
|
| 126 |
|
| 127 |
with gr.Row():
|
| 128 |
max_length = gr.Slider(64, 512, value=256, step=32, label="max_length (truncate)")
|
|
@@ -131,22 +127,23 @@ Entities: typically **PER / ORG / LOC / MISC** (CoNLL-2003 style)
|
|
| 131 |
text = gr.Textbox(
|
| 132 |
label="Input text",
|
| 133 |
lines=10,
|
| 134 |
-
value="Tim Chen
|
| 135 |
placeholder="Paste text here (e.g., OCR output).",
|
| 136 |
)
|
| 137 |
|
| 138 |
btn = gr.Button("Run NER")
|
| 139 |
|
| 140 |
out_table = gr.Dataframe(
|
| 141 |
-
label="
|
| 142 |
-
headers=["entity", "text", "
|
| 143 |
-
datatype=["str", "str", "number", "number"
|
| 144 |
interactive=False,
|
| 145 |
wrap=True,
|
| 146 |
)
|
| 147 |
|
| 148 |
-
debug_box = gr.Textbox(label="Raw token output", lines=12
|
| 149 |
|
| 150 |
btn.click(fn=run_ner, inputs=[text, max_length, show_tokens], outputs=[out_table, debug_box])
|
| 151 |
|
| 152 |
-
|
|
|
|
|
|
| 1 |
# app.py
|
| 2 |
+
# Gradio NER demo using: RashidNLP/NER-Deberta (Few-NERD labels)
|
|
|
|
| 3 |
#
|
| 4 |
+
# requirements.txt 建議:
|
| 5 |
+
# gradio>=4.0
|
| 6 |
+
# transformers>=4.35
|
| 7 |
+
# torch
|
| 8 |
+
# sentencepiece
|
| 9 |
|
| 10 |
import gradio as gr
|
| 11 |
import torch
|
| 12 |
+
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
| 13 |
|
MODEL_ID = "RashidNLP/NER-Deberta"

# Load tokenizer + model once at process startup so every request reuses them.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)

# Prefer GPU when available (HF Spaces free tier is usually CPU-only).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
| 24 |
+
|
def merge_bio_spans(text: str, per_token):
    """Merge per-token BIO labels into contiguous entity spans.

    Args:
        text: Original input text (kept for interface compatibility; the
            offsets in *per_token* index into it).
        per_token: Sequence of dicts with keys ``label``, ``start``, ``end``.

    Returns:
        List of dicts ``{"entity", "start", "end"}``, one per merged span.
        ``O`` tokens and ``I-`` tags that do not continue the currently open
        entity close that entity and are themselves dropped.
    """
    merged = []
    open_span = None

    for item in per_token:
        label = item["label"]
        begin, stop = item["start"], item["end"]

        if label.startswith("B-"):
            # A new entity always flushes whatever span was open.
            if open_span:
                merged.append(open_span)
            open_span = {"entity": label[2:], "start": begin, "end": stop}
        elif label.startswith("I-") and open_span and open_span["entity"] == label[2:]:
            # Continuation of the open entity: extend its right edge.
            open_span["end"] = stop
        else:
            # "O", or an I- tag that doesn't match the open entity:
            # close the span; this token starts nothing new.
            if open_span:
                merged.append(open_span)
            open_span = None

    if open_span:
        merged.append(open_span)
    return merged
| 53 |
|
| 54 |
def run_ner(text: str, max_length: int, show_tokens: bool):
|
| 55 |
text = (text or "").strip()
|
| 56 |
if not text:
|
| 57 |
return [], ""
|
| 58 |
|
| 59 |
+
# tokenize with truncation control
|
| 60 |
enc = tokenizer(
|
| 61 |
text,
|
| 62 |
return_tensors="pt",
|
|
|
|
| 65 |
return_offsets_mapping=True,
|
| 66 |
)
|
| 67 |
|
| 68 |
+
offsets = enc.pop("offset_mapping")[0].tolist()
|
| 69 |
+
input_ids = enc["input_ids"][0].tolist()
|
| 70 |
+
tokens = tokenizer.convert_ids_to_tokens(input_ids)
|
| 71 |
+
|
| 72 |
+
enc = {k: v.to(device) for k, v in enc.items()}
|
| 73 |
|
|
|
|
| 74 |
with torch.no_grad():
|
| 75 |
out = model(**enc)
|
| 76 |
+
|
| 77 |
logits = out.logits[0] # (seq_len, num_labels)
|
| 78 |
pred_ids = logits.argmax(dim=-1).tolist()
|
|
|
|
| 79 |
id2label = model.config.id2label
|
| 80 |
|
| 81 |
+
# build per-token labels (skip specials)
|
|
|
|
|
|
|
| 82 |
per_token = []
|
| 83 |
for tok, pid, (st, ed) in zip(tokens, pred_ids, offsets):
|
| 84 |
+
if tok in tokenizer.all_special_tokens:
|
|
|
|
| 85 |
continue
|
| 86 |
if st == ed:
|
| 87 |
continue
|
|
|
|
| 90 |
"label": id2label[pid],
|
| 91 |
"start": int(st),
|
| 92 |
"end": int(ed),
|
|
|
|
| 93 |
})
|
| 94 |
|
| 95 |
+
spans = merge_bio_spans(text, per_token)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
+
# Return 2D list to avoid `[object Object]`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
table_rows = []
|
| 99 |
for s in spans:
|
| 100 |
table_rows.append([
|
| 101 |
s["entity"],
|
| 102 |
text[s["start"]:s["end"]],
|
|
|
|
| 103 |
s["start"],
|
| 104 |
s["end"],
|
| 105 |
])
|
| 106 |
|
| 107 |
debug = ""
|
| 108 |
if show_tokens:
|
| 109 |
+
lines = ["token\tlabel\t[offsets]"]
|
| 110 |
for t in per_token:
|
| 111 |
+
lines.append(f"{t['token']}\t{t['label']}\t[{t['start']},{t['end']}]")
|
| 112 |
+
debug = "\n".join(lines)
|
| 113 |
|
| 114 |
return table_rows, debug
|
| 115 |
|
| 116 |
+
with gr.Blocks(title="NER-Deberta (Few-NERD) Demo") as demo:
|
| 117 |
+
gr.Markdown(f"""
|
| 118 |
+
# NER Demo
|
|
|
|
| 119 |
Model: **{MODEL_ID}**
|
| 120 |
+
Note: This model uses **Few-NERD** style labels (more entity types than CoNLL-2003).
|
| 121 |
+
""")
|
|
|
|
| 122 |
|
| 123 |
with gr.Row():
|
| 124 |
max_length = gr.Slider(64, 512, value=256, step=32, label="max_length (truncate)")
|
|
|
|
| 127 |
text = gr.Textbox(
|
| 128 |
label="Input text",
|
| 129 |
lines=10,
|
| 130 |
+
value="Tim Chen\nSenior Software Engineer\nApple Inc.\nTaipei, Taiwan",
|
| 131 |
placeholder="Paste text here (e.g., OCR output).",
|
| 132 |
)
|
| 133 |
|
| 134 |
btn = gr.Button("Run NER")
|
| 135 |
|
| 136 |
out_table = gr.Dataframe(
|
| 137 |
+
label="Entities (spans)",
|
| 138 |
+
headers=["entity", "text", "start", "end"],
|
| 139 |
+
datatype=["str", "str", "number", "number"],
|
| 140 |
interactive=False,
|
| 141 |
wrap=True,
|
| 142 |
)
|
| 143 |
|
| 144 |
+
debug_box = gr.Textbox(label="Raw token output", lines=12)
|
| 145 |
|
| 146 |
btn.click(fn=run_ner, inputs=[text, max_length, show_tokens], outputs=[out_table, debug_box])
|
| 147 |
|
| 148 |
+
if __name__ == "__main__":
|
| 149 |
+
demo.launch()
|