Update app.py
Browse files — switched to the RashidNLP/NER-Deberta version
app.py
CHANGED
|
@@ -1,38 +1,62 @@
|
|
| 1 |
# app.py
|
| 2 |
-
#
|
| 3 |
-
# Model: ficsort/deberta-v3-base-conll2003-ner
|
| 4 |
#
|
| 5 |
-
#
|
| 6 |
-
#
|
| 7 |
-
#
|
| 8 |
-
#
|
| 9 |
-
#
|
| 10 |
|
| 11 |
import gradio as gr
|
| 12 |
import torch
|
| 13 |
-
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
| 14 |
|
| 15 |
-
MODEL_ID = "
|
| 16 |
|
| 17 |
-
# Load once at startup
|
| 18 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
|
| 19 |
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
def run_ner(text: str, max_length: int, show_tokens: bool):
|
| 31 |
text = (text or "").strip()
|
| 32 |
if not text:
|
| 33 |
return [], ""
|
| 34 |
|
| 35 |
-
#
|
| 36 |
enc = tokenizer(
|
| 37 |
text,
|
| 38 |
return_tensors="pt",
|
|
@@ -41,24 +65,23 @@ def run_ner(text: str, max_length: int, show_tokens: bool):
|
|
| 41 |
return_offsets_mapping=True,
|
| 42 |
)
|
| 43 |
|
| 44 |
-
offsets = enc.pop("offset_mapping")[0].tolist()
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
-
# 2) forward
|
| 48 |
with torch.no_grad():
|
| 49 |
out = model(**enc)
|
|
|
|
| 50 |
logits = out.logits[0] # (seq_len, num_labels)
|
| 51 |
pred_ids = logits.argmax(dim=-1).tolist()
|
| 52 |
-
|
| 53 |
id2label = model.config.id2label
|
| 54 |
|
| 55 |
-
#
|
| 56 |
-
tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"][0].tolist())
|
| 57 |
-
|
| 58 |
per_token = []
|
| 59 |
for tok, pid, (st, ed) in zip(tokens, pred_ids, offsets):
|
| 60 |
-
|
| 61 |
-
if st == 0 and ed == 0 and tok in tokenizer.all_special_tokens:
|
| 62 |
continue
|
| 63 |
if st == ed:
|
| 64 |
continue
|
|
@@ -67,62 +90,35 @@ def run_ner(text: str, max_length: int, show_tokens: bool):
|
|
| 67 |
"label": id2label[pid],
|
| 68 |
"start": int(st),
|
| 69 |
"end": int(ed),
|
| 70 |
-
"score": float(torch.softmax(logits, dim=-1)[per_token.__len__() if False else 0][pid]) if False else None
|
| 71 |
})
|
| 72 |
|
| 73 |
-
|
| 74 |
-
spans = []
|
| 75 |
-
cur = None
|
| 76 |
-
|
| 77 |
-
def tok_text(st, ed):
|
| 78 |
-
return text[st:ed]
|
| 79 |
-
|
| 80 |
-
for t in per_token:
|
| 81 |
-
lab = t["label"]
|
| 82 |
-
st, ed = t["start"], t["end"]
|
| 83 |
|
| 84 |
-
|
| 85 |
-
if cur:
|
| 86 |
-
spans.append(cur)
|
| 87 |
-
cur = {"entity": lab[2:], "start": st, "end": ed}
|
| 88 |
-
elif lab.startswith("I-") and cur and cur["entity"] == lab[2:]:
|
| 89 |
-
cur["end"] = ed
|
| 90 |
-
else:
|
| 91 |
-
if cur:
|
| 92 |
-
spans.append(cur)
|
| 93 |
-
cur = None
|
| 94 |
-
|
| 95 |
-
if cur:
|
| 96 |
-
spans.append(cur)
|
| 97 |
-
|
| 98 |
-
# 5) 輸出表格 rows
|
| 99 |
table_rows = []
|
| 100 |
for s in spans:
|
| 101 |
table_rows.append([
|
| 102 |
s["entity"],
|
| 103 |
text[s["start"]:s["end"]],
|
| 104 |
-
0.0, # score 先用 0
|
| 105 |
s["start"],
|
| 106 |
s["end"],
|
| 107 |
])
|
| 108 |
|
| 109 |
debug = ""
|
| 110 |
if show_tokens:
|
| 111 |
-
|
| 112 |
for t in per_token:
|
| 113 |
-
|
| 114 |
-
debug = "\n".join(
|
| 115 |
|
| 116 |
return table_rows, debug
|
| 117 |
|
| 118 |
-
with gr.Blocks(title="
|
| 119 |
-
gr.Markdown(
|
| 120 |
-
|
| 121 |
-
# DeBERTa NER Demo (CoNLL-2003)
|
| 122 |
Model: **{MODEL_ID}**
|
| 123 |
-
|
| 124 |
-
"""
|
| 125 |
-
)
|
| 126 |
|
| 127 |
with gr.Row():
|
| 128 |
max_length = gr.Slider(64, 512, value=256, step=32, label="max_length (truncate)")
|
|
@@ -131,22 +127,23 @@ Entities: typically **PER / ORG / LOC / MISC** (CoNLL-2003 style)
|
|
| 131 |
text = gr.Textbox(
|
| 132 |
label="Input text",
|
| 133 |
lines=10,
|
| 134 |
-
value="Tim Chen
|
| 135 |
placeholder="Paste text here (e.g., OCR output).",
|
| 136 |
)
|
| 137 |
|
| 138 |
btn = gr.Button("Run NER")
|
| 139 |
|
| 140 |
out_table = gr.Dataframe(
|
| 141 |
-
label="
|
| 142 |
-
headers=["entity", "text", "
|
| 143 |
-
datatype=["str", "str", "number", "number"
|
| 144 |
interactive=False,
|
| 145 |
wrap=True,
|
| 146 |
)
|
| 147 |
|
| 148 |
-
debug_box = gr.Textbox(label="Raw token output", lines=12
|
| 149 |
|
| 150 |
btn.click(fn=run_ner, inputs=[text, max_length, show_tokens], outputs=[out_table, debug_box])
|
| 151 |
|
| 152 |
-
|
|
|
|
|
|
| 1 |
# app.py
|
| 2 |
+
# Gradio NER demo using: RashidNLP/NER-Deberta (Few-NERD labels)
|
|
|
|
| 3 |
#
|
| 4 |
+
# requirements.txt 建議:
|
| 5 |
+
# gradio>=4.0
|
| 6 |
+
# transformers>=4.35
|
| 7 |
+
# torch
|
| 8 |
+
# sentencepiece
|
| 9 |
|
| 10 |
import gradio as gr
|
| 11 |
import torch
|
| 12 |
+
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
| 13 |
|
MODEL_ID = "RashidNLP/NER-Deberta"

# Load tokenizer + model once at process startup so every request reuses them.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)

# Prefer GPU when available (HF Spaces free tier is usually CPU-only).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
| 24 |
+
|
def merge_bio_spans(text: str, per_token):
    """Merge per-token BIO labels into contiguous entity spans.

    Args:
        text: Original input text (kept for interface compatibility; the
            offsets in *per_token* index into it).
        per_token: Sequence of dicts with keys ``label``, ``start``, ``end``.

    Returns:
        List of dicts ``{"entity", "start", "end"}``, one per merged span.
        ``O`` tokens and ``I-`` tags that do not continue the currently open
        entity close that entity and are themselves dropped.
    """
    merged = []
    open_span = None

    for item in per_token:
        label = item["label"]
        begin, stop = item["start"], item["end"]

        if label.startswith("B-"):
            # A new entity always flushes whatever span was open.
            if open_span:
                merged.append(open_span)
            open_span = {"entity": label[2:], "start": begin, "end": stop}
        elif label.startswith("I-") and open_span and open_span["entity"] == label[2:]:
            # Continuation of the open entity: extend its right edge.
            open_span["end"] = stop
        else:
            # "O", or an I- tag that doesn't match the open entity:
            # close the span; this token starts nothing new.
            if open_span:
                merged.append(open_span)
            open_span = None

    if open_span:
        merged.append(open_span)
    return merged
| 53 |
|
| 54 |
def run_ner(text: str, max_length: int, show_tokens: bool):
|
| 55 |
text = (text or "").strip()
|
| 56 |
if not text:
|
| 57 |
return [], ""
|
| 58 |
|
| 59 |
+
# tokenize with truncation control
|
| 60 |
enc = tokenizer(
|
| 61 |
text,
|
| 62 |
return_tensors="pt",
|
|
|
|
| 65 |
return_offsets_mapping=True,
|
| 66 |
)
|
| 67 |
|
| 68 |
+
offsets = enc.pop("offset_mapping")[0].tolist()
|
| 69 |
+
input_ids = enc["input_ids"][0].tolist()
|
| 70 |
+
tokens = tokenizer.convert_ids_to_tokens(input_ids)
|
| 71 |
+
|
| 72 |
+
enc = {k: v.to(device) for k, v in enc.items()}
|
| 73 |
|
|
|
|
| 74 |
with torch.no_grad():
|
| 75 |
out = model(**enc)
|
| 76 |
+
|
| 77 |
logits = out.logits[0] # (seq_len, num_labels)
|
| 78 |
pred_ids = logits.argmax(dim=-1).tolist()
|
|
|
|
| 79 |
id2label = model.config.id2label
|
| 80 |
|
| 81 |
+
# build per-token labels (skip specials)
|
|
|
|
|
|
|
| 82 |
per_token = []
|
| 83 |
for tok, pid, (st, ed) in zip(tokens, pred_ids, offsets):
|
| 84 |
+
if tok in tokenizer.all_special_tokens:
|
|
|
|
| 85 |
continue
|
| 86 |
if st == ed:
|
| 87 |
continue
|
|
|
|
| 90 |
"label": id2label[pid],
|
| 91 |
"start": int(st),
|
| 92 |
"end": int(ed),
|
|
|
|
| 93 |
})
|
| 94 |
|
| 95 |
+
spans = merge_bio_spans(text, per_token)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
+
# Return 2D list to avoid `[object Object]`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
table_rows = []
|
| 99 |
for s in spans:
|
| 100 |
table_rows.append([
|
| 101 |
s["entity"],
|
| 102 |
text[s["start"]:s["end"]],
|
|
|
|
| 103 |
s["start"],
|
| 104 |
s["end"],
|
| 105 |
])
|
| 106 |
|
| 107 |
debug = ""
|
| 108 |
if show_tokens:
|
| 109 |
+
lines = ["token\tlabel\t[offsets]"]
|
| 110 |
for t in per_token:
|
| 111 |
+
lines.append(f"{t['token']}\t{t['label']}\t[{t['start']},{t['end']}]")
|
| 112 |
+
debug = "\n".join(lines)
|
| 113 |
|
| 114 |
return table_rows, debug
|
| 115 |
|
| 116 |
+
with gr.Blocks(title="NER-Deberta (Few-NERD) Demo") as demo:
|
| 117 |
+
gr.Markdown(f"""
|
| 118 |
+
# NER Demo
|
|
|
|
| 119 |
Model: **{MODEL_ID}**
|
| 120 |
+
Note: This model uses **Few-NERD** style labels (more entity types than CoNLL-2003).
|
| 121 |
+
""")
|
|
|
|
| 122 |
|
| 123 |
with gr.Row():
|
| 124 |
max_length = gr.Slider(64, 512, value=256, step=32, label="max_length (truncate)")
|
|
|
|
| 127 |
text = gr.Textbox(
|
| 128 |
label="Input text",
|
| 129 |
lines=10,
|
| 130 |
+
value="Tim Chen\nSenior Software Engineer\nApple Inc.\nTaipei, Taiwan",
|
| 131 |
placeholder="Paste text here (e.g., OCR output).",
|
| 132 |
)
|
| 133 |
|
| 134 |
btn = gr.Button("Run NER")
|
| 135 |
|
| 136 |
out_table = gr.Dataframe(
|
| 137 |
+
label="Entities (spans)",
|
| 138 |
+
headers=["entity", "text", "start", "end"],
|
| 139 |
+
datatype=["str", "str", "number", "number"],
|
| 140 |
interactive=False,
|
| 141 |
wrap=True,
|
| 142 |
)
|
| 143 |
|
| 144 |
+
debug_box = gr.Textbox(label="Raw token output", lines=12)
|
| 145 |
|
| 146 |
btn.click(fn=run_ner, inputs=[text, max_length, show_tokens], outputs=[out_table, debug_box])
|
| 147 |
|
| 148 |
+
if __name__ == "__main__":
|
| 149 |
+
demo.launch()
|