magicboker committed on
Commit
85a5e50
·
verified ·
1 Parent(s): 85b9fc4

Update app.py

Browse files

換成 RashidNLP/NER-Deberta 版

Files changed (1) hide show
  1. app.py +68 -71
app.py CHANGED
@@ -1,38 +1,62 @@
1
  # app.py
2
- # DeBERTa-v3-base (CoNLL-2003) NER demo with Gradio
3
- # Model: ficsort/deberta-v3-base-conll2003-ner
4
  #
5
- # Install:
6
- # pip install -U gradio transformers torch
7
- #
8
- # Run:
9
- # python app.py
10
 
11
  import gradio as gr
12
  import torch
13
- from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
14
 
15
- MODEL_ID = "ficsort/deberta-v3-base-conll2003-ner"
16
 
17
- # Load once at startup (faster UX)
18
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
19
  model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
20
 
21
- device = 0 if torch.cuda.is_available() else -1 # GPU if available, else CPU
22
- ner_pipe = pipeline(
23
- task="token-classification",
24
- model=model,
25
- tokenizer=tokenizer,
26
- aggregation_strategy="simple", # merges B-/I- tags into spans
27
- device=device,
28
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  def run_ner(text: str, max_length: int, show_tokens: bool):
31
  text = (text or "").strip()
32
  if not text:
33
  return [], ""
34
 
35
- # 1) 先手動 tokenize(這裡控制 truncation/max_length)
36
  enc = tokenizer(
37
  text,
38
  return_tensors="pt",
@@ -41,24 +65,23 @@ def run_ner(text: str, max_length: int, show_tokens: bool):
41
  return_offsets_mapping=True,
42
  )
43
 
44
- offsets = enc.pop("offset_mapping")[0].tolist() # (seq_len, 2)
45
- enc = {k: v.to(model.device) for k, v in enc.items()}
 
 
 
46
 
47
- # 2) forward
48
  with torch.no_grad():
49
  out = model(**enc)
 
50
  logits = out.logits[0] # (seq_len, num_labels)
51
  pred_ids = logits.argmax(dim=-1).tolist()
52
-
53
  id2label = model.config.id2label
54
 
55
- # 3) 取出每個 token label + offset(跳過 special tokens)
56
- tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"][0].tolist())
57
-
58
  per_token = []
59
  for tok, pid, (st, ed) in zip(tokens, pred_ids, offsets):
60
- # 跳過 special tokens 或沒有 offset 的 token
61
- if st == 0 and ed == 0 and tok in tokenizer.all_special_tokens:
62
  continue
63
  if st == ed:
64
  continue
@@ -67,62 +90,35 @@ def run_ner(text: str, max_length: int, show_tokens: bool):
67
  "label": id2label[pid],
68
  "start": int(st),
69
  "end": int(ed),
70
- "score": float(torch.softmax(logits, dim=-1)[per_token.__len__() if False else 0][pid]) if False else None
71
  })
72
 
73
- # 4) BIO 標籤合併成 span(簡單版)
74
- spans = []
75
- cur = None
76
-
77
- def tok_text(st, ed):
78
- return text[st:ed]
79
-
80
- for t in per_token:
81
- lab = t["label"]
82
- st, ed = t["start"], t["end"]
83
 
84
- if lab.startswith("B-"):
85
- if cur:
86
- spans.append(cur)
87
- cur = {"entity": lab[2:], "start": st, "end": ed}
88
- elif lab.startswith("I-") and cur and cur["entity"] == lab[2:]:
89
- cur["end"] = ed
90
- else:
91
- if cur:
92
- spans.append(cur)
93
- cur = None
94
-
95
- if cur:
96
- spans.append(cur)
97
-
98
- # 5) 輸出表格 rows
99
  table_rows = []
100
  for s in spans:
101
  table_rows.append([
102
  s["entity"],
103
  text[s["start"]:s["end"]],
104
- 0.0, # score 先用 0
105
  s["start"],
106
  s["end"],
107
  ])
108
 
109
  debug = ""
110
  if show_tokens:
111
- debug_lines = ["token\tlabel\t[offsets]"]
112
  for t in per_token:
113
- debug_lines.append(f"{t['token']}\t{t['label']}\t[{t['start']},{t['end']}]")
114
- debug = "\n".join(debug_lines)
115
 
116
  return table_rows, debug
117
 
118
- with gr.Blocks(title="DeBERTa NER (CoNLL-2003)") as demo:
119
- gr.Markdown(
120
- f"""
121
- # DeBERTa NER Demo (CoNLL-2003)
122
  Model: **{MODEL_ID}**
123
- Entities: typically **PER / ORG / LOC / MISC** (CoNLL-2003 style)
124
- """
125
- )
126
 
127
  with gr.Row():
128
  max_length = gr.Slider(64, 512, value=256, step=32, label="max_length (truncate)")
@@ -131,22 +127,23 @@ Entities: typically **PER / ORG / LOC / MISC** (CoNLL-2003 style)
131
  text = gr.Textbox(
132
  label="Input text",
133
  lines=10,
134
- value="Tim Chen works at Apple Inc. in Taipei.\nHe previously lived in New York City.",
135
  placeholder="Paste text here (e.g., OCR output).",
136
  )
137
 
138
  btn = gr.Button("Run NER")
139
 
140
  out_table = gr.Dataframe(
141
- label="Extracted entities (spans)",
142
- headers=["entity", "text", "score", "start", "end"],
143
- datatype=["str", "str", "number", "number", "number"],
144
  interactive=False,
145
  wrap=True,
146
  )
147
 
148
- debug_box = gr.Textbox(label="Raw token output", lines=12, visible=True)
149
 
150
  btn.click(fn=run_ner, inputs=[text, max_length, show_tokens], outputs=[out_table, debug_box])
151
 
152
- demo.launch()
 
 
1
  # app.py
2
+ # Gradio NER demo using: RashidNLP/NER-Deberta (Few-NERD labels)
 
3
  #
4
+ # requirements.txt 建議:
5
+ # gradio>=4.0
6
+ # transformers>=4.35
7
+ # torch
8
+ # sentencepiece
9
 
10
  import gradio as gr
11
  import torch
12
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
13
 
14
+ MODEL_ID = "RashidNLP/NER-Deberta"
15
 
16
+ # Load once at startup
17
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
18
  model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
19
 
20
+ # Put model on GPU if available (Spaces 通常是 CPU)
21
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
22
+ model.to(device)
23
+ model.eval()
24
+
25
+ def merge_bio_spans(text: str, per_token):
26
+ """
27
+ per_token: list of dict {label, start, end}
28
+ returns: list of dict {entity, start, end}
29
+ """
30
+ spans = []
31
+ cur = None
32
+
33
+ def close_cur():
34
+ nonlocal cur
35
+ if cur:
36
+ spans.append(cur)
37
+ cur = None
38
+
39
+ for t in per_token:
40
+ lab = t["label"]
41
+ st, ed = t["start"], t["end"]
42
+
43
+ if lab.startswith("B-"):
44
+ close_cur()
45
+ cur = {"entity": lab[2:], "start": st, "end": ed}
46
+ elif lab.startswith("I-") and cur and cur["entity"] == lab[2:]:
47
+ cur["end"] = ed
48
+ else:
49
+ close_cur()
50
+
51
+ close_cur()
52
+ return spans
53
 
54
  def run_ner(text: str, max_length: int, show_tokens: bool):
55
  text = (text or "").strip()
56
  if not text:
57
  return [], ""
58
 
59
+ # tokenize with truncation control
60
  enc = tokenizer(
61
  text,
62
  return_tensors="pt",
 
65
  return_offsets_mapping=True,
66
  )
67
 
68
+ offsets = enc.pop("offset_mapping")[0].tolist()
69
+ input_ids = enc["input_ids"][0].tolist()
70
+ tokens = tokenizer.convert_ids_to_tokens(input_ids)
71
+
72
+ enc = {k: v.to(device) for k, v in enc.items()}
73
 
 
74
  with torch.no_grad():
75
  out = model(**enc)
76
+
77
  logits = out.logits[0] # (seq_len, num_labels)
78
  pred_ids = logits.argmax(dim=-1).tolist()
 
79
  id2label = model.config.id2label
80
 
81
+ # build per-token labels (skip specials)
 
 
82
  per_token = []
83
  for tok, pid, (st, ed) in zip(tokens, pred_ids, offsets):
84
+ if tok in tokenizer.all_special_tokens:
 
85
  continue
86
  if st == ed:
87
  continue
 
90
  "label": id2label[pid],
91
  "start": int(st),
92
  "end": int(ed),
 
93
  })
94
 
95
+ spans = merge_bio_spans(text, per_token)
 
 
 
 
 
 
 
 
 
96
 
97
+ # Return 2D list to avoid `[object Object]`
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  table_rows = []
99
  for s in spans:
100
  table_rows.append([
101
  s["entity"],
102
  text[s["start"]:s["end"]],
 
103
  s["start"],
104
  s["end"],
105
  ])
106
 
107
  debug = ""
108
  if show_tokens:
109
+ lines = ["token\tlabel\t[offsets]"]
110
  for t in per_token:
111
+ lines.append(f"{t['token']}\t{t['label']}\t[{t['start']},{t['end']}]")
112
+ debug = "\n".join(lines)
113
 
114
  return table_rows, debug
115
 
116
+ with gr.Blocks(title="NER-Deberta (Few-NERD) Demo") as demo:
117
+ gr.Markdown(f"""
118
+ # NER Demo
 
119
  Model: **{MODEL_ID}**
120
+ Note: This model uses **Few-NERD** style labels (more entity types than CoNLL-2003).
121
+ """)
 
122
 
123
  with gr.Row():
124
  max_length = gr.Slider(64, 512, value=256, step=32, label="max_length (truncate)")
 
127
  text = gr.Textbox(
128
  label="Input text",
129
  lines=10,
130
+ value="Tim Chen\nSenior Software Engineer\nApple Inc.\nTaipei, Taiwan",
131
  placeholder="Paste text here (e.g., OCR output).",
132
  )
133
 
134
  btn = gr.Button("Run NER")
135
 
136
  out_table = gr.Dataframe(
137
+ label="Entities (spans)",
138
+ headers=["entity", "text", "start", "end"],
139
+ datatype=["str", "str", "number", "number"],
140
  interactive=False,
141
  wrap=True,
142
  )
143
 
144
+ debug_box = gr.Textbox(label="Raw token output", lines=12)
145
 
146
  btn.click(fn=run_ner, inputs=[text, max_length, show_tokens], outputs=[out_table, debug_box])
147
 
148
+ if __name__ == "__main__":
149
+ demo.launch()