magicboker committed on
Commit
5bdae6a
·
verified ·
1 Parent(s): 85a5e50

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -5
app.py CHANGED
@@ -22,9 +22,12 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
22
  model.to(device)
23
  model.eval()
24
 
25
- def merge_bio_spans(text: str, per_token):
26
  """
27
  per_token: list of dict {label, start, end}
 
 
 
28
  returns: list of dict {entity, start, end}
29
  """
30
  spans = []
@@ -40,13 +43,37 @@ def merge_bio_spans(text: str, per_token):
40
  lab = t["label"]
41
  st, ed = t["start"], t["end"]
42
 
 
 
 
 
 
 
 
 
 
43
  if lab.startswith("B-"):
44
  close_cur()
45
  cur = {"entity": lab[2:], "start": st, "end": ed}
46
- elif lab.startswith("I-") and cur and cur["entity"] == lab[2:]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  cur["end"] = ed
48
  else:
49
  close_cur()
 
50
 
51
  close_cur()
52
  return spans
@@ -81,8 +108,8 @@ def run_ner(text: str, max_length: int, show_tokens: bool):
81
  # build per-token labels (skip specials)
82
  per_token = []
83
  for tok, pid, (st, ed) in zip(tokens, pred_ids, offsets):
84
- if tok in tokenizer.all_special_tokens:
85
- continue
86
  if st == ed:
87
  continue
88
  per_token.append({
@@ -92,7 +119,7 @@ def run_ner(text: str, max_length: int, show_tokens: bool):
92
  "end": int(ed),
93
  })
94
 
95
- spans = merge_bio_spans(text, per_token)
96
 
97
  # Return 2D list to avoid `[object Object]`
98
  table_rows = []
 
22
  model.to(device)
23
  model.eval()
24
 
25
+ def merge_spans(text: str, per_token):
26
  """
27
  per_token: list of dict {label, start, end}
28
+ Supports:
29
+ - BIO labels: B-XXX / I-XXX / O
30
+ - Non-BIO labels: XXX / O
31
  returns: list of dict {entity, start, end}
32
  """
33
  spans = []
 
43
  lab = t["label"]
44
  st, ed = t["start"], t["end"]
45
 
46
+ # normalize
47
+ if lab is None:
48
+ lab = "O"
49
+
50
+ if lab == "O":
51
+ close_cur()
52
+ continue
53
+
54
+ # BIO case
55
  if lab.startswith("B-"):
56
  close_cur()
57
  cur = {"entity": lab[2:], "start": st, "end": ed}
58
+ continue
59
+
60
+ if lab.startswith("I-"):
61
+ ent = lab[2:]
62
+ if cur and cur["entity"] == ent:
63
+ cur["end"] = ed
64
+ else:
65
+ # treat as a new span if I- appears without proper B-
66
+ close_cur()
67
+ cur = {"entity": ent, "start": st, "end": ed}
68
+ continue
69
+
70
+ # Non-BIO case: label like "person" / "ORG" / etc.
71
+ ent = lab
72
+ if cur and cur["entity"] == ent:
73
  cur["end"] = ed
74
  else:
75
  close_cur()
76
+ cur = {"entity": ent, "start": st, "end": ed}
77
 
78
  close_cur()
79
  return spans
 
108
  # build per-token labels (skip specials)
109
  per_token = []
110
  for tok, pid, (st, ed) in zip(tokens, pred_ids, offsets):
111
+ if tok in tokenizer.all_special_tokens:
112
+ continue
113
  if st == ed:
114
  continue
115
  per_token.append({
 
119
  "end": int(ed),
120
  })
121
 
122
+ spans = merge_spans(text, per_token)
123
 
124
  # Return 2D list to avoid `[object Object]`
125
  table_rows = []