rarmingaud committed on
Commit
ea5bb02
·
1 Parent(s): 1d72cba

auto annotation

Browse files
Files changed (1) hide show
  1. app.py +28 -13
app.py CHANGED
@@ -9,12 +9,20 @@ subprocess.run(["git", "clone", "https://github.com/robinarmingaud/glidre"])
9
  subprocess.run(["pip", "install", "./glidre"])
10
  subprocess.run(["pip", "install", "networkx"])
11
  subprocess.run(["pip", "install", "matplotlib"])
 
 
 
 
 
12
 
13
 
14
  import networkx as nx
15
  import matplotlib.pyplot as plt
16
  from PIL import Image
17
  import pandas as pd
 
 
 
18
 
19
 
20
 
@@ -54,7 +62,6 @@ def df_to_mentions(df_rows, text: str = "") -> tuple:
54
  start = None
55
  end = None
56
 
57
- # Auto-detect span via regex when start/end are missing
58
  if (start is None or end is None) and value and text:
59
  match = re.search(re.escape(value), text)
60
  if match:
@@ -71,6 +78,23 @@ def df_to_mentions(df_rows, text: str = "") -> tuple:
71
  return list(entity_map.values()), warnings
72
 
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  def relations_to_table(relations: List[Dict[str, Any]]) -> List[List[Any]]:
75
  rows = []
76
  for r in relations:
@@ -154,23 +178,13 @@ with gr.Blocks(title="GLiDRE — Gradio demo") as demo:
154
  with gr.Column(scale=6):
155
  text_input = gr.Textbox(label="Document text", value="The Loud Tour was the fourth overall and third world concert tour by Barbadian recording artist Rihanna.", lines=6)
156
  labels_input = gr.Textbox(label="Relation labels (comma-separated)", value="COUNTRY_OF_CITIZENSHIP, PUBLICATION_DATE, PART_OF")
157
- gr.Markdown("""
158
- **Mentions Table** — Each row describes one surface form (mention) of an entity.
159
-
160
- | Column | Description |
161
- |--------|-------------|
162
- | `id` | Entity identifier — **repeat the same `id`** across rows to add multiple mentions of the same entity |
163
- | `type` | Entity type (e.g. `PER`, `ORG`, `LOC`, `MISC`) (optional) |
164
- | `value` | The exact substring as it appears in the text |
165
- | `start` | Start character index (optional but recommended) |
166
- | `end` | End character index (optional but recommended) |
167
- """)
168
  entities_df = gr.Dataframe(headers=["id", "type", "value", "start", "end"], datatype=["number", "text", "text", "number", "number"], interactive=True, label="Mentions", column_count=5)
169
  with gr.Row():
170
  threshold = gr.Slider(0.0, 1.0, value=0.3, label="Threshold", step = 0.05)
171
  multi_label = gr.Checkbox(label="Allow multi-label (one mention pair can have multiple relations)", value=True)
172
 
173
- run = gr.Button("Run prediction")
174
 
175
  gr.Examples(
176
  label="Examples (click to load)",
@@ -198,6 +212,7 @@ with gr.Blocks(title="GLiDRE — Gradio demo") as demo:
198
  graph_out = gr.Image(label="Relation graph", type="pil")
199
  raw_json_out = gr.Textbox(label="Raw JSON output", lines=12)
200
 
 
201
  run.click(fn=predict, inputs=[text_input, labels_input, entities_df, threshold, multi_label], outputs=[relations_table, graph_out, raw_json_out])
202
 
203
 
 
9
  subprocess.run(["pip", "install", "./glidre"])
10
  subprocess.run(["pip", "install", "networkx"])
11
  subprocess.run(["pip", "install", "matplotlib"])
12
+ subprocess.run(["pip", "install", "spacy"])
13
+ subprocess.run([
14
+ "pip", "install",
15
+ "https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.8.0/xx_ent_wiki_sm-3.8.0-py3-none-any.whl"
16
+ ])
17
 
18
 
19
  import networkx as nx
20
  import matplotlib.pyplot as plt
21
  from PIL import Image
22
  import pandas as pd
23
+ import spacy
24
+
25
+ nlp = spacy.load("xx_ent_wiki_sm")
26
 
27
 
28
 
 
62
  start = None
63
  end = None
64
 
 
65
  if (start is None or end is None) and value and text:
66
  match = re.search(re.escape(value), text)
67
  if match:
 
78
  return list(entity_map.values()), warnings
79
 
80
 
81
def auto_annotate(text: str) -> List[List[Any]]:
    """Produce mention rows for the "Mentions" table from spaCy NER output.

    Runs the module-level ``nlp`` pipeline over ``text`` and emits one row per
    detected entity span, in the column order used by the mentions dataframe:
    ``[id, type, value, start, end]``.

    Mentions with an identical surface string share the same ``id``; ids are
    assigned in order of first appearance, starting at 0.

    Args:
        text: Document text to annotate.

    Returns:
        A list of ``[id, label, surface, start_char, end_char]`` rows, or an
        empty list when ``text`` is empty or whitespace-only.
    """
    # Nothing to annotate: skip the pipeline call entirely.
    if not text or not text.strip():
        return []

    surface_to_id: Dict[str, int] = {}
    rows: List[List[Any]] = []
    for ent in nlp(text).ents:
        surface = ent.text
        # First sighting of a surface form gets the next free id (the current
        # dict size); repeated sightings reuse the id already stored.
        entity_id = surface_to_id.setdefault(surface, len(surface_to_id))
        rows.append([entity_id, ent.label_, surface, ent.start_char, ent.end_char])
    return rows
96
+
97
+
98
  def relations_to_table(relations: List[Dict[str, Any]]) -> List[List[Any]]:
99
  rows = []
100
  for r in relations:
 
178
  with gr.Column(scale=6):
179
  text_input = gr.Textbox(label="Document text", value="The Loud Tour was the fourth overall and third world concert tour by Barbadian recording artist Rihanna.", lines=6)
180
  labels_input = gr.Textbox(label="Relation labels (comma-separated)", value="COUNTRY_OF_CITIZENSHIP, PUBLICATION_DATE, PART_OF")
181
+ annotate_btn = gr.Button("🔍 Auto-annotate using spaCy", variant="secondary")
 
 
 
 
 
 
 
 
 
 
182
  entities_df = gr.Dataframe(headers=["id", "type", "value", "start", "end"], datatype=["number", "text", "text", "number", "number"], interactive=True, label="Mentions", column_count=5)
183
  with gr.Row():
184
  threshold = gr.Slider(0.0, 1.0, value=0.3, label="Threshold", step = 0.05)
185
  multi_label = gr.Checkbox(label="Allow multi-label (one mention pair can have multiple relations)", value=True)
186
 
187
+ run = gr.Button("Run prediction", variant="primary", elem_id="run-btn")
188
 
189
  gr.Examples(
190
  label="Examples (click to load)",
 
212
  graph_out = gr.Image(label="Relation graph", type="pil")
213
  raw_json_out = gr.Textbox(label="Raw JSON output", lines=12)
214
 
215
+ annotate_btn.click(fn=auto_annotate, inputs=[text_input], outputs=[entities_df])
216
  run.click(fn=predict, inputs=[text_input, labels_input, entities_df, threshold, multi_label], outputs=[relations_table, graph_out, raw_json_out])
217
 
218