Spaces:
Running
Running
Commit ·
ea5bb02
1
Parent(s): 1d72cba
auto annotation
Browse files
app.py
CHANGED
|
@@ -9,12 +9,20 @@ subprocess.run(["git", "clone", "https://github.com/robinarmingaud/glidre"])
|
|
| 9 |
subprocess.run(["pip", "install", "./glidre"])
|
| 10 |
subprocess.run(["pip", "install", "networkx"])
|
| 11 |
subprocess.run(["pip", "install", "matplotlib"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
import networkx as nx
|
| 15 |
import matplotlib.pyplot as plt
|
| 16 |
from PIL import Image
|
| 17 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
|
|
@@ -54,7 +62,6 @@ def df_to_mentions(df_rows, text: str = "") -> tuple:
|
|
| 54 |
start = None
|
| 55 |
end = None
|
| 56 |
|
| 57 |
-
# Auto-detect span via regex when start/end are missing
|
| 58 |
if (start is None or end is None) and value and text:
|
| 59 |
match = re.search(re.escape(value), text)
|
| 60 |
if match:
|
|
@@ -71,6 +78,23 @@ def df_to_mentions(df_rows, text: str = "") -> tuple:
|
|
| 71 |
return list(entity_map.values()), warnings
|
| 72 |
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
def relations_to_table(relations: List[Dict[str, Any]]) -> List[List[Any]]:
|
| 75 |
rows = []
|
| 76 |
for r in relations:
|
|
@@ -154,23 +178,13 @@ with gr.Blocks(title="GLiDRE — Gradio demo") as demo:
|
|
| 154 |
with gr.Column(scale=6):
|
| 155 |
text_input = gr.Textbox(label="Document text", value="The Loud Tour was the fourth overall and third world concert tour by Barbadian recording artist Rihanna.", lines=6)
|
| 156 |
labels_input = gr.Textbox(label="Relation labels (comma-separated)", value="COUNTRY_OF_CITIZENSHIP, PUBLICATION_DATE, PART_OF")
|
| 157 |
-
gr.
|
| 158 |
-
**Mentions Table** — Each row describes one surface form (mention) of an entity.
|
| 159 |
-
|
| 160 |
-
| Column | Description |
|
| 161 |
-
|--------|-------------|
|
| 162 |
-
| `id` | Entity identifier — **repeat the same `id`** across rows to add multiple mentions of the same entity |
|
| 163 |
-
| `type` | Entity type (e.g. `PER`, `ORG`, `LOC`, `MISC`) (optional) |
|
| 164 |
-
| `value` | The exact substring as it appears in the text |
|
| 165 |
-
| `start` | Start character index (optional but recommended) |
|
| 166 |
-
| `end` | End character index (optional but recommended) |
|
| 167 |
-
""")
|
| 168 |
entities_df = gr.Dataframe(headers=["id", "type", "value", "start", "end"], datatype=["number", "text", "text", "number", "number"], interactive=True, label="Mentions", column_count=5)
|
| 169 |
with gr.Row():
|
| 170 |
threshold = gr.Slider(0.0, 1.0, value=0.3, label="Threshold", step = 0.05)
|
| 171 |
multi_label = gr.Checkbox(label="Allow multi-label (one mention pair can have multiple relations)", value=True)
|
| 172 |
|
| 173 |
-
run = gr.Button("Run prediction")
|
| 174 |
|
| 175 |
gr.Examples(
|
| 176 |
label="Examples (click to load)",
|
|
@@ -198,6 +212,7 @@ with gr.Blocks(title="GLiDRE — Gradio demo") as demo:
|
|
| 198 |
graph_out = gr.Image(label="Relation graph", type="pil")
|
| 199 |
raw_json_out = gr.Textbox(label="Raw JSON output", lines=12)
|
| 200 |
|
|
|
|
| 201 |
run.click(fn=predict, inputs=[text_input, labels_input, entities_df, threshold, multi_label], outputs=[relations_table, graph_out, raw_json_out])
|
| 202 |
|
| 203 |
|
|
|
|
| 9 |
subprocess.run(["pip", "install", "./glidre"])
|
| 10 |
subprocess.run(["pip", "install", "networkx"])
|
| 11 |
subprocess.run(["pip", "install", "matplotlib"])
|
| 12 |
+
subprocess.run(["pip", "install", "spacy"])
|
| 13 |
+
subprocess.run([
|
| 14 |
+
"pip", "install",
|
| 15 |
+
"https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.8.0/xx_ent_wiki_sm-3.8.0-py3-none-any.whl"
|
| 16 |
+
])
|
| 17 |
|
| 18 |
|
| 19 |
import networkx as nx
|
| 20 |
import matplotlib.pyplot as plt
|
| 21 |
from PIL import Image
|
| 22 |
import pandas as pd
|
| 23 |
+
import spacy
|
| 24 |
+
|
| 25 |
+
nlp = spacy.load("xx_ent_wiki_sm")
|
| 26 |
|
| 27 |
|
| 28 |
|
|
|
|
| 62 |
start = None
|
| 63 |
end = None
|
| 64 |
|
|
|
|
| 65 |
if (start is None or end is None) and value and text:
|
| 66 |
match = re.search(re.escape(value), text)
|
| 67 |
if match:
|
|
|
|
| 78 |
return list(entity_map.values()), warnings
|
| 79 |
|
| 80 |
|
| 81 |
+
def auto_annotate(text: str):
    """Build mention rows from spaCy named-entity recognition output.

    Runs the module-level ``nlp`` pipeline over ``text`` and emits one row per
    detected entity span, in the order the pipeline reports them. Spans whose
    surface string is exactly identical share one entity id; ids are handed
    out starting at 0 in order of first appearance.

    Args:
        text: Raw document text to annotate.

    Returns:
        A list of ``[id, label, surface, start_char, end_char]`` rows, or an
        empty list when ``text`` is empty or contains only whitespace.
    """
    # Nothing to annotate: skip the pipeline call entirely.
    if not (text and text.strip()):
        return []

    ids_by_surface: Dict[str, int] = {}
    table = []
    for span in nlp(text).ents:
        mention = span.text
        # The default (current dict size) is only stored for a surface string
        # seen for the first time, so ids count up 0, 1, 2, ... and repeated
        # surfaces reuse the id assigned on first sight.
        entity_id = ids_by_surface.setdefault(mention, len(ids_by_surface))
        table.append(
            [entity_id, span.label_, mention, span.start_char, span.end_char]
        )
    return table
|
| 96 |
+
|
| 97 |
+
|
| 98 |
def relations_to_table(relations: List[Dict[str, Any]]) -> List[List[Any]]:
|
| 99 |
rows = []
|
| 100 |
for r in relations:
|
|
|
|
| 178 |
with gr.Column(scale=6):
|
| 179 |
text_input = gr.Textbox(label="Document text", value="The Loud Tour was the fourth overall and third world concert tour by Barbadian recording artist Rihanna.", lines=6)
|
| 180 |
labels_input = gr.Textbox(label="Relation labels (comma-separated)", value="COUNTRY_OF_CITIZENSHIP, PUBLICATION_DATE, PART_OF")
|
| 181 |
+
annotate_btn = gr.Button("🔍 Auto-annotate using spaCy", variant="secondary")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
entities_df = gr.Dataframe(headers=["id", "type", "value", "start", "end"], datatype=["number", "text", "text", "number", "number"], interactive=True, label="Mentions", column_count=5)
|
| 183 |
with gr.Row():
|
| 184 |
threshold = gr.Slider(0.0, 1.0, value=0.3, label="Threshold", step = 0.05)
|
| 185 |
multi_label = gr.Checkbox(label="Allow multi-label (one mention pair can have multiple relations)", value=True)
|
| 186 |
|
| 187 |
+
run = gr.Button("▶ Run prediction", variant="primary", elem_id="run-btn")
|
| 188 |
|
| 189 |
gr.Examples(
|
| 190 |
label="Examples (click to load)",
|
|
|
|
| 212 |
graph_out = gr.Image(label="Relation graph", type="pil")
|
| 213 |
raw_json_out = gr.Textbox(label="Raw JSON output", lines=12)
|
| 214 |
|
| 215 |
+
annotate_btn.click(fn=auto_annotate, inputs=[text_input], outputs=[entities_df])
|
| 216 |
run.click(fn=predict, inputs=[text_input, labels_input, entities_df, threshold, multi_label], outputs=[relations_table, graph_out, raw_json_out])
|
| 217 |
|
| 218 |
|