GLiDRE

Running

App Files Files Community

GLiDRE / app.py

rarmingaud

auto annotation

ea5bb02 16 days ago

raw

history blame contribute delete

8.82 kB

	import io
	import json
	import re
	from typing import Any, Dict, List

	import gradio as gr
	import subprocess
	subprocess.run(["git", "clone", "https://github.com/robinarmingaud/glidre"])
	subprocess.run(["pip", "install", "./glidre"])
	subprocess.run(["pip", "install", "networkx"])
	subprocess.run(["pip", "install", "matplotlib"])
	subprocess.run(["pip", "install", "spacy"])
	subprocess.run([
	"pip", "install",
	"https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.8.0/xx_ent_wiki_sm-3.8.0-py3-none-any.whl"
	])


	import networkx as nx
	import matplotlib.pyplot as plt
	from PIL import Image
	import pandas as pd
	import spacy

	nlp = spacy.load("xx_ent_wiki_sm")



	from glidre import GLiDRE

	MODEL_NAME = "cea-list-ia/glidre_multi"
	model = GLiDRE.from_pretrained(MODEL_NAME)

	def parse_labels(labels_text: str) -> List[str]:
	if not labels_text:
	return []
	parts = [p.strip() for p in labels_text.replace("\n", ",").split(",")]
	parts = [p for p in parts if p]
	return [p.upper() for p in parts]

	def df_to_mentions(df_rows, text: str = "") -> tuple:
	if isinstance(df_rows, pd.DataFrame):
	df_rows = df_rows.values.tolist()

	entity_map: Dict[int, Dict[str, Any]] = {}
	warnings: List[str] = []
	auto_id = 0
	for row in df_rows:
	if not row or all(v is None or str(v).strip() == "" for v in row):
	continue
	try:
	rid = int(row[0]) if row[0] is not None and str(row[0]).strip() != "" else auto_id
	except Exception:
	rid = auto_id
	auto_id = max(auto_id, rid) + 1
	rtype = (str(row[1]).strip() if len(row) > 1 and row[1] not in (None, "") else "MISC") or "MISC"
	value = str(row[2]).strip() if len(row) > 2 and row[2] is not None else ""
	try:
	start = int(row[3]) if len(row) > 3 and row[3] not in (None, "") and str(row[3]).strip() != "" else None
	end = int(row[4]) if len(row) > 4 and row[4] not in (None, "") and str(row[4]).strip() != "" else None
	except Exception:
	start = None
	end = None

	if (start is None or end is None) and value and text:
	match = re.search(re.escape(value), text)
	if match:
	start, end = match.start(), match.end()
	else:
	warnings.append(f"Could not find '{value}' in text; span left as None.")

	m: Dict[str, Any] = {"value": value, "start": start, "end": end}

	if rid not in entity_map:
	entity_map[rid] = {"id": rid, "type": rtype, "mentions": []}
	entity_map[rid]["mentions"].append(m)

	return list(entity_map.values()), warnings


	def auto_annotate(text: str):
	if not text or not text.strip():
	return []
	doc = nlp(text)
	span_to_id: Dict[str, int] = {}
	rows = []
	next_id = 0
	for ent in doc.ents:
	surface = ent.text
	if surface not in span_to_id:
	span_to_id[surface] = next_id
	next_id += 1
	eid = span_to_id[surface]
	rows.append([eid, ent.label_, surface, ent.start_char, ent.end_char])
	return rows


	def relations_to_table(relations: List[Dict[str, Any]]) -> List[List[Any]]:
	rows = []
	for r in relations:
	e1 = r.get("entity_1", {})
	e2 = r.get("entity_2", {})
	rows.append([
	e1[0].get("id"),
	e1[0].get("text", ""),
	r.get("relation_type", ""),
	e2[0].get("id", ""),
	e2[0].get("text", ""),
	r.get("score", ""),
	])
	return rows


	def draw_relation_graph(relations: List[Dict[str, Any]]):
	G = nx.MultiDiGraph()
	node_labels = {}
	for r in relations:
	e1 = r.get("entity_1", {})
	e2 = r.get("entity_2", {})
	id1 = str(e1[0].get("id"))
	id2 = str(e2[0].get("id"))
	val1 = e1[0].get("text", id1)
	val2 = e2[0].get("text", id2)
	node_labels[id1] = f"{val1}\n({id1})"
	node_labels[id2] = f"{val2}\n({id2})"
	relation_label = r.get("relation_type", "")
	G.add_node(id1)
	G.add_node(id2)
	G.add_edge(id1, id2, label=relation_label)

	plt.figure(figsize=(8, 6))
	pos = nx.spring_layout(G, k=0.8)
	nx.draw_networkx_nodes(G, pos, node_size=1200)
	nx.draw_networkx_labels(G, pos, labels=node_labels)
	nx.draw_networkx_edges(G, pos, arrows=True)
	edge_labels = dict(((u, v), data["label"]) for u, v, key, data in G.edges(data=True, keys=True))
	nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
	plt.axis("off")
	buf = io.BytesIO()
	plt.tight_layout()
	plt.savefig(buf, format="png", bbox_inches="tight")
	plt.close()
	buf.seek(0)
	return Image.open(buf)


	def predict(text: str, labels_text: str, entities_df, threshold: float, multi_label: bool):
	labels = parse_labels(labels_text)

	mentions, warnings = df_to_mentions(entities_df, text=text)

	if not mentions:
	msg = "⚠️ No valid mentions found. Please fill in the Mentions table with id, value for each entity."
	if warnings:
	msg += "\n" + "\n".join(warnings)
	return ([], None, f"<pre>{msg}</pre>", "")
	relations = model.predict_entities(text=text, labels=labels, mentions=mentions, threshold=threshold, multi_label=multi_label)

	table = relations_to_table(relations)
	graph_img = draw_relation_graph(relations)

	warning_html = ""
	if warnings:
	warning_html = "<p style='color:orange'>⚠️ " + "<br>".join(warnings) + "</p>"

	raw_json = json.dumps(relations, indent=2)
	return (table, graph_img, raw_json)


	with gr.Blocks(title="GLiDRE — Gradio demo") as demo:
	gr.HTML("""
	<div id="logo" style="display: flex;align-items: center">
	<img src="/gradio_api/file=images/LIST.png" alt="Logo" width="200">
	</div>
	""")
	gr.Markdown("# GLiDRE DEMO")
	with gr.Row():
	with gr.Column(scale=6):
	text_input = gr.Textbox(label="Document text", value="The Loud Tour was the fourth overall and third world concert tour by Barbadian recording artist Rihanna.", lines=6)
	labels_input = gr.Textbox(label="Relation labels (comma-separated)", value="COUNTRY_OF_CITIZENSHIP, PUBLICATION_DATE, PART_OF")
	annotate_btn = gr.Button("🔍 Auto-annotate using spaCy", variant="secondary")
	entities_df = gr.Dataframe(headers=["id", "type", "value", "start", "end"], datatype=["number", "text", "text", "number", "number"], interactive=True, label="Mentions", column_count=5)
	with gr.Row():
	threshold = gr.Slider(0.0, 1.0, value=0.3, label="Threshold", step = 0.05)
	multi_label = gr.Checkbox(label="Allow multi-label (one mention pair can have multiple relations)", value=True)

	run = gr.Button("▶ Run prediction", variant="primary", elem_id="run-btn")

	gr.Examples(
	label="Examples (click to load)",
	examples=[
	[
	"Rihanna released her album in 2016. The artist won several awards that year.",
	"AWARD_RECEIVED, RELEASE_DATE",
	[[0, "PERSON", "Rihanna", 0, 7], [0, "PERSON", "artist", 40, 46], [1, "MISC", "album", 21, 26], [2, "DATE", "2016", 30, 34]]
	],
	[
	"Steve Jobs co-founded Apple. Jobs also served as its CEO until 2011.",
	"FOUNDER, EMPLOYEE_OF, END_DATE",
	[[0, "PERSON", "Steve Jobs", 0, 10], [0, "PERSON", "Jobs", 29, 33], [1, "ORG", "Apple", 22, 27], [2, "DATE", "2011", 63, 67]]
	],
	[
	"Marie Curie, born in Warsaw, won the Nobel Prize in Physics in 1903.",
	"PLACE_OF_BIRTH, AWARD_RECEIVED",
	[[0, "PERSON", "Marie Curie", 0, 11], [1, "LOC", "Warsaw", 21, 27], [2, "MISC", "Nobel Prize in Physics", 37, 59], [3, "DATE", "1903", 63, 67]]
	]
	],
	inputs=[text_input, labels_input, entities_df]
	)
	with gr.Column(scale=4):
	relations_table = gr.Dataframe(headers=["e1_id", "e1_text", "relation", "e2_id", "e2_text", "score"], interactive=False)
	graph_out = gr.Image(label="Relation graph", type="pil")
	raw_json_out = gr.Textbox(label="Raw JSON output", lines=12)

	annotate_btn.click(fn=auto_annotate, inputs=[text_input], outputs=[entities_df])
	run.click(fn=predict, inputs=[text_input, labels_input, entities_df, threshold, multi_label], outputs=[relations_table, graph_out, raw_json_out])


	if __name__ == "__main__":
	demo.launch(favicon_path="images/LIST.svg", debug=True, allowed_paths=["images"])