# Geraldine's picture
# Update app.py
# e308ce8 verified
from __future__ import annotations

import html
import json
import math
import os
from typing import Any

import gradio as gr
import requests
# API endpoint and key; both can be overridden through environment variables.
DEFAULT_API_URL = os.environ.get("IDREF_QUALINKA_API_URL", "https://idref-linker.smartbiblia.fr")
DEFAULT_API_KEY = os.environ.get("IDREF_QUALINKA_API_KEY", "")

# Model name sent to the API when the embedding mode is enabled.
EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# Keys of the extraction JSON that hold person names, in display order.
PERSON_FIELDS = [
    "author",
    "advisor",
    "jury_president",
    "reviewers",
    "committee_members",
]

# Human-readable (French) label for each person field.
FIELD_LABELS = {
    "author": "Auteur",
    "advisor": "Directeur",
    "jury_president": "President du jury",
    "reviewers": "Rapporteurs",
    "committee_members": "Membres du jury",
}
# Sample extraction payload used as the initial content of the JSON editor.
# Multi-valued person fields separate names with "|".
EXAMPLE_JSON = """{
"title": "La question de l'hygiène aux Indes-Néerlandaises",
"subtitle": "Les enjeux médicaux, culturels et sociaux",
"author": "Gani Achmad JAE LANI",
"degree_type": "Thèse de doctorat",
"discipline": "Histoire et civilisations",
"granting_institution": "École des Hautes Études en Sciences Sociales",
"co_tutelle_institutions": [],
"doctoral_school": "École doctorale de l'EHESS",
"defense_year": 2017,
"advisor": "Gérard JORLAND",
"jury_president": "",
"reviewers": "",
"committee_members": "Romain BERTRAND|Patrice BOURDELAIS|Charles ILLOUZ|Annick OPINEL|Patrick ZYLBERMAN|Gérard JORLAND",
"language": "fre",
"confidence": 0.98
}"""
def split_people(value: Any) -> list[str]:
    """Turn a raw person field into a list of clean, non-empty names.

    ``None`` yields an empty list. A list is used item by item. Any other
    value is converted to text and split on the ``|`` separator. Every item
    is stringified and stripped; items that end up blank are dropped.
    """
    if value is None:
        return []
    candidates = value if isinstance(value, list) else str(value).split("|")
    stripped = (str(entry).strip() for entry in candidates)
    return [text for text in stripped if text]
def normalize_name_key(name: str) -> str:
    """Return *name* lower-cased, with every run of whitespace collapsed to one space."""
    words = name.lower().split()
    return " ".join(words)
def parse_extraction_json(json_text: str) -> dict[str, Any]:
    """Decode the pasted extraction JSON and require a top-level object.

    Raises:
        gr.Error: if the text is not valid JSON, or if the decoded value is
            not a JSON object (a ``dict``).
    """
    try:
        decoded = json.loads(json_text)
    except json.JSONDecodeError as exc:
        raise gr.Error(f"JSON invalide: {exc}") from exc
    if isinstance(decoded, dict):
        return decoded
    raise gr.Error("Le JSON colle doit etre un objet.")
def extract_people(payload: dict[str, Any]) -> list[dict[str, Any]]:
    """Collect the unique people named in the person fields of *payload*.

    Names are deduplicated on their normalized key; the first spelling seen
    is kept. Each returned entry is ``{"name": ..., "roles": [...]}`` where
    ``roles`` lists, without repeats, the fields in which the name appears.
    """
    unique: dict[str, dict[str, Any]] = {}
    for role in PERSON_FIELDS:
        for raw_name in split_people(payload.get(role)):
            entry = unique.setdefault(
                normalize_name_key(raw_name),
                {"name": raw_name, "roles": []},
            )
            known_roles = entry["roles"]
            if role not in known_roles:
                known_roles.append(role)
    return list(unique.values())
def people_to_rows(people: list[dict[str, Any]]) -> list[list[str]]:
    """Build ``[name, roles]`` table rows, with role labels joined by ``" | "``.

    Roles without an entry in ``FIELD_LABELS`` are shown under their raw key.
    """
    return [
        [entry["name"], " | ".join(FIELD_LABELS.get(role, role) for role in entry["roles"])]
        for entry in people
    ]
def people_to_choices(people: list[dict[str, Any]]) -> list[str]:
    """Build the dropdown choices, one ``"<name> — <roles>"`` string per person.

    The name and the role labels are separated by ``" — "`` (em dash with a
    space on each side), which is the delimiter ``selected_choice_to_name``
    splits on to recover the name. Role labels are joined with ``" | "``;
    roles without an entry in ``FIELD_LABELS`` are shown under their raw key.
    """
    choices = []
    for person in people:
        labels = " | ".join(FIELD_LABELS.get(role, role) for role in person["roles"])
        # Without this separator the role text is glued to the name and the
        # whole string would be sent to the API as the person's name.
        choices.append(f"{person['name']} — {labels}")
    return choices
def selected_choice_to_name(choice: str) -> str:
    """Return the text before the first ``" — "`` in *choice*, stripped.

    When the separator is absent the whole stripped choice is returned.
    """
    name_part, _separator, _roles = choice.partition(" — ")
    return name_part.strip()
def build_align_payload(
    extraction: dict[str, Any],
    selected_name: str,
    use_embeddings: bool,
    max_candidates: int,
    max_docs_per_role: int,
    reference_top_k: int,
    accept_threshold: float,
    margin_threshold: float,
    weight_name: float,
    weight_attrra_source: float,
    weight_attrra_note: float,
    weight_references: float,
    weight_institution_year: float,
) -> dict[str, Any]:
    """Assemble the JSON body posted to the alignment endpoint.

    Document metadata is read from *extraction* and always sent as text
    (empty string when missing or falsy). ``institution`` prefers
    ``granting_institution`` and ``year`` prefers ``defense_year``, each
    falling back to the shorter key. Count parameters are cast to ``int``,
    thresholds and weights to ``float``. ``embedding_model`` is the module's
    ``EMBEDDING_MODEL`` when *use_embeddings* is true, otherwise ``""``.
    """

    def first_text(*keys: str) -> str:
        # First truthy value found under *keys*, as text; "" when none is set.
        for key in keys:
            found = extraction.get(key)
            if found:
                return str(found)
        return ""

    body: dict[str, Any] = {
        "name": selected_name,
        "title": first_text("title"),
        "subtitle": first_text("subtitle"),
        "discipline": first_text("discipline"),
        "institution": first_text("granting_institution", "institution"),
        "doctoral_school": first_text("doctoral_school"),
        "degree_type": first_text("degree_type"),
        "year": first_text("defense_year", "year"),
        "max_candidates": int(max_candidates),
        "max_docs_per_role": int(max_docs_per_role),
        "reference_top_k": int(reference_top_k),
        "embedding_model": EMBEDDING_MODEL if use_embeddings else "",
        "accept_threshold": float(accept_threshold),
        "margin_threshold": float(margin_threshold),
        "weight_name": float(weight_name),
        "weight_attrra_source": float(weight_attrra_source),
        "weight_attrra_note": float(weight_attrra_note),
        "weight_references": float(weight_references),
        "weight_institution_year": float(weight_institution_year),
    }
    return body
def status_badge(status: str) -> str:
    """Render *status* as a pill-shaped HTML badge.

    Known statuses get their own background/text colour pair; any other
    value uses the neutral grey pair. The status text is HTML-escaped.
    """
    neutral = ("#e5e7eb", "#374151")
    palette = {
        "accepted": ("#dcfce7", "#166534"),
        "ambiguous": ("#fef9c3", "#854d0e"),
        "low_confidence": ("#fee2e2", "#991b1b"),
        "not_found": neutral,
    }
    background, foreground = palette.get(status, neutral)
    style = (
        "display:inline-block;padding:4px 9px;border-radius:999px;"
        f"background:{background};color:{foreground};font-weight:700;font-size:13px"
    )
    return f'<span style="{style}">{html.escape(status)}</span>'
def score_table(candidate: dict[str, Any] | None) -> str:
    """Render a candidate's score components as a two-column HTML table.

    Returns ``""`` for a missing or empty candidate. Each component is
    formatted with four decimals; a component absent from the candidate's
    ``score`` mapping (or set to ``None``) is shown as an empty cell.
    """
    if not candidate:
        return ""
    scores = candidate.get("score") or {}
    components = ("final", "name", "attrra_source", "attrra_note", "references", "institution_year")

    def render_row(component: str) -> str:
        raw = scores.get(component)
        shown = f"{float(raw):.4f}" if raw is not None else ""
        return f"<tr><td>{html.escape(component)}</td><td><strong>{shown}</strong></td></tr>"

    body = "".join(render_row(component) for component in components)
    return (
        '<table class="score-table">'
        "<thead><tr><th>Composante</th><th>Score</th></tr></thead>"
        f"<tbody>{body}</tbody></table>"
    )
def evidence_block(candidate: dict[str, Any] | None) -> str:
    """Render the evidence attached to a candidate as a grid of HTML cards.

    Returns ``""`` for a missing or empty candidate. Four cards are always
    produced (preferred forms, best attrra source, best attrra note, best
    references). A card whose value is ``None`` or an empty list shows a
    muted placeholder, a list becomes a bullet list, and anything else a
    paragraph. All values are HTML-escaped.
    """
    if not candidate:
        return ""
    evidence = candidate.get("evidence") or {}

    def render_item(label: str, value: Any) -> str:
        if value is None or value == []:
            inner = '<span class="muted">Aucun indice</span>'
        elif isinstance(value, list):
            bullets = "".join(f"<li>{html.escape(str(entry))}</li>" for entry in value)
            inner = f"<ul>{bullets}</ul>"
        else:
            inner = f"<p>{html.escape(str(value))}</p>"
        return f"<div class='evidence-item'><h4>{label}</h4>{inner}</div>"

    sections = [
        ("Formes preferees", evidence.get("preferred_forms") or []),
        ("Meilleure source attrra", evidence.get("best_attrra_source")),
        ("Meilleure note attrra", evidence.get("best_attrra_note")),
        ("Meilleures references", evidence.get("best_references") or []),
    ]
    cards = "".join(render_item(label, value) for label, value in sections)
    return "<div class='evidence-grid'>" + cards + "</div>"
def candidates_table(candidates: list[dict[str, Any]]) -> str:
    """Render every candidate as one row of an HTML table.

    Columns are the PPN (linked to the candidate's ``url``), first name,
    last name, final score with four decimals (0 when missing) and up to
    three preferred forms joined by ``" | "``. Returns a muted placeholder
    paragraph when *candidates* is empty.
    """
    if not candidates:
        return '<p class="muted">Aucun candidat.</p>'

    def cell(value: Any) -> str:
        # The API payload is untyped: coerce to text before escaping so a
        # non-string value (e.g. a numeric PPN) cannot make html.escape raise.
        return html.escape(str(value or ""))

    rows = []
    for candidate in candidates:
        score = candidate.get("score") or {}
        forms = (candidate.get("evidence") or {}).get("preferred_forms") or []
        rows.append(
            "<tr>"
            f"<td><a href='{cell(candidate.get('url'))}' target='_blank'>{cell(candidate.get('ppn'))}</a></td>"
            f"<td>{cell(candidate.get('first_name'))}</td>"
            f"<td>{cell(candidate.get('last_name'))}</td>"
            f"<td>{float(score.get('final') or 0):.4f}</td>"
            f"<td>{html.escape(' | '.join(str(form) for form in forms[:3]))}</td>"
            "</tr>"
        )
    return (
        '<table class="candidates-table">'
        "<thead><tr><th>PPN</th><th>Prenom</th><th>Nom</th><th>Final</th><th>Formes</th></tr></thead>"
        "<tbody>"
        + "".join(rows)
        + "</tbody></table>"
    )
def score_value(candidate: dict[str, Any], key: str) -> float:
    """Return the candidate's score component *key*, clamped to ``[0.0, 1.0]``.

    A missing, falsy, non-numeric or NaN component yields ``0.0``, as does a
    candidate whose ``score`` entry is not a mapping.
    """
    try:
        value = float((candidate.get("score") or {}).get(key) or 0.0)
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Malformed candidate/score structure or a value float() cannot convert.
        return 0.0
    if math.isnan(value):
        # min()/max() comparisons with NaN are always false, so clamping
        # would silently turn NaN into a perfect 1.0.
        return 0.0
    return max(0.0, min(1.0, value))
def weighted_context_score(candidate: dict[str, Any], weights: dict[str, Any]) -> float:
    """Combine the documentary score components into one context score.

    The components ``attrra_source``, ``attrra_note``, ``references`` and
    ``institution_year`` are averaged using the weight stored under the same
    key in *weights*. Weights that are missing, unreadable or not positive
    are ignored; when no usable weight remains, the plain mean of the four
    components is returned instead.
    """
    context_keys = ("attrra_source", "attrra_note", "references", "institution_year")
    numerator = 0.0
    denominator = 0.0
    for key in context_keys:
        try:
            weight = float(weights.get(key, 0.0))
        except Exception:
            weight = 0.0
        if weight <= 0.0:
            continue
        numerator += weight * score_value(candidate, key)
        denominator += weight
    if denominator == 0.0:
        total = sum(score_value(candidate, key) for key in context_keys)
        return total / len(context_keys)
    return numerator / denominator
def candidate_profile_plot(result: dict[str, Any]) -> str:
    """Render an inline SVG scatter plot of the candidates' similarity profiles.

    Each of the first 12 candidates in ``result["candidates"]`` becomes one
    point: X is its name score, Y is its weighted context score (computed
    with ``result["score_weights"]``), and the point radius grows with its
    final score. A dark reference point is drawn at (1.0, 1.0). The SVG is
    followed by an explanatory note and a legend table listing the three
    values per candidate.

    Returns:
        The HTML fragment, or a muted placeholder paragraph when there is no
        candidate to plot.
    """
    # Only the first 12 candidates are plotted.
    candidates = (result.get("candidates") or [])[:12]
    if not candidates:
        return '<p class="muted">Aucun profil candidat a afficher.</p>'
    weights = result.get("score_weights") or {}
    # Canvas size and margins, in SVG user units (they define the viewBox).
    width = 760
    height = 420
    left = 70
    right = 34
    top = 28
    bottom = 64
    plot_w = width - left - right
    plot_h = height - top - bottom
    # NOTE(review): 8 colours for up to 12 points, so colours repeat from the 9th candidate on.
    colors = ["#2563eb", "#dc2626", "#16a34a", "#9333ea", "#ea580c", "#0891b2", "#4f46e5", "#be123c"]

    def x_pos(value: float) -> float:
        # 0 maps to the left edge of the plot area, 1 to its right edge.
        return left + value * plot_w

    def y_pos(value: float) -> float:
        # SVG y grows downwards, so 1 maps to the top edge and 0 to the bottom.
        return top + (1.0 - value) * plot_h

    # Grid lines and tick labels, shared by both axes.
    grid = []
    for tick in [0.0, 0.25, 0.5, 0.75, 1.0]:
        x = x_pos(tick)
        y = y_pos(tick)
        grid.append(f'<line x1="{x:.1f}" y1="{top}" x2="{x:.1f}" y2="{top + plot_h}" class="plot-grid" />')
        grid.append(f'<line x1="{left}" y1="{y:.1f}" x2="{left + plot_w}" y2="{y:.1f}" class="plot-grid" />')
        grid.append(f'<text x="{x:.1f}" y="{top + plot_h + 22}" class="plot-tick">{tick:.2f}</text>')
        grid.append(f'<text x="{left - 12}" y="{y + 4:.1f}" class="plot-tick" text-anchor="end">{tick:.2f}</text>')
    # Reference point at (1.0, 1.0) with its label placed to the left of it.
    points = [
        '<circle cx="{:.1f}" cy="{:.1f}" r="7" fill="#111827"><title>Profil de reference: correspondance parfaite avec le document courant</title></circle>'.format(
            x_pos(1.0),
            y_pos(1.0),
        ),
        '<text x="{:.1f}" y="{:.1f}" class="plot-label">document courant</text>'.format(
            x_pos(1.0) - 112,
            y_pos(1.0) - 10,
        ),
    ]
    legend_rows = []
    for idx, candidate in enumerate(candidates):
        score = candidate.get("score") or {}
        ppn = str(candidate.get("ppn") or "")
        # Fall back to a positional label when the candidate has no PPN.
        label = ppn or f"candidat {idx + 1}"
        x = score_value(candidate, "name")
        y = weighted_context_score(candidate, weights)
        final = float(score.get("final") or 0.0)
        color = colors[idx % len(colors)]
        # Radius between 5 and 12 for a final score in [0, 1].
        # NOTE(review): `final` is not clamped below, so a negative final
        # score shrinks the radius under 5 (and could make it negative).
        radius = 5.0 + min(7.0, final * 7.0)
        # The two adjacent literals form one template; .format fills all eight fields.
        points.append(
            '<circle cx="{:.1f}" cy="{:.1f}" r="{:.1f}" fill="{}" fill-opacity="0.82" stroke="#111827" stroke-width="1">'
            "<title>PPN {} | name {:.3f} | contexte {:.3f} | final {:.3f}</title></circle>".format(
                x_pos(x),
                y_pos(y),
                radius,
                color,
                html.escape(label),
                x,
                y,
                final,
            )
        )
        points.append(
            '<text x="{:.1f}" y="{:.1f}" class="plot-point-label">{}</text>'.format(
                x_pos(x) + 8,
                y_pos(y) - 8,
                html.escape(label),
            )
        )
        legend_rows.append(
            "<tr>"
            f"<td><span class='legend-dot' style='background:{color}'></span>{html.escape(label)}</td>"
            f"<td>{x:.3f}</td>"
            f"<td>{y:.3f}</td>"
            f"<td>{final:.3f}</td>"
            "</tr>"
        )
    # Background, grid, axes, axis labels, then the points drawn on top.
    svg = f"""
<svg class="profile-plot" viewBox="0 0 {width} {height}" role="img" aria-label="Carte des profils de similarite">
<rect x="0" y="0" width="{width}" height="{height}" fill="#ffffff" />
{''.join(grid)}
<line x1="{left}" y1="{top + plot_h}" x2="{left + plot_w}" y2="{top + plot_h}" class="plot-axis" />
<line x1="{left}" y1="{top}" x2="{left}" y2="{top + plot_h}" class="plot-axis" />
<text x="{left + plot_w / 2:.1f}" y="{height - 18}" class="plot-axis-label">similarite du nom</text>
<text x="18" y="{top + plot_h / 2:.1f}" class="plot-axis-label" transform="rotate(-90 18 {top + plot_h / 2:.1f})">similarite contexte documentaire</text>
{''.join(points)}
</svg>
"""
    return (
        '<div class="plot-wrap">'
        f"{svg}"
        '<p class="plot-note">Chaque point represente le profil de similarite d un PPN candidat par rapport au document courant. '
        "L axe X utilise le score du nom; l axe Y agrege les indices documentaires avec les coefficients actifs.</p>"
        '<table class="plot-legend"><thead><tr><th>PPN</th><th>Nom</th><th>Contexte</th><th>Final</th></tr></thead>'
        f"<tbody>{''.join(legend_rows)}</tbody></table>"
        "</div>"
    )
def render_result(result: dict[str, Any]) -> str:
    """Render the alignment API response as a readable HTML report.

    The report contains a decision panel (status badge, accepted PPN,
    similarity mode, aligned name), a panel for the best candidate when the
    response has one (link, name, score table, evidence), the table of all
    candidates and the similarity profile plot.

    Args:
        result: Decoded JSON object returned by the alignment endpoint.

    Returns:
        The HTML fragment for the whole report.
    """
    # The response is untyped JSON: every value is coerced to text before it
    # is escaped, so a non-string field (e.g. a numeric PPN) cannot raise.
    status = str(result.get("status") or "")
    best_ppn = result.get("best_ppn")
    best = result.get("best_candidate")
    similarity = result.get("similarity") or {}
    query = result.get("query") or {}
    best_html = ""
    if best:
        best_url = html.escape(str(best.get("url") or ""))
        best_id = html.escape(str(best.get("ppn") or ""))
        best_name = html.escape(str(best.get("first_name") or "") + " " + str(best.get("last_name") or ""))
        best_html = f"""
<section class="panel">
<h3>Meilleur candidat</h3>
<p><strong>PPN :</strong> <a href="{best_url}" target="_blank">{best_id}</a></p>
<p><strong>Nom :</strong> {best_name}</p>
{score_table(best)}
{evidence_block(best)}
</section>
"""
    return f"""
<div class="result">
<section class="panel">
<h3>Decision</h3>
<p>{status_badge(status)}</p>
<p><strong>PPN accepte :</strong> {html.escape(str(best_ppn)) if best_ppn else '<span class="muted">aucun</span>'}</p>
<p><strong>Mode de similarite :</strong> {html.escape(str(similarity.get('type') or ''))}
{f" / {html.escape(str(similarity.get('model')))}" if similarity.get('model') else ""}</p>
<p><strong>Nom aligne :</strong> {html.escape(str(query.get('name') or ''))}</p>
</section>
{best_html}
<section class="panel">
<h3>Tous les candidats</h3>
{candidates_table(result.get('candidates') or [])}
</section>
<section class="panel">
<h3>Carte des profils de similarite</h3>
{candidate_profile_plot(result)}
</section>
</div>
"""
def parse_json_ui(json_text: str) -> tuple[list[list[str]], gr.Dropdown, str, dict[str, Any]]:
    """Parse the pasted extraction JSON and list the people it names.

    Args:
        json_text: Extraction result as JSON text; must decode to an object.

    Returns:
        A 4-tuple: the rows of the people table, the updated person
        dropdown (first person preselected), a status message, and the
        decoded payload. When no person is found the rows and the dropdown
        are empty.
    """
    payload = parse_extraction_json(json_text)
    people = extract_people(payload)
    if people:
        choices = people_to_choices(people)
        summary = f"{len(people)} personne(s) unique(s) trouvee(s). Selectionnez une personne puis lancez l'alignement."
        return people_to_rows(people), gr.Dropdown(choices=choices, value=choices[0]), summary, payload
    return [], gr.Dropdown(choices=[], value=None), "Aucun nom trouve dans les champs personnes.", payload
def align_selected_ui(
    extraction: dict[str, Any] | None,
    selected_person: str | None,
    api_url: str,
    api_key: str,
    use_embeddings: bool,
    max_candidates: int,
    max_docs_per_role: int,
    reference_top_k: int,
    accept_threshold: float,
    margin_threshold: float,
    weight_name: float,
    weight_attrra_source: float,
    weight_attrra_note: float,
    weight_references: float,
    weight_institution_year: float,
) -> tuple[str, str]:
    """Align the selected person by calling the ``/align/person`` endpoint.

    Args:
        extraction: Decoded extraction JSON produced by the parsing step.
        selected_person: Dropdown choice identifying the person to align.
        api_url: Base URL of the alignment API (a trailing slash is ignored).
        api_key: Optional key, sent in the ``X-API-Key`` header when not blank.
        use_embeddings: Whether to request the embedding similarity mode.
        max_candidates: Forwarded to the API as ``max_candidates``.
        max_docs_per_role: Forwarded to the API as ``max_docs_per_role``.
        reference_top_k: Forwarded to the API as ``reference_top_k``.
        accept_threshold: Forwarded to the API as ``accept_threshold``.
        margin_threshold: Forwarded to the API as ``margin_threshold``.
        weight_name: Forwarded to the API as ``weight_name``.
        weight_attrra_source: Forwarded to the API as ``weight_attrra_source``.
        weight_attrra_note: Forwarded to the API as ``weight_attrra_note``.
        weight_references: Forwarded to the API as ``weight_references``.
        weight_institution_year: Forwarded to the API as ``weight_institution_year``.

    Returns:
        The rendered HTML report and the raw API response as indented JSON text.

    Raises:
        gr.Error: if no extraction or person is available, if the HTTP call
            fails, or if the response is not a JSON object.
    """
    if not extraction:
        raise gr.Error("Collez et analysez d'abord le JSON d'extraction.")
    if not selected_person:
        raise gr.Error("Selectionnez une personne a aligner.")
    selected_name = selected_choice_to_name(selected_person)
    payload = build_align_payload(
        extraction,
        selected_name,
        use_embeddings,
        max_candidates,
        max_docs_per_role,
        reference_top_k,
        accept_threshold,
        margin_threshold,
        weight_name,
        weight_attrra_source,
        weight_attrra_note,
        weight_references,
        weight_institution_year,
    )
    base_url = api_url.rstrip("/")
    headers = {"Content-Type": "application/json"}
    clean_key = api_key.strip()
    if clean_key:
        headers["X-API-Key"] = clean_key
    try:
        response = requests.post(f"{base_url}/align/person", headers=headers, json=payload, timeout=180)
        response.raise_for_status()
    except requests.RequestException as exc:
        raise gr.Error(f"Erreur lors de l'appel API: {exc}") from exc
    # Decode outside the transport handler: recent versions of requests raise
    # a JSONDecodeError that is also a RequestException, which would otherwise
    # be reported as a call failure instead of an invalid-JSON response.
    try:
        result = response.json()
    except ValueError as exc:
        raise gr.Error("La reponse API n'est pas un JSON valide.") from exc
    if not isinstance(result, dict):
        # The renderer reads the response with dict lookups only.
        raise gr.Error("La reponse API n'est pas un objet JSON.")
    return render_result(result), json.dumps(result, ensure_ascii=False, indent=2)
# Stylesheet handed to gr.Blocks(css=...): page width, result panels, score
# and candidate tables, evidence cards, and the SVG profile plot with its legend.
CSS = """
.gradio-container {
max-width: 1280px !important;
}
.muted {
color: #6b7280;
}
.panel {
border: 1px solid #e5e7eb;
border-radius: 8px;
padding: 14px 16px;
margin: 12px 0;
background: #fff;
}
.panel h3 {
margin-top: 0;
}
.score-table,
.candidates-table {
width: 100%;
border-collapse: collapse;
margin-top: 10px;
}
.score-table th,
.score-table td,
.candidates-table th,
.candidates-table td {
border-bottom: 1px solid #e5e7eb;
padding: 7px 8px;
text-align: left;
vertical-align: top;
}
.score-table th,
.candidates-table th {
background: #f9fafb;
font-weight: 700;
}
.evidence-grid {
display: grid;
grid-template-columns: repeat(2, minmax(0, 1fr));
gap: 10px;
margin-top: 12px;
}
.evidence-item {
border: 1px solid #e5e7eb;
border-radius: 8px;
padding: 10px;
background: #f9fafb;
}
.evidence-item h4 {
margin: 0 0 6px 0;
}
.evidence-item p,
.evidence-item ul {
margin: 0;
}
.plot-wrap {
overflow-x: auto;
}
.profile-plot {
width: 100%;
max-width: 760px;
min-width: 560px;
display: block;
}
.plot-grid {
stroke: #e5e7eb;
stroke-width: 1;
}
.plot-axis {
stroke: #111827;
stroke-width: 1.4;
}
.plot-tick,
.plot-label,
.plot-point-label,
.plot-axis-label {
fill: #374151;
font-size: 12px;
}
.plot-axis-label {
font-weight: 700;
}
.plot-point-label {
font-size: 11px;
}
.plot-note {
color: #6b7280;
font-size: 13px;
margin: 8px 0 10px 0;
}
.plot-legend {
width: 100%;
max-width: 760px;
border-collapse: collapse;
}
.plot-legend th,
.plot-legend td {
border-bottom: 1px solid #e5e7eb;
padding: 6px 8px;
text-align: left;
}
.legend-dot {
display: inline-block;
width: 10px;
height: 10px;
border-radius: 999px;
margin-right: 7px;
}
@media (max-width: 800px) {
.evidence-grid {
grid-template-columns: 1fr;
}
}
"""
# UI definition: JSON editor and API settings, then the extracted people and
# the alignment result.
# NOTE(review): this file carried no indentation; the nesting of rows, columns
# and the accordion below is reconstructed from the component order — confirm
# it against the intended layout.
with gr.Blocks(css=CSS, title="IdRef Qualinka Alignment") as demo:
    # Holds the decoded extraction JSON between the two button clicks.
    extraction_state = gr.State({})
    gr.Markdown(
        """
# Alignement IdRef / Qualinka
Collez le JSON produit par l'extraction VLM, analysez les personnes,
puis lancez l'alignement IdRef personne par personne.
"""
    )
    with gr.Row():
        with gr.Column(scale=3):
            json_input = gr.Code(
                value=EXAMPLE_JSON,
                language="json",
                label="JSON d'extraction",
                lines=22,
            )
        with gr.Column(scale=2):
            api_url = gr.Textbox(value=DEFAULT_API_URL, label="URL de l'API IdRef Qualinka")
            api_key = gr.Textbox(value=DEFAULT_API_KEY, label="API key", type="password")
            use_embeddings = gr.Checkbox(
                value=False,
                label=f"Utiliser le mode embedding ({EMBEDDING_MODEL})",
            )
            with gr.Accordion("Parametres avances", open=False):
                max_candidates = gr.Slider(1, 100, value=20, step=1, label="Max candidats")
                max_docs_per_role = gr.Slider(0, 200, value=20, step=1, label="Max references par role")
                reference_top_k = gr.Slider(1, 20, value=3, step=1, label="Top-k references")
                accept_threshold = gr.Slider(0.0, 1.0, value=0.65, step=0.01, label="Seuil d'acceptation")
                margin_threshold = gr.Slider(0.0, 1.0, value=0.08, step=0.01, label="Marge d'ambiguite")
                gr.Markdown("### Coefficients du score final")
                # The default weights below sum to 1.0.
                weight_name = gr.Slider(0.0, 1.0, value=0.40, step=0.01, label="Poids nom")
                weight_attrra_source = gr.Slider(0.0, 1.0, value=0.25, step=0.01, label="Poids source attrra")
                weight_attrra_note = gr.Slider(0.0, 1.0, value=0.15, step=0.01, label="Poids note attrra")
                weight_references = gr.Slider(0.0, 1.0, value=0.15, step=0.01, label="Poids references")
                weight_institution_year = gr.Slider(0.0, 1.0, value=0.05, step=0.01, label="Poids institution / annee")
    parse_btn = gr.Button("1. Analyser les personnes", variant="primary")
    parse_status = gr.Markdown()
    with gr.Row():
        people_table = gr.Dataframe(
            headers=["Nom", "Roles extraits"],
            datatype=["str", "str"],
            label="Personnes uniques extraites",
            interactive=False,
            wrap=True,
        )
        selected_person = gr.Dropdown(label="Personne a aligner", choices=[])
    align_btn = gr.Button("2. Lancer l'alignement pour la personne selectionnee", variant="primary")
    result_html = gr.HTML(label="Resultat lisible")
    raw_json = gr.Code(language="json", label="Reponse API brute", lines=18)
    # Step 1: parse the JSON, fill the table and dropdown, store the payload.
    parse_btn.click(
        parse_json_ui,
        inputs=[json_input],
        outputs=[people_table, selected_person, parse_status, extraction_state],
    )
    # Step 2: call the API for the selected person; the input order must
    # match the parameter order of align_selected_ui.
    align_btn.click(
        align_selected_ui,
        inputs=[
            extraction_state,
            selected_person,
            api_url,
            api_key,
            use_embeddings,
            max_candidates,
            max_docs_per_role,
            reference_top_k,
            accept_threshold,
            margin_threshold,
            weight_name,
            weight_attrra_source,
            weight_attrra_note,
            weight_references,
            weight_institution_year,
        ],
        outputs=[result_html, raw_json],
    )
# Launched at import time; mcp_server=True asks Gradio to also expose the app
# through its MCP server (see the Gradio MCP guide).
demo.launch(mcp_server=True)