from __future__ import annotations

import html
import json
import os
import unicodedata
from typing import Any

import gradio as gr
import requests
# Default API URL; the IDREF_QUALINKA_API_URL environment variable overrides the built-in value.
DEFAULT_API_URL = os.getenv("IDREF_QUALINKA_API_URL", "https://idref-linker.smartbiblia.fr")
# Default API key read from IDREF_QUALINKA_API_KEY; an empty string when the variable is unset.
DEFAULT_API_KEY = os.getenv("IDREF_QUALINKA_API_KEY", "")
# Model identifier sent as "embedding_model" in the align payload when embeddings are enabled.
EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# Keys of the extraction payload that hold person names. extract_people scans them in this
# order, so the order also determines the order of the roles recorded for each person.
PERSON_FIELDS = [
"author",
"advisor",
"jury_president",
"reviewers",
"committee_members",
]
# Display label (French, unaccented) for each person field. The table-row and dropdown-choice
# builders look roles up here and fall back to the raw field name when a role has no entry.
FIELD_LABELS = {
"author": "Auteur",
"advisor": "Directeur",
"jury_president": "President du jury",
"reviewers": "Rapporteurs",
"committee_members": "Membres du jury",
}
# Sample extraction payload kept as a JSON string. Person fields carry either a single name or
# several names separated by "|" (the format split_people understands).
# NOTE(review): where this sample is used is not visible in this part of the file.
EXAMPLE_JSON = """{
"title": "La question de l'hygiène aux Indes-Néerlandaises",
"subtitle": "Les enjeux médicaux, culturels et sociaux",
"author": "Gani Achmad JAE LANI",
"degree_type": "Thèse de doctorat",
"discipline": "Histoire et civilisations",
"granting_institution": "École des Hautes Études en Sciences Sociales",
"co_tutelle_institutions": [],
"doctoral_school": "École doctorale de l'EHESS",
"defense_year": 2017,
"advisor": "Gérard JORLAND",
"jury_president": "",
"reviewers": "",
"committee_members": "Romain BERTRAND|Patrice BOURDELAIS|Charles ILLOUZ|Annick OPINEL|Patrick ZYLBERMAN|Gérard JORLAND",
"language": "fre",
"confidence": 0.98
}"""
def split_people(value: Any) -> list[str]:
    """Return the non-empty, whitespace-stripped names found in ``value``.

    ``None`` gives an empty list. A list is taken item by item; anything else is
    converted with ``str`` and split on ``"|"``. Every item is passed through
    ``str`` and stripped, and items that end up empty are dropped.
    """
    if value is None:
        return []
    candidates = value if isinstance(value, list) else str(value).split("|")
    stripped = (str(candidate).strip() for candidate in candidates)
    return [name for name in stripped if name]
def normalize_name_key(name: str) -> str:
    """Return a comparison key for a person name.

    The key is NFC-normalized, case-folded and has every run of whitespace
    collapsed to a single space (leading/trailing whitespace removed).

    NFC normalization makes a precomposed accent ("é") and its decomposed form
    ("e" + combining acute) produce the same key, and ``casefold`` is used rather
    than ``lower`` so that caseless matching also covers characters such as "ß".
    For plain ASCII or already-composed input the result equals the previous
    ``lower()``-based key.
    """
    composed = unicodedata.normalize("NFC", name)
    # Case folding can itself produce decomposed sequences, so normalize once more.
    folded = unicodedata.normalize("NFC", composed.casefold())
    return " ".join(folded.split())
def parse_extraction_json(json_text: str) -> dict[str, Any]:
    """Decode ``json_text`` and return it when it is a JSON object.

    Raises:
        gr.Error: if the text is not valid JSON (the decoder error is chained as
            the cause), or if the decoded value is not a ``dict``.
    """
    try:
        decoded = json.loads(json_text)
    except json.JSONDecodeError as exc:
        raise gr.Error(f"JSON invalide: {exc}") from exc
    if isinstance(decoded, dict):
        return decoded
    raise gr.Error("Le JSON colle doit etre un objet.")
def extract_people(payload: dict[str, Any]) -> list[dict[str, Any]]:
    """Collect the distinct people named in the person fields of ``payload``.

    Each field listed in ``PERSON_FIELDS`` is read in order and split into names.
    Names that share the same normalized key are merged into one entry, which keeps
    the first spelling encountered as ``"name"`` and accumulates the field names,
    without duplicates, under ``"roles"``.

    Returns:
        One ``{"name": ..., "roles": [...]}`` dict per person, in order of first
        appearance.
    """
    merged: dict[str, dict[str, Any]] = {}
    for role in PERSON_FIELDS:
        for display_name in split_people(payload.get(role)):
            entry = merged.setdefault(
                normalize_name_key(display_name),
                {"name": display_name, "roles": []},
            )
            if role not in entry["roles"]:
                entry["roles"].append(role)
    return list(merged.values())
def people_to_rows(people: list[dict[str, Any]]) -> list[list[str]]:
    """Turn people entries into ``[name, roles]`` table rows.

    The roles cell joins the display label of each role with ``" | "``; a role with
    no entry in ``FIELD_LABELS`` is shown under its raw field name.
    """
    return [
        [entry["name"], " | ".join([FIELD_LABELS.get(role, role) for role in entry["roles"]])]
        for entry in people
    ]
def people_to_choices(people: list[dict[str, Any]]) -> list[str]:
    """Build one ``"<name> — <roles>"`` choice string per person.

    Roles are rendered with their ``FIELD_LABELS`` label (raw field name when there
    is no label) and joined with ``" | "``.
    """

    def describe(entry: dict[str, Any]) -> str:
        role_text = " | ".join([FIELD_LABELS.get(role, role) for role in entry["roles"]])
        return f"{entry['name']} — {role_text}"

    return [describe(entry) for entry in people]
def selected_choice_to_name(choice: str) -> str:
    """Return the name part of a ``"<name> — <roles>"`` choice string.

    Everything before the first ``" — "`` separator is kept and stripped; a string
    without the separator is returned stripped as a whole.
    """
    name_part, _separator, _roles = choice.partition(" — ")
    return name_part.strip()
def build_align_payload(
    extraction: dict[str, Any],
    selected_name: str,
    use_embeddings: bool,
    max_candidates: int,
    max_docs_per_role: int,
    reference_top_k: int,
    accept_threshold: float,
    margin_threshold: float,
    weight_name: float,
    weight_attrra_source: float,
    weight_attrra_note: float,
    weight_references: float,
    weight_institution_year: float,
) -> dict[str, Any]:
    """Assemble the request body for aligning ``selected_name``.

    Document fields are read from ``extraction`` and always sent as strings; a
    missing or falsy value becomes ``""``. ``institution`` prefers
    ``granting_institution`` and ``year`` prefers ``defense_year``, each falling
    back to the shorter key. Limits are coerced with ``int`` and thresholds/weights
    with ``float``. ``embedding_model`` is ``EMBEDDING_MODEL`` when
    ``use_embeddings`` is true and ``""`` otherwise.
    """

    def first_text(*keys: str) -> str:
        # String form of the first truthy value among ``keys``, else "".
        for key in keys:
            found = extraction.get(key)
            if found:
                return str(found)
        return ""

    document_fields = {
        "name": selected_name,
        "title": first_text("title"),
        "subtitle": first_text("subtitle"),
        "discipline": first_text("discipline"),
        "institution": first_text("granting_institution", "institution"),
        "doctoral_school": first_text("doctoral_school"),
        "degree_type": first_text("degree_type"),
        "year": first_text("defense_year", "year"),
    }
    search_limits = {
        "max_candidates": int(max_candidates),
        "max_docs_per_role": int(max_docs_per_role),
        "reference_top_k": int(reference_top_k),
    }
    model_choice = {"embedding_model": EMBEDDING_MODEL if use_embeddings else ""}
    scoring = {
        "accept_threshold": float(accept_threshold),
        "margin_threshold": float(margin_threshold),
        "weight_name": float(weight_name),
        "weight_attrra_source": float(weight_attrra_source),
        "weight_attrra_note": float(weight_attrra_note),
        "weight_references": float(weight_references),
        "weight_institution_year": float(weight_institution_year),
    }
    # Merged in this order so the key order matches the original payload layout.
    return {**document_fields, **search_limits, **model_choice, **scoring}
def status_badge(status: str) -> str:
    """Render ``status`` as a small colored HTML badge.

    The background/foreground pair is chosen per known status ("accepted",
    "ambiguous", "low_confidence", "not_found"); any other value gets the neutral
    grey pair. The status text is HTML-escaped before being embedded.

    Returns:
        A ``<span>`` element with inline styles carrying the escaped status text.
    """
    palette = {
        "accepted": ("#dcfce7", "#166534"),
        "ambiguous": ("#fef9c3", "#854d0e"),
        "low_confidence": ("#fee2e2", "#991b1b"),
        "not_found": ("#e5e7eb", "#374151"),
    }
    bg, fg = palette.get(status, ("#e5e7eb", "#374151"))
    # The looked-up colors were previously discarded; apply them to the badge markup.
    return (
        '<span style="display:inline-block;padding:2px 8px;border-radius:999px;'
        f'background:{bg};color:{fg};font-weight:600;">'
        f"{html.escape(status)}"
        "</span>"
    )
def score_table(candidate: dict[str, Any] | None) -> str:
if not candidate:
return ""
score = candidate.get("score") or {}
rows = []
for key in ["final", "name", "attrra_source", "attrra_note", "references", "institution_year"]:
value = score.get(key)
value_text = "" if value is None else f"{float(value):.4f}"
rows.append(
"
"
f"
{html.escape(key)}
"
f"
{value_text}
"
"
"
)
return (
'
'
"
Composante
Score
"
""
+ "".join(rows)
+ "
"
)
# Build the evidence section for one candidate.
#
# Returns "" when `candidate` is empty or None. Otherwise reads candidate["evidence"] and,
# from it, the "preferred_forms" and "best_references" entries (each defaulting to an empty
# list). The nested helper value_block(label, value) uses the text 'Aucun indice' when the
# value is None or an empty list, and has a separate branch for list values.
#
# NOTE(review): the body is truncated below - the string literal assigned to `content` in the
# list branch is unterminated and the rest of the function, including its return value, is
# missing. HTML markup appears to have been stripped from this file; the block is not valid
# Python as it stands and should be restored from the original source rather than guessed.
def evidence_block(candidate: dict[str, Any] | None) -> str:
if not candidate:
return ""
evidence = candidate.get("evidence") or {}
forms = evidence.get("preferred_forms") or []
best_refs = evidence.get("best_references") or []
def value_block(label: str, value: Any) -> str:
if value is None or value == []:
content = 'Aucun indice'
elif isinstance(value, list):
content = "
"
)
# Build the table listing every candidate.
#
# With no candidates the function returns a literal containing the text "Aucun candidat.".
# Otherwise it iterates over the candidates, reading each candidate's "score" dict and the
# "preferred_forms" list found under its "evidence" dict, and appends one entry per candidate
# to `rows`.
#
# NOTE(review): the string literals in this block are unterminated and the function stops
# in the middle of the rows.append(...) call - the row markup and the final return are
# missing. HTML markup appears to have been stripped from this file; restore the block from
# the original source rather than reconstructing it by hand.
def candidates_table(candidates: list[dict[str, Any]]) -> str:
if not candidates:
return '
Aucun candidat.
'
rows = []
for candidate in candidates:
score = candidate.get("score") or {}
forms = (candidate.get("evidence") or {}).get("preferred_forms") or []
rows.append(
"
'
# NOTE(review): this is the body of a plotting function whose `def` line is not present
# above this point. It reads `result` and `candidates` and calls score_value() and
# weighted_context_score(), none of which are defined in the visible part of the file.
# Several string literals below are empty or unterminated (markup appears to have been
# stripped), so the fragment is not valid Python as it stands.
#
# Score weights reported in the result; passed to weighted_context_score() below.
weights = result.get("score_weights") or {}
# Canvas size and margins. plot_w / plot_h are what remains once the margins are removed
# (760 - 70 - 34 = 656 wide, 420 - 28 - 64 = 328 high).
width = 760
height = 420
left = 70
right = 34
top = 28
bottom = 64
plot_w = width - left - right
plot_h = height - top - bottom
# Palette cycled through by candidate index (see `colors[idx % len(colors)]`).
colors = ["#2563eb", "#dc2626", "#16a34a", "#9333ea", "#ea580c", "#0891b2", "#4f46e5", "#be123c"]
# Linear map from a value to a horizontal coordinate: 0.0 -> left, 1.0 -> left + plot_w.
def x_pos(value: float) -> float:
return left + value * plot_w
# Linear map from a value to a vertical coordinate, inverted: 1.0 -> top, 0.0 -> top + plot_h.
def y_pos(value: float) -> float:
return top + (1.0 - value) * plot_h
# Four entries are appended per tick (0, 0.25, 0.5, 0.75, 1): two empty strings and two tick
# labels formatted with two decimals.
# NOTE(review): x and y are computed but not used by the visible f-strings.
grid = []
for tick in [0.0, 0.25, 0.5, 0.75, 1.0]:
x = x_pos(tick)
y = y_pos(tick)
grid.append(f'')
grid.append(f'')
grid.append(f'{tick:.2f}')
grid.append(f'{tick:.2f}')
# Two initial entries positioned from the (1.0, 1.0) corner of the plot.
# NOTE(review): the visible strings contain no "{}" placeholders, so the coordinates passed
# to .format() are ignored as written.
points = [
'Profil de reference: correspondance parfaite avec le document courant'.format(
x_pos(1.0),
y_pos(1.0),
),
'document courant'.format(
x_pos(1.0) - 112,
y_pos(1.0) - 10,
),
]
legend_rows = []
# One marker, one label and one legend row per candidate.
for idx, candidate in enumerate(candidates):
score = candidate.get("score") or {}
ppn = str(candidate.get("ppn") or "")
# Fall back to a 1-based "candidat N" label when the candidate has no PPN.
label = ppn or f"candidat {idx + 1}"
x = score_value(candidate, "name")
y = weighted_context_score(candidate, weights)
final = float(score.get("final") or 0.0)
color = colors[idx % len(colors)]
# Radius grows with the final score; the added part is capped at 7.0, so a final score in
# [0, 1] gives a radius between 5.0 and 12.0.
radius = 5.0 + min(7.0, final * 7.0)
# NOTE(review): the visible template has four placeholders but receives eight arguments, so
# the values would not line up with the "PPN / name / contexte / final" text as written.
points.append(
''
"PPN {} | name {:.3f} | contexte {:.3f} | final {:.3f}".format(
x_pos(x),
y_pos(y),
radius,
color,
html.escape(label),
x,
y,
final,
)
)
points.append(
'{}'.format(
x_pos(x) + 8,
y_pos(y) - 8,
html.escape(label),
)
)
legend_rows.append(
"
"
f"
{html.escape(label)}
"
f"
{x:.3f}
"
f"
{y:.3f}
"
f"
{final:.3f}
"
"
"
)
# NOTE(review): as written `svg` is only a newline; `grid` and `points` are never interpolated
# in the visible code.
svg = f"""
"""
return (
'
'
f"{svg}"
'
Chaque point represente le profil de similarite d un PPN candidat par rapport au document courant. '
"L axe X utilise le score du nom; l axe Y agrege les indices documentaires avec les coefficients actifs.
"
'
PPN
Nom
Contexte
Final
'
f"{''.join(legend_rows)}
"
"
"
)
def render_result(result: dict[str, Any]) -> str:
status = result.get("status") or ""
best_ppn = result.get("best_ppn")
best = result.get("best_candidate")
similarity = result.get("similarity") or {}
query = result.get("query") or {}
best_html = ""
if best:
best_html = f"""
PPN accepte : {html.escape(str(best_ppn)) if best_ppn else 'aucun'}
Mode de similarite : {html.escape(str(similarity.get('type') or ''))}
{f" / {html.escape(str(similarity.get('model')))}" if similarity.get('model') else ""}
Nom aligne : {html.escape(str(query.get('name') or ''))}
{best_html}
Tous les candidats
{candidates_table(result.get('candidates') or [])}