# doc2gl / demo_user.py
# Doc2GL Deploy
# Gemini: add model fallback list and GEMINI_MODEL override to avoid 404 NOT_FOUND
# 7a2d1b0
"""
╔══════════════════════════════════════════════════════════════════════════════╗
║ DOC2GL v2.0 ║
║ ║
║ Description : Système de conversion de documents (PDF/Images) en graphes ║
║ de connaissances avec évaluation de la qualité par ║
║ comparaison fuzzy et sémantique ║
║ ║
║ Auteur : YOUMBI CHATUE DANIELE ║
║ Date : [10/09/2025] ║
║ Version : 2.0 ║
╚══════════════════════════════════════════════════════════════════════════════╝
"""
# ============================================
# IMPORTS DES BIBLIOTHÈQUES
# ============================================
# Interface utilisateur
import gradio as gr
gr_update = gr.update
# Manipulation de fichiers et système
import glob
import os
import io
import shutil
import tempfile
import time
import json
import base64
import logging
# Traitement de documents
import fitz # PyMuPDF pour la conversion PDF → Image
from PIL import Image # Manipulation d'images
# Base de données de graphes
from neo4j import GraphDatabase # Stockage des graphes de connaissances
# Modèles d'IA pour génération de graphes
import google.genai as genai
from google.genai import types
from gemma import generate_mermaid_from_gemma
from llama import generate_mermaid_from_llama
from Qwen import generate_mermaid_from_qwen
from nvidia_nemotron import generate_mermaid_from_nvidia
from intern_vl import generate_mermaid_from_intern
# Génération de rapports
from fpdf import FPDF
import matplotlib.pyplot as plt
# Comparaison et matching
from rapidfuzz import process, fuzz # Fuzzy matching (similarité textuelle)
from sentence_transformers import SentenceTransformer, util # Matching sémantique
import torch
import numpy as np
from dotenv import load_dotenv
# Agrégation de mindmaps
from aggregation import aggregate_mindmaps
# En haut de app.py, après les imports
# Public API of this module (names exported via `from module import *`).
__all__ = [
    'convert_pdf_to_image',
    'encode_image',
    'generate_mermaid_from_image_gemini',
    'generate_mermaid_from_llama',
    'generate_mermaid_from_gemma',
    'generate_mermaid_from_qwen',
    'generate_mermaid_from_nvidia',
    'generate_mermaid_from_intern',
    'mermaid_to_json',
    'load_json',
    'compare_graphs_with_semantic_fast'
]
# Load environment variables (API keys, Neo4j credentials) from a .env file.
load_dotenv()
# Anchor all relative paths to the script's own directory.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# NOTE(review): chdir at import time changes the CWD for the whole process —
# confirm downstream code depends on this.
os.chdir(_SCRIPT_DIR)
# ============================================
# CONFIGURATION GLOBALE
# ============================================
# Configure process-wide logging so every operation is traceable.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# ─────────────────────────────────────────────────────────────────────────────
# Configuration Neo4j (Base de données de graphes)
# ─────────────────────────────────────────────────────────────────────────────
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")  # Neo4j server address
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")  # Username
# NOTE(review): weak fallback password shipped in source — prefer requiring the env var.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "123456789")  # Password
TEMP_FOLDER = "temp_images"  # Temporary folder for converted page images
OUTPUT_JSON_DIR = os.path.join(_SCRIPT_DIR, "json_files")
GT_ORIGINAL_PATH = os.path.join(_SCRIPT_DIR, "GT", "OOP_MindMAP_GroundTruth_00.json")
# ─────────────────────────────────────────────────────────────────────────────
# Global performance-tracking state
# ─────────────────────────────────────────────────────────────────────────────
model_precision_records = []  # Metrics collected for each model tested
# ─────────────────────────────────────────────────────────────────────────────
# Semantic-matching model configuration (lazy loading)
# ─────────────────────────────────────────────────────────────────────────────
# Multilingual model optimized for speed (12 layers instead of 24).
_semantic_model = None  # Populated on first call to get_semantic_model()
def get_semantic_model():
    """Return the shared sentence-embedding model, loading it on first use."""
    global _semantic_model
    if _semantic_model is not None:
        return _semantic_model
    # First call: load the multilingual MiniLM encoder and freeze it.
    model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
    model.eval()
    _semantic_model = model
    return _semantic_model
# ============================================
# SECTION 1 : FONCTIONS DE BASE (UTILITAIRES)
# ============================================
def convert_pdf_to_image(pdf_path, output_folder=TEMP_FOLDER):
    """Render the first page of a PDF to a PNG file and return its path."""
    try:
        os.makedirs(output_folder, exist_ok=True)
        base_name = os.path.splitext(os.path.basename(pdf_path))[0]
        image_path = os.path.join(output_folder, f"{base_name}_page1.png")
        document = fitz.open(pdf_path)
        # Only page 1 is rendered; mindmap documents are single-page here.
        document[0].get_pixmap().save(image_path)
        document.close()
        logging.info(f"PDF {pdf_path} converti en image {image_path} avec succès.")
        return image_path
    except Exception as e:
        logging.error(f"Erreur lors de la conversion du PDF : {e}")
        raise
def encode_image(img_path):
    """Read an image file and return its contents as a Base64 string."""
    try:
        with open(img_path, "rb") as image_file:
            raw_bytes = image_file.read()
        return base64.b64encode(raw_bytes).decode('utf-8')
    except Exception as e:
        logging.error(f"Erreur lors de l'encodage de l'image : {e}")
        raise
def escape_string(value):
    """Escape characters that are problematic in Cypher string values."""
    # NOTE(review): callers also pass these values as query parameters, where
    # escaping is normally unnecessary — confirm the double-escaping is intended.
    replacements = (("'", "''"), ("#", "\\#"), ('"', '\\"'))
    for old, new in replacements:
        value = value.replace(old, new)
    return value
def load_json(file_path):
    """Load a graph JSON file and return its (nodes, edges) pair."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        graph = json.load(handle)
    return graph["nodes"], graph["edges"]
# ============================================
# SECTION 2 : GÉNÉRATION DE GRAPHES PAR LES MODÈLES D'IA
# ============================================
def generate_mermaid_from_image_gemini(base64_image):
    """Generate a Mermaid knowledge-graph diagram from an image with Gemini.

    Tries each candidate model name in order (a GEMINI_MODEL environment
    override first, then known public models) so that a model-specific
    404 NOT_FOUND falls through to the next candidate instead of failing
    the whole call.

    Args:
        base64_image: Base64-encoded PNG image content.

    Returns:
        str: Mermaid code; a "graph TD;" header is prepended if missing.

    Raises:
        EnvironmentError: if GEMINI_API_KEY is not set.
        Exception: the last provider error when every candidate model fails.
    """
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        raise EnvironmentError("La clé API GEMINI_API_KEY n'est pas définie.")
    # Optional model override from the environment, tried before the defaults.
    model_from_env = os.environ.get("GEMINI_MODEL", "").strip()
    model_candidates = [
        model_from_env,
        "gemini-2.0-flash",
        "gemini-2.0-flash-lite",
        "gemini-1.5-flash-latest",
        "gemini-1.5-pro-latest",
    ]
    # Drop the empty string left behind when GEMINI_MODEL is unset.
    model_candidates = [m for m in model_candidates if m]
    try:
        client = genai.Client(api_key=api_key)
        prompt = """
Analyse cette image et extrais les entités et leurs relations pour créer un graphe de connaissances.
Génère UNIQUEMENT un diagramme Mermaid valide avec:
- Les nœuds principaux (personnes, organisations, concepts, etc.)
- Les relations entre eux
- Format: graph TD; A[Entité1] --> B[Entité2];
Important:
- Ne génère QUE le code Mermaid, sans explications
- Utilise des noms courts et clairs pour les entités
- Inclus les relations les plus importantes seulement
"""
        last_error = None
        response = None
        # First model that answers wins; each failure is remembered so the
        # last error can be re-raised if every candidate fails.
        for model_name in model_candidates:
            try:
                response = client.models.generate_content(
                    model=model_name,
                    contents=[
                        types.Content(
                            role="user",
                            parts=[
                                types.Part.from_text(text=prompt),
                                types.Part.from_bytes(
                                    data=base64.b64decode(base64_image),
                                    mime_type="image/png",
                                ),
                            ],
                        )
                    ],
                )
                break
            except Exception as e:
                last_error = e
                continue
        if response is None:
            raise last_error if last_error is not None else RuntimeError("Échec Gemini: aucune réponse")
        mermaid_code = (getattr(response, "text", None) or "").strip()
        if not mermaid_code:
            # Fallback: dig the text out of the raw candidate parts.
            try:
                mermaid_code = (
                    response.candidates[0].content.parts[0].text
                ).strip()
            except Exception:
                mermaid_code = ""
        # Ensure the output is valid Mermaid: force a "graph TD;" header.
        if not mermaid_code.startswith("graph"):
            mermaid_code = f"graph TD;\n{mermaid_code}"
        return mermaid_code
    except Exception as e:
        logging.error(f"Erreur lors de la génération Mermaid avec Gemini: {e}")
        raise
def mermaid_to_json(mermaid_code):
    """Convert Mermaid edge lines into a graph JSON structure.

    Parses every line containing "-->" and extracts endpoints of the form
    ``ID[Label]`` (a bare ``ID`` serves as its own label). Fixes over the
    previous version:

    * Trailing Mermaid statement terminators (";") are stripped, so the
      prompt's own example format ``A[X] --> B[Y];`` no longer produces
      ids/labels polluted with a trailing semicolon.
    * Chained edges (``A --> B --> C``) yield one edge per arrow instead of
      raising ValueError on unpacking.

    Args:
        mermaid_code: Mermaid "graph TD" source text.

    Returns:
        dict: {"nodes": [{"id", "label"}, ...],
               "edges": [{"source", "target"}, ...]}
        The first label seen for a node id wins.

    Raises:
        Exception: re-raised after logging if parsing fails unexpectedly.
    """
    def _parse_endpoint(token):
        # "B[Some Label];" -> ("B", "Some Label"); bare "B;" -> ("B", "B").
        token = token.strip().rstrip(";").strip()
        parts = token.split("[")
        node_id = parts[0].strip()
        label = parts[1].replace("]", "").strip() if len(parts) > 1 else node_id
        return node_id, label

    try:
        nodes = {}
        edges = []
        for line in mermaid_code.split("\n"):
            if "-->" not in line:
                continue
            endpoints = [_parse_endpoint(seg) for seg in line.strip().split("-->")]
            # Chained edges: each consecutive pair of endpoints is one edge.
            for (src_id, src_label), (tgt_id, tgt_label) in zip(endpoints, endpoints[1:]):
                if not src_id or not tgt_id:
                    continue  # skip degenerate fragments like "--> B"
                nodes.setdefault(src_id, src_label)
                nodes.setdefault(tgt_id, tgt_label)
                edges.append({"source": src_id, "target": tgt_id})
        return {
            "nodes": [
                {"id": node_id, "label": label}
                for node_id, label in nodes.items()
            ],
            "edges": edges
        }
    except Exception as e:
        logging.error(f"Erreur lors de la conversion Mermaid → JSON : {e}")
        raise
def graph_json_to_mermaid(graph_json):
    """Render a {nodes:[{id,label}], edges:[{source,target}]} dict as Mermaid."""
    if not graph_json:
        return "graph TD;"

    def _mermaid_id(raw):
        # Mermaid node ids must avoid spaces/special chars: keep [A-Za-z0-9_].
        cleaned = "".join(c if c.isalnum() or c == "_" else "_" for c in str(raw))
        return "N_" + cleaned

    out = ["graph TD;"]
    known_ids = {}
    for node in graph_json.get("nodes") or []:
        original_id = node.get("id")
        mermaid_id = _mermaid_id(original_id)
        known_ids[str(original_id)] = mermaid_id
        # Double quotes would break the bracketed label syntax.
        text = str(node.get("label") or original_id).replace("\"", "'")
        out.append(f" {mermaid_id}[\"{text}\"]; ")
    for edge in graph_json.get("edges") or []:
        source = known_ids.get(str(edge.get("source")), _mermaid_id(edge.get("source")))
        target = known_ids.get(str(edge.get("target")), _mermaid_id(edge.get("target")))
        out.append(f" {source} --> {target};")
    return "\n".join(out)
def mermaid_to_html(mermaid_code, elem_id):
    """Wrap Mermaid code into HTML that renders inside Gradio.

    Args:
        mermaid_code: Mermaid source to render (falls back to "graph TD;").
        elem_id: Unique DOM id for the diagram container so several diagrams
            can coexist on one page.

    Returns:
        str: HTML fragment that lazy-loads mermaid.js from the jsDelivr CDN
        (only once, guarded by the 'mermaidjs_loader' script id) and then
        renders this diagram.
    """
    # Break any literal "</script>" in the diagram so it cannot close our tag.
    safe = (mermaid_code or "graph TD;").replace("</script>", "</scr" + "ipt>")
    return f"""
<div style=\"width:100%;min-height:520px;border:1px solid #e0e0e0;border-radius:8px;background:#fff;overflow:auto;\">
<div id=\"{elem_id}\" class=\"mermaid\" style=\"padding:16px;\">{safe}</div>
</div>
<script>
(function() {{
function render() {{
if (!window.mermaid) return;
try {{
window.mermaid.initialize({{ startOnLoad: false, securityLevel: 'loose' }});
window.mermaid.run({{ nodes: [document.getElementById('{elem_id}')] }});
}} catch (e) {{ console.error(e); }}
}}
if (!document.getElementById('mermaidjs_loader')) {{
var s = document.createElement('script');
s.id = 'mermaidjs_loader';
s.src = 'https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.min.js';
s.onload = render;
document.head.appendChild(s);
}} else {{
render();
}}
}})();
</script>
"""
# ============================================
# SECTION 3 : GESTION DE LA BASE NEO4J
# ============================================
def clear_neo4j():
    """Delete every node and relationship in the Neo4j database.

    Returns:
        bool: True on success, False when Neo4j is unreachable or
        misconfigured (the error is logged, not raised).
    """
    try:
        driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
        with driver.session() as session:
            session.run("MATCH (n) DETACH DELETE n")
        driver.close()
        logging.info("Base de données Neo4j vidée avec succès.")
        return True
    except Exception as e:
        logging.error(f"Neo4j indisponible ou mal configuré (clear_neo4j): {e}")
        return False
def upload_gt_and_test_to_neo4j(gt_json, test_json, doc_name):
    """Upload the GT and Test graphs into Neo4j, deduplicating nodes and edges.

    GT entities get ids prefixed "GT_" and doc tag "<doc_name>_GT"; test
    entities get ids prefixed "TEST_" and doc tag `doc_name`, so both graphs
    can coexist in the same database without colliding.

    Args:
        gt_json: Ground-truth graph {"nodes": [...], "edges": [...]}.
        test_json: Model-generated graph in the same format.
        doc_name: Document identifier used as the Neo4j `doc` tag.

    Returns:
        bool: True on success, False when Neo4j is unreachable or the upload
        fails (errors are logged, not raised).
    """
    try:
        driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
    except Exception as e:
        logging.error(f"Neo4j indisponible ou mal configuré (driver): {e}")
        return False
    try:
        with driver.session() as session:
            gt_tag = f"{doc_name}_GT"
            # ---- Upload GT ----
            # Deduplicate GT nodes by id (first occurrence wins).
            unique_gt_nodes = {}
            for node in gt_json["nodes"]:
                node_id = node["id"]
                if node_id not in unique_gt_nodes:
                    unique_gt_nodes[node_id] = node
            for node_id, node in unique_gt_nodes.items():
                prefixed_id = f"GT_{node_id}"
                session.run(
                    """
                    MERGE (e:Entity {id:$id, doc:$doc})
                    ON CREATE SET e.name = $name
                    ON MATCH SET e.name = $name
                    """,
                    {
                        "id": prefixed_id,
                        "name": escape_string(node["label"]),
                        "doc": gt_tag,
                    },
                )
            # Deduplicate GT edges by (source, target) pair.
            unique_gt_edges = {}
            for edge in gt_json["edges"]:
                edge_key = (edge["source"], edge["target"])
                if edge_key not in unique_gt_edges:
                    unique_gt_edges[edge_key] = edge
            for (src, tgt), edge in unique_gt_edges.items():
                src_id = f"GT_{src}"
                tgt_id = f"GT_{tgt}"
                session.run(
                    """
                    MATCH (a:Entity {id:$src, doc:$doc})
                    MATCH (b:Entity {id:$tgt, doc:$doc})
                    MERGE (a)-[r:RELATED_TO {doc:$doc}]->(b)
                    """,
                    {
                        "src": src_id,
                        "tgt": tgt_id,
                        "doc": gt_tag,
                    },
                )
            # ---- Upload Test ----
            # Same dedup scheme for the model-generated graph.
            unique_test_nodes = {}
            for node in test_json["nodes"]:
                node_id = node["id"]
                if node_id not in unique_test_nodes:
                    unique_test_nodes[node_id] = node
            for node_id, node in unique_test_nodes.items():
                prefixed_id = f"TEST_{node_id}"
                session.run(
                    """
                    MERGE (e:Entity {id:$id, doc:$doc})
                    ON CREATE SET e.name = $name
                    ON MATCH SET e.name = $name
                    """,
                    {
                        "id": prefixed_id,
                        "name": escape_string(node["label"]),
                        "doc": doc_name,
                    },
                )
            unique_test_edges = {}
            for edge in test_json["edges"]:
                edge_key = (edge["source"], edge["target"])
                if edge_key not in unique_test_edges:
                    unique_test_edges[edge_key] = edge
            for (src, tgt), edge in unique_test_edges.items():
                src_id = f"TEST_{src}"
                tgt_id = f"TEST_{tgt}"
                session.run(
                    """
                    MATCH (a:Entity {id:$src, doc:$doc})
                    MATCH (b:Entity {id:$tgt, doc:$doc})
                    MERGE (a)-[r:RELATED_TO {doc:$doc}]->(b)
                    """,
                    {
                        "src": src_id,
                        "tgt": tgt_id,
                        "doc": doc_name,
                    },
                )
    except Exception as e:
        logging.error(f"Neo4j indisponible ou mal configuré (upload): {e}")
        return False
    finally:
        # Always release the driver, even on failure.
        try:
            driver.close()
        except Exception:
            pass
    logging.info(
        f"GT uploadé : {len(unique_gt_nodes)} nœuds, {len(unique_gt_edges)} arêtes"
    )
    logging.info(
        f"Test uploadé : {len(unique_test_nodes)} nœuds, {len(unique_test_edges)} arêtes"
    )
    return True
# ============================================
# SECTION 4 : MATCHING HYBRIDE (FUZZY + SÉMANTIQUE)
# ============================================
def compute_semantic_similarity_batch(test_labels, gt_labels):
    """Compute the (test x gt) semantic-similarity matrix, scaled to 0-100."""
    model = get_semantic_model()

    def _embed(labels):
        # Batched, silent encoding to tensors for cosine similarity.
        return model.encode(
            labels,
            convert_to_tensor=True,
            batch_size=32,
            show_progress_bar=False,
        )

    # Inference only: no gradients needed.
    with torch.no_grad():
        similarity = util.cos_sim(_embed(test_labels), _embed(gt_labels))
    return (similarity * 100).cpu().numpy()
def hybrid_match_optimized(test_labels, gt_labels, fuzzy_threshold=80,
                           semantic_threshold=70, alpha=0.6):
    """Match each test label to its best GT label via fuzzy + semantic scores.

    For every test label, only the top-5 semantically closest GT candidates
    are rescored with fuzzy string similarity. The combined score is
    ``alpha * fuzzy + (1 - alpha) * semantic``; a candidate is accepted when
    EITHER the fuzzy or the semantic score passes its threshold, and the
    highest combined score wins.

    Args:
        test_labels: Labels produced by the model.
        gt_labels: Ground-truth labels.
        fuzzy_threshold: Minimum fuzzy (rapidfuzz ratio) score, 0-100.
        semantic_threshold: Minimum semantic (cosine) score, 0-100.
        alpha: Weight of the fuzzy score in the combined score.

    Returns:
        list[tuple]: One (best_match_or_None, combined, fuzzy, semantic)
        tuple per test label, in input order.
    """
    if not gt_labels or not test_labels:
        return [(None, 0, 0, 0) for _ in test_labels]
    results = []
    semantic_matrix = compute_semantic_similarity_batch(test_labels, gt_labels)
    for i, test_lbl in enumerate(test_labels):
        best_match = None
        best_combined_score = 0
        best_fuzzy = 0
        best_semantic = 0
        semantic_scores = semantic_matrix[i]
        # Only rescore the k (≤ 5) semantically closest candidates for speed.
        top_k = min(5, len(gt_labels))
        top_indices = semantic_scores.argsort()[-top_k:][::-1]
        for idx in top_indices:
            candidate = gt_labels[idx]
            semantic_score = float(semantic_scores[idx])
            # Loose pre-filter: skip candidates far below the semantic threshold.
            if semantic_score >= semantic_threshold - 20:
                fuzzy_match, fuzzy_score, _ = process.extractOne(
                    test_lbl,
                    [candidate],
                    scorer=fuzz.ratio
                )
                combined_score = (alpha * fuzzy_score) + ((1 - alpha) * semantic_score)
                if (fuzzy_score >= fuzzy_threshold or
                    semantic_score >= semantic_threshold):
                    if combined_score > best_combined_score:
                        best_combined_score = combined_score
                        best_match = candidate
                        best_fuzzy = fuzzy_score
                        best_semantic = semantic_score
        results.append((best_match, best_combined_score, best_fuzzy, best_semantic))
    return results
def compute_edge_metrics_undirected(gt_edges, test_edges, gt_id2lbl, test_id2lbl, label_mapping):
    """Edge precision/recall/F1 treating every edge as UNDIRECTED.

    Edge endpoints are compared by label; test labels are first mapped onto
    GT labels via `label_mapping`. Each edge is normalized to a sorted label
    pair, so direction (and inversions) never count as errors.

    Returns:
        dict with precision/recall/F1 percentages plus the raw TP/FP/FN
        undirected edge lists.
    """
    def _pair(a, b):
        # Canonical undirected representation: sorted label tuple.
        return tuple(sorted([a, b]))

    gt_pairs = {
        _pair(gt_id2lbl[e["source"]], gt_id2lbl[e["target"]]) for e in gt_edges
    }

    test_pairs = set()
    for edge in test_edges:
        src_label = test_id2lbl[edge["source"]]
        tgt_label = test_id2lbl[edge["target"]]
        # Translate test labels into GT vocabulary; unmapped labels stay as-is.
        mapped_src = label_mapping.get(src_label, src_label)
        mapped_tgt = label_mapping.get(tgt_label, tgt_label)
        if mapped_src and mapped_tgt:
            test_pairs.add(_pair(mapped_src, mapped_tgt))

    true_pos = gt_pairs & test_pairs
    false_pos = test_pairs - gt_pairs
    false_neg = gt_pairs - test_pairs

    prec = len(true_pos) / len(test_pairs) * 100 if test_pairs else 100
    rec = len(true_pos) / len(gt_pairs) * 100 if gt_pairs else 100
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0
    return {
        "precision_edges": round(prec, 2),
        "recall_edges": round(rec, 2),
        "f1_edges": round(f1, 2),
        "tp_edges": list(true_pos),
        "fp_edges": list(false_pos),
        "fn_edges": list(false_neg),
    }
def compute_graph_depth(nodes, edges):
    """BFS the graph from its root nodes and return depth/level statistics.

    Roots are nodes with no incoming edge; if none exist (cyclic graph),
    every node is used as a BFS start. Unreached nodes land at level 0.
    """
    from collections import defaultdict, deque
    node_ids = {node["id"] for node in nodes}
    successors = defaultdict(list)
    has_parent = set()
    for edge in edges:
        src, tgt = edge["source"], edge["target"]
        # Ignore edges that reference unknown node ids.
        if src in node_ids and tgt in node_ids:
            successors[src].append(tgt)
            has_parent.add(tgt)
    roots = [nid for nid in node_ids if nid not in has_parent]
    if not roots:
        # No parentless node: cyclic graph, start everywhere.
        roots = list(node_ids)
    node_levels = {}
    nodes_per_level = defaultdict(list)
    frontier = deque((root, 0) for root in roots)
    seen = set()
    while frontier:
        current, depth = frontier.popleft()
        if current in seen:
            continue
        seen.add(current)
        node_levels[current] = depth
        nodes_per_level[depth].append(current)
        for nxt in successors[current]:
            if nxt not in seen:
                frontier.append((nxt, depth + 1))
    # Disconnected/unreached nodes are assigned to level 0.
    for orphan in node_ids:
        if orphan not in node_levels:
            node_levels[orphan] = 0
            nodes_per_level[0].append(orphan)
    max_depth = max(node_levels.values()) if node_levels else 0
    return {
        "max_depth": max_depth,
        "node_levels": node_levels,
        "nodes_per_level": dict(nodes_per_level)
    }
def compute_hierarchical_metrics(gt_nodes, gt_edges, test_nodes, test_edges, label_mapping):
    """Compare the hierarchical structure (depths and levels) of GT vs test.

    Uses `compute_graph_depth` on both graphs, then measures how many matched
    test nodes sit at the same depth as their GT counterpart, and how similar
    the per-level node counts are.
    """
    gt_info = compute_graph_depth(gt_nodes, gt_edges)
    test_info = compute_graph_depth(test_nodes, test_edges)
    gt_depth = gt_info["max_depth"]
    test_depth = test_info["max_depth"]
    depth_diff = abs(gt_depth - test_depth)

    gt_labels_by_id = {n["id"]: n["label"] for n in gt_nodes}
    test_labels_by_id = {n["id"]: n["label"] for n in test_nodes}

    # GT depth indexed by label (only non-empty labels).
    gt_level_of_label = {}
    for nid, lvl in gt_info["node_levels"].items():
        lbl = gt_labels_by_id.get(nid)
        if lbl:
            gt_level_of_label[lbl] = lvl

    # Count matched test nodes placed at the same depth as their GT twin.
    correct_levels = 0
    total_matched = 0
    for nid, lvl in test_info["node_levels"].items():
        test_lbl = test_labels_by_id.get(nid)
        if not test_lbl:
            continue
        gt_lbl = label_mapping.get(test_lbl)
        if not gt_lbl:
            continue
        total_matched += 1
        if gt_level_of_label.get(gt_lbl) == lvl:
            correct_levels += 1
    level_accuracy = (correct_levels / total_matched * 100) if total_matched > 0 else 0

    # Per-level node-count similarity, averaged over the deeper graph's levels.
    deepest = max(gt_depth, test_depth)
    per_level = []
    for lvl in range(deepest + 1):
        n_gt = len(gt_info["nodes_per_level"].get(lvl, []))
        n_test = len(test_info["nodes_per_level"].get(lvl, []))
        per_level.append(1 - abs(n_gt - n_test) / max(n_gt, n_test, 1))
    structure_similarity = (sum(per_level) / len(per_level) * 100) if per_level else 0

    return {
        "gt_depth": gt_depth,
        "test_depth": test_depth,
        "depth_difference": depth_diff,
        "depth_match": depth_diff == 0,
        "level_accuracy": round(level_accuracy, 2),
        "structure_similarity": round(structure_similarity, 2),
        "gt_nodes_per_level": gt_info["nodes_per_level"],
        "test_nodes_per_level": test_info["nodes_per_level"],
        "correct_level_count": correct_levels,
        "total_matched_nodes": total_matched
    }
def compute_ged_coherent(gt_nodes, gt_edges, test_nodes, test_edges,
                         hallucination_metrics, edge_metrics, label_mapping=None):
    """Graph Edit Distance (test → GT) derived from the node and edge metrics.

    Instead of solving GED directly, this reuses the already-computed
    hallucination metrics (node errors) and undirected edge metrics (edge
    errors) so the GED stays coherent with those numbers:
    hallucinated test nodes = deletions, missed GT nodes = insertions,
    FP edges = deletions, FN edges = insertions. Substitutions are always 0.

    Args:
        hallucination_metrics: Output of `calculate_hallucination_metrics`.
        edge_metrics: Output of `compute_edge_metrics_undirected`.
        label_mapping: Unused here; kept for interface compatibility.

    Returns:
        dict: GED, normalized GED (%), structural similarity (%), operation
        counts, and truncated (first 10) operation lists. On failure returns
        a dict with None metrics and an "error" key instead of raising.
    """
    try:
        # ─────────────────────────────────────────────────────────────────
        # STEP 1: Node operations (from hallucination_metrics)
        # ─────────────────────────────────────────────────────────────────
        hallucinated_list = hallucination_metrics["hallucinated_nodes"]
        missed_gt_list = hallucination_metrics["missed_gt_nodes"]
        num_node_deletions = len(hallucinated_list)  # remove from test
        num_node_insertions = len(missed_gt_list)  # add to test
        num_node_substitutions = 0
        # ─────────────────────────────────────────────────────────────────
        # STEP 2: Edge operations (from edge_metrics)
        # ─────────────────────────────────────────────────────────────────
        fp_edges = edge_metrics["fp_edges"]  # extra edges
        fn_edges = edge_metrics["fn_edges"]  # missing edges
        num_edge_deletions = len(fp_edges)  # remove from test
        num_edge_insertions = len(fn_edges)  # add to test
        num_edge_substitutions = 0
        # ─────────────────────────────────────────────────────────────────
        # STEP 3: Total GED
        # ─────────────────────────────────────────────────────────────────
        ged = (num_node_deletions + num_node_insertions + num_node_substitutions +
               num_edge_deletions + num_edge_insertions + num_edge_substitutions)
        # Normalize by the combined size of both graphs.
        max_ops = (len(gt_nodes) + len(gt_edges) + len(test_nodes) + len(test_edges))
        normalized_ged = (ged / max_ops * 100) if max_ops > 0 else 0
        similarity = 100 - normalized_ged
        # ─────────────────────────────────────────────────────────────────
        # Verification logs
        # ─────────────────────────────────────────────────────────────────
        logging.info("=" * 60)
        logging.info("GRAPH EDIT DISTANCE (test → GT)")
        logging.info("=" * 60)
        logging.info(f" GED total : {ged:.2f} opérations")
        logging.info(f" • GED normalisé : {normalized_ged:.2f}%")
        logging.info(f" • Similarité : {similarity:.2f}%")
        logging.info("")
        logging.info(f" Opérations sur NŒUDS :")
        logging.info(f" • Suppressions : {num_node_deletions} (= hallucinés)")
        logging.info(f" • Insertions : {num_node_insertions} (= manqués GT)")
        logging.info("")
        logging.info(f" Opérations sur ARÊTES :")
        logging.info(f" • Suppressions : {num_edge_deletions} (= arêtes en trop)")
        logging.info(f" • Insertions : {num_edge_insertions} (= arêtes manquantes)")
        logging.info("")
        logging.info(f"🔹 COHÉRENCE AVEC MÉTRIQUES :")
        logging.info(f" ✓ fp_edges = {len(fp_edges)} → Suppressions = {num_edge_deletions}")
        logging.info(f" ✓ fn_edges = {len(fn_edges)} → Insertions = {num_edge_insertions}")
        logging.info("=" * 60)
        return {
            "ged": round(ged, 2),
            "normalized_ged": round(normalized_ged, 2),
            "structural_similarity": round(similarity, 2),
            "gt_nodes_count": len(gt_nodes),
            "gt_edges_count": len(gt_edges),
            "test_nodes_count": len(test_nodes),
            "test_edges_count": len(test_edges),
            # Node operations
            "num_node_deletions": num_node_deletions,
            "num_node_insertions": num_node_insertions,
            "num_node_substitutions": num_node_substitutions,
            # Edge operations
            "num_edge_deletions": num_edge_deletions,
            "num_edge_insertions": num_edge_insertions,
            "num_edge_substitutions": num_edge_substitutions,
            # Detailed lists (truncated to 10 entries each)
            "node_deletions": hallucinated_list[:10],
            "node_insertions": missed_gt_list[:10],
            "edge_deletions": [f"{a} - {b}" for a, b in list(fp_edges)[:10]],
            "edge_insertions": [f"{a} - {b}" for a, b in list(fn_edges)[:10]]
        }
    except Exception as e:
        logging.error(f"⚠️ Erreur lors du calcul du GED : {e}")
        return {
            "ged": None,
            "normalized_ged": None,
            "structural_similarity": None,
            "num_node_deletions": 0,
            "num_node_insertions": 0,
            "num_node_substitutions": 0,
            "num_edge_deletions": 0,
            "num_edge_insertions": 0,
            "num_edge_substitutions": 0,
            "error": str(e)
        }
def calculate_hallucination_metrics(gt_nodes, test_nodes, mapped_test_labels):
    """Compute hallucination/detection statistics for generated nodes.

    A test node is "correct" when its mapped label matches a GT label not
    already claimed by an earlier test node; otherwise (no match, or a
    duplicate match on the same GT label) it counts as hallucinated.

    NOTE(review): this function is re-defined later in the module; the later
    definition shadows this one at import time — confirm which is intended.
    """
    gt_label_set = {n["label"] for n in gt_nodes}
    generated_labels = [n["label"] for n in test_nodes]

    hallucinated = []
    correct = []
    claimed = {}  # gt_label -> first test label that matched it

    for generated, mapped in zip(generated_labels, mapped_test_labels):
        if mapped is not None and mapped in gt_label_set and mapped not in claimed:
            # First match on this GT label → correct detection.
            correct.append(generated)
            claimed[mapped] = generated
        else:
            # No match, label outside GT, or duplicate match → hallucination.
            hallucinated.append(generated)

    detected = set(claimed)
    n_generated = len(generated_labels)
    n_gt = len(gt_label_set)
    hallucination_rate = (len(hallucinated) / n_generated * 100) if n_generated > 0 else 0
    detection_rate = (len(detected) / n_gt * 100) if n_gt > 0 else 0
    return {
        "hallucination_rate": round(hallucination_rate, 2),
        "detection_rate": round(detection_rate, 2),
        "total_generated": n_generated,
        "num_hallucinated": len(hallucinated),
        "num_correct": len(correct),
        "hallucinated_nodes": hallucinated,
        "correct_nodes": correct,
        "total_gt": n_gt,
        "num_detected": len(detected),
        "num_missed": n_gt - len(detected),
        "missed_gt_nodes": list(gt_label_set - detected)
    }
def compare_graphs_with_semantic_fast(gt_nodes, gt_edges, test_nodes, test_edges,
                                      fuzzy_threshold=80, semantic_threshold=70,
                                      alpha=0.6, edge_mode="hybrid"):
    """Full GT-vs-test graph comparison: nodes, edges, hierarchy and GED.

    Pipeline: hybrid (fuzzy + semantic) node matching → hallucination
    metrics → undirected edge metrics → hierarchical metrics → coherent GED.
    Edge direction is ignored, so there are no "inversion" metrics.

    Args:
        gt_nodes/gt_edges: Ground-truth graph.
        test_nodes/test_edges: Model-generated graph.
        fuzzy_threshold, semantic_threshold, alpha: Forwarded to
            `hybrid_match_optimized`.
        edge_mode: Unused here; kept for interface compatibility —
            TODO(review) confirm callers rely on it.

    Returns:
        dict: Flat report with node/edge precision-recall-F1 (percentages),
        hallucination and detection stats, hierarchy comparison, and GED.
    """
    gt_labels = [n["label"] for n in gt_nodes]
    test_labels = [n["label"] for n in test_nodes]
    # One (match, combined, fuzzy, semantic) tuple per test label.
    matches = hybrid_match_optimized(
        test_labels, gt_labels, fuzzy_threshold, semantic_threshold, alpha
    )
    mapped_test_labels = []
    matching_details = []
    for test_lbl, (match, combined_score, fuzzy_score, sem_score) in zip(test_labels, matches):
        if match:
            mapped_test_labels.append(match)
            matching_details.append({
                "test": test_lbl,
                "matched": match,
                "combined": round(combined_score, 2),
                "fuzzy": round(fuzzy_score, 2),
                "semantic": round(sem_score, 2)
            })
        else:
            mapped_test_labels.append(None)
            matching_details.append({
                "test": test_lbl,
                "matched": "NO_MATCH",
                "combined": 0,
                "fuzzy": 0,
                "semantic": 0
            })
    hallucination_metrics = calculate_hallucination_metrics(
        gt_nodes, test_nodes, mapped_test_labels
    )
    # test label -> GT label, for matched test labels only.
    label_mapping = {
        test_lbl: match
        for test_lbl, (match, _, _, _) in zip(test_labels, matches)
        if match
    }
    hallucinated_nodes_list = hallucination_metrics["hallucinated_nodes"]
    # Node-level confusion sets, in GT label space.
    gt_set = set(gt_labels)
    test_set = set([lbl for lbl in mapped_test_labels if lbl is not None])
    tp_nodes = gt_set & test_set
    extra_nodes = set(hallucinated_nodes_list)
    fn_nodes = gt_set - test_set
    gt_id2lbl = {n["id"]: n["label"] for n in gt_nodes}
    test_id2lbl = {n["id"]: n["label"] for n in test_nodes}
    # Edges compared undirected (direction never penalized).
    edge_metrics = compute_edge_metrics_undirected(
        gt_edges, test_edges, gt_id2lbl, test_id2lbl, label_mapping
    )
    prec_edges = edge_metrics["precision_edges"] / 100
    rec_edges = edge_metrics["recall_edges"] / 100
    f1_edges = edge_metrics["f1_edges"] / 100
    tp_edges = set(edge_metrics["tp_edges"])
    fp_edges = set(edge_metrics["fp_edges"])
    fn_edges = set(edge_metrics["fn_edges"])
    prec_nodes = (
        len(tp_nodes) / (len(tp_nodes) + len(extra_nodes))
        if tp_nodes or extra_nodes
        else 1.0
    )
    rec_nodes = (
        len(tp_nodes) / (len(tp_nodes) + len(fn_nodes))
        if tp_nodes or fn_nodes
        else 1.0
    )
    f1_nodes = (
        2 * prec_nodes * rec_nodes / (prec_nodes + rec_nodes)
        if (prec_nodes + rec_nodes)
        else 0.0
    )
    hierarchical_metrics = compute_hierarchical_metrics(
        gt_nodes, gt_edges, test_nodes, test_edges, label_mapping
    )
    # GED reuses the hallucination + edge metrics so all numbers stay coherent.
    ged_metrics = compute_ged_coherent(
        gt_nodes, gt_edges,
        test_nodes, test_edges,
        hallucination_metrics,
        edge_metrics,
        label_mapping
    )
    return {
        "precision_nodes": round(prec_nodes * 100, 2),
        "recall_nodes": round(rec_nodes * 100, 2),
        "f1_nodes": round(f1_nodes * 100, 2),
        "precision_edges": round(prec_edges * 100, 2),
        "recall_edges": round(rec_edges * 100, 2),
        "f1_edges": round(f1_edges * 100, 2),
        # "Overall" scores are the plain mean of node and edge scores.
        "overall_precision": round(((prec_nodes + prec_edges) / 2) * 100, 2),
        "overall_recall": round(((rec_nodes + rec_edges) / 2) * 100, 2),
        "overall_f1": round(((f1_nodes + f1_edges) / 2) * 100, 2),
        "missing_nodes": list(fn_nodes),
        "extra_nodes": list(extra_nodes),
        "missing_edges": list(fn_edges),
        "extra_edges": list(fp_edges),
        "matching_details": matching_details,
        "tp_edges": list(tp_edges),
        "fp_edges": list(fp_edges),
        "fn_edges": list(fn_edges),
        "hallucination_rate": hallucination_metrics["hallucination_rate"],
        "detection_rate": hallucination_metrics["detection_rate"],
        "total_generated": hallucination_metrics["total_generated"],
        "num_hallucinated": hallucination_metrics["num_hallucinated"],
        "num_correct": hallucination_metrics["num_correct"],
        "hallucinated_nodes": hallucination_metrics["hallucinated_nodes"],
        "total_gt": hallucination_metrics["total_gt"],
        "num_detected": hallucination_metrics["num_detected"],
        "num_missed": hallucination_metrics["num_missed"],
        "missed_gt_nodes": hallucination_metrics["missed_gt_nodes"],
        "gt_depth": hierarchical_metrics["gt_depth"],
        "test_depth": hierarchical_metrics["test_depth"],
        "depth_difference": hierarchical_metrics["depth_difference"],
        "depth_match": hierarchical_metrics["depth_match"],
        "level_accuracy": hierarchical_metrics["level_accuracy"],
        "structure_similarity": hierarchical_metrics["structure_similarity"],
        "gt_nodes_per_level": hierarchical_metrics["gt_nodes_per_level"],
        "test_nodes_per_level": hierarchical_metrics["test_nodes_per_level"],
        "ged": ged_metrics.get("ged"),
        "normalized_ged": ged_metrics.get("normalized_ged"),
        "structural_similarity": ged_metrics.get("structural_similarity"),
        "num_node_insertions": ged_metrics.get("num_node_insertions", 0),
        "num_node_deletions": ged_metrics.get("num_node_deletions", 0),
        "num_edge_insertions": ged_metrics.get("num_edge_insertions", 0),
        "num_edge_deletions": ged_metrics.get("num_edge_deletions", 0),
    }
# ============================================
# SECTION 5 : MÉTRIQUES D'HALLUCINATION
# ============================================
def calculate_hallucination_metrics(gt_nodes, test_nodes, mapped_test_labels):
    """
    Compute hallucination/detection statistics for generated graph nodes.

    Args:
        gt_nodes (list[dict]): Ground-truth nodes, each with a "label" key.
        test_nodes (list[dict]): Generated nodes, each with a "label" key.
        mapped_test_labels (list): For each test node, the GT label it matched,
            or None when no match was found.

    Returns:
        dict: Rates (percent, rounded to 2 decimals), counts, and the lists of
        hallucinated / correct / missed node labels, for both the model view
        (generated nodes) and the ground-truth view.

    STRICT RULES:
      - num_correct      = test nodes that matched a GT node (first match only)
      - num_hallucinated = test nodes that did not match, or that matched a GT
                           node already claimed by an earlier test node
      - num_detected     = GT nodes found by at least one test node
      - num_missed       = GT nodes never found

    INVARIANTS (enforced by assertions below):
      - num_correct + num_hallucinated == total_generated
      - num_detected + num_missed == total_gt
      - num_correct == num_detected (duplicate matches are reclassified as
        hallucinations, so each correct test node claims exactly one GT node)
    """
    # ═════════════════════════════════════════════════════════════════════
    # STEP 1: label extraction
    # ═════════════════════════════════════════════════════════════════════
    gt_labels = {n["label"] for n in gt_nodes}
    test_labels = [n["label"] for n in test_nodes]
    # ═════════════════════════════════════════════════════════════════════
    # STEP 2: classification, preventing double-counting of GT matches
    # ═════════════════════════════════════════════════════════════════════
    hallucinated_nodes = []
    correct_nodes = []
    matched_gt_nodes = set()
    already_matched_gt = {}  # {gt_label: test_label} to trace duplicates
    for test_label, matched_label in zip(test_labels, mapped_test_labels):
        if matched_label is None or matched_label not in gt_labels:
            hallucinated_nodes.append(test_label)
        elif matched_label in already_matched_gt:
            # Duplicate match on an already-claimed GT node → hallucination
            logging.warning(
                f"⚠️ Doublon : '{test_label}' matche '{matched_label}' "
                f"déjà matché par '{already_matched_gt[matched_label]}'"
            )
            hallucinated_nodes.append(test_label)
        else:
            # First match on this GT node → correct
            correct_nodes.append(test_label)
            matched_gt_nodes.add(matched_label)
            already_matched_gt[matched_label] = test_label
    # ═════════════════════════════════════════════════════════════════════
    # STEP 3: counts
    # ═════════════════════════════════════════════════════════════════════
    total_generated = len(test_labels)
    num_hallucinated = len(hallucinated_nodes)
    num_correct = len(correct_nodes)
    total_gt = len(gt_labels)
    num_detected = len(matched_gt_nodes)
    num_missed = total_gt - num_detected
    # ═════════════════════════════════════════════════════════════════════
    # STEP 4: consistency checks
    # BUGFIX: the original messages concatenated the operands with no
    # comparison operator (f"{a} + {b}{c}"), producing unreadable output.
    # ═════════════════════════════════════════════════════════════════════
    assert num_correct + num_hallucinated == total_generated, \
        f"❌ {num_correct} + {num_hallucinated} != {total_generated}"
    assert num_detected + num_missed == total_gt, \
        f"❌ {num_detected} + {num_missed} != {total_gt}"
    assert num_correct == num_detected, \
        f"❌ CRITIQUE : {num_correct} corrects ≠ {num_detected} détectés"
    # ═════════════════════════════════════════════════════════════════════
    # STEP 5: rates (guard against division by zero on empty inputs)
    # ═════════════════════════════════════════════════════════════════════
    hallucination_rate = (num_hallucinated / total_generated * 100) if total_generated > 0 else 0
    detection_rate = (num_detected / total_gt * 100) if total_gt > 0 else 0
    # ═════════════════════════════════════════════════════════════════════
    # STEP 6: detailed logs
    # ═════════════════════════════════════════════════════════════════════
    logging.info("=" * 60)
    logging.info("📊 MÉTRIQUES D'HALLUCINATION")
    logging.info("=" * 60)
    logging.info(f"🔹 Vue MODÈLE (test) :")
    logging.info(f" • Total généré : {total_generated}")
    logging.info(f" • Corrects : {num_correct}")
    logging.info(f" • Hallucinés : {num_hallucinated}")
    logging.info(f" ✓ {num_correct} + {num_hallucinated} = {total_generated}")
    logging.info("")
    logging.info(f"🔹 Vue GROUND TRUTH :")
    logging.info(f" • Total GT : {total_gt}")
    logging.info(f" • Détectés : {num_detected}")
    logging.info(f" • Manqués : {num_missed}")
    logging.info(f" ✓ {num_detected} + {num_missed} = {total_gt}")
    logging.info("")
    logging.info(f"🔹 COHÉRENCE :")
    logging.info(f" ✓ Corrects = Détectés : {num_correct} = {num_detected}")
    logging.info("=" * 60)
    return {
        "hallucination_rate": round(hallucination_rate, 2),
        "detection_rate": round(detection_rate, 2),
        "total_generated": total_generated,
        "num_hallucinated": num_hallucinated,
        "num_correct": num_correct,
        "hallucinated_nodes": hallucinated_nodes,
        "correct_nodes": correct_nodes,
        "total_gt": total_gt,
        "num_detected": num_detected,
        "num_missed": num_missed,
        "missed_gt_nodes": list(gt_labels - matched_gt_nodes)
    }
# ============================================
# SECTION 6 : COMPARAISON AVEC FUZZY UNIQUEMENT
# ============================================
def compare_graphs(gt_nodes, gt_edges, test_nodes, test_edges, threshold=100):
    """
    Compare two graphs using ONLY fuzzy (Levenshtein) matching.

    Args:
        gt_nodes (list): Ground-truth nodes ({"id", "label"} dicts)
        gt_edges (list): Ground-truth edges ({"source", "target"} dicts)
        test_nodes (list): Generated nodes
        test_edges (list): Generated edges
        threshold (int): Fuzzy similarity threshold (0-100)

    Returns:
        dict: Same node/edge precision/recall/F1 and hallucination metrics
        as the semantic variant (percentages rounded to 2 decimals).

    Note:
        Simplified version without semantic matching (used when
        use_semantic=False in the UI). Faster, but weaker on synonyms:
        no batch processing, no Top-K filtering, Levenshtein distance only.

    Robustness fix:
        `process.extractOne` returns None when the choices list is empty;
        the original unpacked that result unconditionally and crashed with a
        TypeError whenever the ground truth had no nodes. An empty GT is now
        handled explicitly (every test label is treated as unmatched).
    """
    # ═════════════════════════════════════════════════════════════════════
    # STEP 1: label extraction
    # ═════════════════════════════════════════════════════════════════════
    gt_labels = [n["label"] for n in gt_nodes]
    test_labels = [n["label"] for n in test_nodes]

    def _fuzzy_best(label):
        """Best GT label for `label` (normalized Levenshtein >= threshold), else None."""
        if not gt_labels:
            return None  # extractOne would return None → avoid unpack crash
        match, score, _ = process.extractOne(label, gt_labels, scorer=fuzz.ratio)
        return match if score >= threshold else None

    # ═════════════════════════════════════════════════════════════════════
    # STEP 2: fuzzy-map every generated label onto the GT vocabulary
    # ═════════════════════════════════════════════════════════════════════
    mapped_test_labels = [_fuzzy_best(lbl) for lbl in test_labels]
    # ═════════════════════════════════════════════════════════════════════
    # STEP 3: hallucination metrics
    # ═════════════════════════════════════════════════════════════════════
    hallucination_metrics = calculate_hallucination_metrics(
        gt_nodes, test_nodes, mapped_test_labels
    )
    # ═════════════════════════════════════════════════════════════════════
    # STEP 4: node and edge set metrics (mirrors the semantic variant)
    # ═════════════════════════════════════════════════════════════════════
    gt_set = set(gt_labels)
    test_set = {lbl for lbl in mapped_test_labels if lbl is not None}
    tp_nodes = gt_set & test_set
    fp_nodes = test_set - gt_set
    fn_nodes = gt_set - test_set

    def remap_edges(edges, id_to_label):
        """Remap edge endpoints onto GT labels via fuzzy matching.

        Unlike node mapping, an endpoint below the threshold keeps its
        original label instead of becoming None (original behavior).
        Raises KeyError if an edge references an unknown node id.
        """
        remapped = set()
        for e in edges:
            a = id_to_label[e["source"]]
            b = id_to_label[e["target"]]
            a_match = _fuzzy_best(a)
            b_match = _fuzzy_best(b)
            if a_match is not None:
                a = a_match
            if b_match is not None:
                b = b_match
            remapped.add((a, b))
        return remapped

    gt_id2lbl = {n["id"]: n["label"] for n in gt_nodes}
    test_id2lbl = {n["id"]: n["label"] for n in test_nodes}
    gt_edges_set = remap_edges(gt_edges, gt_id2lbl)
    test_edges_set = remap_edges(test_edges, test_id2lbl)
    tp_edges = gt_edges_set & test_edges_set
    fp_edges = test_edges_set - gt_edges_set
    fn_edges = gt_edges_set - test_edges_set

    def _prf(tp, fp, fn):
        """Precision/recall/F1 with the original's empty-set conventions
        (precision/recall default to 1.0 when their denominator is empty)."""
        prec = len(tp) / (len(tp) + len(fp)) if tp or fp else 1.0
        rec = len(tp) / (len(tp) + len(fn)) if tp or fn else 1.0
        f1 = (2 * prec * rec / (prec + rec)) if (prec + rec) else 0.0
        return prec, rec, f1

    prec_nodes, rec_nodes, f1_nodes = _prf(tp_nodes, fp_nodes, fn_nodes)
    prec_edges, rec_edges, f1_edges = _prf(tp_edges, fp_edges, fn_edges)
    return {
        "precision_nodes": round(prec_nodes * 100, 2),
        "recall_nodes": round(rec_nodes * 100, 2),
        "f1_nodes": round(f1_nodes * 100, 2),
        "precision_edges": round(prec_edges * 100, 2),
        "recall_edges": round(rec_edges * 100, 2),
        "f1_edges": round(f1_edges * 100, 2),
        "overall_precision": round(((prec_nodes + prec_edges) / 2) * 100, 2),
        "overall_recall": round(((rec_nodes + rec_edges) / 2) * 100, 2),
        "overall_f1": round(((f1_nodes + f1_edges) / 2) * 100, 2),
        "missing_nodes": list(fn_nodes),
        "extra_nodes": list(fp_nodes),
        "missing_edges": list(fn_edges),
        "extra_edges": list(fp_edges),
        # Hallucination metrics (same structure as the semantic variant)
        "hallucination_rate": hallucination_metrics["hallucination_rate"],
        "detection_rate": hallucination_metrics["detection_rate"],
        "total_generated": hallucination_metrics["total_generated"],
        "num_hallucinated": hallucination_metrics["num_hallucinated"],
        "num_correct": hallucination_metrics["num_correct"],
        "hallucinated_nodes": hallucination_metrics["hallucinated_nodes"],
        "total_gt": hallucination_metrics["total_gt"],
        "num_detected": hallucination_metrics["num_detected"],
        "num_missed": hallucination_metrics["num_missed"],
        "missed_gt_nodes": hallucination_metrics["missed_gt_nodes"]
    }
# ============================================
# SECTION 7 : GÉNÉRATION DE RAPPORTS
# ============================================
def save_results_to_pdf(results, model_name):
    """
    Generate a modern, visually formatted PDF performance report.

    Args:
        results (list[dict]): One entry per processed document, each with a
            "file" key (document name) and a "details" dict of metrics
            (precision/recall/F1, hallucination counts, hierarchy, GED, ...).
        model_name (str): Name of the evaluated model; used in headings and
            in the output filename.

    Side effects:
        Writes ``reports/<model_name>_report.pdf`` (creating the ``reports``
        directory if needed) and logs the output path.

    Raises:
        FileNotFoundError: if the DejaVuSans TTF fonts are not found next to
            the script (required for Unicode text with FPDF).

    NOTE(review): assumes `results` is non-empty — the synthesis page divides
    by len(results); confirm callers never pass an empty list.
    """
    os.makedirs("reports", exist_ok=True)
    # ═════════════════════════════════════════════════════════════════════
    # HELPER: sanitize problematic characters
    # ═════════════════════════════════════════════════════════════════════
    def clean_text(text):
        """
        Sanitize text to avoid Unicode errors with FPDF.

        Replaces common special characters with ASCII equivalents, then
        drops anything outside printable ASCII / Latin-1 (emojis, symbols),
        substituting '?' so cell rendering stays predictable.
        """
        if not isinstance(text, str):
            text = str(text)
        # Replacement table: arrows, typographic punctuation, math symbols,
        # logic operators, circled digits → ASCII equivalents
        replacements = {
            '→': '->',
            '←': '<-',
            '↔': '<->',
            '…': '...',
            '–': '-',
            '—': '-',
            '"': '"',
            '"': '"',
            ''': "'",
            ''': "'",
            '«': '"',
            '»': '"',
            '×': 'x',
            '÷': '/',
            '≤': '<=',
            '≥': '>=',
            '≠': '!=',
            '≈': '~=',
            '∞': 'inf',
            '√': 'sqrt',
            '∑': 'sum',
            '∏': 'prod',
            '∫': 'int',
            '∂': 'd',
            '∆': 'delta',
            '∇': 'nabla',
            '∈': 'in',
            '∉': 'not in',
            '⊂': 'subset',
            '⊃': 'superset',
            '∩': 'intersect',
            '∪': 'union',
            '∧': 'and',
            '∨': 'or',
            '¬': 'not',
            '⊕': 'xor',
            '⊗': 'otimes',
            '①': '(1)',
            '②': '(2)',
            '③': '(3)',
            '④': '(4)',
            '⑤': '(5)',
            '⑥': '(6)',
            '⑦': '(7)',
            '⑧': '(8)',
            '⑨': '(9)',
            '⑩': '(10)',
        }
        # Apply the replacements
        for old, new in replacements.items():
            text = text.replace(old, new)
        # Drop non-printable characters and emojis.
        # Keep only: letters, digits, basic punctuation, whitespace.
        cleaned = ""
        for char in text:
            code = ord(char)
            # Keep printable ASCII + Latin-1 supplement characters
            if (32 <= code <= 126) or (160 <= code <= 255):
                cleaned += char
            elif code == 9 or code == 10 or code == 13:  # Tab, LF, CR
                cleaned += char
            else:
                cleaned += "?"  # Replace unsupported characters
        return cleaned
    # ═════════════════════════════════════════════════════════════════════
    # PDF configuration
    # ═════════════════════════════════════════════════════════════════════
    pdf = FPDF()
    # NOTE(review): _SCRIPT_DIR is a module-level constant defined elsewhere
    # in this file — presumably the directory containing this script.
    base = os.path.abspath(_SCRIPT_DIR)
    regular = os.path.abspath(os.path.join(base, "DejaVuSans.ttf"))
    bold = os.path.abspath(os.path.join(base, "DejaVuSans-Bold.ttf"))
    if not os.path.exists(regular) or not os.path.exists(bold):
        raise FileNotFoundError(
            f"Police introuvable. Attendu: '{regular}' et '{bold}'."
        )
    import glob
    # Remove stale FPDF font cache files (*.pkl) so the freshly registered
    # fonts are always re-parsed (best-effort: failures are non-fatal).
    cache_dirs = {base, os.getcwd()}
    for d in cache_dirs:
        try:
            for cache_file in glob.glob(os.path.join(d, "DejaVu*.pkl")):
                try:
                    os.remove(cache_file)
                except Exception:
                    pass  # locked or already-removed cache file: ignore
        except Exception:
            pass  # unreadable directory: ignore
    FONT_FAMILY = "DejaVuLocal"
    pdf.add_font(FONT_FAMILY, "", regular, uni=True)
    pdf.add_font(FONT_FAMILY, "B", bold, uni=True)
    # Monkey-patch set_font so every call below resolves to the local DejaVu
    # family regardless of the family name passed (helpers pass "DejaVu").
    _orig_set_font = pdf.set_font
    def _set_font(_family, *args, **kwargs):
        # Ignore the requested family; always use the registered local font.
        return _orig_set_font(FONT_FAMILY, *args, **kwargs)
    pdf.set_font = _set_font
    pdf.add_page()
    # ═════════════════════════════════════════════════════════════════════
    # HELPER FUNCTIONS (drawing primitives; all text goes through clean_text)
    # ═════════════════════════════════════════════════════════════════════
    def draw_colored_header(text, color_r, color_g, color_b):
        """Draw a full-width colored banner with white centered text."""
        pdf.set_fill_color(color_r, color_g, color_b)
        pdf.set_text_color(255, 255, 255)
        pdf.set_font("DejaVu", "B", 14)
        pdf.cell(0, 10, clean_text(text), ln=True, align="C", fill=True)  # ← sanitized
        pdf.set_text_color(0, 0, 0)
        pdf.ln(3)
    def draw_section_title(text, emoji=""):
        """Draw a section title on a light grey band (emoji arg is unused)."""
        pdf.set_font("DejaVu", "B", 11)
        pdf.set_fill_color(240, 240, 240)
        pdf.cell(0, 8, clean_text(text), ln=True, fill=True)  # ← sanitized
        pdf.ln(2)
    def draw_progress_bar(label, value, max_value=100, width=140):
        """Draw a colored progress bar: green >= 80, yellow >= 60, else red."""
        pdf.set_font("DejaVu", "", 9)
        # Label (sanitized)
        pdf.cell(50, 6, clean_text(label), 0, 0)
        # Value
        pdf.set_font("DejaVu", "B", 9)
        pdf.cell(15, 6, f"{value:.1f}%", 0, 0, "R")
        # Background bar (light grey)
        x_start = pdf.get_x() + 2
        y_start = pdf.get_y()
        pdf.set_fill_color(220, 220, 220)
        pdf.rect(x_start, y_start + 1, width, 4, "F")
        # Progress bar (color depends on value)
        if value >= 80:
            pdf.set_fill_color(76, 175, 80)  # Green
        elif value >= 60:
            pdf.set_fill_color(255, 193, 7)  # Yellow
        else:
            pdf.set_fill_color(244, 67, 54)  # Red
        progress_width = (value / max_value) * width
        pdf.rect(x_start, y_start + 1, progress_width, 4, "F")
        pdf.ln(7)
    def draw_metric_box(label, value, color_r, color_g, color_b):
        """Draw a colored metric box: a label cell plus a value cell."""
        pdf.set_fill_color(color_r, color_g, color_b)
        pdf.set_text_color(255, 255, 255)
        pdf.set_font("DejaVu", "B", 10)
        # Colored box (sanitized)
        pdf.cell(60, 8, clean_text(label), 1, 0, "C", fill=True)
        pdf.cell(30, 8, clean_text(str(value)), 1, 0, "C", fill=True)
        pdf.set_text_color(0, 0, 0)
        pdf.ln()
    def draw_table_header(headers):
        """Draw a table header row (white on blue, equal column widths)."""
        pdf.set_fill_color(63, 81, 181)  # Blue
        pdf.set_text_color(255, 255, 255)
        pdf.set_font("DejaVu", "B", 9)
        col_width = 190 / len(headers)
        for header in headers:
            pdf.cell(col_width, 7, clean_text(header), 1, 0, "C", fill=True)  # ← sanitized
        pdf.ln()
        pdf.set_text_color(0, 0, 0)
    def draw_table_row(values, fill=False):
        """Draw one table row; fill=True gives a light grey background."""
        pdf.set_font("DejaVu", "", 8)
        if fill:
            pdf.set_fill_color(245, 245, 245)
        col_width = 190 / len(values)
        for value in values:
            pdf.cell(col_width, 6, clean_text(str(value)), 1, 0, "C", fill)  # ← sanitized
        pdf.ln()
    # ═════════════════════════════════════════════════════════════════════
    # MAIN REPORT HEADER
    # ═════════════════════════════════════════════════════════════════════
    draw_colored_header(f"RAPPORT DE PERFORMANCE - {model_name}", 33, 150, 243)
    # Date and time of generation
    from datetime import datetime
    now = datetime.now()
    date_str = now.strftime('%d/%m/%Y')
    heure_str = now.strftime('%H:%M:%S')
    pdf.set_font("DejaVu", "", 9)
    pdf.cell(0, 5, f"Date : {date_str}", ln=True, align="C")
    pdf.cell(0, 5, f"Heure : {heure_str}", ln=True, align="C")
    pdf.cell(0, 5, f"Modele : {model_name}", ln=True, align="C")
    pdf.cell(0, 5, f"Nombre de documents : {len(results)}", ln=True, align="C")
    pdf.ln(5)
    # ═════════════════════════════════════════════════════════════════════
    # LOOP OVER EACH FILE (one page per document)
    # ═════════════════════════════════════════════════════════════════════
    for idx, res in enumerate(results, 1):
        details = res["details"]
        if idx > 1:
            pdf.add_page()
        # Title with document name + model
        pdf.set_fill_color(96, 125, 139)
        pdf.set_text_color(255, 255, 255)
        pdf.set_font("DejaVu", "B", 12)
        pdf.cell(0, 10, clean_text(f"Document {idx}: {res['file']}"), ln=True, fill=True, align="C")
        # Model and processing time subtitle
        pdf.set_text_color(0, 0, 0)
        pdf.set_font("DejaVu", "", 8)
        pdf.cell(0, 5, f"Modele utilise : {model_name} | Traite le : {date_str} a {heure_str}", ln=True, align="C")
        pdf.ln(3)
        # ─────────────────────────────────────────────────────────────────
        # SECTION 1: Overview (overall precision / recall / F1 bars)
        # ─────────────────────────────────────────────────────────────────
        draw_section_title("VUE D'ENSEMBLE")
        draw_progress_bar("Precision Globale", details.get('overall_precision', 0))
        draw_progress_bar("Rappel Global", details.get('overall_recall', 0))
        draw_progress_bar("F1-Score Global", details.get('overall_f1', 0))
        pdf.ln(3)
        # ─────────────────────────────────────────────────────────────────
        # SECTION 2: Node metrics
        # ─────────────────────────────────────────────────────────────────
        draw_section_title("ANALYSE DES NOEUDS")
        # Metrics table
        draw_table_header(["Metrique", "Precision", "Rappel", "F1-Score"])
        draw_table_row([
            "Noeuds",
            f"{details.get('precision_nodes', 0)}%",
            f"{details.get('recall_nodes', 0)}%",
            f"{details.get('f1_nodes', 0)}%"
        ])
        pdf.ln(3)
        # Generation statistics
        pdf.set_font("DejaVu", "B", 9)
        pdf.cell(0, 6, "Statistiques de generation :", ln=True)
        pdf.set_font("DejaVu", "", 9)
        total_gen = details.get('total_generated', 0)
        num_correct = details.get('num_correct', 0)
        num_halluc = details.get('num_hallucinated', 0)
        total_gt = details.get('total_gt', 0)
        num_detected = details.get('num_detected', 0)
        num_missed = details.get('num_missed', 0)
        # Row 1: generated nodes vs GT nodes
        pdf.cell(95, 6, f" Noeuds generes : {total_gen}", 0, 0)
        pdf.cell(95, 6, f" Noeuds GT : {total_gt}", 0, 1)
        # Row 2: correct nodes vs detected nodes
        pdf.cell(95, 6, f" Noeuds corrects : {num_correct}", 0, 0)
        pdf.cell(95, 6, f" Noeuds detectes : {num_detected}", 0, 1)
        # Row 3: hallucinations vs missed
        pdf.cell(95, 6, f" Noeuds hallucines : {num_halluc}", 0, 0)
        pdf.cell(95, 6, f" Noeuds GT manques : {num_missed}", 0, 1)
        # Row 4: hallucination + detection rates
        halluc_rate = details.get('hallucination_rate', 0)
        detect_rate = details.get('detection_rate', 0)
        pdf.cell(95, 6, f" Taux hallucination : {halluc_rate}%", 0, 0)
        pdf.cell(95, 6, f" Taux detection : {detect_rate}%", 0, 1)
        pdf.ln(3)
        # ═════════════════════════════════════════════════════════════════
        # Missed GT nodes (first 5 only)
        # ═════════════════════════════════════════════════════════════════
        missed_gt_nodes = details.get('missed_gt_nodes', [])
        if missed_gt_nodes:
            pdf.set_font("DejaVu", "B", 9)
            pdf.set_fill_color(255, 243, 224)  # Light orange background
            pdf.cell(0, 6, f"Noeuds GT non detectes ({len(missed_gt_nodes)}) :", ln=True, fill=True)
            pdf.set_font("DejaVu", "", 8)
            # Show the first 5
            for i, node in enumerate(missed_gt_nodes[:5], 1):
                pdf.cell(5, 5, "", 0, 0)  # Indentation
                pdf.cell(0, 5, clean_text(f"{i}. {node}"), ln=True)
            if len(missed_gt_nodes) > 5:
                pdf.cell(5, 5, "", 0, 0)
                pdf.cell(0, 5, f"... et {len(missed_gt_nodes) - 5} autres", ln=True)
            pdf.ln(2)
        # ═════════════════════════════════════════════════════════════════
        # Hallucinated nodes (first 5 only, if any)
        # ═════════════════════════════════════════════════════════════════
        hallucinated_nodes = details.get('hallucinated_nodes', [])
        if hallucinated_nodes:
            pdf.set_font("DejaVu", "B", 9)
            pdf.set_fill_color(255, 235, 230)  # Light red background
            pdf.cell(0, 6, f"Noeuds hallucines ({len(hallucinated_nodes)}) :", ln=True, fill=True)
            pdf.set_font("DejaVu", "", 8)
            # Show the first 5
            for i, node in enumerate(hallucinated_nodes[:5], 1):
                pdf.cell(5, 5, "", 0, 0)
                pdf.cell(0, 5, clean_text(f"{i}. {node}"), ln=True)
            if len(hallucinated_nodes) > 5:
                pdf.cell(5, 5, "", 0, 0)
                pdf.cell(0, 5, f"... et {len(hallucinated_nodes) - 5} autres", ln=True)
            pdf.ln(2)
        pdf.ln(2)
        # ─────────────────────────────────────────────────────────────────
        # SECTION 3: Edge metrics
        # ─────────────────────────────────────────────────────────────────
        draw_section_title("ANALYSE DES ARETES")  # ← no emoji (FPDF-safe)
        # Metrics table
        draw_table_header(["Metrique", "Precision", "Rappel", "F1-Score"])
        draw_table_row([
            "Aretes",
            f"{details.get('precision_edges', 0)}%",
            f"{details.get('recall_edges', 0)}%",
            f"{details.get('f1_edges', 0)}%"
        ])
        pdf.ln(2)
        # Edge statistics — the metrics may arrive as sets; coerce to lists
        tp_edges = details.get('tp_edges', [])
        fp_edges = details.get('fp_edges', [])
        fn_edges = details.get('fn_edges', [])
        inverted = details.get('inverted_edges', [])
        if not isinstance(tp_edges, list):
            tp_edges = list(tp_edges) if tp_edges else []
        if not isinstance(fp_edges, list):
            fp_edges = list(fp_edges) if fp_edges else []
        if not isinstance(fn_edges, list):
            fn_edges = list(fn_edges) if fn_edges else []
        if not isinstance(inverted, list):
            inverted = list(inverted) if inverted else []
        num_tp = len(tp_edges)
        num_fp = len(fp_edges)
        num_fn = len(fn_edges)
        num_inv = len(inverted)
        total_gt_edges = num_tp + num_fn
        total_test_edges = num_tp + num_fp + num_inv
        # Summary table
        draw_table_header(["Type", "GT", "Test", "Correctes", "Manquantes", "En trop", "Inversees"])
        draw_table_row([
            "Aretes",
            str(total_gt_edges),
            str(total_test_edges),
            str(num_tp),
            str(num_fn),
            str(num_fp),
            str(num_inv)
        ])
        pdf.ln(3)
        # ─────────────────────────────────────────────────────────────────
        # SECTION 4: Hierarchy (only when depth metrics are available)
        # ─────────────────────────────────────────────────────────────────
        if 'gt_depth' in details:
            draw_section_title("STRUCTURE HIERARCHIQUE")  # ← no emoji
            gt_depth = details.get('gt_depth', 0)
            test_depth = details.get('test_depth', 0)
            # Hierarchy table
            draw_table_header(["Metrique", "Valeur"])
            draw_table_row(["Profondeur GT", f"{gt_depth} niveaux"], fill=True)
            draw_table_row(["Profondeur Test", f"{test_depth} niveaux"])
            draw_table_row(["Difference", f"{details.get('depth_difference', 0)} niveaux"], fill=True)
            draw_table_row(["Precision niveaux", f"{details.get('level_accuracy', 0)}%"])
            draw_table_row(["Similarite structure", f"{details.get('structure_similarity', 0)}%"], fill=True)
            pdf.ln(2)
            # Per-level node distribution
            if details.get('gt_nodes_per_level') and details.get('test_nodes_per_level'):
                pdf.set_font("DejaVu", "B", 9)
                pdf.cell(0, 6, "Distribution par niveau :", ln=True)
                draw_table_header(["Niveau", "Noeuds GT", "Noeuds Test", "Ecart"])
                max_level = max(
                    max(details['gt_nodes_per_level'].keys(), default=0),
                    max(details['test_nodes_per_level'].keys(), default=0)
                )
                for level in range(max_level + 1):
                    gt_count = len(details['gt_nodes_per_level'].get(level, []))
                    test_count = len(details['test_nodes_per_level'].get(level, []))
                    ecart = abs(gt_count - test_count)
                    draw_table_row([
                        f"Niveau {level}",
                        str(gt_count),
                        str(test_count),
                        str(ecart)
                    ], fill=(level % 2 == 0))
                pdf.ln(3)
        # ─────────────────────────────────────────────────────────────────
        # SECTION: Graph Edit Distance (GED), when computed
        # ─────────────────────────────────────────────────────────────────
        if 'ged' in details and details['ged'] is not None:
            draw_section_title("GRAPH EDIT DISTANCE (GED)")
            ged_value = details.get('ged', 0)
            normalized_ged = details.get('normalized_ged', 0)
            structural_sim = details.get('structural_similarity', 0)
            # Main GED table
            draw_table_header(["Metrique", "Valeur"])
            draw_table_row(["GED (operations)", f"{ged_value:.2f}"], fill=True)
            draw_table_row(["GED Normalise", f"{normalized_ged:.2f}%"])
            draw_table_row(["Similarite Structurelle", f"{structural_sim:.2f}%"], fill=True)
            pdf.ln(2)
            # Operation breakdown
            pdf.set_font("DejaVu", "B", 9)
            pdf.cell(0, 6, "Operations requises :", ln=True)
            node_ins = details.get('num_node_insertions', 0)
            node_del = details.get('num_node_deletions', 0)
            edge_ins = details.get('num_edge_insertions', 0)
            edge_del = details.get('num_edge_deletions', 0)
            # Operations table
            draw_table_header(["Type", "Insertions", "Suppressions", "Total"])
            draw_table_row(["Noeuds", str(node_ins), str(node_del), str(node_ins + node_del)], fill=True)
            draw_table_row(["Aretes", str(edge_ins), str(edge_del), str(edge_ins + edge_del)])
            pdf.ln(3)
        # ─────────────────────────────────────────────────────────────────
        # SECTION 5: Error details — only when enough vertical space remains
        # ─────────────────────────────────────────────────────────────────
        if pdf.get_y() < 200:  # Threshold raised so this section has room
            draw_section_title("DETAILS DES ERREURS")
            pdf.set_font("DejaVu", "", 8)
            # ═════════════════════════════════════════════════════════════
            # Hallucinated nodes (top 10)
            # ═════════════════════════════════════════════════════════════
            hallucinated_nodes = details.get('hallucinated_nodes', [])
            if hallucinated_nodes:
                pdf.set_font("DejaVu", "B", 9)
                pdf.set_fill_color(255, 235, 230)  # Light red background
                pdf.cell(0, 6, f"Noeuds hallucines ({len(hallucinated_nodes)}) :", ln=True, fill=True)
                pdf.set_font("DejaVu", "", 8)
                for i, node in enumerate(hallucinated_nodes[:10], 1):  # Top 10
                    pdf.cell(5, 5, "", 0, 0)  # Indentation
                    pdf.cell(0, 5, clean_text(f"{i}. {node}"), ln=True)
                if len(hallucinated_nodes) > 10:
                    pdf.cell(5, 5, "", 0, 0)
                    pdf.cell(0, 5, f"... et {len(hallucinated_nodes) - 10} autres", ln=True)
                pdf.ln(2)
            # ═════════════════════════════════════════════════════════════
            # Missed GT nodes (top 10)
            # ═════════════════════════════════════════════════════════════
            missed_gt_nodes = details.get('missed_gt_nodes', [])
            if missed_gt_nodes:
                pdf.set_font("DejaVu", "B", 9)
                pdf.set_fill_color(255, 248, 225)  # Light orange background
                pdf.cell(0, 6, f"Noeuds GT manques ({len(missed_gt_nodes)}) :", ln=True, fill=True)
                pdf.set_font("DejaVu", "", 8)
                for i, node in enumerate(missed_gt_nodes[:10], 1):  # Top 10
                    pdf.cell(5, 5, "", 0, 0)  # Indentation
                    pdf.cell(0, 5, clean_text(f"{i}. {node}"), ln=True)
                if len(missed_gt_nodes) > 10:
                    pdf.cell(5, 5, "", 0, 0)
                    pdf.cell(0, 5, f"... et {len(missed_gt_nodes) - 10} autres", ln=True)
                pdf.ln(2)
            # ═════════════════════════════════════════════════════════════
            # Extra nodes (top 10)
            # ═════════════════════════════════════════════════════════════
            extra_nodes = details.get('extra_nodes', [])
            if extra_nodes:
                pdf.set_font("DejaVu", "B", 9)
                pdf.set_fill_color(240, 240, 255)  # Light blue background
                pdf.cell(0, 6, f"Noeuds en trop ({len(extra_nodes)}) :", ln=True, fill=True)
                pdf.set_font("DejaVu", "", 8)
                for i, node in enumerate(extra_nodes[:10], 1):
                    pdf.cell(5, 5, "", 0, 0)
                    pdf.cell(0, 5, clean_text(f"{i}. {node}"), ln=True)
                if len(extra_nodes) > 10:
                    pdf.cell(5, 5, "", 0, 0)
                    pdf.cell(0, 5, f"... et {len(extra_nodes) - 10} autres", ln=True)
                pdf.ln(2)
            # ═════════════════════════════════════════════════════════════
            # Missing edges (top 10; tuples only)
            # ═════════════════════════════════════════════════════════════
            missing_edges = details.get('missing_edges', [])
            if missing_edges:
                pdf.set_font("DejaVu", "B", 9)
                pdf.set_fill_color(255, 243, 224)  # Light yellow background
                pdf.cell(0, 6, f"Aretes manquantes ({len(missing_edges)}) :", ln=True, fill=True)
                pdf.set_font("DejaVu", "", 8)
                for i, edge in enumerate(missing_edges[:10], 1):
                    # Only (source, target) tuples are printable here
                    if isinstance(edge, tuple) and len(edge) == 2:
                        src, tgt = edge
                        pdf.cell(5, 5, "", 0, 0)
                        pdf.cell(0, 5, clean_text(f"{i}. {src} -> {tgt}"), ln=True)
                if len(missing_edges) > 10:
                    pdf.cell(5, 5, "", 0, 0)
                    pdf.cell(0, 5, f"... et {len(missing_edges) - 10} autres", ln=True)
                pdf.ln(2)
            # ═════════════════════════════════════════════════════════════
            # Extra edges (top 10; tuples only)
            # ═════════════════════════════════════════════════════════════
            extra_edges = details.get('extra_edges', [])
            if extra_edges:
                pdf.set_font("DejaVu", "B", 9)
                pdf.set_fill_color(235, 245, 255)  # Light cyan background
                pdf.cell(0, 6, f"Aretes en trop ({len(extra_edges)}) :", ln=True, fill=True)
                pdf.set_font("DejaVu", "", 8)
                for i, edge in enumerate(extra_edges[:10], 1):
                    if isinstance(edge, tuple) and len(edge) == 2:
                        src, tgt = edge
                        pdf.cell(5, 5, "", 0, 0)
                        pdf.cell(0, 5, clean_text(f"{i}. {src} -> {tgt}"), ln=True)
                if len(extra_edges) > 10:
                    pdf.cell(5, 5, "", 0, 0)
                    pdf.cell(0, 5, f"... et {len(extra_edges) - 10} autres", ln=True)
                pdf.ln(2)
            # ═════════════════════════════════════════════════════════════
            # Inverted edges (top 10; tuples only)
            # ═════════════════════════════════════════════════════════════
            inverted_edges = details.get('inverted_edges', [])
            if inverted_edges:
                pdf.set_font("DejaVu", "B", 9)
                pdf.set_fill_color(255, 240, 245)  # Light pink background
                pdf.cell(0, 6, f"Aretes inversees ({len(inverted_edges)}) :", ln=True, fill=True)
                pdf.set_font("DejaVu", "", 8)
                if not isinstance(inverted_edges, list):
                    inverted_edges = list(inverted_edges) if inverted_edges else []
                for i, edge in enumerate(inverted_edges[:10], 1):
                    if isinstance(edge, tuple) and len(edge) == 2:
                        src, tgt = edge
                        pdf.cell(5, 5, "", 0, 0)
                        pdf.cell(0, 5, clean_text(f"{i}. {src} -> {tgt} (devrait etre {tgt} -> {src})"), ln=True)
                if len(inverted_edges) > 10:
                    pdf.cell(5, 5, "", 0, 0)
                    pdf.cell(0, 5, f"... et {len(inverted_edges) - 10} autres", ln=True)
    # ═════════════════════════════════════════════════════════════════════
    # FINAL SUMMARY PAGE (averages + per-document recap)
    # ═════════════════════════════════════════════════════════════════════
    pdf.add_page()
    draw_colored_header("SYNTHESE GLOBALE", 76, 175, 80)  # ← no emoji
    # Averages across all documents
    # NOTE(review): raises ZeroDivisionError when results is empty — confirm
    # callers guarantee at least one result.
    n = len(results)
    avg_prec = sum(r["details"].get("overall_precision", 0) for r in results) / n
    avg_rec = sum(r["details"].get("overall_recall", 0) for r in results) / n
    avg_f1 = sum(r["details"].get("overall_f1", 0) for r in results) / n
    draw_section_title("MOYENNES SUR TOUS LES DOCUMENTS")
    draw_progress_bar("Precision Moyenne", avg_prec)
    draw_progress_bar("Rappel Moyen", avg_rec)
    draw_progress_bar("F1-Score Moyen", avg_f1)
    pdf.ln(5)
    # Per-document recap table
    draw_section_title("RECAPITULATIF PAR DOCUMENT")
    draw_table_header(["Document", "Precision", "Rappel", "F1"])
    for i, r in enumerate(results, 1):
        d = r["details"]
        draw_table_row([
            r["file"][:25],  # Truncate long filenames
            f"{d.get('overall_precision', 0):.1f}%",
            f"{d.get('overall_recall', 0):.1f}%",
            f"{d.get('overall_f1', 0):.1f}%"
        ], fill=(i % 2 == 0))
    # ═════════════════════════════════════════════════════════════════════
    # Save the report to disk
    # ═════════════════════════════════════════════════════════════════════
    output_path = f"reports/{model_name}_report.pdf"
    pdf.output(output_path)
    logging.info(f"Rapport PDF moderne sauvegarde dans {output_path}")
def plot_all_boxplots():
    """
    Build three box-plots comparing the performance of every tested model.

    Returns:
        tuple: Up to three matplotlib figures (precision, recall, F1
        distributions); empty when no model has been recorded yet.

    Note:
        Reads the module-level list ``model_precision_records``, which
        accumulates one record per (model, document) pair during the session.

    Reading the plots:
        - Box: Q1 to Q3 (middle 50% of values)
        - Center line: median
        - Whiskers: min/max (excluding outliers)
        - Dots: outliers
    """
    # Group the accumulated records by model name.
    grouped = {}
    for record in model_precision_records:
        grouped.setdefault(record["model"], []).append(record)
    model_names = list(grouped)

    # One spec per metric: (record key, y-axis label, figure title).
    metric_specs = (
        ("precision", "Précision (%)", "Distribution des Précisions"),
        ("recall", "Rappel (%)", "Distribution des Rappels"),
        ("f1", "F1-Score (%)", "Distribution des F1-Scores"),
    )

    figures = []
    for record_key, axis_label, plot_title in metric_specs:
        # One series per model, in model_names order.
        series = [[entry[record_key] for entry in grouped[name]] for name in model_names]
        # Nothing recorded → nothing to draw for this metric.
        if not series or not model_names:
            continue
        fig, axis = plt.subplots()
        axis.boxplot(series, tick_labels=model_names)
        axis.set_ylabel(axis_label)
        axis.set_title(plot_title)
        # Slant the model names so long labels stay readable.
        plt.xticks(rotation=30, ha="right")
        plt.tight_layout()
        figures.append(fig)
    return tuple(figures)
def save_performance_plots():
    """
    Persist the performance box-plots as temporary PNG files.

    Returns:
        list: Paths of the PNG files written (one per available figure).

    Note:
        Files are created in the system temp directory with the prefixes
        "precision_", "recall_" and "f1_" and are left for the operating
        system to clean up.
    """
    saved_paths = []
    # Pair each generated figure with its filename prefix; zip stops at
    # the shorter sequence, so fewer than 3 figures is handled naturally.
    for figure, metric in zip(plot_all_boxplots(), ("precision", "recall", "f1")):
        handle = tempfile.NamedTemporaryFile(
            delete=False,                 # keep the file after close
            suffix=".png",
            prefix=f"{metric}_",
            dir=tempfile.gettempdir(),    # system temp folder
        )
        figure.savefig(handle.name, format="png", bbox_inches="tight")
        plt.close(figure)                 # release the figure's memory
        handle.close()
        saved_paths.append(handle.name)
    return saved_paths
# ============================================
# SECTION 8 : FONCTION PRINCIPALE DE TRAITEMENT
# ============================================
def process_files(files, model_choice, use_semantic=True, fuzzy_threshold=80,
                  semantic_threshold=70, alpha=0.6, edge_mode="hybrid"):
    """
    Main function: process a batch of files and evaluate model performance.

    Args:
        files (list): Uploaded files (Gradio file objects).
        model_choice (str): Display name of the model to use.
        use_semantic (bool): Enable hybrid fuzzy+semantic matching
            (otherwise fuzzy matching only).
        fuzzy_threshold (int): Fuzzy-matching threshold (0-100).
        semantic_threshold (int): Semantic-matching threshold (0-100).
        alpha (float): Fuzzy/semantic weighting (0-1).
        edge_mode (str): Edge-comparison mode forwarded to the comparator.

    Returns:
        tuple: 8 elements, matching the Gradio outputs wiring:
            (gallery_images, mermaid_text, dropdown_update, iframe_dict,
             gt_graph_html, test_graph_html, summary_html, report_msg)

    Pipeline:
        1. Clear Neo4j
        2. Per file: PDF -> image, Base64 encoding, Mermaid generation,
           Mermaid -> JSON, Ground-Truth comparison, Neo4j upload,
           diff persistence
        3. Average-metric computation
        4. PDF report generation
        5. Building the Gradio outputs
    """
    # ═════════════════════════════════════════════════════════════════════
    # STEP 0: Input validation
    # ═════════════════════════════════════════════════════════════════════
    if not files:
        # BUGFIX: this early return previously had only 7 elements while the
        # interface is wired for 8 outputs (the GT/Test graph panels were
        # added later) — pad with the two empty panel strings.
        return None, "", gr_update(choices=[], value=None), {}, "", "", "", ""
    # ═════════════════════════════════════════════════════════════════════
    # STEP 1: Initialisation
    # ═════════════════════════════════════════════════════════════════════
    neo4j_ok = clear_neo4j()  # empty the database for this batch
    if not neo4j_ok:
        logging.error("Neo4j indisponible: la génération continue sans upload Neo4j.")
    # Working structures
    results = []          # per-file comparison results
    images = []           # images for the Gradio gallery
    mermaids = []         # generated Mermaid sources
    docs = []             # document names
    iframe_dict = {}      # doc_name -> HTML iframes
    error_messages = []   # per-file error summaries for the UI
    # Map the display name to an internal model key (default: Gemini).
    # Prefixes are checked in the same order as the original if/elif chain.
    model_choice_str = str(model_choice or "")
    lowered = model_choice_str.lower()
    model_key = "Gemini"
    for prefix, key in (
        ("gemini", "Gemini"),
        ("gemma", "GEMMA"),
        ("llama", "LLAMA"),
        ("qwen", "QWEN"),
        ("nvidia", "NVIDIA"),
        ("internvl", "InternVL"),
    ):
        if lowered.startswith(prefix):
            model_key = key
            break
    # The generation function is the same for every file: hoist the
    # dispatch out of the loop (it was rebuilt on each iteration).
    mermaid_fn = {
        "Gemini": generate_mermaid_from_image_gemini,
        "LLAMA": generate_mermaid_from_llama,
        "GEMMA": generate_mermaid_from_gemma,
        "QWEN": generate_mermaid_from_qwen,
        "NVIDIA": generate_mermaid_from_nvidia,
        "InternVL": generate_mermaid_from_intern,
    }.get(model_key, generate_mermaid_from_image_gemini)  # Gemini by default
    # ═════════════════════════════════════════════════════════════════════
    # STEP 2: Per-file processing
    # ═════════════════════════════════════════════════════════════════════
    for file in files:
        diff_path = None  # path of the differences file for this document
        try:
            # ─────────────────────────────────────────────────────────────
            # 2.1: Preparation
            # ─────────────────────────────────────────────────────────────
            # File name without extension
            name = os.path.splitext(os.path.basename(file.name))[0]
            # Path of the matching Ground Truth
            gt = os.path.join("GT", f"{name}.json")
            gt_exists = os.path.exists(gt)
            if not gt_exists:
                logging.error(f"Ground truth manquant pour {name}")
            # ─────────────────────────────────────────────────────────────
            # 2.2: Image conversion (if PDF)
            # ─────────────────────────────────────────────────────────────
            if file.name.lower().endswith(".pdf"):
                img = convert_pdf_to_image(file.name)
            else:
                img = file.name  # already an image
            # ─────────────────────────────────────────────────────────────
            # 2.3: Base64 encoding
            # ─────────────────────────────────────────────────────────────
            b64 = encode_image(img)
            # ─────────────────────────────────────────────────────────────
            # 2.4: Mermaid generation by the selected model
            # ─────────────────────────────────────────────────────────────
            code = mermaid_fn(b64)
            # ─────────────────────────────────────────────────────────────
            # 2.5: Mermaid -> JSON conversion
            # ─────────────────────────────────────────────────────────────
            j = mermaid_to_json(code)
            # Load the Ground Truth once (it was previously loaded twice:
            # once for the comparison, once more for the Neo4j upload).
            if gt_exists:
                gt_n, gt_e = load_json(gt)
            # ─────────────────────────────────────────────────────────────
            # 2.6: Comparison against the Ground Truth
            # ─────────────────────────────────────────────────────────────
            if gt_exists:
                if use_semantic:
                    cmp = compare_graphs_with_semantic_fast(
                        gt_n, gt_e, j["nodes"], j["edges"],
                        fuzzy_threshold=fuzzy_threshold,
                        semantic_threshold=semantic_threshold,
                        alpha=alpha,
                        edge_mode=edge_mode
                    )
                else:
                    cmp = compare_graphs(
                        gt_n, gt_e, j["nodes"], j["edges"],
                        threshold=fuzzy_threshold
                    )
            else:
                # No GT available: neutral metrics so the pipeline continues.
                cmp = {
                    "overall_precision": 0,
                    "overall_recall": 0,
                    "overall_f1": 0,
                    "precision_nodes": 0,
                    "recall_nodes": 0,
                    "f1_nodes": 0,
                    "precision_edges": 0,
                    "recall_edges": 0,
                    "f1_edges": 0,
                    "missing_nodes": [],
                    "extra_nodes": [],
                    "missing_edges": [],
                    "extra_edges": [],
                    "hallucination_rate": 0,
                    "detection_rate": 0,
                    "total_generated": len(j.get("nodes", [])),
                    "num_hallucinated": 0,
                    "num_correct": 0,
                    "hallucinated_nodes": [],
                    "correct_nodes": [],
                    "total_gt": 0,
                    "num_detected": 0,
                    "num_missed": 0,
                    "missed_gt_nodes": []
                }
            # ─────────────────────────────────────────────────────────────
            # 2.7: Persist the differences as a JSON file
            # ─────────────────────────────────────────────────────────────
            diff_path = f"graph_diff_{name}.json"
            diff_data = {
                "missing_nodes": cmp["missing_nodes"],
                "extra_nodes": cmp["extra_nodes"],
                "missing_edges": cmp["missing_edges"],
                "extra_edges": cmp["extra_edges"],
            }
            # Include matching details when the comparator provides them
            if "matching_details" in cmp:
                diff_data["matching_details"] = cmp["matching_details"]
            with open(diff_path, "w", encoding="utf-8") as f:
                json.dump(diff_data, f, ensure_ascii=False, indent=2)
            # BUGFIX: separator between name and path (they were concatenated)
            logging.info(f"Différences écrites pour {name}: {diff_path}")
            # ─────────────────────────────────────────────────────────────
            # 2.8: Neo4j upload
            # ─────────────────────────────────────────────────────────────
            if gt_exists:
                gt_json = {"nodes": [{"id": k, "label": v} for k, v in gt_n.items()], "edges": gt_e}
            else:
                gt_json = {"nodes": [], "edges": []}
            # Upload GT and Test together
            if neo4j_ok:
                upload_gt_and_test_to_neo4j(gt_json, j, name)
            # ─────────────────────────────────────────────────────────────
            # 2.9: Record the results
            # ─────────────────────────────────────────────────────────────
            results.append({
                "file": name,
                "precision": cmp["overall_precision"],
                "details": {
                    **cmp,
                    "test_nodes": j.get("nodes", []),
                    "test_edges": j.get("edges", [])
                }
            })
            # Image.open is lazy and would keep the file handle open for the
            # whole session: load eagerly and close the handle.
            with Image.open(img) as im:
                images.append(im.copy())
            mermaids.append(code)
        except Exception as e:
            logging.error(f"Erreur sur le fichier {file.name}: {e}")
            error_messages.append(f"{os.path.basename(file.name)}: {e}")
            continue
    # ═════════════════════════════════════════════════════════════════════
    # STEP 3: Global error handling (no file succeeded)
    # ═════════════════════════════════════════════════════════════════════
    if not results:
        empty_msg = '<div style="text-align:center;padding:40px;color:#7f8c8d;background:#f8f9fa;border:2px dashed #e0e0e0;border-radius:8px;">Aucun graphe disponible</div>'
        if error_messages:
            details = "<br/>".join(error_messages[-8:])  # show the last 8 errors
            empty_msg = (
                '<div style="padding:16px;background:#fff3cd;border:1px solid #ffeeba;border-radius:8px;">'
                '<div style="font-weight:600;margin-bottom:8px;">Erreurs lors de la génération</div>'
                f'<div style="font-family:monospace;white-space:pre-wrap;">{details}</div>'
                '</div>'
            )
        return (
            None,                                       # 1. images (gallery)
            "",                                         # 2. mermaid (textbox)
            gr_update(choices=[], value=None),          # 3. dropdown
            {},                                         # 4. iframe_dict (json)
            empty_msg,                                  # 5. gt_graph_panel (html)
            empty_msg,                                  # 6. test_graph_panel (html)
            "<div>Erreur sur tous les fichiers</div>",  # 7. summary (html)
            ""                                          # 8. report_status (textbox)
        )
    # ═════════════════════════════════════════════════════════════════════
    # STEP 4: Average metrics
    # ═════════════════════════════════════════════════════════════════════
    n = len(results)  # number of successfully processed files (>= 1 here)
    # Classic metrics
    avg_precision = round(
        sum(r["details"]["overall_precision"] for r in results) / n, 2
    )
    avg_recall = round(
        sum(r["details"]["overall_recall"] for r in results) / n, 2
    )
    avg_f1 = round(
        sum(r["details"]["overall_f1"] for r in results) / n, 2
    )
    # Hallucination metrics
    avg_hallucination = round(
        sum(r["details"].get("hallucination_rate", 0) for r in results) / n, 2
    )
    avg_detection = round(
        sum(r["details"].get("detection_rate", 0) for r in results) / n, 2
    )
    # Cumulative totals
    total_hallucinated = sum(r["details"].get("num_hallucinated", 0) for r in results)
    total_generated = sum(r["details"].get("total_generated", 0) for r in results)
    total_detected = sum(r["details"].get("num_detected", 0) for r in results)
    total_gt = sum(r["details"].get("total_gt", 0) for r in results)
    # ═════════════════════════════════════════════════════════════════════
    # STEP 5: Record for the session-wide performance plots
    # ═════════════════════════════════════════════════════════════════════
    model_precision_records.append({
        "model": model_choice,
        "precision": avg_precision,
        "recall": avg_recall,
        "f1": avg_f1,
        "hallucination_rate": avg_hallucination
    })
    # ═════════════════════════════════════════════════════════════════════
    # STEP 6: PDF report (best effort — the app continues on failure)
    # ═════════════════════════════════════════════════════════════════════
    try:
        save_results_to_pdf(results, model_choice)
        report_status = "Rapport PDF généré."
    except Exception as e:
        logging.error(f"⚠️ Génération PDF ignorée (l'application continue): {e}")
        report_status = f"Rapport PDF non généré: {e}"
    # ═════════════════════════════════════════════════════════════════════
    # STEP 7: Build the summary HTML for the interface
    # ═════════════════════════════════════════════════════════════════════
    # Edge-metric averages
    avg_precision_edges = round(
        sum(r["details"]["precision_edges"] for r in results) / n, 2
    )
    avg_recall_edges = round(
        sum(r["details"]["recall_edges"] for r in results) / n, 2
    )
    avg_f1_edges = round(
        sum(r["details"]["f1_edges"] for r in results) / n, 2
    )
    avg_inversion_rate = round(
        sum(r["details"].get("inversion_rate", 0) for r in results) / n, 2
    )
    # Hierarchical-metric averages
    avg_gt_depth = round(
        sum(r["details"].get("gt_depth", 0) for r in results) / n, 1
    )
    avg_test_depth = round(
        sum(r["details"].get("test_depth", 0) for r in results) / n, 1
    )
    avg_level_accuracy = round(
        sum(r["details"].get("level_accuracy", 0) for r in results) / n, 2
    )
    avg_structure_similarity = round(
        sum(r["details"].get("structure_similarity", 0) for r in results) / n, 2
    )
    depth_matches = sum(1 for r in results if r["details"].get("depth_match", False))
    depth_match_rate = round(depth_matches / n * 100, 2)
    # GED averages (documents without a GED value contribute 0 to the
    # numerator while the denominator stays n — kept as before)
    avg_ged = round(
        sum(r["details"].get("ged", 0) for r in results if r["details"].get("ged") is not None) / n, 2
    ) if n > 0 else 0
    avg_normalized_ged = round(
        sum(r["details"].get("normalized_ged", 0) for r in results if
            r["details"].get("normalized_ged") is not None) / n, 2
    ) if n > 0 else 0
    avg_structural_sim_ged = round(
        sum(r["details"].get("structural_similarity", 0) for r in results if
            r["details"].get("structural_similarity") is not None) / n, 2
    ) if n > 0 else 0
    total_node_ins = sum(r["details"].get("num_node_insertions", 0) for r in results)
    total_node_del = sum(r["details"].get("num_node_deletions", 0) for r in results)
    total_edge_ins = sum(r["details"].get("num_edge_insertions", 0) for r in results)
    total_edge_del = sum(r["details"].get("num_edge_deletions", 0) for r in results)
    summary_html = f"""
    <div style='font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; background: #ffffff; padding: 24px; border-radius: 12px; border: 1px solid #e0e0e0; box-shadow: 0 1px 3px rgba(0,0,0,0.08);'>
        <!-- En-tête sobre -->
        <div style="border-bottom: 2px solid #f5f5f5; padding-bottom: 16px; margin-bottom: 24px;">
            <h2 style='color: #2c3e50; margin: 0; font-size: 1.5em; font-weight: 600;'>
                Résumé des Performances
            </h2>
            <p style="color: #7f8c8d; margin: 8px 0 0 0; font-size: 0.9em;">
                Analyse de {n} document(s) • Modèle : {model_choice} • Mode : {edge_mode}
            </p>
        </div>
        <!-- Métriques Principales -->
        <div style="margin-bottom: 32px;">
            <h3 style="color: #34495e; font-size: 1em; font-weight: 600; margin: 0 0 16px 0;">
                Métriques Globales
            </h3>
            <div style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 16px;">
                <!-- Précision -->
                <div style="background: #f8f9fa; padding: 16px; border-radius: 8px; border-left: 3px solid #3498db;">
                    <div style="color: #7f8c8d; font-size: 0.75em; text-transform: uppercase; letter-spacing: 0.5px; margin-bottom: 8px;">Précision</div>
                    <div style="color: #2c3e50; font-size: 2em; font-weight: 700; line-height: 1;">{avg_precision}%</div>
                </div>
                <!-- Rappel -->
                <div style="background: #f8f9fa; padding: 16px; border-radius: 8px; border-left: 3px solid #2ecc71;">
                    <div style="color: #7f8c8d; font-size: 0.75em; text-transform: uppercase; letter-spacing: 0.5px; margin-bottom: 8px;">Rappel</div>
                    <div style="color: #2c3e50; font-size: 2em; font-weight: 700; line-height: 1;">{avg_recall}%</div>
                </div>
                <!-- F1-Score -->
                <div style="background: #f8f9fa; padding: 16px; border-radius: 8px; border-left: 3px solid #9b59b6;">
                    <div style="color: #7f8c8d; font-size: 0.75em; text-transform: uppercase; letter-spacing: 0.5px; margin-bottom: 8px;">F1-Score</div>
                    <div style="color: #2c3e50; font-size: 2em; font-weight: 700; line-height: 1;">{avg_f1}%</div>
                </div>
            </div>
        </div>
        <!-- Analyse des Nœuds et Arêtes -->
        <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 24px; margin-bottom: 32px;">
            <!-- Nœuds -->
            <div>
                <h3 style="color: #34495e; font-size: 1em; font-weight: 600; margin: 0 0 12px 0;">
                    Analyse des Nœuds
                </h3>
                <div style="background: #ffffff; border: 1px solid #e0e0e0; border-radius: 8px; overflow: hidden;">
                    <table style="width: 100%; border-collapse: collapse; font-size: 0.85em;">
                        <tr style="background: #f8f9fa;">
                            <th style="padding: 10px; text-align: left; color: #7f8c8d; font-weight: 600; font-size: 0.8em; text-transform: uppercase; letter-spacing: 0.5px;">Métrique</th>
                            <th style="padding: 10px; text-align: right; color: #7f8c8d; font-weight: 600; font-size: 0.8em; text-transform: uppercase; letter-spacing: 0.5px;">Valeur</th>
                        </tr>
                        <tr>
                            <td style="padding: 10px; border-top: 1px solid #f0f0f0; color: #34495e;">Précision</td>
                            <td style="padding: 10px; border-top: 1px solid #f0f0f0; text-align: right; font-weight: 600; color: #2c3e50;">{sum(r["details"]["precision_nodes"] for r in results) / n:.1f}%</td>
                        </tr>
                        <tr>
                            <td style="padding: 10px; border-top: 1px solid #f0f0f0; color: #34495e;">Rappel</td>
                            <td style="padding: 10px; border-top: 1px solid #f0f0f0; text-align: right; font-weight: 600; color: #2c3e50;">{sum(r["details"]["recall_nodes"] for r in results) / n:.1f}%</td>
                        </tr>
                        <tr>
                            <td style="padding: 10px; border-top: 1px solid #f0f0f0; color: #34495e;">F1-Score</td>
                            <td style="padding: 10px; border-top: 1px solid #f0f0f0; text-align: right; font-weight: 600; color: #2c3e50;">{sum(r["details"]["f1_nodes"] for r in results) / n:.1f}%</td>
                        </tr>
                        <tr style="background: #fef9e7; border-top: 2px solid #f39c12;">
                            <td style="padding: 10px; color: #e67e22; font-weight: 600;">Hallucination</td>
                            <td style="padding: 10px; text-align: right; font-weight: 700; color: #e67e22;">{avg_hallucination}%</td>
                        </tr>
                    </table>
                </div>
            </div>
            <!-- Arêtes -->
            <div>
                <h3 style="color: #34495e; font-size: 1em; font-weight: 600; margin: 0 0 12px 0;">
                    Analyse des Arêtes
                </h3>
                <div style="background: #ffffff; border: 1px solid #e0e0e0; border-radius: 8px; overflow: hidden;">
                    <table style="width: 100%; border-collapse: collapse; font-size: 0.85em;">
                        <tr style="background: #f8f9fa;">
                            <th style="padding: 10px; text-align: left; color: #7f8c8d; font-weight: 600; font-size: 0.8em; text-transform: uppercase; letter-spacing: 0.5px;">Métrique</th>
                            <th style="padding: 10px; text-align: right; color: #7f8c8d; font-weight: 600; font-size: 0.8em; text-transform: uppercase; letter-spacing: 0.5px;">Valeur</th>
                        </tr>
                        <tr>
                            <td style="padding: 10px; border-top: 1px solid #f0f0f0; color: #34495e;">Précision</td>
                            <td style="padding: 10px; border-top: 1px solid #f0f0f0; text-align: right; font-weight: 600; color: #2c3e50;">{avg_precision_edges}%</td>
                        </tr>
                        <tr>
                            <td style="padding: 10px; border-top: 1px solid #f0f0f0; color: #34495e;">Rappel</td>
                            <td style="padding: 10px; border-top: 1px solid #f0f0f0; text-align: right; font-weight: 600; color: #2c3e50;">{avg_recall_edges}%</td>
                        </tr>
                        <tr>
                            <td style="padding: 10px; border-top: 1px solid #f0f0f0; color: #34495e;">F1-Score</td>
                            <td style="padding: 10px; border-top: 1px solid #f0f0f0; text-align: right; font-weight: 600; color: #2c3e50;">{avg_f1_edges}%</td>
                        </tr>
                        <tr style="background: #fef5e7; border-top: 2px solid #e67e22;">
                            <td style="padding: 10px; color: #d35400; font-weight: 600;">Inversions</td>
                            <td style="padding: 10px; text-align: right; font-weight: 700; color: #d35400;">{avg_inversion_rate}%</td>
                        </tr>
                    </table>
                </div>
            </div>
        </div>
        <!-- Hiérarchie -->
        <div style="margin-bottom: 32px;">
            <h3 style="color: #34495e; font-size: 1em; font-weight: 600; margin: 0 0 12px 0;">
                Structure Hiérarchique
            </h3>
            <div style="background: #ffffff; border: 1px solid #e0e0e0; border-radius: 8px; padding: 16px;">
                <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px;">
                    <div style="text-align: center; padding: 12px; background: #f8f9fa; border-radius: 6px;">
                        <div style="color: #7f8c8d; font-size: 0.7em; margin-bottom: 4px;">Profondeur GT</div>
                        <div style="color: #2c3e50; font-size: 1.5em; font-weight: 700;">{avg_gt_depth}</div>
                    </div>
                    <div style="text-align: center; padding: 12px; background: #f8f9fa; border-radius: 6px;">
                        <div style="color: #7f8c8d; font-size: 0.7em; margin-bottom: 4px;">Profondeur Test</div>
                        <div style="color: #2c3e50; font-size: 1.5em; font-weight: 700;">{avg_test_depth}</div>
                    </div>
                    <div style="text-align: center; padding: 12px; background: #f8f9fa; border-radius: 6px;">
                        <div style="color: #7f8c8d; font-size: 0.7em; margin-bottom: 4px;">Précision Niveaux</div>
                        <div style="color: #2c3e50; font-size: 1.5em; font-weight: 700;">{avg_level_accuracy}%</div>
                    </div>
                    <div style="text-align: center; padding: 12px; background: #f8f9fa; border-radius: 6px;">
                        <div style="color: #7f8c8d; font-size: 0.7em; margin-bottom: 4px;">Similarité</div>
                        <div style="color: #2c3e50; font-size: 1.5em; font-weight: 700;">{avg_structure_similarity}%</div>
                    </div>
                </div>
                <div style="margin-top: 12px; padding-top: 12px; border-top: 1px solid #e0e0e0; color: #7f8c8d; font-size: 0.75em;">
                    {depth_matches} document(s) avec profondeur exacte sur {n} ({depth_match_rate}%)
                </div>
            </div>
        </div>
        <!-- Graph Edit Distance -->
        <div style="margin-bottom: 32px;">
            <h3 style="color: #34495e; font-size: 1em; font-weight: 600; margin: 0 0 12px 0;">
                Graph Edit Distance (GED)
            </h3>
            <div style="background: #ffffff; border: 1px solid #e0e0e0; border-radius: 8px; padding: 16px;">
                <!-- Métriques principales GED -->
                <div style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 12px; margin-bottom: 16px;">
                    <div style="text-align: center; padding: 12px; background: #f8f9fa; border-radius: 6px;">
                        <div style="color: #7f8c8d; font-size: 0.7em; margin-bottom: 4px;">GED Moyen</div>
                        <div style="color: #2c3e50; font-size: 1.5em; font-weight: 700;">{avg_ged}</div>
                    </div>
                    <div style="text-align: center; padding: 12px; background: #f8f9fa; border-radius: 6px;">
                        <div style="color: #7f8c8d; font-size: 0.7em; margin-bottom: 4px;">GED Normalisé</div>
                        <div style="color: #2c3e50; font-size: 1.5em; font-weight: 700;">{avg_normalized_ged}%</div>
                    </div>
                    <div style="text-align: center; padding: 12px; background: #f8f9fa; border-radius: 6px;">
                        <div style="color: #7f8c8d; font-size: 0.7em; margin-bottom: 4px;">Similarité</div>
                        <div style="color: #2c3e50; font-size: 1.5em; font-weight: 700;">{avg_structural_sim_ged}%</div>
                    </div>
                </div>
                <!-- Détails des opérations -->
                <div style="padding-top: 12px; border-top: 1px solid #e0e0e0;">
                    <div style="color: #7f8c8d; font-size: 0.75em; margin-bottom: 8px; font-weight: 600;">Opérations totales requises :</div>
                    <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 8px; font-size: 0.85em;">
                        <div style="background: #fef5e7; padding: 8px; border-radius: 4px;">
                            <span style="color: #7f8c8d;">Insertions nœuds :</span>
                            <span style="color: #e67e22; font-weight: 600; margin-left: 8px;">{total_node_ins}</span>
                        </div>
                        <div style="background: #fef5e7; padding: 8px; border-radius: 4px;">
                            <span style="color: #7f8c8d;">Suppressions nœuds :</span>
                            <span style="color: #e67e22; font-weight: 600; margin-left: 8px;">{total_node_del}</span>
                        </div>
                        <div style="background: #ebf5fb; padding: 8px; border-radius: 4px;">
                            <span style="color: #7f8c8d;">Insertions arêtes :</span>
                            <span style="color: #3498db; font-weight: 600; margin-left: 8px;">{total_edge_ins}</span>
                        </div>
                        <div style="background: #ebf5fb; padding: 8px; border-radius: 4px;">
                            <span style="color: #7f8c8d;">Suppressions arêtes :</span>
                            <span style="color: #3498db; font-weight: 600; margin-left: 8px;">{total_edge_del}</span>
                        </div>
                    </div>
                    <div style="margin-top: 12px; padding: 8px; background: #f0f4f8; border-radius: 4px; font-size: 0.8em; color: #5a6c7d;">
                        💡 Le GED mesure le nombre minimum d'opérations (insertions/suppressions) pour transformer le graphe test en graphe GT
                    </div>
                </div>
            </div>
        </div>
        <!-- Statistiques Détaillées -->
        <div style="background: #f8f9fa; border-radius: 8px; padding: 16px;">
            <h3 style="color: #34495e; font-size: 0.9em; font-weight: 600; margin: 0 0 12px 0;">
                Statistiques de Génération
            </h3>
            <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 16px; font-size: 0.85em;">
                <div>
                    <span style="color: #7f8c8d;">Nœuds générés :</span>
                    <span style="color: #2c3e50; font-weight: 600; margin-left: 8px;">{total_generated}</span>
                </div>
                <div>
                    <span style="color: #7f8c8d;">Nœuds GT :</span>
                    <span style="color: #2c3e50; font-weight: 600; margin-left: 8px;">{total_gt}</span>
                </div>
                <div>
                    <span style="color: #7f8c8d;">Nœuds corrects :</span>
                    <span style="color: #27ae60; font-weight: 600; margin-left: 8px;">{total_generated - total_hallucinated}</span>
                </div>
                <div>
                    <span style="color: #7f8c8d;">Nœuds détectés :</span>
                    <span style="color: #27ae60; font-weight: 600; margin-left: 8px;">{total_detected}</span>
                </div>
                <div>
                    <span style="color: #7f8c8d;">Nœuds hallucinés :</span>
                    <span style="color: #e74c3c; font-weight: 600; margin-left: 8px;">{total_hallucinated}</span>
                </div>
                <div>
                    <span style="color: #7f8c8d;">Taux de détection :</span>
                    <span style="color: #2c3e50; font-weight: 600; margin-left: 8px;">{avg_detection}%</span>
                </div>
            </div>
        </div>
    </div>
    """
    # ═════════════════════════════════════════════════════════════════════
    # STEP 8: Build the iframes for the graph visualisation
    # ═════════════════════════════════════════════════════════════════════
    for r in results:
        doc = r["file"]
        docs.append(doc)
        # Render Mermaid directly (HF-compatible, no local :8000 server needed)
        gt_path = os.path.join("GT", f"{doc}.json")
        if os.path.exists(gt_path):
            gt_nodes, gt_edges = load_json(gt_path)
            gt_json = {"nodes": [{"id": k, "label": v} for k, v in gt_nodes.items()], "edges": gt_edges}
            gt_mermaid = graph_json_to_mermaid(gt_json)
        else:
            gt_mermaid = "graph TD;"
        test_mermaid = graph_json_to_mermaid({"nodes": r["details"].get("test_nodes", []), "edges": r["details"].get("test_edges", [])})
        gt_iframe_html = mermaid_to_html(gt_mermaid, f"mermaid_gt_{doc}")
        test_iframe_html = mermaid_to_html(test_mermaid, f"mermaid_test_{doc}")
        # Keep both renderings for the dropdown callback
        iframe_dict[doc] = {
            "gt": gt_iframe_html,
            "test": test_iframe_html
        }
    # ═════════════════════════════════════════════════════════════════════
    # STEP 9: Prepare the Gradio outputs
    # ═════════════════════════════════════════════════════════════════════
    # Dropdown populated with the document list
    dropdown_update = gr_update(choices=docs, value=docs[0])
    # Initial iframes (first document)
    first_doc_iframes = iframe_dict[docs[0]]
    initial_gt_graph = first_doc_iframes["gt"]
    initial_test_graph = first_doc_iframes["test"]
    # BUGFIX: the status message previously always claimed the report was
    # saved, even when STEP 6 failed — surface the real status instead.
    if report_status == "Rapport PDF généré.":
        report_msg = f"Rapport sauvegardé : reports/{model_choice}_report.pdf"
    else:
        report_msg = report_status
    # ═════════════════════════════════════════════════════════════════
    # RETURN: 8-element tuple for Gradio
    # ═════════════════════════════════════════════════════════════════
    return (
        images,                    # 1. image gallery
        "\n\n".join(mermaids),     # 2. Mermaid sources
        dropdown_update,           # 3. dropdown
        iframe_dict,               # 4. iframe dict
        initial_gt_graph,          # 5. GT graph of the first doc
        initial_test_graph,        # 6. Test graph of the first doc
        summary_html,              # 7. summary
        report_msg                 # 8. status message
    )
# ============================================
# SECTION 9 : FONCTIONS AUXILIAIRES POUR L'INTERFACE
# ============================================
def select_graph(selected_doc, iframe_dict):
    """
    Return the GT and Test iframe HTML for the selected document.

    Args:
        selected_doc (str): Name of the selected document.
        iframe_dict (dict): Maps doc_name -> {"gt": gt_html, "test": test_html}.

    Returns:
        tuple: (gt_iframe_html, test_iframe_html); a placeholder pair when
        the selection is empty or unknown.
    """
    placeholder = '<div style="text-align:center;padding:40px;color:#7f8c8d;">Aucun graphe disponible</div>'
    # Guard clause: empty selection or document not rendered yet.
    if not selected_doc or selected_doc not in iframe_dict:
        return placeholder, placeholder
    pair = iframe_dict[selected_doc]
    return pair["gt"], pair["test"]
def aggregate_and_preview(paths, threshold, fuzzy_threshold=70, semantic_threshold=70):
    """
    Aggregate exported mindmap JSONs and build a preview plus comparison metrics.

    Args:
        paths (list): JSON files exported beforehand (step 5.1 of the UI).
        threshold (float): Node-frequency threshold in percent (0-100);
            divided by 100 before being handed to the aggregator.
        fuzzy_threshold (int): Fuzzy-matching threshold (0-100).
        semantic_threshold (int): Semantic-matching threshold (0-100).

    Returns:
        tuple: (aggregate_json_path_or_None, preview_html, metrics_html) —
        always 3 outputs so the Gradio wiring stays consistent.

    Note:
        The preview iframe points at a local static server
        (http://127.0.0.1:8000) — presumably started elsewhere; the preview
        only renders when that server is running. TODO confirm.
    """
    # Guard: nothing to aggregate.
    if not paths or len(paths) == 0:
        error_html = """
        <div style="background:#fff3cd;border:1px solid #ffc107;border-radius:8px;padding:20px;text-align:center;">
            <h3 style="color:#856404;">⚠️ Aucun fichier JSON</h3>
            <p style="color:#856404;">Exportez d'abord les fichiers à l'étape 5.1</p>
        </div>
        """
        return None, error_html, ""  # 3 outputs: (path, preview, metrics)
    logging.info(f"🔀 Début agrégation: {len(paths)} fichiers")
    try:
        agg = aggregate_mindmaps(
            paths,
            "aggregate.json",
            threshold / 100,  # aggregator expects a 0-1 ratio
            fuzzy_threshold=fuzzy_threshold,
            semantic_threshold=semantic_threshold
        )
        # Empty result: every node fell below the frequency threshold.
        if len(agg['nodes']) == 0:
            warning_html = f"""
            <div style="background:#fff3cd;border:1px solid #ffc107;border-radius:8px;padding:20px;text-align:center;">
                <h3 style="color:#856404;">⚠️ Graphe vide</h3>
                <p style="color:#856404;">
                    Aucun nœud au-dessus du seuil de {threshold}%<br>
                    <strong>→ Diminuez le seuil (ex: 30%)</strong>
                </p>
            </div>
            """
            return "aggregate.json", warning_html, ""  # 3 outputs
    except Exception as e:
        logging.error(f"❌ Erreur agrégation: {e}")
        import traceback
        logging.error(traceback.format_exc())
        error_html = f"""
        <div style="background:#f8d7da;border:1px solid #f5c6cb;border-radius:8px;padding:20px;text-align:center;">
            <h3 style="color:#721c24;">❌ Erreur</h3>
            <p style="color:#721c24;font-family:monospace;font-size:0.9em;">{str(e)}</p>
        </div>
        """
        return None, error_html, ""  # 3 outputs
    # Build the preview iframe (timestamp query parameter busts the cache).
    ts = int(time.time())
    src = f"http://127.0.0.1:8000/visual_aggregate.html?t={ts}"
    iframe_html = f"""
    <div style="border:1px solid #e0e0e0;border-radius:8px;overflow:hidden;background:white;">
        <div style="background:#f8f9fa;padding:12px;border-bottom:1px solid #e0e0e0;">
            <h4 style="margin:0;color:#2c3e50;">
                🔗 Graphe Agrégé
                <span style="color:#7f8c8d;font-size:0.85em;font-weight:normal;">
                    ({len(agg['nodes'])} nœuds, {len(agg['edges'])} arêtes)
                </span>
            </h4>
            <p style="margin:5px 0 0 0;font-size:0.75em;color:#7f8c8d;">
                Seuils: Fréquence {threshold}% | Fuzzy {fuzzy_threshold}% | Sémantique {semantic_threshold}%
            </p>
        </div>
        <iframe src="{src}"
                style="width:100%;height:700px;border:none;background:white;display:block;">
        </iframe>
    </div>
    """
    # Forward the thresholds so the GT comparison uses the same matching
    # configuration as the aggregation itself.
    metrics_html = compare_aggregated_with_gt(
        GT_ORIGINAL_PATH,
        "aggregate.json",
        fuzzy_threshold=fuzzy_threshold,
        semantic_threshold=semantic_threshold,
        frequency_threshold=threshold
    )
    logging.info(f"✅ Visualisation générée: {src}")
    return "aggregate.json", iframe_html, metrics_html  # 3 outputs
def export_mindmaps_to_json(files, model_choice="Gemini", output_dir=None):
    """
    Export one JSON graph per uploaded file, ready for aggregation.

    Args:
        files (list): Uploaded files (Gradio file objects or plain path strings).
        model_choice (str): Key of the model used to generate the Mermaid
            code; unknown names fall back to Gemini.
        output_dir (str, optional): Destination folder; defaults to
            OUTPUT_JSON_DIR.

    Returns:
        list: Paths of the JSON files actually written; files that fail are
        logged and skipped (the batch continues).
    """
    if output_dir is None:
        output_dir = OUTPUT_JSON_DIR
    os.makedirs(output_dir, exist_ok=True)
    # Model-name -> generation-function dispatch table.
    model_functions = {
        "Gemini": generate_mermaid_from_image_gemini,
        "LLAMA": generate_mermaid_from_llama,
        "GEMMA": generate_mermaid_from_gemma,
        "QWEN": generate_mermaid_from_qwen,
        "NVIDIA": generate_mermaid_from_nvidia,
        "InternVL": generate_mermaid_from_intern,
    }
    mermaid_fn = model_functions.get(model_choice, generate_mermaid_from_image_gemini)
    saved_paths = []
    logging.info("=" * 70)
    logging.info(f"📤 EXPORT JSON - Modèle: {model_choice}")
    logging.info(f"📁 Dossier: {output_dir}")
    logging.info(f"📄 Fichiers: {len(files) if files else 0}")
    logging.info("=" * 70)
    if not files:
        logging.warning("⚠️ Aucun fichier à exporter")
        return []
    # Process every file; a failure on one file does not stop the batch.
    for idx, file in enumerate(files, 1):
        try:
            # Gradio file objects expose .name; plain strings are paths.
            file_path = file.name if hasattr(file, 'name') else file
            logging.info(f"\n[{idx}/{len(files)}] Traitement: {os.path.basename(file_path)}")
            # Convert to an image when the input is a PDF.
            img = file_path
            if file_path.lower().endswith(".pdf"):
                logging.info(" 📄 → 🖼️ Conversion PDF...")
                img = convert_pdf_to_image(file_path)
            # Base64 encoding for the vision model.
            logging.info(" 🔐 Encodage...")
            b64 = encode_image(img)
            # Mermaid generation by the selected model.
            logging.info(f" 🤖 Génération avec {model_choice}...")
            mermaid = mermaid_fn(b64)
            # Mermaid -> JSON graph conversion.
            logging.info(" 🔄 Conversion JSON...")
            graph_json = mermaid_to_json(mermaid)
            # Persist the graph next to the other exports.
            base_name = os.path.splitext(os.path.basename(file_path))[0]
            output_path = os.path.join(output_dir, f"{base_name}.json")
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(graph_json, f, ensure_ascii=False, indent=2)
            saved_paths.append(output_path)
            logging.info(f" ✅ Exporté: {output_path}")
            logging.info(f" • Nœuds: {len(graph_json['nodes'])}")
            logging.info(f" • Arêtes: {len(graph_json['edges'])}")
        except Exception as e:
            logging.error(f" ❌ Erreur: {e}")
            import traceback
            logging.error(traceback.format_exc())
            continue
    logging.info("=" * 70)
    logging.info(f"✅ EXPORT TERMINÉ: {len(saved_paths)}/{len(files)}")
    logging.info("=" * 70)
    return saved_paths
def compare_aggregated_with_gt(gt_path, agg_json_path, fuzzy_threshold=70, semantic_threshold=70,
                               frequency_threshold=40):
    """Compare the aggregated graph with the original ground-truth (GT) graph.

    Pipeline:
      1. Load both JSON graphs ({'nodes': [...], 'edges': [...]}).
      2. Keep only aggregated nodes whose 'freq' field (already expressed
         in %) reaches ``frequency_threshold``, stripping the " (XX.XX%)"
         suffix from their labels.
      3. Match aggregated node labels against GT labels with the hybrid
         fuzzy/semantic matcher (``hybrid_match_optimized``, alpha=0.6).
      4. Compute precision/recall/F1 for nodes and for *undirected* edges,
         then a global score (unweighted mean of the two F1s).

    Args:
        gt_path (str): Path to the ground-truth JSON file.
        agg_json_path (str): Path to the aggregated-graph JSON file.
        fuzzy_threshold (int): Fuzzy-matching threshold (%) for node labels.
        semantic_threshold (int): Semantic-similarity threshold (%).
        frequency_threshold (int): Minimum node 'freq' (%) required to keep
            an aggregated node.

    Returns:
        str: An HTML fragment with the metrics dashboard, or an HTML error
        box when no node passes the frequency filter or an exception occurs.
    """
    try:
        # Load both graphs from disk
        with open(gt_path, 'r', encoding='utf-8') as f:
            gt_data = json.load(f)
        with open(agg_json_path, 'r', encoding='utf-8') as f:
            agg_data = json.load(f)
        gt_nodes = gt_data.get('nodes', [])
        gt_edges = gt_data.get('edges', [])
        all_agg_nodes = agg_data.get('nodes', [])
        all_agg_edges = agg_data.get('edges', [])
        # Filter nodes by frequency (values are already percentages)
        agg_nodes = [n for n in all_agg_nodes if n.get('freq', 0) >= frequency_threshold]
        # Clean the labels (remove the trailing " (XX.XX%)" part).
        # NOTE: this mutates the node dicts loaded from agg_json_path in place.
        for node in agg_nodes:
            label = node.get('label', '')
            if '(' in label:
                # "OOP (33.33%)" → "OOP"
                node['label'] = label.split('(')[0].strip()
        # Diagnostic logs
        logging.info("═" * 70)
        logging.info(f"🔍 FILTRAGE PAR FRÉQUENCE (seuil: {frequency_threshold}%)")
        logging.info("═" * 70)
        logging.info(f"📊 Avant filtrage : {len(all_agg_nodes)} nœuds, {len(all_agg_edges)} arêtes")
        logging.info(f"📊 Après filtrage : {len(agg_nodes)} nœuds")
        if all_agg_nodes:
            sample = all_agg_nodes[0]
            logging.info(f"📊 Exemple nœud brut : {sample}")
            logging.info(f"📊 Valeur 'freq' : {sample.get('freq', 'NON TROUVÉ')}")
        if not agg_nodes:
            # Nothing survived the frequency filter → early HTML error box
            logging.error(f"❌ AUCUN NŒUD après filtrage à {frequency_threshold}%")
            logging.error(f"   → Diminuez le seuil ou vérifiez aggregate.json")
            return '<div style="background:#f8d7da;padding:20px;border-radius:8px;">❌ Aucun nœud ne passe le seuil</div>'
        # Set of IDs of the surviving nodes
        valid_node_ids = {n['id'] for n in agg_nodes}
        # Filter edges (keep only those whose both endpoints survived)
        agg_edges = [
            e for e in all_agg_edges
            if e['source'] in valid_node_ids and e['target'] in valid_node_ids
        ]
        logging.info(f"📊 Arêtes conservées : {len(agg_edges)}")
        # Log the kept nodes (top 10 by decreasing frequency)
        logging.info(f"\n📝 Nœuds conservés (≥{frequency_threshold}%) :")
        for i, n in enumerate(sorted(agg_nodes, key=lambda x: x.get('freq', 0), reverse=True)[:10], 1):
            logging.info(f"   {i}. [{n.get('freq', 0):.1f}%] {n['label']}")
        if len(agg_nodes) > 10:
            logging.info(f"   ... et {len(agg_nodes) - 10} autres")
        logging.info("═" * 70)
        gt_labels = [n["label"] for n in gt_nodes]
        agg_labels = [n["label"] for n in agg_nodes]
        # Hybrid fuzzy + semantic label matching; alpha weights the
        # semantic component vs the fuzzy one.
        matches = hybrid_match_optimized(
            agg_labels, gt_labels,
            fuzzy_threshold=fuzzy_threshold,
            semantic_threshold=semantic_threshold,
            alpha=0.6
        )
        # Greedy one-to-one mapping: each GT label may be claimed only once,
        # in the order aggregated labels appear.
        label_mapping = {}
        matched_gt_labels = set()
        for agg_lbl, (match, combined_score, fuzzy_score, sem_score) in zip(agg_labels, matches):
            if match and match not in matched_gt_labels:
                label_mapping[agg_lbl] = match
                matched_gt_labels.add(match)
        gt_labels_set = set(gt_labels)
        # ---- Node-level metrics ----
        num_common_nodes = len(matched_gt_labels)
        num_missing_nodes = len(gt_labels_set) - num_common_nodes
        num_extra_nodes = len(agg_labels) - num_common_nodes
        node_precision = (num_common_nodes / len(agg_labels) * 100) if agg_labels else 0
        node_recall = (num_common_nodes / len(gt_labels) * 100) if gt_labels else 0
        node_f1 = (2 * node_precision * node_recall / (node_precision + node_recall)) if (
            node_precision + node_recall) > 0 else 0
        # ---- Edge-level metrics ----
        # Edges are compared as *undirected* label pairs (sorted tuples),
        # with aggregated labels remapped onto GT labels first.
        gt_id2lbl = {n["id"]: n["label"] for n in gt_nodes}
        agg_id2lbl = {n["id"]: n["label"] for n in agg_nodes}
        gt_edges_set = set()
        for e in gt_edges:
            src = gt_id2lbl.get(e["source"])
            tgt = gt_id2lbl.get(e["target"])
            if src and tgt:
                gt_edges_set.add(tuple(sorted([src, tgt])))
        agg_edges_set = set()
        for e in agg_edges:
            src = agg_id2lbl.get(e["source"])
            tgt = agg_id2lbl.get(e["target"])
            if src and tgt:
                src_mapped = label_mapping.get(src, src)
                tgt_mapped = label_mapping.get(tgt, tgt)
                # Only count edges whose both endpoints exist in the GT
                if src_mapped in gt_labels_set and tgt_mapped in gt_labels_set:
                    agg_edges_set.add(tuple(sorted([src_mapped, tgt_mapped])))
        common_edges = gt_edges_set & agg_edges_set
        missing_edges = gt_edges_set - agg_edges_set
        extra_edges = agg_edges_set - gt_edges_set
        edge_precision = (len(common_edges) / len(agg_edges_set) * 100) if agg_edges_set else 0
        edge_recall = (len(common_edges) / len(gt_edges_set) * 100) if gt_edges_set else 0
        edge_f1 = (2 * edge_precision * edge_recall / (edge_precision + edge_recall)) if (
            edge_precision + edge_recall) > 0 else 0
        # Global score: unweighted mean of node F1 and edge F1
        global_score = (node_f1 + edge_f1) / 2
        # Final logs
        logging.info("═" * 70)
        logging.info("🔍 RÉSULTATS DE COMPARAISON")
        logging.info("═" * 70)
        logging.info(f"📊 Nœuds GT : {len(gt_labels)} | Agrégé : {len(agg_labels)}")
        logging.info(f"📊 Matchés : {num_common_nodes} | Manquants : {num_missing_nodes} | En trop : {num_extra_nodes}")
        logging.info(
            f"📊 Arêtes GT : {len(gt_edges_set)} | Agrégé : {len(agg_edges_set)} | Communes : {len(common_edges)}")
        logging.info(f"🎯 Score global : {global_score:.1f}%")
        logging.info(f"   • F1 nœuds : {node_f1:.1f}%")
        logging.info(f"   • F1 arêtes : {edge_f1:.1f}%")
        logging.info("═" * 70)
        # Colour/icon/label bucket for the global score
        if global_score >= 80:
            score_color = "#10b981"
            score_icon = "✅"
            score_label = "Excellent"
        elif global_score >= 60:
            score_color = "#3b82f6"
            score_icon = "👍"
            score_label = "Bon"
        elif global_score >= 40:
            score_color = "#f59e0b"
            score_icon = "⚠️"
            score_label = "Moyen"
        else:
            score_color = "#ef4444"
            score_icon = "❌"
            score_label = "Faible"
        # Generate the HTML dashboard (same code as before)
        metrics_html = f"""
        <div style="background: white; border: 2px solid #e0e0e0; border-radius: 12px; padding: 24px; margin: 20px 0; box-shadow: 0 4px 12px rgba(0,0,0,0.1);">
            <!-- Score Global -->
            <div style="text-align: center; margin-bottom: 24px; padding: 20px; background: linear-gradient(135deg, {score_color}15 0%, {score_color}05 100%); border-radius: 8px; border: 2px solid {score_color};">
                <div style="font-size: 3em; margin-bottom: 8px;">{score_icon}</div>
                <div style="font-size: 2.5em; font-weight: 700; color: {score_color}; margin-bottom: 4px;">{global_score:.1f}%</div>
                <div style="font-size: 1.1em; color: #555; font-weight: 600;">{score_label} - Correspondance avec le GT Original</div>
                <div style="font-size: 0.85em; color: #888; margin-top: 8px;">Marge d'amélioration : {100 - global_score:.1f}%</div>
                <div style="font-size: 0.75em; color: #999; margin-top: 4px; font-style: italic;">
                    Seuils: Fréquence {frequency_threshold}% | Fuzzy {fuzzy_threshold}% | Sémantique {semantic_threshold}%
                </div>
                <div style="font-size: 0.7em; color: #666; margin-top: 4px; padding: 6px; background: #f5f5f5; border-radius: 4px;">
                    ℹ️ Nœuds filtrés : {len(agg_nodes)}/{len(all_agg_nodes)} • Arêtes filtrées : {len(agg_edges)}/{len(all_agg_edges)}
                </div>
            </div>
            <!-- Métriques Détaillées -->
            <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 16px; margin-bottom: 20px;">
                <!-- Nœuds -->
                <div style="background: #f8f9fa; padding: 16px; border-radius: 8px; border-left: 4px solid #667eea;">
                    <h4 style="margin: 0 0 12px 0; color: #667eea; font-size: 1em;">📊 Nœuds</h4>
                    <div style="margin-bottom: 8px;">
                        <span style="color: #888; font-size: 0.85em;">Précision</span>
                        <div style="display: flex; align-items: center; gap: 8px;">
                            <div style="flex: 1; height: 8px; background: #e0e0e0; border-radius: 4px; overflow: hidden;">
                                <div style="width: {node_precision}%; height: 100%; background: #667eea;"></div>
                            </div>
                            <span style="font-weight: 700; color: #667eea; min-width: 50px;">{node_precision:.1f}%</span>
                        </div>
                    </div>
                    <div style="margin-bottom: 8px;">
                        <span style="color: #888; font-size: 0.85em;">Rappel</span>
                        <div style="display: flex; align-items: center; gap: 8px;">
                            <div style="flex: 1; height: 8px; background: #e0e0e0; border-radius: 4px; overflow: hidden;">
                                <div style="width: {node_recall}%; height: 100%; background: #10b981;"></div>
                            </div>
                            <span style="font-weight: 700; color: #10b981; min-width: 50px;">{node_recall:.1f}%</span>
                        </div>
                    </div>
                    <div>
                        <span style="color: #888; font-size: 0.85em;">F1-Score</span>
                        <div style="display: flex; align-items: center; gap: 8px;">
                            <div style="flex: 1; height: 8px; background: #e0e0e0; border-radius: 4px; overflow: hidden;">
                                <div style="width: {node_f1}%; height: 100%; background: #9b59b6;"></div>
                            </div>
                            <span style="font-weight: 700; color: #9b59b6; min-width: 50px;">{node_f1:.1f}%</span>
                        </div>
                    </div>
                </div>
                <!-- Arêtes -->
                <div style="background: #f8f9fa; padding: 16px; border-radius: 8px; border-left: 4px solid #f59e0b;">
                    <h4 style="margin: 0 0 12px 0; color: #f59e0b; font-size: 1em;">🔗 Arêtes</h4>
                    <div style="margin-bottom: 8px;">
                        <span style="color: #888; font-size: 0.85em;">Précision</span>
                        <div style="display: flex; align-items: center; gap: 8px;">
                            <div style="flex: 1; height: 8px; background: #e0e0e0; border-radius: 4px; overflow: hidden;">
                                <div style="width: {edge_precision}%; height: 100%; background: #667eea;"></div>
                            </div>
                            <span style="font-weight: 700; color: #667eea; min-width: 50px;">{edge_precision:.1f}%</span>
                        </div>
                    </div>
                    <div style="margin-bottom: 8px;">
                        <span style="color: #888; font-size: 0.85em;">Rappel</span>
                        <div style="display: flex; align-items: center; gap: 8px;">
                            <div style="flex: 1; height: 8px; background: #e0e0e0; border-radius: 4px; overflow: hidden;">
                                <div style="width: {edge_recall}%; height: 100%; background: #10b981;"></div>
                            </div>
                            <span style="font-weight: 700; color: #10b981; min-width: 50px;">{edge_recall:.1f}%</span>
                        </div>
                    </div>
                    <div>
                        <span style="color: #888; font-size: 0.85em;">F1-Score</span>
                        <div style="display: flex; align-items: center; gap: 8px;">
                            <div style="flex: 1; height: 8px; background: #e0e0e0; border-radius: 4px; overflow: hidden;">
                                <div style="width: {edge_f1}%; height: 100%; background: #9b59b6;"></div>
                            </div>
                            <span style="font-weight: 700; color: #9b59b6; min-width: 50px;">{edge_f1:.1f}%</span>
                        </div>
                    </div>
                </div>
            </div>
            <!-- Statistiques -->
            <div style="background: #fafafa; padding: 16px; border-radius: 8px;">
                <h4 style="margin: 0 0 12px 0; color: #555; font-size: 0.95em;">📈 Statistiques Détaillées</h4>
                <div style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 12px; font-size: 0.85em;">
                    <div>
                        <div style="color: #888;">GT Original</div>
                        <div style="font-weight: 600; color: #2c3e50;">{len(gt_nodes)} nœuds, {len(gt_edges_set)} arêtes</div>
                    </div>
                    <div>
                        <div style="color: #888;">Agrégé (filtré)</div>
                        <div style="font-weight: 600; color: #2c3e50;">{len(agg_nodes)} nœuds, {len(agg_edges_set)} arêtes</div>
                    </div>
                    <div>
                        <div style="color: #888;">Communs</div>
                        <div style="font-weight: 600; color: #10b981;">{num_common_nodes} nœuds, {len(common_edges)} arêtes</div>
                    </div>
                    <div>
                        <div style="color: #888;">Manquants</div>
                        <div style="font-weight: 600; color: #ef4444;">{num_missing_nodes} nœuds, {len(missing_edges)} arêtes</div>
                    </div>
                    <div>
                        <div style="color: #888;">En trop</div>
                        <div style="font-weight: 600; color: #f59e0b;">{num_extra_nodes} nœuds, {len(extra_edges)} arêtes</div>
                    </div>
                    <div>
                        <div style="color: #888;">Couverture</div>
                        <div style="font-weight: 600; color: #3b82f6;">{node_recall:.1f}% du GT</div>
                    </div>
                </div>
            </div>
            <!-- Recommandations -->
            <div style="margin-top: 16px; padding: 12px; background: #eff6ff; border-left: 4px solid #3b82f6; border-radius: 4px;">
                <div style="font-weight: 600; color: #1e40af; margin-bottom: 4px;">💡 Recommandations</div>
                <div style="font-size: 0.85em; color: #3b82f6;">
                    {'✅ Excellent ! Le graphe agrégé est très fidèle au GT original.' if global_score >= 80 else
                     '👍 Bon résultat. Ajustez les seuils de clustering pour améliorer la couverture.' if global_score >= 60 else
                     f'⚠️ Résultat moyen. Diminuez le seuil de fréquence (actuellement {frequency_threshold}%) ou les seuils fuzzy/sémantique pour capturer plus de nœuds.' if global_score >= 40 else
                     f'❌ Faible correspondance. Diminuez drastiquement le seuil de fréquence (actuellement {frequency_threshold}%) et les seuils fuzzy/sémantique.'}
                </div>
            </div>
        </div>
        """
        return metrics_html
    except Exception as e:
        logging.error(f"❌ Erreur comparaison agrégé/GT : {e}")
        import traceback
        logging.error(traceback.format_exc())
        return f"""
        <div style="background:#f8d7da;border:1px solid #f5c6cb;border-radius:8px;padding:20px;text-align:center;">
            <h3 style="color:#721c24;">❌ Erreur de comparaison</h3>
            <p style="color:#721c24;font-family:monospace;font-size:0.9em;">{str(e)}</p>
        </div>
        """
# ============================================
# SECTION 9: AUXILIARY FUNCTIONS FOR THE INTERFACE
# ============================================
def load_and_visualize_gt_original(gt_path=None):
    """Load the original ground-truth graph and build its visualisation.

    Args:
        gt_path (str): Path to the original GT JSON file. When falsy, the
            module-level GT_ORIGINAL_PATH default is used.

    Returns:
        str: An HTML iframe snippet on success, otherwise an HTML error box
        (file missing or load failure).
    """
    # Fall back to the configured default path when none is supplied.
    if not gt_path:
        gt_path = GT_ORIGINAL_PATH
    if not os.path.exists(gt_path):
        return f"""
        <div style="background:#fff3cd;border:1px solid #ffc107;border-radius:8px;padding:20px;text-align:center;">
            <h3 style="color:#856404;">⚠️ Fichier GT introuvable</h3>
            <p style="color:#856404;">Chemin : {gt_path}</p>
            <p style="color:#856404;font-size:0.9em;">Vérifiez que le fichier existe ou modifiez GT_ORIGINAL_PATH dans le code</p>
        </div>
        """
    try:
        # Read the GT graph from disk.
        with open(gt_path, 'r', encoding='utf-8') as fh:
            graph = json.load(fh)
        node_count = len(graph.get('nodes', []))
        edge_count = len(graph.get('edges', []))
        logging.info(
            f"✅ GT original chargé: {node_count} nœuds, {edge_count} arêtes")
        # Persist a temporary copy used by the visualisation page.
        with open("temp_gt_original.json", 'w', encoding='utf-8') as fh:
            json.dump(graph, fh, ensure_ascii=False, indent=2)
        # Cache-busting timestamp appended to the iframe URL.
        src = f"http://127.0.0.1:8000/visual_gt_original.html?t={int(time.time())}"
        return f"""
        <div style="border:1px solid #e0e0e0;border-radius:8px;overflow:hidden;background:white;">
            <div style="background:linear-gradient(135deg, #10b981 0%, #059669 100%);padding:12px;border-bottom:1px solid #e0e0e0;">
                <h4 style="margin:0;color:white;">
                    📘 Ground Truth Original (Professeur)
                    <span style="color:#d1fae5;font-size:0.85em;font-weight:normal;">
                        ({node_count} nœuds, {edge_count} arêtes)
                    </span>
                </h4>
                <p style="margin:5px 0 0 0;font-size:0.75em;color:#d1fae5;">
                    Référence : {os.path.basename(gt_path)}
                </p>
            </div>
            <iframe src="{src}"
                    style="width:100%;height:700px;border:none;background:white;display:block;">
            </iframe>
        </div>
        """
    except Exception as e:
        logging.error(f"❌ Erreur lors du chargement du GT: {e}")
        return f"""
        <div style="background:#f8d7da;border:1px solid #f5c6cb;border-radius:8px;padding:20px;text-align:center;">
            <h3 style="color:#721c24;">❌ Erreur de chargement</h3>
            <p style="color:#721c24;font-family:monospace;font-size:0.9em;">{str(e)}</p>
        </div>
        """
def download_aggregated_graph():
    """Return the path of the aggregated-graph JSON file, or None if absent.

    Returns:
        str | None: "aggregated_graph.json" when the file exists in the
        working directory, otherwise None (also on unexpected errors).
    """
    path = "aggregated_graph.json"
    try:
        return path if os.path.exists(path) else None
    except Exception as exc:
        # Defensive: os.path.exists is unlikely to raise, but keep the
        # original best-effort behaviour of logging and returning None.
        logging.error(f"Erreur lors du téléchargement du graphe agrégé: {exc}")
        return None
def generate_pdf_report():
    """Return the most recent PDF report, generating a placeholder if none exists.

    Looks for existing reports under ``reports/`` and returns the most
    recently modified one. When no report exists, a minimal placeholder PDF
    is generated under ``reports/rapport_vide.pdf`` and its path returned.

    Returns:
        str | None: Path to a PDF report, or None on error.
    """
    try:
        # Reuse an existing report when available.
        if os.path.exists("reports"):
            report_files = glob.glob("reports/*.pdf")
            if report_files:
                # BUGFIX: glob order is arbitrary/filesystem-dependent, so
                # report_files[-1] was not necessarily the latest report.
                # Pick the newest file by modification time instead.
                return max(report_files, key=os.path.getmtime)
        # Otherwise generate an empty placeholder report.
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Arial", size=16)
        pdf.cell(200, 10, txt="Rapport Doc2GL", ln=True, align='C')
        pdf.set_font("Arial", size=12)
        pdf.cell(200, 10, txt="Aucune donnée à afficher", ln=True, align='C')
        os.makedirs("reports", exist_ok=True)
        report_path = "reports/rapport_vide.pdf"
        pdf.output(report_path)
        return report_path
    except Exception as e:
        logging.error(f"Erreur lors de la génération du rapport PDF: {e}")
        return None
# ============================================
# SECTION 10: GRADIO INTERFACE
# ============================================
def create_auth_interface():
    """Build a simple standalone login interface (gr.Blocks).

    Credentials: the username must be "admin" and the password must match
    the APP_PASSWORD environment variable. When APP_PASSWORD is empty or
    unset, any login attempt is accepted.

    Returns:
        gr.Blocks: The authentication demo, ready to be launched.
    """
    # Password read once at build time from the environment.
    app_password = os.environ.get("APP_PASSWORD", "")
    def check_login(username, password):
        # No password configured → open access.
        if not app_password:
            return True, "Accès autorisé"
        if username == "admin" and password == app_password:
            return True, "Accès autorisé"
        return False, "Identifiants incorrects"
    with gr.Blocks(
        title="Doc2GL - Connexion",
        css=""".auth-container {
            max-width: 400px;
            margin: 100px auto;
            padding: 40px;
            background: #18181b;
            border: 1px solid #27272a;
            border-radius: 16px;
            text-align: center;
        }
        .auth-title {
            font-size: 2.5em;
            font-weight: bold;
            margin-bottom: 10px;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
        }
        .auth-subtitle {
            color: #a1a1aa;
            margin-bottom: 30px;
        }
        """,
        theme=gr.themes.Soft()
    ) as auth_demo:
        with gr.Column(elem_classes=["auth-container"]):
            gr.HTML('<h1 class="auth-title">🧠 Doc2GL</h1>')
            gr.HTML('<p class="auth-subtitle">Document to Graph Learning</p>')
            with gr.Column():
                username = gr.Textbox(
                    label="Nom d'utilisateur",
                    placeholder="admin",
                    value="admin"
                )
                password = gr.Textbox(
                    label="Mot de passe",
                    type="password",
                    placeholder="AZERTY123"
                )
                login_btn = gr.Button(
                    "🔐 Se connecter",
                    variant="primary",
                    size="lg"
                )
                result = gr.HTML("")
                # NOTE(review): check_login returns a (bool, str) tuple but
                # only one output component is wired here — verify how Gradio
                # renders this (a 2-output mapping may have been intended).
                login_btn.click(
                    fn=check_login,
                    inputs=[username, password],
                    outputs=result
                )
    return auth_demo
def gradio_interface():
"""Point d'entrée principal pour l'interface Gradio."""
# Vérifier si un mot de passe est configuré
app_password = os.environ.get("APP_PASSWORD", "")
# ═════════════════════════════════════════════════════════════════════
# CSS PERSONNALISÉ (Variables CSS pour personnalisation facile)
# ═════════════════════════════════════════════════════════════════════
custom_css = r"""
@import url('https://fonts.googleapis.com/css2?family=Inter:ital,opsz,wght@0,14..32,100..900;1,14..32,100..900&display=swap');
/* ===== GLOBAL RESET ===== */
*, *::before, *::after { box-sizing: border-box; margin: 0; }
:root {
--bg: #09090b;
--surface: #18181b;
--surface-2: #1f1f23;
--surface-3: #27272a;
--border: #27272a;
--border-light: #3f3f46;
--text: #fafafa;
--text-2: #a1a1aa;
--text-3: #71717a;
--text-4: #52525b;
--violet-400: #a78bfa;
--violet-500: #8b5cf6;
--violet-600: #7c3aed;
--violet-900: rgba(139,92,246,0.12);
--emerald-400: #34d399;
--emerald-500: #10b981;
--emerald-600: #059669;
--emerald-900: rgba(52,211,153,0.12);
--amber-900: rgba(245,158,11,0.15);
--amber-400: #fbbf24;
--font: 'Inter', system-ui, -apple-system, sans-serif;
}
/* ===== AUTH CONTAINER ===== */
.auth-container {
max-width: 400px;
margin: 100px auto;
padding: 40px;
background: var(--surface);
border: 1px solid var(--border);
border-radius: 16px;
text-align: center;
}
.auth-title {
font-size: 2.5em;
font-weight: bold;
margin-bottom: 10px;
background: linear-gradient(135deg, var(--violet-400), var(--emerald-400));
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
}
.auth-subtitle {
color: var(--text-3);
margin-bottom: 30px;
}
/* ===== BASE ===== */
body {
background: var(--bg) !important;
color: var(--text) !important;
font-family: var(--font) !important;
-webkit-font-smoothing: antialiased;
}
.gradio-container {
background: var(--bg) !important;
max-width: 1100px !important;
margin: 0 auto !important;
padding: 0 32px 60px !important;
font-family: var(--font) !important;
color: var(--text) !important;
}
/* ===== HIDE GRADIO CHROME ===== */
footer, .built-with, .show-api, .svelte-1ed2p3z,
.gradio-container > .prose,
#component-0 > .prose { display: none !important; }
.gradio-container .gap { gap: 16px !important; }
.block { border: none !important; box-shadow: none !important; background: transparent !important; padding: 0 !important; }
.form { background: transparent !important; border: none !important; gap: 20px !important; }
.panel { background: transparent !important; border: none !important; }
/* ===== HERO ===== */
.hero {
position: relative;
background: var(--surface);
border: 1px solid var(--border);
border-radius: 28px;
padding: 72px 48px 64px;
margin: 16px 0 40px;
text-align: center;
overflow: hidden;
}
.hero::before {
content: '';
position: absolute; inset: 0;
background:
radial-gradient(ellipse 80% 60% at 50% -10%, rgba(139,92,246,0.25), transparent),
radial-gradient(ellipse 60% 50% at 80% 110%, rgba(52,211,153,0.1), transparent);
pointer-events: none;
}
.hero-badge {
display: inline-flex; align-items: center; gap: 8px;
background: rgba(255,255,255,0.05);
border: 1px solid var(--border-light);
padding: 7px 18px;
border-radius: 100px;
font-size: 0.8rem; font-weight: 500;
color: var(--text-2);
margin-bottom: 24px;
}
.hero-badge .live {
width: 8px; height: 8px;
background: var(--emerald-400);
border-radius: 50%;
box-shadow: 0 0 8px var(--emerald-400);
animation: blink 2.5s ease-in-out infinite;
}
@keyframes blink {
0%,100% { opacity:1; } 50% { opacity:0.3; }
}
.hero h1 {
color: var(--text) !important;
font-size: 3rem;
font-weight: 800;
letter-spacing: -0.04em;
line-height: 1;
margin: 0 0 12px;
}
.hero h1 span {
background: linear-gradient(135deg, var(--violet-400), var(--emerald-400));
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
}
.hero p {
color: var(--text-3) !important;
font-size: 1.1rem;
font-weight: 400;
margin: 0;
line-height: 1.6;
}
/* ===== CARDS ===== */
.card-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 14px; margin-bottom: 24px; }
.card {
background: var(--surface);
border: 1px solid var(--border);
border-radius: 16px;
padding: 24px;
transition: all 0.15s ease;
}
.card:hover { border-color: var(--border-light); box-shadow: 0 4px 20px rgba(0,0,0,0.3); }
.card-icon {
width: 40px; height: 40px;
border-radius: 12px;
display: flex; align-items: center; justify-content: center;
font-size: 1.1rem;
margin-bottom: 14px;
}
.card-icon.violet { background: var(--violet-900); }
.card-icon.emerald { background: var(--emerald-900); }
.card h3 { font-size: 0.92rem; font-weight: 700; color: var(--text); margin: 0 0 6px; }
.card p { font-size: 0.82rem; color: var(--text-3); line-height: 1.55; margin: 0; }
/* ===== SECTION ===== */
.section-wrap {
background: var(--surface);
border: 1px solid var(--border);
border-radius: 20px;
padding: 28px;
margin: 24px 0;
}
.section-label {
display: inline-block;
font-size: 0.68rem; font-weight: 700;
text-transform: uppercase; letter-spacing: 0.08em;
padding: 4px 12px;
border-radius: 100px;
margin-bottom: 16px;
}
.section-label.optional { background: var(--amber-900); color: var(--amber-400); }
.section-label.results { background: var(--violet-900); color: var(--violet-400); }
.section-title {
font-size: 1.05rem; font-weight: 700; color: var(--text);
margin: 0 0 4px;
}
.section-desc {
font-size: 0.84rem; color: var(--text-3); margin: 0 0 20px; line-height: 1.55;
}
/* ===== BUTTONS ===== */
button, .gr-button { font-family: var(--font) !important; cursor: pointer; }
button.primary-button, .primary-button > button {
background: var(--text) !important;
color: var(--bg) !important;
font-size: 0.92rem !important;
padding: 13px 28px !important;
font-weight: 600 !important;
border-radius: 12px !important;
border: none !important;
box-shadow: 0 0 0 1px rgba(255,255,255,0.1) !important;
transition: all 0.15s ease !important;
letter-spacing: -0.01em !important;
}
button.primary-button:hover, .primary-button > button:hover {
background: #e4e4e7 !important;
box-shadow: 0 4px 16px rgba(255,255,255,0.06) !important;
transform: translateY(-1px) !important;
}
button.primary-button:active, .primary-button > button:active {
transform: translateY(0) !important;
}
button.secondary-button, .secondary-button > button {
background: var(--surface-2) !important;
color: var(--text-2) !important;
font-weight: 600 !important;
border-radius: 12px !important;
border: 1px solid var(--border) !important;
box-shadow: none !important;
transition: all 0.15s ease !important;
font-size: 0.88rem !important;
}
button.secondary-button:hover, .secondary-button > button:hover {
background: var(--surface-3) !important;
color: var(--text) !important;
border-color: #52525b !important;
}
/* ===== TABS ===== */
.tabs { margin-top: 0 !important; }
.tabs > .tab-nav {
background: var(--surface) !important;
border: 1px solid var(--border) !important;
border-radius: 14px !important;
padding: 5px !important;
gap: 4px !important;
margin-bottom: 16px !important;
display: inline-flex !important;
}
.tabs > .tab-nav > button {
background: transparent !important;
color: var(--text-3) !important;
border: none !important;
border-radius: 10px !important;
padding: 10px 20px !important;
font-weight: 600 !important;
font-size: 0.84rem !important;
transition: all 0.12s ease !important;
}
.tabs > .tab-nav > button.selected {
background: var(--surface-2) !important;
color: var(--text) !important;
}
.tabitem {
background: var(--surface) !important;
border-radius: 20px !important;
padding: 32px !important;
border: 1px solid var(--border) !important;
}
/* ===== FORM ELEMENTS ===== */
input, textarea, select {
background: var(--surface-2) !important;
color: var(--text) !important;
border: 1px solid var(--border) !important;
border-radius: 10px !important;
font-family: var(--font) !important;
font-size: 0.9rem !important;
padding: 10px 14px !important;
transition: all 0.12s ease !important;
}
input:focus, textarea:focus, select:focus {
border-color: var(--violet-500) !important;
box-shadow: 0 0 0 2px rgba(139,92,246,0.1) !important;
outline: none !important;
}
.wrap { background: var(--surface-2) !important; border: 1px solid var(--border) !important; border-radius: 10px !important; }
label, .label-wrap > span {
color: var(--text-2) !important;
font-weight: 600 !important;
font-size: 0.82rem !important;
}
.gr-check-radio { accent-color: var(--violet-500) !important; }
.gradio-slider input[type="range"] { accent-color: var(--violet-500) !important; }
.info { color: var(--text-4) !important; font-size: 0.78rem !important; }
/* ===== ACCORDION ===== */
.gradio-accordion {
background: var(--surface) !important;
border: 1px solid var(--border) !important;
border-radius: 14px !important;
margin: 16px 0 !important;
overflow: hidden;
}
.gradio-accordion > .label-wrap {
padding: 16px 20px !important;
font-weight: 600 !important;
background: var(--surface-2) !important;
}
/* ===== HELP NOTES ===== */
.help-note { display: none; }
/* ===== GRAPH PANELS ===== */
.graph-panel-header {
padding: 14px 20px;
border-radius: 14px 14px 0 0;
text-align: center;
font-weight: 700;
font-size: 0.88rem;
letter-spacing: -0.01em;
}
.graph-panel-header.gt {
background: linear-gradient(135deg, var(--emerald-500), var(--emerald-600));
color: white;
}
.graph-panel-header.test {
background: linear-gradient(135deg, var(--violet-500), var(--violet-600));
color: white;
}
.graph-panel-header.agg {
background: linear-gradient(135deg, #f59e0b, #d97706);
color: white;
}
.graph-placeholder {
background: var(--surface-2);
border: 2px dashed var(--border);
border-radius: 14px;
padding: 48px;
text-align: center;
color: var(--text-4);
display: flex;
flex-direction: column;
align-items: center; justify-content: center;
gap: 16px;
font-size: 0.88rem; font-weight: 500;
}
.graph-placeholder svg {
width: 40px; height: 40px;
stroke: var(--text-4);
}
/* ===== FILE UPLOAD ===== */
.gradio-file, .gradio-files { border-radius: 14px !important; }
/* ===== GALLERY & PLOTS ===== */
.gradio-gallery { border-radius: 14px !important; overflow: hidden; border: 1px solid var(--border) !important; }
.gradio-plot { background: var(--surface) !important; border-radius: 14px !important; border: 1px solid var(--border) !important; }
/* ===== SCROLLBAR ===== */
::-webkit-scrollbar { width: 5px; height: 5px; }
::-webkit-scrollbar-track { background: transparent; }
::-webkit-scrollbar-thumb { background: var(--surface-3); border-radius: 10px; }
/* ===== RESPONSIVE ===== */
@media (max-width: 768px) {
.card-grid { grid-template-columns: 1fr; }
.hero h1 { font-size: 2rem; }
.hero { padding: 48px 24px 40px; border-radius: 20px; }
.gradio-container { padding: 0 16px 40px !important; }
}
"""
# ═════════════════════════════════════════════════════════════════════
# CONSTRUCTION DE L'INTERFACE (AVEC OU SANS AUTH)
# ═════════════════════════════════════════════════════════════════════
with gr.Blocks(title="Doc2GL", css=custom_css, theme=gr.themes.Base()) as demo:
# État d'authentification
authenticated = gr.State(False)
# ═════════════════════════════════════════════════════════════════════
# CONTENEUR D'AUTHENTIFICATION (visible si non authentifié)
# ═════════════════════════════════════════════════════════════════════
with gr.Column(visible=bool(app_password), elem_id="login_container") as login_container:
with gr.Column(elem_classes=["auth-container"]):
gr.HTML('<h1 class="auth-title">🧠 Doc2GL</h1>')
gr.HTML('<p class="auth-subtitle">Document to Graph Learning</p>')
with gr.Column():
username = gr.Textbox(
label="Nom d'utilisateur",
placeholder="admin",
value="admin"
)
password = gr.Textbox(
label="Mot de passe",
type="password",
placeholder="Entrez votre mot de passe"
)
login_btn = gr.Button(
"🔐 Se connecter",
variant="primary",
size="lg"
)
auth_result = gr.HTML("")
# ═════════════════════════════════════════════════════════════════════
# CONTENEUR PRINCIPAL (visible si authentifié ou si pas de mot de passe)
# ═════════════════════════════════════════════════════════════════════
with gr.Column(visible=not bool(app_password), elem_id="app_container") as app_container:
# Variable d'état (non utilisée dans ce code mais disponible)
redirect_state = gr.State("")
# ─────────────────────────────────────────────────────────────────
# HEADER
# ─────────────────────────────────────────────────────────────────
gr.HTML('''
<div class="hero">
<div class="hero-badge"><span class="live"></span> Document Intelligence</div>
<h1>Doc<span>2</span>GL</h1>
<p>Transformez vos documents en graphes de connaissances intelligents</p>
</div>
''')
# ─────────────────────────────────────────────────────────────────
# ÉTAPES 1-2: Upload et Modèle
# ─────────────────────────────────────────────────────────────────
gr.HTML('''
<div class="card-grid">
<div class="card">
<div class="card-icon violet">&#128196;</div>
<h3>Importez vos documents</h3>
<p>Glissez vos PDF ou images ci-dessous pour commencer l'analyse.</p>
</div>
<div class="card">
<div class="card-icon emerald">&#129302;</div>
<h3>Choisissez votre mod&egrave;le</h3>
<p>S&eacute;lectionnez le mod&egrave;le d'IA le plus adapt&eacute; &agrave; vos documents.</p>
</div>
</div>
''')
# Zone d'upload et sélection du modèle (côte à côte)
with gr.Row():
with gr.Column(scale=1):
file_input = gr.Files(
label="📁 Documents (PDF/Images)",
file_types=[".pdf", ".jpg", ".jpeg", ".png"]
)
with gr.Column(scale=1):
model_selector = gr.Dropdown(
label="🤖 Modèle d'IA",
choices=[
"Gemini 1.5 Flash",
"Gemma 2 9B",
"LLaMA 3 8B",
"Qwen 2 7B",
"NVIDIA Nemotron Nano 12B",
"InternVL 2 8B"
],
value="Gemini 1.5 Flash",
info="Choisissez le modèle pour générer les graphes"
)
# Options de traitement
with gr.Row():
use_semantic_checkbox = gr.Checkbox(
label="🧠 Utiliser la similarité sémantique",
value=True,
info="Combine fuzzy matching et embeddings pour une meilleure précision"
)
with gr.Accordion("⚙️ Paramètres avancés", open=False):
with gr.Row():
fuzzy_threshold_slider = gr.Slider(
label="🎯 Seuil fuzzy (%)",
minimum=60,
maximum=100,
value=80,
step=5,
info="Seuil de similarité textuelle (fuzzy matching)"
)
semantic_threshold_slider = gr.Slider(
label="🔍 Seuil sémantique (%)",
minimum=50,
maximum=90,
value=70,
step=5,
info="Seuil de similarité sémantique (embeddings)"
)
with gr.Row():
alpha_slider = gr.Slider(
label="⚖️ Alpha (poids sémantique)",
minimum=0.0,
maximum=1.0,
value=0.6,
step=0.1,
info="Poids du matching sémantique vs fuzzy (0=fuzzy seul, 1=sémantique seul)"
)
edge_mode_radio = gr.Radio(
label="🔗 Mode de détection des arêtes",
choices=["hybrid", "co-occurrence", "semantic"],
value="hybrid",
info="Stratégie pour détecter les relations entre entités"
)
# Bouton de génération
generate_btn = gr.Button(
"🚀 Générer les graphes",
variant="primary",
size="lg",
elem_classes=["primary-button"]
)
# ─────────────────────────────────────────────────────────────────
# RÉSULTATS (Galerie + Mermaid)
# ─────────────────────────────────────────────────────────────────
gr.HTML('''
<div class="section-wrap">
<span class="section-label results">📊 Résultats</span>
<h3 class="section-title">Graphes générés</h3>
<p class="section-desc">Visualisez les graphes extraits de vos documents</p>
</div>
''')
image_preview = gr.Gallery(label="📷 Pages analysées", show_label=True, elem_id="gallery", columns=2, height="auto", allow_preview=True)
mermaid_output = gr.Textbox(label="📝 Code Mermaid", lines=10, interactive=True, placeholder="Le code Mermaid apparaîtra ici après génération...")
# Dropdown pour sélectionner un document
docs_dropdown = gr.Dropdown(
label="📄 Sélectionner un document",
choices=[],
info="Choisissez un document pour visualiser son graphe détaillé"
)
# Conteneur pour les iframes (stockage interne)
hidden_iframes = gr.State({})
# Panneaux pour les graphes GT et Test
with gr.Row():
with gr.Column():
gt_graph_panel = gr.HTML(
value='<div class="graph-placeholder">Sélectionnez un document pour voir le graphe GT</div>',
label="📊 Graphe Ground Truth"
)
with gr.Column():
test_graph_panel = gr.HTML(
value='<div class="graph-placeholder">Sélectionnez un document pour voir le graphe Test</div>',
label="🔬 Graphe Généré"
)
# Résumé des performances
comparison_result_html = gr.HTML(
value='<div style="text-align:center;padding:24px;color:var(--text-4);font-size:0.86rem;font-weight:500;">Les métriques apparaîtront ici après génération</div>',
label="📈 Résumé"
)
# Statut du rapport
report_status = gr.HTML(
value="",
visible=False
)
# ─────────────────────────────────────────────────────────────────
# ONGLETS : Vue agrégée vs GT Original + Performances
# ─────────────────────────────────────────────────────────────────
with gr.Tabs(elem_classes=["tabs"]):
with gr.TabItem("📊 Vue agrégée"):
gr.HTML('''
<div class="section-wrap">
<span class="section-label optional">🔗 Vue agrégée</span>
<h3 class="section-title">Graphe consolidé</h3>
<p class="section-desc">Vue d'ensemble de tous les graphes combinés</p>
</div>
''')
aggregated_graph = gr.HTML(
value='<div class="graph-placeholder">Les graphes agrégés apparaîtront ici après traitement</div>',
label="🔗 Graphe agrégé"
)
with gr.Row():
aggregate_btn = gr.Button(
"🔗 Agréger les graphes",
variant="secondary",
elem_classes=["secondary-button"]
)
download_agg_btn = gr.Button(
"📥 Télécharger (JSON)",
variant="secondary",
elem_classes=["secondary-button"]
)
with gr.TabItem("📈 Performances"):
gr.HTML('''
<div class="section-wrap">
<span class="section-label results">📈 Performances</span>
<h3 class="section-title">Métriques détaillées</h3>
<p class="section-desc">Analyse quantitative de la qualité des graphes générés</p>
</div>
''')
performance_plot = gr.Plot(
label="📊 Boxplots des métriques",
visible=False
)
download_report_btn = gr.Button(
"📄 Générer le rapport PDF",
variant="primary",
elem_classes=["primary-button"]
)
# ─────────────────────────────────────────────────────────────────
# GESTION DES ÉVÉNEMENTS
# ─────────────────────────────────────────────────────────────────
# Authentification
def check_login(username, password):
    """Validate login credentials against the configured APP_PASSWORD.

    Returns the 3-tuple consumed by the login click handler:
    (authenticated: bool, login_container visibility update,
    app_container visibility update).

    When no APP_PASSWORD is configured, authentication is bypassed
    entirely and the app is shown.
    """
    import hmac  # stdlib; local import keeps the top-of-file imports untouched

    granted = (True, gr.update(visible=False), gr.update(visible=True))
    if not app_password:
        # No password configured: open access (deliberate best-effort mode).
        return granted
    # compare_digest is constant-time, so the check does not leak how much
    # of the password prefix matched via response timing.
    if username == "admin" and hmac.compare_digest(password or "", app_password):
        return granted
    return False, gr.update(visible=True), gr.update(visible=False)
def show_auth_error(success):
    """Return an inline HTML error banner, or an empty string on success."""
    error_banner = (
        '<div style="color: #ef4444; text-align: center; margin-top: 10px;">'
        '❌ Identifiants incorrects</div>'
    )
    return "" if success else error_banner
login_btn.click(
fn=check_login,
inputs=[username, password],
outputs=[authenticated, login_container, app_container]
).then(
# Chained step: surface an inline error once the auth state is known.
fn=show_auth_error,
inputs=[authenticated],
outputs=[auth_result]
)
# Graph generation: runs the full pipeline and refreshes every result panel.
generate_btn.click(
fn=process_files,
inputs=[
file_input,
model_selector,
use_semantic_checkbox,
fuzzy_threshold_slider,
semantic_threshold_slider,
alpha_slider,
edge_mode_radio
],
outputs=[
image_preview,
mermaid_output,
docs_dropdown,
hidden_iframes,
gt_graph_panel,
test_graph_panel,
comparison_result_html,
report_status
]
)
# Document selection: swaps the GT/test graph iframes for the chosen doc.
docs_dropdown.change(
fn=select_graph,
inputs=[docs_dropdown, hidden_iframes],
outputs=[gt_graph_panel, test_graph_panel]
)
# Comparison with the original ground truth (both widgets start hidden).
# NOTE(review): no .click handler for compare_with_gt_btn is visible in
# this section — presumably wired elsewhere; verify against the full file.
with gr.Row():
gt_file_input = gr.File(
label="📁 Ground Truth original (JSON)",
file_types=[".json"],
visible=False
)
compare_with_gt_btn = gr.Button(
"🔍 Comparer avec GT original",
variant="secondary",
elem_classes=["secondary-button"],
visible=False
)
# Downloads.
# NOTE(review): gr.File() is instantiated inline in `outputs`, so the
# component is never placed in the layout — confirm the download file
# actually renders in the UI (Gradio usually needs a laid-out component).
download_agg_btn.click(
fn=download_aggregated_graph,
inputs=[],
outputs=[gr.File()]
)
download_report_btn.click(
fn=generate_pdf_report,
inputs=[],
outputs=[gr.File()]
)
# ═════════════════════════════════════════════════════════════════════
# UI LAUNCH
# ═════════════════════════════════════════════════════════════════════
demo.launch(show_api=False)
# ============================================
# POINT D'ENTRÉE DU PROGRAMME
# ============================================
def start_http_server(port=8000):
    """Start a background HTTP server for the app's static assets.

    Serves the files the result iframes load (visual.html, neovis.js,
    graph_diff JSON, ...) from the current working directory. The server
    runs in a daemon thread so it never blocks interpreter shutdown.

    Args:
        port: TCP port to bind. 0 binds a free ephemeral port.

    Returns:
        The running TCPServer instance (callers may .shutdown() it),
        or None if binding failed. Previous versions returned None
        unconditionally, so this is backward-compatible.
    """
    import http.server
    import socketserver
    import threading

    class _ReusableTCPServer(socketserver.TCPServer):
        # Avoid "Address already in use" on quick restarts while the old
        # socket lingers in TIME_WAIT.
        allow_reuse_address = True

    handler = http.server.SimpleHTTPRequestHandler
    try:
        httpd = _ReusableTCPServer(("", port), handler)
    except Exception as e:
        # Best-effort: the UI still works, only the static iframes break.
        logging.warning(f"Impossible de démarrer le serveur HTTP sur le port {port}: {e}")
        return None
    thread = threading.Thread(target=httpd.serve_forever, daemon=True)
    thread.start()
    logging.info(f"Serveur HTTP démarré sur le port {port}")
    return httpd
if __name__ == "__main__":
    # APP_PASSWORD is read into a module-level global because check_login
    # (inside gradio_interface) closes over it; an empty value disables
    # the login screen.
    app_password = os.environ.get("APP_PASSWORD", "")
    # The original if/else had two byte-identical branches — the startup
    # sequence is the same with or without a password, so branch removed.
    start_http_server(8000)
    gradio_interface()