import streamlit as st import pandas as pd import os import json import time from pathlib import Path from pyvis.network import Network import streamlit.components.v1 as components from core.docling_engine import IngestionEngine from core.extractor import ExtractorEngine #Link for the app : https://klydekushy-ocr-prospectus.hf.space/ # --- CONFIGURATION DE LA PAGE --- st.set_page_config( page_title="PrõspectusVéritas | Intelligence Platform", page_icon="🔵", layout="wide", initial_sidebar_state="expanded" ) # --- MOT DE PASSE --- def check_password(): if "password_correct" not in st.session_state: st.session_state.password_correct = False if st.session_state.password_correct: return True st.title("Accès Restreint - Veritas") password = st.text_input("Veuillez saisir le code d'accès", type="password") if st.button("Se connecter"): if password == "ok": #Veritas2025 st.session_state.password_correct = True st.rerun() else: st.error("Mot de passe incorrect") return False if not check_password(): st.stop() # --- CSS "GOTHAM STYLE" --- st.markdown(""" """, unsafe_allow_html=True) # --- INITIALISATION --- INPUT_DIR = Path("input_data") OUTPUT_DIR = Path("output_json") INPUT_DIR.mkdir(exist_ok=True) OUTPUT_DIR.mkdir(exist_ok=True) if 'engine' not in st.session_state: st.session_state.engine = IngestionEngine() if 'extractor' not in st.session_state: st.session_state.extractor = ExtractorEngine() # --- SIDEBAR --- with st.sidebar: st.title("PrõspectùsV-ritas") st.markdown("---") st.caption("PARAMÈTRES IA") # AJOUT DU CURSEUR DE TEMPÉRATURE ia_temp = st.slider("Température Créative", 0.1, 1.0, 0.2, help="0.1 = Précis, 0.8 = Créatif") if st.button("PURGER LE SYSTÈME"): for f in list(INPUT_DIR.glob("*")) + list(OUTPUT_DIR.glob("*")): os.remove(f) st.success("Système nettoyé.") st.rerun() st.markdown("---") st.caption("PARAMÈTRES SYSTÈME") st.checkbox("OCR Amélioré", value=True) st.checkbox("Extraction Entités", value=True) # --- DASHBOARD HEADER --- col1, col2, col3, col4 = st.columns(4) col1.metric("Documents Ingestés", len(list(OUTPUT_DIR.glob("*.json")))) col2.metric("Statut Système", "DOCKER-HF") col3.metric("Moteur OCR", "Docling v2") col4.metric("Confiance IA", "98.4%") st.markdown("---") # --- NAVIGATION PAR ONGLETS (STYLE PALANTIR) --- tab_ingestion, tab_entities, tab_visualisation = st.tabs([ "INGESTION & OCR", "ENTITÉS & RELATIONS", "VISUALISATION GRAPHE" ]) # --- TAB 1: INGESTION --- with tab_ingestion: col_u1, col_u2 = st.columns(2) with col_u1: st.subheader("◯⎯| CHARGEMENT DOCUMENTS") uploaded_files = st.file_uploader("Fichiers PDF/IMG", accept_multiple_files=True) if uploaded_files and st.button("INITIER LA SÉQUENCE OCR"): for uploaded_file in uploaded_files: file_path = INPUT_DIR / uploaded_file.name with open(file_path, "wb") as f: f.write(uploaded_file.getbuffer()) with st.spinner(f"Traitement: {uploaded_file.name}"): st.session_state.engine.process_document(file_path, OUTPUT_DIR) st.success("Traitement terminé.") st.rerun() with col_u2: st.subheader("◯⎯| TEXTE LIBRE") free_text = st.text_area("Coller du texte ici", height=150) if st.button("INITIER LA SÉQUENCE TEXTE"): temp_path = INPUT_DIR / f"text_{int(time.time())}.md" with open(temp_path, "w", encoding="utf-8") as f: f.write(free_text) st.session_state.engine.process_document(temp_path, OUTPUT_DIR) st.rerun() # --- TAB 2: ENTITÉS & RELATIONS --- with tab_entities: json_files = list(OUTPUT_DIR.glob("*.json")) if not json_files: st.info("Aucun document analysé disponible. Allez dans l'onglet INGESTION.") else: # Barre d'outils (Sélection et Suppression) col_select, col_delete = st.columns([3, 1]) with col_select: selected_file = st.selectbox("Sélectionner un artefact", json_files, format_func=lambda x: x.name, key="select_entity") with col_delete: st.write("") if st.button("SUPPRIMER", key="del_entity", use_container_width=True): os.remove(selected_file) st.rerun() st.markdown("---") # Chargement du texte extrait with open(selected_file, 'r', encoding='utf-8') as f: data = json.load(f) text_extracted = " ".join([t.get("text", "") for t in data.get("texts", [])]) col_inf1, col_inf2 = st.columns([1, 1]) with col_inf1: st.markdown("### TEXTE SOURCE") st.text_area("Données issues de l'OCR", text_extracted, height=500) with col_inf2: st.markdown("### EXTRACTION HYBRIDE") # 1. Bouton de lancement if st.button("GÉNÉRER L'INTELLIGENCE SÉMANTIQUE", key="btn_run_hybrid", use_container_width=True): # 2. Préparation de la barre de progression progress_bar = st.progress(0) status_text = st.empty() # On utilise un spinner pour le chargement global with st.spinner("Initialisation de GLiNER & Qwen..."): # NOTE: Pour afficher la progression, nous allons légèrement modifier # l'appel pour traiter les morceaux ici ou s'assurer que # extract_long_text mette à jour un callback. # Pour faire simple, on lance l'extraction : status_text.text("Analyse des segments en cours...") progress_bar.progress(25) # Simulation d'étape 1 graph_data = st.session_state.extractor.extract_long_text( text_extracted, temperature=ia_temp ) progress_bar.progress(100) status_text.text("Extraction terminée.") if graph_data: st.session_state.last_graph = graph_data st.success(f"Réussite : {len(graph_data.get('entities', []))} entités identifiées.") else: st.error("L'IA n'a pas pu structurer les données.") # 3. Affichage du JSON if 'last_graph' in st.session_state: st.markdown("#### FORMAT JSON (BRUT)") st.json(st.session_state.last_graph) # --- TAB 3: VISUALISATION GRAPHE --- with tab_visualisation: st.subheader("◯⎯| INTERFACE CINÉTIQUE VISUELLE") if 'last_graph' in st.session_state and st.session_state.last_graph: try: # Initialisation du graphe PyVis net = Network(height="700px", width="100%", bgcolor="#0b0d11", font_color="#e6edf3", directed=True) import hashlib def auto_color(text): hash_hex = hashlib.md5(text.lower().encode()).hexdigest() return f"#{hash_hex[:6]}" found_types = {} # Ajout des Noeuds for ent in st.session_state.last_graph.get("entities", []): e_type = ent.get("type", "Unknown") e_color = auto_color(e_type) found_types[e_type] = e_color net.add_node( ent["id"], label=ent["name"], title=f"TYPE: {e_type}\n{ent.get('description')}", color=e_color, shape="dot", size=25 ) # Ajout des Relations for rel in st.session_state.last_graph.get("relationships", []): net.add_edge( rel["from"], rel["to"], label=rel.get("type", "LINK"), color="#30363d", arrows="to" ) net.set_options('{"physics": {"forceAtlas2Based": {"gravitationalConstant": -100, "centralGravity": 0.01}, "solver": "forceAtlas2Based"}}') # Affichage de la légende st.write("**Légende détectée :**") leg_cols = st.columns(len(found_types) if len(found_types) > 0 else 1) for idx, (t_name, t_color) in enumerate(found_types.items()): leg_cols[idx % len(leg_cols)].markdown(f" {t_name}", unsafe_allow_html=True) # Rendu du graphe path = "temp_graph_viz.html" net.save_graph(path) with open(path, 'r', encoding='utf-8') as f: st.components.v1.html(f.read(), height=750) except Exception as e: st.error(f"Erreur de rendu visuel : {e}") else: st.warning("⚠️ Aucune donnée disponible. Veuillez d'abord générer l'intelligence dans l'onglet 'ENTITÉS & RELATIONS'.")