# PrõspectusVéritas — Streamlit intelligence platform:
# OCR ingestion (Docling), hybrid entity/relation extraction, and graph visualisation.
# Hosted as a Hugging Face Space.
import hashlib
import json
import os
import time
from pathlib import Path

import pandas as pd
import streamlit as st
import streamlit.components.v1 as components
from pyvis.network import Network

from core.docling_engine import IngestionEngine
from core.extractor import ExtractorEngine

# App link: https://klydekushy-ocr-prospectus.hf.space/
# --- PAGE CONFIGURATION ---
# Must run before any other Streamlit call in the script.
st.set_page_config(
    page_title="PrõspectusVéritas | Intelligence Platform",
    page_icon="🔵",
    layout="wide",
    initial_sidebar_state="expanded"
)
# --- PASSWORD GATE ---
def check_password():
    """Gate the app behind a single shared access code.

    Returns:
        bool: True once the correct code has been entered in this session;
        otherwise renders the login form and returns False. A successful
        login triggers st.rerun(), so the True path is taken on the next run.
    """
    if "password_correct" not in st.session_state:
        st.session_state.password_correct = False
    if st.session_state.password_correct:
        return True
    st.title("Accès Restreint - Veritas")
    password = st.text_input("Veuillez saisir le code d'accès", type="password")
    if st.button("Se connecter"):
        # SECURITY NOTE(review): plaintext credential hard-coded in source
        # ("ok" looks like a debug stand-in for the commented Veritas2025).
        # Move it to st.secrets or an environment variable before production.
        if password == "ok":  # Veritas2025
            st.session_state.password_correct = True
            st.rerun()
        else:
            st.error("Mot de passe incorrect")
    return False
# Halt script execution here until the user is authenticated.
if not check_password():
    st.stop()
| # --- CSS "GOTHAM STYLE" --- | |
| st.markdown(""" | |
| <style> | |
| @import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@300;400;700&display=swap'); | |
| html, body, .stApp, h1, h2, h3, h4, .stText, .stMarkdown, .stTextInput, .stTextArea { | |
| font-family: 'Space Grotesk', sans-serif !important; | |
| } | |
| .stApp { background-color: #0b0d11; } | |
| [data-testid="stSidebar"] { background-color: #12151e; border-right: 1px solid #30363d; } | |
| div[data-testid="stMetricValue"] { font-size: 24px; color: #29b5e8; } | |
| div[data-testid="metric-container"] { background-color: #1c2128; border: 1px solid #30363d; padding: 15px; border-radius: 4px; } | |
| .stButton>button { background-color: #29b5e8; color: white; border-radius: 0px; text-transform: uppercase; font-weight: bold; } | |
| h1, h2, h3 { color: #e6edf3; font-weight: 300; text-transform: uppercase; } | |
| </style> | |
| """, unsafe_allow_html=True) | |
# --- INITIALISATION ---
INPUT_DIR = Path("input_data")
OUTPUT_DIR = Path("output_json")
INPUT_DIR.mkdir(exist_ok=True)
OUTPUT_DIR.mkdir(exist_ok=True)

# Heavy model engines are instantiated once per session and cached in
# session_state so reruns do not reload them.
if 'engine' not in st.session_state:
    st.session_state.engine = IngestionEngine()
if 'extractor' not in st.session_state:
    st.session_state.extractor = ExtractorEngine()
# --- SIDEBAR ---
with st.sidebar:
    st.title("PrõspectùsV-ritas")
    st.markdown("---")
    st.caption("PARAMÈTRES IA")
    # Sampling temperature forwarded to the LLM extraction call (tab 2).
    ia_temp = st.slider("Température Créative", 0.1, 1.0, 0.2, help="0.1 = Précis, 0.8 = Créatif")
    if st.button("PURGER LE SYSTÈME"):
        # Delete every ingested input and every generated JSON artefact.
        for f in list(INPUT_DIR.glob("*")) + list(OUTPUT_DIR.glob("*")):
            f.unlink()
        st.success("Système nettoyé.")
        st.rerun()
    st.markdown("---")
    st.caption("PARAMÈTRES SYSTÈME")
    # NOTE(review): these two toggles are purely cosmetic — their values are
    # never read by the engines. Wire them up or remove them.
    st.checkbox("OCR Amélioré", value=True)
    st.checkbox("Extraction Entités", value=True)
# --- DASHBOARD HEADER ---
col1, col2, col3, col4 = st.columns(4)
col1.metric("Documents Ingestés", len(list(OUTPUT_DIR.glob("*.json"))))
# NOTE(review): the three metrics below are static labels, not live values.
col2.metric("Statut Système", "DOCKER-HF")
col3.metric("Moteur OCR", "Docling v2")
col4.metric("Confiance IA", "98.4%")
st.markdown("---")

# --- TAB NAVIGATION (PALANTIR STYLE) ---
tab_ingestion, tab_entities, tab_visualisation = st.tabs([
    "INGESTION & OCR",
    "ENTITÉS & RELATIONS",
    "VISUALISATION GRAPHE"
])
# --- TAB 1: INGESTION ---
with tab_ingestion:
    col_u1, col_u2 = st.columns(2)
    with col_u1:
        st.subheader("◯⎯| CHARGEMENT DOCUMENTS")
        uploaded_files = st.file_uploader("Fichiers PDF/IMG", accept_multiple_files=True)
        if uploaded_files and st.button("INITIER LA SÉQUENCE OCR"):
            for uploaded_file in uploaded_files:
                # Persist the upload to disk so the OCR engine can read it.
                file_path = INPUT_DIR / uploaded_file.name
                file_path.write_bytes(uploaded_file.getbuffer())
                with st.spinner(f"Traitement: {uploaded_file.name}"):
                    st.session_state.engine.process_document(file_path, OUTPUT_DIR)
            st.success("Traitement terminé.")
            st.rerun()
    with col_u2:
        st.subheader("◯⎯| TEXTE LIBRE")
        free_text = st.text_area("Coller du texte ici", height=150)
        if st.button("INITIER LA SÉQUENCE TEXTE"):
            # Wrap pasted text in a timestamped markdown file so it goes
            # through the same ingestion pipeline as uploaded documents.
            temp_path = INPUT_DIR / f"text_{int(time.time())}.md"
            temp_path.write_text(free_text, encoding="utf-8")
            st.session_state.engine.process_document(temp_path, OUTPUT_DIR)
            st.rerun()
# --- TAB 2: ENTITIES & RELATIONS ---
with tab_entities:
    json_files = list(OUTPUT_DIR.glob("*.json"))
    if not json_files:
        st.info("Aucun document analysé disponible. Allez dans l'onglet INGESTION.")
    else:
        # Toolbar: artefact selection + deletion.
        col_select, col_delete = st.columns([3, 1])
        with col_select:
            selected_file = st.selectbox("Sélectionner un artefact", json_files, format_func=lambda x: x.name, key="select_entity")
        with col_delete:
            st.write("")  # vertical spacer to align the button with the selectbox
            if st.button("SUPPRIMER", key="del_entity", use_container_width=True):
                selected_file.unlink()
                st.rerun()
        st.markdown("---")
        # Load the OCR output and flatten all text fragments into one string.
        with open(selected_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        text_extracted = " ".join(t.get("text", "") for t in data.get("texts", []))
        col_inf1, col_inf2 = st.columns([1, 1])
        with col_inf1:
            st.markdown("### TEXTE SOURCE")
            st.text_area("Données issues de l'OCR", text_extracted, height=500)
        with col_inf2:
            st.markdown("### EXTRACTION HYBRIDE")
            if st.button("GÉNÉRER L'INTELLIGENCE SÉMANTIQUE", key="btn_run_hybrid", use_container_width=True):
                progress_bar = st.progress(0)
                status_text = st.empty()
                with st.spinner("Initialisation de GLiNER & Qwen..."):
                    # The progress values are cosmetic: extract_long_text does
                    # not expose a per-chunk callback, so we only mark a start
                    # step (25%) and completion (100%).
                    status_text.text("Analyse des segments en cours...")
                    progress_bar.progress(25)
                    graph_data = st.session_state.extractor.extract_long_text(
                        text_extracted,
                        temperature=ia_temp
                    )
                    progress_bar.progress(100)
                    status_text.text("Extraction terminée.")
                if graph_data:
                    # Kept in session_state so tab 3 can render it after reruns.
                    st.session_state.last_graph = graph_data
                    st.success(f"Réussite : {len(graph_data.get('entities', []))} entités identifiées.")
                else:
                    st.error("L'IA n'a pas pu structurer les données.")
            # Raw JSON view of the latest extraction (survives reruns).
            if 'last_graph' in st.session_state:
                st.markdown("#### FORMAT JSON (BRUT)")
                st.json(st.session_state.last_graph)
# --- TAB 3: GRAPH VISUALISATION ---
with tab_visualisation:
    st.subheader("◯⎯| INTERFACE CINÉTIQUE VISUELLE")
    if 'last_graph' in st.session_state and st.session_state.last_graph:
        try:
            # Dark-themed directed graph matching the app palette.
            net = Network(height="700px", width="100%", bgcolor="#0b0d11", font_color="#e6edf3", directed=True)

            def auto_color(text):
                """Derive a stable hex colour from an entity-type name."""
                hash_hex = hashlib.md5(text.lower().encode()).hexdigest()
                return f"#{hash_hex[:6]}"

            found_types = {}
            # Nodes: one per extracted entity, coloured by entity type.
            for ent in st.session_state.last_graph.get("entities", []):
                e_type = ent.get("type", "Unknown")
                e_color = auto_color(e_type)
                found_types[e_type] = e_color
                net.add_node(
                    ent["id"],
                    label=ent["name"],
                    title=f"TYPE: {e_type}\n{ent.get('description')}",
                    color=e_color,
                    shape="dot",
                    size=25
                )
            # Edges: one per extracted relationship.
            for rel in st.session_state.last_graph.get("relationships", []):
                net.add_edge(
                    rel["from"],
                    rel["to"],
                    label=rel.get("type", "LINK"),
                    color="#30363d",
                    arrows="to"
                )
            net.set_options('{"physics": {"forceAtlas2Based": {"gravitationalConstant": -100, "centralGravity": 0.01}, "solver": "forceAtlas2Based"}}')
            # Legend of detected entity types (at least one column for st.columns).
            st.write("**Légende détectée :**")
            leg_cols = st.columns(len(found_types) if len(found_types) > 0 else 1)
            for idx, (t_name, t_color) in enumerate(found_types.items()):
                leg_cols[idx % len(leg_cols)].markdown(f"<span style='color:{t_color}'>●</span> {t_name}", unsafe_allow_html=True)
            # PyVis only renders to disk, so round-trip through a temp HTML file.
            path = "temp_graph_viz.html"
            net.save_graph(path)
            with open(path, 'r', encoding='utf-8') as f:
                components.html(f.read(), height=750)
        except Exception as e:
            # UI boundary: surface the failure instead of crashing the app.
            st.error(f"Erreur de rendu visuel : {e}")
    else:
        st.warning("⚠️ Aucune donnée disponible. Veuillez d'abord générer l'intelligence dans l'onglet 'ENTITÉS & RELATIONS'.")