import streamlit as st
import pandas as pd
import os
import json
import time
from pathlib import Path
from pyvis.network import Network
import streamlit.components.v1 as components
from core.docling_engine import IngestionEngine
from core.extractor import ExtractorEngine
#Link for the app : https://klydekushy-ocr-prospectus.hf.space/
# --- PAGE CONFIGURATION ---
# Must run before any other Streamlit call in the script.
st.set_page_config(
    page_title="PrõspectusVéritas | Intelligence Platform",
    page_icon="🔵",
    layout="wide",
    initial_sidebar_state="expanded",
)
# --- PASSWORD GATE ---
def check_password():
    """Render a minimal password gate and return True once authenticated.

    The auth flag lives in ``st.session_state`` so the prompt is skipped on
    every subsequent rerun of the script after a successful login.
    """
    # First run of the session: initialise the flag.
    if "password_correct" not in st.session_state:
        st.session_state.password_correct = False

    # Already logged in on a previous rerun — let the caller proceed.
    if st.session_state.password_correct:
        return True

    st.title("Accès Restreint - Veritas")
    entered = st.text_input("Veuillez saisir le code d'accès", type="password")
    if st.button("Se connecter"):
        # NOTE(review): plaintext, hardcoded credential; the neighbouring
        # comment suggests the intended code is "Veritas2025" — confirm.
        if entered == "ok":  # Veritas2025
            st.session_state.password_correct = True
            st.rerun()
        else:
            st.error("Mot de passe incorrect")
    return False
# Halt the whole script until the user authenticates.
if not check_password():
    st.stop()

# --- "GOTHAM STYLE" CSS ---
# NOTE(review): the style payload is currently empty — the custom CSS seems
# to be missing. Left as-is to preserve behaviour.
st.markdown("""
""", unsafe_allow_html=True)

# --- INITIALISATION ---
INPUT_DIR = Path("input_data")
OUTPUT_DIR = Path("output_json")
for _dir in (INPUT_DIR, OUTPUT_DIR):
    _dir.mkdir(exist_ok=True)

# Heavy engines are built once per session and cached in session_state.
if 'engine' not in st.session_state:
    st.session_state.engine = IngestionEngine()
if 'extractor' not in st.session_state:
    st.session_state.extractor = ExtractorEngine()
# --- SIDEBAR ---
with st.sidebar:
    st.title("PrõspectùsV-ritas")
    st.markdown("---")

    st.caption("PARAMÈTRES IA")
    # Creativity/temperature knob, forwarded to the extractor in tab 2.
    ia_temp = st.slider("Température Créative", 0.1, 1.0, 0.2, help="0.1 = Précis, 0.8 = Créatif")

    if st.button("PURGER LE SYSTÈME"):
        # Wipe every artefact from both the input and output folders.
        for artefact in (*INPUT_DIR.glob("*"), *OUTPUT_DIR.glob("*")):
            artefact.unlink()
        st.success("Système nettoyé.")
        st.rerun()

    st.markdown("---")
    st.caption("PARAMÈTRES SYSTÈME")
    # NOTE(review): these two toggles are rendered but their values are never
    # read anywhere in this file — confirm whether they should drive anything.
    st.checkbox("OCR Amélioré", value=True)
    st.checkbox("Extraction Entités", value=True)
# --- DASHBOARD HEADER ---
# One JSON artefact per ingested document.
ingested_count = len(list(OUTPUT_DIR.glob("*.json")))
col1, col2, col3, col4 = st.columns(4)
col1.metric("Documents Ingestés", ingested_count)
col2.metric("Statut Système", "DOCKER-HF")
col3.metric("Moteur OCR", "Docling v2")
col4.metric("Confiance IA", "98.4%")
st.markdown("---")
# --- TAB NAVIGATION (PALANTIR STYLE) ---
tab_ingestion, tab_entities, tab_visualisation = st.tabs(
    ["INGESTION & OCR", "ENTITÉS & RELATIONS", "VISUALISATION GRAPHE"]
)
# --- TAB 1: INGESTION ---
with tab_ingestion:
    col_u1, col_u2 = st.columns(2)

    with col_u1:
        st.subheader("◯⎯| CHARGEMENT DOCUMENTS")
        uploaded_files = st.file_uploader("Fichiers PDF/IMG", accept_multiple_files=True)
        if uploaded_files and st.button("INITIER LA SÉQUENCE OCR"):
            for uploaded_file in uploaded_files:
                # Persist the upload to disk so the OCR engine can read it.
                file_path = INPUT_DIR / uploaded_file.name
                file_path.write_bytes(uploaded_file.getbuffer())
                with st.spinner(f"Traitement: {uploaded_file.name}"):
                    st.session_state.engine.process_document(file_path, OUTPUT_DIR)
            st.success("Traitement terminé.")
            st.rerun()

    with col_u2:
        st.subheader("◯⎯| TEXTE LIBRE")
        free_text = st.text_area("Coller du texte ici", height=150)
        if st.button("INITIER LA SÉQUENCE TEXTE"):
            # Guard: previously a blank submission created an empty document.
            if not free_text.strip():
                st.warning("Aucun texte fourni.")
            else:
                # Timestamped name avoids collisions between submissions.
                temp_path = INPUT_DIR / f"text_{int(time.time())}.md"
                temp_path.write_text(free_text, encoding="utf-8")
                st.session_state.engine.process_document(temp_path, OUTPUT_DIR)
                st.rerun()
# --- TAB 2: ENTITIES & RELATIONS ---
with tab_entities:
    json_files = list(OUTPUT_DIR.glob("*.json"))

    if not json_files:
        st.info("Aucun document analysé disponible. Allez dans l'onglet INGESTION.")
    else:
        # Toolbar: artefact picker + delete button.
        col_select, col_delete = st.columns([3, 1])
        with col_select:
            selected_file = st.selectbox(
                "Sélectionner un artefact",
                json_files,
                format_func=lambda x: x.name,
                key="select_entity",
            )
        with col_delete:
            st.write("")  # vertical spacer so the button lines up with the selectbox
            if st.button("SUPPRIMER", key="del_entity", use_container_width=True):
                os.remove(selected_file)
                st.rerun()

        st.markdown("---")

        # Load the OCR artefact and flatten every text fragment into one string.
        with open(selected_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        text_extracted = " ".join(t.get("text", "") for t in data.get("texts", []))

        col_inf1, col_inf2 = st.columns([1, 1])
        with col_inf1:
            st.markdown("### TEXTE SOURCE")
            st.text_area("Données issues de l'OCR", text_extracted, height=500)
        with col_inf2:
            st.markdown("### EXTRACTION HYBRIDE")
            if st.button("GÉNÉRER L'INTELLIGENCE SÉMANTIQUE", key="btn_run_hybrid", use_container_width=True):
                progress_bar = st.progress(0)
                status_text = st.empty()
                with st.spinner("Initialisation de GLiNER & Qwen..."):
                    # The progress values are staged manually: the extractor does
                    # not expose a granular progress callback yet, so the bar only
                    # signals "started" (25%) and "done" (100%).
                    status_text.text("Analyse des segments en cours...")
                    progress_bar.progress(25)
                    graph_data = st.session_state.extractor.extract_long_text(
                        text_extracted,
                        temperature=ia_temp,
                    )
                    progress_bar.progress(100)
                    status_text.text("Extraction terminée.")
                if graph_data:
                    # Persist the graph so tab 3 can render it on a later rerun.
                    st.session_state.last_graph = graph_data
                    st.success(f"Réussite : {len(graph_data.get('entities', []))} entités identifiées.")
                else:
                    st.error("L'IA n'a pas pu structurer les données.")

            # Raw JSON view of the last successful extraction, if any.
            if 'last_graph' in st.session_state:
                st.markdown("#### FORMAT JSON (BRUT)")
                st.json(st.session_state.last_graph)
# --- TAB 3: GRAPH VISUALISATION ---
with tab_visualisation:
    st.subheader("◯⎯| INTERFACE CINÉTIQUE VISUELLE")

    if 'last_graph' in st.session_state and st.session_state.last_graph:
        try:
            # PyVis network styled to match the dark dashboard theme.
            net = Network(height="700px", width="100%", bgcolor="#0b0d11",
                          font_color="#e6edf3", directed=True)

            import hashlib

            def auto_color(text):
                """Derive a stable hex colour from a label (md5 here is a hash-to-colour trick, not security)."""
                return f"#{hashlib.md5(text.lower().encode()).hexdigest()[:6]}"

            found_types = {}

            # Nodes: one per extracted entity, coloured by entity type.
            entities = st.session_state.last_graph.get("entities", [])
            for ent in entities:
                e_type = ent.get("type", "Unknown")
                e_color = auto_color(e_type)
                found_types[e_type] = e_color
                net.add_node(
                    ent["id"],
                    label=ent["name"],
                    # Default to "" so the tooltip never shows the literal "None".
                    title=f"TYPE: {e_type}\n{ent.get('description', '')}",
                    color=e_color,
                    shape="dot",
                    size=25,
                )

            # Edges: skip relationships whose endpoints were not emitted as
            # entities — pyvis raises on unknown node ids, which previously
            # aborted the whole render via the except below.
            known_ids = {ent["id"] for ent in entities}
            for rel in st.session_state.last_graph.get("relationships", []):
                if rel["from"] in known_ids and rel["to"] in known_ids:
                    net.add_edge(
                        rel["from"],
                        rel["to"],
                        label=rel.get("type", "LINK"),
                        color="#30363d",
                        arrows="to",
                    )

            net.set_options('{"physics": {"forceAtlas2Based": {"gravitationalConstant": -100, "centralGravity": 0.01}, "solver": "forceAtlas2Based"}}')

            # Legend: colour each type label with its node colour (the colour
            # was previously computed but never used in the markup).
            st.write("**Légende détectée :**")
            leg_cols = st.columns(len(found_types) if found_types else 1)
            for idx, (t_name, t_color) in enumerate(found_types.items()):
                leg_cols[idx % len(leg_cols)].markdown(
                    f"<span style='color:{t_color}'>●</span> {t_name}",
                    unsafe_allow_html=True,
                )

            # Render: pyvis only writes to disk, so round-trip via a temp HTML file.
            path = "temp_graph_viz.html"
            net.save_graph(path)
            with open(path, 'r', encoding='utf-8') as f:
                # Use the `components` alias imported at the top of the file
                # (it was previously imported but unused).
                components.html(f.read(), height=750)
        except Exception as e:
            st.error(f"Erreur de rendu visuel : {e}")
    else:
        st.warning("⚠️ Aucune donnée disponible. Veuillez d'abord générer l'intelligence dans l'onglet 'ENTITÉS & RELATIONS'.")