File size: 2,677 Bytes
5de3226
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
"""Prõspectus Veritas — Streamlit front-end for document intelligence.

Left column: upload documents (PDF, images), persist them to ``uploads/``
and run them through the Docling OCR ingestion engine, which writes one
JSON file per document into ``json_output/``.
Right column: select an analysed JSON file, preview the OCR text, and ask
the Qwen-backed extractor for a knowledge graph.
"""

import json
import os
from pathlib import Path

import streamlit as st

from core.docling_engine import IngestionEngine
from core.extractor import ExtractorEngine

# Page configuration
st.set_page_config(page_title="Prõspectus Veritas", layout="wide")

# The engines are expensive to construct; build them once per session and
# cache them in st.session_state so Streamlit reruns reuse the instances.
if 'ingestor' not in st.session_state:
    st.session_state.ingestor = IngestionEngine()
if 'extractor' not in st.session_state:
    st.session_state.extractor = ExtractorEngine()

st.title("🕵️ Prõspectus Veritas - Intelligence Documentaire")

# Working directories, created on first run. parents=True makes the call
# robust if the app is launched from a fresh checkout with no tree yet.
UPLOAD_DIR = Path("uploads")
JSON_DIR = Path("json_output")
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
JSON_DIR.mkdir(parents=True, exist_ok=True)

col1, col2 = st.columns([1, 2])

with col1:
    st.header("1. Ingestion")
    uploaded_files = st.file_uploader("Charger des documents (PDF, Image)", accept_multiple_files=True)

    if st.button("Lancer l'analyse OCR"):
        # file_uploader returns None when nothing has been selected yet;
        # guard so clicking the button early does not raise TypeError.
        for uploaded_file in uploaded_files or []:
            # Persist the upload to disk so the ingestion engine can read
            # it by path.
            file_path = UPLOAD_DIR / uploaded_file.name
            with open(file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

            with st.spinner(f"Lecture de {uploaded_file.name}..."):
                result = st.session_state.ingestor.process_document(file_path, JSON_DIR)
                # process_document reports {"status": ..., "message": ...}
                if result["status"] == "success":
                    st.success(result["message"])
                else:
                    st.error(f"Erreur sur {uploaded_file.name}: {result['message']}")

with col2:
    st.header("2. Analyse & Graphe")
    json_files = list(JSON_DIR.glob("*.json"))

    if json_files:
        selected_json = st.selectbox("Sélectionner un document analysé", json_files, format_func=lambda x: x.name)

        with open(selected_json, 'r', encoding='utf-8') as f:
            data = json.load(f)
            # Concatenate the OCR text fragments for the LLM.
            # NOTE(review): assumes the Docling JSON schema exposes a
            # top-level "texts" list of {"text": ...} objects — confirm
            # against IngestionEngine's output.
            text_to_analyze = ""
            if "texts" in data:
                text_to_analyze = " ".join([t.get("text", "") for t in data["texts"]])

            # Preview only the first 2000 characters to keep the UI light.
            st.text_area("Texte extrait par l'OCR", text_to_analyze[:2000], height=200)

            if st.button("🧠 Extraire le Graphe de Connaissance"):
                with st.spinner("Appel du cerveau Qwen (Timeout 3min)..."):
                    graph_raw = st.session_state.extractor.extract_graph(text_to_analyze)
                    if graph_raw:
                        # Keep the last graph around for other views/reruns.
                        st.session_state.last_graph = graph_raw
                        st.json(graph_raw)
                    else:
                        st.error("Le cerveau n'a pas répondu à temps ou le format est invalide.")