OCR_PROSPECTUS / app.py
klydekushy's picture
Create app.py
5de3226 verified
raw
history blame
2.68 kB
import streamlit as st
import os
from pathlib import Path
from core.docling_engine import IngestionEngine
from core.extractor import ExtractorEngine
# Page configuration.
st.set_page_config(page_title="Prõspectus Veritas", layout="wide")

# Engines live in the session state: each one is instantiated only when its
# key is not already present there. Ingestion is handled before extraction.
for state_key, engine_factory in (
    ("ingestor", IngestionEngine),
    ("extractor", ExtractorEngine),
):
    if state_key not in st.session_state:
        st.session_state[state_key] = engine_factory()

st.title("🕵️ Prõspectus Veritas - Intelligence Documentaire")

# Working directories: raw uploads and the JSON files written for them.
UPLOAD_DIR = Path("uploads")
JSON_DIR = Path("json_output")
for work_dir in (UPLOAD_DIR, JSON_DIR):
    # exist_ok=True: an already existing directory is not an error.
    work_dir.mkdir(exist_ok=True)
# Two columns with relative widths 1 and 2; col1 hosts ingestion, col2 is
# used further down for the analysis panel.
col1, col2 = st.columns([1, 2])

with col1:
    st.header("1. Ingestion")
    uploaded_files = st.file_uploader(
        "Charger des documents (PDF, Image)",
        accept_multiple_files=True,
    )

    if st.button("Lancer l'analyse OCR"):
        if not uploaded_files:
            # Nothing selected: tell the user instead of silently doing nothing.
            st.warning("Veuillez d'abord charger au moins un document.")

        # `or []` keeps the loop safe should the uploader ever return None.
        for uploaded_file in uploaded_files or []:
            # The file name is supplied by the client. Keep only its final
            # path component so the copy always lands inside UPLOAD_DIR.
            safe_name = Path(uploaded_file.name).name
            file_path = UPLOAD_DIR / safe_name

            # Persist the upload on disk; the ingestion engine is handed a path.
            file_path.write_bytes(uploaded_file.getbuffer())

            with st.spinner(f"Lecture de {uploaded_file.name}..."):
                # process_document returns a mapping with at least "status"
                # and "message" keys (both are read just below).
                result = st.session_state.ingestor.process_document(file_path, JSON_DIR)

            if result["status"] == "success":
                st.success(result["message"])
            else:
                st.error(f"Erreur sur {uploaded_file.name}: {result['message']}")
# Imported here, next to the only code that uses it, instead of inside the
# body of the `with open(...)` statement.
import json


def _collect_ocr_text(data):
    """Return the OCR text contained in a parsed JSON document.

    Joins, with single spaces, the "text" value of every entry listed under
    the top-level "texts" key. Anything that does not have that shape (a
    non-dict payload, a missing key, non-dict entries, a null text) is
    treated as contributing no text rather than raising.
    """
    if not isinstance(data, dict):
        return ""
    entries = data.get("texts") or []
    return " ".join(
        str(entry.get("text") or "")
        for entry in entries
        if isinstance(entry, dict)
    )


with col2:
    st.header("2. Analyse & Graphe")

    # glob() yields files in arbitrary order; sort so the select box keeps a
    # stable, predictable option list.
    json_files = sorted(JSON_DIR.glob("*.json"))

    if not json_files:
        st.info("Aucun document analysé pour le moment.")
    else:
        selected_json = st.selectbox(
            "Sélectionner un document analysé",
            json_files,
            format_func=lambda x: x.name,
        )

        # A truncated or hand-edited file must not take the whole page down.
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        try:
            with open(selected_json, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as exc:
            st.error(f"Impossible de lire {selected_json.name}: {exc}")
            data = None

        if data is not None:
            # Text handed to the extractor.
            text_to_analyze = _collect_ocr_text(data)

            # Only the first 2000 characters are previewed; the full text is
            # still what gets sent for extraction.
            st.text_area("Texte extrait par l'OCR", text_to_analyze[:2000], height=200)

            if st.button("🧠 Extraire le Graphe de Connaissance"):
                if not text_to_analyze.strip():
                    # Skip the extractor call when there is nothing to analyse.
                    st.warning("Aucun texte à analyser dans ce document.")
                else:
                    with st.spinner("Appel du cerveau Qwen (Timeout 3min)..."):
                        graph_raw = st.session_state.extractor.extract_graph(text_to_analyze)

                    if graph_raw:
                        # Keep the latest graph in the session state.
                        st.session_state.last_graph = graph_raw
                        st.json(graph_raw)
                    else:
                        st.error("Le cerveau n'a pas répondu à temps ou le format est invalide.")