# Streamlit front end for the Activa Semantic Discovery prototype.
#
# The page has three tabs:
#   1. Generation  - chunk a text file, extract triples, load them into Neo4j.
#   2. Validation  - list relationships and let a human reject them.
#   3. Visualization - render part of the graph with pyvis.
import csv
import json
import os
import shutil
from datetime import datetime

import pandas as pd
import streamlit as st
import streamlit.components.v1 as components
from dotenv import load_dotenv
from neo4j import GraphDatabase
from pyvis.network import Network

# --- PROJECT-SPECIFIC MODULES ---
# 1. Ingestion
from src.ingestion.semantic_splitter import ActivaSemanticSplitter
# 2. Extraction (GraphTriple is also imported to rebuild objects from JSON)
from src.extraction.extractor import NeuroSymbolicExtractor, GraphTriple
# 3. Graph building (loader & resolver)
from src.graph.graph_loader import KnowledgeGraphPersister
from src.graph.entity_resolver import EntityResolver

# --- INITIAL CONFIGURATION ---
load_dotenv()

st.set_page_config(
    page_title="Activa Semantic Discovery",
    layout="wide",
    initial_sidebar_state="expanded",
    page_icon="🧠",
)


# --- NEO4J UTILITY FUNCTIONS (front end) ---
@st.cache_resource(show_spinner=False)
def _connect(uri, user, password):
    """Create a Neo4j driver and check that the server is reachable.

    The result is cached by Streamlit per (uri, user, password), so script
    reruns reuse one driver instead of opening a new one each time.
    Exceptions propagate to the caller and are therefore not cached, which
    lets a later rerun retry the connection.
    """
    driver = GraphDatabase.driver(uri, auth=(user, password))
    try:
        # Constructing the driver alone is not treated as proof of a working
        # connection; check explicitly before reporting success.
        driver.verify_connectivity()
    except Exception:
        driver.close()
        raise
    return driver


def get_driver(uri, user, password):
    """Return a verified Neo4j driver, or None when it cannot be obtained.

    Args:
        uri: Neo4j connection URI.
        user: Neo4j user name.
        password: Neo4j password.

    Returns:
        A connected driver, or None if `uri`/`password` is empty or the
        connection attempt fails for any reason.
    """
    if not uri or not password:
        return None
    try:
        return _connect(uri, user, password)
    except Exception:
        # UI boundary: the caller only needs "connected or not".
        return None


def run_query(driver, query, params=None):
    """Run a Cypher query and return its records as a list of dicts.

    Returns an empty list when `driver` is None.
    """
    if driver is None:
        return []
    with driver.session() as session:
        return [record.data() for record in session.run(query, params)]


# --- FEEDBACK LOOP LOGIC (HUMAN-IN-THE-LOOP) ---
def reject_relationship(driver, rel_id, subj, pred, obj, reason="Human Rejection"):
    """Delete a relationship from the graph and log the rejection to CSV.

    1. Deletes the relationship whose elementId equals `rel_id`.
    2. Appends a row (timestamp, subject, predicate, object, reason) to
       data/processed/rejected_triples.csv, writing a header row first if
       the file does not exist yet.

    Returns:
        False if the database deletion raised (an error is shown in the UI);
        True otherwise, including when the deletion succeeded but the CSV
        log could not be written (a warning is shown instead).
    """
    query = "MATCH ()-[r]->() WHERE elementId(r) = $id DELETE r"
    try:
        run_query(driver, query, {"id": rel_id})
    except Exception as e:
        st.error(f"Errore durante la cancellazione: {e}")
        return False

    log_file = "data/processed/rejected_triples.csv"
    try:
        # Directory creation is inside the try block: the relationship is
        # already gone at this point, so any filesystem error must end up
        # in the "deleted but not logged" warning path rather than raise.
        os.makedirs("data/processed", exist_ok=True)
        file_exists = os.path.isfile(log_file)
        with open(log_file, mode='a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            if not file_exists:
                writer.writerow(["timestamp", "subject", "predicate", "object", "reason"])
            writer.writerow([datetime.now(), subj, pred, obj, reason])
    except Exception as e:
        st.warning(f"Relazione cancellata dal DB, ma errore nel log CSV: {e}")
    return True


# --- UI: SIDEBAR & CONFIGURATION ---
st.sidebar.title("⚙️ Configurazione")

env_uri = os.getenv("NEO4J_URI", "")
env_user = os.getenv("NEO4J_USER", "neo4j")
env_password = os.getenv("NEO4J_PASSWORD", "")
env_hf_token = os.getenv("HF_TOKEN", "")

st.sidebar.subheader("Backend LLM")
# The token found in the environment pre-fills a masked input field.
hf_token_input = st.sidebar.text_input(
    "HuggingFace Token (Opzionale per Cloud)",
    value=env_hf_token,
    type="password",
)
if hf_token_input:
    os.environ["HF_TOKEN"] = hf_token_input

st.sidebar.subheader("Connessione Neo4j AuraDB")
uri = st.sidebar.text_input("URI", value=env_uri, placeholder="neo4j+s://...")
user = st.sidebar.text_input("User", value=env_user)
password = st.sidebar.text_input("Password", value=env_password, type="password")

driver = None
if uri and password:
    driver = get_driver(uri, user, password)
    if driver:
        st.sidebar.success("🟢 Connesso al Knowledge Graph")
        # Export the credentials to the environment; the graph-building step
        # below creates KnowledgeGraphPersister() without arguments.
        os.environ["NEO4J_URI"] = uri
        os.environ["NEO4J_USER"] = user
        os.environ["NEO4J_PASSWORD"] = password
    else:
        st.sidebar.error("🔴 Errore connessione")

# --- UI: MAIN HEADER ---
st.title("🧠 Automated Semantic Discovery Prototype")
st.markdown("**Generazione Neuro-Simbolica & Validazione Human-in-the-Loop**")

# --- TAB NAVIGATION ---
tab_gen, tab_val, tab_vis = st.tabs([
    "⚙️ 1. Generazione (Pipeline)",
    "🔍 2. Validazione (Active Learning)",
    "🕸️ 3. Visualizzazione (Graph)",
])

# ==============================================================================
# TAB 1: GENERATION (EXECUTION PIPELINE)
# ==============================================================================
with tab_gen:
    st.header("Pipeline di Ingestione ed Estrazione")

    col_src, col_act = st.columns([1, 2])

    # 1. File selection
    with col_src:
        st.subheader("Sorgente Dati")
        demo_mode = "📂 Esempi Demo"
        data_source = st.radio("Modalità:", (demo_mode, "⬆️ Upload (Sperimentale)"))

        selected_file_path = None
        for folder in ("data/raw", "data/processed", "data/examples"):
            os.makedirs(folder, exist_ok=True)

        if data_source == demo_mode:
            files = [f for f in os.listdir("data/examples") if f.endswith(".txt")]
            if files:
                choice = st.selectbox("Seleziona scenario:", files)
                if choice:
                    # Copy the chosen example into data/raw, where the
                    # pipeline steps read their input.
                    shutil.copy(
                        os.path.join("data/examples", choice),
                        os.path.join("data/raw", choice),
                    )
                    selected_file_path = choice
            else:
                st.warning("Nessun file in data/examples")
        else:
            uploaded = st.file_uploader("Carica .txt", type="txt")
            if uploaded:
                # The file name is supplied by the client: keep only its
                # final component so it cannot point outside data/raw.
                safe_name = os.path.basename(uploaded.name.replace("\\", "/"))
                if safe_name:
                    with open(os.path.join("data/raw", safe_name), "wb") as f:
                        f.write(uploaded.getbuffer())
                    selected_file_path = safe_name

    # 2. Step execution
    with col_act:
        if selected_file_path:
            st.info(f"File attivo: **{selected_file_path}**")

            c1, c2, c3 = st.columns(3)

            # --- STEP A: CHUNKING ---
            with c1:
                if st.button("A. Semantic Chunking"):
                    with st.status("Analisi vettoriale in corso...", expanded=True) as status:
                        try:
                            # 1. Read the raw text
                            raw_path = os.path.join("data/raw", selected_file_path)
                            with open(raw_path, "r", encoding="utf-8") as f:
                                text_content = f.read()

                            # 2. Initialise the splitter (default settings) and process
                            status.write("Caricamento modelli di embedding...")
                            splitter = ActivaSemanticSplitter()

                            status.write("Calcolo distanze coseno...")
                            # Only the chunks are used; the other two values are ignored.
                            chunks, _dists, _threshold = splitter.create_chunks(text_content)

                            # 3. Save the intermediate result
                            chunk_file = "data/processed/chunks.json"
                            with open(chunk_file, "w", encoding="utf-8") as f:
                                json.dump(chunks, f, ensure_ascii=False, indent=2)

                            status.update(label="Chunking Completato!", state="complete", expanded=False)
                            st.success(f"Generati {len(chunks)} frammenti semantici.")

                            # Optional: show the analysis plot if it exists
                            if os.path.exists("chunking_analysis.png"):
                                st.image("chunking_analysis.png", caption="Analisi Coerenza")

                        except Exception as e:
                            status.update(label="Errore Chunking", state="error")
                            st.error(f"Errore: {e}")

            # --- STEP B: EXTRACTION ---
            with c2:
                if st.button("B. Info Extraction"):
                    chunk_file = "data/processed/chunks.json"
                    if not os.path.exists(chunk_file):
                        st.error("Esegui prima il Chunking!")
                    else:
                        with st.status("Estrazione Neuro-Simbolica...", expanded=True) as status:
                            try:
                                # 1. Load the chunks
                                with open(chunk_file, "r", encoding="utf-8") as f:
                                    chunks = json.load(f)

                                # 2. Initialise the extractor with default parameters
                                status.write("Inizializzazione LLM (Locale/Cloud)...")
                                extractor = NeuroSymbolicExtractor()

                                all_triples = []
                                progress_bar = st.progress(0)
                                total_chunks = len(chunks)

                                # 3. Loop over the chunks
                                for i, chunk in enumerate(chunks):
                                    status.write(f"Processando chunk {i+1}/{total_chunks}...")
                                    result = extractor.extract(chunk, source_id=selected_file_path)

                                    # Convert the triples to dicts so they can be JSON-serialised
                                    all_triples.extend(t.model_dump() for t in result.triples)

                                    progress_bar.progress((i + 1) / total_chunks)

                                # 4. Save the raw triples
                                triples_file = "data/processed/triples_raw.json"
                                with open(triples_file, "w", encoding="utf-8") as f:
                                    json.dump(all_triples, f, ensure_ascii=False, indent=2)

                                status.update(label="Estrazione Completata!", state="complete", expanded=False)
                                st.success(f"Estratte {len(all_triples)} triple candidate.")

                            except Exception as e:
                                status.update(label="Errore Estrazione", state="error")
                                st.error(f"Errore: {e}")

            # --- STEP C: GRAPH BUILDING ---
            with c3:
                if st.button("C. Popola Neo4j", type="primary"):
                    triples_file = "data/processed/triples_raw.json"
                    if not os.path.exists(triples_file):
                        st.error("Esegui prima l'estrazione!")
                    elif not driver:
                        st.error("Connettiti al DB prima!")
                    else:
                        with st.status("Costruzione Grafo...", expanded=True) as status:
                            try:
                                # 1. Load the raw triples
                                with open(triples_file, "r", encoding="utf-8") as f:
                                    raw_data = json.load(f)

                                # Rebuild GraphTriple objects to pass to the resolver
                                triples_objs = [GraphTriple(**t) for t in raw_data]

                                # 2. Entity resolution
                                status.write("Entity Resolution (DBSCAN Clustering)...")
                                resolver = EntityResolver(similarity_threshold=0.85)
                                resolved_triples = resolver.resolve_entities(triples_objs)

                                status.write(f"Entità normalizzate. Triple da inserire: {len(resolved_triples)}")

                                # 3. Persist to Neo4j
                                status.write("Scrittura Batch su Neo4j...")
                                persister = KnowledgeGraphPersister()
                                try:
                                    persister.save_triples(resolved_triples)
                                finally:
                                    # Close the persister even when the write fails.
                                    persister.close()

                                status.update(label="Grafo Aggiornato!", state="complete", expanded=False)
                                st.success("🚀 Grafo costruito con successo su AuraDB!")
                                st.balloons()

                            except Exception as e:
                                status.update(label="Errore Costruzione", state="error")
                                st.error(f"Errore: {e}")
        else:
            st.write("👈 Seleziona un file per iniziare.")

# ==============================================================================
# TAB 2: VALIDATION (HUMAN IN THE LOOP)
# ==============================================================================
with tab_val:
    st.header("Curation & Feedback Loop")
    st.markdown("Validazione delle triple estratte. Le relazioni rifiutate vengono usate per il fine-tuning.")

    # A confirmation stored just before st.rerun() is displayed here, on the
    # next run; anything rendered right before the rerun would be discarded.
    flash_message = st.session_state.pop("rejection_flash", None)
    if flash_message:
        st.success(flash_message)

    if driver:
        # Up to 50 relationships, lowest confidence first
        cypher_val = """
        MATCH (s)-[r]->(o)
        RETURN elementId(r) as id,
               COALESCE(s.label, s.name, head(labels(s))) as Soggetto,
               type(r) as Predicato,
               COALESCE(o.label, o.name, head(labels(o))) as Oggetto,
               COALESCE(r.confidence, 0.85) as Confidenza
        ORDER BY Confidenza ASC
        LIMIT 50
        """
        triples_data = run_query(driver, cypher_val)

        if triples_data:
            df = pd.DataFrame(triples_data)

            st.write(f"Relazioni candidate ({len(df)}):")

            event = st.dataframe(
                df.drop(columns=["id"]),
                selection_mode="single-row",
                on_select="rerun",
                use_container_width=True,
                hide_index=True,
            )

            if len(event.selection.rows) > 0:
                idx = event.selection.rows[0]
                row = df.iloc[idx]

                st.divider()
                col_warn, col_btn = st.columns([3, 1])
                with col_warn:
                    st.warning(f"Rifiutare: **{row['Soggetto']}** --[{row['Predicato']}]--> **{row['Oggetto']}**?")
                with col_btn:
                    if st.button("🗑️ CONFERMA RIFIUTO", type="primary"):
                        success = reject_relationship(
                            driver,
                            row['id'],
                            row['Soggetto'],
                            row['Predicato'],
                            row['Oggetto'],
                        )
                        if success:
                            st.session_state["rejection_flash"] = "Relazione eliminata!"
                            st.rerun()
        else:
            st.info("Nessuna relazione nel grafo.")
    else:
        st.warning("Database non connesso.")

# ==============================================================================
# TAB 3: VISUALIZATION (PYVIS)
# ==============================================================================
with tab_vis:
    st.header("Esplorazione Topologica")

    if driver:
        physics = st.checkbox("Abilita Fisica", value=True)

        cypher_vis = """
        MATCH (s)-[r]->(o)
        RETURN COALESCE(s.label, s.name, head(labels(s))) as src,
               type(r) as rel,
               COALESCE(o.label, o.name, head(labels(o))) as dst
        LIMIT 100
        """
        graph_data = run_query(driver, cypher_vis)

        if graph_data:
            net = Network(height="600px", width="100%", bgcolor="#222222", font_color="white", notebook=False)

            for item in graph_data:
                src = str(item['src'])
                dst = str(item['dst'])
                rel = str(item['rel'])

                net.add_node(src, label=src, color="#4facfe", title=src)
                net.add_node(dst, label=dst, color="#00f2fe", title=dst)
                net.add_edge(src, dst, title=rel, label=rel)

            net.toggle_physics(physics)

            # pyvis writes the graph to an HTML file, which is read back and
            # embedded in the page as a component.
            path = "data/processed/graph_viz.html"
            os.makedirs("data/processed", exist_ok=True)
            net.save_graph(path)

            with open(path, 'r', encoding='utf-8') as f:
                html_string = f.read()
            components.html(html_string, height=600, scrolling=True)
        else:
            st.info("Grafo vuoto.")
    else:
        st.warning("Database non connesso.")

# Footer
st.markdown("---")
st.caption("Activa Digital | NextGenTech | Prototipo v1.0")