Spaces:

NextGenTech
/

AutomatedSemanticDiscovery

Sleeping

App Files Community

GaetanoParente committed on Dec 31, 2025

Commit

9bcfc23

verified ·

1 Parent(s): cfc197c

Update app.py

Browse files

Files changed (1) hide show

app.py +144 -64

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import streamlit as st
 import os
 import shutil
 import csv
 import pandas as pd
 from datetime import datetime
 from neo4j import GraphDatabase
@@ -9,10 +10,14 @@ from pyvis.network import Network
 import streamlit.components.v1 as components
 from dotenv import load_dotenv
-# Import moduli interni (assumendo che la struttura src/ esista)
-from src.ingestion import semantic_splitter
-from src.extraction import extractor
-from src.graph import graph_builder
 # --- CONFIGURAZIONE INIZIALE ---
 load_dotenv()
@@ -23,7 +28,7 @@ st.set_page_config(
     page_icon="🧠"
 )
-# --- FUNZIONI DI UTILITÀ NEO4J ---
 def get_driver(uri, user, password):
     if not uri or not password:
         return None
@@ -45,7 +50,6 @@ def reject_relationship(driver, rel_id, subj, pred, obj, reason="Human Rejection
     1. Cancella dal DB (Azione Reale).
     2. Salva in CSV per Active Learning (Data Lineage del rifiuto).
     """
-    # 1. Cancellazione Reale
     query = "MATCH ()-[r]->() WHERE elementId(r) = $id DELETE r"
     try:
         run_query(driver, query, {"id": rel_id})
@@ -53,7 +57,6 @@ def reject_relationship(driver, rel_id, subj, pred, obj, reason="Human Rejection
         st.error(f"Errore durante la cancellazione: {e}")
         return False
-    # 2. Logging per Fine-Tuning
     log_file = "data/processed/rejected_triples.csv"
     os.makedirs("data/processed", exist_ok=True)
     file_exists = os.path.isfile(log_file)
@@ -72,24 +75,27 @@ def reject_relationship(driver, rel_id, subj, pred, obj, reason="Human Rejection
 # --- UI: SIDEBAR & CONFIGURAZIONE ---
 st.sidebar.title("⚙️ Configurazione")
-# Gestione credenziali: Priorità a .env, fallback su input utente
 env_uri = os.getenv("NEO4J_URI", "")
 env_user = os.getenv("NEO4J_USER", "neo4j")
-# Non pre-compiliamo la password se non è nell'env per sicurezza
 env_password = os.getenv("NEO4J_PASSWORD", "")
 st.sidebar.subheader("Connessione Neo4j AuraDB")
 uri = st.sidebar.text_input("URI", value=env_uri, placeholder="neo4j+s://...")
 user = st.sidebar.text_input("User", value=env_user)
 password = st.sidebar.text_input("Password", value=env_password, type="password")
-# Inizializza Driver
 driver = None
 if uri and password:
     driver = get_driver(uri, user, password)
     if driver:
         st.sidebar.success("🟢 Connesso al Knowledge Graph")
-        # Setta variabili d'ambiente per i moduli backend che le usano
         os.environ["NEO4J_URI"] = uri
         os.environ["NEO4J_USER"] = user
         os.environ["NEO4J_PASSWORD"] = password
@@ -113,15 +119,16 @@ tab_gen, tab_val, tab_vis = st.tabs([
 with tab_gen:
     st.header("Pipeline di Ingestione ed Estrazione")
-    # Selezione Sorgente Dati (Golden Path)
     col_src, col_act = st.columns([1, 2])
     with col_src:
         st.subheader("Sorgente Dati")
         data_source = st.radio("Modalità:", ("📂 Esempi Demo", "⬆️ Upload (Sperimentale)"))
         selected_file_path = None
         os.makedirs("data/raw", exist_ok=True)
         os.makedirs("data/examples", exist_ok=True)
         if data_source == "📂 Esempi Demo":
@@ -129,7 +136,9 @@ with tab_gen:
             if files:
                 choice = st.selectbox("Seleziona scenario:", files)
                 if choice:
-                    shutil.copy(os.path.join("data/examples", choice), os.path.join("data/raw", choice))
                     selected_file_path = choice
             else:
                 st.warning("Nessun file in data/examples")
@@ -140,35 +149,122 @@ with tab_gen:
                     f.write(uploaded.getbuffer())
                 selected_file_path = uploaded.name
     with col_act:
         if selected_file_path:
             st.info(f"File attivo: **{selected_file_path}**")
             c1, c2, c3 = st.columns(3)
-            # Step A: Chunking
             with c1:
                 if st.button("A. Semantic Chunking"):
-                    with st.spinner('Analisi vettoriale...'):
-                        # semantic_splitter.run(selected_file_path) # Scommenta in prod
-                        st.success("Chunking OK")
-            # Step B: Extraction
             with c2:
                 if st.button("B. Info Extraction"):
-                    with st.spinner('NLP Extraction...'):
-                        # extractor.run() # Scommenta in prod
-                        st.success("Estrazione OK")
-            # Step C: Graph
             with c3:
                 if st.button("C. Popola Neo4j", type="primary"):
-                    if not driver:
                         st.error("Connettiti al DB prima!")
                     else:
-                        with st.spinner('Scrittura su Neo4j...'):
-                            # graph_builder.run() # Scommenta in prod
-                            st.success("Grafo aggiornato!")
-                            st.balloons()
         else:
             st.write("👈 Seleziona un file per iniziare.")
@@ -177,20 +273,16 @@ with tab_gen:
 # ==============================================================================
 with tab_val:
     st.header("Curation & Feedback Loop")
-    st.markdown("""
-    In questa sezione, l'esperto di dominio valida le triple estratte.
-    Le relazioni rifiutate vengono rimosse dal grafo e loggate per il **fine-tuning** futuro dei modelli.
-    """)
     if driver:
-        # Recupera triple (COALESCE gestisce casi in cui 'name' o 'label' mancano)
-        # Nota: Assicurati che i tuoi nodi abbiano una proprietà 'name' o 'label'
         cypher_val = """
         MATCH (s)-[r]->(o)
         RETURN elementId(r) as id,
-               COALESCE(s.name, s.label, head(labels(s))) as Soggetto,
                type(r) as Predicato,
-               COALESCE(o.name, o.label, head(labels(o))) as Oggetto,
                COALESCE(r.confidence, 0.85) as Confidenza
         ORDER BY Confidenza ASC LIMIT 50
         """
@@ -198,9 +290,7 @@ with tab_val:
         if triples_data:
             df = pd.DataFrame(triples_data)
-            # Tabella interattiva
-            st.write(f"Mostrando {len(df)} relazioni candidate (ordinate per confidenza):")
             event = st.dataframe(
                 df.drop(columns=["id"]),
@@ -210,7 +300,6 @@ with tab_val:
                 hide_index=True
             )
-            # Logica di rifiuto
             if len(event.selection.rows) > 0:
                 idx = event.selection.rows[0]
                 row = df.iloc[idx]
@@ -218,17 +307,17 @@ with tab_val:
                 st.divider()
                 col_warn, col_btn = st.columns([3, 1])
                 with col_warn:
-                    st.warning(f"Vuoi rifiutare la relazione: **{row['Soggetto']}** --[{row['Predicato']}]--> **{row['Oggetto']}**?")
                 with col_btn:
                     if st.button("🗑️ CONFERMA RIFIUTO", type="primary"):
                         success = reject_relationship(driver, row['id'], row['Soggetto'], row['Predicato'], row['Oggetto'])
                         if success:
-                            st.success("Relazione eliminata e feedback registrato!")
                             st.rerun()
         else:
-            st.info("Nessuna relazione trovata nel grafo.")
     else:
-        st.warning("Connessione al database non attiva.")
 # ==============================================================================
 # TAB 3: VISUALIZZAZIONE (PYVIS)
@@ -237,36 +326,29 @@ with tab_vis:
     st.header("Esplorazione Topologica")
     if driver:
-        physics = st.checkbox("Abilita Fisica (Gravità)", value=True)
-        # Limita a 100 nodi per performance rendering
         cypher_vis = """
         MATCH (s)-[r]->(o)
-        RETURN COALESCE(s.name, head(labels(s))) as src,
                type(r) as rel,
-               COALESCE(o.name, head(labels(o))) as dst
         LIMIT 100
         """
         graph_data = run_query(driver, cypher_vis)
         if graph_data:
-            # Creazione Network PyVis
             net = Network(height="600px", width="100%", bgcolor="#222222", font_color="white", notebook=False)
             for item in graph_data:
-                # Aggiungi nodi e archi
-                src_lbl = str(item['src'])
-                dst_lbl = str(item['dst'])
-                rel_lbl = str(item['rel'])
-                net.add_node(src_lbl, label=src_lbl, color="#4facfe", title=src_lbl)
-                net.add_node(dst_lbl, label=dst_lbl, color="#00f2fe", title=dst_lbl)
-                net.add_edge(src_lbl, dst_lbl, title=rel_lbl, label=rel_lbl)
-            # Opzioni fisica
             net.toggle_physics(physics)
-            # Salva e renderizza HTML
             path = "data/processed/graph_viz.html"
             os.makedirs("data/processed", exist_ok=True)
             net.save_graph(path)
@@ -274,12 +356,10 @@ with tab_vis:
             with open(path, 'r', encoding='utf-8') as f:
                 html_string = f.read()
             components.html(html_string, height=600, scrolling=True)
-            st.caption(f"Visualizzazione dei primi {len(graph_data)} archi.")
         else:
-            st.info("Grafo vuoto o impossibile recuperare dati.")
     else:
-        st.warning("Connessione al database non attiva.")
 # Footer
 st.markdown("---")

 import os
 import shutil
 import csv
+import json
 import pandas as pd
 from datetime import datetime
 from neo4j import GraphDatabase
 import streamlit.components.v1 as components
 from dotenv import load_dotenv
+# --- IMPORT MODULI SPECIFICI ---
+# 1. Ingestion
+from src.ingestion.semantic_splitter import ActivaSemanticSplitter
+# 2. Extraction (Importiamo anche la classe GraphTriple per la ricostruzione dei dati)
+from src.extraction.extractor import NeuroSymbolicExtractor, GraphTriple
+# 3. Graph Building (Loader & Resolver)
+from src.graph.graph_loader import KnowledgeGraphPersister
+from src.graph.entity_resolver import EntityResolver
 # --- CONFIGURAZIONE INIZIALE ---
 load_dotenv()
     page_icon="🧠"
 )
+# --- FUNZIONI DI UTILITÀ NEO4J (Frontend) ---
 def get_driver(uri, user, password):
     if not uri or not password:
         return None
     1. Cancella dal DB (Azione Reale).
     2. Salva in CSV per Active Learning (Data Lineage del rifiuto).
     """
     query = "MATCH ()-[r]->() WHERE elementId(r) = $id DELETE r"
     try:
         run_query(driver, query, {"id": rel_id})
         st.error(f"Errore durante la cancellazione: {e}")
         return False
     log_file = "data/processed/rejected_triples.csv"
     os.makedirs("data/processed", exist_ok=True)
     file_exists = os.path.isfile(log_file)
 # --- UI: SIDEBAR & CONFIGURAZIONE ---
 st.sidebar.title("⚙️ Configurazione")
 env_uri = os.getenv("NEO4J_URI", "")
 env_user = os.getenv("NEO4J_USER", "neo4j")
 env_password = os.getenv("NEO4J_PASSWORD", "")
+env_hf_token = os.getenv("HF_TOKEN", "")
+st.sidebar.subheader("Backend LLM")
+# Se il token HF è presente nei secrets/env, lo mostriamo mascherato, altrimenti input
+hf_token_input = st.sidebar.text_input("HuggingFace Token (Opzionale per Cloud)", value=env_hf_token, type="password")
+if hf_token_input:
+    os.environ["HF_TOKEN"] = hf_token_input
 st.sidebar.subheader("Connessione Neo4j AuraDB")
 uri = st.sidebar.text_input("URI", value=env_uri, placeholder="neo4j+s://...")
 user = st.sidebar.text_input("User", value=env_user)
 password = st.sidebar.text_input("Password", value=env_password, type="password")
 driver = None
 if uri and password:
     driver = get_driver(uri, user, password)
     if driver:
         st.sidebar.success("🟢 Connesso al Knowledge Graph")
         os.environ["NEO4J_URI"] = uri
         os.environ["NEO4J_USER"] = user
         os.environ["NEO4J_PASSWORD"] = password
 with tab_gen:
     st.header("Pipeline di Ingestione ed Estrazione")
     col_src, col_act = st.columns([1, 2])
+    # 1. Selezione File
     with col_src:
         st.subheader("Sorgente Dati")
         data_source = st.radio("Modalità:", ("📂 Esempi Demo", "⬆️ Upload (Sperimentale)"))
         selected_file_path = None
         os.makedirs("data/raw", exist_ok=True)
+        os.makedirs("data/processed", exist_ok=True)
         os.makedirs("data/examples", exist_ok=True)
         if data_source == "📂 Esempi Demo":
             if files:
                 choice = st.selectbox("Seleziona scenario:", files)
                 if choice:
+                    src = os.path.join("data/examples", choice)
+                    dst = os.path.join("data/raw", choice)
+                    shutil.copy(src, dst)
                     selected_file_path = choice
             else:
                 st.warning("Nessun file in data/examples")
                     f.write(uploaded.getbuffer())
                 selected_file_path = uploaded.name
+    # 2. Esecuzione Step
     with col_act:
         if selected_file_path:
             st.info(f"File attivo: **{selected_file_path}**")
             c1, c2, c3 = st.columns(3)
+            # --- STEP A: CHUNKING ---
             with c1:
                 if st.button("A. Semantic Chunking"):
+                    with st.status("Analisi vettoriale in corso...", expanded=True) as status:
+                        try:
+                            # 1. Leggi il testo raw
+                            with open(os.path.join("data/raw", selected_file_path), "r", encoding="utf-8") as f:
+                                text_content = f.read()
+                            # 2. Inizializza Splitter e processa
+                            status.write("Caricamento modelli di embedding...")
+                            splitter = ActivaSemanticSplitter() # Usa default huggingface
+                            status.write("Calcolo distanze coseno...")
+                            chunks, dists, threshold = splitter.create_chunks(text_content)
+                            # 3. Salvataggio intermedio
+                            chunk_file = "data/processed/chunks.json"
+                            with open(chunk_file, "w", encoding="utf-8") as f:
+                                json.dump(chunks, f, ensure_ascii=False, indent=2)
+                            status.update(label="Chunking Completato!", state="complete", expanded=False)
+                            st.success(f"Generati {len(chunks)} frammenti semantici.")
+                            # Optional: Mostra grafico se generato
+                            if os.path.exists("chunking_analysis.png"):
+                                st.image("chunking_analysis.png", caption="Analisi Coerenza")
+                        except Exception as e:
+                            status.update(label="Errore Chunking", state="error")
+                            st.error(f"Errore: {e}")
+            # --- STEP B: EXTRACTION ---
             with c2:
                 if st.button("B. Info Extraction"):
+                    chunk_file = "data/processed/chunks.json"
+                    if not os.path.exists(chunk_file):
+                        st.error("Esegui prima il Chunking!")
+                    else:
+                        with st.status("Estrazione Neuro-Simbolica...", expanded=True) as status:
+                            try:
+                                # 1. Carica Chunks
+                                with open(chunk_file, "r", encoding="utf-8") as f:
+                                    chunks = json.load(f)
+                                # 2. Init Extractor (Rileva HF_TOKEN da env)
+                                status.write("Inizializzazione LLM (Locale/Cloud)...")
+                                extractor = NeuroSymbolicExtractor() # Usa default params
+                                all_triples = []
+                                progress_bar = st.progress(0)
+                                # 3. Loop su chunk
+                                for i, chunk in enumerate(chunks):
+                                    status.write(f"Processando chunk {i+1}/{len(chunks)}...")
+                                    result = extractor.extract(chunk, source_id=selected_file_path)
+                                    # Converti oggetti Pydantic in dict per serializzazione JSON
+                                    triples_dicts = [t.model_dump() for t in result.triples]
+                                    all_triples.extend(triples_dicts)
+                                    progress_bar.progress((i + 1) / len(chunks))
+                                # 4. Salvataggio Raw Triples
+                                triples_file = "data/processed/triples_raw.json"
+                                with open(triples_file, "w", encoding="utf-8") as f:
+                                    json.dump(all_triples, f, ensure_ascii=False, indent=2)
+                                status.update(label="Estrazione Completata!", state="complete", expanded=False)
+                                st.success(f"Estratte {len(all_triples)} triple candidate.")
+                            except Exception as e:
+                                status.update(label="Errore Estrazione", state="error")
+                                st.error(f"Errore: {e}")
+            # --- STEP C: GRAPH BUILDING ---
             with c3:
                 if st.button("C. Popola Neo4j", type="primary"):
+                    triples_file = "data/processed/triples_raw.json"
+                    if not os.path.exists(triples_file):
+                        st.error("Esegui prima l'estrazione!")
+                    elif not driver:
                         st.error("Connettiti al DB prima!")
                     else:
+                        with st.status("Costruzione Grafo...", expanded=True) as status:
+                            try:
+                                # 1. Carica Raw Triples
+                                with open(triples_file, "r", encoding="utf-8") as f:
+                                    raw_data = json.load(f)
+                                # Ricostruisci oggetti GraphTriple (necessari per il Resolver)
+                                triples_objs = [GraphTriple(**t) for t in raw_data]
+                                # 2. Entity Resolution
+                                status.write("Entity Resolution (DBSCAN Clustering)...")
+                                resolver = EntityResolver(similarity_threshold=0.85)
+                                resolved_triples = resolver.resolve_entities(triples_objs)
+                                status.write(f"Entità normalizzate. Triple da inserire: {len(resolved_triples)}")
+                                # 3. Persistenza Neo4j
+                                status.write("Scrittura Batch su Neo4j...")
+                                persister = KnowledgeGraphPersister() # Prende credenziali da env
+                                persister.save_triples(resolved_triples)
+                                persister.close()
+                                status.update(label="Grafo Aggiornato!", state="complete", expanded=False)
+                                st.success("🚀 Grafo costruito con successo su AuraDB!")
+                                st.balloons()
+                            except Exception as e:
+                                status.update(label="Errore Costruzione", state="error")
+                                st.error(f"Errore: {e}")
         else:
             st.write("👈 Seleziona un file per iniziare.")
 # ==============================================================================
 with tab_val:
     st.header("Curation & Feedback Loop")
+    st.markdown("Validazione delle triple estratte. Le relazioni rifiutate vengono usate per il fine-tuning.")
     if driver:
+        # Query aggiornata per mostrare nodi reali
         cypher_val = """
         MATCH (s)-[r]->(o)
         RETURN elementId(r) as id,
+               COALESCE(s.label, s.name, head(labels(s))) as Soggetto,
                type(r) as Predicato,
+               COALESCE(o.label, o.name, head(labels(o))) as Oggetto,
                COALESCE(r.confidence, 0.85) as Confidenza
         ORDER BY Confidenza ASC LIMIT 50
         """
         if triples_data:
             df = pd.DataFrame(triples_data)
+            st.write(f"Relazioni candidate ({len(df)}):")
             event = st.dataframe(
                 df.drop(columns=["id"]),
                 hide_index=True
             )
             if len(event.selection.rows) > 0:
                 idx = event.selection.rows[0]
                 row = df.iloc[idx]
                 st.divider()
                 col_warn, col_btn = st.columns([3, 1])
                 with col_warn:
+                    st.warning(f"Rifiutare: **{row['Soggetto']}** --[{row['Predicato']}]--> **{row['Oggetto']}**?")
                 with col_btn:
                     if st.button("🗑️ CONFERMA RIFIUTO", type="primary"):
                         success = reject_relationship(driver, row['id'], row['Soggetto'], row['Predicato'], row['Oggetto'])
                         if success:
+                            st.success("Relazione eliminata!")
                             st.rerun()
         else:
+            st.info("Nessuna relazione nel grafo.")
     else:
+        st.warning("Database non connesso.")
 # ==============================================================================
 # TAB 3: VISUALIZZAZIONE (PYVIS)
     st.header("Esplorazione Topologica")
     if driver:
+        physics = st.checkbox("Abilita Fisica", value=True)
         cypher_vis = """
         MATCH (s)-[r]->(o)
+        RETURN COALESCE(s.label, s.name, head(labels(s))) as src,
                type(r) as rel,
+               COALESCE(o.label, o.name, head(labels(o))) as dst
         LIMIT 100
         """
         graph_data = run_query(driver, cypher_vis)
         if graph_data:
             net = Network(height="600px", width="100%", bgcolor="#222222", font_color="white", notebook=False)
             for item in graph_data:
+                src = str(item['src'])
+                dst = str(item['dst'])
+                rel = str(item['rel'])
+                net.add_node(src, label=src, color="#4facfe", title=src)
+                net.add_node(dst, label=dst, color="#00f2fe", title=dst)
+                net.add_edge(src, dst, title=rel, label=rel)
             net.toggle_physics(physics)
             path = "data/processed/graph_viz.html"
             os.makedirs("data/processed", exist_ok=True)
             net.save_graph(path)
             with open(path, 'r', encoding='utf-8') as f:
                 html_string = f.read()
             components.html(html_string, height=600, scrolling=True)
         else:
+            st.info("Grafo vuoto.")
     else:
+        st.warning("Database non connesso.")
 # Footer
 st.markdown("---")