# app.py — revision 9bcfc23 ("Update app.py", uploaded by GaetanoParente), 16.5 kB.
# NOTE: the hosting page's "raw" / "history" / "blame" links were scrape residue, not code.
import streamlit as st
import os
import shutil
import csv
import json
import pandas as pd
from datetime import datetime
from neo4j import GraphDatabase
from pyvis.network import Network
import streamlit.components.v1 as components
from dotenv import load_dotenv
# --- IMPORT MODULI SPECIFICI ---
# 1. Ingestion
from src.ingestion.semantic_splitter import ActivaSemanticSplitter
# 2. Extraction (Importiamo anche la classe GraphTriple per la ricostruzione dei dati)
from src.extraction.extractor import NeuroSymbolicExtractor, GraphTriple
# 3. Graph Building (Loader & Resolver)
from src.graph.graph_loader import KnowledgeGraphPersister
from src.graph.entity_resolver import EntityResolver
# --- CONFIGURAZIONE INIZIALE ---
load_dotenv()
st.set_page_config(
page_title="Activa Semantic Discovery",
layout="wide",
initial_sidebar_state="expanded",
page_icon="🧠"
)
# --- FUNZIONI DI UTILITÀ NEO4J (Frontend) ---
def get_driver(uri, user, password):
    """Create a Neo4j driver and confirm that the server is reachable.

    Constructing the driver alone does not prove the URI and credentials
    work, so the connection is checked with ``verify_connectivity()`` before
    the driver is handed back. This keeps the caller's "connected" status
    message truthful.

    Args:
        uri: Neo4j connection URI (e.g. ``neo4j+s://...``).
        user: Database user name.
        password: Database password.

    Returns:
        A verified driver instance, or ``None`` when ``uri`` or ``password``
        is empty or the connection could not be established.
    """
    if not uri or not password:
        return None

    try:
        driver = GraphDatabase.driver(uri, auth=(user, password))
    except Exception:
        # Malformed URI / unsupported scheme: callers only need "no driver".
        return None

    try:
        driver.verify_connectivity()
    except Exception:
        # Unreachable server or rejected credentials: release the driver's
        # resources instead of returning a handle that cannot run queries.
        try:
            driver.close()
        except Exception:
            pass
        return None

    return driver
def run_query(driver, query, params=None):
    """Run a Cypher query and return each record as a plain dictionary.

    Args:
        driver: Neo4j driver, or ``None`` when no connection is available.
        query: Cypher statement to execute.
        params: Optional mapping of query parameters.

    Returns:
        A list with ``record.data()`` for every record of the result; an
        empty list when ``driver`` is ``None``.
    """
    rows = []
    if driver is not None:
        # Records are materialised while the session is still open.
        with driver.session() as session:
            for record in session.run(query, params):
                rows.append(record.data())
    return rows
# --- LOGICA FEEDBACK LOOP (HUMAN-IN-THE-LOOP) ---
def reject_relationship(driver, rel_id, subj, pred, obj, reason="Human Rejection"):
    """Delete a rejected relationship and log it for the feedback loop.

    1. Deletes the relationship whose ``elementId`` equals ``rel_id``.
    2. Appends the rejected triple to ``data/processed/rejected_triples.csv``
       (best effort: a logging failure is reported but does not undo the
       deletion).

    Args:
        driver: Neo4j driver used to run the delete query.
        rel_id: ``elementId`` of the relationship to delete.
        subj: Subject shown to the user, written to the log.
        pred: Predicate (relationship type), written to the log.
        obj: Object shown to the user, written to the log.
        reason: Free-text reason stored with the rejection.

    Returns:
        ``True`` when the delete query ran (even if the CSV log failed),
        ``False`` when there is no driver or the delete query raised.
    """
    if driver is None:
        # run_query() returns [] without touching the database when the
        # driver is None, so without this guard a rejection would be logged
        # and reported as successful although nothing was deleted.
        st.error("Database non connesso.")
        return False

    query = "MATCH ()-[r]->() WHERE elementId(r) = $id DELETE r"
    try:
        run_query(driver, query, {"id": rel_id})
    except Exception as e:
        st.error(f"Errore durante la cancellazione: {e}")
        return False

    log_file = "data/processed/rejected_triples.csv"
    try:
        os.makedirs("data/processed", exist_ok=True)
        # A missing *or empty* file still needs the header row.
        needs_header = not os.path.isfile(log_file) or os.path.getsize(log_file) == 0
        with open(log_file, mode='a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            if needs_header:
                writer.writerow(["timestamp", "subject", "predicate", "object", "reason"])
            writer.writerow([datetime.now(), subj, pred, obj, reason])
    except Exception as e:
        # The relationship is already gone; only the audit trail failed.
        st.warning(f"Relazione cancellata dal DB, ma errore nel log CSV: {e}")
    return True
# --- UI: SIDEBAR & CONFIGURAZIONE ---
st.sidebar.title("⚙️ Configurazione")

# Pre-fill the sidebar widgets with whatever the environment already provides.
env_uri = os.getenv("NEO4J_URI", "")
env_user = os.getenv("NEO4J_USER", "neo4j")
env_password = os.getenv("NEO4J_PASSWORD", "")
env_hf_token = os.getenv("HF_TOKEN", "")

st.sidebar.subheader("Backend LLM")
# The token field is a password input, so a value taken from the environment
# is displayed masked; the user may also type one in.
hf_token_input = st.sidebar.text_input(
    "HuggingFace Token (Opzionale per Cloud)",
    value=env_hf_token,
    type="password",
)
if hf_token_input:
    os.environ["HF_TOKEN"] = hf_token_input

st.sidebar.subheader("Connessione Neo4j AuraDB")
uri = st.sidebar.text_input("URI", value=env_uri, placeholder="neo4j+s://...")
user = st.sidebar.text_input("User", value=env_user)
password = st.sidebar.text_input("Password", value=env_password, type="password")

# A connection is attempted only when both URI and password are filled in.
has_credentials = bool(uri and password)
driver = get_driver(uri, user, password) if has_credentials else None
if driver:
    st.sidebar.success("🟢 Connesso al Knowledge Graph")
    # Publish the working credentials through the process environment.
    os.environ.update(
        {
            "NEO4J_URI": uri,
            "NEO4J_USER": user,
            "NEO4J_PASSWORD": password,
        }
    )
elif has_credentials:
    st.sidebar.error("🔴 Errore connessione")
# --- UI: HEADER PRINCIPALE ---
# Page heading shown above the tab bar.
st.title("🧠 Automated Semantic Discovery Prototype")
st.markdown("**Generazione Neuro-Simbolica & Validazione Human-in-the-Loop**")

# --- TAB NAVIGATION ---
# One tab per stage: generation pipeline, validation, graph visualisation.
tab_labels = [
    "⚙️ 1. Generazione (Pipeline)",
    "🔍 2. Validazione (Active Learning)",
    "🕸️ 3. Visualizzazione (Graph)",
]
tab_gen, tab_val, tab_vis = st.tabs(tab_labels)
# ==============================================================================
# TAB 1: GENERAZIONE (PIPELINE DI ESECUZIONE)
# ==============================================================================
with tab_gen:
    # Tab 1: choose a source text, then run the three pipeline steps
    # (A: chunking, B: triple extraction, C: load into Neo4j). Steps hand
    # their results to each other through JSON files in data/processed.
    st.header("Pipeline di Ingestione ed Estrazione")
    col_src, col_act = st.columns([1, 2])

    # 1. File selection
    with col_src:
        st.subheader("Sorgente Dati")
        data_source = st.radio("Modalità:", ("📂 Esempi Demo", "⬆️ Upload (Sperimentale)"))
        selected_file_path = None

        os.makedirs("data/raw", exist_ok=True)
        os.makedirs("data/processed", exist_ok=True)
        os.makedirs("data/examples", exist_ok=True)

        if data_source == "📂 Esempi Demo":
            files = [f for f in os.listdir("data/examples") if f.endswith(".txt")]
            if files:
                choice = st.selectbox("Seleziona scenario:", files)
                if choice:
                    # Copy the chosen example into data/raw, where the
                    # pipeline steps read their input from.
                    src = os.path.join("data/examples", choice)
                    dst = os.path.join("data/raw", choice)
                    shutil.copy(src, dst)
                    selected_file_path = choice
            else:
                st.warning("Nessun file in data/examples")
        else:
            uploaded = st.file_uploader("Carica .txt", type="txt")
            if uploaded:
                # The file name is supplied by the client: keep only its last
                # path component so it cannot escape data/raw.
                safe_name = os.path.basename(uploaded.name.replace("\\", "/"))
                with open(os.path.join("data/raw", safe_name), "wb") as f:
                    f.write(uploaded.getbuffer())
                selected_file_path = safe_name

    # 2. Step execution
    with col_act:
        if selected_file_path:
            st.info(f"File attivo: **{selected_file_path}**")
            c1, c2, c3 = st.columns(3)

            # --- STEP A: CHUNKING ---
            with c1:
                if st.button("A. Semantic Chunking"):
                    with st.status("Analisi vettoriale in corso...", expanded=True) as status:
                        try:
                            # 1. Read the raw text
                            with open(os.path.join("data/raw", selected_file_path), "r", encoding="utf-8") as f:
                                text_content = f.read()

                            # 2. Initialise the splitter (default settings) and process
                            status.write("Caricamento modelli di embedding...")
                            splitter = ActivaSemanticSplitter()
                            status.write("Calcolo distanze coseno...")
                            chunks, dists, threshold = splitter.create_chunks(text_content)

                            # 3. Save the intermediate result for step B
                            chunk_file = "data/processed/chunks.json"
                            with open(chunk_file, "w", encoding="utf-8") as f:
                                json.dump(chunks, f, ensure_ascii=False, indent=2)

                            status.update(label="Chunking Completato!", state="complete", expanded=False)
                            st.success(f"Generati {len(chunks)} frammenti semantici.")

                            # Optional: show the analysis plot if one exists
                            if os.path.exists("chunking_analysis.png"):
                                st.image("chunking_analysis.png", caption="Analisi Coerenza")
                        except Exception as e:
                            status.update(label="Errore Chunking", state="error")
                            st.error(f"Errore: {e}")

            # --- STEP B: EXTRACTION ---
            with c2:
                if st.button("B. Info Extraction"):
                    chunk_file = "data/processed/chunks.json"
                    if not os.path.exists(chunk_file):
                        st.error("Esegui prima il Chunking!")
                    else:
                        with st.status("Estrazione Neuro-Simbolica...", expanded=True) as status:
                            try:
                                # 1. Load the chunks written by step A
                                with open(chunk_file, "r", encoding="utf-8") as f:
                                    chunks = json.load(f)

                                # 2. Initialise the extractor with its default parameters
                                status.write("Inizializzazione LLM (Locale/Cloud)...")
                                extractor = NeuroSymbolicExtractor()
                                all_triples = []
                                progress_bar = st.progress(0)

                                # 3. Extract triples chunk by chunk
                                for i, chunk in enumerate(chunks):
                                    status.write(f"Processando chunk {i+1}/{len(chunks)}...")
                                    result = extractor.extract(chunk, source_id=selected_file_path)
                                    # model_dump() turns each triple into a dict so it can be written as JSON
                                    triples_dicts = [t.model_dump() for t in result.triples]
                                    all_triples.extend(triples_dicts)
                                    progress_bar.progress((i + 1) / len(chunks))

                                # 4. Save the raw triples for step C
                                triples_file = "data/processed/triples_raw.json"
                                with open(triples_file, "w", encoding="utf-8") as f:
                                    json.dump(all_triples, f, ensure_ascii=False, indent=2)

                                status.update(label="Estrazione Completata!", state="complete", expanded=False)
                                st.success(f"Estratte {len(all_triples)} triple candidate.")
                            except Exception as e:
                                status.update(label="Errore Estrazione", state="error")
                                st.error(f"Errore: {e}")

            # --- STEP C: GRAPH BUILDING ---
            with c3:
                if st.button("C. Popola Neo4j", type="primary"):
                    triples_file = "data/processed/triples_raw.json"
                    if not os.path.exists(triples_file):
                        st.error("Esegui prima l'estrazione!")
                    elif not driver:
                        st.error("Connettiti al DB prima!")
                    else:
                        with st.status("Costruzione Grafo...", expanded=True) as status:
                            try:
                                # 1. Load the raw triples written by step B
                                with open(triples_file, "r", encoding="utf-8") as f:
                                    raw_data = json.load(f)
                                # Rebuild GraphTriple objects, which the resolver is given as input
                                triples_objs = [GraphTriple(**t) for t in raw_data]

                                # 2. Entity resolution
                                status.write("Entity Resolution (DBSCAN Clustering)...")
                                resolver = EntityResolver(similarity_threshold=0.85)
                                resolved_triples = resolver.resolve_entities(triples_objs)
                                status.write(f"Entità normalizzate. Triple da inserire: {len(resolved_triples)}")

                                # 3. Persist to Neo4j
                                status.write("Scrittura Batch su Neo4j...")
                                # NOTE(review): the original comment says the persister reads
                                # its credentials from the environment — confirm in graph_loader.
                                persister = KnowledgeGraphPersister()
                                try:
                                    persister.save_triples(resolved_triples)
                                finally:
                                    # Always release the persister, even when the write fails.
                                    persister.close()

                                status.update(label="Grafo Aggiornato!", state="complete", expanded=False)
                                st.success("🚀 Grafo costruito con successo su AuraDB!")
                                st.balloons()
                            except Exception as e:
                                status.update(label="Errore Costruzione", state="error")
                                st.error(f"Errore: {e}")
        else:
            st.write("👈 Seleziona un file per iniziare.")
# ==============================================================================
# TAB 2: VALIDAZIONE (HUMAN IN THE LOOP)
# ==============================================================================
with tab_val:
    # Tab 2: list stored relationships (lowest confidence first) and let the
    # user reject a selected one.
    st.header("Curation & Feedback Loop")
    st.markdown("Validazione delle triple estratte. Le relazioni rifiutate vengono usate per il fine-tuning.")

    if not driver:
        st.warning("Database non connesso.")
    else:
        # Display name falls back from `label` to `name` to the first node
        # label; a missing confidence is shown as 0.85. The query text is
        # kept exactly as in the original.
        cypher_val = """
MATCH (s)-[r]->(o)
RETURN elementId(r) as id,
COALESCE(s.label, s.name, head(labels(s))) as Soggetto,
type(r) as Predicato,
COALESCE(o.label, o.name, head(labels(o))) as Oggetto,
COALESCE(r.confidence, 0.85) as Confidenza
ORDER BY Confidenza ASC LIMIT 50
"""
        triples_data = run_query(driver, cypher_val)

        if not triples_data:
            st.info("Nessuna relazione nel grafo.")
        else:
            df = pd.DataFrame(triples_data)
            st.write(f"Relazioni candidate ({len(df)}):")

            # The relationship id is hidden from the table but kept in `df`
            # so that the selected row can be mapped back to it.
            visible_columns = df.drop(columns=["id"])
            event = st.dataframe(
                visible_columns,
                selection_mode="single-row",
                on_select="rerun",
                use_container_width=True,
                hide_index=True,
            )

            selected_rows = event.selection.rows
            if selected_rows:
                row = df.iloc[selected_rows[0]]
                st.divider()
                col_warn, col_btn = st.columns([3, 1])
                with col_warn:
                    st.warning(f"Rifiutare: **{row['Soggetto']}** --[{row['Predicato']}]--> **{row['Oggetto']}**?")
                with col_btn:
                    if st.button("🗑️ CONFERMA RIFIUTO", type="primary"):
                        success = reject_relationship(
                            driver,
                            row['id'],
                            row['Soggetto'],
                            row['Predicato'],
                            row['Oggetto'],
                        )
                        if success:
                            st.success("Relazione eliminata!")
                            # Rerun so the table is rebuilt from the database.
                            st.rerun()
# ==============================================================================
# TAB 3: VISUALIZZAZIONE (PYVIS)
# ==============================================================================
with tab_vis:
    # Tab 3: render up to 100 relationships as an interactive pyvis network.
    st.header("Esplorazione Topologica")

    if not driver:
        st.warning("Database non connesso.")
    else:
        physics = st.checkbox("Abilita Fisica", value=True)
        # Query text kept exactly as in the original.
        cypher_vis = """
MATCH (s)-[r]->(o)
RETURN COALESCE(s.label, s.name, head(labels(s))) as src,
type(r) as rel,
COALESCE(o.label, o.name, head(labels(o))) as dst
LIMIT 100
"""
        graph_data = run_query(driver, cypher_vis)

        if not graph_data:
            st.info("Grafo vuoto.")
        else:
            net = Network(
                height="600px",
                width="100%",
                bgcolor="#222222",
                font_color="white",
                notebook=False,
            )

            # The display name doubles as the node id; source and target
            # nodes get different colours.
            for edge in graph_data:
                src, rel, dst = (str(edge[key]) for key in ("src", "rel", "dst"))
                net.add_node(src, label=src, color="#4facfe", title=src)
                net.add_node(dst, label=dst, color="#00f2fe", title=dst)
                net.add_edge(src, dst, title=rel, label=rel)

            net.toggle_physics(physics)

            # pyvis writes an HTML file; it is read back and embedded in the page.
            path = "data/processed/graph_viz.html"
            os.makedirs("data/processed", exist_ok=True)
            net.save_graph(path)
            with open(path, 'r', encoding='utf-8') as f:
                html_string = f.read()
            components.html(html_string, height=600, scrolling=True)
# Footer: a horizontal rule followed by a caption line, rendered below the
# tabs on every run of the script.
st.markdown("---")
st.caption("Activa Digital | NextGenTech | Prototipo v1.0")