Spaces:

NextGenTech
/

AutomatedSemanticDiscovery

Sleeping

App Files Files Community

AutomatedSemanticDiscovery / app.py

GaetanoParente

Update app.py

9bcfc23 verified 5 months ago

raw

history blame

16.5 kB

	import streamlit as st
	import os
	import shutil
	import csv
	import json
	import pandas as pd
	from datetime import datetime
	from neo4j import GraphDatabase
	from pyvis.network import Network
	import streamlit.components.v1 as components
	from dotenv import load_dotenv

	# --- IMPORT MODULI SPECIFICI ---
	# 1. Ingestion
	from src.ingestion.semantic_splitter import ActivaSemanticSplitter
	# 2. Extraction (Importiamo anche la classe GraphTriple per la ricostruzione dei dati)
	from src.extraction.extractor import NeuroSymbolicExtractor, GraphTriple
	# 3. Graph Building (Loader & Resolver)
	from src.graph.graph_loader import KnowledgeGraphPersister
	from src.graph.entity_resolver import EntityResolver

	# --- INITIAL CONFIGURATION ---
	# Load a local .env file (if any) before the os.getenv lookups further down.
	load_dotenv()

	# Page-level Streamlit settings, gathered in one mapping and applied at once.
	page_settings = {
	    "page_title": "Activa Semantic Discovery",
	    "page_icon": "🧠",
	    "layout": "wide",
	    "initial_sidebar_state": "expanded",
	}
	st.set_page_config(**page_settings)

	# --- FUNZIONI DI UTILITÀ NEO4J (Frontend) ---
	def get_driver(uri, user, password):
	    """Return a verified Neo4j driver, or None when no connection is possible.

	    Args:
	        uri: Connection URI; an empty/None value short-circuits to None.
	        user: Neo4j user name.
	        password: Neo4j password; an empty/None value short-circuits to None.

	    Returns:
	        A driver whose connectivity has been checked, or None if the
	        parameters are incomplete or the connection attempt fails.
	    """
	    if not uri or not password:
	        return None

	    driver = None
	    try:
	        driver = GraphDatabase.driver(uri, auth=(user, password))
	        # Creating the driver does not contact the server; without this
	        # round-trip wrong credentials or an unreachable host would still be
	        # reported to the user as a successful connection.
	        driver.verify_connectivity()
	    except Exception:
	        # UI boundary: any failure (bad URI, auth, network) means "not connected".
	        if driver is not None:
	            try:
	                driver.close()
	            except Exception:
	                # Best-effort cleanup; the connection is unusable either way.
	                pass
	        return None
	    return driver

	def run_query(driver, query, params=None):
	    """Run a Cypher query and return each result record as a plain dict.

	    Args:
	        driver: Neo4j driver, or None when the database is not connected.
	        query: Cypher text to execute.
	        params: Optional mapping of query parameters.

	    Returns:
	        A list with one ``record.data()`` dict per record; an empty list
	        when ``driver`` is None.
	    """
	    if driver is None:
	        return []

	    rows = []
	    with driver.session() as db_session:
	        # Records are consumed while the session is still open.
	        for record in db_session.run(query, params):
	            rows.append(record.data())
	    return rows

	# --- LOGICA FEEDBACK LOOP (HUMAN-IN-THE-LOOP) ---
	def reject_relationship(driver, rel_id, subj, pred, obj, reason="Human Rejection"):
	    """Delete a relationship from the graph and log the rejection to CSV.

	    1. Deletes the relationship whose elementId equals ``rel_id``.
	    2. Appends the rejected triple to ``data/processed/rejected_triples.csv``
	       (a header row is written when the file does not exist yet).

	    Args:
	        driver: Neo4j driver passed through to ``run_query``.
	        rel_id: elementId of the relationship to delete.
	        subj: Subject label, logged only.
	        pred: Predicate (relationship type), logged only.
	        obj: Object label, logged only.
	        reason: Free-text reason stored with the log row.

	    Returns:
	        False if the delete query raised; True otherwise, including when the
	        delete ran but the CSV log could not be written (a warning is shown).
	    """
	    delete_query = "MATCH ()-[r]->() WHERE elementId(r) = $id DELETE r"
	    try:
	        run_query(driver, delete_query, {"id": rel_id})
	    except Exception as e:
	        st.error(f"Errore durante la cancellazione: {e}")
	        return False

	    log_file = "data/processed/rejected_triples.csv"
	    try:
	        # Directory creation and the existence probe live inside the try so
	        # that a filesystem error after the delete ends in the warning below
	        # rather than an unhandled exception.
	        os.makedirs("data/processed", exist_ok=True)
	        file_exists = os.path.isfile(log_file)
	        with open(log_file, mode='a', newline='', encoding='utf-8') as f:
	            writer = csv.writer(f)
	            if not file_exists:
	                writer.writerow(["timestamp", "subject", "predicate", "object", "reason"])
	            writer.writerow([datetime.now(), subj, pred, obj, reason])
	    except Exception as e:
	        # Logging is best-effort: the delete query has already been executed.
	        st.warning(f"Relazione cancellata dal DB, ma errore nel log CSV: {e}")
	    return True

	# --- UI: SIDEBAR & CONFIGURATION ---
	st.sidebar.title("⚙️ Configurazione")

	# Defaults for the input widgets come from the process environment.
	env_uri = os.environ.get("NEO4J_URI", "")
	env_user = os.environ.get("NEO4J_USER", "neo4j")
	env_password = os.environ.get("NEO4J_PASSWORD", "")
	env_hf_token = os.environ.get("HF_TOKEN", "")

	st.sidebar.subheader("Backend LLM")
	# The token field is pre-filled (masked) from the environment when a value
	# is present; otherwise the user may type one in.
	hf_token_input = st.sidebar.text_input(
	    "HuggingFace Token (Opzionale per Cloud)",
	    value=env_hf_token,
	    type="password",
	)
	if hf_token_input:
	    os.environ["HF_TOKEN"] = hf_token_input

	st.sidebar.subheader("Connessione Neo4j AuraDB")
	uri = st.sidebar.text_input("URI", value=env_uri, placeholder="neo4j+s://...")
	user = st.sidebar.text_input("User", value=env_user)
	password = st.sidebar.text_input("Password", value=env_password, type="password")

	# A connection is attempted only when both URI and password are provided.
	driver = None
	if uri and password:
	    driver = get_driver(uri, user, password)
	    if not driver:
	        st.sidebar.error("🔴 Errore connessione")
	    else:
	        st.sidebar.success("🟢 Connesso al Knowledge Graph")
	        # Publish the working credentials through the environment as well.
	        os.environ.update({
	            "NEO4J_URI": uri,
	            "NEO4J_USER": user,
	            "NEO4J_PASSWORD": password,
	        })

	# --- UI: MAIN HEADER ---
	st.title("🧠 Automated Semantic Discovery Prototype")
	st.markdown("Generazione Neuro-Simbolica & Validazione Human-in-the-Loop")

	# --- NAVIGATION TABS ---
	# One tab per stage: generation, validation, visualisation.
	tab_labels = [
	    "⚙️ 1. Generazione (Pipeline)",
	    "🔍 2. Validazione (Active Learning)",
	    "🕸️ 3. Visualizzazione (Graph)",
	]
	tab_gen, tab_val, tab_vis = st.tabs(tab_labels)

	# ==============================================================================
	# TAB 1: GENERATION (EXECUTION PIPELINE)
	# ==============================================================================
	with tab_gen:
	    st.header("Pipeline di Ingestione ed Estrazione")

	    col_src, col_act = st.columns([1, 2])

	    # 1. File selection
	    with col_src:
	        st.subheader("Sorgente Dati")
	        data_source = st.radio("Modalità:", ("📂 Esempi Demo", "⬆️ Upload (Sperimentale)"))

	        selected_file_path = None
	        for folder in ("data/raw", "data/processed", "data/examples"):
	            os.makedirs(folder, exist_ok=True)

	        if data_source == "📂 Esempi Demo":
	            files = [f for f in os.listdir("data/examples") if f.endswith(".txt")]
	            if files:
	                choice = st.selectbox("Seleziona scenario:", files)
	                if choice:
	                    # Copy the chosen example into data/raw, where the steps below read it.
	                    src = os.path.join("data/examples", choice)
	                    dst = os.path.join("data/raw", choice)
	                    shutil.copy(src, dst)
	                    selected_file_path = choice
	            else:
	                st.warning("Nessun file in data/examples")
	        else:
	            uploaded = st.file_uploader("Carica .txt", type="txt")
	            if uploaded:
	                # The file name is supplied by the client: keep only its last
	                # path component so the write cannot land outside data/raw.
	                safe_name = os.path.basename(uploaded.name)
	                if safe_name not in ("", ".", ".."):
	                    with open(os.path.join("data/raw", safe_name), "wb") as f:
	                        f.write(uploaded.getbuffer())
	                    selected_file_path = safe_name

	    # 2. Step execution
	    with col_act:
	        if selected_file_path:
	            st.info(f"File attivo: {selected_file_path}")
	            c1, c2, c3 = st.columns(3)

	            # --- STEP A: CHUNKING ---
	            with c1:
	                if st.button("A. Semantic Chunking"):
	                    with st.status("Analisi vettoriale in corso...", expanded=True) as status:
	                        try:
	                            # 1. Read the raw text
	                            with open(os.path.join("data/raw", selected_file_path), "r", encoding="utf-8") as f:
	                                text_content = f.read()

	                            # 2. Build the splitter (default constructor arguments) and process
	                            status.write("Caricamento modelli di embedding...")
	                            splitter = ActivaSemanticSplitter()

	                            status.write("Calcolo distanze coseno...")
	                            # Only the chunks are used here; the other two return values are ignored.
	                            chunks, _, _ = splitter.create_chunks(text_content)

	                            # 3. Intermediate save, consumed by step B
	                            chunk_file = "data/processed/chunks.json"
	                            with open(chunk_file, "w", encoding="utf-8") as f:
	                                json.dump(chunks, f, ensure_ascii=False, indent=2)

	                            status.update(label="Chunking Completato!", state="complete", expanded=False)
	                            st.success(f"Generati {len(chunks)} frammenti semantici.")

	                            # Optional: show the analysis plot if one exists on disk
	                            if os.path.exists("chunking_analysis.png"):
	                                st.image("chunking_analysis.png", caption="Analisi Coerenza")

	                        except Exception as e:
	                            status.update(label="Errore Chunking", state="error")
	                            st.error(f"Errore: {e}")

	            # --- STEP B: EXTRACTION ---
	            with c2:
	                if st.button("B. Info Extraction"):
	                    chunk_file = "data/processed/chunks.json"
	                    if not os.path.exists(chunk_file):
	                        st.error("Esegui prima il Chunking!")
	                    else:
	                        with st.status("Estrazione Neuro-Simbolica...", expanded=True) as status:
	                            try:
	                                # 1. Load the chunks written by step A
	                                with open(chunk_file, "r", encoding="utf-8") as f:
	                                    chunks = json.load(f)

	                                # 2. Build the extractor with default parameters
	                                # (presumably picks up HF_TOKEN from the environment — confirm in the extractor).
	                                status.write("Inizializzazione LLM (Locale/Cloud)...")
	                                extractor = NeuroSymbolicExtractor()

	                                all_triples = []
	                                progress_bar = st.progress(0)

	                                # 3. Process the chunks one at a time
	                                for i, chunk in enumerate(chunks):
	                                    status.write(f"Processando chunk {i+1}/{len(chunks)}...")
	                                    result = extractor.extract(chunk, source_id=selected_file_path)
	                                    # Turn each triple into a dict so it can be JSON-serialised
	                                    triples_dicts = [t.model_dump() for t in result.triples]
	                                    all_triples.extend(triples_dicts)
	                                    progress_bar.progress((i + 1) / len(chunks))

	                                # 4. Save the raw triples, consumed by step C
	                                triples_file = "data/processed/triples_raw.json"
	                                with open(triples_file, "w", encoding="utf-8") as f:
	                                    json.dump(all_triples, f, ensure_ascii=False, indent=2)

	                                status.update(label="Estrazione Completata!", state="complete", expanded=False)
	                                st.success(f"Estratte {len(all_triples)} triple candidate.")

	                            except Exception as e:
	                                status.update(label="Errore Estrazione", state="error")
	                                st.error(f"Errore: {e}")

	            # --- STEP C: GRAPH BUILDING ---
	            with c3:
	                if st.button("C. Popola Neo4j", type="primary"):
	                    triples_file = "data/processed/triples_raw.json"
	                    if not os.path.exists(triples_file):
	                        st.error("Esegui prima l'estrazione!")
	                    elif not driver:
	                        st.error("Connettiti al DB prima!")
	                    else:
	                        with st.status("Costruzione Grafo...", expanded=True) as status:
	                            try:
	                                # 1. Load the raw triples
	                                with open(triples_file, "r", encoding="utf-8") as f:
	                                    raw_data = json.load(f)

	                                # Rebuild GraphTriple objects, which the resolver takes as input
	                                triples_objs = [GraphTriple(**t) for t in raw_data]

	                                # 2. Entity resolution
	                                status.write("Entity Resolution (DBSCAN Clustering)...")
	                                resolver = EntityResolver(similarity_threshold=0.85)
	                                resolved_triples = resolver.resolve_entities(triples_objs)
	                                status.write(f"Entità normalizzate. Triple da inserire: {len(resolved_triples)}")

	                                # 3. Persist to Neo4j
	                                status.write("Scrittura Batch su Neo4j...")
	                                # No credentials are passed here; presumably read from the
	                                # NEO4J_* environment variables — confirm in the persister.
	                                persister = KnowledgeGraphPersister()
	                                try:
	                                    persister.save_triples(resolved_triples)
	                                finally:
	                                    # Always release the persister, even when the write fails.
	                                    persister.close()

	                                status.update(label="Grafo Aggiornato!", state="complete", expanded=False)
	                                st.success("🚀 Grafo costruito con successo su AuraDB!")
	                                st.balloons()

	                            except Exception as e:
	                                status.update(label="Errore Costruzione", state="error")
	                                st.error(f"Errore: {e}")
	        else:
	            st.write("👈 Seleziona un file per iniziare.")

	# ==============================================================================
	# TAB 2: VALIDATION (HUMAN IN THE LOOP)
	# ==============================================================================
	with tab_val:
	    st.header("Curation & Feedback Loop")
	    st.markdown("Validazione delle triple estratte. Le relazioni rifiutate vengono usate per il fine-tuning.")

	    if not driver:
	        st.warning("Database non connesso.")
	    else:
	        # Up to 50 relationships, lowest confidence first; a missing
	        # confidence property is reported as 0.85.
	        cypher_val = """
	MATCH (s)-[r]->(o)
	RETURN elementId(r) as id,
	COALESCE(s.label, s.name, head(labels(s))) as Soggetto,
	type(r) as Predicato,
	COALESCE(o.label, o.name, head(labels(o))) as Oggetto,
	COALESCE(r.confidence, 0.85) as Confidenza
	ORDER BY Confidenza ASC LIMIT 50
	"""
	        triples_data = run_query(driver, cypher_val)

	        if not triples_data:
	            st.info("Nessuna relazione nel grafo.")
	        else:
	            df = pd.DataFrame(triples_data)
	            st.write(f"Relazioni candidate ({len(df)}):")

	            # The relationship id stays in df but is hidden from the table.
	            visible_columns = df.drop(columns=["id"])
	            event = st.dataframe(
	                visible_columns,
	                hide_index=True,
	                use_container_width=True,
	                selection_mode="single-row",
	                on_select="rerun",
	            )

	            selected_rows = event.selection.rows
	            if selected_rows:
	                # Positional lookup in df recovers the hidden id of the selected row.
	                row = df.iloc[selected_rows[0]]

	                st.divider()
	                col_warn, col_btn = st.columns([3, 1])
	                with col_warn:
	                    st.warning(f"Rifiutare: {row['Soggetto']} --[{row['Predicato']}]--> {row['Oggetto']}?")
	                with col_btn:
	                    if st.button("🗑️ CONFERMA RIFIUTO", type="primary"):
	                        if reject_relationship(driver, row['id'], row['Soggetto'], row['Predicato'], row['Oggetto']):
	                            st.success("Relazione eliminata!")
	                            st.rerun()

	# ==============================================================================
	# TAB 3: VISUALISATION (PYVIS)
	# ==============================================================================
	with tab_vis:
	    st.header("Esplorazione Topologica")

	    if not driver:
	        st.warning("Database non connesso.")
	    else:
	        physics = st.checkbox("Abilita Fisica", value=True)
	        # At most 100 relationships are fetched for display.
	        cypher_vis = """
	MATCH (s)-[r]->(o)
	RETURN COALESCE(s.label, s.name, head(labels(s))) as src,
	type(r) as rel,
	COALESCE(o.label, o.name, head(labels(o))) as dst
	LIMIT 100
	"""
	        graph_data = run_query(driver, cypher_vis)

	        if not graph_data:
	            st.info("Grafo vuoto.")
	        else:
	            net = Network(height="600px", width="100%", bgcolor="#222222", font_color="white", notebook=False)

	            for edge_row in graph_data:
	                source_label = str(edge_row['src'])
	                target_label = str(edge_row['dst'])
	                relation = str(edge_row['rel'])

	                # The display label doubles as the node id.
	                net.add_node(source_label, label=source_label, color="#4facfe", title=source_label)
	                net.add_node(target_label, label=target_label, color="#00f2fe", title=target_label)
	                net.add_edge(source_label, target_label, title=relation, label=relation)

	            net.toggle_physics(physics)

	            # Render to an HTML file, then embed that file's contents in the page.
	            os.makedirs("data/processed", exist_ok=True)
	            path = "data/processed/graph_viz.html"
	            net.save_graph(path)

	            with open(path, 'r', encoding='utf-8') as html_file:
	                html_string = html_file.read()
	            components.html(html_string, height=600, scrolling=True)

	# Footer
	st.markdown("---")
	# Plain pipes: a backslash before "|" is not a valid Python escape sequence
	# and makes the interpreter emit a warning; a bare "|" needs no escaping here.
	st.caption("Activa Digital | NextGenTech | Prototipo v1.0")