| import streamlit as st |
| import os |
| import shutil |
| import csv |
| import json |
| import pandas as pd |
| from datetime import datetime |
| from neo4j import GraphDatabase |
| from pyvis.network import Network |
| import streamlit.components.v1 as components |
| from dotenv import load_dotenv |
|
|
| |
| |
| from src.ingestion.semantic_splitter import ActivaSemanticSplitter |
| |
| from src.extraction.extractor import NeuroSymbolicExtractor, GraphTriple |
| |
| from src.graph.graph_loader import KnowledgeGraphPersister |
| from src.graph.entity_resolver import EntityResolver |
|
|
| |
# Pull settings from a local .env file (when one exists) into the process
# environment so the os.getenv lookups further down can see them.
load_dotenv()

# Page-wide Streamlit settings, collected in one place.
page_settings = {
    "page_title": "Activa Semantic Discovery",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
    "page_icon": "🧠",
}
st.set_page_config(**page_settings)
|
|
| |
def get_driver(uri, user, password):
    """Create a Neo4j driver and confirm that the server can be reached.

    Args:
        uri: Connection URI (for example ``neo4j+s://...``).
        user: Database user name.
        password: Database password.

    Returns:
        A usable driver, or ``None`` when ``uri`` or ``password`` is empty,
        when the driver cannot be created, or when the connectivity check
        fails.
    """
    import contextlib
    import logging

    if not uri or not password:
        return None

    driver = None
    try:
        driver = GraphDatabase.driver(uri, auth=(user, password))
        # Building the driver does not contact the server, so a wrong URI or
        # password would otherwise go unnoticed until the first query.
        driver.verify_connectivity()
    except Exception:
        # The caller only shows a generic error, so keep the cause in the log.
        logging.getLogger(__name__).exception("Neo4j connection failed")
        if driver is not None:
            # Release the half-initialised driver; a failure while closing
            # must not mask the original connection problem.
            with contextlib.suppress(Exception):
                driver.close()
        return None
    return driver
|
|
def run_query(driver, query, params=None):
    """Run a Cypher query and return every record as a plain dict.

    Args:
        driver: Object exposing ``session()``, or ``None``.
        query: Cypher text to execute.
        params: Optional query parameters passed through to ``session.run``.

    Returns:
        A list holding ``record.data()`` for each result record, or an empty
        list when ``driver`` is ``None``.
    """
    if driver is not None:
        with driver.session() as db_session:
            # Materialise the records while the session is still open.
            return [record.data() for record in db_session.run(query, params)]
    return []
|
|
| |
def reject_relationship(driver, rel_id, subj, pred, obj, reason="Human Rejection",
                        *, log_file="data/processed/rejected_triples.csv"):
    """Delete a relationship from the graph and log the rejection to a CSV file.

    1. Deletes the relationship from the database (the real action).
    2. Appends the rejected triple to a CSV file for active learning
       (data lineage of the rejection).

    The CSV step is best-effort: once the delete has succeeded, a logging
    failure only produces a warning.

    Args:
        driver: Neo4j driver used to run the delete query.
        rel_id: Value matched against ``elementId(r)``.
        subj: Subject label written to the log.
        pred: Predicate (relationship type) written to the log.
        obj: Object label written to the log.
        reason: Free-text reason stored with the log row.
        log_file: Path of the CSV log file.

    Returns:
        ``True`` when the delete query ran without raising (even if the CSV
        log could not be written); ``False`` when there is no driver or the
        delete failed.
    """
    if driver is None:
        # run_query returns [] without a driver, which would otherwise be
        # reported (and logged) as a successful rejection.
        st.error("Database non connesso.")
        return False

    delete_query = "MATCH ()-[r]->() WHERE elementId(r) = $id DELETE r"
    try:
        run_query(driver, delete_query, {"id": rel_id})
    except Exception as e:
        st.error(f"Errore durante la cancellazione: {e}")
        return False

    try:
        # Directory creation sits inside the try so that a filesystem error
        # after the delete degrades to a warning instead of crashing the app.
        log_dir = os.path.dirname(log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        # Write the header for a brand-new file and for an existing empty one.
        needs_header = not os.path.isfile(log_file) or os.path.getsize(log_file) == 0

        with open(log_file, mode='a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            if needs_header:
                writer.writerow(["timestamp", "subject", "predicate", "object", "reason"])
            writer.writerow([datetime.now(), subj, pred, obj, reason])
    except Exception as e:
        st.warning(f"Relazione cancellata dal DB, ma errore nel log CSV: {e}")
    return True
|
|
| |
sidebar = st.sidebar
sidebar.title("⚙️ Configurazione")

# Defaults taken from the environment; they pre-fill the sidebar inputs.
env_uri = os.environ.get("NEO4J_URI", "")
env_user = os.environ.get("NEO4J_USER", "neo4j")
env_password = os.environ.get("NEO4J_PASSWORD", "")
env_hf_token = os.environ.get("HF_TOKEN", "")

sidebar.subheader("Backend LLM")
hf_token_input = sidebar.text_input(
    "HuggingFace Token (Opzionale per Cloud)",
    value=env_hf_token,
    type="password",
)
if hf_token_input:
    # Publish the token through the environment.
    os.environ["HF_TOKEN"] = hf_token_input

sidebar.subheader("Connessione Neo4j AuraDB")
uri = sidebar.text_input("URI", value=env_uri, placeholder="neo4j+s://...")
user = sidebar.text_input("User", value=env_user)
password = sidebar.text_input("Password", value=env_password, type="password")

# A connection is attempted only once both URI and password are filled in;
# until then no status message is shown.
driver = None
if uri and password:
    driver = get_driver(uri, user, password)
    if not driver:
        sidebar.error("🔴 Errore connessione")
    else:
        sidebar.success("🟢 Connesso al Knowledge Graph")
        # Mirror the working credentials into the environment.
        os.environ.update(
            {
                "NEO4J_URI": uri,
                "NEO4J_USER": user,
                "NEO4J_PASSWORD": password,
            }
        )
|
|
| |
st.title("🧠 Automated Semantic Discovery Prototype")
st.markdown("**Generazione Neuro-Simbolica & Validazione Human-in-the-Loop**")

# One tab per stage: generation, validation, visualisation.
tab_labels = [
    "⚙️ 1. Generazione (Pipeline)",
    "🔍 2. Validazione (Active Learning)",
    "🕸️ 3. Visualizzazione (Graph)",
]
tab_gen, tab_val, tab_vis = st.tabs(tab_labels)
|
|
| |
| |
| |
with tab_gen:
    st.header("Pipeline di Ingestione ed Estrazione")

    # Intermediate artifacts handed from one pipeline step to the next.
    chunk_file = "data/processed/chunks.json"
    triples_file = "data/processed/triples_raw.json"

    col_src, col_act = st.columns([1, 2])

    # --- Left column: choose the input document ---
    with col_src:
        st.subheader("Sorgente Dati")
        data_source = st.radio("Modalità:", ("📂 Esempi Demo", "⬆️ Upload (Sperimentale)"))

        selected_file_path = None
        for folder in ("data/raw", "data/processed", "data/examples"):
            os.makedirs(folder, exist_ok=True)

        if data_source == "📂 Esempi Demo":
            files = [f for f in os.listdir("data/examples") if f.endswith(".txt")]
            if files:
                choice = st.selectbox("Seleziona scenario:", files)
                if choice:
                    # Stage the example under data/raw, where step A reads it.
                    src = os.path.join("data/examples", choice)
                    dst = os.path.join("data/raw", choice)
                    shutil.copy(src, dst)
                    selected_file_path = choice
            else:
                st.warning("Nessun file in data/examples")
        else:
            uploaded = st.file_uploader("Carica .txt", type="txt")
            if uploaded:
                # The name is supplied by the client: keep only its last path
                # component so the file cannot be written outside data/raw.
                safe_name = os.path.basename(uploaded.name.replace("\\", "/"))
                if safe_name not in ("", ".", ".."):
                    with open(os.path.join("data/raw", safe_name), "wb") as f:
                        f.write(uploaded.getbuffer())
                    selected_file_path = safe_name

    # --- Right column: the three pipeline steps ---
    with col_act:
        if selected_file_path:
            st.info(f"File attivo: **{selected_file_path}**")
            c1, c2, c3 = st.columns(3)

            # Step A: split the document into chunks.
            with c1:
                if st.button("A. Semantic Chunking"):
                    with st.status("Analisi vettoriale in corso...", expanded=True) as status:
                        try:
                            with open(os.path.join("data/raw", selected_file_path), "r", encoding="utf-8") as f:
                                text_content = f.read()

                            status.write("Caricamento modelli di embedding...")
                            splitter = ActivaSemanticSplitter()

                            status.write("Calcolo distanze coseno...")
                            # Only the chunks are used here; the other two
                            # returned values are ignored.
                            chunks, _dists, _threshold = splitter.create_chunks(text_content)

                            with open(chunk_file, "w", encoding="utf-8") as f:
                                json.dump(chunks, f, ensure_ascii=False, indent=2)

                            status.update(label="Chunking Completato!", state="complete", expanded=False)
                            st.success(f"Generati {len(chunks)} frammenti semantici.")

                            # NOTE(review): presumably this image is written by
                            # the splitter - confirm; a stale file from an
                            # earlier run would be displayed as well.
                            if os.path.exists("chunking_analysis.png"):
                                st.image("chunking_analysis.png", caption="Analisi Coerenza")

                        except Exception as e:
                            status.update(label="Errore Chunking", state="error")
                            st.error(f"Errore: {e}")

            # Step B: extract candidate triples from every chunk.
            with c2:
                if st.button("B. Info Extraction"):
                    if not os.path.exists(chunk_file):
                        st.error("Esegui prima il Chunking!")
                    else:
                        with st.status("Estrazione Neuro-Simbolica...", expanded=True) as status:
                            try:
                                with open(chunk_file, "r", encoding="utf-8") as f:
                                    chunks = json.load(f)

                                status.write("Inizializzazione LLM (Locale/Cloud)...")
                                extractor = NeuroSymbolicExtractor()

                                all_triples = []
                                progress_bar = st.progress(0)

                                for i, chunk in enumerate(chunks):
                                    status.write(f"Processando chunk {i+1}/{len(chunks)}...")
                                    result = extractor.extract(chunk, source_id=selected_file_path)

                                    # Plain dicts so the triples can be dumped as JSON.
                                    all_triples.extend(t.model_dump() for t in result.triples)
                                    progress_bar.progress((i + 1) / len(chunks))

                                with open(triples_file, "w", encoding="utf-8") as f:
                                    json.dump(all_triples, f, ensure_ascii=False, indent=2)

                                status.update(label="Estrazione Completata!", state="complete", expanded=False)
                                st.success(f"Estratte {len(all_triples)} triple candidate.")

                            except Exception as e:
                                status.update(label="Errore Estrazione", state="error")
                                st.error(f"Errore: {e}")

            # Step C: resolve entities and write the triples to Neo4j.
            with c3:
                if st.button("C. Popola Neo4j", type="primary"):
                    if not os.path.exists(triples_file):
                        st.error("Esegui prima l'estrazione!")
                    elif not driver:
                        st.error("Connettiti al DB prima!")
                    else:
                        with st.status("Costruzione Grafo...", expanded=True) as status:
                            try:
                                with open(triples_file, "r", encoding="utf-8") as f:
                                    raw_data = json.load(f)

                                # Rebuild the typed triples from their JSON form.
                                triples_objs = [GraphTriple(**t) for t in raw_data]

                                status.write("Entity Resolution (DBSCAN Clustering)...")
                                resolver = EntityResolver(similarity_threshold=0.85)
                                resolved_triples = resolver.resolve_entities(triples_objs)
                                status.write(f"Entità normalizzate. Triple da inserire: {len(resolved_triples)}")

                                status.write("Scrittura Batch su Neo4j...")
                                persister = KnowledgeGraphPersister()
                                try:
                                    persister.save_triples(resolved_triples)
                                finally:
                                    # Close the persister even when the write fails.
                                    persister.close()

                                status.update(label="Grafo Aggiornato!", state="complete", expanded=False)
                                st.success("🚀 Grafo costruito con successo su AuraDB!")
                                st.balloons()

                            except Exception as e:
                                status.update(label="Errore Costruzione", state="error")
                                st.error(f"Errore: {e}")
        else:
            st.write("👈 Seleziona un file per iniziare.")
|
|
| |
| |
| |
with tab_val:
    st.header("Curation & Feedback Loop")
    st.markdown("Validazione delle triple estratte. Le relazioni rifiutate vengono usate per il fine-tuning.")

    if not driver:
        st.warning("Database non connesso.")
    else:
        # Up to 50 relationships, lowest confidence first; relationships
        # without a confidence property are treated as 0.85.
        cypher_val = """
        MATCH (s)-[r]->(o)
        RETURN elementId(r) as id,
        COALESCE(s.label, s.name, head(labels(s))) as Soggetto,
        type(r) as Predicato,
        COALESCE(o.label, o.name, head(labels(o))) as Oggetto,
        COALESCE(r.confidence, 0.85) as Confidenza
        ORDER BY Confidenza ASC LIMIT 50
        """
        triples_data = run_query(driver, cypher_val)

        if not triples_data:
            st.info("Nessuna relazione nel grafo.")
        else:
            df = pd.DataFrame(triples_data)
            st.write(f"Relazioni candidate ({len(df)}):")

            # The relationship id stays in df but is hidden from the table.
            event = st.dataframe(
                df.drop(columns=["id"]),
                selection_mode="single-row",
                on_select="rerun",
                use_container_width=True,
                hide_index=True
            )

            picked_rows = event.selection.rows
            if picked_rows:
                # Positions in the displayed table match positions in df.
                row = df.iloc[picked_rows[0]]

                st.divider()
                col_warn, col_btn = st.columns([3, 1])
                with col_warn:
                    st.warning(f"Rifiutare: **{row['Soggetto']}** --[{row['Predicato']}]--> **{row['Oggetto']}**?")
                with col_btn:
                    if st.button("🗑️ CONFERMA RIFIUTO", type="primary"):
                        if reject_relationship(driver, row['id'], row['Soggetto'], row['Predicato'], row['Oggetto']):
                            st.success("Relazione eliminata!")
                            # Rerun so the table is rebuilt without the deleted row.
                            st.rerun()
|
|
| |
| |
| |
with tab_vis:
    st.header("Esplorazione Topologica")

    if not driver:
        st.warning("Database non connesso.")
    else:
        physics = st.checkbox("Abilita Fisica", value=True)
        # At most 100 relationships are drawn.
        cypher_vis = """
        MATCH (s)-[r]->(o)
        RETURN COALESCE(s.label, s.name, head(labels(s))) as src,
        type(r) as rel,
        COALESCE(o.label, o.name, head(labels(o))) as dst
        LIMIT 100
        """
        graph_data = run_query(driver, cypher_vis)

        if not graph_data:
            st.info("Grafo vuoto.")
        else:
            net = Network(height="600px", width="100%", bgcolor="#222222", font_color="white", notebook=False)

            for edge in graph_data:
                # Node ids and labels are the string form of the returned values.
                source_name = str(edge['src'])
                target_name = str(edge['dst'])
                rel_name = str(edge['rel'])

                net.add_node(source_name, label=source_name, color="#4facfe", title=source_name)
                net.add_node(target_name, label=target_name, color="#00f2fe", title=target_name)
                net.add_edge(source_name, target_name, title=rel_name, label=rel_name)

            net.toggle_physics(physics)

            # Render to an HTML file, then embed that file's content in the page.
            os.makedirs("data/processed", exist_ok=True)
            path = "data/processed/graph_viz.html"
            net.save_graph(path)

            with open(path, 'r', encoding='utf-8') as f:
                html_string = f.read()
            components.html(html_string, height=600, scrolling=True)

# Page footer, shown below the tabs.
st.markdown("---")
st.caption("Activa Digital | NextGenTech | Prototipo v1.0")