Spaces:

NextGenTech
/

AutomatedSemanticDiscovery

Sleeping

App Files Files Community

GaetanoParente committed on Dec 31, 2025

Commit

4c86dc7

verified ·

1 Parent(s): 8905ad0

Upload 3 files

Browse files

Files changed (3) hide show

app.py +286 -0
docker-compose.yml +26 -0
main.py +131 -0

app.py ADDED Viewed

	@@ -0,0 +1,286 @@

+import streamlit as st
+import os
+import shutil
+import csv
+import pandas as pd
+from datetime import datetime
+from neo4j import GraphDatabase
+from pyvis.network import Network
+import streamlit.components.v1 as components
+from dotenv import load_dotenv
+# Import moduli interni (assumendo che la struttura src/ esista)
+from src.ingestion import semantic_splitter
+from src.extraction import extractor
+from src.graph import graph_builder
+# --- INITIAL CONFIGURATION ---
+# Load variables from a local .env file (if any) before they are read further down.
+load_dotenv()
+# Page-level Streamlit settings: title, wide layout, expanded sidebar and icon.
+st.set_page_config(
+    page_title="Activa Semantic Discovery",
+    layout="wide",
+    initial_sidebar_state="expanded",
+    page_icon="🧠"
+)
+# --- FUNZIONI DI UTILITÀ NEO4J ---
+def get_driver(uri, user, password):
+    if not uri or not password:
+        return None
+    try:
+        return GraphDatabase.driver(uri, auth=(user, password))
+    except Exception as e:
+        return None
+def run_query(driver, query, params=None):
+    if driver is None:
+        return []
+    with driver.session() as session:
+        result = session.run(query, params)
+        return [r.data() for r in result]
+# --- LOGICA FEEDBACK LOOP (HUMAN-IN-THE-LOOP) ---
+def reject_relationship(driver, rel_id, subj, pred, obj, reason="Human Rejection"):
+    """
+    1. Cancella dal DB (Azione Reale).
+    2. Salva in CSV per Active Learning (Data Lineage del rifiuto).
+    """
+    # 1. Cancellazione Reale
+    query = "MATCH ()-[r]->() WHERE elementId(r) = $id DELETE r"
+    try:
+        run_query(driver, query, {"id": rel_id})
+    except Exception as e:
+        st.error(f"Errore durante la cancellazione: {e}")
+        return False
+    # 2. Logging per Fine-Tuning
+    log_file = "data/processed/rejected_triples.csv"
+    os.makedirs("data/processed", exist_ok=True)
+    file_exists = os.path.isfile(log_file)
+    try:
+        with open(log_file, mode='a', newline='', encoding='utf-8') as f:
+            writer = csv.writer(f)
+            if not file_exists:
+                writer.writerow(["timestamp", "subject", "predicate", "object", "reason"])
+            writer.writerow([datetime.now(), subj, pred, obj, reason])
+        return True
+    except Exception as e:
+        st.warning(f"Relazione cancellata dal DB, ma errore nel log CSV: {e}")
+        return True
+# --- UI: SIDEBAR & CONFIGURATION ---
+st.sidebar.title("⚙️ Configurazione")
+# Credential handling: values from .env take priority, user input is the fallback
+env_uri = os.getenv("NEO4J_URI", "")
+env_user = os.getenv("NEO4J_USER", "neo4j")
+# For safety the password field is left empty unless it is set in the environment
+env_password = os.getenv("NEO4J_PASSWORD", "")
+st.sidebar.subheader("Connessione Neo4j AuraDB")
+uri = st.sidebar.text_input("URI", value=env_uri, placeholder="neo4j+s://...")
+user = st.sidebar.text_input("User", value=env_user)
+password = st.sidebar.text_input("Password", value=env_password, type="password")
+# Initialise the driver (get_driver returns None on failure)
+driver = None
+if uri and password:
+    driver = get_driver(uri, user, password)
+    if driver:
+        st.sidebar.success("🟢 Connesso al Knowledge Graph")
+        # Export the credentials as environment variables for the backend modules that read them
+        os.environ["NEO4J_URI"] = uri
+        os.environ["NEO4J_USER"] = user
+        os.environ["NEO4J_PASSWORD"] = password
+    else:
+        st.sidebar.error("🔴 Errore connessione")
+# --- UI: MAIN HEADER ---
+st.title("🧠 Automated Semantic Discovery Prototype")
+st.markdown("**Generazione Neuro-Simbolica & Validazione Human-in-the-Loop**")
+# --- TAB NAVIGATION ---
+tab_gen, tab_val, tab_vis = st.tabs([
+    "⚙️ 1. Generazione (Pipeline)",
+    "🔍 2. Validazione (Active Learning)",
+    "🕸️ 3. Visualizzazione (Graph)"
+])
+# ==============================================================================
+# TAB 1: GENERAZIONE (PIPELINE DI ESECUZIONE)
+# ==============================================================================
+with tab_gen:
+    st.header("Pipeline di Ingestione ed Estrazione")
+    # Selezione Sorgente Dati (Golden Path)
+    col_src, col_act = st.columns([1, 2])
+    with col_src:
+        st.subheader("Sorgente Dati")
+        data_source = st.radio("Modalità:", ("📂 Esempi Demo", "⬆️ Upload (Sperimentale)"))
+        selected_file_path = None
+        os.makedirs("data/raw", exist_ok=True)
+        os.makedirs("data/examples", exist_ok=True)
+        if data_source == "📂 Esempi Demo":
+            files = [f for f in os.listdir("data/examples") if f.endswith(".txt")]
+            if files:
+                choice = st.selectbox("Seleziona scenario:", files)
+                if choice:
+                    shutil.copy(os.path.join("data/examples", choice), os.path.join("data/raw", choice))
+                    selected_file_path = choice
+            else:
+                st.warning("Nessun file in data/examples")
+        else:
+            uploaded = st.file_uploader("Carica .txt", type="txt")
+            if uploaded:
+                with open(os.path.join("data/raw", uploaded.name), "wb") as f:
+                    f.write(uploaded.getbuffer())
+                selected_file_path = uploaded.name
+    with col_act:
+        if selected_file_path:
+            st.info(f"File attivo: **{selected_file_path}**")
+            c1, c2, c3 = st.columns(3)
+            # Step A: Chunking
+            with c1:
+                if st.button("A. Semantic Chunking"):
+                    with st.spinner('Analisi vettoriale...'):
+                        # semantic_splitter.run(selected_file_path) # Scommenta in prod
+                        st.success("Chunking OK")
+            # Step B: Extraction
+            with c2:
+                if st.button("B. Info Extraction"):
+                    with st.spinner('NLP Extraction...'):
+                        # extractor.run() # Scommenta in prod
+                        st.success("Estrazione OK")
+            # Step C: Graph
+            with c3:
+                if st.button("C. Popola Neo4j", type="primary"):
+                    if not driver:
+                        st.error("Connettiti al DB prima!")
+                    else:
+                        with st.spinner('Scrittura su Neo4j...'):
+                            # graph_builder.run() # Scommenta in prod
+                            st.success("Grafo aggiornato!")
+                            st.balloons()
+        else:
+            st.write("👈 Seleziona un file per iniziare.")
+# ==============================================================================
+# TAB 2: VALIDATION (HUMAN IN THE LOOP)
+# ==============================================================================
+with tab_val:
+    st.header("Curation & Feedback Loop")
+    st.markdown("""
+    In questa sezione, l'esperto di dominio valida le triple estratte.
+    Le relazioni rifiutate vengono rimosse dal grafo e loggate per il **fine-tuning** futuro dei modelli.
+    """)
+    if driver:
+        # Fetch triples (COALESCE covers nodes where 'name' or 'label' is missing)
+        # Note: make sure your nodes carry a 'name' or 'label' property
+        # Relationships without a 'confidence' property are reported as 0.85;
+        # the 50 lowest-confidence rows are returned.
+        cypher_val = """
+        MATCH (s)-[r]->(o)
+        RETURN elementId(r) as id,
+               COALESCE(s.name, s.label, head(labels(s))) as Soggetto,
+               type(r) as Predicato,
+               COALESCE(o.name, o.label, head(labels(o))) as Oggetto,
+               COALESCE(r.confidence, 0.85) as Confidenza
+        ORDER BY Confidenza ASC LIMIT 50
+        """
+        triples_data = run_query(driver, cypher_val)
+        if triples_data:
+            df = pd.DataFrame(triples_data)
+            # Interactive table; the 'id' column is hidden from the reviewer but kept in df
+            st.write(f"Mostrando {len(df)} relazioni candidate (ordinate per confidenza):")
+            event = st.dataframe(
+                df.drop(columns=["id"]),
+                selection_mode="single-row",
+                on_select="rerun",
+                use_container_width=True,
+                hide_index=True
+            )
+            # Rejection logic: acts on the single selected row, if any
+            if len(event.selection.rows) > 0:
+                idx = event.selection.rows[0]
+                row = df.iloc[idx]
+                st.divider()
+                col_warn, col_btn = st.columns([3, 1])
+                with col_warn:
+                    st.warning(f"Vuoi rifiutare la relazione: **{row['Soggetto']}** --[{row['Predicato']}]--> **{row['Oggetto']}**?")
+                with col_btn:
+                    if st.button("🗑️ CONFERMA RIFIUTO", type="primary"):
+                        success = reject_relationship(driver, row['id'], row['Soggetto'], row['Predicato'], row['Oggetto'])
+                        if success:
+                            # NOTE(review): st.rerun() follows immediately, so this
+                            # message is presumably never visible — confirm.
+                            st.success("Relazione eliminata e feedback registrato!")
+                            st.rerun()
+        else:
+            st.info("Nessuna relazione trovata nel grafo.")
+    else:
+        st.warning("Connessione al database non attiva.")
+# ==============================================================================
+# TAB 3: VISUALIZZAZIONE (PYVIS)
+# ==============================================================================
+with tab_vis:
+    st.header("Esplorazione Topologica")
+    if driver:
+        physics = st.checkbox("Abilita Fisica (Gravità)", value=True)
+        # Limita a 100 nodi per performance rendering
+        cypher_vis = """
+        MATCH (s)-[r]->(o)
+        RETURN COALESCE(s.name, head(labels(s))) as src,
+               type(r) as rel,
+               COALESCE(o.name, head(labels(o))) as dst
+        LIMIT 100
+        """
+        graph_data = run_query(driver, cypher_vis)
+        if graph_data:
+            # Creazione Network PyVis
+            net = Network(height="600px", width="100%", bgcolor="#222222", font_color="white", notebook=False)
+            for item in graph_data:
+                # Aggiungi nodi e archi
+                src_lbl = str(item['src'])
+                dst_lbl = str(item['dst'])
+                rel_lbl = str(item['rel'])
+                net.add_node(src_lbl, label=src_lbl, color="#4facfe", title=src_lbl)
+                net.add_node(dst_lbl, label=dst_lbl, color="#00f2fe", title=dst_lbl)
+                net.add_edge(src_lbl, dst_lbl, title=rel_lbl, label=rel_lbl)
+            # Opzioni fisica
+            net.toggle_physics(physics)
+            # Salva e renderizza HTML
+            path = "data/processed/graph_viz.html"
+            os.makedirs("data/processed", exist_ok=True)
+            net.save_graph(path)
+            with open(path, 'r', encoding='utf-8') as f:
+                html_string = f.read()
+            components.html(html_string, height=600, scrolling=True)
+            st.caption(f"Visualizzazione dei primi {len(graph_data)} archi.")
+        else:
+            st.info("Grafo vuoto o impossibile recuperare dati.")
+    else:
+        st.warning("Connessione al database non attiva.")
+# Footer
+st.markdown("---")
+st.caption("Activa Digital | NextGenTech | Prototipo v1.0")

docker-compose.yml ADDED Viewed

	@@ -0,0 +1,26 @@

+services:
+  neo4j:
+    image: neo4j:5.15.0-community
+    container_name: activa_graph_db
+    ports:
+      - "7474:7474" # Browser UI
+      - "7687:7687" # Python Driver
+    environment:
+      # NOTE(review): credentials are written inline here — consider moving them to an env file.
+      - NEO4J_AUTH=neo4j/activa_semantic_lab
+      # Load APOC and GDS automatically
+      - NEO4J_PLUGINS=["apoc", "graph-data-science"]
+      # CRUCIAL CONFIGURATION FOR N10S (Neosemantics)
+      # NOTE(review): n10s is not listed in NEO4J_PLUGINS above; presumably its jar
+      # is expected in the mounted ./neo4j/plugins folder — confirm.
+      - NEO4J_dbms_security_procedures_unrestricted=n10s.*,apoc.*
+      - NEO4J_dbms_security_procedures_allowlist=n10s.*,apoc.*,gds.*
+      # Memory
+      - NEO4J_dbms_memory_heap_initial__size=1G
+      - NEO4J_dbms_memory_heap_max__size=2G
+    volumes:
+      # Map the folders you created in the project root
+      - ./neo4j/data:/data
+      - ./neo4j/plugins:/plugins
+    healthcheck:
+      # Healthy once the HTTP endpoint on port 7474 answers
+      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider localhost:7474 || exit 1"]
+      interval: 10s
+      timeout: 5s
+      retries: 5

main.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import sys
+import os
+import time
+import glob
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+from src.ingestion.semantic_splitter import ActivaSemanticSplitter
+from src.extraction.extractor import NeuroSymbolicExtractor
+from src.validation.validator import SemanticValidator
+from src.graph.graph_loader import KnowledgeGraphPersister
+from src.graph.entity_resolver import EntityResolver
+def pipeline_execution():
+    print("\n🚀 AVVIO PIPELINE AUTOMATED DISCOVERY\n" + "="*50)
+    raw_text = load_raw_documents()
+    if not raw_text:
+        print("⚠️ Nessun file trovato in data/raw/. Uso testo di default.")
+        raw_text = """
+        La Basilica di San Marco a Venezia è il principale luogo di culto della città.
+        È uno degli esempi più noti di architettura italo-bizantina.
+        """
+    # --- FASE 1: INGESTION ---
+    print("\n[FASE 1] Ingestion & Semantic Chunking...")
+    try:
+        # Usa un modello piccolo per lo splitting veloce
+        splitter = ActivaSemanticSplitter(model_name="all-MiniLM-L6-v2")
+        # percentile_threshold=90 significa: taglia solo quando la similarità scende molto
+        chunks, _, _ = splitter.create_chunks(raw_text, percentile_threshold=90)
+        save_chunks_to_processed(chunks)
+        print(f"✅ Testo diviso in {len(chunks)} segmenti semantici.")
+    except Exception as e:
+        print(f"❌ Errore in Fase 1: {e}")
+        return
+    # --- FASE 2: EXTRACTION ---
+    print("\n[FASE 2] Init Neuro-Symbolic Core (Llama 3)...")
+    gold_path = os.path.join("data", "gold_standard", "examples.json")
+    try:
+        # Assicurati che Ollama sia attivo!
+        extractor = NeuroSymbolicExtractor(model_name="llama3", gold_standard_path=gold_path)
+    except Exception as e:
+        print(f"❌ Errore connessione Ollama: {e}")
+        return
+    all_triples = []
+    print(f"🔄 Avvio estrazione su {len(chunks)} chunk...")
+    for i, chunk in enumerate(chunks):
+        chunk_id = f"doc_sample_chunk_{i+1}"
+        print(f"\n   Processing {chunk_id} ({len(chunk)} chars)...")
+        # Invoca Llama 3
+        extraction_result = extractor.extract(chunk, source_id=chunk_id)
+        if extraction_result and extraction_result.triples:
+            count = len(extraction_result.triples)
+            print(f"   -> Estratte {count} triple.")
+            # Aggiungiamo le triple alla lista totale
+            all_triples.extend(extraction_result.triples)
+        else:
+            print("   -> Nessuna tripla trovata (o errore parsing).")
+    print(f"\n✅ Totale triple raccolte: {len(all_triples)}")
+    if not all_triples:
+        print("⚠️ Nessuna tripla da salvare. Pipeline terminata.")
+        return
+    # --- FASE 2.5: SYMBOLIC RESOLUTION & CANONICALIZATION ---
+    # Implementazione Sezione 4.1 del Documento
+    print("\n[FASE 2.5] Entity Resolution & Canonicalization (DBSCAN)...")
+    try:
+        resolver = EntityResolver(similarity_threshold=0.85)
+        # Sovrascriviamo le triple con quelle pulite
+        all_triples = resolver.resolve_entities(all_triples)
+        print("✅ Risoluzione entità completata.")
+    except Exception as e:
+        print(f"⚠️ Errore nel resolver (skip): {e}")
+    print("\n[FASE 2.6] Validazione Semantica (SHACL)...")
+    validator = SemanticValidator()
+    is_valid, report, _ = validator.validate_batch(all_triples)
+    if is_valid:
+        print("✅ Validazione passata. I dati rispettano l'ontologia.")
+    else:
+        print("⚠️  Warning: Rilevate violazioni SHACL.")
+        print("   (In produzione, queste triple verrebbero scartate o mandate in Human Review)")
+        # Per ora procediamo, ma in un sistema reale fermeremmo qui le triple corrotte.
+        print(report)
+    # --- FASE 3: PERSISTENCE ---
+    print("\n[FASE 3] Graph Construction & Persistence (Neo4j)...")
+    try:
+        persister = KnowledgeGraphPersister()
+        persister.save_triples(all_triples)
+        persister.close()
+        print("\n🎉 PIPELINE COMPLETATA CON SUCCESSO!")
+        print("👉 Vai su http://localhost:7474 ed esegui: MATCH (n)-[r]->(m) RETURN n,r,m")
+    except Exception as e:
+        print(f"❌ Errore in Fase 3 (Neo4j): {e}")
+def load_raw_documents(directory="data/raw"):
+    """Legge tutti i file .txt nella cartella raw."""
+    texts = []
+    files = glob.glob(os.path.join(directory, "*.txt"))
+    print(f"📂 Trovati {len(files)} documenti in {directory}")
+    for f_path in files:
+        with open(f_path, 'r', encoding='utf-8') as f:
+            texts.append(f.read())
+    return "\n\n".join(texts)
+def save_chunks_to_processed(chunks, directory="data/processed"):
+    """Salva i chunk su disco per debug."""
+    os.makedirs(directory, exist_ok=True)
+    with open(os.path.join(directory, "chunks_debug.txt"), "w", encoding="utf-8") as f:
+        for i, c in enumerate(chunks):
+            f.write(f"--- CHUNK {i} ---\n{c}\n\n")
+    print(f"💾 Chunk salvati in {directory}/chunks_debug.txt")
+if __name__ == "__main__":
+    start_time = time.time()
+    pipeline_execution()
+    print(f"\n⏱️ Tempo totale esecuzione: {time.time() - start_time:.2f} secondi")