Upload 3 files
Browse files- app.py +286 -0
- docker-compose.yml +26 -0
- main.py +131 -0
app.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import os
|
| 3 |
+
import shutil
|
| 4 |
+
import csv
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from neo4j import GraphDatabase
|
| 8 |
+
from pyvis.network import Network
|
| 9 |
+
import streamlit.components.v1 as components
|
| 10 |
+
from dotenv import load_dotenv
|
| 11 |
+
|
| 12 |
+
# Import moduli interni (assumendo che la struttura src/ esista)
|
| 13 |
+
from src.ingestion import semantic_splitter
|
| 14 |
+
from src.extraction import extractor
|
| 15 |
+
from src.graph import graph_builder
|
| 16 |
+
|
| 17 |
+
# --- CONFIGURAZIONE INIZIALE ---
|
| 18 |
+
load_dotenv()
|
| 19 |
+
st.set_page_config(
|
| 20 |
+
page_title="Activa Semantic Discovery",
|
| 21 |
+
layout="wide",
|
| 22 |
+
initial_sidebar_state="expanded",
|
| 23 |
+
page_icon="🧠"
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
# --- FUNZIONI DI UTILITÀ NEO4J ---
|
| 27 |
+
def get_driver(uri, user, password):
|
| 28 |
+
if not uri or not password:
|
| 29 |
+
return None
|
| 30 |
+
try:
|
| 31 |
+
return GraphDatabase.driver(uri, auth=(user, password))
|
| 32 |
+
except Exception as e:
|
| 33 |
+
return None
|
| 34 |
+
|
| 35 |
+
def run_query(driver, query, params=None):
|
| 36 |
+
if driver is None:
|
| 37 |
+
return []
|
| 38 |
+
with driver.session() as session:
|
| 39 |
+
result = session.run(query, params)
|
| 40 |
+
return [r.data() for r in result]
|
| 41 |
+
|
| 42 |
+
# --- LOGICA FEEDBACK LOOP (HUMAN-IN-THE-LOOP) ---
|
| 43 |
+
def reject_relationship(driver, rel_id, subj, pred, obj, reason="Human Rejection"):
|
| 44 |
+
"""
|
| 45 |
+
1. Cancella dal DB (Azione Reale).
|
| 46 |
+
2. Salva in CSV per Active Learning (Data Lineage del rifiuto).
|
| 47 |
+
"""
|
| 48 |
+
# 1. Cancellazione Reale
|
| 49 |
+
query = "MATCH ()-[r]->() WHERE elementId(r) = $id DELETE r"
|
| 50 |
+
try:
|
| 51 |
+
run_query(driver, query, {"id": rel_id})
|
| 52 |
+
except Exception as e:
|
| 53 |
+
st.error(f"Errore durante la cancellazione: {e}")
|
| 54 |
+
return False
|
| 55 |
+
|
| 56 |
+
# 2. Logging per Fine-Tuning
|
| 57 |
+
log_file = "data/processed/rejected_triples.csv"
|
| 58 |
+
os.makedirs("data/processed", exist_ok=True)
|
| 59 |
+
file_exists = os.path.isfile(log_file)
|
| 60 |
+
|
| 61 |
+
try:
|
| 62 |
+
with open(log_file, mode='a', newline='', encoding='utf-8') as f:
|
| 63 |
+
writer = csv.writer(f)
|
| 64 |
+
if not file_exists:
|
| 65 |
+
writer.writerow(["timestamp", "subject", "predicate", "object", "reason"])
|
| 66 |
+
writer.writerow([datetime.now(), subj, pred, obj, reason])
|
| 67 |
+
return True
|
| 68 |
+
except Exception as e:
|
| 69 |
+
st.warning(f"Relazione cancellata dal DB, ma errore nel log CSV: {e}")
|
| 70 |
+
return True
|
| 71 |
+
|
| 72 |
+
# --- UI: SIDEBAR & CONFIGURAZIONE ---
|
| 73 |
+
st.sidebar.title("⚙️ Configurazione")
|
| 74 |
+
|
| 75 |
+
# Gestione credenziali: Priorità a .env, fallback su input utente
|
| 76 |
+
env_uri = os.getenv("NEO4J_URI", "")
|
| 77 |
+
env_user = os.getenv("NEO4J_USER", "neo4j")
|
| 78 |
+
# Non pre-compiliamo la password se non è nell'env per sicurezza
|
| 79 |
+
env_password = os.getenv("NEO4J_PASSWORD", "")
|
| 80 |
+
|
| 81 |
+
st.sidebar.subheader("Connessione Neo4j AuraDB")
|
| 82 |
+
uri = st.sidebar.text_input("URI", value=env_uri, placeholder="neo4j+s://...")
|
| 83 |
+
user = st.sidebar.text_input("User", value=env_user)
|
| 84 |
+
password = st.sidebar.text_input("Password", value=env_password, type="password")
|
| 85 |
+
|
| 86 |
+
# Inizializza Driver
|
| 87 |
+
driver = None
|
| 88 |
+
if uri and password:
|
| 89 |
+
driver = get_driver(uri, user, password)
|
| 90 |
+
if driver:
|
| 91 |
+
st.sidebar.success("🟢 Connesso al Knowledge Graph")
|
| 92 |
+
# Setta variabili d'ambiente per i moduli backend che le usano
|
| 93 |
+
os.environ["NEO4J_URI"] = uri
|
| 94 |
+
os.environ["NEO4J_USER"] = user
|
| 95 |
+
os.environ["NEO4J_PASSWORD"] = password
|
| 96 |
+
else:
|
| 97 |
+
st.sidebar.error("🔴 Errore connessione")
|
| 98 |
+
|
| 99 |
+
# --- UI: HEADER PRINCIPALE ---
|
| 100 |
+
st.title("🧠 Automated Semantic Discovery Prototype")
|
| 101 |
+
st.markdown("**Generazione Neuro-Simbolica & Validazione Human-in-the-Loop**")
|
| 102 |
+
|
| 103 |
+
# --- TAB NAVIGAZIONE ---
|
| 104 |
+
tab_gen, tab_val, tab_vis = st.tabs([
|
| 105 |
+
"⚙️ 1. Generazione (Pipeline)",
|
| 106 |
+
"🔍 2. Validazione (Active Learning)",
|
| 107 |
+
"🕸️ 3. Visualizzazione (Graph)"
|
| 108 |
+
])
|
| 109 |
+
|
| 110 |
+
# ==============================================================================
|
| 111 |
+
# TAB 1: GENERAZIONE (PIPELINE DI ESECUZIONE)
|
| 112 |
+
# ==============================================================================
|
| 113 |
+
with tab_gen:
|
| 114 |
+
st.header("Pipeline di Ingestione ed Estrazione")
|
| 115 |
+
|
| 116 |
+
# Selezione Sorgente Dati (Golden Path)
|
| 117 |
+
col_src, col_act = st.columns([1, 2])
|
| 118 |
+
|
| 119 |
+
with col_src:
|
| 120 |
+
st.subheader("Sorgente Dati")
|
| 121 |
+
data_source = st.radio("Modalità:", ("📂 Esempi Demo", "⬆️ Upload (Sperimentale)"))
|
| 122 |
+
|
| 123 |
+
selected_file_path = None
|
| 124 |
+
os.makedirs("data/raw", exist_ok=True)
|
| 125 |
+
os.makedirs("data/examples", exist_ok=True)
|
| 126 |
+
|
| 127 |
+
if data_source == "📂 Esempi Demo":
|
| 128 |
+
files = [f for f in os.listdir("data/examples") if f.endswith(".txt")]
|
| 129 |
+
if files:
|
| 130 |
+
choice = st.selectbox("Seleziona scenario:", files)
|
| 131 |
+
if choice:
|
| 132 |
+
shutil.copy(os.path.join("data/examples", choice), os.path.join("data/raw", choice))
|
| 133 |
+
selected_file_path = choice
|
| 134 |
+
else:
|
| 135 |
+
st.warning("Nessun file in data/examples")
|
| 136 |
+
else:
|
| 137 |
+
uploaded = st.file_uploader("Carica .txt", type="txt")
|
| 138 |
+
if uploaded:
|
| 139 |
+
with open(os.path.join("data/raw", uploaded.name), "wb") as f:
|
| 140 |
+
f.write(uploaded.getbuffer())
|
| 141 |
+
selected_file_path = uploaded.name
|
| 142 |
+
|
| 143 |
+
with col_act:
|
| 144 |
+
if selected_file_path:
|
| 145 |
+
st.info(f"File attivo: **{selected_file_path}**")
|
| 146 |
+
c1, c2, c3 = st.columns(3)
|
| 147 |
+
|
| 148 |
+
# Step A: Chunking
|
| 149 |
+
with c1:
|
| 150 |
+
if st.button("A. Semantic Chunking"):
|
| 151 |
+
with st.spinner('Analisi vettoriale...'):
|
| 152 |
+
# semantic_splitter.run(selected_file_path) # Scommenta in prod
|
| 153 |
+
st.success("Chunking OK")
|
| 154 |
+
|
| 155 |
+
# Step B: Extraction
|
| 156 |
+
with c2:
|
| 157 |
+
if st.button("B. Info Extraction"):
|
| 158 |
+
with st.spinner('NLP Extraction...'):
|
| 159 |
+
# extractor.run() # Scommenta in prod
|
| 160 |
+
st.success("Estrazione OK")
|
| 161 |
+
|
| 162 |
+
# Step C: Graph
|
| 163 |
+
with c3:
|
| 164 |
+
if st.button("C. Popola Neo4j", type="primary"):
|
| 165 |
+
if not driver:
|
| 166 |
+
st.error("Connettiti al DB prima!")
|
| 167 |
+
else:
|
| 168 |
+
with st.spinner('Scrittura su Neo4j...'):
|
| 169 |
+
# graph_builder.run() # Scommenta in prod
|
| 170 |
+
st.success("Grafo aggiornato!")
|
| 171 |
+
st.balloons()
|
| 172 |
+
else:
|
| 173 |
+
st.write("👈 Seleziona un file per iniziare.")
|
| 174 |
+
|
| 175 |
+
# ==============================================================================
|
| 176 |
+
# TAB 2: VALIDAZIONE (HUMAN IN THE LOOP)
|
| 177 |
+
# ==============================================================================
|
| 178 |
+
with tab_val:
|
| 179 |
+
st.header("Curation & Feedback Loop")
|
| 180 |
+
st.markdown("""
|
| 181 |
+
In questa sezione, l'esperto di dominio valida le triple estratte.
|
| 182 |
+
Le relazioni rifiutate vengono rimosse dal grafo e loggate per il **fine-tuning** futuro dei modelli.
|
| 183 |
+
""")
|
| 184 |
+
|
| 185 |
+
if driver:
|
| 186 |
+
# Recupera triple (COALESCE gestisce casi in cui 'name' o 'label' mancano)
|
| 187 |
+
# Nota: Assicurati che i tuoi nodi abbiano una proprietà 'name' o 'label'
|
| 188 |
+
cypher_val = """
|
| 189 |
+
MATCH (s)-[r]->(o)
|
| 190 |
+
RETURN elementId(r) as id,
|
| 191 |
+
COALESCE(s.name, s.label, head(labels(s))) as Soggetto,
|
| 192 |
+
type(r) as Predicato,
|
| 193 |
+
COALESCE(o.name, o.label, head(labels(o))) as Oggetto,
|
| 194 |
+
COALESCE(r.confidence, 0.85) as Confidenza
|
| 195 |
+
ORDER BY Confidenza ASC LIMIT 50
|
| 196 |
+
"""
|
| 197 |
+
triples_data = run_query(driver, cypher_val)
|
| 198 |
+
|
| 199 |
+
if triples_data:
|
| 200 |
+
df = pd.DataFrame(triples_data)
|
| 201 |
+
|
| 202 |
+
# Tabella interattiva
|
| 203 |
+
st.write(f"Mostrando {len(df)} relazioni candidate (ordinate per confidenza):")
|
| 204 |
+
|
| 205 |
+
event = st.dataframe(
|
| 206 |
+
df.drop(columns=["id"]),
|
| 207 |
+
selection_mode="single-row",
|
| 208 |
+
on_select="rerun",
|
| 209 |
+
use_container_width=True,
|
| 210 |
+
hide_index=True
|
| 211 |
+
)
|
| 212 |
+
|
| 213 |
+
# Logica di rifiuto
|
| 214 |
+
if len(event.selection.rows) > 0:
|
| 215 |
+
idx = event.selection.rows[0]
|
| 216 |
+
row = df.iloc[idx]
|
| 217 |
+
|
| 218 |
+
st.divider()
|
| 219 |
+
col_warn, col_btn = st.columns([3, 1])
|
| 220 |
+
with col_warn:
|
| 221 |
+
st.warning(f"Vuoi rifiutare la relazione: **{row['Soggetto']}** --[{row['Predicato']}]--> **{row['Oggetto']}**?")
|
| 222 |
+
with col_btn:
|
| 223 |
+
if st.button("🗑️ CONFERMA RIFIUTO", type="primary"):
|
| 224 |
+
success = reject_relationship(driver, row['id'], row['Soggetto'], row['Predicato'], row['Oggetto'])
|
| 225 |
+
if success:
|
| 226 |
+
st.success("Relazione eliminata e feedback registrato!")
|
| 227 |
+
st.rerun()
|
| 228 |
+
else:
|
| 229 |
+
st.info("Nessuna relazione trovata nel grafo.")
|
| 230 |
+
else:
|
| 231 |
+
st.warning("Connessione al database non attiva.")
|
| 232 |
+
|
| 233 |
+
# ==============================================================================
|
| 234 |
+
# TAB 3: VISUALIZZAZIONE (PYVIS)
|
| 235 |
+
# ==============================================================================
|
| 236 |
+
with tab_vis:
|
| 237 |
+
st.header("Esplorazione Topologica")
|
| 238 |
+
|
| 239 |
+
if driver:
|
| 240 |
+
physics = st.checkbox("Abilita Fisica (Gravità)", value=True)
|
| 241 |
+
|
| 242 |
+
# Limita a 100 nodi per performance rendering
|
| 243 |
+
cypher_vis = """
|
| 244 |
+
MATCH (s)-[r]->(o)
|
| 245 |
+
RETURN COALESCE(s.name, head(labels(s))) as src,
|
| 246 |
+
type(r) as rel,
|
| 247 |
+
COALESCE(o.name, head(labels(o))) as dst
|
| 248 |
+
LIMIT 100
|
| 249 |
+
"""
|
| 250 |
+
graph_data = run_query(driver, cypher_vis)
|
| 251 |
+
|
| 252 |
+
if graph_data:
|
| 253 |
+
# Creazione Network PyVis
|
| 254 |
+
net = Network(height="600px", width="100%", bgcolor="#222222", font_color="white", notebook=False)
|
| 255 |
+
|
| 256 |
+
for item in graph_data:
|
| 257 |
+
# Aggiungi nodi e archi
|
| 258 |
+
src_lbl = str(item['src'])
|
| 259 |
+
dst_lbl = str(item['dst'])
|
| 260 |
+
rel_lbl = str(item['rel'])
|
| 261 |
+
|
| 262 |
+
net.add_node(src_lbl, label=src_lbl, color="#4facfe", title=src_lbl)
|
| 263 |
+
net.add_node(dst_lbl, label=dst_lbl, color="#00f2fe", title=dst_lbl)
|
| 264 |
+
net.add_edge(src_lbl, dst_lbl, title=rel_lbl, label=rel_lbl)
|
| 265 |
+
|
| 266 |
+
# Opzioni fisica
|
| 267 |
+
net.toggle_physics(physics)
|
| 268 |
+
|
| 269 |
+
# Salva e renderizza HTML
|
| 270 |
+
path = "data/processed/graph_viz.html"
|
| 271 |
+
os.makedirs("data/processed", exist_ok=True)
|
| 272 |
+
net.save_graph(path)
|
| 273 |
+
|
| 274 |
+
with open(path, 'r', encoding='utf-8') as f:
|
| 275 |
+
html_string = f.read()
|
| 276 |
+
components.html(html_string, height=600, scrolling=True)
|
| 277 |
+
|
| 278 |
+
st.caption(f"Visualizzazione dei primi {len(graph_data)} archi.")
|
| 279 |
+
else:
|
| 280 |
+
st.info("Grafo vuoto o impossibile recuperare dati.")
|
| 281 |
+
else:
|
| 282 |
+
st.warning("Connessione al database non attiva.")
|
| 283 |
+
|
| 284 |
+
# Footer
|
| 285 |
+
st.markdown("---")
|
| 286 |
+
st.caption("Activa Digital | NextGenTech | Prototipo v1.0")
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
services:
|
| 2 |
+
neo4j:
|
| 3 |
+
image: neo4j:5.15.0-community
|
| 4 |
+
container_name: activa_graph_db
|
| 5 |
+
ports:
|
| 6 |
+
- "7474:7474" # Browser UI
|
| 7 |
+
- "7687:7687" # Python Driver
|
| 8 |
+
environment:
|
| 9 |
+
- NEO4J_AUTH=neo4j/activa_semantic_lab
|
| 10 |
+
# Carica APOC e GDS automaticamente
|
| 11 |
+
- NEO4J_PLUGINS=["apoc", "graph-data-science"]
|
| 12 |
+
# CONFIGURAZIONE CRUCIALE PER N10S (Neosemantics)
|
| 13 |
+
- NEO4J_dbms_security_procedures_unrestricted=n10s.*,apoc.*
|
| 14 |
+
- NEO4J_dbms_security_procedures_allowlist=n10s.*,apoc.*,gds.*
|
| 15 |
+
# Memoria
|
| 16 |
+
- NEO4J_dbms_memory_heap_initial__size=1G
|
| 17 |
+
- NEO4J_dbms_memory_heap_max__size=2G
|
| 18 |
+
volumes:
|
| 19 |
+
# Mappa le cartelle che hai creato tu nella root
|
| 20 |
+
- ./neo4j/data:/data
|
| 21 |
+
- ./neo4j/plugins:/plugins
|
| 22 |
+
healthcheck:
|
| 23 |
+
test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider localhost:7474 || exit 1"]
|
| 24 |
+
interval: 10s
|
| 25 |
+
timeout: 5s
|
| 26 |
+
retries: 5
|
main.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
import time
|
| 4 |
+
import glob
|
| 5 |
+
|
| 6 |
+
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
| 7 |
+
|
| 8 |
+
from src.ingestion.semantic_splitter import ActivaSemanticSplitter
|
| 9 |
+
from src.extraction.extractor import NeuroSymbolicExtractor
|
| 10 |
+
from src.validation.validator import SemanticValidator
|
| 11 |
+
from src.graph.graph_loader import KnowledgeGraphPersister
|
| 12 |
+
from src.graph.entity_resolver import EntityResolver
|
| 13 |
+
|
| 14 |
+
def pipeline_execution():
|
| 15 |
+
print("\n🚀 AVVIO PIPELINE AUTOMATED DISCOVERY\n" + "="*50)
|
| 16 |
+
|
| 17 |
+
raw_text = load_raw_documents()
|
| 18 |
+
|
| 19 |
+
if not raw_text:
|
| 20 |
+
print("⚠️ Nessun file trovato in data/raw/. Uso testo di default.")
|
| 21 |
+
raw_text = """
|
| 22 |
+
La Basilica di San Marco a Venezia è il principale luogo di culto della città.
|
| 23 |
+
È uno degli esempi più noti di architettura italo-bizantina.
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
# --- FASE 1: INGESTION ---
|
| 27 |
+
print("\n[FASE 1] Ingestion & Semantic Chunking...")
|
| 28 |
+
try:
|
| 29 |
+
# Usa un modello piccolo per lo splitting veloce
|
| 30 |
+
splitter = ActivaSemanticSplitter(model_name="all-MiniLM-L6-v2")
|
| 31 |
+
# percentile_threshold=90 significa: taglia solo quando la similarità scende molto
|
| 32 |
+
chunks, _, _ = splitter.create_chunks(raw_text, percentile_threshold=90)
|
| 33 |
+
save_chunks_to_processed(chunks)
|
| 34 |
+
print(f"✅ Testo diviso in {len(chunks)} segmenti semantici.")
|
| 35 |
+
except Exception as e:
|
| 36 |
+
print(f"❌ Errore in Fase 1: {e}")
|
| 37 |
+
return
|
| 38 |
+
|
| 39 |
+
# --- FASE 2: EXTRACTION ---
|
| 40 |
+
print("\n[FASE 2] Init Neuro-Symbolic Core (Llama 3)...")
|
| 41 |
+
|
| 42 |
+
gold_path = os.path.join("data", "gold_standard", "examples.json")
|
| 43 |
+
|
| 44 |
+
try:
|
| 45 |
+
# Assicurati che Ollama sia attivo!
|
| 46 |
+
extractor = NeuroSymbolicExtractor(model_name="llama3", gold_standard_path=gold_path)
|
| 47 |
+
except Exception as e:
|
| 48 |
+
print(f"❌ Errore connessione Ollama: {e}")
|
| 49 |
+
return
|
| 50 |
+
|
| 51 |
+
all_triples = []
|
| 52 |
+
|
| 53 |
+
print(f"🔄 Avvio estrazione su {len(chunks)} chunk...")
|
| 54 |
+
for i, chunk in enumerate(chunks):
|
| 55 |
+
chunk_id = f"doc_sample_chunk_{i+1}"
|
| 56 |
+
|
| 57 |
+
print(f"\n Processing {chunk_id} ({len(chunk)} chars)...")
|
| 58 |
+
|
| 59 |
+
# Invoca Llama 3
|
| 60 |
+
extraction_result = extractor.extract(chunk, source_id=chunk_id)
|
| 61 |
+
|
| 62 |
+
if extraction_result and extraction_result.triples:
|
| 63 |
+
count = len(extraction_result.triples)
|
| 64 |
+
print(f" -> Estratte {count} triple.")
|
| 65 |
+
# Aggiungiamo le triple alla lista totale
|
| 66 |
+
all_triples.extend(extraction_result.triples)
|
| 67 |
+
else:
|
| 68 |
+
print(" -> Nessuna tripla trovata (o errore parsing).")
|
| 69 |
+
|
| 70 |
+
print(f"\n✅ Totale triple raccolte: {len(all_triples)}")
|
| 71 |
+
|
| 72 |
+
if not all_triples:
|
| 73 |
+
print("⚠️ Nessuna tripla da salvare. Pipeline terminata.")
|
| 74 |
+
return
|
| 75 |
+
|
| 76 |
+
# --- FASE 2.5: SYMBOLIC RESOLUTION & CANONICALIZATION ---
|
| 77 |
+
# Implementazione Sezione 4.1 del Documento
|
| 78 |
+
print("\n[FASE 2.5] Entity Resolution & Canonicalization (DBSCAN)...")
|
| 79 |
+
try:
|
| 80 |
+
resolver = EntityResolver(similarity_threshold=0.85)
|
| 81 |
+
# Sovrascriviamo le triple con quelle pulite
|
| 82 |
+
all_triples = resolver.resolve_entities(all_triples)
|
| 83 |
+
print("✅ Risoluzione entità completata.")
|
| 84 |
+
except Exception as e:
|
| 85 |
+
print(f"⚠️ Errore nel resolver (skip): {e}")
|
| 86 |
+
|
| 87 |
+
print("\n[FASE 2.6] Validazione Semantica (SHACL)...")
|
| 88 |
+
validator = SemanticValidator()
|
| 89 |
+
is_valid, report, _ = validator.validate_batch(all_triples)
|
| 90 |
+
|
| 91 |
+
if is_valid:
|
| 92 |
+
print("✅ Validazione passata. I dati rispettano l'ontologia.")
|
| 93 |
+
else:
|
| 94 |
+
print("⚠️ Warning: Rilevate violazioni SHACL.")
|
| 95 |
+
print(" (In produzione, queste triple verrebbero scartate o mandate in Human Review)")
|
| 96 |
+
# Per ora procediamo, ma in un sistema reale fermeremmo qui le triple corrotte.
|
| 97 |
+
print(report)
|
| 98 |
+
|
| 99 |
+
# --- FASE 3: PERSISTENCE ---
|
| 100 |
+
print("\n[FASE 3] Graph Construction & Persistence (Neo4j)...")
|
| 101 |
+
try:
|
| 102 |
+
persister = KnowledgeGraphPersister()
|
| 103 |
+
persister.save_triples(all_triples)
|
| 104 |
+
persister.close()
|
| 105 |
+
print("\n🎉 PIPELINE COMPLETATA CON SUCCESSO!")
|
| 106 |
+
print("👉 Vai su http://localhost:7474 ed esegui: MATCH (n)-[r]->(m) RETURN n,r,m")
|
| 107 |
+
except Exception as e:
|
| 108 |
+
print(f"❌ Errore in Fase 3 (Neo4j): {e}")
|
| 109 |
+
|
| 110 |
+
def load_raw_documents(directory="data/raw"):
|
| 111 |
+
"""Legge tutti i file .txt nella cartella raw."""
|
| 112 |
+
texts = []
|
| 113 |
+
files = glob.glob(os.path.join(directory, "*.txt"))
|
| 114 |
+
print(f"📂 Trovati {len(files)} documenti in {directory}")
|
| 115 |
+
for f_path in files:
|
| 116 |
+
with open(f_path, 'r', encoding='utf-8') as f:
|
| 117 |
+
texts.append(f.read())
|
| 118 |
+
return "\n\n".join(texts)
|
| 119 |
+
|
| 120 |
+
def save_chunks_to_processed(chunks, directory="data/processed"):
|
| 121 |
+
"""Salva i chunk su disco per debug."""
|
| 122 |
+
os.makedirs(directory, exist_ok=True)
|
| 123 |
+
with open(os.path.join(directory, "chunks_debug.txt"), "w", encoding="utf-8") as f:
|
| 124 |
+
for i, c in enumerate(chunks):
|
| 125 |
+
f.write(f"--- CHUNK {i} ---\n{c}\n\n")
|
| 126 |
+
print(f"💾 Chunk salvati in {directory}/chunks_debug.txt")
|
| 127 |
+
|
| 128 |
+
if __name__ == "__main__":
|
| 129 |
+
start_time = time.time()
|
| 130 |
+
pipeline_execution()
|
| 131 |
+
print(f"\n⏱️ Tempo totale esecuzione: {time.time() - start_time:.2f} secondi")
|