Update app.py
Browse files
app.py
CHANGED
|
@@ -11,15 +11,12 @@ import streamlit.components.v1 as components
|
|
| 11 |
from dotenv import load_dotenv
|
| 12 |
|
| 13 |
# --- IMPORT MODULI SPECIFICI ---
|
| 14 |
-
# 1. Ingestion
|
| 15 |
from src.ingestion.semantic_splitter import ActivaSemanticSplitter
|
| 16 |
-
# 2. Extraction (Importiamo anche la classe GraphTriple per la ricostruzione dei dati)
|
| 17 |
from src.extraction.extractor import NeuroSymbolicExtractor, GraphTriple
|
| 18 |
-
# 3. Graph Building (Loader & Resolver)
|
| 19 |
from src.graph.graph_loader import KnowledgeGraphPersister
|
| 20 |
from src.graph.entity_resolver import EntityResolver
|
| 21 |
|
| 22 |
-
# --- CONFIGURAZIONE
|
| 23 |
load_dotenv()
|
| 24 |
st.set_page_config(
|
| 25 |
page_title="Activa Semantic Discovery",
|
|
@@ -28,51 +25,73 @@ st.set_page_config(
|
|
| 28 |
page_icon="🧠"
|
| 29 |
)
|
| 30 |
|
| 31 |
-
# ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
def get_driver(uri, user, password):
    """Create a Neo4j driver for the given credentials.

    Args:
        uri: Connection URI; a falsy value short-circuits to ``None``.
        user: Database user name.
        password: Database password; a falsy value short-circuits to ``None``.

    Returns:
        Whatever ``GraphDatabase.driver`` returns, or ``None`` when the URI or
        password is missing or driver creation raises.
    """
    if not (uri and password):
        return None
    try:
        return GraphDatabase.driver(uri, auth=(user, password))
    except Exception:
        # The except clause was incomplete in this view; catch ordinary
        # failures only so KeyboardInterrupt/SystemExit still propagate.
        return None
|
| 39 |
|
| 40 |
def run_query(driver, query, params=None):
    """Execute a Cypher query and return each record as a dict.

    Args:
        driver: Driver object exposing ``session()``, or ``None``.
        query: Query text passed to ``session.run``.
        params: Optional query parameters passed to ``session.run``.

    Returns:
        A list with ``record.data()`` for every record, or an empty list when
        ``driver`` is ``None``.
    """
    if driver is None:
        return []
    with driver.session() as db_session:
        records = db_session.run(query, params)
        # Materialise inside the ``with`` so records are read before the
        # session is closed.
        return [record.data() for record in records]
|
| 46 |
|
| 47 |
-
# ---
|
| 48 |
-
def reject_relationship(driver, rel_id, subj, pred, obj, reason="Human Rejection"):
    """Delete a rejected relationship and append the rejection to a CSV log.

    The relationship whose ``elementId`` equals ``rel_id`` is deleted first.
    The triple is then appended to ``data/processed/rejected_triples.csv``
    (kept for active learning / lineage of rejections); a header row is
    written when the file did not exist beforehand.

    Args:
        driver: Driver handed to ``run_query``.
        rel_id: Element id of the relationship to delete.
        subj: Subject of the rejected triple (logged only).
        pred: Predicate of the rejected triple (logged only).
        obj: Object of the rejected triple (logged only).
        reason: Free-text reason stored in the log.

    Returns:
        ``False`` if the delete query raised; ``True`` otherwise, including
        the case where the delete succeeded but the CSV log failed.
    """
    try:
        run_query(
            driver,
            "MATCH ()-[r]->() WHERE elementId(r) = $id DELETE r",
            {"id": rel_id},
        )
    except Exception as exc:
        st.error(f"Errore durante la cancellazione: {exc}")
        return False

    log_file = "data/processed/rejected_triples.csv"
    os.makedirs("data/processed", exist_ok=True)
    # Checked before opening in append mode, which would create the file.
    needs_header = not os.path.isfile(log_file)

    try:
        with open(log_file, mode="a", newline="", encoding="utf-8") as handle:
            log_writer = csv.writer(handle)
            if needs_header:
                log_writer.writerow(["timestamp", "subject", "predicate", "object", "reason"])
            log_writer.writerow([datetime.now(), subj, pred, obj, reason])
    except Exception as exc:
        # The relationship is already gone; a logging failure is only a warning.
        st.warning(f"Relazione cancellata dal DB, ma errore nel log CSV: {exc}")
    return True
|
| 74 |
-
|
| 75 |
-
# --- UI: SIDEBAR & CONFIGURAZIONE ---
|
| 76 |
st.sidebar.title("⚙️ Configurazione")
|
| 77 |
|
| 78 |
env_uri = os.getenv("NEO4J_URI", "")
|
|
@@ -80,14 +99,12 @@ env_user = os.getenv("NEO4J_USER", "neo4j")
|
|
| 80 |
env_password = os.getenv("NEO4J_PASSWORD", "")
|
| 81 |
env_hf_token = os.getenv("HF_TOKEN", "")
|
| 82 |
|
| 83 |
-
st.sidebar.subheader("Backend
|
| 84 |
-
|
| 85 |
-
hf_token_input
|
| 86 |
-
if hf_token_input:
|
| 87 |
-
os.environ["HF_TOKEN"] = hf_token_input
|
| 88 |
|
| 89 |
-
st.sidebar.subheader("
|
| 90 |
-
uri = st.sidebar.text_input("URI", value=env_uri
|
| 91 |
user = st.sidebar.text_input("User", value=env_user)
|
| 92 |
password = st.sidebar.text_input("Password", value=env_password, type="password")
|
| 93 |
|
|
@@ -95,188 +112,235 @@ driver = None
|
|
| 95 |
if uri and password:
|
| 96 |
driver = get_driver(uri, user, password)
|
| 97 |
if driver:
|
| 98 |
-
st.sidebar.success("🟢 Connesso
|
| 99 |
os.environ["NEO4J_URI"] = uri
|
| 100 |
os.environ["NEO4J_USER"] = user
|
| 101 |
os.environ["NEO4J_PASSWORD"] = password
|
| 102 |
else:
|
| 103 |
st.sidebar.error("🔴 Errore connessione")
|
| 104 |
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
st.title("🧠 Automated Semantic Discovery Prototype")
|
| 107 |
-
st.markdown("**
|
| 108 |
|
| 109 |
-
# --- TAB
|
| 110 |
tab_gen, tab_val, tab_vis = st.tabs([
|
| 111 |
-
"⚙️ 1.
|
| 112 |
-
"🔍 2. Validazione (
|
| 113 |
-
"🕸️ 3.
|
| 114 |
])
|
| 115 |
|
| 116 |
# ==============================================================================
|
| 117 |
-
# TAB 1:
|
| 118 |
# ==============================================================================
|
| 119 |
with tab_gen:
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
|
|
|
| 128 |
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
st.warning("Nessun file in data/examples")
|
| 145 |
else:
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
try:
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
text_content = f.read()
|
| 166 |
-
|
| 167 |
-
# 2. Inizializza Splitter e processa
|
| 168 |
-
status.write("Caricamento modelli di embedding...")
|
| 169 |
-
splitter = ActivaSemanticSplitter() # Usa default huggingface
|
| 170 |
|
| 171 |
-
|
| 172 |
-
chunks, dists, threshold = splitter.create_chunks(text_content)
|
| 173 |
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
with open(chunk_file, "w", encoding="utf-8") as f:
|
| 177 |
-
json.dump(chunks, f, ensure_ascii=False, indent=2)
|
| 178 |
|
| 179 |
-
|
| 180 |
-
|
|
|
|
| 181 |
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
st.image("chunking_analysis.png", caption="Analisi Coerenza")
|
| 185 |
-
|
| 186 |
except Exception as e:
|
| 187 |
-
status.update(label="Errore Chunking", state="error")
|
| 188 |
st.error(f"Errore: {e}")
|
| 189 |
-
|
| 190 |
-
# --- STEP B: EXTRACTION ---
|
| 191 |
-
with c2:
|
| 192 |
-
if st.button("B. Info Extraction"):
|
| 193 |
-
chunk_file = "data/processed/chunks.json"
|
| 194 |
-
if not os.path.exists(chunk_file):
|
| 195 |
-
st.error("Esegui prima il Chunking!")
|
| 196 |
-
else:
|
| 197 |
-
with st.status("Estrazione Neuro-Simbolica...", expanded=True) as status:
|
| 198 |
-
try:
|
| 199 |
-
# 1. Carica Chunks
|
| 200 |
-
with open(chunk_file, "r", encoding="utf-8") as f:
|
| 201 |
-
chunks = json.load(f)
|
| 202 |
-
|
| 203 |
-
# 2. Init Extractor (Rileva HF_TOKEN da env)
|
| 204 |
-
status.write("Inizializzazione LLM (Locale/Cloud)...")
|
| 205 |
-
extractor = NeuroSymbolicExtractor() # Usa default params
|
| 206 |
-
|
| 207 |
-
all_triples = []
|
| 208 |
-
progress_bar = st.progress(0)
|
| 209 |
-
|
| 210 |
-
# 3. Loop su chunk
|
| 211 |
-
for i, chunk in enumerate(chunks):
|
| 212 |
-
status.write(f"Processando chunk {i+1}/{len(chunks)}...")
|
| 213 |
-
result = extractor.extract(chunk, source_id=selected_file_path)
|
| 214 |
-
# Converti oggetti Pydantic in dict per serializzazione JSON
|
| 215 |
-
triples_dicts = [t.model_dump() for t in result.triples]
|
| 216 |
-
all_triples.extend(triples_dicts)
|
| 217 |
-
progress_bar.progress((i + 1) / len(chunks))
|
| 218 |
-
|
| 219 |
-
# 4. Salvataggio Raw Triples
|
| 220 |
-
triples_file = "data/processed/triples_raw.json"
|
| 221 |
-
with open(triples_file, "w", encoding="utf-8") as f:
|
| 222 |
-
json.dump(all_triples, f, ensure_ascii=False, indent=2)
|
| 223 |
-
|
| 224 |
-
status.update(label="Estrazione Completata!", state="complete", expanded=False)
|
| 225 |
-
st.success(f"Estratte {len(all_triples)} triple candidate.")
|
| 226 |
-
|
| 227 |
-
except Exception as e:
|
| 228 |
-
status.update(label="Errore Estrazione", state="error")
|
| 229 |
-
st.error(f"Errore: {e}")
|
| 230 |
-
|
| 231 |
-
# --- STEP C: GRAPH BUILDING ---
|
| 232 |
-
with c3:
|
| 233 |
-
if st.button("C. Popola Neo4j", type="primary"):
|
| 234 |
-
triples_file = "data/processed/triples_raw.json"
|
| 235 |
-
if not os.path.exists(triples_file):
|
| 236 |
-
st.error("Esegui prima l'estrazione!")
|
| 237 |
-
elif not driver:
|
| 238 |
-
st.error("Connettiti al DB prima!")
|
| 239 |
-
else:
|
| 240 |
-
with st.status("Costruzione Grafo...", expanded=True) as status:
|
| 241 |
-
try:
|
| 242 |
-
# 1. Carica Raw Triples
|
| 243 |
-
with open(triples_file, "r", encoding="utf-8") as f:
|
| 244 |
-
raw_data = json.load(f)
|
| 245 |
-
|
| 246 |
-
# Ricostruisci oggetti GraphTriple (necessari per il Resolver)
|
| 247 |
-
triples_objs = [GraphTriple(**t) for t in raw_data]
|
| 248 |
-
|
| 249 |
-
# 2. Entity Resolution
|
| 250 |
-
status.write("Entity Resolution (DBSCAN Clustering)...")
|
| 251 |
-
resolver = EntityResolver(similarity_threshold=0.85)
|
| 252 |
-
resolved_triples = resolver.resolve_entities(triples_objs)
|
| 253 |
-
status.write(f"Entità normalizzate. Triple da inserire: {len(resolved_triples)}")
|
| 254 |
-
|
| 255 |
-
# 3. Persistenza Neo4j
|
| 256 |
-
status.write("Scrittura Batch su Neo4j...")
|
| 257 |
-
persister = KnowledgeGraphPersister() # Prende credenziali da env
|
| 258 |
-
persister.save_triples(resolved_triples)
|
| 259 |
-
persister.close()
|
| 260 |
-
|
| 261 |
-
status.update(label="Grafo Aggiornato!", state="complete", expanded=False)
|
| 262 |
-
st.success("🚀 Grafo costruito con successo su AuraDB!")
|
| 263 |
-
st.balloons()
|
| 264 |
-
|
| 265 |
-
except Exception as e:
|
| 266 |
-
status.update(label="Errore Costruzione", state="error")
|
| 267 |
-
st.error(f"Errore: {e}")
|
| 268 |
-
else:
|
| 269 |
-
st.write("👈 Seleziona un file per iniziare.")
|
| 270 |
|
| 271 |
# ==============================================================================
|
| 272 |
-
# TAB 2: VALIDAZIONE (
|
| 273 |
# ==============================================================================
|
| 274 |
with tab_val:
|
| 275 |
st.header("Curation & Feedback Loop")
|
| 276 |
-
st.markdown("Validazione delle triple estratte. Le relazioni rifiutate vengono usate per il fine-tuning.")
|
| 277 |
-
|
| 278 |
if driver:
|
| 279 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
cypher_val = """
|
| 281 |
MATCH (s)-[r]->(o)
|
| 282 |
RETURN elementId(r) as id,
|
|
@@ -290,77 +354,44 @@ with tab_val:
|
|
| 290 |
|
| 291 |
if triples_data:
|
| 292 |
df = pd.DataFrame(triples_data)
|
| 293 |
-
st.
|
| 294 |
-
|
| 295 |
-
event = st.dataframe(
|
| 296 |
-
df.drop(columns=["id"]),
|
| 297 |
-
selection_mode="single-row",
|
| 298 |
-
on_select="rerun",
|
| 299 |
-
use_container_width=True,
|
| 300 |
-
hide_index=True
|
| 301 |
-
)
|
| 302 |
-
|
| 303 |
-
if len(event.selection.rows) > 0:
|
| 304 |
-
idx = event.selection.rows[0]
|
| 305 |
-
row = df.iloc[idx]
|
| 306 |
-
|
| 307 |
-
st.divider()
|
| 308 |
-
col_warn, col_btn = st.columns([3, 1])
|
| 309 |
-
with col_warn:
|
| 310 |
-
st.warning(f"Rifiutare: **{row['Soggetto']}** --[{row['Predicato']}]--> **{row['Oggetto']}**?")
|
| 311 |
-
with col_btn:
|
| 312 |
-
if st.button("🗑️ CONFERMA RIFIUTO", type="primary"):
|
| 313 |
-
success = reject_relationship(driver, row['id'], row['Soggetto'], row['Predicato'], row['Oggetto'])
|
| 314 |
-
if success:
|
| 315 |
-
st.success("Relazione eliminata!")
|
| 316 |
-
st.rerun()
|
| 317 |
else:
|
| 318 |
-
st.info("
|
| 319 |
else:
|
| 320 |
st.warning("Database non connesso.")
|
| 321 |
|
| 322 |
# ==============================================================================
|
| 323 |
-
# TAB 3: VISUALIZZAZIONE
|
| 324 |
# ==============================================================================
|
| 325 |
with tab_vis:
|
| 326 |
st.header("Esplorazione Topologica")
|
| 327 |
-
|
| 328 |
if driver:
|
| 329 |
-
physics = st.checkbox("Abilita Fisica", value=True)
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
if graph_data:
|
| 340 |
-
net = Network(height="600px", width="100%", bgcolor="#222222", font_color="white", notebook=False)
|
| 341 |
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
|
|
|
|
|
|
|
|
|
| 346 |
|
| 347 |
-
net.
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
with open(path, 'r', encoding='utf-8') as f:
|
| 357 |
-
html_string = f.read()
|
| 358 |
-
components.html(html_string, height=600, scrolling=True)
|
| 359 |
-
else:
|
| 360 |
-
st.info("Grafo vuoto.")
|
| 361 |
else:
|
| 362 |
-
st.warning("Database non connesso.")
|
| 363 |
-
|
| 364 |
-
# Footer
|
| 365 |
-
st.markdown("---")
|
| 366 |
-
st.caption("Activa Digital | NextGenTech | Prototipo v1.0")
|
|
|
|
| 11 |
from dotenv import load_dotenv
|
| 12 |
|
| 13 |
# --- IMPORT MODULI SPECIFICI ---
|
|
|
|
| 14 |
from src.ingestion.semantic_splitter import ActivaSemanticSplitter
|
|
|
|
| 15 |
from src.extraction.extractor import NeuroSymbolicExtractor, GraphTriple
|
|
|
|
| 16 |
from src.graph.graph_loader import KnowledgeGraphPersister
|
| 17 |
from src.graph.entity_resolver import EntityResolver
|
| 18 |
|
| 19 |
+
# --- CONFIGURAZIONE PAGINA ---
|
| 20 |
load_dotenv()
|
| 21 |
st.set_page_config(
|
| 22 |
page_title="Activa Semantic Discovery",
|
|
|
|
| 25 |
page_icon="🧠"
|
| 26 |
)
|
| 27 |
|
| 28 |
+
# --- CSS CUSTOM PER UX MIGLIORATA ---
|
| 29 |
+
# Rende le card più leggibili e stilizza i messaggi di stato
|
| 30 |
+
st.markdown("""
|
| 31 |
+
<style>
|
| 32 |
+
.step-card {
|
| 33 |
+
padding: 20px;
|
| 34 |
+
border-radius: 10px;
|
| 35 |
+
border: 1px solid #e0e0e0;
|
| 36 |
+
margin-bottom: 20px;
|
| 37 |
+
background-color: #262730;
|
| 38 |
+
}
|
| 39 |
+
.step-header {
|
| 40 |
+
font-size: 1.2rem;
|
| 41 |
+
font-weight: bold;
|
| 42 |
+
margin-bottom: 10px;
|
| 43 |
+
color: #4facfe;
|
| 44 |
+
}
|
| 45 |
+
.success-box {
|
| 46 |
+
padding: 10px;
|
| 47 |
+
background-color: rgba(76, 175, 80, 0.1);
|
| 48 |
+
border-left: 5px solid #4CAF50;
|
| 49 |
+
border-radius: 5px;
|
| 50 |
+
}
|
| 51 |
+
</style>
|
| 52 |
+
""", unsafe_allow_html=True)
|
| 53 |
+
|
| 54 |
+
# --- SESSION STATE MANAGEMENT ---
|
| 55 |
+
if 'pipeline_stage' not in st.session_state:
|
| 56 |
+
st.session_state.pipeline_stage = 0 # 0: Init, 1: Chunked, 2: Extracted, 3: Loaded
|
| 57 |
+
if 'current_file' not in st.session_state:
|
| 58 |
+
st.session_state.current_file = None
|
| 59 |
+
|
| 60 |
+
def reset_pipeline():
    """Return the pipeline to its initial state and empty the output folder.

    Sets ``pipeline_stage`` to 0 and ``current_file`` to ``None`` in the
    Streamlit session state, then removes ``data/processed`` if present and
    recreates it empty.
    """
    processed_dir = "data/processed"
    state = st.session_state
    state.pipeline_stage = 0
    state.current_file = None
    # Remove processed files so stale outputs cannot cause inconsistencies.
    if os.path.exists(processed_dir):
        shutil.rmtree(processed_dir)
    os.makedirs(processed_dir, exist_ok=True)
|
| 67 |
+
|
| 68 |
+
# --- CACHING RISORSE ---
|
| 69 |
+
@st.cache_resource
def get_splitter():
    """Return an ``ActivaSemanticSplitter`` built with default arguments.

    Wrapped in ``st.cache_resource`` so the instance is reused by Streamlit.
    """
    splitter = ActivaSemanticSplitter()
    return splitter
|
| 72 |
+
|
| 73 |
+
@st.cache_resource
def get_extractor():
    """Return a ``NeuroSymbolicExtractor`` built with default arguments.

    Wrapped in ``st.cache_resource`` so the instance is reused by Streamlit.
    """
    extractor = NeuroSymbolicExtractor()
    return extractor
|
| 76 |
+
|
| 77 |
+
@st.cache_resource
def get_resolver():
    """Return an ``EntityResolver`` configured with a 0.85 similarity threshold.

    Wrapped in ``st.cache_resource`` so the instance is reused by Streamlit.
    """
    resolver = EntityResolver(similarity_threshold=0.85)
    return resolver
|
| 80 |
+
|
| 81 |
+
# --- FUNZIONI NEO4J ---
|
| 82 |
def get_driver(uri, user, password):
    """Create a Neo4j driver for the given credentials.

    Args:
        uri: Connection URI; a falsy value short-circuits to ``None``.
        user: Database user name.
        password: Database password; a falsy value short-circuits to ``None``.

    Returns:
        Whatever ``GraphDatabase.driver`` returns, or ``None`` when the URI or
        password is missing or driver creation raises.
    """
    if not (uri and password):
        return None
    try:
        return GraphDatabase.driver(uri, auth=(user, password))
    except Exception:
        # Previously a bare ``except:``, which also swallowed
        # KeyboardInterrupt and SystemExit; only ordinary failures should be
        # mapped to None.
        return None
|
|
|
|
| 87 |
|
| 88 |
def run_query(driver, query, params=None):
    """Execute a Cypher query and return each record as a dict.

    Args:
        driver: Driver object exposing ``session()``, or ``None``.
        query: Query text passed to ``session.run``.
        params: Optional query parameters passed to ``session.run``.

    Returns:
        A list with ``record.data()`` for every record, or an empty list when
        ``driver`` is ``None``.
    """
    rows = []
    if driver is not None:
        with driver.session() as db_session:
            # Records are consumed while the session is still open.
            for record in db_session.run(query, params):
                rows.append(record.data())
    return rows
|
| 93 |
|
| 94 |
+
# --- UI: SIDEBAR ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
st.sidebar.title("⚙️ Configurazione")
|
| 96 |
|
| 97 |
env_uri = os.getenv("NEO4J_URI", "")
|
|
|
|
| 99 |
env_password = os.getenv("NEO4J_PASSWORD", "")
|
| 100 |
env_hf_token = os.getenv("HF_TOKEN", "")
|
| 101 |
|
| 102 |
+
st.sidebar.subheader("Backend AI")
|
| 103 |
+
hf_token_input = st.sidebar.text_input("HF Token (Opzionale)", value=env_hf_token, type="password")
|
| 104 |
+
if hf_token_input: os.environ["HF_TOKEN"] = hf_token_input
|
|
|
|
|
|
|
| 105 |
|
| 106 |
+
st.sidebar.subheader("Knowledge Graph")
|
| 107 |
+
uri = st.sidebar.text_input("URI", value=env_uri)
|
| 108 |
user = st.sidebar.text_input("User", value=env_user)
|
| 109 |
password = st.sidebar.text_input("Password", value=env_password, type="password")
|
| 110 |
|
|
|
|
| 112 |
if uri and password:
|
| 113 |
driver = get_driver(uri, user, password)
|
| 114 |
if driver:
|
| 115 |
+
st.sidebar.success("🟢 Connesso a Neo4j")
|
| 116 |
os.environ["NEO4J_URI"] = uri
|
| 117 |
os.environ["NEO4J_USER"] = user
|
| 118 |
os.environ["NEO4J_PASSWORD"] = password
|
| 119 |
else:
|
| 120 |
st.sidebar.error("🔴 Errore connessione")
|
| 121 |
|
| 122 |
+
st.sidebar.divider()
|
| 123 |
+
if st.sidebar.button("🔄 Reset Pipeline", on_click=reset_pipeline):
|
| 124 |
+
st.sidebar.info("Stato resettato.")
|
| 125 |
+
|
| 126 |
+
# --- MAIN HEADER ---
|
| 127 |
st.title("🧠 Automated Semantic Discovery Prototype")
|
| 128 |
+
st.markdown("**Pipeline Sequenziale Neuro-Simbolica**")
|
| 129 |
|
| 130 |
+
# --- TAB LOGIC ---
|
| 131 |
tab_gen, tab_val, tab_vis = st.tabs([
|
| 132 |
+
"⚙️ 1. Pipeline Generativa",
|
| 133 |
+
"🔍 2. Validazione (HITL)",
|
| 134 |
+
"🕸️ 3. Esplorazione Grafo"
|
| 135 |
])
|
| 136 |
|
| 137 |
# ==============================================================================
|
| 138 |
+
# TAB 1: PIPELINE GENERATIVA (STEPPER UI)
|
| 139 |
# ==============================================================================
|
| 140 |
with tab_gen:
|
| 141 |
+
# --- SELEZIONE FILE ---
|
| 142 |
+
st.subheader("1. Sorgente Documentale")
|
| 143 |
+
|
| 144 |
+
col_sel, col_info = st.columns([1, 2])
|
| 145 |
+
with col_sel:
|
| 146 |
+
data_source = st.radio("Modalità:", ("📂 Esempi Demo", "⬆️ Upload"), horizontal=True)
|
| 147 |
+
|
| 148 |
+
selected_file = None
|
| 149 |
+
os.makedirs("data/raw", exist_ok=True)
|
| 150 |
+
os.makedirs("data/processed", exist_ok=True)
|
| 151 |
+
os.makedirs("data/examples", exist_ok=True)
|
| 152 |
+
|
| 153 |
+
if data_source == "📂 Esempi Demo":
|
| 154 |
+
files = [f for f in os.listdir("data/examples") if f.endswith(".txt")]
|
| 155 |
+
if files:
|
| 156 |
+
choice = st.selectbox("Seleziona scenario:", files, index=0)
|
| 157 |
+
if choice:
|
| 158 |
+
src = os.path.join("data/examples", choice)
|
| 159 |
+
dst = os.path.join("data/raw", choice)
|
| 160 |
+
shutil.copy(src, dst)
|
| 161 |
+
selected_file = choice
|
| 162 |
+
else:
|
| 163 |
+
st.warning("Nessun file in data/examples")
|
| 164 |
+
else:
|
| 165 |
+
uploaded = st.file_uploader("Carica .txt", type="txt")
|
| 166 |
+
if uploaded:
|
| 167 |
+
with open(os.path.join("data/raw", uploaded.name), "wb") as f:
|
| 168 |
+
f.write(uploaded.getbuffer())
|
| 169 |
+
selected_file = uploaded.name
|
| 170 |
+
|
| 171 |
+
# Logica di cambio file: se cambia il file, resetta la pipeline
|
| 172 |
+
if selected_file and selected_file != st.session_state.current_file:
|
| 173 |
+
st.session_state.current_file = selected_file
|
| 174 |
+
st.session_state.pipeline_stage = 0
|
| 175 |
+
st.rerun()
|
| 176 |
+
|
| 177 |
+
if not selected_file:
|
| 178 |
+
st.info("👈 Seleziona o carica un file per iniziare.")
|
| 179 |
+
st.stop()
|
| 180 |
+
|
| 181 |
+
st.markdown("---")
|
| 182 |
|
| 183 |
+
# --- PROGRESS BAR ---
|
| 184 |
+
# stage 0 -> 0%, stage 1 -> 33%, stage 2 -> 66%, stage 3 -> 100%
|
| 185 |
+
progress_val = int((st.session_state.pipeline_stage / 3) * 100)
|
| 186 |
+
st.progress(progress_val, text=f"Progresso Pipeline: {progress_val}%")
|
| 187 |
+
|
| 188 |
+
# ==========================
|
| 189 |
+
# FASE A: CHUNKING
|
| 190 |
+
# ==========================
|
| 191 |
+
with st.container():
|
| 192 |
+
st.markdown(f"### {'✅' if st.session_state.pipeline_stage >= 1 else '1️⃣'} Fase A: Semantic Chunking")
|
| 193 |
+
|
| 194 |
+
if st.session_state.pipeline_stage >= 1:
|
| 195 |
+
# Stato Completato: Mostra riassunto
|
| 196 |
+
with open("data/processed/chunks.json", "r") as f:
|
| 197 |
+
chunks = json.load(f)
|
| 198 |
+
st.markdown(f"""
|
| 199 |
+
<div class="success-box">
|
| 200 |
+
<b>Chunking completato!</b> Generati {len(chunks)} frammenti semantici.<br>
|
| 201 |
+
Modello vettoriale utilizzato: <i>MiniLM-L12-v2</i>
|
| 202 |
+
</div>
|
| 203 |
+
""", unsafe_allow_html=True)
|
| 204 |
+
with st.expander("Vedi dettagli frammenti"):
|
| 205 |
+
st.json(chunks[:3]) # Mostra solo i primi 3 per pulizia
|
| 206 |
+
else:
|
| 207 |
+
# Stato Attivo: Bottone azione
|
| 208 |
+
st.markdown("Segmentazione del testo basata sulla coerenza semantica vettoriale.")
|
| 209 |
+
if st.button("Avvia Analisi Semantica", type="primary"):
|
| 210 |
+
with st.spinner("Calcolo vettori e segmentazione..."):
|
| 211 |
+
try:
|
| 212 |
+
with open(os.path.join("data/raw", selected_file), "r", encoding="utf-8") as f:
|
| 213 |
+
text_content = f.read()
|
| 214 |
+
|
| 215 |
+
splitter = get_splitter()
|
| 216 |
+
chunks, dists, threshold = splitter.create_chunks(text_content)
|
| 217 |
+
|
| 218 |
+
with open("data/processed/chunks.json", "w", encoding="utf-8") as f:
|
| 219 |
+
json.dump(chunks, f, ensure_ascii=False, indent=2)
|
| 220 |
+
|
| 221 |
+
st.session_state.pipeline_stage = 1
|
| 222 |
+
st.rerun()
|
| 223 |
+
except Exception as e:
|
| 224 |
+
st.error(f"Errore: {e}")
|
| 225 |
+
|
| 226 |
+
st.markdown("⬇️")
|
| 227 |
+
|
| 228 |
+
# ==========================
|
| 229 |
+
# FASE B: EXTRACTION
|
| 230 |
+
# ==========================
|
| 231 |
+
is_step_b_unlocked = st.session_state.pipeline_stage >= 1
|
| 232 |
|
| 233 |
+
with st.container():
|
| 234 |
+
# Header grigio se bloccato, colorato se attivo
|
| 235 |
+
color = "black" if is_step_b_unlocked else "gray"
|
| 236 |
+
icon = "✅" if st.session_state.pipeline_stage >= 2 else ("2️⃣" if is_step_b_unlocked else "🔒")
|
| 237 |
+
st.markdown(f"<h3 style='color:{color}'>{icon} Fase B: Information Extraction</h3>", unsafe_allow_html=True)
|
| 238 |
|
| 239 |
+
if not is_step_b_unlocked:
|
| 240 |
+
st.caption("Completa la Fase A per sbloccare l'estrazione.")
|
| 241 |
+
|
| 242 |
+
elif st.session_state.pipeline_stage >= 2:
|
| 243 |
+
# Stato Completato
|
| 244 |
+
with open("data/processed/triples_raw.json", "r") as f:
|
| 245 |
+
triples = json.load(f)
|
| 246 |
+
st.markdown(f"""
|
| 247 |
+
<div class="success-box">
|
| 248 |
+
<b>Estrazione completata!</b> Identificate {len(triples)} triple candidate.<br>
|
| 249 |
+
Motore Neuro-Simbolico: <i>Llama3/Mistral + Dependecy Parsing</i>
|
| 250 |
+
</div>
|
| 251 |
+
""", unsafe_allow_html=True)
|
| 252 |
+
with st.expander("Vedi esempio triple"):
|
| 253 |
+
st.dataframe(pd.DataFrame(triples).head(5), hide_index=True)
|
|
|
|
| 254 |
else:
|
| 255 |
+
# Stato Attivo
|
| 256 |
+
st.markdown("Estrazione di Entità e Relazioni tramite approccio Neuro-Simbolico.")
|
| 257 |
+
if st.button("Avvia Estrazione Ontologica", type="primary"):
|
| 258 |
+
with st.spinner("Processando frammenti con LLM..."):
|
| 259 |
+
try:
|
| 260 |
+
with open("data/processed/chunks.json", "r", encoding="utf-8") as f:
|
| 261 |
+
chunks = json.load(f)
|
| 262 |
+
|
| 263 |
+
extractor = get_extractor()
|
| 264 |
+
all_triples = []
|
| 265 |
+
prog_bar = st.progress(0)
|
| 266 |
+
|
| 267 |
+
for i, chunk in enumerate(chunks):
|
| 268 |
+
res = extractor.extract(chunk, source_id=selected_file)
|
| 269 |
+
all_triples.extend([t.model_dump() for t in res.triples])
|
| 270 |
+
prog_bar.progress((i+1)/len(chunks))
|
| 271 |
+
|
| 272 |
+
with open("data/processed/triples_raw.json", "w", encoding="utf-8") as f:
|
| 273 |
+
json.dump(all_triples, f, ensure_ascii=False, indent=2)
|
| 274 |
+
|
| 275 |
+
st.session_state.pipeline_stage = 2
|
| 276 |
+
st.rerun()
|
| 277 |
+
except Exception as e:
|
| 278 |
+
st.error(f"Errore: {e}")
|
| 279 |
+
|
| 280 |
+
st.markdown("⬇️")
|
| 281 |
+
|
| 282 |
+
# ==========================
|
| 283 |
+
# FASE C: GRAPH POPULATION
|
| 284 |
+
# ==========================
|
| 285 |
+
is_step_c_unlocked = st.session_state.pipeline_stage >= 2
|
| 286 |
+
|
| 287 |
+
with st.container():
|
| 288 |
+
color = "black" if is_step_c_unlocked else "gray"
|
| 289 |
+
icon = "✅" if st.session_state.pipeline_stage >= 3 else ("3️⃣" if is_step_c_unlocked else "🔒")
|
| 290 |
+
st.markdown(f"<h3 style='color:{color}'>{icon} Fase C: Graph Construction</h3>", unsafe_allow_html=True)
|
| 291 |
+
|
| 292 |
+
if not is_step_c_unlocked:
|
| 293 |
+
st.caption("Completa la Fase B per popolare il grafo.")
|
| 294 |
+
|
| 295 |
+
elif st.session_state.pipeline_stage >= 3:
|
| 296 |
+
st.markdown("""
|
| 297 |
+
<div class="success-box">
|
| 298 |
+
<b>Grafo Aggiornato!</b> I dati sono stati caricati su Neo4j.<br>
|
| 299 |
+
Puoi esplorarli nei tab "Validazione" e "Visualizzazione".
|
| 300 |
+
</div>
|
| 301 |
+
""", unsafe_allow_html=True)
|
| 302 |
+
st.balloons()
|
| 303 |
+
if st.button("Riavvia con nuovo file"):
|
| 304 |
+
reset_pipeline()
|
| 305 |
+
st.rerun()
|
| 306 |
+
else:
|
| 307 |
+
st.markdown("Entity Resolution (Deduplica) e Caricamento su Neo4j.")
|
| 308 |
+
if not driver:
|
| 309 |
+
st.error("⚠️ Connettiti a Neo4j (nella sidebar) per procedere.")
|
| 310 |
+
else:
|
| 311 |
+
if st.button("Genera Knowledge Graph", type="primary"):
|
| 312 |
+
with st.spinner("Risoluzione entità e scrittura DB..."):
|
| 313 |
try:
|
| 314 |
+
with open("data/processed/triples_raw.json", "r", encoding="utf-8") as f:
|
| 315 |
+
raw_data = json.load(f)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
|
| 317 |
+
triples_objs = [GraphTriple(**t) for t in raw_data]
|
|
|
|
| 318 |
|
| 319 |
+
resolver = get_resolver()
|
| 320 |
+
resolved = resolver.resolve_entities(triples_objs)
|
|
|
|
|
|
|
| 321 |
|
| 322 |
+
persister = KnowledgeGraphPersister()
|
| 323 |
+
persister.save_triples(resolved)
|
| 324 |
+
persister.close()
|
| 325 |
|
| 326 |
+
st.session_state.pipeline_stage = 3
|
| 327 |
+
st.rerun()
|
|
|
|
|
|
|
| 328 |
except Exception as e:
|
|
|
|
| 329 |
st.error(f"Errore: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
|
| 331 |
# ==============================================================================
|
| 332 |
+
# TAB 2: VALIDAZIONE (Codice invariato, solo stile)
|
| 333 |
# ==============================================================================
|
| 334 |
with tab_val:
|
| 335 |
st.header("Curation & Feedback Loop")
|
|
|
|
|
|
|
| 336 |
if driver:
|
| 337 |
+
# Recupera statistiche rapide
|
| 338 |
+
stats = run_query(driver, "MATCH (n) RETURN count(n) as nodes, count{()-->()} as rels")
|
| 339 |
+
if stats:
|
| 340 |
+
c1, c2 = st.columns(2)
|
| 341 |
+
c1.metric("Nodi Totali", stats[0]['nodes'])
|
| 342 |
+
c2.metric("Relazioni", stats[0]['rels'])
|
| 343 |
+
|
| 344 |
cypher_val = """
|
| 345 |
MATCH (s)-[r]->(o)
|
| 346 |
RETURN elementId(r) as id,
|
|
|
|
| 354 |
|
| 355 |
if triples_data:
|
| 356 |
df = pd.DataFrame(triples_data)
|
| 357 |
+
st.dataframe(df.drop(columns=["id"]), use_container_width=True, hide_index=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
else:
|
| 359 |
+
st.info("Grafo vuoto.")
|
| 360 |
else:
|
| 361 |
st.warning("Database non connesso.")
|
| 362 |
|
| 363 |
# ==============================================================================
|
| 364 |
+
# TAB 3: VISUALIZZAZIONE
|
| 365 |
# ==============================================================================
|
| 366 |
with tab_vis:
|
| 367 |
st.header("Esplorazione Topologica")
|
|
|
|
| 368 |
if driver:
|
| 369 |
+
physics = st.checkbox("Abilita Fisica (Gravità)", value=True)
|
| 370 |
+
if st.button("Aggiorna Visualizzazione"):
|
| 371 |
+
cypher_vis = """
|
| 372 |
+
MATCH (s)-[r]->(o)
|
| 373 |
+
RETURN COALESCE(s.label, s.name, head(labels(s))) as src,
|
| 374 |
+
type(r) as rel,
|
| 375 |
+
COALESCE(o.label, o.name, head(labels(o))) as dst
|
| 376 |
+
LIMIT 100
|
| 377 |
+
"""
|
| 378 |
+
graph_data = run_query(driver, cypher_vis)
|
|
|
|
|
|
|
| 379 |
|
| 380 |
+
if graph_data:
|
| 381 |
+
net = Network(height="600px", width="100%", bgcolor="#222222", font_color="white", notebook=False)
|
| 382 |
+
for item in graph_data:
|
| 383 |
+
src, dst, rel = str(item['src']), str(item['dst']), str(item['rel'])
|
| 384 |
+
net.add_node(src, label=src, color="#4facfe", title=src)
|
| 385 |
+
net.add_node(dst, label=dst, color="#00f2fe", title=dst)
|
| 386 |
+
net.add_edge(src, dst, title=rel, label=rel)
|
| 387 |
|
| 388 |
+
net.toggle_physics(physics)
|
| 389 |
+
path = "data/processed/graph_viz.html"
|
| 390 |
+
os.makedirs("data/processed", exist_ok=True)
|
| 391 |
+
net.save_graph(path)
|
| 392 |
+
|
| 393 |
+
with open(path, 'r', encoding='utf-8') as f:
|
| 394 |
+
html_string = f.read()
|
| 395 |
+
components.html(html_string, height=600, scrolling=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 396 |
else:
|
| 397 |
+
st.warning("Database non connesso.")
|
|
|
|
|
|
|
|
|
|
|
|