GaetanoParente commited on
Commit
24213b8
·
verified ·
1 Parent(s): c4f394e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +291 -260
app.py CHANGED
@@ -11,15 +11,12 @@ import streamlit.components.v1 as components
11
  from dotenv import load_dotenv
12
 
13
  # --- IMPORT MODULI SPECIFICI ---
14
- # 1. Ingestion
15
  from src.ingestion.semantic_splitter import ActivaSemanticSplitter
16
- # 2. Extraction (Importiamo anche la classe GraphTriple per la ricostruzione dei dati)
17
  from src.extraction.extractor import NeuroSymbolicExtractor, GraphTriple
18
- # 3. Graph Building (Loader & Resolver)
19
  from src.graph.graph_loader import KnowledgeGraphPersister
20
  from src.graph.entity_resolver import EntityResolver
21
 
22
- # --- CONFIGURAZIONE INIZIALE ---
23
  load_dotenv()
24
  st.set_page_config(
25
  page_title="Activa Semantic Discovery",
@@ -28,51 +25,73 @@ st.set_page_config(
28
  page_icon="🧠"
29
  )
30
 
31
- # --- FUNZIONI DI UTILITÀ NEO4J (Frontend) ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  def get_driver(uri, user, password):
33
- if not uri or not password:
34
- return None
35
  try:
36
  return GraphDatabase.driver(uri, auth=(user, password))
37
- except Exception as e:
38
- return None
39
 
40
  def run_query(driver, query, params=None):
41
- if driver is None:
42
- return []
43
  with driver.session() as session:
44
  result = session.run(query, params)
45
  return [r.data() for r in result]
46
 
47
- # --- LOGICA FEEDBACK LOOP (HUMAN-IN-THE-LOOP) ---
48
- def reject_relationship(driver, rel_id, subj, pred, obj, reason="Human Rejection"):
49
- """
50
- 1. Cancella dal DB (Azione Reale).
51
- 2. Salva in CSV per Active Learning (Data Lineage del rifiuto).
52
- """
53
- query = "MATCH ()-[r]->() WHERE elementId(r) = $id DELETE r"
54
- try:
55
- run_query(driver, query, {"id": rel_id})
56
- except Exception as e:
57
- st.error(f"Errore durante la cancellazione: {e}")
58
- return False
59
-
60
- log_file = "data/processed/rejected_triples.csv"
61
- os.makedirs("data/processed", exist_ok=True)
62
- file_exists = os.path.isfile(log_file)
63
-
64
- try:
65
- with open(log_file, mode='a', newline='', encoding='utf-8') as f:
66
- writer = csv.writer(f)
67
- if not file_exists:
68
- writer.writerow(["timestamp", "subject", "predicate", "object", "reason"])
69
- writer.writerow([datetime.now(), subj, pred, obj, reason])
70
- return True
71
- except Exception as e:
72
- st.warning(f"Relazione cancellata dal DB, ma errore nel log CSV: {e}")
73
- return True
74
-
75
- # --- UI: SIDEBAR & CONFIGURAZIONE ---
76
  st.sidebar.title("⚙️ Configurazione")
77
 
78
  env_uri = os.getenv("NEO4J_URI", "")
@@ -80,14 +99,12 @@ env_user = os.getenv("NEO4J_USER", "neo4j")
80
  env_password = os.getenv("NEO4J_PASSWORD", "")
81
  env_hf_token = os.getenv("HF_TOKEN", "")
82
 
83
- st.sidebar.subheader("Backend LLM")
84
- # Se il token HF è presente nei secrets/env, lo mostriamo mascherato, altrimenti input
85
- hf_token_input = st.sidebar.text_input("HuggingFace Token (Opzionale per Cloud)", value=env_hf_token, type="password")
86
- if hf_token_input:
87
- os.environ["HF_TOKEN"] = hf_token_input
88
 
89
- st.sidebar.subheader("Connessione Neo4j AuraDB")
90
- uri = st.sidebar.text_input("URI", value=env_uri, placeholder="neo4j+s://...")
91
  user = st.sidebar.text_input("User", value=env_user)
92
  password = st.sidebar.text_input("Password", value=env_password, type="password")
93
 
@@ -95,188 +112,235 @@ driver = None
95
  if uri and password:
96
  driver = get_driver(uri, user, password)
97
  if driver:
98
- st.sidebar.success("🟢 Connesso al Knowledge Graph")
99
  os.environ["NEO4J_URI"] = uri
100
  os.environ["NEO4J_USER"] = user
101
  os.environ["NEO4J_PASSWORD"] = password
102
  else:
103
  st.sidebar.error("🔴 Errore connessione")
104
 
105
- # --- UI: HEADER PRINCIPALE ---
 
 
 
 
106
  st.title("🧠 Automated Semantic Discovery Prototype")
107
- st.markdown("**Generazione Neuro-Simbolica & Validazione Human-in-the-Loop**")
108
 
109
- # --- TAB NAVIGAZIONE ---
110
  tab_gen, tab_val, tab_vis = st.tabs([
111
- "⚙️ 1. Generazione (Pipeline)",
112
- "🔍 2. Validazione (Active Learning)",
113
- "🕸️ 3. Visualizzazione (Graph)"
114
  ])
115
 
116
  # ==============================================================================
117
- # TAB 1: GENERAZIONE (PIPELINE DI ESECUZIONE)
118
  # ==============================================================================
119
  with tab_gen:
120
- st.header("Pipeline di Ingestione ed Estrazione")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
- col_src, col_act = st.columns([1, 2])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
- # 1. Selezione File
125
- with col_src:
126
- st.subheader("Sorgente Dati")
127
- data_source = st.radio("Modalità:", ("📂 Esempi Demo", "Upload (Sperimentale)"))
 
128
 
129
- selected_file_path = None
130
- os.makedirs("data/raw", exist_ok=True)
131
- os.makedirs("data/processed", exist_ok=True)
132
- os.makedirs("data/examples", exist_ok=True)
133
-
134
- if data_source == "📂 Esempi Demo":
135
- files = [f for f in os.listdir("data/examples") if f.endswith(".txt")]
136
- if files:
137
- choice = st.selectbox("Seleziona scenario:", files)
138
- if choice:
139
- src = os.path.join("data/examples", choice)
140
- dst = os.path.join("data/raw", choice)
141
- shutil.copy(src, dst)
142
- selected_file_path = choice
143
- else:
144
- st.warning("Nessun file in data/examples")
145
  else:
146
- uploaded = st.file_uploader("Carica .txt", type="txt")
147
- if uploaded:
148
- with open(os.path.join("data/raw", uploaded.name), "wb") as f:
149
- f.write(uploaded.getbuffer())
150
- selected_file_path = uploaded.name
151
-
152
- # 2. Esecuzione Step
153
- with col_act:
154
- if selected_file_path:
155
- st.info(f"File attivo: **{selected_file_path}**")
156
- c1, c2, c3 = st.columns(3)
157
-
158
- # --- STEP A: CHUNKING ---
159
- with c1:
160
- if st.button("A. Semantic Chunking"):
161
- with st.status("Analisi vettoriale in corso...", expanded=True) as status:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  try:
163
- # 1. Leggi il testo raw
164
- with open(os.path.join("data/raw", selected_file_path), "r", encoding="utf-8") as f:
165
- text_content = f.read()
166
-
167
- # 2. Inizializza Splitter e processa
168
- status.write("Caricamento modelli di embedding...")
169
- splitter = ActivaSemanticSplitter() # Usa default huggingface
170
 
171
- status.write("Calcolo distanze coseno...")
172
- chunks, dists, threshold = splitter.create_chunks(text_content)
173
 
174
- # 3. Salvataggio intermedio
175
- chunk_file = "data/processed/chunks.json"
176
- with open(chunk_file, "w", encoding="utf-8") as f:
177
- json.dump(chunks, f, ensure_ascii=False, indent=2)
178
 
179
- status.update(label="Chunking Completato!", state="complete", expanded=False)
180
- st.success(f"Generati {len(chunks)} frammenti semantici.")
 
181
 
182
- # Optional: Mostra grafico se generato
183
- if os.path.exists("chunking_analysis.png"):
184
- st.image("chunking_analysis.png", caption="Analisi Coerenza")
185
-
186
  except Exception as e:
187
- status.update(label="Errore Chunking", state="error")
188
  st.error(f"Errore: {e}")
189
-
190
- # --- STEP B: EXTRACTION ---
191
- with c2:
192
- if st.button("B. Info Extraction"):
193
- chunk_file = "data/processed/chunks.json"
194
- if not os.path.exists(chunk_file):
195
- st.error("Esegui prima il Chunking!")
196
- else:
197
- with st.status("Estrazione Neuro-Simbolica...", expanded=True) as status:
198
- try:
199
- # 1. Carica Chunks
200
- with open(chunk_file, "r", encoding="utf-8") as f:
201
- chunks = json.load(f)
202
-
203
- # 2. Init Extractor (Rileva HF_TOKEN da env)
204
- status.write("Inizializzazione LLM (Locale/Cloud)...")
205
- extractor = NeuroSymbolicExtractor() # Usa default params
206
-
207
- all_triples = []
208
- progress_bar = st.progress(0)
209
-
210
- # 3. Loop su chunk
211
- for i, chunk in enumerate(chunks):
212
- status.write(f"Processando chunk {i+1}/{len(chunks)}...")
213
- result = extractor.extract(chunk, source_id=selected_file_path)
214
- # Converti oggetti Pydantic in dict per serializzazione JSON
215
- triples_dicts = [t.model_dump() for t in result.triples]
216
- all_triples.extend(triples_dicts)
217
- progress_bar.progress((i + 1) / len(chunks))
218
-
219
- # 4. Salvataggio Raw Triples
220
- triples_file = "data/processed/triples_raw.json"
221
- with open(triples_file, "w", encoding="utf-8") as f:
222
- json.dump(all_triples, f, ensure_ascii=False, indent=2)
223
-
224
- status.update(label="Estrazione Completata!", state="complete", expanded=False)
225
- st.success(f"Estratte {len(all_triples)} triple candidate.")
226
-
227
- except Exception as e:
228
- status.update(label="Errore Estrazione", state="error")
229
- st.error(f"Errore: {e}")
230
-
231
- # --- STEP C: GRAPH BUILDING ---
232
- with c3:
233
- if st.button("C. Popola Neo4j", type="primary"):
234
- triples_file = "data/processed/triples_raw.json"
235
- if not os.path.exists(triples_file):
236
- st.error("Esegui prima l'estrazione!")
237
- elif not driver:
238
- st.error("Connettiti al DB prima!")
239
- else:
240
- with st.status("Costruzione Grafo...", expanded=True) as status:
241
- try:
242
- # 1. Carica Raw Triples
243
- with open(triples_file, "r", encoding="utf-8") as f:
244
- raw_data = json.load(f)
245
-
246
- # Ricostruisci oggetti GraphTriple (necessari per il Resolver)
247
- triples_objs = [GraphTriple(**t) for t in raw_data]
248
-
249
- # 2. Entity Resolution
250
- status.write("Entity Resolution (DBSCAN Clustering)...")
251
- resolver = EntityResolver(similarity_threshold=0.85)
252
- resolved_triples = resolver.resolve_entities(triples_objs)
253
- status.write(f"Entità normalizzate. Triple da inserire: {len(resolved_triples)}")
254
-
255
- # 3. Persistenza Neo4j
256
- status.write("Scrittura Batch su Neo4j...")
257
- persister = KnowledgeGraphPersister() # Prende credenziali da env
258
- persister.save_triples(resolved_triples)
259
- persister.close()
260
-
261
- status.update(label="Grafo Aggiornato!", state="complete", expanded=False)
262
- st.success("🚀 Grafo costruito con successo su AuraDB!")
263
- st.balloons()
264
-
265
- except Exception as e:
266
- status.update(label="Errore Costruzione", state="error")
267
- st.error(f"Errore: {e}")
268
- else:
269
- st.write("👈 Seleziona un file per iniziare.")
270
 
271
  # ==============================================================================
272
- # TAB 2: VALIDAZIONE (HUMAN IN THE LOOP)
273
  # ==============================================================================
274
  with tab_val:
275
  st.header("Curation & Feedback Loop")
276
- st.markdown("Validazione delle triple estratte. Le relazioni rifiutate vengono usate per il fine-tuning.")
277
-
278
  if driver:
279
- # Query aggiornata per mostrare nodi reali
 
 
 
 
 
 
280
  cypher_val = """
281
  MATCH (s)-[r]->(o)
282
  RETURN elementId(r) as id,
@@ -290,77 +354,44 @@ with tab_val:
290
 
291
  if triples_data:
292
  df = pd.DataFrame(triples_data)
293
- st.write(f"Relazioni candidate ({len(df)}):")
294
-
295
- event = st.dataframe(
296
- df.drop(columns=["id"]),
297
- selection_mode="single-row",
298
- on_select="rerun",
299
- use_container_width=True,
300
- hide_index=True
301
- )
302
-
303
- if len(event.selection.rows) > 0:
304
- idx = event.selection.rows[0]
305
- row = df.iloc[idx]
306
-
307
- st.divider()
308
- col_warn, col_btn = st.columns([3, 1])
309
- with col_warn:
310
- st.warning(f"Rifiutare: **{row['Soggetto']}** --[{row['Predicato']}]--> **{row['Oggetto']}**?")
311
- with col_btn:
312
- if st.button("🗑️ CONFERMA RIFIUTO", type="primary"):
313
- success = reject_relationship(driver, row['id'], row['Soggetto'], row['Predicato'], row['Oggetto'])
314
- if success:
315
- st.success("Relazione eliminata!")
316
- st.rerun()
317
  else:
318
- st.info("Nessuna relazione nel grafo.")
319
  else:
320
  st.warning("Database non connesso.")
321
 
322
  # ==============================================================================
323
- # TAB 3: VISUALIZZAZIONE (PYVIS)
324
  # ==============================================================================
325
  with tab_vis:
326
  st.header("Esplorazione Topologica")
327
-
328
  if driver:
329
- physics = st.checkbox("Abilita Fisica", value=True)
330
- cypher_vis = """
331
- MATCH (s)-[r]->(o)
332
- RETURN COALESCE(s.label, s.name, head(labels(s))) as src,
333
- type(r) as rel,
334
- COALESCE(o.label, o.name, head(labels(o))) as dst
335
- LIMIT 100
336
- """
337
- graph_data = run_query(driver, cypher_vis)
338
-
339
- if graph_data:
340
- net = Network(height="600px", width="100%", bgcolor="#222222", font_color="white", notebook=False)
341
 
342
- for item in graph_data:
343
- src = str(item['src'])
344
- dst = str(item['dst'])
345
- rel = str(item['rel'])
 
 
 
346
 
347
- net.add_node(src, label=src, color="#4facfe", title=src)
348
- net.add_node(dst, label=dst, color="#00f2fe", title=dst)
349
- net.add_edge(src, dst, title=rel, label=rel)
350
-
351
- net.toggle_physics(physics)
352
- path = "data/processed/graph_viz.html"
353
- os.makedirs("data/processed", exist_ok=True)
354
- net.save_graph(path)
355
-
356
- with open(path, 'r', encoding='utf-8') as f:
357
- html_string = f.read()
358
- components.html(html_string, height=600, scrolling=True)
359
- else:
360
- st.info("Grafo vuoto.")
361
  else:
362
- st.warning("Database non connesso.")
363
-
364
- # Footer
365
- st.markdown("---")
366
- st.caption("Activa Digital | NextGenTech | Prototipo v1.0")
 
11
  from dotenv import load_dotenv
12
 
13
  # --- IMPORT MODULI SPECIFICI ---
 
14
  from src.ingestion.semantic_splitter import ActivaSemanticSplitter
 
15
  from src.extraction.extractor import NeuroSymbolicExtractor, GraphTriple
 
16
  from src.graph.graph_loader import KnowledgeGraphPersister
17
  from src.graph.entity_resolver import EntityResolver
18
 
19
+ # --- CONFIGURAZIONE PAGINA ---
20
  load_dotenv()
21
  st.set_page_config(
22
  page_title="Activa Semantic Discovery",
 
25
  page_icon="🧠"
26
  )
27
 
28
+ # --- CSS CUSTOM PER UX MIGLIORATA ---
29
+ # Rende le card più leggibili e stilizza i messaggi di stato
30
+ st.markdown("""
31
+ <style>
32
+ .step-card {
33
+ padding: 20px;
34
+ border-radius: 10px;
35
+ border: 1px solid #e0e0e0;
36
+ margin-bottom: 20px;
37
+ background-color: #262730;
38
+ }
39
+ .step-header {
40
+ font-size: 1.2rem;
41
+ font-weight: bold;
42
+ margin-bottom: 10px;
43
+ color: #4facfe;
44
+ }
45
+ .success-box {
46
+ padding: 10px;
47
+ background-color: rgba(76, 175, 80, 0.1);
48
+ border-left: 5px solid #4CAF50;
49
+ border-radius: 5px;
50
+ }
51
+ </style>
52
+ """, unsafe_allow_html=True)
53
+
54
+ # --- SESSION STATE MANAGEMENT ---
55
+ if 'pipeline_stage' not in st.session_state:
56
+ st.session_state.pipeline_stage = 0 # 0: Init, 1: Chunked, 2: Extracted, 3: Loaded
57
+ if 'current_file' not in st.session_state:
58
+ st.session_state.current_file = None
59
+
60
+ def reset_pipeline():
61
+ st.session_state.pipeline_stage = 0
62
+ st.session_state.current_file = None
63
+ # Pulisce i file processati per evitare incongruenze
64
+ if os.path.exists("data/processed"):
65
+ shutil.rmtree("data/processed")
66
+ os.makedirs("data/processed", exist_ok=True)
67
+
68
+ # --- CACHING RISORSE ---
69
+ @st.cache_resource
70
+ def get_splitter():
71
+ return ActivaSemanticSplitter()
72
+
73
+ @st.cache_resource
74
+ def get_extractor():
75
+ return NeuroSymbolicExtractor()
76
+
77
+ @st.cache_resource
78
+ def get_resolver():
79
+ return EntityResolver(similarity_threshold=0.85)
80
+
81
+ # --- FUNZIONI NEO4J ---
82
  def get_driver(uri, user, password):
83
+ if not uri or not password: return None
 
84
  try:
85
  return GraphDatabase.driver(uri, auth=(user, password))
86
+ except: return None
 
87
 
88
  def run_query(driver, query, params=None):
89
+ if driver is None: return []
 
90
  with driver.session() as session:
91
  result = session.run(query, params)
92
  return [r.data() for r in result]
93
 
94
+ # --- UI: SIDEBAR ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  st.sidebar.title("⚙️ Configurazione")
96
 
97
  env_uri = os.getenv("NEO4J_URI", "")
 
99
  env_password = os.getenv("NEO4J_PASSWORD", "")
100
  env_hf_token = os.getenv("HF_TOKEN", "")
101
 
102
+ st.sidebar.subheader("Backend AI")
103
+ hf_token_input = st.sidebar.text_input("HF Token (Opzionale)", value=env_hf_token, type="password")
104
+ if hf_token_input: os.environ["HF_TOKEN"] = hf_token_input
 
 
105
 
106
+ st.sidebar.subheader("Knowledge Graph")
107
+ uri = st.sidebar.text_input("URI", value=env_uri)
108
  user = st.sidebar.text_input("User", value=env_user)
109
  password = st.sidebar.text_input("Password", value=env_password, type="password")
110
 
 
112
  if uri and password:
113
  driver = get_driver(uri, user, password)
114
  if driver:
115
+ st.sidebar.success("🟢 Connesso a Neo4j")
116
  os.environ["NEO4J_URI"] = uri
117
  os.environ["NEO4J_USER"] = user
118
  os.environ["NEO4J_PASSWORD"] = password
119
  else:
120
  st.sidebar.error("🔴 Errore connessione")
121
 
122
+ st.sidebar.divider()
123
+ if st.sidebar.button("🔄 Reset Pipeline", on_click=reset_pipeline):
124
+ st.sidebar.info("Stato resettato.")
125
+
126
+ # --- MAIN HEADER ---
127
  st.title("🧠 Automated Semantic Discovery Prototype")
128
+ st.markdown("**Pipeline Sequenziale Neuro-Simbolica**")
129
 
130
+ # --- TAB LOGIC ---
131
  tab_gen, tab_val, tab_vis = st.tabs([
132
+ "⚙️ 1. Pipeline Generativa",
133
+ "🔍 2. Validazione (HITL)",
134
+ "🕸️ 3. Esplorazione Grafo"
135
  ])
136
 
137
  # ==============================================================================
138
+ # TAB 1: PIPELINE GENERATIVA (STEPPER UI)
139
  # ==============================================================================
140
  with tab_gen:
141
+ # --- SELEZIONE FILE ---
142
+ st.subheader("1. Sorgente Documentale")
143
+
144
+ col_sel, col_info = st.columns([1, 2])
145
+ with col_sel:
146
+ data_source = st.radio("Modalità:", ("📂 Esempi Demo", "⬆️ Upload"), horizontal=True)
147
+
148
+ selected_file = None
149
+ os.makedirs("data/raw", exist_ok=True)
150
+ os.makedirs("data/processed", exist_ok=True)
151
+ os.makedirs("data/examples", exist_ok=True)
152
+
153
+ if data_source == "📂 Esempi Demo":
154
+ files = [f for f in os.listdir("data/examples") if f.endswith(".txt")]
155
+ if files:
156
+ choice = st.selectbox("Seleziona scenario:", files, index=0)
157
+ if choice:
158
+ src = os.path.join("data/examples", choice)
159
+ dst = os.path.join("data/raw", choice)
160
+ shutil.copy(src, dst)
161
+ selected_file = choice
162
+ else:
163
+ st.warning("Nessun file in data/examples")
164
+ else:
165
+ uploaded = st.file_uploader("Carica .txt", type="txt")
166
+ if uploaded:
167
+ with open(os.path.join("data/raw", uploaded.name), "wb") as f:
168
+ f.write(uploaded.getbuffer())
169
+ selected_file = uploaded.name
170
+
171
+ # Logica di cambio file: se cambia il file, resetta la pipeline
172
+ if selected_file and selected_file != st.session_state.current_file:
173
+ st.session_state.current_file = selected_file
174
+ st.session_state.pipeline_stage = 0
175
+ st.rerun()
176
+
177
+ if not selected_file:
178
+ st.info("👈 Seleziona o carica un file per iniziare.")
179
+ st.stop()
180
+
181
+ st.markdown("---")
182
 
183
+ # --- PROGRESS BAR ---
184
+ # stage 0 -> 0%, stage 1 -> 33%, stage 2 -> 66%, stage 3 -> 100%
185
+ progress_val = int((st.session_state.pipeline_stage / 3) * 100)
186
+ st.progress(progress_val, text=f"Progresso Pipeline: {progress_val}%")
187
+
188
+ # ==========================
189
+ # FASE A: CHUNKING
190
+ # ==========================
191
+ with st.container():
192
+ st.markdown(f"### {'✅' if st.session_state.pipeline_stage >= 1 else '1️⃣'} Fase A: Semantic Chunking")
193
+
194
+ if st.session_state.pipeline_stage >= 1:
195
+ # Stato Completato: Mostra riassunto
196
+ with open("data/processed/chunks.json", "r") as f:
197
+ chunks = json.load(f)
198
+ st.markdown(f"""
199
+ <div class="success-box">
200
+ <b>Chunking completato!</b> Generati {len(chunks)} frammenti semantici.<br>
201
+ Modello vettoriale utilizzato: <i>MiniLM-L12-v2</i>
202
+ </div>
203
+ """, unsafe_allow_html=True)
204
+ with st.expander("Vedi dettagli frammenti"):
205
+ st.json(chunks[:3]) # Mostra solo i primi 3 per pulizia
206
+ else:
207
+ # Stato Attivo: Bottone azione
208
+ st.markdown("Segmentazione del testo basata sulla coerenza semantica vettoriale.")
209
+ if st.button("Avvia Analisi Semantica", type="primary"):
210
+ with st.spinner("Calcolo vettori e segmentazione..."):
211
+ try:
212
+ with open(os.path.join("data/raw", selected_file), "r", encoding="utf-8") as f:
213
+ text_content = f.read()
214
+
215
+ splitter = get_splitter()
216
+ chunks, dists, threshold = splitter.create_chunks(text_content)
217
+
218
+ with open("data/processed/chunks.json", "w", encoding="utf-8") as f:
219
+ json.dump(chunks, f, ensure_ascii=False, indent=2)
220
+
221
+ st.session_state.pipeline_stage = 1
222
+ st.rerun()
223
+ except Exception as e:
224
+ st.error(f"Errore: {e}")
225
+
226
+ st.markdown("⬇️")
227
+
228
+ # ==========================
229
+ # FASE B: EXTRACTION
230
+ # ==========================
231
+ is_step_b_unlocked = st.session_state.pipeline_stage >= 1
232
 
233
+ with st.container():
234
+ # Header grigio se bloccato, colorato se attivo
235
+ color = "black" if is_step_b_unlocked else "gray"
236
+ icon = "" if st.session_state.pipeline_stage >= 2 else ("2⃣" if is_step_b_unlocked else "🔒")
237
+ st.markdown(f"<h3 style='color:{color}'>{icon} Fase B: Information Extraction</h3>", unsafe_allow_html=True)
238
 
239
+ if not is_step_b_unlocked:
240
+ st.caption("Completa la Fase A per sbloccare l'estrazione.")
241
+
242
+ elif st.session_state.pipeline_stage >= 2:
243
+ # Stato Completato
244
+ with open("data/processed/triples_raw.json", "r") as f:
245
+ triples = json.load(f)
246
+ st.markdown(f"""
247
+ <div class="success-box">
248
+ <b>Estrazione completata!</b> Identificate {len(triples)} triple candidate.<br>
249
+ Motore Neuro-Simbolico: <i>Llama3/Mistral + Dependecy Parsing</i>
250
+ </div>
251
+ """, unsafe_allow_html=True)
252
+ with st.expander("Vedi esempio triple"):
253
+ st.dataframe(pd.DataFrame(triples).head(5), hide_index=True)
 
254
  else:
255
+ # Stato Attivo
256
+ st.markdown("Estrazione di Entità e Relazioni tramite approccio Neuro-Simbolico.")
257
+ if st.button("Avvia Estrazione Ontologica", type="primary"):
258
+ with st.spinner("Processando frammenti con LLM..."):
259
+ try:
260
+ with open("data/processed/chunks.json", "r", encoding="utf-8") as f:
261
+ chunks = json.load(f)
262
+
263
+ extractor = get_extractor()
264
+ all_triples = []
265
+ prog_bar = st.progress(0)
266
+
267
+ for i, chunk in enumerate(chunks):
268
+ res = extractor.extract(chunk, source_id=selected_file)
269
+ all_triples.extend([t.model_dump() for t in res.triples])
270
+ prog_bar.progress((i+1)/len(chunks))
271
+
272
+ with open("data/processed/triples_raw.json", "w", encoding="utf-8") as f:
273
+ json.dump(all_triples, f, ensure_ascii=False, indent=2)
274
+
275
+ st.session_state.pipeline_stage = 2
276
+ st.rerun()
277
+ except Exception as e:
278
+ st.error(f"Errore: {e}")
279
+
280
+ st.markdown("⬇️")
281
+
282
+ # ==========================
283
+ # FASE C: GRAPH POPULATION
284
+ # ==========================
285
+ is_step_c_unlocked = st.session_state.pipeline_stage >= 2
286
+
287
+ with st.container():
288
+ color = "black" if is_step_c_unlocked else "gray"
289
+ icon = "✅" if st.session_state.pipeline_stage >= 3 else ("3️⃣" if is_step_c_unlocked else "🔒")
290
+ st.markdown(f"<h3 style='color:{color}'>{icon} Fase C: Graph Construction</h3>", unsafe_allow_html=True)
291
+
292
+ if not is_step_c_unlocked:
293
+ st.caption("Completa la Fase B per popolare il grafo.")
294
+
295
+ elif st.session_state.pipeline_stage >= 3:
296
+ st.markdown("""
297
+ <div class="success-box">
298
+ <b>Grafo Aggiornato!</b> I dati sono stati caricati su Neo4j.<br>
299
+ Puoi esplorarli nei tab "Validazione" e "Visualizzazione".
300
+ </div>
301
+ """, unsafe_allow_html=True)
302
+ st.balloons()
303
+ if st.button("Riavvia con nuovo file"):
304
+ reset_pipeline()
305
+ st.rerun()
306
+ else:
307
+ st.markdown("Entity Resolution (Deduplica) e Caricamento su Neo4j.")
308
+ if not driver:
309
+ st.error("⚠️ Connettiti a Neo4j (nella sidebar) per procedere.")
310
+ else:
311
+ if st.button("Genera Knowledge Graph", type="primary"):
312
+ with st.spinner("Risoluzione entità e scrittura DB..."):
313
  try:
314
+ with open("data/processed/triples_raw.json", "r", encoding="utf-8") as f:
315
+ raw_data = json.load(f)
 
 
 
 
 
316
 
317
+ triples_objs = [GraphTriple(**t) for t in raw_data]
 
318
 
319
+ resolver = get_resolver()
320
+ resolved = resolver.resolve_entities(triples_objs)
 
 
321
 
322
+ persister = KnowledgeGraphPersister()
323
+ persister.save_triples(resolved)
324
+ persister.close()
325
 
326
+ st.session_state.pipeline_stage = 3
327
+ st.rerun()
 
 
328
  except Exception as e:
 
329
  st.error(f"Errore: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
 
331
  # ==============================================================================
332
+ # TAB 2: VALIDAZIONE (Codice invariato, solo stile)
333
  # ==============================================================================
334
  with tab_val:
335
  st.header("Curation & Feedback Loop")
 
 
336
  if driver:
337
+ # Recupera statistiche rapide
338
+ stats = run_query(driver, "MATCH (n) RETURN count(n) as nodes, count{()-->()} as rels")
339
+ if stats:
340
+ c1, c2 = st.columns(2)
341
+ c1.metric("Nodi Totali", stats[0]['nodes'])
342
+ c2.metric("Relazioni", stats[0]['rels'])
343
+
344
  cypher_val = """
345
  MATCH (s)-[r]->(o)
346
  RETURN elementId(r) as id,
 
354
 
355
  if triples_data:
356
  df = pd.DataFrame(triples_data)
357
+ st.dataframe(df.drop(columns=["id"]), use_container_width=True, hide_index=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
  else:
359
+ st.info("Grafo vuoto.")
360
  else:
361
  st.warning("Database non connesso.")
362
 
363
  # ==============================================================================
364
+ # TAB 3: VISUALIZZAZIONE
365
  # ==============================================================================
366
  with tab_vis:
367
  st.header("Esplorazione Topologica")
 
368
  if driver:
369
+ physics = st.checkbox("Abilita Fisica (Gravità)", value=True)
370
+ if st.button("Aggiorna Visualizzazione"):
371
+ cypher_vis = """
372
+ MATCH (s)-[r]->(o)
373
+ RETURN COALESCE(s.label, s.name, head(labels(s))) as src,
374
+ type(r) as rel,
375
+ COALESCE(o.label, o.name, head(labels(o))) as dst
376
+ LIMIT 100
377
+ """
378
+ graph_data = run_query(driver, cypher_vis)
 
 
379
 
380
+ if graph_data:
381
+ net = Network(height="600px", width="100%", bgcolor="#222222", font_color="white", notebook=False)
382
+ for item in graph_data:
383
+ src, dst, rel = str(item['src']), str(item['dst']), str(item['rel'])
384
+ net.add_node(src, label=src, color="#4facfe", title=src)
385
+ net.add_node(dst, label=dst, color="#00f2fe", title=dst)
386
+ net.add_edge(src, dst, title=rel, label=rel)
387
 
388
+ net.toggle_physics(physics)
389
+ path = "data/processed/graph_viz.html"
390
+ os.makedirs("data/processed", exist_ok=True)
391
+ net.save_graph(path)
392
+
393
+ with open(path, 'r', encoding='utf-8') as f:
394
+ html_string = f.read()
395
+ components.html(html_string, height=600, scrolling=True)
 
 
 
 
 
 
396
  else:
397
+ st.warning("Database non connesso.")