"""Streamlit showcase for a small hybrid (vector + lexical) search demo.

The app loads ``demo_corpus.json``, embeds every record either with a
SentenceTransformer model found under ``aio_models/`` or with a
dependency-free hashing embedder, and ranks records for a user query by a
weighted mix of embedding similarity and query-token overlap.
"""

import hashlib
import json
import logging
import re
from pathlib import Path
from typing import Dict, List

import numpy as np
import streamlit as st

# sentence-transformers is optional: any import-time failure (package missing,
# broken backend install, ...) simply disables it and the app falls back to
# LocalHashEmbedder.
try:
    from sentence_transformers import SentenceTransformer
except Exception:  # pragma: no cover
    SentenceTransformer = None

logger = logging.getLogger(__name__)

# Italian/English filler words (plus query verbs such as "parlami") that are
# ignored by the tokenizers below.
STOPWORDS = {
    "della",
    "delle",
    "dello",
    "degli",
    "dati",
    "sono",
    "come",
    "questa",
    "questo",
    "nella",
    "nelle",
    "anche",
    "molto",
    "dove",
    "quando",
    "with",
    "that",
    "from",
    "have",
    "your",
    "will",
    "about",
    "parlami",
    "dimmi",
    "spiegami",
    "cosa",
    "quale",
}

# Compiled once; shared by tokenizza() and LocalHashEmbedder._tokens().
_TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")


def normalizza_testo(t: str) -> str:
    """Return *t* trimmed, with every whitespace run collapsed to one space.

    ``None`` or an empty string yields ``""``.
    """
    return re.sub(r"\s+", " ", (t or "").strip())


def tokenizza(testo: str, min_len: int = 4) -> List[str]:
    """Split *testo* into lowercase ASCII word tokens.

    Args:
        testo: Text to tokenize.
        min_len: Minimum token length to keep (default 4).

    Returns:
        Tokens in order of appearance, excluding STOPWORDS and tokens
        shorter than *min_len*. Duplicates are preserved.
    """
    return [
        tok
        for tok in _TOKEN_RE.findall(testo.lower())
        if len(tok) >= min_len and tok not in STOPWORDS
    ]


class LocalHashEmbedder:
    """Dependency-free fallback embedder based on signed token hashing.

    Each token is hashed with SHA-1; the hash selects one of ``dim`` buckets
    and one hash bit selects the sign added to that bucket. Rows are
    L2-normalized, so texts with no tokens map to the zero vector.
    """

    def __init__(self, dim: int = 384):
        """Create an embedder producing vectors of length *dim*.

        Raises:
            ValueError: If *dim* is not a positive integer.
        """
        self.dim = int(dim)
        if self.dim <= 0:
            # Fail early: encode() would otherwise hit a modulo-by-zero.
            raise ValueError(f"dim must be a positive integer, got {dim!r}")
        self.name = f"local-hash-{self.dim}"

    def _tokens(self, text: str) -> List[str]:
        """Tokenize like tokenizza(), but keep tokens of length >= 3."""
        return tokenizza(text, min_len=3)

    def encode(self, texts):
        """Embed one string or a sequence of strings.

        Args:
            texts: A single string or a sized sequence of strings.

        Returns:
            A float32 array of shape ``(len(texts), dim)``; a single string
            is treated as a one-element list.
        """
        if isinstance(texts, str):
            texts = [texts]
        out = np.zeros((len(texts), self.dim), dtype="float32")
        for i, text in enumerate(texts):
            clean = normalizza_testo(text)
            # If filtering removed everything (e.g. only short words or
            # stopwords), fall back to plain whitespace splitting.
            toks = self._tokens(clean) or clean.lower().split()
            for tok in toks:
                h = int(hashlib.sha1(tok.encode("utf-8", errors="ignore")).hexdigest(), 16)
                idx = h % self.dim
                # A hash bit independent of the low byte picks the sign.
                sign = -1.0 if ((h >> 8) & 1) else 1.0
                out[i, idx] += sign
            norm = float(np.linalg.norm(out[i]))
            if norm > 0:
                out[i] /= norm
        return out


def inizializza_embedder():
    """Pick the embedding backend.

    Uses the SentenceTransformer model stored under
    ``aio_models/paraphrase-multilingual-MiniLM-L12-v2`` when the package is
    importable and that path exists; otherwise (or if loading fails) uses
    LocalHashEmbedder.

    Returns:
        A ``(model, backend_label)`` tuple; *model* exposes ``encode(texts)``.
    """
    model_name = "paraphrase-multilingual-MiniLM-L12-v2"
    local_path = Path("aio_models") / model_name
    if SentenceTransformer is not None and local_path.exists():
        try:
            model = SentenceTransformer(str(local_path), local_files_only=True)
        except Exception:
            # Best-effort: keep the fallback, but leave a trace of why the
            # local model could not be used instead of hiding the error.
            logger.warning(
                "Could not load SentenceTransformer from %s; using LocalHashEmbedder",
                local_path,
                exc_info=True,
            )
        else:
            return model, f"sentence-transformers(local-path): {local_path}"
    return LocalHashEmbedder(dim=384), "local-hash-embedder (showcase fallback)"


def vectorizza(model, testi: List[str]) -> np.ndarray:
    """Encode *testi* with *model* and L2-normalize each row.

    Args:
        model: Object with an ``encode(list_of_str)`` method.
        testi: Texts to embed.

    Returns:
        A float32 array with one unit-length row per text; all-zero rows are
        left as zeros rather than divided by zero.
    """
    arr = np.array(model.encode(testi), dtype="float32")
    norms = np.linalg.norm(arr, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return arr / norms


@st.cache_data
def carica_corpus() -> List[Dict[str, str]]:
    """Load ``demo_corpus.json`` (relative to the working directory).

    Returns:
        The parsed records, each with its ``"text"`` field normalized via
        normalizza_testo() (missing text becomes ``""``).
    """
    path = Path("demo_corpus.json")
    data = json.loads(path.read_text(encoding="utf-8"))
    for r in data:
        r["text"] = normalizza_testo(r.get("text", ""))
    return data


@st.cache_resource
def prepara_engine():
    """Build the search state: embedder, corpus, and corpus embedding matrix.

    Returns:
        ``(model, backend, records, mat)`` where *mat* holds one normalized
        embedding row per record, in record order.
    """
    model, backend = inizializza_embedder()
    records = carica_corpus()
    texts = [r["text"] for r in records]
    mat = vectorizza(model, texts)
    return model, backend, records, mat


def cerca(query: str, model, records, mat: np.ndarray, top_k: int = 5):
    """Rank *records* against *query* with a hybrid score.

    The score is ``0.6 * sim + 0.4 * lex`` where *sim* is the dot product of
    the normalized query and record embeddings and *lex* is the fraction of
    query tokens that also occur in the record text.

    Args:
        query: User question.
        model: Embedder passed to vectorizza().
        records: Corpus records, aligned row-by-row with *mat*.
        mat: Normalized embedding matrix of the records.
        top_k: Number of results to return.

    Returns:
        Up to *top_k* dicts with keys ``score``, ``sim``, ``lex``,
        ``domain``, ``source`` and ``text``, best score first.
    """
    qv = vectorizza(model, [query])[0]
    sims = mat @ qv
    qtok = set(tokenizza(query))
    out = []
    for i, s in enumerate(sims):
        rec = records[i]
        # Read the text once so scoring and display agree even when a record
        # has no (or a null) "text" field.
        text = rec.get("text") or ""
        if qtok:
            lex = len(qtok & set(tokenizza(text))) / len(qtok)
        else:
            lex = 0.0
        out.append(
            {
                "score": 0.6 * float(s) + 0.4 * float(lex),
                "sim": float(s),
                "lex": float(lex),
                "domain": rec.get("domain", "generale"),
                "source": rec.get("source", "showcase"),
                "text": text,
            }
        )
    out.sort(key=lambda x: x["score"], reverse=True)
    return out[:top_k]


# --- Streamlit page -------------------------------------------------------

st.set_page_config(page_title="AIO Showcase", page_icon=":books:", layout="centered")
st.title("AIO System Core - Public Showcase")
st.caption("Demo pubblica controllata. Core proprietario e corpus completo restano privati.")

model, backend, records, mat = prepara_engine()
st.caption(f"Backend: {backend}")
st.caption(f"Records demo: {len(records)}")

query = st.text_input("Inserisci una domanda", value="Parlami del Mediterraneo e della geopolitica energetica")
top_k = st.slider("Top K", min_value=3, max_value=10, value=5, step=1)

if st.button("Esegui ricerca"):
    if not query.strip():
        # A blank query embeds to the zero vector and has no tokens, so every
        # record would score 0 and the "results" would be arbitrary.
        st.warning("Inserisci una domanda prima di eseguire la ricerca.")
    else:
        risultati = cerca(query, model, records, mat, top_k=top_k)
        st.subheader("Risultati")
        for i, r in enumerate(risultati, start=1):
            st.markdown(
                f"**{i}. [{r['domain']}]** score={r['score']:.3f} sim={r['sim']:.3f} lex={r['lex']:.3f}"
            )
            st.caption(f"source: {r['source']}")
            st.write(r["text"])
            st.markdown("---")

st.markdown("**Licenza Showcase:** uso e studio liberi, uso commerciale solo su autorizzazione scritta.")
st.markdown("Contatto: `info@rthitalia.com`")