| | import hashlib |
| | import json |
| | import re |
| | from pathlib import Path |
| | from typing import Dict, List |
| |
|
| | import numpy as np |
| | import streamlit as st |
| |
|
| | try: |
| | from sentence_transformers import SentenceTransformer |
| | except Exception: |
| | SentenceTransformer = None |
| |
|
| |
|
# Mixed Italian/English stopword list shared by tokenizza() and
# LocalHashEmbedder._tokens(); also strips query lead-ins such as
# "parlami" / "dimmi" / "spiegami" so they don't inflate lexical overlap.
STOPWORDS = {
    "della",
    "delle",
    "dello",
    "degli",
    "dati",
    "sono",
    "come",
    "questa",
    "questo",
    "nella",
    "nelle",
    "anche",
    "molto",
    "dove",
    "quando",
    "with",
    "that",
    "from",
    "have",
    "your",
    "will",
    "about",
    "parlami",
    "dimmi",
    "spiegami",
    "cosa",
    "quale",
}
| |
|
| |
|
def normalizza_testo(t: str) -> str:
    """Flatten *t* to a single-spaced, stripped line.

    Newlines and tabs become spaces, runs of whitespace collapse to one
    space, and leading/trailing blanks are removed. ``None`` yields "".
    """
    if not t:
        t = ""
    senza_righe = t.replace("\n", " ").replace("\t", " ").strip()
    return re.sub(r"\s+", " ", senza_righe)
| |
|
| |
|
def tokenizza(testo: str) -> List[str]:
    """Return lowercase alphanumeric tokens of *testo* that are at least
    4 characters long and not in STOPWORDS."""
    utili = []
    for parola in re.findall(r"[A-Za-z0-9_]+", testo.lower()):
        if len(parola) < 4 or parola in STOPWORDS:
            continue
        utili.append(parola)
    return utili
| |
|
| |
|
class LocalHashEmbedder:
    """Deterministic hashing-trick embedder.

    Fallback used when sentence-transformers is unavailable: every token
    is hashed (SHA-1) into one of ``dim`` buckets with a +/-1 sign, and
    each resulting vector is L2-normalized.
    """

    def __init__(self, dim: int = 384):
        self.dim = int(dim)
        self.name = f"local-hash-{self.dim}"

    def _tokens(self, text: str) -> List[str]:
        # Slightly looser than module-level tokenizza(): keeps length >= 3.
        parole = re.findall(r"[A-Za-z0-9_]+", text.lower())
        return [p for p in parole if len(p) >= 3 and p not in STOPWORDS]

    def encode(self, texts):
        """Embed a string or a list of strings into an (n, dim) float32 array."""
        batch = [texts] if isinstance(texts, str) else texts
        vettori = np.zeros((len(batch), self.dim), dtype="float32")
        for riga, testo in enumerate(batch):
            pulito = normalizza_testo(testo)
            # If everything was filtered out, fall back to raw whitespace split
            # so the row is never all-zero for non-empty input.
            tokens = self._tokens(pulito) or pulito.lower().split()
            for token in tokens:
                digest = hashlib.sha1(token.encode("utf-8", errors="ignore")).hexdigest()
                valore = int(digest, 16)
                segno = -1.0 if (valore >> 8) & 1 else 1.0
                vettori[riga, valore % self.dim] += segno
            lunghezza = float(np.linalg.norm(vettori[riga]))
            if lunghezza > 0:
                vettori[riga] /= lunghezza
        return vettori
| |
|
| |
|
def inizializza_embedder():
    """Return an ``(embedder, backend_label)`` pair.

    Prefers the locally stored sentence-transformers model under
    ``aio_models/``; silently falls back to the hashing embedder when the
    package is missing, the path does not exist, or loading fails.
    """
    model_name = "paraphrase-multilingual-MiniLM-L12-v2"
    local_path = Path("aio_models") / model_name
    if SentenceTransformer is not None and local_path.exists():
        try:
            modello = SentenceTransformer(str(local_path), local_files_only=True)
        except Exception:
            # Deliberate best-effort: any load failure drops to the fallback.
            pass
        else:
            return modello, f"sentence-transformers(local-path): {local_path}"
    return LocalHashEmbedder(dim=384), "local-hash-embedder (showcase fallback)"
| |
|
| |
|
def vectorizza(model, testi: List[str]) -> np.ndarray:
    """Encode *testi* with *model* and L2-normalize each row.

    All-zero rows are left untouched (their norm is substituted with 1.0
    to avoid division by zero).
    """
    grezzi = np.array(model.encode(testi), dtype="float32")
    lunghezze = np.linalg.norm(grezzi, axis=1, keepdims=True)
    lunghezze[lunghezze == 0] = 1.0
    return grezzi / lunghezze
| |
|
| |
|
@st.cache_data
def carica_corpus() -> List[Dict[str, str]]:
    """Load ``demo_corpus.json`` and whitespace-normalize each record's text."""
    raw = Path("demo_corpus.json").read_text(encoding="utf-8")
    records = json.loads(raw)
    for record in records:
        record["text"] = normalizza_testo(record.get("text", ""))
    return records
| |
|
| |
|
@st.cache_resource
def prepara_engine():
    """Assemble the search engine once per session.

    Returns ``(model, backend_label, records, embedding_matrix)`` where
    the matrix holds one normalized row per corpus record.
    """
    model, backend = inizializza_embedder()
    records = carica_corpus()
    mat = vectorizza(model, [record["text"] for record in records])
    return model, backend, records, mat
| |
|
| |
|
def cerca(query: str, model, records, mat: np.ndarray, top_k: int = 5):
    """Hybrid retrieval: score = 0.6 * cosine similarity + 0.4 * lexical overlap.

    Returns the ``top_k`` best-scoring records as dicts sorted by
    descending score.
    """
    query_vec = vectorizza(model, [query])[0]
    similarita = mat @ query_vec
    query_tokens = set(tokenizza(query))
    candidati = []
    for record, sim in zip(records, similarita):
        if query_tokens:
            doc_tokens = set(tokenizza(record["text"]))
            comuni = len(query_tokens & doc_tokens)
            lessicale = comuni / max(1, len(query_tokens))
        else:
            # No usable query tokens: fall back to pure semantic ranking.
            lessicale = 0.0
        candidati.append(
            {
                "score": 0.6 * float(sim) + 0.4 * float(lessicale),
                "sim": float(sim),
                "lex": float(lessicale),
                "domain": record.get("domain", "generale"),
                "source": record.get("source", "showcase"),
                "text": record.get("text", ""),
            }
        )
    candidati.sort(key=lambda c: c["score"], reverse=True)
    return candidati[:top_k]
| |
|
| |
|
# --- Streamlit page setup and UI (runs at import time, as Streamlit requires) ---
st.set_page_config(page_title="AIO Showcase", page_icon=":books:", layout="centered")
st.title("AIO System Core - Public Showcase")
st.caption("Demo pubblica controllata. Core proprietario e corpus completo restano privati.")

# Build (or fetch the cached) engine and surface which backend is active.
model, backend, records, mat = prepara_engine()
st.caption(f"Backend: {backend}")
st.caption(f"Records demo: {len(records)}")

# Query controls: free-text question plus result count.
query = st.text_input("Inserisci una domanda", value="Parlami del Mediterraneo e della geopolitica energetica")
top_k = st.slider("Top K", min_value=3, max_value=10, value=5, step=1)

# Run the hybrid search on demand and render each hit with its score breakdown.
if st.button("Esegui ricerca"):
    risultati = cerca(query, model, records, mat, top_k=top_k)
    st.subheader("Risultati")
    for i, r in enumerate(risultati, start=1):
        st.markdown(
            f"**{i}. [{r['domain']}]** score={r['score']:.3f} sim={r['sim']:.3f} lex={r['lex']:.3f}"
        )
        st.caption(f"source: {r['source']}")
        st.write(r["text"])

# Footer: license terms and contact.
st.markdown("---")
st.markdown("**Licenza Showcase:** uso e studio liberi, uso commerciale solo su autorizzazione scritta.")
st.markdown("Contatto: `info@rthitalia.com`")
| |
|