# aio-system-core-showcase / app_showcase.py
# AIO public showcase bundle — author: RthItalia (commit f2a34d5, verified)
import hashlib
import json
import re
from pathlib import Path
from typing import Dict, List
import numpy as np
import streamlit as st
try:
from sentence_transformers import SentenceTransformer
except Exception: # pragma: no cover
SentenceTransformer = None
# Mixed Italian/English stopword set shared by both tokenizers below.
# Includes conversational prompts ("parlami", "dimmi", "spiegami") so that
# phrasing of a question does not influence hashing or lexical overlap.
STOPWORDS = {
    "della",
    "delle",
    "dello",
    "degli",
    "dati",
    "sono",
    "come",
    "questa",
    "questo",
    "nella",
    "nelle",
    "anche",
    "molto",
    "dove",
    "quando",
    "with",
    "that",
    "from",
    "have",
    "your",
    "will",
    "about",
    "parlami",
    "dimmi",
    "spiegami",
    "cosa",
    "quale",
}
def normalizza_testo(t: str) -> str:
    """Flatten whitespace: newlines/tabs become spaces, runs collapse to one.

    Accepts None/empty input and returns "" in that case.
    """
    flat = (t or "").replace("\n", " ").replace("\t", " ").strip()
    return re.sub(r"\s+", " ", flat)
def tokenizza(testo: str) -> List[str]:
    """Return lowercase word tokens of length >= 4, excluding STOPWORDS."""
    tokens = []
    for tok in re.findall(r"[A-Za-z0-9_]+", testo.lower()):
        if len(tok) >= 4 and tok not in STOPWORDS:
            tokens.append(tok)
    return tokens
class LocalHashEmbedder:
    """Deterministic signed-hash bag-of-words embedder.

    Dependency-free fallback used when no sentence-transformers model is
    available; maps each token into one of `dim` buckets via SHA-1 and
    produces L2-normalized float32 vectors.
    """

    def __init__(self, dim: int = 384):
        self.dim = int(dim)
        self.name = f"local-hash-{self.dim}"

    def _tokens(self, text: str) -> List[str]:
        # Word tokens of length >= 3 that are not stopwords (looser than
        # the module-level tokenizza, which requires length >= 4).
        return [
            tok
            for tok in re.findall(r"[A-Za-z0-9_]+", text.lower())
            if len(tok) >= 3 and tok not in STOPWORDS
        ]

    def encode(self, texts):
        """Embed a string or list of strings into an (n, dim) float32 matrix."""
        if isinstance(texts, str):
            texts = [texts]
        vectors = np.zeros((len(texts), self.dim), dtype="float32")
        for row, raw in enumerate(texts):
            clean = normalizza_testo(raw)
            # If stopword filtering leaves nothing, fall back to a plain split
            # so the row is not all zeros for short queries.
            tokens = self._tokens(clean) or clean.lower().split()
            for tok in tokens:
                digest = int(hashlib.sha1(tok.encode("utf-8", errors="ignore")).hexdigest(), 16)
                bucket = digest % self.dim
                # Bit 8 of the digest picks the sign (signed hashing trick).
                vectors[row, bucket] += -1.0 if ((digest >> 8) & 1) else 1.0
            length = float(np.linalg.norm(vectors[row]))
            if length > 0:
                vectors[row] /= length
        return vectors
def inizializza_embedder():
    """Return (model, backend_label).

    Prefers a locally bundled sentence-transformers checkpoint under
    aio_models/; on any failure (missing package, missing path, load error)
    falls back to the hash-based embedder.
    """
    model_name = "paraphrase-multilingual-MiniLM-L12-v2"
    local_path = Path("aio_models") / model_name
    if SentenceTransformer is not None and local_path.exists():
        try:
            modello = SentenceTransformer(str(local_path), local_files_only=True)
            return modello, f"sentence-transformers(local-path): {local_path}"
        except Exception:
            pass  # best-effort: fall through to the showcase fallback
    return LocalHashEmbedder(dim=384), "local-hash-embedder (showcase fallback)"
def vectorizza(model, testi: List[str]) -> np.ndarray:
    """Encode `testi` with `model` and L2-normalize every row.

    Rows with zero norm are left as-is (divided by 1.0) to avoid NaNs.
    """
    matrice = np.array(model.encode(testi), dtype="float32")
    lunghezze = np.linalg.norm(matrice, axis=1, keepdims=True)
    lunghezze[lunghezze == 0] = 1.0  # guard against division by zero
    return matrice / lunghezze
@st.cache_data
def carica_corpus() -> List[Dict[str, str]]:
    """Load demo_corpus.json and whitespace-normalize each record's text."""
    records = json.loads(Path("demo_corpus.json").read_text(encoding="utf-8"))
    for record in records:
        record["text"] = normalizza_testo(record.get("text", ""))
    return records
@st.cache_resource
def prepara_engine():
    """Build the search engine once per session.

    Returns (model, backend_label, corpus_records, normalized_embedding_matrix).
    """
    model, backend = inizializza_embedder()
    records = carica_corpus()
    mat = vectorizza(model, [rec["text"] for rec in records])
    return model, backend, records, mat
def cerca(query: str, model, records, mat: np.ndarray, top_k: int = 5):
    """Hybrid retrieval over the demo corpus.

    Score = 0.6 * cosine similarity + 0.4 * lexical token overlap with the
    query; returns the top_k records as dicts with score breakdown fields.
    """
    query_vec = vectorizza(model, [query])[0]
    cosines = mat @ query_vec  # rows are unit vectors, so dot == cosine
    query_tokens = set(tokenizza(query))
    scored = []
    for rec, sim in zip(records, cosines):
        if query_tokens:
            rec_tokens = set(tokenizza(rec["text"]))
            lex = len(query_tokens & rec_tokens) / max(1, len(query_tokens))
        else:
            lex = 0.0  # empty/stopword-only query: fall back to cosine only
        scored.append(
            {
                "score": 0.6 * float(sim) + 0.4 * float(lex),
                "sim": float(sim),
                "lex": float(lex),
                "domain": rec.get("domain", "generale"),
                "source": rec.get("source", "showcase"),
                "text": rec.get("text", ""),
            }
        )
    scored.sort(key=lambda item: item["score"], reverse=True)
    return scored[:top_k]
# ---- Streamlit page: public showcase UI ----
st.set_page_config(page_title="AIO Showcase", page_icon=":books:", layout="centered")
st.title("AIO System Core - Public Showcase")
st.caption("Demo pubblica controllata. Core proprietario e corpus completo restano privati.")
# Build (or retrieve from Streamlit's cache) the embedder, corpus and matrix.
model, backend, records, mat = prepara_engine()
st.caption(f"Backend: {backend}")
st.caption(f"Records demo: {len(records)}")
# Query controls: free-text question plus result-count slider.
query = st.text_input("Inserisci una domanda", value="Parlami del Mediterraneo e della geopolitica energetica")
top_k = st.slider("Top K", min_value=3, max_value=10, value=5, step=1)
if st.button("Esegui ricerca"):
    # Run the hybrid search and render each hit with its score breakdown.
    risultati = cerca(query, model, records, mat, top_k=top_k)
    st.subheader("Risultati")
    for i, r in enumerate(risultati, start=1):
        st.markdown(
            f"**{i}. [{r['domain']}]** score={r['score']:.3f} sim={r['sim']:.3f} lex={r['lex']:.3f}"
        )
        st.caption(f"source: {r['source']}")
        st.write(r["text"])
        st.markdown("---")
st.markdown("**Licenza Showcase:** uso e studio liberi, uso commerciale solo su autorizzazione scritta.")
st.markdown("Contatto: `info@rthitalia.com`")