Upload examples/run_memory.py with huggingface_hub

8f3f3ca verified 18 days ago

1.83 kB

	"""Matrix-BIOS-Memory-0.1 — grounded, citation-faithful recall (RAG).
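	The repo ships a FAISS index and a small corpus. The generator is prompted to
	cite the [id] of each passage it used, and every answer is returned together
	with the ids of the passages that were retrieved for it.

	Install: pip install torch transformers sentence-transformers faiss-cpu huggingface_hub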
	"""
	import json
	import faiss
	import torch
	from huggingface_hub import snapshot_download
	from sentence_transformers import SentenceTransformer
	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

	# Hub repository that holds the config, the corpus and the FAISS index.
	REPO = "ruslanmv/Matrix-BIOS-Memory-0.1"
	# Local folder of the downloaded snapshot; every artefact below is read from it.
	path = snapshot_download(REPO)

	# Read the JSON artefacts through context managers so the file handles are
	# closed deterministically, and decode explicitly as UTF-8 so the result does
	# not depend on the platform's default locale encoding.
	with open(f"{path}/memory_config.json", encoding="utf-8") as cfg_file:
	    cfg = json.load(cfg_file)  # embedder / generator / top_k
	with open(f"{path}/docs.json", encoding="utf-8") as docs_file:
	    docs = json.load(docs_file)  # [{"id": ..., "text": ...}]
	index = faiss.read_index(f"{path}/index.faiss")

	# Models named by the config: one to embed queries, one to generate answers.
	embedder = SentenceTransformer(cfg["embedder"])
	gen_tok = AutoTokenizer.from_pretrained(cfg["generator"])
	# .eval() switches the generator to evaluation mode; it is only used for inference.
	gen_model = AutoModelForSeq2SeqLM.from_pretrained(cfg["generator"]).eval()

	def answer(question: str):
	    """Retrieve passages for *question* and generate an answer from them.

	    The question is embedded, the ``cfg["top_k"]`` nearest entries are looked
	    up in ``index``, and the matching ``docs`` are pasted into a prompt that
	    asks the generator to answer from that context only and to cite ids.

	    Returns:
	        A ``(text, source_ids)`` tuple: the decoded generator output and the
	        ``id`` of every retrieved passage that was placed in the prompt.
	    """
	    query_vec = embedder.encode([question], normalize_embeddings=True)
	    query_vec = query_vec.astype("float32")
	    _, neighbours = index.search(query_vec, cfg["top_k"])

	    # Keep only positions that exist in `docs` (this also drops the -1
	    # placeholders FAISS returns when it finds fewer than top_k neighbours).
	    hits = []
	    for pos in neighbours[0]:
	        if 0 <= pos < len(docs):
	            hits.append(docs[pos])

	    context = "\n".join(f"[{d['id']}] {d['text']}" for d in hits)
	    instruction = "Answer the question using ONLY the context, and cite the [id] you used.\n"
	    prompt = f"{instruction}Context:\n{context}\n\nQuestion: {question}\nAnswer:"

	    encoded = gen_tok(prompt, return_tensors="pt", truncation=True)
	    input_ids = encoded.input_ids
	    with torch.no_grad():  # inference only, no gradients needed
	        generated = gen_model.generate(input_ids, max_new_tokens=64)

	    text = gen_tok.decode(generated[0], skip_special_tokens=True)
	    source_ids = [d["id"] for d in hits]
	    return text, source_ids

	if __name__ == "__main__":
	    # Small demo: run each sample question through answer() and print the
	    # generated text together with the ids of the retrieved passages.
	    demo_questions = (
	        "What does every effectful action in Matrix OS emit?",
	        "Qual e la capitale d'Italia?",
	    )
	    for question in demo_questions:
	        reply, cited = answer(question)
	        print(f"Q: {question}\nA: {reply}\n sources: {cited}\n")