from sentence_transformers import SentenceTransformer from src.storage.chroma_store import get_collection from src.backend.llm import llm_generate EMBED_MODEL = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") def retrieve(username: str, notebook_id: str, query: str, k=6): col = get_collection(username, notebook_id) current_count = col.count() if current_count <= 0: return [] n_results = min(k, current_count) qemb = EMBED_MODEL.encode([query], normalize_embeddings=True).tolist() res = col.query( query_embeddings=qemb, n_results=n_results, include=["documents", "metadatas", "distances"], ) ids = res.get("ids", [[]])[0] docs = res.get("documents", [[]])[0] mets = res.get("metadatas", [[]])[0] dists = res.get("distances", [[]])[0] hits = [] for i in range(len(docs)): hits.append( { "id": ids[i] if i < len(ids) else f"chunk_{i}", "doc": docs[i], "meta": mets[i] if i < len(mets) else {}, "distance": dists[i] if i < len(dists) else None, } ) return hits def format_sources(hits): lines = [] for i, h in enumerate(hits, start=1): m = h.get("meta") or {} title = m.get("source_title", "source") loc = "" if m.get("page"): loc = f"p.{m['page']}" if m.get("slide"): loc = f"slide {m['slide']}" lines.append(f"[S{i}] {title} {loc}".strip()) return "\n".join(lines) def context_block(hits): blocks = [] for i, h in enumerate(hits, start=1): m = h.get("meta") or {} title = m.get("source_title", "source") loc = "" if m.get("page"): loc = f"(page {m['page']})" if m.get("slide"): loc = f"(slide {m['slide']})" blocks.append(f"[S{i}] {title} {loc}\n{h.get('doc','')}") return "\n\n---\n\n".join(blocks) def rag_answer(query: str, hits): if not hits: return "Not found in the provided sources. (No indexed chunks yet.)" prompt = f""" You are a research assistant. Answer ONLY using the sources below. Every non-trivial claim must end with citations like [S1] or [S2]. If not present in sources, say: Not found in the provided sources. Question: {query} Sources list: {format_sources(hits)} Source excerpts: {context_block(hits)} Answer with citations: """ ans = llm_generate(prompt, max_new_tokens=450, temperature=0.2) return f"{ans}\n\nSources:\n{format_sources(hits)}"