"""Minimal RAG pipeline: chunk a text file, embed the chunks with
SentenceTransformers, index them in ChromaDB, and answer queries via Gemini."""

import re

import chromadb
import google.generativeai as genai
from sentence_transformers import SentenceTransformer


def load_documents(file_path, chunk_size=30):
    """Read *file_path* and split it into chunks of *chunk_size* tokens.

    Tokens are word runs (``\\w+``) or single non-space characters (``\\S``),
    so punctuation becomes its own token. NOTE: rejoining tokens with single
    spaces does not preserve the file's original spacing around punctuation.

    Returns a list of non-empty chunk strings.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    words = re.findall(r"\w+|\S", text)
    chunks = [
        " ".join(words[i : i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]
    return [chunk.strip() for chunk in chunks if chunk.strip()]


def embed_documents(docs, model):
    """Encode *docs* with SentenceTransformer *model*; return list-of-list floats."""
    return model.encode(docs).tolist()


def build_chroma_db(docs, embeddings):
    """Index *docs* with their *embeddings* in an in-memory Chroma collection.

    Fixed: documents are added in one batched ``collection.add`` call instead
    of one network/IPC round-trip per document; empty input is handled
    explicitly (the collection is still created and returned).
    """
    client = chromadb.Client()
    collection = client.get_or_create_collection("rag_docs")
    if docs:
        collection.add(
            documents=list(docs),
            embeddings=list(embeddings),
            ids=[str(i) for i in range(len(docs))],
        )
    return collection


def retrieve(query, collection, model, top_k=3):
    """Embed *query* and return the *top_k* most similar stored documents."""
    query_emb = model.encode([query]).tolist()[0]
    results = collection.query(
        query_embeddings=[query_emb], n_results=top_k, include=["documents"]
    )
    # query() returns one result list per query embedding; we sent exactly one.
    return results["documents"][0]


def call_gemini(query, context, api_key):
    """Ask Gemini to answer *query* using only *context*; return stripped text."""
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-2.0-flash")
    prompt = (
        "Answer the following question based only on the provided context.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}\n"
        "Answer:"
    )
    response = model.generate_content(prompt)
    return response.text.strip()


def build_rag_chain(api_key):
    """Load, embed, and index ``data/info.txt``; return a QA closure.

    The returned ``rag_qa(query)`` closure yields ``(answer, retrieved_docs)``.
    NOTE(review): the corpus path is hard-coded — indexing happens once here,
    at chain-build time, not per query.
    """
    docs = load_documents("data/info.txt")
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embed_documents(docs, model)
    collection = build_chroma_db(docs, embeddings)

    def rag_qa(query):
        retrieved_docs = retrieve(query, collection, model)
        context = "\n\n".join(retrieved_docs)
        answer = call_gemini(query, context, api_key)
        return answer, retrieved_docs

    return rag_qa