Spaces:
Sleeping
Sleeping
| import re | |
| from sentence_transformers import SentenceTransformer | |
| import chromadb | |
| import google.generativeai as genai | |
| import re | |
def load_documents(file_path, chunk_size=30):
    """Read a UTF-8 text file and split it into fixed-size token chunks.

    The file is tokenized into word / punctuation tokens (``\\w+|\\S``),
    grouped into runs of at most ``chunk_size`` tokens, and each run is
    joined back together with single spaces. Blank chunks are dropped.

    Args:
        file_path: Path of the text file to read.
        chunk_size: Maximum number of tokens per chunk.

    Returns:
        List of non-empty, stripped chunk strings.
    """
    with open(file_path, "r", encoding="utf-8") as fh:
        raw_text = fh.read()

    tokens = re.findall(r"\w+|\S", raw_text)

    cleaned = []
    for start in range(0, len(tokens), chunk_size):
        piece = " ".join(tokens[start : start + chunk_size]).strip()
        if piece:
            cleaned.append(piece)
    return cleaned
def embed_documents(docs, model):
    """Encode *docs* with *model* and return the embeddings as plain lists.

    Args:
        docs: Sequence of text strings to embed.
        model: Encoder exposing ``encode()`` that returns an array with
            a ``tolist()`` method (e.g. a SentenceTransformer).

    Returns:
        Nested Python lists of floats, one embedding per document.
    """
    encoded = model.encode(docs)
    return encoded.tolist()
def build_chroma_db(docs, embeddings):
    """Create an in-memory Chroma collection populated with the documents.

    Fix: the original added documents one ``collection.add()`` call at a
    time inside a loop; Chroma accepts batched inserts, so a single call
    with all documents/embeddings/ids avoids N round-trips and is
    substantially faster for large corpora.

    Args:
        docs: Sequence of document strings.
        embeddings: Parallel sequence of embedding vectors (lists of floats).

    Returns:
        The populated Chroma collection named ``"rag_docs"``.
    """
    client = chromadb.Client()
    collection = client.get_or_create_collection("rag_docs")
    if docs:  # Chroma raises on an empty add() call, so guard it
        collection.add(
            documents=list(docs),
            embeddings=list(embeddings),
            ids=[str(i) for i in range(len(docs))],
        )
    return collection
def retrieve(query, collection, model, top_k=3):
    """Return the ``top_k`` documents most similar to *query*.

    Args:
        query: Natural-language query string.
        collection: Chroma collection supporting ``query()``.
        model: Encoder exposing ``encode()`` returning an array-like.
        top_k: Number of documents to fetch.

    Returns:
        List of retrieved document strings for the single query.
    """
    embeddings = model.encode([query]).tolist()
    hits = collection.query(
        query_embeddings=[embeddings[0]],
        n_results=top_k,
        include=["documents"],
    )
    # Chroma returns one result list per query embedding; we sent exactly one.
    return hits["documents"][0]
def call_gemini(query, context, api_key):
    """Ask Gemini to answer *query* using only the supplied *context*.

    Args:
        query: The user's question.
        context: Retrieved document text to ground the answer in.
        api_key: Google Generative AI API key.

    Returns:
        The model's answer text, stripped of surrounding whitespace.
    """
    genai.configure(api_key=api_key)
    llm = genai.GenerativeModel("gemini-2.0-flash")
    prompt = (
        "Answer the following question based only on the provided context.\n\n"
        + f"Context:\n{context}\n\n"
        + f"Question: {query}\n"
        + "Answer:"
    )
    reply = llm.generate_content(prompt)
    return reply.text.strip()
def build_rag_chain(api_key):
    """Assemble the full RAG pipeline and return a query-answering closure.

    Loads and chunks ``data/info.txt``, embeds the chunks, and indexes
    them in Chroma once up front; the returned closure performs
    retrieval + generation per call.

    Args:
        api_key: Google Generative AI API key forwarded to the LLM call.

    Returns:
        ``rag_qa(query) -> (answer, retrieved_docs)`` where *answer* is the
        generated text and *retrieved_docs* the supporting chunks.
    """
    corpus = load_documents("data/info.txt")
    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    vectors = embed_documents(corpus, encoder)
    store = build_chroma_db(corpus, vectors)

    def rag_qa(query):
        docs = retrieve(query, store, encoder)
        joined_context = "\n\n".join(docs)
        return call_gemini(query, joined_context, api_key), docs

    return rag_qa