Zubaish committed on
Commit
cf1df19
·
1 Parent(s): 2194516
Files changed (1) hide show
  1. rag.py +42 -16
rag.py CHANGED
@@ -1,38 +1,64 @@
import os

from transformers import pipeline
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR, LLM_TASK

# Embedding model shared by indexing and query-time similarity search.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# Load the persisted Chroma store only when the directory exists and is
# non-empty. os.listdir() is used instead of any(os.scandir(...)) because a
# short-circuited scandir iterator is never closed and leaks the directory
# handle (ResourceWarning); isdir() is the correct check for a directory.
if os.path.isdir(CHROMA_DIR) and os.listdir(CHROMA_DIR):
    vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
    print("✅ Vector DB loaded")
else:
    vectordb = None
    print("⚠️ Vector DB missing")

# Text-generation pipeline pinned to CPU.
# NOTE(review): trust_remote_code executes code shipped with the model repo —
# acceptable only for trusted models (here presumably Qwen); confirm.
qa_pipeline = pipeline(
    task=LLM_TASK,
    model=LLM_MODEL,
    device_map="cpu",
    max_new_tokens=512,
    trust_remote_code=True,
)
 
def ask_rag_with_status(question: str):
    """Answer *question* from the vector store via RAG.

    Returns a ``(answer, status)`` tuple: ``(message, "ERROR")`` when the
    vector DB is unavailable, otherwise ``(answer_text, ["Success"])``
    (shapes kept for backward compatibility with existing callers).
    """
    if vectordb is None:
        return "Knowledge base not ready.", "ERROR"

    # Retrieve the top-3 most similar chunks as grounding context.
    docs = vectordb.similarity_search(question, k=3)
    context = "\n\n".join(d.page_content for d in docs)

    messages = [
        {"role": "system", "content": "You are a Gandhi ji expert. Answer the question using ONLY the provided context."},
        {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"}
    ]

    prompt = qa_pipeline.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    result = qa_pipeline(prompt, pad_token_id=qa_pipeline.tokenizer.eos_token_id)

    full_text = result[0]["generated_text"]

    # text-generation pipelines echo the prompt: strip the exact prompt
    # prefix when present (template-agnostic), falling back to the
    # Qwen-specific chat markers the original relied on.
    if full_text.startswith(prompt):
        answer = full_text[len(prompt):].strip().replace("<|im_end|>", "")
    else:
        answer = full_text.split("<|im_start|>assistant")[-1].strip().replace("<|im_end|>", "")

    # Guard against an empty generation so callers never render a blank answer.
    if not answer:
        answer = "No answer could be generated from the retrieved context."

    return answer, ["Success"]
 
import os

import torch
from transformers import pipeline
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR, LLM_TASK

# 1. Initialize embeddings (shared by indexing and query-time search).
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# 2. Load the persisted vector DB only when the directory exists and is
# non-empty. os.listdir() replaces any(os.scandir(...)): a short-circuited
# scandir iterator is never closed and leaks the directory handle
# (ResourceWarning); isdir() is the correct directory check.
if os.path.isdir(CHROMA_DIR) and os.listdir(CHROMA_DIR):
    vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
    print("✅ Vector DB loaded successfully")
else:
    vectordb = None
    print("⚠️ Vector DB folder missing or empty")

# 3. LLM pipeline - optimized for CPU stability.
# NOTE(review): trust_remote_code executes code from the model repo — only
# safe for trusted models; confirm the model source.
qa_pipeline = pipeline(
    LLM_TASK,
    model=LLM_MODEL,
    device_map="cpu",
    max_new_tokens=256,  # Sufficient for detailed answers
    trust_remote_code=True,
    model_kwargs={"torch_dtype": torch.float32}  # Safer for CPU
)
29
  def ask_rag_with_status(question: str):
30
  if vectordb is None:
31
  return "Knowledge base not ready.", "ERROR"
32
 
33
+ # Search for context
34
  docs = vectordb.similarity_search(question, k=3)
35
+ context = "\n".join([d.page_content for d in docs])
36
 
37
+ # Simple, clear prompt for Qwen
38
+ prompt = f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"
 
 
 
39
 
40
+ try:
41
+ # Generate with specific stopping criteria to prevent "looping"
42
+ result = qa_pipeline(
43
+ prompt,
44
+ do_sample=False, # Use greedy decoding for faster, consistent answers
45
+ temperature=0.0,
46
+ pad_token_id=qa_pipeline.tokenizer.eos_token_id
47
+ )
48
+
49
+ full_output = result[0]["generated_text"]
50
+
51
+ # Extract everything after the word "Answer:"
52
+ if "Answer:" in full_output:
53
+ answer = full_output.split("Answer:")[-1].strip()
54
+ else:
55
+ answer = full_output.strip()
56
+
57
+ if not answer:
58
+ answer = "I found context in the documents but could not generate a coherent summary. Please rephrase."
59
 
60
+ return answer, ["Context retrieved", "Qwen generated answer"]
61
+
62
+ except Exception as e:
63
+ print(f"❌ Generation error: {e}")
64
+ return "The model timed out while thinking. Try a shorter question.", "TIMEOUT"