File size: 2,224 Bytes
4efaf50
cf1df19
79ff3c4
f6f60e8
6d3e4d2
e8fa82e
ffadad7
cf1df19
ebecac1
b6d77d3
cf1df19
e8fa82e
d557fa1
cf1df19
f6f60e8
 
cf1df19
abd4e0b
cf1df19
 
 
 
 
 
 
 
 
b6d77d3
79ff3c4
 
e8fa82e
4efaf50
cf1df19
e8fa82e
cf1df19
f6f60e8
cf1df19
 
1afe1ea
cf1df19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e8fa82e
cf1df19
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os
import torch
from transformers import pipeline
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR, LLM_TASK

# 1. Initialize Embeddings
# Wraps a sentence-transformers model (name from config.EMBEDDING_MODEL);
# must match the model used when the Chroma store was originally built,
# or similarity search results will be meaningless.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# 2. Load Vector DB
# Only attach to the store if the persist directory exists AND contains at
# least one entry — an empty dir means ingestion never ran.
# NOTE(review): any(os.scandir(...)) leaves the scandir iterator to be
# closed by GC; harmless here but worth confirming on Windows.
if os.path.exists(CHROMA_DIR) and any(os.scandir(CHROMA_DIR)):
    vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
    print("✅ Vector DB loaded successfully")
else:
    # Sentinel: downstream code (ask_rag_with_status) checks for None
    # and degrades gracefully instead of crashing.
    vectordb = None
    print("⚠️ Vector DB folder missing or empty")

# 3. LLM Pipeline - Optimized for CPU stability
# Built once at import time; this triggers a model download/load, so the
# module is slow to import by design. Task and model come from config
# (LLM_TASK / LLM_MODEL — presumably "text-generation" with a Qwen
# checkpoint, judging by the prompts below; confirm in config.py).
qa_pipeline = pipeline(
    LLM_TASK, 
    model=LLM_MODEL, 
    device_map="cpu", 
    max_new_tokens=256, # Sufficient for detailed answers
    trust_remote_code=True,  # required by some Qwen repos; only safe for trusted model sources
    model_kwargs={"torch_dtype": torch.float32} # Safer for CPU
)

def ask_rag_with_status(question: str):
    """Answer *question* via RAG over the module-level Chroma store.

    Retrieves the 3 most similar chunks, builds a context-grounded prompt,
    and generates an answer with the local pipeline using greedy decoding.

    Args:
        question: The user's natural-language question.

    Returns:
        tuple: ``(answer, status)`` where ``status`` is a list of progress
        steps on success, or an error-code string ("ERROR" / "TIMEOUT")
        on failure. Callers must handle both shapes.
    """
    if vectordb is None:
        return "Knowledge base not ready.", "ERROR"

    # Search for context (top-3 nearest chunks).
    docs = vectordb.similarity_search(question, k=3)
    if not docs:
        # Robustness fix: an empty hit list previously produced an empty
        # "Context:" block, inviting an ungrounded (hallucinated) answer.
        return "No relevant context found in the knowledge base.", "ERROR"
    context = "\n".join(d.page_content for d in docs)

    # Simple, clear prompt for Qwen; the trailing "Answer:" anchors
    # the extraction step below.
    prompt = f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"

    try:
        # Greedy decoding for faster, consistent answers.
        # Fix: the original also passed temperature=0.0 — transformers
        # ignores temperature when do_sample=False and newer releases
        # warn/raise on non-positive temperatures, so it is dropped.
        result = qa_pipeline(
            prompt,
            do_sample=False,
            pad_token_id=qa_pipeline.tokenizer.eos_token_id,
        )

        full_output = result[0]["generated_text"]

        # The text-generation pipeline echoes the prompt by default;
        # keep only the text after the last "Answer:" marker.
        if "Answer:" in full_output:
            answer = full_output.split("Answer:")[-1].strip()
        else:
            answer = full_output.strip()

        if not answer:
            answer = "I found context in the documents but could not generate a coherent summary. Please rephrase."

        return answer, ["Context retrieved", "Qwen generated answer"]

    except Exception as e:
        # Broad catch keeps the caller/UI responsive; log the real cause
        # since the user-facing message only mentions a timeout.
        print(f"❌ Generation error: {e}")
        return "The model timed out while thinking. Try a shorter question.", "TIMEOUT"