"""RAG question-answering over a persisted Chroma vector store.

Loads the embedding model and vector DB at import time, builds a CPU-only
Hugging Face generation pipeline, and exposes ask_rag_with_status() for
retrieval-augmented answers.
"""

import os

import torch
from transformers import pipeline
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR, LLM_TASK

# 1. Initialize embeddings (used both at index time and query time).
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# 2. Load the persisted vector DB if the folder exists and is non-empty.
#    os.listdir() replaces any(os.scandir(...)), which leaked the scandir
#    iterator (it was never closed or exhausted).
if os.path.isdir(CHROMA_DIR) and os.listdir(CHROMA_DIR):
    vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
    print("✅ Vector DB loaded successfully")
else:
    vectordb = None
    print("⚠️ Vector DB folder missing or empty")

# 3. LLM pipeline — pinned to CPU with float32 for stability.
qa_pipeline = pipeline(
    LLM_TASK,
    model=LLM_MODEL,
    device_map="cpu",
    max_new_tokens=256,  # sufficient headroom for detailed answers
    trust_remote_code=True,
    model_kwargs={"torch_dtype": torch.float32},  # safer than fp16 on CPU
)


def _extract_answer(full_output: str) -> str:
    """Return the text after the last 'Answer:' marker, or the whole output.

    Using rpartition (last occurrence) keeps the extraction correct even
    when the retrieved context itself contains the string 'Answer:'.
    """
    _, marker, tail = full_output.rpartition("Answer:")
    return (tail if marker else full_output).strip()


def ask_rag_with_status(question: str):
    """Answer *question* using context retrieved from the vector DB.

    Args:
        question: Natural-language question to answer.

    Returns:
        A 2-tuple ``(answer, status)``. On success ``status`` is a list of
        progress messages; on failure it is the string ``"ERROR"``.
        NOTE(review): the mixed list/str status type is preserved for
        backward compatibility with existing callers.
    """
    if vectordb is None:
        return "Knowledge base not ready.", "ERROR"

    # Retrieve the top-3 most similar chunks as grounding context.
    docs = vectordb.similarity_search(question, k=3)
    context = "\n".join(d.page_content for d in docs)

    # Simple, clear prompt for Qwen.
    prompt = f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"

    try:
        # Greedy decoding: deterministic and fast on CPU. temperature is
        # intentionally NOT passed — combining temperature=0.0 with
        # do_sample=False triggers warnings (and errors in recent
        # transformers versions), since sampling flags are ignored in
        # greedy mode anyway.
        result = qa_pipeline(
            prompt,
            do_sample=False,
            pad_token_id=qa_pipeline.tokenizer.eos_token_id,
        )
        answer = _extract_answer(result[0]["generated_text"])

        if not answer:
            answer = (
                "I found context in the documents but could not generate "
                "a coherent summary. Please rephrase."
            )
        return answer, ["Context retrieved", "Qwen generated answer"]
    except Exception as e:
        # Broad catch is deliberate: generation must never crash the caller.
        # The previous message claimed a timeout for *any* error, which was
        # misleading; report a generic failure with the consistent "ERROR"
        # status used by the other failure path.
        print(f"❌ Generation error: {e}")
        return "Generation failed. Try a shorter or simpler question.", "ERROR"