import torch
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain_community.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


class LlamaProcessor:
    def __init__(self, model_id, hf_token):
        self.model_id = model_id
        self.hf_token = hf_token

        # Initialize the pipeline exactly as you requested
        self.pipe = pipeline(
            "text-generation",
            model=model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            token=hf_token
        )

        # LangChain wrapper for the pipeline
        self.llm = HuggingFacePipeline(pipeline=self.pipe)
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

    def process_pdf(self, file_path):
        loader = PyPDFLoader(file_path)
        docs = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
        splits = text_splitter.split_documents(docs)
        vectorstore = Chroma.from_documents(documents=splits, embedding=self.embeddings)
        return vectorstore

    def get_answer(self, query, vectorstore):
        # 1. Retrieve context
        relevant_docs = vectorstore.similarity_search(query, k=3)
        context = "\n".join([doc.page_content for doc in relevant_docs])

        # 2. Format as Llama 3.2 messages (chat template)
        messages = [
            {
                "role": "system",
                "content": "You are a helpful assistant. Use the provided context to answer the user's question."
            },
            {
                "role": "user",
                "content": f"Context: {context}\n\nQuestion: {query}"
            },
        ]

        # 3. Generate a response using the pipeline's built-in chat handling.
        #    Calling the pipeline directly ensures the chat template is applied correctly.
        outputs = self.pipe(
            messages,
            max_new_tokens=256,
            temperature=0.1,
            do_sample=True
        )

        # 4. Extract the content of the last message (the assistant's reply)
        return outputs[0]["generated_text"][-1]["content"]
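

# Minimal usage sketch, not part of the original class. The model id, PDF path,
# and HF_TOKEN environment variable below are assumptions — swap in your own values.
if __name__ == "__main__":
    import os

    processor = LlamaProcessor(
        model_id="meta-llama/Llama-3.2-3B-Instruct",  # assumed Llama 3.2 instruct checkpoint
        hf_token=os.environ.get("HF_TOKEN")           # assumed env var holding your HF access token
    )

    # Build the vector store from a PDF, then answer a question against it
    vectorstore = processor.process_pdf("report.pdf")  # placeholder path
    answer = processor.get_answer("What is the main conclusion of the document?", vectorstore)
    print(answer)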