import torch
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain_community.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


class LlamaProcessor:
    def __init__(self, model_id, hf_token):
        self.model_id = model_id
        self.hf_token = hf_token

        # Initialize the pipeline exactly as you requested
        self.pipe = pipeline(
            "text-generation",
            model=model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            token=hf_token
        )

        # LangChain wrapper for the pipeline
        self.llm = HuggingFacePipeline(pipeline=self.pipe)
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

    def process_pdf(self, file_path):
        loader = PyPDFLoader(file_path)
        docs = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
        splits = text_splitter.split_documents(docs)
        vectorstore = Chroma.from_documents(documents=splits, embedding=self.embeddings)
        return vectorstore

    def get_answer(self, query, vectorstore):
        # 1. Retrieve context
        relevant_docs = vectorstore.similarity_search(query, k=3)
        context = "\n".join([doc.page_content for doc in relevant_docs])

        # 2. Format as Llama 3.2 messages (chat template)
        messages = [
            {
                "role": "system",
                "content": "You are a helpful assistant. Use the provided context to answer the user's question."
            },
            {
                "role": "user",
                "content": f"Context: {context}\n\nQuestion: {query}"
            },
        ]

        # 3. Generate a response using the pipeline's built-in chat handling.
        #    Calling the pipeline directly ensures the chat template is applied correctly.
        outputs = self.pipe(
            messages,
            max_new_tokens=256,
            temperature=0.1,
            do_sample=True
        )

        # 4. Extract the content of the last message (the assistant's reply)
        return outputs[0]["generated_text"][-1]["content"]
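

# Minimal usage sketch, not part of the original class. The model id, PDF path,
# and HF_TOKEN environment variable below are assumptions — swap in your own values.
if __name__ == "__main__":
    import os

    processor = LlamaProcessor(
        model_id="meta-llama/Llama-3.2-3B-Instruct",  # assumed Llama 3.2 instruct checkpoint
        hf_token=os.environ.get("HF_TOKEN")           # assumed env var holding your HF access token
    )

    # Build the vector store from a PDF, then answer a question against it
    vectorstore = processor.process_pdf("report.pdf")  # placeholder path
    answer = processor.get_answer("What is the main conclusion of the document?", vectorstore)
    print(answer)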