"""Conversation-aware document Q&A helpers built on a local Hugging Face model."""

from functools import lru_cache
from typing import Optional

import torch
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

_REWRITE_TEMPLATE = """Given the conversation history and a follow-up question, rewrite the question into a standalone search query that will retrieve relevant information from a document.

Conversation History:
{history}

Follow-up Question: {question}

Standalone Search Query:"""

_ANSWER_TEMPLATE = """You are a highly efficient and accurate AI assistant for a document Q&A system. You MUST answer the user's question directly based ONLY on the provided Context.

Conversation History:
{history}

Context:
{context}

Question: {question}

Answer:"""


@lru_cache(maxsize=1)
def get_llm() -> HuggingFacePipeline:
    """Return the shared LangChain wrapper around the local text-generation model.

    The tokenizer and weights are loaded on the first call only; later calls
    reuse the same instance. Without the cache every query rewrite and every
    answer would reload the model from disk.
    """
    model_id = "Qwen/Qwen3-0.6B"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        # NOTE(review): transformers only applies `temperature` when
        # `do_sample=True`; as configured, decoding is presumably greedy.
        # Confirm which behaviour is wanted before enabling sampling.
        temperature=0.1,
        # Return only the completion. By default the pipeline echoes the
        # prompt, which would leak the whole prompt into the rewritten search
        # query and into the final answer.
        return_full_text=False,
    )
    return HuggingFacePipeline(pipeline=pipe)


def _format_history(
    chat_history: list,
    max_answer_chars: Optional[int] = None,
) -> str:
    """Render turns as "User:/Assistant:" lines, optionally truncating answers."""
    # Slicing with None keeps the full answer, so one code path serves both
    # the truncated and the untruncated transcript.
    return "".join(
        f"User: {turn['question']}\n"
        f"Assistant: {turn['answer'][:max_answer_chars]}\n"
        for turn in chat_history
    )


def _run_prompt(template: str, variables: dict) -> str:
    """Fill *template* with *variables*, run it through the LLM, return the text."""
    chain = ChatPromptTemplate.from_template(template) | get_llm() | StrOutputParser()
    return chain.invoke(variables)


def rewrite_query(question: str, chat_history: list) -> str:
    """Rewrite a follow-up question into a standalone search query.

    Args:
        question: The user's latest question.
        chat_history: Earlier turns, each a mapping with ``"question"`` and
            ``"answer"`` keys. Only the last three turns are used, and each
            answer is cut to 150 characters to keep the prompt short.

    Returns:
        The model's rewritten query with surrounding whitespace removed.
    """
    history_text = _format_history(chat_history[-3:], max_answer_chars=150)
    rewritten = _run_prompt(
        _REWRITE_TEMPLATE,
        {
            "history": history_text if history_text else "(none)",
            "question": question,
        },
    )
    return rewritten.strip()


def answer_with_memory(vectorstore, question: str, chat_history: list, k: int = 6):
    """Answer *question* from retrieved documents, taking prior turns into account.

    The question is first rewritten into a standalone query, that query is
    used for a similarity search, and the model is then prompted with the
    full conversation history, the retrieved context, and the original
    question.

    Args:
        vectorstore: Object exposing ``similarity_search(query, k=...)`` whose
            results have a ``page_content`` attribute.
        question: The user's latest question.
        chat_history: Earlier turns, each a mapping with ``"question"`` and
            ``"answer"`` keys.
        k: Number of documents to retrieve.

    Returns:
        A ``(answer, retrieved_docs)`` tuple: the model's answer text and the
        documents returned by the similarity search.
    """
    search_query = rewrite_query(question, chat_history)
    retrieved_docs = vectorstore.similarity_search(search_query, k=k)

    # Number the sources from 1 so the labels read naturally in the prompt.
    context = "\n\n".join(
        f"[Source {i}]\n{doc.page_content}"
        for i, doc in enumerate(retrieved_docs, 1)
    )

    answer = _run_prompt(
        _ANSWER_TEMPLATE,
        {
            "history": _format_history(chat_history),
            "context": context,
            "question": question,
        },
    )
    return answer, retrieved_docs