from operator import itemgetter

from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pydantic import BaseModel, Field

from config import NEW_REFINE_SYSTEM_PROMPT_JSON


class Answer(BaseModel):
    """Structured output schema for the refinement LLM call."""

    enhanced_question: str = Field(description="Paraphrased question")
    enhanced_answer: str = Field(description="Enhanced answer")


class RefinementPipeline:
    """RAG pipeline that refines a question/answer pair against a PDF knowledge base.

    On construction the pipeline loads ``pdf_path``, splits it into overlapping
    chunks, embeds the chunks into an in-memory vector store, and wires up an
    LCEL chain: retrieve context for the question -> format the refinement
    prompt -> call the chat model -> parse JSON matching the ``Answer`` schema.
    """

    def __init__(
        self,
        model: str = "gpt-4.1",
        temperature: float = 0.1,
        pdf_path: str = "refine.pdf",
        chunk_size: int = 2000,
        chunk_overlap: int = 200,
    ):
        """Build the retrieval index and the refinement chain.

        Args:
            model: OpenAI chat model name.
            temperature: Sampling temperature for the chat model.
            pdf_path: Path to the PDF used as the retrieval corpus
                (previously hard-coded to ``"refine.pdf"``).
            chunk_size: Character length of each document chunk.
            chunk_overlap: Character overlap between consecutive chunks.
        """
        self.llm = ChatOpenAI(model=model, temperature=temperature)
        self.parser = JsonOutputParser(pydantic_object=Answer)
        self.prompt = PromptTemplate(
            template=NEW_REFINE_SYSTEM_PROMPT_JSON,
            input_variables=["question", "answer", "context"],
            partial_variables={
                "format_instructions": self.parser.get_format_instructions()
            },
        )

        # Load and chunk the source PDF.
        self.pdf_loader = PyMuPDFLoader(pdf_path)
        self.pdf_docs = self.pdf_loader.load()
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        self.splits = self.text_splitter.split_documents(self.pdf_docs)

        # Embed chunks into an in-memory vector store and expose a retriever.
        self.pdf_vectorstore = InMemoryVectorStore.from_documents(
            documents=self.splits, embedding=OpenAIEmbeddings()
        )
        self.pdf_retriever = self.pdf_vectorstore.as_retriever()

        # LCEL chain: retrieve docs for the question, flatten them into a
        # single context string, then prompt -> LLM -> JSON parse.
        self.chain = (
            {
                "context": itemgetter("question") | self.pdf_retriever,
                "question": itemgetter("question"),
                "answer": itemgetter("answer"),
            }
            | RunnableLambda(
                lambda x: {
                    "context": "\n".join(doc.page_content for doc in x["context"]),
                    "question": x["question"],
                    "answer": x["answer"],
                }
            )
            | self.prompt
            | self.llm
            | self.parser
        )

    def invoke(self, question: str, answer: str):
        """Run the refinement chain for one question/answer pair.

        Returns the parsed JSON dict with ``enhanced_question`` and
        ``enhanced_answer`` keys (per the ``Answer`` schema).
        """
        return self.chain.invoke({"question": question, "answer": answer})