from langchain_core.output_parsers import JsonOutputParser
from langchain_core.runnables import RunnableLambda
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from pydantic import BaseModel, Field
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from operator import itemgetter
from config import NEW_REFINE_SYSTEM_PROMPT_JSON


class Answer(BaseModel):
    """Structured JSON schema the refinement chain must emit.

    The field `description`s below are injected into the prompt via
    `JsonOutputParser.get_format_instructions()`, so they are part of the
    model's instructions — do not edit them casually.
    """

    enhanced_question: str = Field(description="Paraphrased question")
    enhanced_answer: str = Field(description="Enhanced answer")


class RefinementPipeline:
    """Refine a question/answer pair using context retrieved from a PDF.

    On construction this pipeline:
      1. loads the PDF at ``pdf_path`` and splits it into overlapping chunks,
      2. embeds the chunks into an in-memory vector store (OpenAI embeddings),
      3. wires an LCEL chain: retrieve context for the question -> format
         inputs -> prompt -> chat model -> JSON parse into the ``Answer``
         schema.

    NOTE: ``__init__`` performs file I/O and OpenAI API calls (embeddings),
    so instantiation requires both the PDF on disk and API credentials.
    """

    def __init__(
        self,
        model: str = "gpt-4.1",
        temperature: float = 0.1,
        pdf_path: str = "refine.pdf",
        chunk_size: int = 2000,
        chunk_overlap: int = 200,
    ):
        """Build the retrieval + refinement chain.

        Args:
            model: OpenAI chat model name.
            temperature: Sampling temperature for the chat model.
            pdf_path: Path to the source PDF (previously hard-coded).
            chunk_size: Character length of each document chunk.
            chunk_overlap: Character overlap between consecutive chunks.
        """
        self.llm = ChatOpenAI(model=model, temperature=temperature)
        self.parser = JsonOutputParser(pydantic_object=Answer)
        self.prompt = PromptTemplate(
            template=NEW_REFINE_SYSTEM_PROMPT_JSON,
            input_variables=["question", "answer", "context"],
            partial_variables={"format_instructions": self.parser.get_format_instructions()},
        )

        # Load and chunk the PDF.
        self.pdf_loader = PyMuPDFLoader(pdf_path)
        self.pdf_docs = self.pdf_loader.load()
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        self.splits = self.text_splitter.split_documents(self.pdf_docs)

        # Embed chunks into an in-memory vector store and expose a retriever.
        self.pdf_vectorstore = InMemoryVectorStore.from_documents(
            documents=self.splits, embedding=OpenAIEmbeddings()
        )
        self.pdf_retriever = self.pdf_vectorstore.as_retriever()

        # LCEL chain: the retriever sees only the question; question/answer
        # pass through unchanged; retrieved Documents are flattened to one
        # context string before templating.
        self.chain = (
            {
                "context": itemgetter("question") | self.pdf_retriever,
                "question": itemgetter("question"),
                "answer": itemgetter("answer"),
            }
            | RunnableLambda(self._flatten_context)
            | self.prompt
            | self.llm
            | self.parser
        )

    @staticmethod
    def _flatten_context(inputs: dict) -> dict:
        """Join retrieved Document objects into a single newline-separated
        context string, passing question and answer through untouched."""
        return {
            "context": "\n".join(doc.page_content for doc in inputs["context"]),
            "question": inputs["question"],
            "answer": inputs["answer"],
        }

    def invoke(self, question: str, answer: str) -> dict:
        """Run the chain on one question/answer pair.

        Returns the parsed JSON dict matching the ``Answer`` schema
        (``enhanced_question`` / ``enhanced_answer``).
        """
        return self.chain.invoke({"question": question, "answer": answer})