File size: 4,815 Bytes
d2fe6cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chat_models import init_chat_model
from langchain_huggingface import HuggingFaceEmbeddings
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
import os
from langchain import hub
from dotenv import load_dotenv
from langgraph.graph import START, StateGraph
from typing import List, Dict, Any, Optional
from pydantic import BaseModel, Field
from langchain.docstore.document import Document

# Load environment variables (e.g. GOOGLE_API_KEY) from a local .env file, if present.
load_dotenv()

class State(BaseModel):
    """Shared state flowing through the RAG graph.

    Carries the user's question into retrieval, the retrieved documents into
    generation, and the generated answer back out.
    """

    # The user's question (required field).
    question: str = Field(..., description="Type your question here")
    # Documents retrieved as supporting context; starts out empty.
    context: List[Document] = Field(default_factory=list, description="A list of Document objects")
    # The generated answer; populated by the generation step.
    answer: str = Field(default="", description="Answer will be here")

class DocProcessor:
    """RAG pipeline over a single DOCX file.

    Loads a DOCX, splits it into overlapping chunks, indexes the chunks in an
    in-memory FAISS store, and answers questions through a retrieve -> generate
    LangGraph pipeline backed by Google Gemini.
    """

    def __init__(self):
        """Set up the chat model, embedding model, and RAG prompt.

        Raises:
            ValueError: If GOOGLE_API_KEY is missing from the environment.
        """
        # Fail fast: the Gemini chat model below cannot authenticate without a key.
        if not os.environ.get("GOOGLE_API_KEY"):
            raise ValueError("Google Gemini API key not found in environment variables")

        self.llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai")
        # Pinned to CPU so the class works on machines without a GPU.
        self.embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"}
        )
        self.prompt = hub.pull("rlm/rag-prompt")
        self.vector_store = None  # set by process_docx()
        self._graph = None  # compiled LangGraph, built lazily and cached per document
        self.chunk_size = 1000
        self.chunk_overlap = 200

    def process_docx(self, file_path: str) -> Dict[str, Any]:
        """
        Process a DOCX file and prepare it for querying

        Args:
            file_path (str): Path to the DOCX file

        Returns:
            Dict[str, Any]: Processing status and information
        """
        try:
            # Document Loading
            loader = Docx2txtLoader(file_path)
            pages = loader.load()

            # Text Splitting
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=self.chunk_size,
                chunk_overlap=self.chunk_overlap,
            )
            texts = text_splitter.split_documents(pages)

            # Vector Store Setup: probe the embedding model once to size the index.
            embedding_dim = len(self.embedding_model.embed_query("test"))
            index = faiss.IndexFlatL2(embedding_dim)

            self.vector_store = FAISS(
                embedding_function=self.embedding_model,
                index=index,
                docstore=InMemoryDocstore(),
                index_to_docstore_id={},
            )
            # Invalidate any graph compiled against a previously loaded document.
            self._graph = None

            # Index chunks
            self.vector_store.add_documents(documents=texts)

            return {
                "status": "success",
                "message": "DOCX processed successfully",
                "num_pages": len(pages),
                "num_chunks": len(texts)
            }
        except Exception as e:
            # Surface loader/embedding failures as a structured error payload,
            # matching query_response's error contract, instead of raising.
            return {
                "status": "error",
                "message": f"Error processing DOCX: {str(e)}"
            }

    def _build_graph(self):
        """Compile the retrieve -> generate LangGraph for the current vector store."""

        # Retrieval step: top-k similarity search over the indexed chunks.
        def retrieve(state: State):
            retrieved_docs = self.vector_store.similarity_search(state.question)
            return {"context": retrieved_docs}

        # Generation step: stuff the retrieved chunks into the RAG prompt.
        def generate(state: State):
            docs_content = "\n\n".join(doc.page_content for doc in state.context)
            messages = self.prompt.invoke({
                "question": state.question,
                "context": docs_content
            })
            response = self.llm.invoke(messages)
            return {"answer": response.content}

        graph_builder = StateGraph(State).add_sequence([retrieve, generate])
        # Use the canonical START edge; the imported START constant was
        # previously unused and set_entry_point is the legacy spelling.
        graph_builder.add_edge(START, "retrieve")
        return graph_builder.compile()

    def query_response(self, query: str) -> Dict[str, Any]:
        """
        Query the processed document

        Args:
            query (str): The question to ask about the document

        Returns:
            Dict[str, Any]: Answer and relevant context
        """
        if not self.vector_store:
            return {
                "status": "error",
                "message": "No document has been processed yet"
            }

        try:
            # Build the graph once per processed document, not once per query.
            if self._graph is None:
                self._graph = self._build_graph()

            # Execute the query
            response = self._graph.invoke({
                "question": query
            })

            return {
                "status": "success",
                "answer": response["answer"],
                "query": query
            }
        except Exception as e:
            return {
                "status": "error",
                "message": f"Error querying document: {str(e)}"
            }