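"""RAG pipeline for DOCX documents.

Loads a .docx file, splits it into overlapping chunks, indexes the chunks in an
in-memory FAISS store using sentence-transformers embeddings, and answers
questions with a Gemini chat model through a two-step LangGraph
(retrieve -> generate).
"""
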
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chat_models import init_chat_model
from langchain_huggingface import HuggingFaceEmbeddings
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
import os
from langchain import hub
from dotenv import load_dotenv
from langgraph.graph import START, StateGraph
from typing import List, Dict, Any
from pydantic import BaseModel, Field
from langchain_core.documents import Document

load_dotenv()


class State(BaseModel):
    """Shared state passed between the retrieval and generation steps of the graph."""

    question: str = Field(..., description="Type your question here")
    context: List[Document] = Field(
        default_factory=list,
        description="A list of Document objects",
    )
    answer: str = Field(default="", description="Answer will be here")


class DocProcessor:
    def __init__(self):
        # Initialise the Gemini chat model (requires GOOGLE_API_KEY in the environment)
        if not os.environ.get("GOOGLE_API_KEY"):
            raise ValueError("Google Gemini API key not found in environment variables")
        self.llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai")

        # Local embedding model used to vectorise document chunks
        self.embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"},
        )

        # Standard RAG prompt pulled from the LangChain hub
        self.prompt = hub.pull("rlm/rag-prompt")

        # Vector store is created lazily when a document is processed
        self.vector_store = None
        self.chunk_size = 1000
        self.chunk_overlap = 200
    def process_docx(self, file_path: str) -> Dict[str, Any]:
        """
        Process a DOCX file and prepare it for querying

        Args:
            file_path (str): Path to the DOCX file

        Returns:
            Dict[str, Any]: Processing status and information
        """
        try:
            # Document loading
            loader = Docx2txtLoader(file_path)
            pages = loader.load()

            # Text splitting
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=self.chunk_size,
                chunk_overlap=self.chunk_overlap,
            )
            texts = text_splitter.split_documents(pages)

            # Vector store setup: infer the embedding dimension from a probe query
            embedding_dim = len(self.embedding_model.embed_query("test"))
            index = faiss.IndexFlatL2(embedding_dim)
            self.vector_store = FAISS(
                embedding_function=self.embedding_model,
                index=index,
                docstore=InMemoryDocstore(),
                index_to_docstore_id={},
            )

            # Index the chunks
            self.vector_store.add_documents(documents=texts)

            return {
                "status": "success",
                "message": "DOCX processed successfully",
                "num_pages": len(pages),
                "num_chunks": len(texts),
            }
        except Exception as e:
            return {
                "status": "error",
                "message": f"Error processing DOCX: {str(e)}",
            }
    def query_response(self, query: str) -> Dict[str, Any]:
        """
        Query the processed document

        Args:
            query (str): The question to ask about the document

        Returns:
            Dict[str, Any]: Answer and relevant context
        """
        if not self.vector_store:
            return {
                "status": "error",
                "message": "No document has been processed yet",
            }
        try:
            # Create the state graph
            graph_builder = StateGraph(State)

            # Retrieval step: fetch the chunks most similar to the question
            def retrieve(state: State):
                retrieved_docs = self.vector_store.similarity_search(state.question)
                return {"context": retrieved_docs}

            # Generation step: answer the question from the retrieved context
            def generate(state: State):
                docs_content = "\n\n".join(doc.page_content for doc in state.context)
                messages = self.prompt.invoke({
                    "question": state.question,
                    "context": docs_content,
                })
                response = self.llm.invoke(messages)
                return {"answer": response.content}

            # Build and compile the graph: START -> retrieve -> generate
            graph_builder.add_sequence([retrieve, generate])
            graph_builder.add_edge(START, "retrieve")
            graph = graph_builder.compile()

            # Execute the query
            response = graph.invoke({"question": query})

            return {
                "status": "success",
                "answer": response["answer"],
                "query": query,
            }
        except Exception as e:
            return {
                "status": "error",
                "message": f"Error querying document: {str(e)}",
            }
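

if __name__ == "__main__":
    # Illustrative usage sketch only; "report.docx" and the question below are
    # placeholder values, not part of the original module.
    processor = DocProcessor()

    result = processor.process_docx("report.docx")
    print(result["message"])

    if result["status"] == "success":
        reply = processor.query_response("What is this document about?")
        print(reply.get("answer") or reply.get("message"))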