Dinesh310 committed on
Commit
1925b26
·
verified ·
1 Parent(s): 4928a8c

Create rag_engine.py

Browse files
Files changed (1) hide show
  1. src/rag_engine.py +65 -0
src/rag_engine.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from langchain_community.document_loaders import PyPDFLoader
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
5
+ from langchain_community.vectorstores import FAISS
6
+ from langchain_core.prompts import ChatPromptTemplate
7
+ from langchain_core.output_parsers import StrOutputParser
8
+ from langchain_core.runnables import RunnablePassthrough, RunnableParallel
9
+
10
class ProjectRAGEngine:
    """Retrieval-augmented QA engine over a set of project PDF documents.

    Pipeline: load PDFs -> split into overlapping chunks -> embed into a
    FAISS vector store -> answer questions with an LCEL chain whose prompt
    forces the model to stay grounded in the retrieved context.
    """

    def __init__(self, api_key):
        """Initialize the OpenAI embeddings and chat model.

        Args:
            api_key: OpenAI API key used for both embeddings and the LLM.
        """
        self.embeddings = OpenAIEmbeddings(openai_api_key=api_key)
        # temperature=0 keeps answers deterministic and grounded in context.
        self.llm = ChatOpenAI(model="gpt-4o", openai_api_key=api_key, temperature=0)
        # Built lazily by process_documents(); get_answer() checks for it.
        self.vector_store = None

    def process_documents(self, pdf_paths):
        """Load, chunk, and index the given PDF files into a FAISS store.

        Files that fail to load are skipped with a printed warning so one
        bad file does not abort the whole batch.

        Args:
            pdf_paths: iterable of filesystem paths to PDF files.

        Raises:
            ValueError: if no document could be loaded from ``pdf_paths``
                (FAISS cannot build an index from an empty corpus, and the
                error it raises is cryptic).
        """
        all_docs = []
        for path in pdf_paths:
            try:
                loader = PyPDFLoader(path)
                all_docs.extend(loader.load())
            except Exception as e:
                # Best-effort: report and continue with the remaining files.
                print(f"Error loading {path}: {e}")

        # Fail early with a clear message instead of letting
        # FAISS.from_documents blow up on an empty list.
        if not all_docs:
            raise ValueError("No documents could be loaded; nothing to index.")

        # Overlapping chunks preserve context across page/section
        # boundaries in large reports [cite: 10].
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        splits = text_splitter.split_documents(all_docs)
        self.vector_store = FAISS.from_documents(splits, self.embeddings)

    def _format_docs(self, docs):
        """Join retrieved document chunks into one prompt-ready context string."""
        return "\n\n".join(doc.page_content for doc in docs)

    def get_answer(self, query):
        """Answer ``query`` strictly from the indexed documents.

        Args:
            query: the user's natural-language question.

        Returns:
            tuple[str, list]: the model's answer, and the retrieved source
            chunks as ``{"content": ..., "metadata": ...}`` dicts. If no
            documents have been indexed yet, returns a user-facing notice
            and an empty source list instead of raising.
        """
        if not self.vector_store:
            return "Please upload documents first.", []

        # System prompt ensuring grounded responses [cite: 18, 25]
        template = """
You are a professional Project Analyst. Answer strictly based on the provided context.
If the answer is not in the context, say you don't know.
Cite document names and page numbers for every answer. Include direct quotes.

Context: {context}
Question: {question}
"""
        prompt = ChatPromptTemplate.from_template(template)
        retriever = self.vector_store.as_retriever(search_kwargs={"k": 5})

        # Pure LCEL composition: the inner chain formats the retrieved docs
        # into the prompt and parses the LLM output to a plain string.
        rag_chain_from_docs = (
            RunnablePassthrough.assign(context=(lambda x: self._format_docs(x["context"])))
            | prompt
            | self.llm
            | StrOutputParser()
        )

        # RunnableParallel keeps the raw retrieved docs alongside the
        # generated answer so we can return sources to the caller.
        rag_chain_with_source = RunnableParallel(
            {"context": retriever, "question": RunnablePassthrough()}
        ).assign(answer=rag_chain_from_docs)

        result = rag_chain_with_source.invoke(query)

        sources = [{"content": doc.page_content, "metadata": doc.metadata} for doc in result["context"]]
        return result["answer"], sources