|
|
import os |
|
|
from langchain_community.document_loaders import PyPDFLoader |
|
|
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
|
from langchain_community.vectorstores import FAISS |
|
|
from langchain_core.prompts import ChatPromptTemplate |
|
|
from langchain_core.output_parsers import StrOutputParser |
|
|
from langchain_core.runnables import RunnablePassthrough, RunnableParallel |
|
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
|
from langchain_openai import ChatOpenAI |
|
|
from langchain_community.chat_models import ChatLiteLLM |
|
|
from langchain_core.messages import HumanMessage, AIMessage |
|
|
|
|
|
class ProjectRAGEngine:
    """Retrieval-augmented QA engine over a set of PDF documents.

    Pipeline: PDFs are loaded page-by-page, split into overlapping text
    chunks, embedded with a local sentence-transformers model, and stored
    in an in-memory FAISS index. Questions are answered by an
    OpenRouter-hosted chat model that is instructed to use only the
    retrieved context.
    """

    def __init__(self):
        # Local CPU embedding model. Embeddings are L2-normalized so that
        # FAISS inner-product search behaves like cosine similarity.
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"},
            encode_kwargs={"normalize_embeddings": True},
        )

        # Chat model routed through OpenRouter's OpenAI-compatible API.
        # NOTE(review): os.getenv returns None when OPENROUTER_API_KEY is
        # unset, which only surfaces later as an auth/validation error —
        # consider failing fast here; confirm whether another env fallback
        # is relied upon before changing.
        self.llm = ChatOpenAI(
            model="openai/gpt-oss-120b:free",
            base_url="https://openrouter.ai/api/v1",
            api_key=os.getenv("OPENROUTER_API_KEY"),
            extra_body={"reasoning": {"enabled": True}},
        )

        # Populated by process_documents(); None until documents are indexed.
        self.vector_store = None

    def process_documents(self, pdf_paths):
        """Load, chunk, and index PDFs, replacing any existing index.

        Args:
            pdf_paths: iterable of filesystem paths to PDF files.

        Raises:
            ValueError: if ``pdf_paths`` is empty, or if no text chunks
                could be extracted (e.g. scanned/image-only PDFs).
                Without this guard FAISS.from_documents fails on an empty
                list with an opaque IndexError.
        """
        if not pdf_paths:
            raise ValueError("No PDF paths were provided.")

        all_docs = []
        for path in pdf_paths:
            loader = PyPDFLoader(path)
            # load() yields one Document per page, with source/page metadata
            # that the prompt later asks the model to cite.
            all_docs.extend(loader.load())

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
        )
        splits = splitter.split_documents(all_docs)

        if not splits:
            raise ValueError(
                "No text could be extracted from the provided PDFs."
            )

        self.vector_store = FAISS.from_documents(splits, self.embeddings)

    def _format_docs(self, docs):
        """Join retrieved chunks into one context string, blank-line separated."""
        return "\n\n".join(d.page_content for d in docs)

    def get_answer(self, query):
        """Answer ``query`` using only the indexed document context.

        Args:
            query: the user's question as a plain string.

        Returns:
            A ``(answer, sources)`` tuple. ``answer`` is the model's text
            response; ``sources`` is a list of ``{"content", "metadata"}``
            dicts for the retrieved chunks. If no documents have been
            indexed yet, returns a fixed prompt-to-upload message and an
            empty source list.
        """
        if not self.vector_store:
            return "Please upload documents first.", []

        template = """
You are a professional Project Analyst.
Answer strictly using the context.
If unknown, say you don't know.
Cite document names and page numbers.
Context:
{context}
Question:
{question}
"""
        prompt = ChatPromptTemplate.from_template(template)

        # MMR retrieval: top 5 chunks, with lambda_mult=0.25 favoring
        # diversity over pure similarity to reduce redundant chunks.
        retriever = self.vector_store.as_retriever(
            search_type="mmr",
            search_kwargs={"k": 5, "lambda_mult": 0.25},
        )

        # Inner chain: flatten the retrieved Documents into a context
        # string, fill the prompt, call the LLM, extract plain text.
        rag_chain = (
            RunnablePassthrough.assign(
                context=lambda x: self._format_docs(x["context"])
            )
            | prompt
            | self.llm
            | StrOutputParser()
        )

        # Outer chain: run retrieval and pass the raw question in
        # parallel, then attach the generated answer — keeping the
        # retrieved Documents available for source attribution.
        chain = RunnableParallel(
            {"context": retriever, "question": RunnablePassthrough()}
        ).assign(answer=rag_chain)

        result = chain.invoke(query)

        sources = [
            {"content": d.page_content, "metadata": d.metadata}
            for d in result["context"]
        ]

        return result["answer"], sources