File size: 1,626 Bytes
e0fa626
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from src.agents.prompts import RAG_PROMPT


def build_gemini_rag_chain(
    pdf_path: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    embedding_model: str = "models/embedding-001",
    chat_model: str = "gemini-2.0-flash",
):
    """Build a history-aware RAG chain over a single PDF using Gemini.

    Loads the PDF, splits it into overlapping chunks, indexes the chunks
    in an in-memory Chroma vectorstore with Gemini embeddings, and wires a
    chat-history-aware retriever into a stuff-documents QA chain.

    Args:
        pdf_path: Filesystem path to the PDF to index.
        chunk_size: Maximum characters per chunk (default matches the
            original hard-coded value).
        chunk_overlap: Characters of overlap between adjacent chunks.
        embedding_model: Gemini embedding model identifier.
        chat_model: Gemini chat model identifier.

    Returns:
        A retrieval chain whose ``invoke`` expects ``{"input": ...,
        "chat_history": [...]}`` and returns the QA chain's output
        alongside the retrieved context.
    """
    # Load and split documents into retrievable chunks.
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    texts = splitter.split_documents(documents)

    # Index chunks in an (in-memory) Chroma vectorstore.
    vectorstore = Chroma.from_documents(
        texts, embedding=GoogleGenerativeAIEmbeddings(model=embedding_model)
    )
    retriever = vectorstore.as_retriever()

    # Rewrite follow-up questions into standalone ones using chat history,
    # so the retriever gets a self-contained query.
    contextualize_q_prompt = ChatPromptTemplate.from_messages([
        ("system", "Given a chat history and the latest user question, rewrite it as a standalone question."),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ])

    # convert_system_message_to_human: Gemini historically rejected system
    # messages; this flag folds them into the human turn.
    model = ChatGoogleGenerativeAI(
        model=chat_model, convert_system_message_to_human=True
    )
    history_aware_retriever = create_history_aware_retriever(
        model, retriever, contextualize_q_prompt
    )

    # Combine retrieved docs ("stuff" strategy) with the RAG prompt,
    # then chain retrieval -> answer generation.
    qa_chain = create_stuff_documents_chain(model, RAG_PROMPT)
    return create_retrieval_chain(history_aware_retriever, qa_chain)