Commit be6b61f
Parent(s): 88cc76a

feat: Update RAG implementation to V2 with agentic graph and enhanced frontend

- .gitignore +2 -0
- agentic_rag_v2_graph.py +354 -0
- frontend/index.html +5 -2
- llm_utils.py +33 -0
- main.py +12 -29
- rag_store.py +16 -7
- verify_rag.py +43 -0
.gitignore CHANGED

@@ -19,3 +19,5 @@ data/
 # OS / editor
 .vscode/
 .DS_Store
+verify_log.txt
+verify_out.txt
agentic_rag_v2_graph.py ADDED

@@ -0,0 +1,354 @@
from typing import TypedDict, List, Optional, Annotated
import google.generativeai as genai
from langgraph.graph import StateGraph, END
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph.message import add_messages
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage
import time
import random

from rag_store import search_knowledge
from eval_logger import log_eval
from llm_utils import generate_with_retry

MODEL_NAME = "gemini-2.5-flash"
MAX_RETRIES = 2


def format_history(messages: List[BaseMessage]) -> str:
    history_str = ""
    for msg in messages:
        role = "User" if isinstance(msg, HumanMessage) else "Assistant"
        history_str += f"{role}: {msg.content}\n"
    return history_str


# ===============================
# STATE
# ===============================
class AgentState(TypedDict):
    messages: Annotated[List[BaseMessage], add_messages]
    query: str
    refined_query: str
    decision: str
    retrieved_chunks: List[dict]
    retrieval_quality: str
    retries: int
    answer: Optional[str]
    confidence: float
    answer_known: bool


# ===============================
# LLM DECISION NODE (PLANNER)
# ===============================
def llm_decision_node(state: AgentState) -> AgentState:
    history = format_history(state.get("messages", []))
    prompt = f"""
You are an AI agent deciding whether a question requires document retrieval.
Answer ONLY one word:
- use_rag
- no_rag

Conversation History:
{history}

Current Question:
{state["query"]}
"""
    model = genai.GenerativeModel(MODEL_NAME)
    resp = generate_with_retry(model, prompt)

    decision = "use_rag"
    if resp and "no_rag" in resp.text.lower():
        decision = "no_rag"

    return {**state, "decision": decision}


# ===============================
# RETRIEVAL NODE (TOOL)
# ===============================
def retrieve_node(state: AgentState) -> AgentState:
    q = state["refined_query"] or state["query"]
    chunks = search_knowledge(q)
    return {**state, "retrieved_chunks": chunks}


# ===============================
# GRADE DOCUMENTS NODE (GRADER)
# ===============================
def grade_documents_node(state: AgentState) -> AgentState:
    """
    Determines whether the retrieved documents are relevant to the question.
    """
    query = state["query"]
    retrieved_docs = state["retrieved_chunks"]

    filtered_docs = []
    for doc in retrieved_docs:
        prompt = f"""
You are a grader assessing relevance of a retrieved document to a user question.

Retrieved document:
{doc['text']}

User question:
{query}

If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant.
Give a binary score 'yes' or 'no' to indicate whether the document is relevant to the question.

Answer ONLY 'yes' or 'no'.
"""
        model = genai.GenerativeModel(MODEL_NAME)
        resp = generate_with_retry(model, prompt)
        score = resp.text.strip().lower() if resp else "no"

        if "yes" in score:
            filtered_docs.append(doc)

    return {**state, "retrieved_chunks": filtered_docs}


# ===============================
# RETRIEVAL EVALUATION (CRITIC)
# ===============================
def evaluate_retrieval_node(state: AgentState) -> AgentState:
    if not state["retrieved_chunks"]:
        return {**state, "retrieval_quality": "bad"}

    context_sample = "\n".join(c["text"][:200] for c in state["retrieved_chunks"][:3])

    prompt = f"""
Evaluate whether the following retrieved context is sufficient
to answer the question.

Answer ONLY one word:
- good
- bad

Question:
{state["query"]}

Context:
{context_sample}
"""

    model = genai.GenerativeModel(MODEL_NAME)
    resp = generate_with_retry(model, prompt)

    quality = "bad"
    if resp and "good" in resp.text.lower():
        quality = "good"

    return {**state, "retrieval_quality": quality}


# ===============================
# QUERY REFINEMENT (SELF-CORRECTION)
# ===============================
def refine_query_node(state: AgentState) -> AgentState:
    history = format_history(state.get("messages", []))
    prompt = f"""
Rewrite the following question to improve document retrieval.
Be concise and factual.

Conversation History:
{history}

Original question:
{state["query"]}
"""

    model = genai.GenerativeModel(MODEL_NAME)
    resp = generate_with_retry(model, prompt)

    refined = resp.text.strip() if resp else state["query"]

    return {
        **state,
        "refined_query": refined,
        "retries": state["retries"] + 1
    }


# ===============================
# ANSWER WITH RAG (HIGH CONF)
# ===============================
def answer_with_rag_node(state: AgentState) -> AgentState:
    context = "\n\n".join(c["text"] for c in state["retrieved_chunks"])
    history = format_history(state.get("messages", []))

    prompt = f"""
Answer using ONLY the context below.
If the answer is not present, say "I don't know".

Context:
{context}

Conversation History:
{history}

Question:
{state["query"]}
"""

    model = genai.GenerativeModel(MODEL_NAME)
    resp = generate_with_retry(model, prompt)
    answer_text = resp.text if resp else "Error generating answer due to quota limits."

    answer_known = "i don't know" not in answer_text.lower()
    confidence = min(0.95, 0.6 + (0.1 * len(state["retrieved_chunks"])))

    log_eval(
        query=state["query"],
        retrieved_count=len(state["retrieved_chunks"]),
        confidence=confidence,
        answer_known=answer_known
    )

    # Append interaction to memory
    new_messages = [
        HumanMessage(content=state["query"]),
        AIMessage(content=answer_text)
    ]

    return {
        **state,
        "messages": new_messages,
        "answer": answer_text,
        "confidence": confidence,
        "answer_known": answer_known
    }


# ===============================
# ANSWER WITHOUT RAG
# ===============================
def answer_direct_node(state: AgentState) -> AgentState:
    history = format_history(state.get("messages", []))
    prompt = f"""
Conversation History:
{history}

Answer clearly and concisely:
{state['query']}
"""

    model = genai.GenerativeModel(MODEL_NAME)
    resp = generate_with_retry(model, prompt)
    answer_text = resp.text if resp else "Error generating answer due to quota limits."

    log_eval(
        query=state["query"],
        retrieved_count=0,
        confidence=0.4,
        answer_known=True
    )

    # Append interaction to memory
    new_messages = [
        HumanMessage(content=state["query"]),
        AIMessage(content=answer_text)
    ]

    return {
        **state,
        "messages": new_messages,
        "answer": answer_text,
        "confidence": 0.4,
        "answer_known": True
    }


# ===============================
# NO ANSWER
# ===============================
def no_answer_node(state: AgentState) -> AgentState:
    log_eval(
        query=state["query"],
        retrieved_count=0,
        confidence=0.0,
        answer_known=False
    )

    answer_text = "I don't know based on the provided documents."

    # Append interaction to memory
    new_messages = [
        HumanMessage(content=state["query"]),
        AIMessage(content=answer_text)
    ]

    return {
        **state,
        "messages": new_messages,
        "answer": answer_text,
        "confidence": 0.0,
        "answer_known": False
    }


# ===============================
# GRAPH BUILDER
# ===============================
def build_agentic_rag_v2_graph():
    graph = StateGraph(AgentState)
    memory = MemorySaver()

    graph.add_node("decide", llm_decision_node)
    graph.add_node("retrieve", retrieve_node)
    graph.add_node("grade", grade_documents_node)
    graph.add_node("evaluate", evaluate_retrieval_node)
    graph.add_node("refine", refine_query_node)
    graph.add_node("answer_rag", answer_with_rag_node)
    graph.add_node("answer_direct", answer_direct_node)
    graph.add_node("no_answer", no_answer_node)

    graph.set_entry_point("decide")

    graph.add_conditional_edges(
        "decide",
        lambda s: s["decision"],
        {
            "use_rag": "retrieve",
            "no_rag": "answer_direct"
        }
    )

    graph.add_edge("retrieve", "grade")

    def check_relevance(state):
        if not state["retrieved_chunks"]:
            if state["retries"] >= MAX_RETRIES:
                return "no_answer"
            return "rewrite"
        return "evaluate"

    graph.add_conditional_edges(
        "grade",
        check_relevance,
        {
            "rewrite": "refine",
            "evaluate": "evaluate",
            "no_answer": "no_answer"
        }
    )

    graph.add_conditional_edges(
        "evaluate",
        lambda s: "retry" if s["retrieval_quality"] == "bad" and s["retries"] < MAX_RETRIES else "answer",
        {
            "retry": "refine",
            "answer": "answer_rag"
        }
    )

    graph.add_edge("refine", "retrieve")
    graph.add_edge("answer_rag", END)
    graph.add_edge("answer_direct", END)
    graph.add_edge("no_answer", END)

    return graph.compile(checkpointer=memory)
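For context, the compiled graph is driven the same way in main.py and verify_rag.py below; a minimal sketch (the query string and thread id are illustrative, and GOOGLE_API_KEY is assumed to be configured for google.generativeai):

    from agentic_rag_v2_graph import build_agentic_rag_v2_graph

    graph = build_agentic_rag_v2_graph()
    config = {"configurable": {"thread_id": "demo-thread"}}  # checkpoint key for MemorySaver

    result = graph.invoke({
        "messages": [],  # add_messages reducer: an empty list adds nothing to history
        "query": "What does the report say about Q3 revenue?",  # illustrative
        "refined_query": "",
        "decision": "",
        "retrieved_chunks": [],
        "retrieval_quality": "",
        "retries": 0,
        "answer": None,
        "confidence": 0.0,
        "answer_known": False,
    }, config=config)

    print(result["answer"], result["confidence"])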
frontend/index.html CHANGED

@@ -264,6 +264,8 @@
 <script>
 const MAX_FILE_SIZE = 50 * 1024 * 1024; // 50MB
+let threadId = sessionStorage.getItem("rag_thread_id") || crypto.randomUUID();
+sessionStorage.setItem("rag_thread_id", threadId);
 
 async function upload() {
   const fileInput = document.getElementById("files");

@@ -375,9 +377,10 @@
 }
 
 function newChat() {
-  document.getElementById("question").value = "";
   document.getElementById("answerBox").style.display = "none";
   document.getElementById("answerBox").innerHTML = "";
+  threadId = crypto.randomUUID();
+  sessionStorage.setItem("rag_thread_id", threadId);
 }
 
 function clearHistory() {

@@ -402,7 +405,7 @@
 const res = await fetch("/ask", {
   method: "POST",
   headers: { "Content-Type": "application/json" },
-  body: JSON.stringify({ prompt: q })
+  body: JSON.stringify({ prompt: q, thread_id: threadId })
 });
 
 const data = await res.json();
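The /ask payload now pairs each question with the session's thread id. A minimal sketch of exercising the endpoint from Python (the URL and question are illustrative; assumes the FastAPI app is running locally):

    import requests

    resp = requests.post(
        "http://localhost:8000/ask",  # illustrative URL
        json={"prompt": "What is my name?", "thread_id": "demo-thread"},
    )
    print(resp.json()["answer"])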
llm_utils.py ADDED

@@ -0,0 +1,33 @@
import time
import random
import google.generativeai as genai
from google.api_core import exceptions

def generate_with_retry(model, prompt, retries=3, base_delay=2):
    """
    Generates content using the Gemini model, with exponential backoff for rate limits.
    """
    for i in range(retries):
        try:
            return model.generate_content(prompt)
        except Exception as e:
            # Check for rate limit (429) or quota exceeded (ResourceExhausted).
            is_quota_error = (
                "429" in str(e)
                or "quota" in str(e).lower()
                or isinstance(e, exceptions.ResourceExhausted)
            )

            if is_quota_error:
                if i < retries - 1:
                    sleep_time = base_delay * (2 ** i) + random.uniform(0, 1)
                    print(f"⚠️ Quota exceeded. Retrying in {sleep_time:.2f}s... (Attempt {i+1}/{retries})")
                    time.sleep(sleep_time)
                    continue
                else:
                    print(f"❌ Quota exceeded after {retries} attempts.")
                    # Re-raise rather than swallow the error so the caller can
                    # handle the failure (e.g. return 503 Service Unavailable)
                    # and identify it strictly as a quota error.
                    raise e
            # Non-quota errors are not retried; signal failure to the caller.
            return None
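Call sites treat a None return as a soft failure; a usage sketch mirroring the call sites in main.py (the model name and prompt are illustrative):

    import google.generativeai as genai
    from llm_utils import generate_with_retry

    model = genai.GenerativeModel("gemini-2.5-flash")
    resp = generate_with_retry(model, "Summarize: ...", retries=3, base_delay=2)
    text = resp.text if resp else "Error generating answer due to quota limits."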
main.py CHANGED

@@ -11,6 +11,7 @@ import google.generativeai as genai
 from rag_store import ingest_documents, get_all_chunks, clear_database
 from analytics import get_analytics
 from agentic_rag_v2_graph import build_agentic_rag_v2_graph
+from llm_utils import generate_with_retry
 
 # =========================================================
 # ENV + MODEL

@@ -39,18 +40,7 @@ app.mount("/frontend", StaticFiles(directory="frontend"), name="frontend")
 # =========================================================
 # SECURITY
 # =========================================================
-from fastapi import Request, HTTPException, Depends
-from fastapi.security import APIKeyCookie
 
-ADMIN_PASSWORD = os.getenv("ADMIN_PASSWORD", "secret")
-COOKIE_NAME = "rag_auth"
-
-api_key_cookie = APIKeyCookie(name=COOKIE_NAME, auto_error=False)
-
-async def verify_admin(cookie: str = Depends(api_key_cookie)):
-    if cookie != ADMIN_PASSWORD:
-        raise HTTPException(status_code=401, detail="Unauthorized")
-    return cookie
 
 # =========================================================
 # STATE

@@ -63,39 +53,28 @@ answer_cache: dict[str, tuple[float, dict]] = {}
 # =========================================================
 class PromptRequest(BaseModel):
     prompt: str
+    thread_id: str = "default"
+
 
-class LoginRequest(BaseModel):
-    password: str
 
 # =========================================================
 # ROUTES
 # =========================================================
-@app.post("/login")
-def login(data: LoginRequest):
-    if data.password != ADMIN_PASSWORD:
-        raise HTTPException(status_code=401, detail="Invalid password")
-
-    response = JSONResponse(content={"message": "Logged in"})
-    response.set_cookie(key=COOKIE_NAME, value=data.password, httponly=True)
-    return response
 
-@app.get("/me")
-def me(user: str = Depends(verify_admin)):
-    return {"status": "authenticated"}
 
 @app.get("/", response_class=HTMLResponse)
 def serve_ui():
     with open("frontend/index.html", "r", encoding="utf-8") as f:
         return f.read()
 
-@app.get("/analytics"
+@app.get("/analytics")
 def analytics():
     return get_analytics()
 
 # ---------------------------------------------------------
 # UPLOAD
 # ---------------------------------------------------------
-@app.post("/upload"
+@app.post("/upload")
 async def upload(files: list[UploadFile] = File(...)):
     for file in files:
         ext = file.filename.split(".")[-1].lower()

@@ -144,12 +123,15 @@ async def ask(data: PromptRequest):
         context = "\n\n".join(c["text"] for c in chunks)
 
         model = genai.GenerativeModel(MODEL_NAME)
-        resp = model.generate_content(
+        resp = generate_with_retry(
+            model,
             f"Summarize the following content clearly:\n\n{context}"
         )
+
+        answer_text = resp.text if resp else "Error generating summary due to quota limits."
 
         response = {
-            "answer": resp.text,
+            "answer": answer_text,
             "confidence": 0.95,
             "citations": []
         }

@@ -161,6 +143,7 @@ async def ask(data: PromptRequest):
     # ==========================
     # 🟩 AGENTIC RAG (LLM + EVALUATION)
     # ==========================
     result = agentic_graph.invoke({
+        "messages": [],
         "query": query,
         "refined_query": "",
         "decision": "",

@@ -170,7 +153,7 @@ async def ask(data: PromptRequest):
         "answer": None,
         "confidence": 0.0,
         "answer_known": False
-    })
+    }, config={"configurable": {"thread_id": data.thread_id}})
 
     response = {
         "answer": result["answer"],
rag_store.py CHANGED

@@ -90,13 +90,22 @@ def ingest_documents(files):
 
     for file in files:
         if file.filename.endswith(".pdf"):
-
-
-
-
-
-
-
+            # Save temp file for pymupdf4llm
+            import tempfile
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+                tmp.write(file.file.read())
+                tmp_path = tmp.name
+
+            try:
+                # Use pymupdf4llm to extract markdown with tables
+                import pymupdf4llm
+                md_text = pymupdf4llm.to_markdown(tmp_path)
+
+                for chunk in chunk_text(md_text):
+                    texts.append(chunk)
+                    meta.append({"source": file.filename, "page": "N/A"})  # pymupdf4llm merges pages by default
+            finally:
+                os.remove(tmp_path)
 
         elif file.filename.endswith(".txt"):
             content = file.file.read().decode("utf-8", errors="ignore")
verify_rag.py ADDED

@@ -0,0 +1,43 @@
import asyncio
from agentic_rag_v2_graph import build_agentic_rag_v2_graph

async def main():
    graph = build_agentic_rag_v2_graph()
    thread_id = "test-thread-1"
    config = {"configurable": {"thread_id": thread_id}}

    print("--- Turn 1 ---")
    inputs = {
        "messages": [],  # Initialize
        "query": "My name is Alice.",
        "refined_query": "",
        "decision": "",
        "retrieved_chunks": [],
        "retrieval_quality": "",
        "retries": 0,
        "answer": None,
        "confidence": 0.0,
        "answer_known": False
    }

    result = await graph.ainvoke(inputs, config=config)
    print(f"Answer 1: {result['answer']}")

    print("\n--- Turn 2 ---")
    inputs["query"] = "What is my name?"
    # MemorySaver restores the checkpointed state for this thread, so the prior
    # history does not need to be passed in again. Because "messages" uses the
    # add_messages reducer, an empty list simply adds nothing to the history.
    inputs["messages"] = []

    result = await graph.ainvoke(inputs, config=config)
    print(f"Answer 2: {result['answer']}")

if __name__ == "__main__":
    try:
        asyncio.run(main())
    except Exception:
        import traceback
        traceback.print_exc()
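The comment about passing an empty messages list rests on how the add_messages reducer merges input into checkpointed state; a small standalone sketch (not part of the commit):

    from langchain_core.messages import AIMessage, HumanMessage
    from langgraph.graph.message import add_messages

    history = [HumanMessage(content="My name is Alice."), AIMessage(content="Hi Alice!")]
    same = add_messages(history, [])      # empty right side: history is kept as-is
    longer = add_messages(history, [HumanMessage(content="What is my name?")])
    print(len(same), len(longer))         # 2 3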