Zubaish committed on
Commit
a42513a
·
1 Parent(s): 81345e2

Frontend: robust answer + status handling

Browse files
Files changed (3) hide show
  1. app.py +8 -9
  2. frontend/index.html +21 -67
  3. rag.py +50 -75
app.py CHANGED
@@ -1,22 +1,21 @@
 
1
  from fastapi import FastAPI
2
- from fastapi.responses import HTMLResponse
3
- from fastapi.staticfiles import StaticFiles
4
  from pydantic import BaseModel
5
  from rag import ask_rag_with_status
6
 
7
  app = FastAPI()
8
 
9
- app.mount("/frontend", StaticFiles(directory="frontend"), name="frontend")
10
-
11
  class Query(BaseModel):
12
  question: str
13
 
14
- @app.get("/", response_class=HTMLResponse)
15
- def home():
16
- with open("frontend/index.html", "r", encoding="utf-8") as f:
17
- return f.read()
18
 
19
  @app.post("/chat")
20
  def chat(q: Query):
21
  answer, status = ask_rag_with_status(q.question)
22
- return {"answer": answer, "status": status}
 
 
 
 
1
+ # app.py
2
  from fastapi import FastAPI
 
 
3
  from pydantic import BaseModel
4
  from rag import ask_rag_with_status
5
 
6
  app = FastAPI()
7
 
 
 
8
  class Query(BaseModel):
9
  question: str
10
 
11
+ @app.get("/")
12
+ def health():
13
+ return {"status": "ok"}
 
14
 
15
  @app.post("/chat")
16
  def chat(q: Query):
17
  answer, status = ask_rag_with_status(q.question)
18
+ return {
19
+ "answer": answer,
20
+ "status": status,
21
+ }
frontend/index.html CHANGED
@@ -1,67 +1,21 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset="UTF-8" />
5
- <title>HubRAG</title>
6
- <style>
7
- body {
8
- font-family: sans-serif;
9
- max-width: 800px;
10
- margin: 40px auto;
11
- }
12
- textarea {
13
- width: 100%;
14
- padding: 10px;
15
- }
16
- button {
17
- margin-top: 10px;
18
- padding: 8px 16px;
19
- }
20
- pre {
21
- background: #f5f5f5;
22
- padding: 10px;
23
- white-space: pre-wrap;
24
- }
25
- </style>
26
- </head>
27
- <body>
28
-
29
- <h2>📄 HubRAG (HF Space)</h2>
30
-
31
- <textarea id="q" rows="4" placeholder="Ask a question about the documents..."></textarea>
32
- <br/>
33
- <button onclick="ask()">Ask</button>
34
-
35
- <h3>Status</h3>
36
- <ul id="status"></ul>
37
-
38
- <h3>Answer</h3>
39
- <pre id="answer"></pre>
40
-
41
- <script>
42
- async function ask() {
43
- const q = document.getElementById("q").value;
44
- document.getElementById("answer").textContent = "Thinking...";
45
- document.getElementById("status").innerHTML = "";
46
-
47
- const res = await fetch("/ask", { // <-- ensure this matches backend
48
- method: "POST",
49
- headers: { "Content-Type": "application/json" },
50
- body: JSON.stringify({ question: q })
51
- });
52
-
53
- const data = await res.json();
54
-
55
- document.getElementById("answer").textContent =
56
- data.answer || "No answer";
57
-
58
- (data.status || []).forEach(s => {
59
- const li = document.createElement("li");
60
- li.textContent = s;
61
- document.getElementById("status").appendChild(li);
62
- });
63
- }
64
- </script>
65
-
66
- </body>
67
- </html>
 
# app.py
"""FastAPI backend: health probe plus a /chat endpoint backed by the RAG pipeline."""
from fastapi import FastAPI
from pydantic import BaseModel

from rag import ask_rag_with_status

app = FastAPI()


class Query(BaseModel):
    # Request body for POST /chat: the user's free-text question.
    question: str


@app.get("/")
def health():
    """Liveness check so the hosting platform can confirm the service is up."""
    return {"status": "ok"}


@app.post("/chat")
def chat(q: Query):
    """Run the RAG pipeline on the question and return answer plus status notes."""
    answer, status = ask_rag_with_status(q.question)
    return {"answer": answer, "status": status}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
rag.py CHANGED
@@ -1,64 +1,33 @@
1
- import os
2
- from typing import List
3
-
4
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
5
- from langchain_community.document_loaders import PyPDFLoader
6
- from langchain_text_splitters import RecursiveCharacterTextSplitter
7
  from langchain_community.vectorstores import Chroma
8
  from langchain_community.embeddings import HuggingFaceEmbeddings
9
-
10
  from config import (
11
- KB_DIR,
12
- VECTOR_DB_DIR,
13
  EMBEDDING_MODEL,
14
  LLM_MODEL,
 
 
15
  )
 
16
 
17
- # --------------------------------------------------
18
- # Embeddings (CPU-safe)
19
- # --------------------------------------------------
20
  embeddings = HuggingFaceEmbeddings(
21
  model_name=EMBEDDING_MODEL
22
  )
23
 
24
- # --------------------------------------------------
25
- # Load PDFs (if any)
26
- # --------------------------------------------------
27
- documents = []
28
-
29
- if os.path.exists(KB_DIR):
30
- for file in os.listdir(KB_DIR):
31
- if file.lower().endswith(".pdf"):
32
- loader = PyPDFLoader(os.path.join(KB_DIR, file))
33
- documents.extend(loader.load())
34
-
35
- # --------------------------------------------------
36
- # Split documents
37
- # --------------------------------------------------
38
- splitter = RecursiveCharacterTextSplitter(
39
- chunk_size=500,
40
- chunk_overlap=50
41
- )
42
-
43
- splits = splitter.split_documents(documents) if documents else []
44
-
45
- # --------------------------------------------------
46
- # Vector DB (ONLY if docs exist)
47
- # --------------------------------------------------
48
- vectordb = None
49
- retriever = None
50
-
51
- if splits:
52
- vectordb = Chroma.from_documents(
53
- splits,
54
- embedding=embeddings,
55
- persist_directory=VECTOR_DB_DIR
56
  )
57
- retriever = vectordb.as_retriever(search_kwargs={"k": 3})
 
58
 
59
- # --------------------------------------------------
60
- # Load LLM (CPU ONLY, NO ACCELERATE)
61
- # --------------------------------------------------
62
  tokenizer = AutoTokenizer.from_pretrained(
63
  LLM_MODEL,
64
  trust_remote_code=True
@@ -66,36 +35,35 @@ tokenizer = AutoTokenizer.from_pretrained(
66
 
67
  model = AutoModelForCausalLM.from_pretrained(
68
  LLM_MODEL,
69
- trust_remote_code=True
 
 
70
  )
71
 
72
- llm = pipeline(
73
- "text-generation",
74
- model=model,
75
- tokenizer=tokenizer,
76
- max_new_tokens=256,
77
- do_sample=False
78
- )
79
 
80
- # --------------------------------------------------
81
- # Public RAG API
82
- # --------------------------------------------------
83
- def ask_rag_with_status(question: str):
84
  status = []
85
 
86
- if retriever is None:
87
- return {
88
- "answer": "❌ Knowledge base is empty. Please upload PDFs to the dataset or storage.",
89
- "status": ["⚠️ No documents indexed"]
90
- }
91
 
92
- status.append("🔍 Retrieving documents...")
93
- docs = retriever.get_relevant_documents(question)
 
 
 
 
 
94
 
95
  context = "\n\n".join(d.page_content for d in docs)
 
96
 
97
  prompt = f"""
98
- Use the following context to answer the question.
 
99
 
100
  Context:
101
  {context}
@@ -103,13 +71,20 @@ Context:
103
  Question:
104
  {question}
105
 
106
- Answer clearly and concisely.
107
  """
108
 
109
- status.append("🧠 Generating answer...")
110
- result = llm(prompt)[0]["generated_text"]
 
 
 
 
 
 
 
 
 
 
111
 
112
- return {
113
- "answer": result.strip(),
114
- "status": status
115
- }
 
# rag.py
from typing import List, Tuple

from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModelForCausalLM
from config import (
    EMBEDDING_MODEL,
    LLM_MODEL,
    CHROMA_DIR,
    TOP_K,
)
import torch


# --- Embeddings ---
# Sentence-embedding model used both for indexing and for query-time search.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# --- Vector DB (safe load) ---
# Open the persisted Chroma store if it exists; any failure (missing dir,
# schema mismatch, ...) leaves vectordb as None so the API can degrade
# gracefully instead of crashing at import time.
try:
    vectordb = Chroma(
        persist_directory=CHROMA_DIR,
        embedding_function=embeddings,
    )
except Exception:
    vectordb = None


# --- LLM ---
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL, trust_remote_code=True)

# Half precision on GPU, full precision on CPU; device_map="auto" lets
# accelerate place the weights. NOTE(review): assumes `accelerate` is
# installed for device_map support — confirm in requirements.
model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL,
    trust_remote_code=True,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)
def ask_rag_with_status(question: str) -> Tuple[str, List[str]]:
    """Answer *question* using the Chroma knowledge base.

    Retrieves the TOP_K most similar chunks, builds a context-grounded
    prompt, and generates an answer with the local causal LM.

    Args:
        question: the user's free-text question.

    Returns:
        (answer, status): the generated answer text and a list of
        human-readable progress messages. Early-exit cases (no vector DB,
        no matching documents) return an explanatory answer instead of
        raising.
    """
    status: List[str] = []

    # The vector store may have failed to load at import time (see module
    # setup) — degrade gracefully rather than crash the request.
    if not vectordb:
        return (
            "⚠️ Knowledge base is not loaded yet. Upload documents first.",
            ["Vector DB not initialized"],
        )

    docs = vectordb.similarity_search(question, k=TOP_K)

    if not docs:
        return (
            "⚠️ I could not find relevant information in the knowledge base.",
            ["No documents retrieved"],
        )

    context = "\n\n".join(d.page_content for d in docs)
    status.append(f"Retrieved {len(docs)} chunks")

    prompt = f"""
You are a helpful assistant.
Answer ONLY using the context below.

Context:
{context}

Question:
{question}

Answer:
"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Number of prompt tokens — used below to decode only the new tokens.
    prompt_len = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
        )

    # BUGFIX: decode ONLY the newly generated tokens. The previous code
    # decoded the whole sequence and did answer.split("Answer:")[-1], which
    # truncates or leaks prompt text whenever the retrieved context itself
    # contains the string "Answer:".
    answer = tokenizer.decode(
        output[0][prompt_len:], skip_special_tokens=True
    ).strip()
    status.append("Generated answer")

    return answer, status