vansh27 committed
Commit a86c572 · 1 Parent(s): e080e6a

Deploy FastAPI RAG System
Files changed (8)
  1. Dockerfile +24 -0
  2. README.md +17 -10
  3. db.py +15 -0
  4. embeddings.py +35 -0
  5. llm.py +69 -0
  6. main.py +84 -0
  7. requirements.txt +6 -0
  8. retrieval.py +23 -0
Dockerfile ADDED
@@ -0,0 +1,24 @@
+ FROM python:3.10-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements first (cache-friendly)
+ COPY requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy all project files
+ COPY . .
+
+ # Expose FastAPI port
+ EXPOSE 7860
+
+ # Start FastAPI
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,12 +1,19 @@
+ # 📘 RAG Question Answering System
+
+ A **Retrieval-Augmented Generation (RAG)** question answering system built with **FastAPI**, **Hugging Face embeddings**, **SQLite**, and a **free GenAI LLM API**.
+ The system answers questions **strictly from the provided documents**, returns **evidence and a confidence score** with every answer, and guards against hallucination.
+
  ---
- title: Qa Rag Fastapi
- emoji: 🦀
- colorFrom: pink
- colorTo: indigo
- sdk: docker
- pinned: false
- license: mit
- short_description: RAG for QA
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ ## 🚀 Features
+
+ - 📄 Upload plain text documents (`.txt`)
+ - 🧠 Semantic chunking (not line-based)
+ - 🔍 Embedding-based retrieval using Hugging Face
+ - 🤖 GenAI-based answer generation
+ - 📊 Confidence score computation
+ - 📌 Evidence-backed responses
+ - 🚫 Hallucination prevention
+ - ⚡ FastAPI-based REST API
+
+ ---
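
To make the README's feature list concrete, here is a minimal client sketch against a locally running instance; the base URL, the `notes.txt` file, and the sample question are assumptions, not part of the commit:

```python
import requests

BASE = "http://localhost:7860"  # assumed local deployment

# Ingest a plain-text document (multipart file upload)
with open("notes.txt", "rb") as f:
    r = requests.post(f"{BASE}/ingest", files={"file": ("notes.txt", f, "text/plain")})
print(r.json())  # e.g. {"message": "Document ingested", "chunks": 4}

# Ask a question; /ask takes the question as a query parameter
r = requests.post(f"{BASE}/ask", params={"question": "What does the document say about refunds?"})
body = r.json()
print(body["answer"], body["confidence"], len(body["evidence"]))
```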
db.py ADDED
@@ -0,0 +1,15 @@
+ import sqlite3
+
+ conn = sqlite3.connect("rag.db", check_same_thread=False)
+ cursor = conn.cursor()
+
+ cursor.execute("""
+ CREATE TABLE IF NOT EXISTS chunks (
+     document TEXT,
+     chunk_id INTEGER,
+     text TEXT,
+     embedding TEXT
+ )
+ """)
+
+ conn.commit()
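
Embeddings live in a plain TEXT column as JSON strings, which avoids a vector-database dependency at the cost of deserializing every row at query time. A minimal sketch of that round trip, using an in-memory database and a placeholder vector:

```python
import json
import sqlite3

import numpy as np

conn = sqlite3.connect(":memory:")  # stand-in for rag.db
conn.execute(
    "CREATE TABLE chunks (document TEXT, chunk_id INTEGER, text TEXT, embedding TEXT)"
)

emb = np.array([0.1, 0.2, 0.3])  # placeholder embedding vector
conn.execute(
    "INSERT INTO chunks VALUES (?, ?, ?, ?)",
    ("doc.txt", 0, "some chunk text", json.dumps(emb.tolist())),
)

# Reading a row back reverses the serialization
(emb_json,) = conn.execute("SELECT embedding FROM chunks").fetchone()
restored = np.array(json.loads(emb_json))
assert np.allclose(restored, emb)
```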
embeddings.py ADDED
@@ -0,0 +1,35 @@
+ import re
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+
+ embedding_model = SentenceTransformer(
+     "sentence-transformers/all-MiniLM-L6-v2"
+ )
+
+ def cosine_similarity(a, b):
+     return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
+
+
+ def semantic_chunking(text, similarity_threshold=0.75):
+     sentences = re.split(r'(?<=[.!?])\s+', text)  # split on sentence boundaries
+     sentences = [s.strip() for s in sentences if len(s.strip()) > 20]  # drop short fragments
+
+     if len(sentences) <= 1:
+         return sentences
+
+     embeddings = embedding_model.encode(sentences)
+
+     chunks = []
+     current_chunk = [sentences[0]]
+
+     for i in range(1, len(sentences)):
+         sim = cosine_similarity(embeddings[i - 1], embeddings[i])  # adjacent-sentence similarity
+
+         if sim >= similarity_threshold:
+             current_chunk.append(sentences[i])
+         else:
+             chunks.append(" ".join(current_chunk))  # topic shift: close current chunk
+             current_chunk = [sentences[i]]
+
+     chunks.append(" ".join(current_chunk))
+     return chunks
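
A quick illustration of the chunker: adjacent sentences that stay above the 0.75 similarity threshold are merged, and a topic shift opens a new chunk. The sample text is illustrative, and the exact split depends on the embedding model:

```python
from embeddings import semantic_chunking

text = (
    "Solar panels convert sunlight into electricity using photovoltaic cells. "
    "Panel efficiency has improved steadily over the past decade. "
    "Our refund policy allows returns within thirty days of purchase. "
    "Customers must provide a receipt to receive a full refund."
)

for i, chunk in enumerate(semantic_chunking(text)):
    print(i, chunk)
# Plausible output (not guaranteed): the two solar sentences in one chunk,
# the two refund sentences in another.
```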
llm.py ADDED
@@ -0,0 +1,69 @@
+ import requests
+ import os
+ import time
+
+ HF_API_KEY = os.getenv("HF_API_KEY")
+
+ API_URL = (
+     "https://router.huggingface.co/hf-inference/models/google/flan-t5-base"
+ )
+
+ headers = {
+     "Authorization": f"Bearer {HF_API_KEY}",
+     "Content-Type": "application/json"
+ }
+
+
+ def build_prompt(question, chunks):
+     context = "\n".join([c[3] for c in chunks])  # c[3] is the chunk text
+
+     return f"""
+ You are a strict question answering system.
+
+ Answer ONLY using the context below.
+ If the answer is not present, say:
+ "I don't know based on the provided context."
+
+ Context:
+ {context}
+
+ Question:
+ {question}
+
+ Answer:
+ """
+
+
+ def call_llm(prompt, max_retries=5, wait_seconds=6):
+     for _ in range(max_retries):
+         try:
+             response = requests.post(
+                 API_URL,
+                 headers=headers,
+                 json={"inputs": prompt},
+                 timeout=30
+             )
+
+             if not response.text:  # empty body: model likely still starting up
+                 time.sleep(wait_seconds)
+                 continue
+
+             try:
+                 data = response.json()
+             except ValueError:  # non-JSON response; retry
+                 time.sleep(wait_seconds)
+                 continue
+
+             if isinstance(data, dict) and "error" in data:
+                 if "loading" in data["error"].lower():  # model warming up; retry
+                     time.sleep(wait_seconds)
+                     continue
+                 return "I don't know based on the provided context"
+
+             if isinstance(data, list) and len(data) > 0:
+                 return data[0].get("generated_text", "").strip()
+
+         except requests.exceptions.RequestException:  # network error; retry
+             time.sleep(wait_seconds)
+
+     return "I don't know based on the provided context"
main.py ADDED
@@ -0,0 +1,84 @@
+ from fastapi import FastAPI, UploadFile, File
+ import json
+
+ from db import conn, cursor
+ from embeddings import semantic_chunking, embedding_model
+ from retrieval import retrieve_top_chunks
+ from llm import build_prompt, call_llm
+
+ app = FastAPI(
+     title="RAG QA System",
+     version="1.0"
+ )
+
+
+ @app.get("/health")
+ def health():
+     return {"status": "OK"}
+
+
+ @app.post("/ingest")
+ async def ingest(file: UploadFile = File(...)):
+     text = (await file.read()).decode("utf-8")
+
+     chunks = semantic_chunking(text)
+     embeddings = embedding_model.encode(chunks)
+
+     for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
+         cursor.execute(
+             "INSERT INTO chunks VALUES (?, ?, ?, ?)",
+             (file.filename, i, chunk, json.dumps(emb.tolist()))  # embedding serialized as JSON
+         )
+
+     conn.commit()
+     return {
+         "message": "Document ingested",
+         "chunks": len(chunks)
+     }
+
+
+ @app.post("/ask")
+ def ask(question: str):
+     top_chunks = retrieve_top_chunks(question)
+
+     if not top_chunks:
+         return {
+             "question": question,
+             "answer": "I don't know based on the provided context",
+             "confidence": 0.0,
+             "evidence": []
+         }
+
+     best_score = max(c[0] for c in top_chunks)  # best cosine similarity doubles as confidence
+     CONFIDENCE_THRESHOLD = 0.6
+
+     if best_score < CONFIDENCE_THRESHOLD:
+         return {
+             "question": question,
+             "answer": "I don't know based on the provided context",
+             "confidence": round(float(best_score), 2),
+             "evidence": []
+         }
+
+     prompt = build_prompt(question, top_chunks)
+     answer = call_llm(prompt)
+
+     # deterministic fallback: if the LLM abstains despite confident retrieval, return the best chunk
+     if "I don't know based on the provided context" in answer:
+         answer = top_chunks[0][3]
+
+     evidence = [
+         {
+             "document": doc,
+             "chunk_id": cid,
+             "text": text
+         }
+         for _, doc, cid, text in top_chunks
+     ]
+
+     return {
+         "question": question,
+         "answer": answer,
+         "confidence": round(float(best_score), 2),
+         "evidence": evidence
+     }
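
The endpoints can be exercised in-process with FastAPI's `TestClient` (which requires the `httpx` package). One caveat worth hedging: without a valid `HF_API_KEY`, `call_llm` will abstain and the deterministic fallback should return the best-matching chunk verbatim.

```python
from fastapi.testclient import TestClient

from main import app  # importing main also creates rag.db via db.py

client = TestClient(app)

assert client.get("/health").json() == {"status": "OK"}

# Ingest a tiny two-sentence document
doc = b"The capital of France is Paris. Paris is also the largest French city."
print(client.post("/ingest", files={"file": ("demo.txt", doc, "text/plain")}).json())

# Ask against it; the question goes in the query string
r = client.post("/ask", params={"question": "What is the capital of France?"})
print(r.json()["answer"], r.json()["confidence"])
```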
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ fastapi
+ uvicorn
+ python-multipart  # required by FastAPI's UploadFile/File form handling in /ingest
+ sentence-transformers
+ numpy
+ requests
retrieval.py ADDED
@@ -0,0 +1,23 @@
+ import json
+ import numpy as np
+
+ from db import cursor
+ from embeddings import embedding_model, cosine_similarity
+
+
+ def retrieve_top_chunks(question, k=3):
+     q_emb = embedding_model.encode(question)
+
+     cursor.execute(
+         "SELECT document, chunk_id, text, embedding FROM chunks"
+     )
+     rows = cursor.fetchall()
+
+     scored = []
+     for doc, cid, text, emb_json in rows:
+         emb = np.array(json.loads(emb_json))  # embeddings are stored as JSON text
+         score = cosine_similarity(q_emb, emb)
+         scored.append((score, doc, cid, text))
+
+     scored.sort(reverse=True, key=lambda x: x[0])  # highest similarity first
+     return scored[:k]
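
Retrieval is a brute-force scan: every stored embedding is deserialized and scored on each query, which is fine for small corpora but linear in the number of chunks. A toy illustration of the ranking step with fixed vectors, no database involved:

```python
import numpy as np

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

q = np.array([1.0, 0.0])  # pretend query embedding
rows = [
    ("a.txt", 0, "chunk about topic X", np.array([0.9, 0.1])),
    ("a.txt", 1, "chunk about topic Y", np.array([0.1, 0.9])),
]

scored = [(cosine_similarity(q, emb), doc, cid, text) for doc, cid, text, emb in rows]
scored.sort(reverse=True, key=lambda x: x[0])
print(scored[0])  # the topic-X chunk ranks first, mirroring retrieve_top_chunks
```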