Deploy FastAPI RAG System
- Dockerfile +24 -0
- README.md +17 -10
- db.py +15 -0
- embeddings.py +35 -0
- llm.py +69 -0
- main.py +84 -0
- requirements.txt +5 -0
- retrieval.py +23 -0
Dockerfile
ADDED
@@ -0,0 +1,24 @@
FROM python:3.10-slim

# Set working directory
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    git \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first (cache-friendly)
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy all project files
COPY . .

# Expose FastAPI port
EXPOSE 7860

# Start FastAPI
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
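Once the image is built, it is worth smoke-testing the exposed port. A minimal sketch (not part of the commit), assuming the container was started with the port mapped; the image tag `rag-qa` is an arbitrary choice:

```python
# Minimal smoke test, assuming the image was built and started with the
# port mapped, e.g.: docker build -t rag-qa . && docker run -p 7860:7860 rag-qa
# (the tag "rag-qa" is arbitrary).
import requests

resp = requests.get("http://localhost:7860/health", timeout=10)
print(resp.json())  # expected: {"status": "OK"}
```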
README.md
CHANGED
@@ -1,12 +1,19 @@
+# 📘 RAG Question Answering System
+
+A **Retrieval-Augmented Generation (RAG)** based Question Answering system built using **FastAPI**, **Hugging Face embeddings**, **SQLite**, and a **free GenAI LLM API**.
+The system answers questions **strictly based on provided documents**, includes **evidence and confidence**, and prevents hallucination.
+
 ---
-title: Qa Rag Fastapi
-emoji: 🦀
-colorFrom: pink
-colorTo: indigo
-sdk: docker
-pinned: false
-license: mit
-short_description: RAG for QA
----
 
-
+## 🚀 Features
+
+- 📄 Upload plain text documents (`.txt`)
+- 🧠 Semantic chunking (not line-based)
+- 🔍 Embedding-based retrieval using Hugging Face
+- 🤖 GenAI-based answer generation
+- 📊 Confidence score computation
+- 📌 Evidence-backed responses
+- 🚫 Hallucination prevention
+- ⚡ FastAPI-based REST API
+
+---
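The diff ends the README at the feature list. As an illustration of how the two endpoints are called (not part of the commit), a minimal client sketch, assuming a local run on port 7860 and a made-up `notes.txt`:

```python
# Illustrative client; host/port assume a local run, notes.txt is made up.
import requests

BASE = "http://localhost:7860"

# /ingest expects a multipart file upload.
with open("notes.txt", "rb") as f:
    print(requests.post(f"{BASE}/ingest", files={"file": f}).json())

# /ask takes the question as a query parameter (see main.py).
print(requests.post(f"{BASE}/ask", params={"question": "What is covered?"}).json())
```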
db.py
ADDED
@@ -0,0 +1,15 @@
import sqlite3

conn = sqlite3.connect("rag.db", check_same_thread=False)
cursor = conn.cursor()

cursor.execute("""
CREATE TABLE IF NOT EXISTS chunks (
    document TEXT,
    chunk_id INTEGER,
    text TEXT,
    embedding TEXT
)
""")

conn.commit()
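Note that each embedding is stored as JSON-encoded text rather than a binary blob, which keeps the schema simple at the cost of decoding on every query. A small round-trip sketch mirroring what main.py writes and retrieval.py reads back:

```python
# Round-trip of the TEXT embedding column, mirroring main.py / retrieval.py.
import json
import numpy as np

vec = np.array([0.12, -0.03, 0.88])       # stand-in for a real embedding
stored = json.dumps(vec.tolist())         # what /ingest INSERTs
restored = np.array(json.loads(stored))   # what retrieve_top_chunks decodes
assert np.allclose(vec, restored)
```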
embeddings.py
ADDED
@@ -0,0 +1,35 @@
import re
import numpy as np
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2"
)

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def semantic_chunking(text, similarity_threshold=0.75):
    sentences = re.split(r'(?<=[.!?])\s+', text)
    sentences = [s.strip() for s in sentences if len(s.strip()) > 20]

    if len(sentences) <= 1:
        return sentences

    embeddings = embedding_model.encode(sentences)

    chunks = []
    current_chunk = [sentences[0]]

    for i in range(1, len(sentences)):
        sim = cosine_similarity(embeddings[i - 1], embeddings[i])

        if sim >= similarity_threshold:
            current_chunk.append(sentences[i])
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentences[i]]

    chunks.append(" ".join(current_chunk))
    return chunks
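semantic_chunking starts a new chunk whenever the similarity between adjacent sentence embeddings drops below the threshold, so topic shifts become chunk boundaries. A quick illustration (the sample text is made up; exact boundaries depend on the model's scores):

```python
# Made-up sample; actual boundaries depend on MiniLM's similarity scores.
from embeddings import semantic_chunking

text = (
    "Solar panels convert sunlight into electricity. "
    "Photovoltaic cells are the core component of every panel. "
    "Our refund policy allows returns within thirty days of purchase."
)
for i, chunk in enumerate(semantic_chunking(text)):
    print(i, chunk)
# The two solar sentences tend to merge into one chunk, while the
# refund sentence starts a new one as adjacent similarity drops.
```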
llm.py
ADDED
@@ -0,0 +1,69 @@
import requests
import os
import time

HF_API_KEY = os.getenv("HF_API_KEY")

API_URL = (
    "https://router.huggingface.co/hf-inference/models/google/flan-t5-base"
)

headers = {
    "Authorization": f"Bearer {HF_API_KEY}",
    "Content-Type": "application/json"
}


def build_prompt(question, chunks):
    context = "\n".join([c[3] for c in chunks])

    return f"""
You are a strict question answering system.

Answer ONLY using the context below.
If the answer is not present, say:
"I don't know based on the provided context."

Context:
{context}

Question:
{question}

Answer:
"""


def call_llm(prompt, max_retries=5, wait_seconds=6):
    for _ in range(max_retries):
        try:
            response = requests.post(
                API_URL,
                headers=headers,
                json={"inputs": prompt},
                timeout=30
            )

            if not response.text:
                time.sleep(wait_seconds)
                continue

            try:
                data = response.json()
            except ValueError:
                time.sleep(wait_seconds)
                continue

            if isinstance(data, dict) and "error" in data:
                if "loading" in data["error"].lower():
                    time.sleep(wait_seconds)
                    continue
                return "I don't know based on the provided context"

            if isinstance(data, list) and len(data) > 0:
                return data[0].get("generated_text", "").strip()

        except requests.exceptions.RequestException:
            time.sleep(wait_seconds)

    return "I don't know based on the provided context"
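build_prompt indexes c[3] because it expects the (score, document, chunk_id, text) tuples that retrieval.py produces; call_llm retries through empty bodies, non-JSON responses, and "model loading" errors before falling back to the refusal string. A minimal call sketch, assuming HF_API_KEY is set and using a made-up chunk:

```python
# Minimal sketch; the tuple layout is retrieval.py's
# (score, document, chunk_id, text), and HF_API_KEY must be set.
from llm import build_prompt, call_llm

chunks = [(0.82, "faq.txt", 0, "Refunds are issued within 30 days of purchase.")]
prompt = build_prompt("When are refunds issued?", chunks)
print(call_llm(prompt))
```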
main.py
ADDED
@@ -0,0 +1,84 @@
from fastapi import FastAPI, UploadFile, File
import json

from db import conn, cursor
from embeddings import semantic_chunking, embedding_model
from retrieval import retrieve_top_chunks
from llm import build_prompt, call_llm

app = FastAPI(
    title="RAG QA System",
    version="1.0"
)


@app.get("/health")
def health():
    return {"status": "OK"}


@app.post("/ingest")
async def ingest(file: UploadFile = File(...)):
    text = (await file.read()).decode("utf-8")

    chunks = semantic_chunking(text)
    embeddings = embedding_model.encode(chunks)

    for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
        cursor.execute(
            "INSERT INTO chunks VALUES (?, ?, ?, ?)",
            (file.filename, i, chunk, json.dumps(emb.tolist()))
        )

    conn.commit()
    return {
        "message": "Document ingested",
        "chunks": len(chunks)
    }


@app.post("/ask")
def ask(question: str):
    top_chunks = retrieve_top_chunks(question)

    if not top_chunks:
        return {
            "question": question,
            "answer": "I don't know based on the provided context",
            "confidence": 0.0,
            "evidence": []
        }

    best_score = max(c[0] for c in top_chunks)
    CONFIDENCE_THRESHOLD = 0.6

    if best_score < CONFIDENCE_THRESHOLD:
        return {
            "question": question,
            "answer": "I don't know based on the provided context",
            "confidence": round(float(best_score), 2),
            "evidence": []
        }

    prompt = build_prompt(question, top_chunks)
    answer = call_llm(prompt)

    # deterministic fallback
    if "I don't know based on the provided context" in answer:
        answer = top_chunks[0][3]

    evidence = [
        {
            "document": doc,
            "chunk_id": cid,
            "text": text
        }
        for _, doc, cid, text in top_chunks
    ]

    return {
        "question": question,
        "answer": answer,
        "confidence": round(float(best_score), 2),
        "evidence": evidence
    }
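For an end-to-end check without Docker, FastAPI's TestClient can drive both endpoints in-process. A sketch with made-up sample text (TestClient needs the httpx package, and the first run downloads the MiniLM model):

```python
# In-process check; sample text is made up, and whether /ask answers or
# refuses depends on the similarity score clearing the 0.6 threshold.
from fastapi.testclient import TestClient
from main import app

client = TestClient(app)

sample = (b"Refunds are issued within 30 days of purchase. "
          b"Standard shipping takes five business days on average.")
print(client.post("/ingest", files={"file": ("sample.txt", sample)}).json())
print(client.post("/ask", params={"question": "How long does shipping take?"}).json())
```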
requirements.txt
ADDED
@@ -0,0 +1,5 @@
fastapi
uvicorn
sentence-transformers
numpy
requests
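One likely gap worth flagging: FastAPI's `UploadFile`/`File(...)` parameters rely on the `python-multipart` package for form parsing, and it is not listed here. If `/ingest` fails with a form-data error, installing `python-multipart` (or adding it to this file) resolves it.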
retrieval.py
ADDED
@@ -0,0 +1,23 @@
import json
import numpy as np

from db import cursor
from embeddings import embedding_model, cosine_similarity


def retrieve_top_chunks(question, k=3):
    q_emb = embedding_model.encode(question)

    cursor.execute(
        "SELECT document, chunk_id, text, embedding FROM chunks"
    )
    rows = cursor.fetchall()

    scored = []
    for doc, cid, text, emb_json in rows:
        emb = np.array(json.loads(emb_json))
        score = cosine_similarity(q_emb, emb)
        scored.append((score, doc, cid, text))

    scored.sort(reverse=True, key=lambda x: x[0])
    return scored[:k]
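Retrieval is a brute-force scan: every stored embedding is decoded and scored per query, so latency grows linearly with the number of chunks (fine for small corpora, a candidate for a vector index later). A standalone usage sketch, assuming documents were ingested first:

```python
# Assumes /ingest has already populated rag.db with chunks.
from retrieval import retrieve_top_chunks

for score, doc, cid, text in retrieve_top_chunks("What is the refund policy?"):
    print(f"{score:.2f}  {doc}#{cid}  {text[:60]}")
```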