vansh27 committed
Commit a86c572 · 1 Parent(s): e080e6a

Deploy FastAPI RAG System
Files changed (8)
  1. Dockerfile +24 -0
  2. README.md +17 -10
  3. db.py +15 -0
  4. embeddings.py +35 -0
  5. llm.py +69 -0
  6. main.py +84 -0
  7. requirements.txt +6 -0
  8. retrieval.py +23 -0
Dockerfile ADDED
@@ -0,0 +1,24 @@
+ FROM python:3.10-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements first (cache-friendly)
+ COPY requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy all project files
+ COPY . .
+
+ # Expose FastAPI port
+ EXPOSE 7860
+
+ # Start FastAPI
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,12 +1,19 @@
+ # 📘 RAG Question Answering System
+
+ A **Retrieval-Augmented Generation (RAG)** question answering system built with **FastAPI**, **Hugging Face embeddings**, **SQLite**, and a **free GenAI LLM API**.
+ The system answers questions **strictly from the provided documents**, returns **evidence and a confidence score** with every answer, and guards against hallucination.
+
  ---
- title: Qa Rag Fastapi
- emoji: 🦀
- colorFrom: pink
- colorTo: indigo
- sdk: docker
- pinned: false
- license: mit
- short_description: RAG for QA
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ ## 🚀 Features
+
+ - 📄 Upload plain text documents (`.txt`)
+ - 🧠 Semantic chunking (not line-based)
+ - 🔍 Embedding-based retrieval using Hugging Face
+ - 🤖 GenAI-based answer generation
+ - 📊 Confidence score computation
+ - 📌 Evidence-backed responses
+ - 🚫 Hallucination prevention
+ - ⚡ FastAPI-based REST API
+
+ ---
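
To make the README's feature list concrete, here is a minimal client sketch against a locally running instance; the base URL, the `notes.txt` file, and the sample question are assumptions, not part of the commit:

```python
import requests

BASE = "http://localhost:7860"  # assumed local deployment

# Ingest a plain-text document (multipart file upload)
with open("notes.txt", "rb") as f:
    r = requests.post(f"{BASE}/ingest", files={"file": ("notes.txt", f, "text/plain")})
print(r.json())  # e.g. {"message": "Document ingested", "chunks": 4}

# Ask a question; /ask takes the question as a query parameter
r = requests.post(f"{BASE}/ask", params={"question": "What does the document say about refunds?"})
body = r.json()
print(body["answer"], body["confidence"], len(body["evidence"]))
```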
db.py ADDED
@@ -0,0 +1,15 @@
+ import sqlite3
+
+ conn = sqlite3.connect("rag.db", check_same_thread=False)
+ cursor = conn.cursor()
+
+ cursor.execute("""
+ CREATE TABLE IF NOT EXISTS chunks (
+     document TEXT,
+     chunk_id INTEGER,
+     text TEXT,
+     embedding TEXT
+ )
+ """)
+
+ conn.commit()
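
Embeddings live in a plain TEXT column as JSON strings, which avoids a vector-database dependency at the cost of deserializing every row at query time. A minimal sketch of that round trip, using an in-memory database and a placeholder vector:

```python
import json
import sqlite3

import numpy as np

conn = sqlite3.connect(":memory:")  # stand-in for rag.db
conn.execute(
    "CREATE TABLE chunks (document TEXT, chunk_id INTEGER, text TEXT, embedding TEXT)"
)

emb = np.array([0.1, 0.2, 0.3])  # placeholder embedding vector
conn.execute(
    "INSERT INTO chunks VALUES (?, ?, ?, ?)",
    ("doc.txt", 0, "some chunk text", json.dumps(emb.tolist())),
)

# Reading a row back reverses the serialization
(emb_json,) = conn.execute("SELECT embedding FROM chunks").fetchone()
restored = np.array(json.loads(emb_json))
assert np.allclose(restored, emb)
```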
embeddings.py ADDED
@@ -0,0 +1,35 @@
+ import re
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+
+ embedding_model = SentenceTransformer(
+     "sentence-transformers/all-MiniLM-L6-v2"
+ )
+
+ def cosine_similarity(a, b):
+     return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
+
+
+ def semantic_chunking(text, similarity_threshold=0.75):
+     sentences = re.split(r'(?<=[.!?])\s+', text)  # split on sentence boundaries
+     sentences = [s.strip() for s in sentences if len(s.strip()) > 20]  # drop short fragments
+
+     if len(sentences) <= 1:
+         return sentences
+
+     embeddings = embedding_model.encode(sentences)
+
+     chunks = []
+     current_chunk = [sentences[0]]
+
+     for i in range(1, len(sentences)):
+         sim = cosine_similarity(embeddings[i - 1], embeddings[i])  # adjacent-sentence similarity
+
+         if sim >= similarity_threshold:
+             current_chunk.append(sentences[i])
+         else:
+             chunks.append(" ".join(current_chunk))  # topic shift: close current chunk
+             current_chunk = [sentences[i]]
+
+     chunks.append(" ".join(current_chunk))
+     return chunks
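
A quick illustration of the chunker: adjacent sentences that stay above the 0.75 similarity threshold are merged, and a topic shift opens a new chunk. The sample text is illustrative, and the exact split depends on the embedding model:

```python
from embeddings import semantic_chunking

text = (
    "Solar panels convert sunlight into electricity using photovoltaic cells. "
    "Panel efficiency has improved steadily over the past decade. "
    "Our refund policy allows returns within thirty days of purchase. "
    "Customers must provide a receipt to receive a full refund."
)

for i, chunk in enumerate(semantic_chunking(text)):
    print(i, chunk)
# Plausible output (not guaranteed): the two solar sentences in one chunk,
# the two refund sentences in another.
```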
llm.py ADDED
@@ -0,0 +1,69 @@
+ import requests
+ import os
+ import time
+
+ HF_API_KEY = os.getenv("HF_API_KEY")
+
+ API_URL = (
+     "https://router.huggingface.co/hf-inference/models/google/flan-t5-base"
+ )
+
+ headers = {
+     "Authorization": f"Bearer {HF_API_KEY}",
+     "Content-Type": "application/json"
+ }
+
+
+ def build_prompt(question, chunks):
+     context = "\n".join([c[3] for c in chunks])  # c[3] is the chunk text
+
+     return f"""
+ You are a strict question answering system.
+
+ Answer ONLY using the context below.
+ If the answer is not present, say:
+ "I don't know based on the provided context."
+
+ Context:
+ {context}
+
+ Question:
+ {question}
+
+ Answer:
+ """
+
+
+ def call_llm(prompt, max_retries=5, wait_seconds=6):
+     for _ in range(max_retries):
+         try:
+             response = requests.post(
+                 API_URL,
+                 headers=headers,
+                 json={"inputs": prompt},
+                 timeout=30
+             )
+
+             if not response.text:  # empty body: model likely still starting up
+                 time.sleep(wait_seconds)
+                 continue
+
+             try:
+                 data = response.json()
+             except ValueError:  # non-JSON response; retry
+                 time.sleep(wait_seconds)
+                 continue
+
+             if isinstance(data, dict) and "error" in data:
+                 if "loading" in data["error"].lower():  # model warming up; retry
+                     time.sleep(wait_seconds)
+                     continue
+                 return "I don't know based on the provided context"
+
+             if isinstance(data, list) and len(data) > 0:
+                 return data[0].get("generated_text", "").strip()
+
+         except requests.exceptions.RequestException:  # network error; retry
+             time.sleep(wait_seconds)
+
+     return "I don't know based on the provided context"
main.py ADDED
@@ -0,0 +1,84 @@
+ from fastapi import FastAPI, UploadFile, File
+ import json
+
+ from db import conn, cursor
+ from embeddings import semantic_chunking, embedding_model
+ from retrieval import retrieve_top_chunks
+ from llm import build_prompt, call_llm
+
+ app = FastAPI(
+     title="RAG QA System",
+     version="1.0"
+ )
+
+
+ @app.get("/health")
+ def health():
+     return {"status": "OK"}
+
+
+ @app.post("/ingest")
+ async def ingest(file: UploadFile = File(...)):
+     text = (await file.read()).decode("utf-8")
+
+     chunks = semantic_chunking(text)
+     embeddings = embedding_model.encode(chunks)
+
+     for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
+         cursor.execute(
+             "INSERT INTO chunks VALUES (?, ?, ?, ?)",
+             (file.filename, i, chunk, json.dumps(emb.tolist()))  # embedding serialized as JSON
+         )
+
+     conn.commit()
+     return {
+         "message": "Document ingested",
+         "chunks": len(chunks)
+     }
+
+
+ @app.post("/ask")
+ def ask(question: str):
+     top_chunks = retrieve_top_chunks(question)
+
+     if not top_chunks:
+         return {
+             "question": question,
+             "answer": "I don't know based on the provided context",
+             "confidence": 0.0,
+             "evidence": []
+         }
+
+     best_score = max(c[0] for c in top_chunks)  # best cosine similarity doubles as confidence
+     CONFIDENCE_THRESHOLD = 0.6
+
+     if best_score < CONFIDENCE_THRESHOLD:
+         return {
+             "question": question,
+             "answer": "I don't know based on the provided context",
+             "confidence": round(float(best_score), 2),
+             "evidence": []
+         }
+
+     prompt = build_prompt(question, top_chunks)
+     answer = call_llm(prompt)
+
+     # deterministic fallback: if the LLM abstains despite confident retrieval, return the best chunk
+     if "I don't know based on the provided context" in answer:
+         answer = top_chunks[0][3]
+
+     evidence = [
+         {
+             "document": doc,
+             "chunk_id": cid,
+             "text": text
+         }
+         for _, doc, cid, text in top_chunks
+     ]
+
+     return {
+         "question": question,
+         "answer": answer,
+         "confidence": round(float(best_score), 2),
+         "evidence": evidence
+     }
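
The endpoints can be exercised in-process with FastAPI's `TestClient` (which requires the `httpx` package). One caveat worth hedging: without a valid `HF_API_KEY`, `call_llm` will abstain and the deterministic fallback should return the best-matching chunk verbatim.

```python
from fastapi.testclient import TestClient

from main import app  # importing main also creates rag.db via db.py

client = TestClient(app)

assert client.get("/health").json() == {"status": "OK"}

# Ingest a tiny two-sentence document
doc = b"The capital of France is Paris. Paris is also the largest French city."
print(client.post("/ingest", files={"file": ("demo.txt", doc, "text/plain")}).json())

# Ask against it; the question goes in the query string
r = client.post("/ask", params={"question": "What is the capital of France?"})
print(r.json()["answer"], r.json()["confidence"])
```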
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ fastapi
+ uvicorn
+ python-multipart  # required by FastAPI's UploadFile/File form handling in /ingest
+ sentence-transformers
+ numpy
+ requests
retrieval.py ADDED
@@ -0,0 +1,23 @@
+ import json
+ import numpy as np
+
+ from db import cursor
+ from embeddings import embedding_model, cosine_similarity
+
+
+ def retrieve_top_chunks(question, k=3):
+     q_emb = embedding_model.encode(question)
+
+     cursor.execute(
+         "SELECT document, chunk_id, text, embedding FROM chunks"
+     )
+     rows = cursor.fetchall()
+
+     scored = []
+     for doc, cid, text, emb_json in rows:
+         emb = np.array(json.loads(emb_json))  # embeddings are stored as JSON text
+         score = cosine_similarity(q_emb, emb)
+         scored.append((score, doc, cid, text))
+
+     scored.sort(reverse=True, key=lambda x: x[0])  # highest similarity first
+     return scored[:k]
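
Retrieval is a brute-force scan: every stored embedding is deserialized and scored on each query, which is fine for small corpora but linear in the number of chunks. A toy illustration of the ranking step with fixed vectors, no database involved:

```python
import numpy as np

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

q = np.array([1.0, 0.0])  # pretend query embedding
rows = [
    ("a.txt", 0, "chunk about topic X", np.array([0.9, 0.1])),
    ("a.txt", 1, "chunk about topic Y", np.array([0.1, 0.9])),
]

scored = [(cosine_similarity(q, emb), doc, cid, text) for doc, cid, text, emb in rows]
scored.sort(reverse=True, key=lambda x: x[0])
print(scored[0])  # the topic-X chunk ranks first, mirroring retrieve_top_chunks
```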