Spaces:

MohitG012
/

RAG_on_User_data

Sleeping

App Files Files Community

MohitGupta41 committed on Dec 21, 2025

Commit

4d922fd

1 Parent(s): 4575791

FastAPI RAG backend (Docker)

Browse files

Files changed (2) hide show

rag.py +219 -23
requirements.txt +1 -0

rag.py CHANGED Viewed

@@ -1,7 +1,123 @@
 import uuid
-from dataclasses import dataclass
-from typing import List, Dict, Any, Tuple
 import time
 import numpy as np
 import faiss
@@ -10,85 +126,165 @@ from sentence_transformers import SentenceTransformer
 # PDF extraction
 import fitz  # pymupdf
-# LLM (choose 1)
-from transformers import pipeline
 # -----------------------------
 # Globals (MVP)
 # -----------------------------
 EMBEDDER = SentenceTransformer("all-MiniLM-L6-v2")
-# For MVP: use a smallish instruct model if possible
-# NOTE: Mistral 7B is heavy; if you can't run it locally, use a smaller HF model.
 GENERATOR = pipeline(
-    "text2text-generation",
-    model="google/flan-t5-base",
-    max_new_tokens=256
 )
-SESSIONS: Dict[str, Dict[str, Any]] = {}  # session_id -> {chunks, index, created_at}
 # -----------------------------
 # Helpers
 # -----------------------------
 def extract_text_from_pdf(pdf_bytes: bytes) -> str:
     doc = fitz.open(stream=pdf_bytes, filetype="pdf")
     pages = []
     for page in doc:
         pages.append(page.get_text("text"))
     text = "\n".join(pages).strip()
-    print(text)
     return text
 def chunk_text(text: str, chunk_size_words: int = 350, overlap_words: int = 60) -> List[str]:
     words = text.split()
-    chunks = []
     step = max(1, chunk_size_words - overlap_words)
     for i in range(0, len(words), step):
         chunk = words[i:i + chunk_size_words]
         if chunk:
             chunks.append(" ".join(chunk))
-    return chunks
 def build_faiss_index(vectors: np.ndarray) -> faiss.Index:
     vectors = vectors.astype("float32")
     dim = vectors.shape[1]
-    index = faiss.IndexFlatIP(dim)  # cosine-like if vectors normalized
     faiss.normalize_L2(vectors)
     index.add(vectors)
     return index
-def retrieve_top_k(query: str, chunks: List[str], index: faiss.Index, k: int = 3) -> List[Tuple[int, float, str]]:
     q = EMBEDDER.encode([query], convert_to_numpy=True).astype("float32")
     faiss.normalize_L2(q)
     scores, ids = index.search(q, k)
-    results = []
     for rank, idx in enumerate(ids[0]):
         if idx == -1:
             continue
         results.append((int(idx), float(scores[0][rank]), chunks[int(idx)]))
     return results
 def generate_answer(question: str, context: str) -> str:
-    prompt = (
-        "Answer using ONLY the provided context. "
-        "If not found in the context, say: Not found in the provided documents.\n\n"
-        f"Context:\n{context}\n\nQuestion:\n{question}\n\nAnswer:"
     )
-    out = GENERATOR(prompt)
     return out[0]["generated_text"].strip()
 def create_session(chunks: List[str]) -> str:
     embeddings = EMBEDDER.encode(chunks, convert_to_numpy=True)
     index = build_faiss_index(embeddings)
@@ -96,6 +292,6 @@ def create_session(chunks: List[str]) -> str:
     SESSIONS[session_id] = {
         "chunks": chunks,
         "index": index,
-        "created_at": time.time()
     }
     return session_id

+# import uuid
+# from dataclasses import dataclass
+# from typing import List, Dict, Any, Tuple
+# import time
+# import numpy as np
+# import faiss
+# from sentence_transformers import SentenceTransformer
+# # PDF extraction
+# import fitz  # pymupdf
+# # LLM (choose 1)
+# from transformers import pipeline
+# # -----------------------------
+# # Globals (MVP)
+# # -----------------------------
+# EMBEDDER = SentenceTransformer("all-MiniLM-L6-v2")
+# # For MVP: use a smallish instruct model if possible
+# # NOTE: Mistral 7B is heavy; if you can't run it locally, use a smaller HF model.
+# GENERATOR = pipeline(
+#     "text2text-generation",
+#     model="google/flan-t5-base",
+#     max_new_tokens=256
+# )
+# SESSIONS: Dict[str, Dict[str, Any]] = {}  # session_id -> {chunks, index, created_at}
+# # -----------------------------
+# # Helpers
+# # -----------------------------
+# def extract_text_from_pdf(pdf_bytes: bytes) -> str:
+#     doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+#     pages = []
+#     for page in doc:
+#         pages.append(page.get_text("text"))
+#     text = "\n".join(pages).strip()
+#     print(text)
+#     return text
+# def chunk_text(text: str, chunk_size_words: int = 350, overlap_words: int = 60) -> List[str]:
+#     words = text.split()
+#     chunks = []
+#     step = max(1, chunk_size_words - overlap_words)
+#     for i in range(0, len(words), step):
+#         chunk = words[i:i + chunk_size_words]
+#         if chunk:
+#             chunks.append(" ".join(chunk))
+#     return chunks
+# def build_faiss_index(vectors: np.ndarray) -> faiss.Index:
+#     vectors = vectors.astype("float32")
+#     dim = vectors.shape[1]
+#     index = faiss.IndexFlatIP(dim)  # cosine-like if vectors normalized
+#     faiss.normalize_L2(vectors)
+#     index.add(vectors)
+#     return index
+# def retrieve_top_k(query: str, chunks: List[str], index: faiss.Index, k: int = 3) -> List[Tuple[int, float, str]]:
+#     q = EMBEDDER.encode([query], convert_to_numpy=True).astype("float32")
+#     faiss.normalize_L2(q)
+#     scores, ids = index.search(q, k)
+#     results = []
+#     for rank, idx in enumerate(ids[0]):
+#         if idx == -1:
+#             continue
+#         results.append((int(idx), float(scores[0][rank]), chunks[int(idx)]))
+#     return results
+# def generate_answer(question: str, context: str) -> str:
+#     prompt = (
+#         "Answer using ONLY the provided context. "
+#         "If not found in the context, say: Not found in the provided documents.\n\n"
+#         f"Context:\n{context}\n\nQuestion:\n{question}\n\nAnswer:"
+#     )
+#     out = GENERATOR(prompt)
+#     return out[0]["generated_text"].strip()
+# def create_session(chunks: List[str]) -> str:
+#     embeddings = EMBEDDER.encode(chunks, convert_to_numpy=True)
+#     index = build_faiss_index(embeddings)
+#     session_id = str(uuid.uuid4())
+#     SESSIONS[session_id] = {
+#         "chunks": chunks,
+#         "index": index,
+#         "created_at": time.time()
+#     }
+#     return session_id
+# rag.py
 import uuid
 import time
+from typing import List, Dict, Any, Tuple
 import numpy as np
 import faiss
 # PDF extraction
 import fitz  # pymupdf
+# LLM (Qwen)
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 # -----------------------------
 # Globals (MVP)
 # -----------------------------
+# Embeddings model (fast + solid baseline)
 EMBEDDER = SentenceTransformer("all-MiniLM-L6-v2")
+# Qwen Instruct model (better than flan-t5-base)
+QWEN_MODEL_ID = "Qwen/Qwen2.5-3B-Instruct"
+# Load tokenizer + model
+tokenizer = AutoTokenizer.from_pretrained(QWEN_MODEL_ID)
+model = AutoModelForCausalLM.from_pretrained(
+    QWEN_MODEL_ID,
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    device_map="auto",  # uses GPU if available; otherwise CPU
+)
+# Text-generation pipeline for CausalLMs
 GENERATOR = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
 )
+# In-memory session store: session_id -> {chunks, index, created_at}
+SESSIONS: Dict[str, Dict[str, Any]] = {}
 # -----------------------------
 # Helpers
 # -----------------------------
 def extract_text_from_pdf(pdf_bytes: bytes) -> str:
+    """
+    Extract plain text from a PDF using PyMuPDF.
+    Note: For scanned/image PDFs, you'll need OCR (out of scope for MVP).
+    """
     doc = fitz.open(stream=pdf_bytes, filetype="pdf")
     pages = []
     for page in doc:
         pages.append(page.get_text("text"))
     text = "\n".join(pages).strip()
     return text
 def chunk_text(text: str, chunk_size_words: int = 350, overlap_words: int = 60) -> List[str]:
+    """
+    Word-window chunking with overlap.
+    chunk_size_words: size of each chunk window
+    overlap_words: how many words to overlap between chunks
+    step = max(1, chunk_size_words - overlap_words), so the window always advances
+    """
     words = text.split()
+    chunks: List[str] = []
     step = max(1, chunk_size_words - overlap_words)
     for i in range(0, len(words), step):
         chunk = words[i:i + chunk_size_words]
         if chunk:
             chunks.append(" ".join(chunk))
+    return chunks
 def build_faiss_index(vectors: np.ndarray) -> faiss.Index:
+    """
+    Build a FAISS index using inner product (IP). The vectors are L2-normalized
+    before being added, so IP against an L2-normalized query equals cosine similarity.
+    """
     vectors = vectors.astype("float32")
     dim = vectors.shape[1]
+    # Inner product index (cosine-like after normalization)
+    index = faiss.IndexFlatIP(dim)
     faiss.normalize_L2(vectors)
     index.add(vectors)
     return index
+def retrieve_top_k(
+    query: str,
+    chunks: List[str],
+    index: faiss.Index,
+    k: int = 3
+) -> List[Tuple[int, float, str]]:
+    """
+    Embed the query, search FAISS, and return (chunk_id, score, chunk_text).
+    """
     q = EMBEDDER.encode([query], convert_to_numpy=True).astype("float32")
     faiss.normalize_L2(q)
     scores, ids = index.search(q, k)
+    results: List[Tuple[int, float, str]] = []
     for rank, idx in enumerate(ids[0]):
         if idx == -1:
             continue
         results.append((int(idx), float(scores[0][rank]), chunks[int(idx)]))
     return results
+def _build_qwen_prompt(question: str, context: str) -> str:
+    """
+    Build a chat-formatted prompt using Qwen's chat template for better instruction following.
+    """
+    messages = [
+        {
+            "role": "system",
+            "content": (
+                "You are a medical QA assistant. "
+                "Answer using ONLY the provided context. "
+                "If the answer is not present in the context, say exactly: "
+                "'Not found in the provided documents.'"
+            ),
+        },
+        {
+            "role": "user",
+            "content": f"Context:\n{context}\n\nQuestion:\n{question}",
+        },
+    ]
+    prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+    return prompt
 def generate_answer(question: str, context: str) -> str:
+    """
+    Generate an answer grounded strictly in retrieved context using Qwen Instruct.
+    """
+    prompt = _build_qwen_prompt(question, context)
+    out = GENERATOR(
+        prompt,
+        max_new_tokens=256,
+        temperature=0.2,
+        do_sample=True,
+        return_full_text=False
     )
     return out[0]["generated_text"].strip()
 def create_session(chunks: List[str]) -> str:
+    """
+    Create a retrieval session by embedding chunks and building a FAISS index.
+    """
     embeddings = EMBEDDER.encode(chunks, convert_to_numpy=True)
     index = build_faiss_index(embeddings)
     SESSIONS[session_id] = {
         "chunks": chunks,
         "index": index,
+        "created_at": time.time(),
     }
     return session_id

requirements.txt CHANGED Viewed

@@ -7,3 +7,4 @@ faiss-cpu
 pymupdf
 transformers
 torch

 pymupdf
 transformers
 torch
+accelerate