kiranmadhusud committed
Commit · 6237214
1 Parent(s): 540c55e

fix RAG app

rag_pipeline.py  +42 -27
rag_pipeline.py  CHANGED

@@ -1,13 +1,13 @@
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+# rag_pipeline.py
+
+from transformers import pipeline
 from sentence_transformers import SentenceTransformer
-from langchain_text_splitters import RecursiveCharacterTextSplitter
 import faiss
 import numpy as np
 import torch
 
-
-
-LLM_MODEL = "google/flan-t5-base"  # swap for flan-t5-large if on GPU
+EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+LLM_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # ✅ works with text-generation
 
 class RAGPipeline:
     def __init__(self):
@@ -15,35 +15,26 @@ class RAGPipeline:
         self.embedder = SentenceTransformer(EMBEDDING_MODEL)
 
         print("Loading LLM...")
-        self.tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
-        self.model = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL)
         self.llm = pipeline(
-            "
-            model=
-
-
-            device=0 if torch.cuda.is_available() else -1,
+            "text-generation",  # ✅ use this instead
+            model=LLM_MODEL,
+            torch_dtype=torch.float32,
+            device_map="auto" if torch.cuda.is_available() else None,
         )
 
         self.index = None
         self.chunks = []
-        self.splitter = RecursiveCharacterTextSplitter(
-            chunk_size=500, chunk_overlap=50
-        )
 
     def ingest(self, text: str):
-
-        self.chunks = self.splitter.split_text(text)
+        self.chunks = split_text(text)
         embeddings = self.embedder.encode(self.chunks, show_progress_bar=False)
         embeddings = np.array(embeddings).astype("float32")
-
         dim = embeddings.shape[1]
         self.index = faiss.IndexFlatL2(dim)
         self.index.add(embeddings)
         return len(self.chunks)
 
     def retrieve(self, query: str, top_k: int = 3):
-        """Find the most relevant chunks for a query."""
         if self.index is None:
             return []
         q_emb = self.embedder.encode([query]).astype("float32")
@@ -51,17 +42,41 @@ class RAGPipeline:
         return [self.chunks[i] for i in indices[0] if i < len(self.chunks)]
 
     def answer(self, query: str):
-        """Full RAG: retrieve → build prompt → generate."""
         context_chunks = self.retrieve(query)
         if not context_chunks:
             return "⚠️ Please upload a document first."
 
         context = "\n\n".join(context_chunks)
-
-
-
-
-
+
+        # TinyLlama uses ChatML format
+        prompt = f"""<|system|>
+You are a helpful assistant. Answer only based on the context provided.</s>
+<|user|>
+Context:
+{context}
+
+Question: {query}</s>
+<|assistant|>"""
+
+        result = self.llm(
+            prompt,
+            max_new_tokens=300,
+            do_sample=False,
+            temperature=1.0,
+            pad_token_id=self.llm.tokenizer.eos_token_id,
+        )
-
-        return
+
+        # Strip the prompt → return only the generated part
+        generated = result[0]["generated_text"]
+        answer = generated.split("<|assistant|>")[-1].strip()
+        return answer
+
+
+def split_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list:
+    chunks = []
+    start = 0
+    while start < len(text):
+        end = start + chunk_size
+        chunks.append(text[start:end])
+        start += chunk_size - overlap
+    return [c.strip() for c in chunks if c.strip()]