Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
import os
|
| 2 |
-
import faiss
|
| 3 |
import numpy as np
|
| 4 |
import gradio as gr
|
| 5 |
from typing import List, Tuple
|
|
@@ -7,69 +6,96 @@ from pypdf import PdfReader
|
|
| 7 |
from sentence_transformers import SentenceTransformer
|
| 8 |
from huggingface_hub import InferenceClient
|
| 9 |
|
| 10 |
-
#
|
| 11 |
# Config
|
| 12 |
-
#
|
| 13 |
-
|
|
|
|
|
|
|
| 14 |
HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN") # set in Space Secrets
|
| 15 |
EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
|
| 16 |
CHUNK_SIZE = 900
|
| 17 |
CHUNK_OVERLAP = 150
|
| 18 |
TOP_K = 4
|
| 19 |
|
| 20 |
-
#
|
| 21 |
-
#
|
| 22 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
emb = SentenceTransformer(EMB_MODEL_NAME)
|
| 24 |
-
index = None
|
| 25 |
-
|
| 26 |
-
|
|
|
|
| 27 |
client = InferenceClient(model=GEN_MODEL, token=HF_TOKEN)
|
| 28 |
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
# Helpers
|
| 31 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
def _chunk_text(text: str, size: int, overlap: int) -> List[str]:
|
| 33 |
-
chunks = []
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
start += step
|
| 41 |
-
return [c.strip() for c in chunks if c.strip()]
|
| 42 |
|
| 43 |
def _embed(texts: List[str]) -> np.ndarray:
|
| 44 |
-
# 384-d for MiniLM; normalize for cosine/IP search
|
| 45 |
X = emb.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
|
| 46 |
return np.asarray(X, dtype=np.float32)
|
| 47 |
|
| 48 |
def _ensure_index(dim: int):
|
| 49 |
-
global index
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
# Build index
|
| 62 |
-
#
|
| 63 |
def build_from_pdfs(files) -> str:
|
| 64 |
-
global
|
| 65 |
doc_chunks, doc_meta = [], []
|
| 66 |
|
| 67 |
-
# 1) read PDFs → 2) chunk → collect
|
| 68 |
for f in files:
|
| 69 |
-
|
| 70 |
-
text = _extract_text_from_pdf(f.name)
|
| 71 |
-
except Exception as e:
|
| 72 |
-
return f"Failed to read {os.path.basename(f.name)}: {e}"
|
| 73 |
chunks = _chunk_text(text, CHUNK_SIZE, CHUNK_OVERLAP)
|
| 74 |
for c in chunks:
|
| 75 |
doc_chunks.append(c)
|
|
@@ -78,80 +104,92 @@ def build_from_pdfs(files) -> str:
|
|
| 78 |
if not doc_chunks:
|
| 79 |
return "No text extracted. Check your PDFs."
|
| 80 |
|
| 81 |
-
# 3) embeddings → FAISS
|
| 82 |
E = _embed(doc_chunks)
|
| 83 |
_ensure_index(E.shape[1])
|
| 84 |
-
|
| 85 |
-
|
| 86 |
return f"Indexed {len(doc_chunks)} chunks from {len(files)} file(s)."
|
| 87 |
|
| 88 |
-
#
|
| 89 |
# Retrieval + Generation
|
| 90 |
-
#
|
| 91 |
def _retrieve(query: str, k: int = TOP_K) -> Tuple[List[int], List[str]]:
|
| 92 |
-
qv = _embed([query])
|
| 93 |
-
|
| 94 |
-
ids = idxs[0].tolist()
|
| 95 |
-
# Filter out -1 (in case FAISS returns for empty)
|
| 96 |
-
ids = [i for i in ids if i >= 0]
|
| 97 |
return ids, [doc_chunks[i] for i in ids]
|
| 98 |
|
| 99 |
-
|
| 100 |
-
"
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
def answer(question: str) -> str:
|
| 115 |
if not question.strip():
|
| 116 |
return "Ask a question."
|
| 117 |
-
if index is None or not doc_chunks:
|
| 118 |
return "Upload PDFs and click **Build Index** first."
|
| 119 |
|
| 120 |
ids, ctx_chunks = _retrieve(question, TOP_K)
|
| 121 |
-
|
| 122 |
-
previews = []
|
| 123 |
-
contexts = []
|
| 124 |
-
files = []
|
| 125 |
for rank, i in enumerate(ids, start=1):
|
| 126 |
chunk = doc_chunks[i][:1000]
|
| 127 |
fname = doc_meta[i]["file"]
|
| 128 |
contexts.append(f"[{rank}] {fname}\n{chunk}")
|
| 129 |
-
previews.append(f"[{rank}] {fname}")
|
| 130 |
files.append(fname)
|
| 131 |
|
| 132 |
context_str = "\n\n---\n".join(contexts)
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
try:
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
prompt,
|
| 139 |
-
max_new_tokens=512,
|
| 140 |
-
temperature=0.2,
|
| 141 |
-
top_p=0.95,
|
| 142 |
-
repetition_penalty=1.05,
|
| 143 |
-
do_sample=True,
|
| 144 |
-
return_full_text=False,
|
| 145 |
-
)
|
| 146 |
-
# Ensure sources are visible at the end
|
| 147 |
-
unique_files = ", ".join(sorted(set(files)))
|
| 148 |
return f"{out.strip()}\n\nSources: {unique_files}"
|
| 149 |
except Exception as e:
|
| 150 |
-
return f"Generation error: {e}\n(Verify your HUGGINGFACEHUB_API_TOKEN and model
|
| 151 |
|
| 152 |
-
#
|
| 153 |
# UI
|
| 154 |
-
#
|
| 155 |
with gr.Blocks(title="Mistral 7B PDF-RAG") as demo:
|
| 156 |
gr.Markdown("# 📚 PDF-RAG (Mistral-7B-Instruct)\nUpload PDFs → Build Index → Ask questions. Answers cite sources.")
|
| 157 |
|
|
@@ -167,7 +205,7 @@ with gr.Blocks(title="Mistral 7B PDF-RAG") as demo:
|
|
| 167 |
|
| 168 |
build_btn.click(build_from_pdfs, inputs=[files], outputs=[status])
|
| 169 |
ask_btn.click(answer, inputs=[q], outputs=[a])
|
| 170 |
-
q.submit(answer, inputs=[q], outputs=[a])
|
| 171 |
|
| 172 |
if __name__ == "__main__":
|
| 173 |
demo.launch()
|
|
|
|
| 1 |
import os
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
import gradio as gr
|
| 4 |
from typing import List, Tuple
|
|
|
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
from huggingface_hub import InferenceClient
|
| 8 |
|
| 9 |
+
# -------------------------------------------------
|
| 10 |
# Config
|
| 11 |
+
# -------------------------------------------------
|
| 12 |
+
# You can swap to another chat model if needed, e.g.:
|
| 13 |
+
# "mistralai/Mistral-Nemo-Instruct-2407" or "meta-llama/Llama-3.1-8B-Instruct"
|
| 14 |
+
GEN_MODEL = os.getenv("GEN_MODEL", "mistralai/Mistral-7B-Instruct-v0.2")
|
| 15 |
HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN") # set in Space Secrets
|
| 16 |
EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
|
| 17 |
CHUNK_SIZE = 900
|
| 18 |
CHUNK_OVERLAP = 150
|
| 19 |
TOP_K = 4
|
| 20 |
|
| 21 |
+
# -------------------------------------------------
|
| 22 |
+
# Try FAISS; fallback to pure NumPy search
|
| 23 |
+
# -------------------------------------------------
|
| 24 |
+
USE_FAISS = True
|
| 25 |
+
try:
|
| 26 |
+
import faiss # type: ignore
|
| 27 |
+
except Exception:
|
| 28 |
+
USE_FAISS = False
|
| 29 |
+
|
| 30 |
+
# -------------------------------------------------
|
| 31 |
+
# Globals
|
| 32 |
+
# -------------------------------------------------
|
| 33 |
emb = SentenceTransformer(EMB_MODEL_NAME)
|
| 34 |
+
index = None # FAISS index (if available)
|
| 35 |
+
matrix = None # fallback: stacked embeddings
|
| 36 |
+
doc_chunks: List[str] = []
|
| 37 |
+
doc_meta: List[dict] = []
|
| 38 |
client = InferenceClient(model=GEN_MODEL, token=HF_TOKEN)
|
| 39 |
|
| 40 |
+
SYSTEM_PROMPT = (
|
| 41 |
+
"You are a helpful assistant. Use the given CONTEXT to answer the QUESTION.\n"
|
| 42 |
+
"If the answer is not in the context, say you don't know.\n"
|
| 43 |
+
"Be concise and list source filenames as [source: file.pdf] at the end."
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
+
# -------------------------------------------------
|
| 47 |
# Helpers
|
| 48 |
+
# -------------------------------------------------
|
| 49 |
+
def _extract_text_from_pdf(path: str) -> str:
    """Return the concatenated text of every page in the PDF at *path*.

    Pages with no extractable text (scans, images) contribute an empty
    string, so the result is always a str joined by newlines.
    """
    reader = PdfReader(path)
    return "\n".join((page.extract_text() or "") for page in reader.pages)
|
| 53 |
+
|
| 54 |
def _chunk_text(text: str, size: int, overlap: int) -> List[str]:
|
| 55 |
+
chunks, step = [], size - overlap
|
| 56 |
+
i, n = 0, len(text)
|
| 57 |
+
while i < n:
|
| 58 |
+
chunk = text[i:i+size].strip()
|
| 59 |
+
if chunk: chunks.append(chunk)
|
| 60 |
+
i += step
|
| 61 |
+
return chunks
|
|
|
|
|
|
|
| 62 |
|
| 63 |
def _embed(texts: List[str]) -> np.ndarray:
    """Encode *texts* with the module-level SentenceTransformer.

    Embeddings are L2-normalized (so inner product == cosine similarity)
    and returned as a float32 ndarray.
    """
    vectors = emb.encode(
        texts,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    return np.asarray(vectors, dtype=np.float32)
|
| 66 |
|
| 67 |
def _ensure_index(dim: int):
    """(Re)initialize the vector store for *dim*-dimensional embeddings.

    With FAISS available, creates a fresh inner-product flat index;
    otherwise clears both globals so the NumPy fallback starts empty.
    """
    global index, matrix
    if not USE_FAISS:
        # Fallback mode: reset both stores; matrix is rebuilt on first add.
        index = None
        matrix = None
        return
    index = faiss.IndexFlatIP(dim)
|
| 74 |
+
|
| 75 |
+
def _add_embeddings(E: np.ndarray):
    """Append embedding rows *E* to the active store.

    Goes into the FAISS index when available, otherwise stacks onto the
    module-level fallback matrix.
    """
    global matrix
    if USE_FAISS:
        index.add(E)
        return
    if matrix is None:
        matrix = E
    else:
        matrix = np.vstack([matrix, E])
|
| 81 |
+
|
| 82 |
+
def _search(qv: np.ndarray, k: int):
    """Return (scores, indices), each shaped (1, k), for query vector *qv*.

    FAISS path returns its native (D, I) pair; the NumPy fallback ranks by
    inner product, which equals cosine similarity because all vectors are
    unit-normalized.
    """
    if USE_FAISS:
        return index.search(qv, k)  # (D, I)
    scores = matrix @ qv[0]
    top = np.argsort(-scores)[:k]
    return scores[top][None, :], top[None, :]
|
| 89 |
+
|
| 90 |
+
# -------------------------------------------------
|
| 91 |
# Build index
|
| 92 |
+
# -------------------------------------------------
|
| 93 |
def build_from_pdfs(files) -> str:
|
| 94 |
+
global doc_chunks, doc_meta
|
| 95 |
doc_chunks, doc_meta = [], []
|
| 96 |
|
|
|
|
| 97 |
for f in files:
|
| 98 |
+
text = _extract_text_from_pdf(f.name)
|
|
|
|
|
|
|
|
|
|
| 99 |
chunks = _chunk_text(text, CHUNK_SIZE, CHUNK_OVERLAP)
|
| 100 |
for c in chunks:
|
| 101 |
doc_chunks.append(c)
|
|
|
|
| 104 |
if not doc_chunks:
|
| 105 |
return "No text extracted. Check your PDFs."
|
| 106 |
|
|
|
|
| 107 |
E = _embed(doc_chunks)
|
| 108 |
_ensure_index(E.shape[1])
|
| 109 |
+
_add_embeddings(E)
|
|
|
|
| 110 |
return f"Indexed {len(doc_chunks)} chunks from {len(files)} file(s)."
|
| 111 |
|
| 112 |
+
# -------------------------------------------------
|
| 113 |
# Retrieval + Generation
|
| 114 |
+
# -------------------------------------------------
|
| 115 |
def _retrieve(query: str, k: int = TOP_K) -> Tuple[List[int], List[str]]:
    """Embed *query*, search the store, and return (chunk ids, chunk texts)."""
    query_vec = _embed([query])
    _scores, hits = _search(query_vec, k)
    ids: List[int] = []
    for i in hits[0].tolist():
        # FAISS pads with -1 when fewer than k vectors are indexed.
        if i >= 0:
            ids.append(i)
    return ids, [doc_chunks[i] for i in ids]
|
| 120 |
|
| 121 |
+
def _call_chat(messages):
    """
    Try several Hugging Face client paths for max compatibility.

    Attempts, in order: (1) the newer chat_completion helper, (2) the
    OpenAI-style chat.completions API, (3) plain text_generation with a
    composed [INST] prompt, (4) the legacy conversational task.
    Returns the generated string from the first path that succeeds, or
    raises the last exception if every path fails.
    """
    # 1) Newer helper
    try:
        resp = client.chat_completion(messages=messages, max_tokens=512, temperature=0.2, top_p=0.95)
        # resp.choices[0].message.content (object or dict) — depends on client version,
        # hence the getattr-or-subscript pattern below.
        choice = resp.choices[0]
        msg = getattr(choice, "message", None) or choice["message"]
        return getattr(msg, "content", None) or msg["content"]
    except Exception as e1:
        last = e1
    # 2) OpenAI-style
    try:
        resp = client.chat.completions.create(model=GEN_MODEL, messages=messages, max_tokens=512, temperature=0.2, top_p=0.95)
        choice = resp.choices[0]
        msg = getattr(choice, "message", None) or choice["message"]
        return getattr(msg, "content", None) or msg["content"]
    except Exception as e2:
        last = e2
    # 3) Text generation with a single composed prompt
    # NOTE(review): [INST] ... [/INST] is the Mistral instruct format — presumably
    # acceptable as a fallback for other models too, but confirm if GEN_MODEL is swapped.
    try:
        prompt = f"[INST] {SYSTEM_PROMPT}\n\n{messages[-1]['content']} [/INST]"
        return client.text_generation(prompt, max_new_tokens=512, temperature=0.2, top_p=0.95,
                                      repetition_penalty=1.05, do_sample=True, return_full_text=False).strip()
    except Exception as e3:
        last = e3
    # 4) Old conversational task
    try:
        conv = client.conversational(
            past_user_inputs=[],
            generated_responses=[],
            text=messages[-1]["content"],
            parameters={"temperature": 0.2, "max_new_tokens": 512},
        )
        # Older clients return a dict, newer ones an object.
        return conv["generated_text"] if isinstance(conv, dict) else conv.generated_text
    except Exception as e4:
        last = e4
    raise last
|
| 162 |
|
| 163 |
def answer(question: str) -> str:
    """Answer *question* against the indexed PDFs.

    Retrieves TOP_K chunks, builds a system+user message pair, and calls the
    chat fallback chain. The reply always ends with a "Sources:" line listing
    the contributing filenames. Returns a user-facing error string (never
    raises) when generation fails or the index is not built yet.

    Fix: the original unpacked ``ids, ctx_chunks = _retrieve(...)`` but never
    used ``ctx_chunks`` (contexts are rebuilt below with filename metadata);
    the unused local is now discarded explicitly.
    """
    if not question.strip():
        return "Ask a question."
    # No vector store yet (neither FAISS index nor fallback matrix), or nothing indexed.
    if (USE_FAISS and index is None) or (not USE_FAISS and matrix is None) or not doc_chunks:
        return "Upload PDFs and click **Build Index** first."

    ids, _ = _retrieve(question, TOP_K)
    contexts, files = [], []
    for rank, i in enumerate(ids, start=1):
        chunk = doc_chunks[i][:1000]  # cap each chunk so the prompt stays bounded
        fname = doc_meta[i]["file"]
        contexts.append(f"[{rank}] {fname}\n{chunk}")
        files.append(fname)

    context_str = "\n\n---\n".join(contexts)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"QUESTION: {question}\n\nCONTEXT:\n{context_str}"},
    ]

    try:
        out = _call_chat(messages)
        unique_files = ", ".join(sorted(set(files))) if files else "N/A"
        return f"{out.strip()}\n\nSources: {unique_files}"
    except Exception as e:
        return f"Generation error: {e}\n(Verify your HUGGINGFACEHUB_API_TOKEN and model availability.)"
|
| 189 |
|
| 190 |
+
# -------------------------------------------------
|
| 191 |
# UI
|
| 192 |
+
# -------------------------------------------------
|
| 193 |
with gr.Blocks(title="Mistral 7B PDF-RAG") as demo:
|
| 194 |
gr.Markdown("# 📚 PDF-RAG (Mistral-7B-Instruct)\nUpload PDFs → Build Index → Ask questions. Answers cite sources.")
|
| 195 |
|
|
|
|
| 205 |
|
| 206 |
build_btn.click(build_from_pdfs, inputs=[files], outputs=[status])
|
| 207 |
ask_btn.click(answer, inputs=[q], outputs=[a])
|
| 208 |
+
q.submit(answer, inputs=[q], outputs=[a])
|
| 209 |
|
| 210 |
if __name__ == "__main__":
|
| 211 |
demo.launch()
|