Update app.py
app.py
CHANGED
@@ -1,67 +1,163 @@
import gradio as gr
-from langchain_community.document_loaders import PyPDFLoader
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_community.vectorstores import FAISS
-from langchain_huggingface import HuggingFaceEndpoint
-def
-if
-return "Please upload a PDF
-vectordb = FAISS.from_documents(chunks, embeddings)
-You are a helpful assistant. Answer ONLY using the context.
-If the answer is not present, say "I don't know".
Context:
{context}
-{question}
-return f"### Answer\n{answer}\n\n---\n### Sources\n{sources}"
pdf = gr.File(label="Upload PDF", type="filepath")
demo.launch()
+import os
+import re
import gradio as gr
+import faiss
+import numpy as np
+
+from pypdf import PdfReader
+from sentence_transformers import SentenceTransformer
+from huggingface_hub import InferenceClient
+
+
+# -----------------------------
+# Config
+# -----------------------------
+HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN") or os.getenv("HF_TOKEN")
+# Pick a model that works with the Inference API (you can change this)
+HF_LLM_MODEL = os.getenv("HF_LLM_MODEL", "mistralai/Mistral-7B-Instruct-v0.3")
+
+EMBED_MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
+TOP_K = 4
+
+
+# -----------------------------
+# Helpers
+# -----------------------------
+def clean_text(s: str) -> str:
+    s = re.sub(r"\s+", " ", s)
+    return s.strip()
+
+def chunk_text(text: str, chunk_size=900, overlap=150):
+    # Sliding window with overlap; tiny fragments are dropped after cleanup.
+    chunks = []
+    start = 0
+    n = len(text)
+    while start < n:
+        end = min(n, start + chunk_size)
+        chunks.append(text[start:end])
+        start = end - overlap
+        if start < 0:
+            start = 0
+        if end == n:
+            break
+    return [c for c in (clean_text(x) for x in chunks) if len(c) > 30]
+
+def pdf_to_text(pdf_path: str) -> str:
+    reader = PdfReader(pdf_path)
+    pages = []
+    for p in reader.pages:
+        t = p.extract_text() or ""
+        if t.strip():
+            pages.append(t)
+    return "\n".join(pages)
+
+def build_faiss_index(chunks, embedder):
+    vectors = embedder.encode(chunks, convert_to_numpy=True, normalize_embeddings=True)
+    dim = vectors.shape[1]
+    index = faiss.IndexFlatIP(dim)  # inner product equals cosine similarity for unit-norm vectors
+    index.add(vectors.astype(np.float32))
+    return index, vectors
+
+def retrieve(query, embedder, index, chunks, k=TOP_K):
+    qv = embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
+    scores, ids = index.search(qv, k)
+    hits = []
+    for score, idx in zip(scores[0], ids[0]):
+        if idx == -1:  # FAISS pads with -1 when fewer than k results exist
+            continue
+        hits.append((float(score), chunks[int(idx)]))
+    return hits
+
+def hf_generate(client: InferenceClient, prompt: str) -> str:
+    # Works with many chat/instruct models using "text_generation"
+    out = client.text_generation(
+        prompt,
+        max_new_tokens=450,
+        temperature=0.2,
+        top_p=0.9,
+        repetition_penalty=1.08,
+    )
+    return out.strip()
+
+# -----------------------------
+# App logic (cached state)
+# -----------------------------
+embedder = SentenceTransformer(EMBED_MODEL_NAME)
+
+def on_upload(pdf_path):
+    if not pdf_path:
+        return None, None, "Please upload a PDF."
+
+    text = pdf_to_text(pdf_path)
+    if not text.strip():
+        return None, None, "Could not extract text from this PDF (it may be scanned). Try a text-based PDF."
+
+    chunks = chunk_text(text)
+    if len(chunks) < 2:
+        return None, None, "Not enough extractable text to build a RAG index."
+
+    index, _ = build_faiss_index(chunks, embedder)
+    return index, chunks, f"✅ Indexed {len(chunks)} chunks. Now ask a question."
+
+def answer_question(index, chunks, question):
+    if index is None or chunks is None:
+        return "Upload a PDF first."
+    if not question or not question.strip():
+        return "Type a question."
+
+    if not HF_TOKEN:
+        return (
+            "HF token not found. Go to Space → Settings → Variables and secrets → "
+            "add a Secret named HUGGINGFACEHUB_API_TOKEN, then restart the Space."
+        )
+
+    hits = retrieve(question, embedder, index, chunks, k=TOP_K)
+    context = "\n\n".join([f"[{i+1}] {h[1]}" for i, h in enumerate(hits)])
+
+    prompt = f"""You are a helpful assistant. Answer using ONLY the context.
+If the answer is not in the context, say "I don't know from the provided document."
+
+Question: {question}
+
Context:
{context}
+
+Answer:"""
+
+    client = InferenceClient(model=HF_LLM_MODEL, token=HF_TOKEN)
+    ans = hf_generate(client, prompt)
+
+    sources = "\n\n".join([f"**Source {i+1} (score={hits[i][0]:.3f})**\n{hits[i][1][:600]}..." for i in range(len(hits))])
+
+    return f"### Answer\n{ans}\n\n---\n### Retrieved Sources\n{sources}"
+
+# -----------------------------
+# UI
+# -----------------------------
+with gr.Blocks(title="Agentic Document Intelligence (HF RAG)") as demo:
+    gr.Markdown("# 📄 Agentic Document Intelligence\nUpload a PDF and ask questions (RAG) using the Hugging Face Inference API.")
pdf = gr.File(label="Upload PDF", type="filepath")
+    status = gr.Markdown()
+
+    index_state = gr.State(None)
+    chunks_state = gr.State(None)
+
+    pdf.change(
+        fn=on_upload,
+        inputs=[pdf],
+        outputs=[index_state, chunks_state, status],
+    )
+
+    question = gr.Textbox(label="Ask a question", placeholder="e.g., What is the payment term?")
+    out = gr.Markdown()
+    btn = gr.Button("Run")
+
+    btn.click(
+        fn=answer_question,
+        inputs=[index_state, chunks_state, question],
+        outputs=[out],
+    )
demo.launch()
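
A note on dependencies: the rewrite drops the LangChain stack in favor of direct pypdf, sentence-transformers, FAISS, and huggingface_hub calls, so the Space's requirements.txt must change to match. A plausible set (package names are an assumption; this commit only touches app.py):

    gradio
    pypdf
    faiss-cpu
    numpy
    sentence-transformers
    huggingface_hub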
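On the retrieval math: build_faiss_index encodes with normalize_embeddings=True and searches an IndexFlatIP, so the inner-product scores it returns are cosine similarities. A minimal standalone sketch of the same mechanism (model name reused from the config above; the example strings are made up):

    import faiss
    import numpy as np
    from sentence_transformers import SentenceTransformer

    embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    docs = ["Payment is due within 30 days.", "The contract starts in January."]
    vecs = embedder.encode(docs, convert_to_numpy=True, normalize_embeddings=True)

    index = faiss.IndexFlatIP(vecs.shape[1])  # inner product over unit vectors = cosine
    index.add(vecs.astype(np.float32))

    q = embedder.encode(["What is the payment term?"],
                        convert_to_numpy=True,
                        normalize_embeddings=True).astype(np.float32)
    scores, ids = index.search(q, 1)
    print(docs[ids[0][0]], scores[0][0])  # best-matching chunk and its cosine score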
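One caveat on generation: hf_generate uses InferenceClient.text_generation, which assumes the chosen model is deployed for that task; some hosted models only accept chat-style requests. A hedged fallback sketch using InferenceClient.chat_completion (a real huggingface_hub method; the wrapper name below is hypothetical, not part of the commit):

    from huggingface_hub import InferenceClient

    def hf_generate_chat(client: InferenceClient, prompt: str) -> str:
        # Drop-in alternative to hf_generate for chat-only endpoints.
        resp = client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=450,
            temperature=0.2,
        )
        return resp.choices[0].message.content.strip()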