Spaces:

akazmi
/

Documents-Reader-RAG

Sleeping

App Files Files Community

akazmi committed on Jun 24, 2025

Commit

7aec612

verified ·

1 Parent(s): 45d1cd8

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -41

app.py CHANGED Viewed

@@ -1,77 +1,86 @@
 import gradio as gr
 import pdfplumber
 from sentence_transformers import SentenceTransformer
-from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
 import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
-import torch
-# ✅ Load models
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 embedder = SentenceTransformer("all-MiniLM-L6-v2", device=device)
-model_name = "google/flan-t5-small"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
 qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
-# ✅ Step 1: Read PDF using pdfplumber
 def read_pdf(file_path):
     try:
         with pdfplumber.open(file_path) as pdf:
             text = "\n".join([page.extract_text() or "" for page in pdf.pages])
-        return text.strip()
     except Exception as e:
-        return f"❌ Failed to read PDF: {e}"
-# ✅ Step 2: Chunk document text
-def chunk_text(text, chunk_size=500):
-    words = text.split()
-    return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
-# ✅ Step 3: Semantic retrieval of relevant chunks
-def get_relevant_chunks(question, chunks, top_k=2):
-    question_embedding = embedder.encode([question])
-    chunk_embeddings = embedder.encode(chunks)
-    similarities = cosine_similarity(question_embedding, chunk_embeddings)[0]
-    top_k_indices = np.argsort(similarities)[-top_k:][::-1]
-    return "\n\n".join([chunks[i] for i in top_k_indices])
-# ✅ Step 4: Generate answer with retrieved context
 def answer_question(pdf_file, user_question):
-    if pdf_file is None or user_question.strip() == "":
-        return "⚠️ Please upload a document and enter a question."
     text = read_pdf(pdf_file.name)
-    if not text:
-        return "⚠️ PDF has no readable text."
     chunks = chunk_text(text)
-    relevant_context = get_relevant_chunks(user_question, chunks, top_k=2)
-    prompt = f"""You are a helpful assistant. Use the context to answer the question.
-Context:
-{relevant_context}
-Question: {user_question}
-Answer:"""
     try:
-        result = qa_pipeline(prompt, max_new_tokens=200)
         return result[0]["generated_text"].split("Answer:")[-1].strip()
     except Exception as e:
-        return f"❌ Error during generation: {e}"
-# ✅ Step 5: Gradio UI
 with gr.Blocks() as demo:
-    gr.Markdown("### 📄 Ask Questions from Your PDF Document (RAG-based QA)")
-    with gr.Row():
-        pdf_input = gr.File(label="📁 Upload PDF", file_types=[".pdf"])
-        question_input = gr.Textbox(label="❓ Ask a question about the document")
-    answer_output = gr.Textbox(label="🧠 Answer", lines=8)
-    submit_btn = gr.Button("🔍 Get Answer")
-    submit_btn.click(fn=answer_question, inputs=[pdf_input, question_input], outputs=answer_output)
 demo.launch()

 import gradio as gr
 import pdfplumber
+import torch
 from sentence_transformers import SentenceTransformer
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
+import re
# Load models
# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Sentence encoder used to embed the question and the document chunks.
embedder = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Seq2seq model that writes the answer from the retrieved context.
model_name = "google/flan-t5-base"  # stronger than 'small'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# The pipeline takes a device index: 0 for the first GPU, -1 for the CPU.
qa_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if device.type == "cuda" else -1,
)
# Extract and clean PDF text
def read_pdf(file_path):
    """Return the text of the PDF at ``file_path`` with blank lines collapsed.

    Text is extracted page by page; a page that yields no text contributes an
    empty string. The pages are joined with newlines, the result is stripped,
    and every run of consecutive newlines is reduced to a single newline.

    Errors are reported in-band: if anything raises, the return value is a
    message starting with "❌ PDF reading failed:" instead of document text.
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        joined = "\n".join(page_texts)
        return re.sub(r'\n+', '\n', joined.strip())
    except Exception as e:
        return f"❌ PDF reading failed: {e}"
# Chunk the text into clean sentence-like blocks
def chunk_text(text, max_length=500):
    """Split ``text`` into chunks built from whole sentences.

    Sentences are delimited by whitespace that follows ``.``, ``!`` or ``?``.
    They are packed greedily: a sentence joins the current chunk while the
    running length stays within ``max_length`` characters, otherwise the
    current chunk is closed and the sentence starts a new one.

    A single sentence longer than ``max_length`` is kept whole as its own
    chunk; it is not split further.

    Args:
        text: The document text to split.
        max_length: Character budget used when packing sentences.

    Returns:
        A list of non-empty, stripped chunk strings. Empty or
        whitespace-only input yields an empty list.
    """
    chunks = []
    current_chunk = ""
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(current_chunk) + len(sentence) <= max_length:
            current_chunk += sentence + " "
            continue
        # Close the current chunk, but never emit an empty one: the buffer is
        # still empty when the very first sentence already exceeds the budget.
        finished = current_chunk.strip()
        if finished:
            chunks.append(finished)
        current_chunk = sentence + " "

    # Flush whatever is left, again skipping a whitespace-only remainder.
    remainder = current_chunk.strip()
    if remainder:
        chunks.append(remainder)
    return chunks
# Embed and get top chunks
def get_top_chunks(question, chunks, k=2):
    """Return the ``k`` chunks most similar to ``question`` as one string.

    The question and every chunk are embedded with the module-level
    ``embedder`` and ranked by cosine similarity to the question.

    Args:
        question: The user's question.
        chunks: List of text chunks to search.
        k: How many chunks to return; fewer are returned when ``chunks`` is
            shorter than ``k``.

    Returns:
        The selected chunks, most similar first, joined by a blank line.
        An empty string when ``chunks`` is empty or ``k`` is not positive.
    """
    # Nothing to rank. This also avoids a negative-index slice with k == 0,
    # which would select every chunk instead of none.
    if not chunks or k <= 0:
        return ""

    q_embed = embedder.encode([question])
    chunk_embeds = embedder.encode(chunks)
    sims = cosine_similarity(q_embed, chunk_embeds)[0]

    # Indices ordered from most to least similar, cut to the first k.
    top_k_idx = np.argsort(sims)[::-1][:k]
    return "\n\n".join(chunks[i] for i in top_k_idx)
# Generate answer
def answer_question(pdf_file, user_question):
    """Answer ``user_question`` from the uploaded PDF and return display text.

    Steps: validate the inputs, read the PDF text, split it into chunks,
    retrieve the chunks most similar to the question, and ask the generation
    pipeline to answer from that context.

    Args:
        pdf_file: The upload from the file input. Either an object exposing
            the path as ``.name`` or a plain path string is accepted.
        user_question: The question text. ``None`` or blank is rejected.

    Returns:
        The generated answer, or a message starting with "⚠️" or "❌" when
        input is missing, the PDF cannot be read, the PDF has no extractable
        text, or generation fails.
    """
    if not pdf_file or not user_question or not user_question.strip():
        return "⚠️ Upload a PDF and enter your question."

    # NOTE(review): the upload may arrive as an object with .name or as a
    # bare path string depending on the Gradio version/config; confirm.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    text = read_pdf(pdf_path)
    if text.startswith("❌"):
        # read_pdf reports failures in-band; pass its message through.
        return text
    if not text:
        # Returning the empty text would show a blank answer box with no
        # explanation, so say what happened instead.
        return "⚠️ No readable text was found in this PDF."

    chunks = chunk_text(text)
    if not chunks:
        return "⚠️ No readable text was found in this PDF."

    relevant = get_top_chunks(user_question, chunks)
    prompt = (
        f"You are a legal document assistant. Based on the context below, "
        f"answer the question briefly and clearly.\n\n"
        f"Context:\n{relevant}\n\n"
        f"Question: {user_question}\n\nAnswer:"
    )
    try:
        result = qa_pipeline(prompt, max_new_tokens=256, do_sample=False)
        # Keep only what follows the last "Answer:" marker, if one appears.
        return result[0]["generated_text"].split("Answer:")[-1].strip()
    except Exception as e:
        return f"❌ Generation error: {e}"
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## 📚 Legal Document Q&A Assistant")

    # Inputs: the PDF to query and the question to ask about it.
    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
    question_input = gr.Textbox(label="Ask a question")

    # Output box, and the button that runs answer_question on the inputs.
    answer_output = gr.Textbox(label="Answer", lines=8)
    ask_button = gr.Button("Get Answer")
    ask_button.click(
        fn=answer_question,
        inputs=[pdf_input, question_input],
        outputs=answer_output,
    )

demo.launch()