Spaces:

akazmi
/

Documents-Reader-RAG

Sleeping

App Files Files Community

akazmi committed on Jun 24, 2025

Commit

0c58ac5

verified ·

1 Parent(s): 7aec612

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -56

app.py CHANGED Viewed

@@ -1,86 +1,85 @@
 import gradio as gr
-import pdfplumber
 import torch
-from sentence_transformers import SentenceTransformer
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
-import re
-# Load models
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-embedder = SentenceTransformer("all-MiniLM-L6-v2", device=device)
-model_name = "google/flan-t5-base"  # stronger than 'small'
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
-qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
-# Extract and clean PDF text
 def read_pdf(file_path):
     try:
         with pdfplumber.open(file_path) as pdf:
-            text = "\n".join([page.extract_text() or "" for page in pdf.pages])
-        return re.sub(r'\n+', '\n', text.strip())
     except Exception as e:
-        return f"❌ PDF reading failed: {e}"
-# Chunk the text into clean sentence-like blocks
-def chunk_text(text, max_length=500):
-    sentences = re.split(r'(?<=[.!?])\s+', text)
-    chunks = []
-    current_chunk = ""
     for sentence in sentences:
-        if len(current_chunk) + len(sentence) <= max_length:
-            current_chunk += sentence + " "
         else:
-            chunks.append(current_chunk.strip())
-            current_chunk = sentence + " "
-    if current_chunk:
-        chunks.append(current_chunk.strip())
     return chunks
-# Embed and get top chunks
-def get_top_chunks(question, chunks, k=2):
-    q_embed = embedder.encode([question])
-    chunk_embeds = embedder.encode(chunks)
-    sims = cosine_similarity(q_embed, chunk_embeds)[0]
-    top_k_idx = np.argsort(sims)[-k:][::-1]
-    return "\n\n".join([chunks[i] for i in top_k_idx])
-# Generate answer
-def answer_question(pdf_file, user_question):
-    if not pdf_file or not user_question.strip():
-        return "⚠️ Upload a PDF and enter your question."
-    text = read_pdf(pdf_file.name)
-    if not text or text.startswith("❌"):
-        return text
-    chunks = chunk_text(text)
-    relevant = get_top_chunks(user_question, chunks)
     prompt = (
-        f"You are a legal document assistant. Based on the context below, "
-        f"answer the question briefly and clearly.\n\n"
-        f"Context:\n{relevant}\n\n"
-        f"Question: {user_question}\n\nAnswer:"
     )
     try:
-        result = qa_pipeline(prompt, max_new_tokens=256, do_sample=False)
-        return result[0]["generated_text"].split("Answer:")[-1].strip()
     except Exception as e:
-        return f"❌ Generation error: {e}"
-# Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("## 📚 Legal Document Q&A Assistant")
-    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
-    question_input = gr.Textbox(label="Ask a question")
-    answer_output = gr.Textbox(label="Answer", lines=8)
-    ask_button = gr.Button("Get Answer")
-    ask_button.click(answer_question, inputs=[pdf_input, question_input], outputs=answer_output)
 demo.launch()

 import gradio as gr
 import torch
+import pdfplumber
+import re
 import numpy as np
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
# ===== Load Embedding Model =====
# Sentence-embedding model used to vectorise both the question and the
# document chunks for similarity search.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# ===== Load QA Model =====
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# float16 when CUDA is available, float32 otherwise; device placement is
# delegated to `device_map="auto"`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)
# Do NOT pass `device=` here: the model was already placed by
# `device_map="auto"`, and the pipeline refuses (or warns about) moving a
# model that has been dispatched that way.
qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
# ===== Read PDF Text =====
def read_pdf(file_path):
    """Return the text of every page of the PDF at *file_path*.

    Page texts are joined with a newline; a page for which
    ``extract_text()`` yields nothing contributes an empty string.

    Failures are reported in-band rather than raised: any exception is
    caught and turned into a string starting with ``"Error reading PDF: "``.
    """
    try:
        with pdfplumber.open(file_path) as document:
            page_texts = []
            for page in document.pages:
                extracted = page.extract_text()
                page_texts.append(extracted if extracted else "")
            return "\n".join(page_texts)
    except Exception as exc:
        return f"Error reading PDF: {str(exc)}"
# ===== Smart Sentence Chunking =====
def chunk_text(text, max_len=500):
    """Split *text* into sentence-aligned chunks of roughly *max_len* characters.

    Sentences are detected by whitespace following ``.``, ``!``, ``?`` or the
    Arabic question mark ``؟``. Sentences are packed greedily: a sentence is
    added to the current chunk while the running length (each sentence counted
    with one trailing separator) plus the new sentence stays within *max_len*;
    otherwise the current chunk is closed and the sentence starts a new one.

    A single sentence longer than *max_len* is kept whole as its own chunk;
    it is never split mid-sentence.

    Args:
        text: The text to split.
        max_len: Soft upper bound on chunk length, in characters.

    Returns:
        A list of stripped chunk strings. Text without any content still
        yields a single empty chunk (``[""]``), as before, so callers that
        expect a non-empty list keep working.
    """
    # Both the ASCII and the Arabic question mark terminate a sentence.
    sentences = re.split(r'(?<=[.!?؟])\s+', text)

    chunks = []
    parts = []        # sentences of the chunk being built
    running_len = 0   # len of those sentences, one separator each

    for sentence in sentences:
        if running_len + len(sentence) <= max_len:
            parts.append(sentence)
            running_len += len(sentence) + 1
            continue
        # Only flush when something was collected; otherwise an over-long
        # first sentence would produce a spurious empty chunk.
        if parts:
            chunks.append(" ".join(parts).strip())
        parts = [sentence]
        running_len = len(sentence) + 1

    if parts:
        chunks.append(" ".join(parts).strip())
    return chunks
# ===== Semantic Retrieval =====
def get_relevant_chunks(question, chunks, top_k=2):
    """Return the *top_k* chunks most similar to *question*, best first.

    The question and every chunk are embedded with the module-level
    ``embedder`` and ranked by cosine similarity.

    Args:
        question: The user's question.
        chunks: Candidate text chunks.
        top_k: Number of chunks to keep. If it exceeds ``len(chunks)``, all
            chunks are returned.

    Returns:
        The selected chunks joined by a blank line, or ``""`` when there is
        nothing to select (no chunks, or ``top_k`` not positive).
    """
    # Guard: encoding an empty list cannot be fed to cosine_similarity, and
    # a slice of `[-0:]` would select *every* chunk instead of none.
    if not chunks or top_k <= 0:
        return ""

    q_vec = embedder.encode([question])
    c_vecs = embedder.encode(chunks)
    sims = cosine_similarity(q_vec, c_vecs)[0]
    # argsort is ascending: take the last top_k indices and reverse them so
    # the most similar chunk comes first.
    top_indices = np.argsort(sims)[-top_k:][::-1]
    return "\n\n".join(chunks[i] for i in top_indices)
# ===== Generate Answer =====
def answer_question(file, question):
    """Answer *question* from the uploaded PDF using retrieval + generation.

    Steps: validate the inputs, extract the PDF text, chunk it, retrieve the
    chunks most similar to the question, and prompt the text-generation
    pipeline with that context.

    Args:
        file: The uploaded PDF, either an object exposing the path as
            ``.name`` or the path string itself.
        question: The user's question.

    Returns:
        The generated answer, or a user-facing warning/error message. This
        function reports problems as strings rather than raising, since its
        return value is displayed directly in the UI.
    """
    if not file:
        return "⚠️ Please upload a PDF."
    # `question` may be None as well as blank.
    if not question or not question.strip():
        return "⚠️ Please enter a question."

    # Accept both an upload object (`.name`) and a plain path string.
    pdf_path = getattr(file, "name", file)
    raw_text = read_pdf(pdf_path)
    # Match read_pdf's exact error prefix; a bare "Error" check would also
    # reject documents whose own text happens to start with that word.
    if raw_text.startswith("Error reading PDF:"):
        return raw_text

    # Drop empty chunks so image-only PDFs are reported instead of being
    # sent to the model with an empty context.
    chunks = [chunk for chunk in chunk_text(raw_text) if chunk]
    if not chunks:
        return "⚠️ No readable text was found in this PDF."

    context = get_relevant_chunks(question, chunks)
    prompt = (
        f"You are a legal expert. Based on the context below, answer the question in a detailed and explanatory manner.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}\n\n"
        f"Answer:"
    )
    try:
        # Greedy decoding: `temperature` has no effect without sampling, so
        # it is not passed. `return_full_text=False` returns only the newly
        # generated text, so the answer no longer has to be cut out of the
        # echoed prompt by splitting on "Answer:" (which truncated answers
        # that contained that marker themselves).
        response = qa_pipeline(
            prompt,
            max_new_tokens=300,
            do_sample=False,
            return_full_text=False,
        )
        return response[0]["generated_text"].strip()
    except Exception as e:
        return f"Error generating answer: {e}"
# ===== Gradio Interface =====
# One-page UI: a PDF upload, a question box, an answer box and a button
# that routes (file, question) through answer_question into the answer box.
with gr.Blocks() as demo:
    gr.Markdown("## 📘 Document Question Answering (RAG-powered)")

    # Inputs
    file = gr.File(
        label="Upload PDF",
        file_types=[".pdf"],
    )
    question = gr.Textbox(
        label="Ask a question",
        placeholder="e.g., Is there any section for cost audit?",
    )

    # Output
    answer = gr.Textbox(
        label="Answer",
        lines=10,
    )

    # Wiring
    submit = gr.Button("Get Answer")
    submit.click(
        fn=answer_question,
        inputs=[file, question],
        outputs=answer,
    )

demo.launch()