Spaces:

akazmi
/

Documents-Reader-RAG

Sleeping

App Files Community

akazmi committed on Jun 24, 2025

Commit

b373765

verified ·

1 Parent(s): 7d298ed

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -46

app.py CHANGED Viewed

@@ -1,102 +1,98 @@
 import gradio as gr
 import os
 from groq import Groq
-from PyPDF2 import PdfReader
-# Initialize Groq client
 def initialize_groq():
     return Groq(api_key=os.getenv("GROQ_API_KEY"))
-# Clean common typos in user questions
 def clean_question(user_question):
-    corrections = {
-        "slaps": "slabs",
-        "salried": "salaried",
-        "slabbs": "slabs"
-    }
-    for wrong, correct in corrections.items():
-        user_question = user_question.replace(wrong, correct)
     return user_question
-# Read uploaded PDF and return its text
 def read_pdf(uploaded_file):
     try:
-        reader = PdfReader(uploaded_file)
-        text = ""
-        for page in reader.pages:
-            page_text = page.extract_text()
-            if page_text:
-                text += page_text
-        return text
     except Exception as e:
-        return f"Error reading PDF: {str(e)}"
-# Split text into chunks for retrieval
 def chunk_text(text, chunk_size=3000):
     return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
-# Basic keyword overlap similarity
 def similarity(query, text):
     query_words = set(query.lower().split())
     text_words = set(text.lower().split())
     return len(query_words & text_words)
-# Get most relevant chunk of document
 def retrieve_relevant_document(user_question, document_text):
     chunks = chunk_text(document_text)
-    if not chunks:
-        return "No readable content in the PDF."
-    return max(chunks, key=lambda chunk: similarity(user_question, chunk))
-# Generate answer using Groq model
 def answer_question(file, user_question):
-    if file is None:
         return "Please upload a PDF document."
     user_question = clean_question(user_question)
     document_text = read_pdf(file)
-    if not document_text or "error" in document_text.lower():
-        return "Unable to read document or it's empty."
     relevant_chunk = retrieve_relevant_document(user_question, document_text)
-    # Build the prompt for the LLM
-    prompt = f"""You are a tax and law expert. Read the document and answer the user query concisely.
 User Question: {user_question}
 Relevant Extract from Document:
 {relevant_chunk}
 """
-    client = initialize_groq()
     try:
-        chat_completion = client.chat.completions.create(
             messages=[{"role": "user", "content": prompt}],
-            model="llama3-8b-8192",
         )
-        return chat_completion.choices[0].message.content
     except Exception as e:
-        return f"Error generating answer: {str(e)}"
-# Create Gradio Interface
 def create_interface():
     with gr.Blocks() as demo:
-        gr.Markdown("## 📄 Legal Document Q&A Chatbot\nUpload a PDF and ask questions based on its contents.")
         file_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
-        question_input = gr.Textbox(label="Enter your question", placeholder="E.g., What are the tax slabs for salaried individuals?")
         answer_output = gr.Textbox(label="Answer")
-        submit_btn = gr.Button("Ask")
-        submit_btn.click(fn=answer_question, inputs=[file_input, question_input], outputs=answer_output)
     return demo
-# Launch the app
 if __name__ == "__main__":
     demo = create_interface()
     demo.launch()

 import gradio as gr
 import os
 from groq import Groq
+import pdfplumber
+import pytesseract
+from PIL import Image
+from pdf2image import convert_from_path
+# --- Helper Functions ---
 def initialize_groq():
     return Groq(api_key=os.getenv("GROQ_API_KEY"))
 def clean_question(user_question):
+    corrections = {"slaps": "slabs", "salried": "salaried"}
+    for wrong, right in corrections.items():
+        user_question = user_question.replace(wrong, right)
     return user_question
 def read_pdf(uploaded_file):
     try:
+        with pdfplumber.open(uploaded_file.name) as pdf:
+            full_text = ""
+            for page in pdf.pages:
+                text = page.extract_text()
+                if text:
+                    full_text += text
+        if not full_text.strip():
+            # OCR fallback
+            images = convert_from_path(uploaded_file.name)
+            full_text = ""
+            for img in images:
+                text = pytesseract.image_to_string(img)
+                full_text += text
+        return full_text.strip()
     except Exception as e:
+        return f"Error reading PDF: {e}"
 def chunk_text(text, chunk_size=3000):
     return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
 def similarity(query, text):
     query_words = set(query.lower().split())
     text_words = set(text.lower().split())
     return len(query_words & text_words)
 def retrieve_relevant_document(user_question, document_text):
     chunks = chunk_text(document_text)
+    return max(chunks, key=lambda chunk: similarity(user_question, chunk)) if chunks else ""
 def answer_question(file, user_question):
+    if not file:
         return "Please upload a PDF document."
     user_question = clean_question(user_question)
     document_text = read_pdf(file)
+    if not document_text:
+        return "❌ Document appears empty or unreadable. Please try a different file."
     relevant_chunk = retrieve_relevant_document(user_question, document_text)
+    prompt = f"""You are a tax/legal assistant. Read the following extract and answer the user's query.
 User Question: {user_question}
 Relevant Extract from Document:
 {relevant_chunk}
 """
     try:
+        client = initialize_groq()
+        response = client.chat.completions.create(
             messages=[{"role": "user", "content": prompt}],
+            model="llama3-8b-8192"
         )
+        return response.choices[0].message.content
     except Exception as e:
+        return f"Error generating answer: {e}"
+# --- Gradio UI ---
 def create_interface():
     with gr.Blocks() as demo:
+        gr.Markdown("## 📄 Legal Document Q&A\nUpload a PDF and ask questions based on its content.")
         file_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
+        question_input = gr.Textbox(label="Your Question")
         answer_output = gr.Textbox(label="Answer")
+        submit = gr.Button("Ask")
+        submit.click(fn=answer_question, inputs=[file_input, question_input], outputs=answer_output)
     return demo
+# Launch
 if __name__ == "__main__":
     demo = create_interface()
     demo.launch()