# Uploaded by: akazmi
# Commit message: "Update app.py"
# Commit: b373765 (verified)
import os
import re

import gradio as gr
import pdfplumber
import pytesseract
from groq import Groq
from pdf2image import convert_from_path
from PIL import Image
# --- Helper Functions ---
def initialize_groq():
    """Create a Groq API client.

    The API key is looked up in the ``GROQ_API_KEY`` environment variable;
    when the variable is not set, ``None`` is handed to the client.

    Returns:
        A ``Groq`` client instance.
    """
    groq_key = os.environ.get("GROQ_API_KEY")
    return Groq(api_key=groq_key)
def clean_question(user_question):
    """Fix known misspellings in a user's question.

    Only whole words are corrected, so a word that merely contains one of
    the typos (e.g. "slapstick") is left untouched.

    Args:
        user_question: Raw question text. ``None`` or an empty string is
            treated as an empty question.

    Returns:
        The question with every known typo replaced by its correction, or
        ``""`` when no question was supplied.
    """
    if not user_question:
        return ""
    corrections = {"slaps": "slabs", "salried": "salaried"}
    for wrong, right in corrections.items():
        # \b anchors keep the replacement from firing inside longer words.
        pattern = rf"\b{re.escape(wrong)}\b"
        user_question = re.sub(pattern, right, user_question)
    return user_question
def read_pdf(uploaded_file):
    """Extract the text of a PDF, falling back to OCR when it has no text layer.

    Args:
        uploaded_file: Either a filesystem path (``str`` or ``os.PathLike``)
            or an object whose ``name`` attribute holds the path.

    Returns:
        The extracted text with surrounding whitespace stripped. If anything
        goes wrong, a string starting with ``"Error reading PDF:"`` followed
        by the exception message is returned instead of raising.
    """
    try:
        # The UI's gr.File(type="filepath") component hands over a plain path
        # string, which has no ``.name``; file-like wrappers still work too.
        if isinstance(uploaded_file, (str, os.PathLike)):
            pdf_path = os.fspath(uploaded_file)
        else:
            pdf_path = uploaded_file.name

        with pdfplumber.open(pdf_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Join with newlines so the last word of one page is not fused with
        # the first word of the next.
        full_text = "\n".join(text for text in page_texts if text)

        if not full_text.strip():
            # OCR fallback: render each page to an image and recognise it.
            images = convert_from_path(pdf_path)
            full_text = "\n".join(
                pytesseract.image_to_string(img) for img in images
            )
        return full_text.strip()
    except Exception as e:
        return f"Error reading PDF: {e}"
def chunk_text(text, chunk_size=3000):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per piece.

    Returns:
        A list of substrings in document order; the last one may be shorter.
        An empty *text* yields an empty list.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        stop = start + chunk_size
        pieces.append(text[start:stop])
    return pieces
def similarity(query, text):
    """Score how many distinct words *query* and *text* have in common.

    Both strings are lowercased and split on whitespace; the score is the
    number of unique words that occur in both.

    Returns:
        The size of the shared vocabulary as an ``int``.
    """
    query_vocab = set(query.lower().split())
    text_vocab = set(text.lower().split())
    return sum(1 for word in query_vocab if word in text_vocab)
def retrieve_relevant_document(user_question, document_text):
    """Pick the chunk of the document that best matches the question.

    The document is split with ``chunk_text`` and every piece is scored
    against the question with ``similarity``.

    Returns:
        The highest-scoring chunk (the earliest one on a tie), or ``""``
        when the document produces no chunks.
    """
    candidates = chunk_text(document_text)
    if not candidates:
        return ""

    def overlap_score(candidate):
        return similarity(user_question, candidate)

    return max(candidates, key=overlap_score)
def answer_question(file, user_question):
    """Answer a question about an uploaded PDF using the Groq chat API.

    Args:
        file: The uploaded PDF as passed in by the UI; a falsy value means
            nothing was uploaded.
        user_question: The question typed by the user.

    Returns:
        The model's answer, or a human-readable message when the upload or
        question is missing, the PDF cannot be read, or the API call fails.
        This function never raises for those cases; it reports them as text.
    """
    if not file:
        return "Please upload a PDF document."
    # Reject a blank question up front instead of spending a PDF parse and an
    # API call on it.
    if not user_question or not user_question.strip():
        return "Please enter a question."

    user_question = clean_question(user_question)
    document_text = read_pdf(file)
    if not document_text:
        return "❌ Document appears empty or unreadable. Please try a different file."
    # read_pdf reports failures as a string with this prefix; show it to the
    # user rather than feeding the error message to the model as "document".
    if document_text.startswith("Error reading PDF:"):
        return document_text

    relevant_chunk = retrieve_relevant_document(user_question, document_text)
    prompt = f"""You are a tax/legal assistant. Read the following extract and answer the user's query.
User Question: {user_question}
Relevant Extract from Document:
{relevant_chunk}
"""
    try:
        client = initialize_groq()
        # NOTE(review): confirm this model id is still served by Groq.
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="llama3-8b-8192"
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error generating answer: {e}"
# --- Gradio UI ---
def create_interface():
    """Build the Gradio Blocks UI for the document Q&A app.

    The layout has a PDF upload field, a question box, an answer box and an
    "Ask" button whose click runs ``answer_question`` on the upload and the
    question and writes the result into the answer box.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks() as demo:
        # The heading emoji had been stored as mis-decoded UTF-8 bytes;
        # it is restored to the intended document emoji here.
        gr.Markdown("## 📄 Legal Document Q&A\nUpload a PDF and ask questions based on its content.")
        file_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
        question_input = gr.Textbox(label="Your Question")
        answer_output = gr.Textbox(label="Answer")
        submit = gr.Button("Ask")
        submit.click(fn=answer_question, inputs=[file_input, question_input], outputs=answer_output)
    return demo
# Launch
if __name__ == "__main__":
    # Build the UI and start the Gradio server only when this file is run
    # as a script, not when it is imported.
    demo = create_interface()
    demo.launch()