"""Gradio app that answers questions about an uploaded PDF via a Groq-hosted LLM."""

import os

import gradio as gr
import pdfplumber
import pytesseract
from groq import Groq
from pdf2image import convert_from_path
from PIL import Image  # noqa: F401  (not referenced below; kept from the original imports)


# --- Helper Functions ---

def initialize_groq():
    """Return a Groq client built from the ``GROQ_API_KEY`` environment variable."""
    return Groq(api_key=os.getenv("GROQ_API_KEY"))


def clean_question(user_question):
    """Return ``user_question`` with a few known misspellings corrected.

    Replacement is plain substring substitution, so a misspelling is also
    replaced when it appears inside a longer word.
    """
    corrections = {"slaps": "slabs", "salried": "salaried"}
    for wrong, right in corrections.items():
        user_question = user_question.replace(wrong, right)
    return user_question


def _resolve_pdf_path(uploaded_file):
    """Return the filesystem path of an upload.

    The UI below declares ``gr.File(type="filepath")``, which hands the
    callback a plain string path; objects exposing ``.name`` are still
    accepted so existing callers that pass a file wrapper keep working.
    """
    if isinstance(uploaded_file, (str, os.PathLike)):
        return os.fspath(uploaded_file)
    return uploaded_file.name


def _extract_pdf_text(uploaded_file):
    """Extract the text of a PDF, falling back to OCR when no text layer exists.

    Unlike ``read_pdf``, this helper lets exceptions propagate so the caller
    can tell a failed read apart from real document content.
    """
    path = _resolve_pdf_path(uploaded_file)

    with pdfplumber.open(path) as pdf:
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Join pages with a newline so the last word of one page does not fuse
    # with the first word of the next (which would break word matching).
    full_text = "\n".join(text for text in page_texts if text)

    if not full_text.strip():
        # OCR fallback: rasterise each page and run Tesseract over it.
        images = convert_from_path(path)
        full_text = "\n".join(pytesseract.image_to_string(img) for img in images)

    return full_text.strip()


def read_pdf(uploaded_file):
    """Return the text of ``uploaded_file``, or an error string on failure.

    Args:
        uploaded_file: A path (``str`` / ``os.PathLike``) or an object whose
            ``.name`` attribute is the path of the PDF.

    Returns:
        The stripped document text. If anything goes wrong, a string of the
        form ``"Error reading PDF: <reason>"`` is returned instead of raising.
    """
    try:
        return _extract_pdf_text(uploaded_file)
    except Exception as e:
        return f"Error reading PDF: {e}"


def chunk_text(text, chunk_size=3000):
    """Split ``text`` into consecutive slices of at most ``chunk_size`` characters."""
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]


def similarity(query, text):
    """Return how many distinct lower-cased, whitespace-split words both strings share."""
    query_words = set(query.lower().split())
    text_words = set(text.lower().split())
    return len(query_words & text_words)


def retrieve_relevant_document(user_question, document_text):
    """Return the chunk of ``document_text`` sharing the most words with the question.

    Returns an empty string when the document yields no chunks. On ties the
    earliest chunk wins, because ``max`` keeps the first maximal element.
    """
    chunks = chunk_text(document_text)
    if not chunks:
        return ""
    return max(chunks, key=lambda chunk: similarity(user_question, chunk))


def answer_question(file, user_question):
    """Answer ``user_question`` from the uploaded PDF ``file``.

    Args:
        file: The upload supplied by the Gradio ``File`` component.
        user_question: The question typed by the user.

    Returns:
        The model's answer, or a human-readable message when the upload is
        missing, the question is blank, the PDF cannot be read or is empty,
        or the model call fails.
    """
    if not file:
        return "Please upload a PDF document."
    if not user_question or not user_question.strip():
        # Nothing to ask: skip the PDF parse and the API call entirely.
        return "Please enter a question."

    user_question = clean_question(user_question)

    # Use the raising helper rather than read_pdf() so a read failure is
    # reported to the user instead of being sent to the model as if the
    # error message were the document's content.
    try:
        document_text = _extract_pdf_text(file)
    except Exception as e:
        return f"Error reading PDF: {e}"

    if not document_text:
        return "❌ Document appears empty or unreadable. Please try a different file."

    relevant_chunk = retrieve_relevant_document(user_question, document_text)
    prompt = (
        "You are a tax/legal assistant. Read the following extract and "
        "answer the user's query.\n\n"
        f"User Question: {user_question}\n\n"
        f"Relevant Extract from Document:\n{relevant_chunk}\n"
    )

    # Broad catch is deliberate: this is the UI boundary, and any failure
    # (missing API key, network error, API error) should surface as text.
    try:
        client = initialize_groq()
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="llama3-8b-8192",
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error generating answer: {e}"


# --- Gradio UI ---

def create_interface():
    """Build and return the Gradio Blocks UI (upload, question box, answer box)."""
    with gr.Blocks() as demo:
        gr.Markdown(
            "## 📄 Legal Document Q&A\n"
            "Upload a PDF and ask questions based on its content."
        )
        file_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
        question_input = gr.Textbox(label="Your Question")
        answer_output = gr.Textbox(label="Answer")
        submit = gr.Button("Ask")
        submit.click(
            fn=answer_question,
            inputs=[file_input, question_input],
            outputs=answer_output,
        )
    return demo


# Launch
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()