Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import os | |
| from groq import Groq | |
| import pdfplumber | |
| import pytesseract | |
| from PIL import Image | |
| from pdf2image import convert_from_path | |
| # --- Helper Functions --- | |
def initialize_groq():
    """Build a Groq client from the GROQ_API_KEY environment variable."""
    api_key = os.getenv("GROQ_API_KEY")
    return Groq(api_key=api_key)
def clean_question(user_question):
    """Normalize common misspellings in the user's question.

    Applies a fixed set of typo -> correction substitutions and returns
    the corrected question text.
    """
    # Known typo/correction pairs observed in real user input.
    for typo, fix in (("slaps", "slabs"), ("salried", "salaried")):
        user_question = user_question.replace(typo, fix)
    return user_question
def read_pdf(uploaded_file):
    """Extract all text from an uploaded PDF, falling back to OCR for scanned pages.

    Accepts either a filesystem path string (what ``gr.File(type="filepath")``
    actually passes) or a file-like object exposing a ``.name`` attribute.
    The original code only handled ``.name``, so a plain path raised
    AttributeError, which was silently swallowed into the error string.

    Returns:
        The stripped extracted text, or an ``"Error reading PDF: ..."``
        message string on any failure (callers display it to the user).
    """
    # Support both a plain path string and an uploaded-file object.
    path = getattr(uploaded_file, "name", uploaded_file)
    try:
        with pdfplumber.open(path) as pdf:
            full_text = ""
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    # Newline between pages so the last word of one page
                    # is not fused with the first word of the next.
                    full_text += text + "\n"
        if not full_text.strip():
            # No embedded text layer -> likely a scanned PDF; OCR each page.
            images = convert_from_path(path)
            full_text = ""
            for img in images:
                full_text += pytesseract.image_to_string(img)
        return full_text.strip()
    except Exception as e:
        # Best-effort: the error string doubles as a user-facing message.
        return f"Error reading PDF: {e}"
def chunk_text(text, chunk_size=3000):
    """Split *text* into consecutive chunks of at most *chunk_size* characters."""
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        start += chunk_size
    return chunks
def similarity(query, text):
    """Score overlap between *query* and *text* as the count of shared
    lowercase words (whitespace-delimited, case-insensitive)."""
    shared = set(query.lower().split()).intersection(text.lower().split())
    return len(shared)
def retrieve_relevant_document(user_question, document_text):
    """Return the document chunk sharing the most words with the question.

    Returns "" when the document yields no chunks. Ties keep the earliest
    chunk, matching max()'s first-winner semantics.
    """
    chunks = chunk_text(document_text)
    if not chunks:
        return ""
    best_chunk = chunks[0]
    best_score = similarity(user_question, best_chunk)
    for candidate in chunks[1:]:
        score = similarity(user_question, candidate)
        if score > best_score:
            best_chunk, best_score = candidate, score
    return best_chunk
def answer_question(file, user_question):
    """Answer *user_question* grounded in the uploaded PDF via the Groq API.

    Pipeline: typo-clean the question, extract the PDF text, pick the most
    relevant chunk by word overlap, and ask the LLM with that chunk inlined.

    Returns:
        The model's answer, or a human-readable error message string.
    """
    if not file:
        return "Please upload a PDF document."
    user_question = clean_question(user_question)
    document_text = read_pdf(file)
    # read_pdf signals failure with an error string, which is truthy and
    # previously slipped past the emptiness check and was sent to the LLM
    # as document content. Surface it to the user instead.
    if document_text.startswith("Error reading PDF:"):
        return document_text
    if not document_text:
        return "β Document appears empty or unreadable. Please try a different file."
    relevant_chunk = retrieve_relevant_document(user_question, document_text)
    prompt = f"""You are a tax/legal assistant. Read the following extract and answer the user's query.
User Question: {user_question}
Relevant Extract from Document:
{relevant_chunk}
"""
    try:
        client = initialize_groq()
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="llama3-8b-8192"
        )
        return response.choices[0].message.content
    except Exception as e:
        # Network/auth/model errors become a user-visible message.
        return f"Error generating answer: {e}"
| # --- Gradio UI --- | |
def create_interface():
    """Assemble and return the Gradio Blocks UI for the PDF Q&A app."""
    with gr.Blocks() as demo:
        gr.Markdown("## π Legal Document Q&A\nUpload a PDF and ask questions based on its content.")
        # Inputs: the PDF (passed to answer_question as a filepath) and the question.
        pdf_upload = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
        question_box = gr.Textbox(label="Your Question")
        # Output: the model's answer (or an error message string).
        answer_box = gr.Textbox(label="Answer")
        ask_button = gr.Button("Ask")
        ask_button.click(
            fn=answer_question,
            inputs=[pdf_upload, question_box],
            outputs=answer_box,
        )
    return demo
| # Launch | |
# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    create_interface().launch()