import json
import os

import fitz  # PyMuPDF
import gradio as gr
from dotenv import load_dotenv
from groq import Groq

# Load environment variables
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Initialize Groq Client
client = Groq(api_key=GROQ_API_KEY)


def _open_pdf(pdf_bytes):
    """Open an in-memory PDF with PyMuPDF and return the document.

    Args:
        pdf_bytes: Raw bytes of the PDF file.

    Returns:
        The opened ``fitz`` document. The caller is responsible for closing
        it (e.g. by using it in a ``with`` statement).

    Raises:
        ValueError: If ``pdf_bytes`` is empty/None or PyMuPDF cannot open it.
    """
    if not pdf_bytes:
        raise ValueError("Uploaded file is empty or not a valid PDF.")
    try:
        return fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception as e:
        # Normalize every open failure to ValueError, keeping the cause.
        raise ValueError(f"Failed to open PDF: {e}") from e


def _upload_path(pdf_file):
    """Return the filesystem path for an uploaded-file value.

    Accepts either a path (``str`` / ``os.PathLike``) or an object that
    exposes the path through a ``.name`` attribute, so the handler works
    regardless of which of the two the UI layer hands over.
    """
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file)
    return pdf_file.name


def extract_form_fields(pdf_bytes):
    """Collect the form widgets of a PDF as a ``{field_name: value}`` dict.

    Args:
        pdf_bytes: Raw bytes of the PDF file.

    Returns:
        A dict mapping each widget's field name to its current value; falsy
        values (including a missing value) are stored as ``""``. If several
        widgets share a name, the last one encountered wins.

    Raises:
        ValueError: If the bytes are empty or cannot be opened as a PDF.
    """
    form_fields = {}
    # The context manager closes the document even if iteration fails.
    with _open_pdf(pdf_bytes) as doc:
        for page in doc:
            for widget in page.widgets():
                form_fields[widget.field_name] = widget.field_value or ""
    return form_fields


def get_pdf_text(pdf_bytes):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_bytes: Raw bytes of the PDF file.

    Returns:
        A single string with the output of ``page.get_text()`` for all pages.

    Raises:
        ValueError: If the bytes are empty or cannot be opened as a PDF.
    """
    with _open_pdf(pdf_bytes) as doc:
        return "".join(page.get_text() for page in doc)


def get_field_details(form_fields, pdf_text):
    """Ask the LLM to describe each form field, given the PDF's text.

    Args:
        form_fields: Dict of form fields; only its keys are sent to the model.
        pdf_text: Text extracted from the PDF, embedded in the prompt as-is.

    Returns:
        The raw message content of the first completion choice (it is not
        parsed or validated as JSON here), or a ``"Failed to analyze
        fields: ..."`` string if the response does not have the expected
        shape.

    Note:
        Errors raised by the API call itself are not handled here; they
        propagate to the caller.
    """
    prompt = f"""
You are an expert at analyzing and auto-filling PDF form fields.

Here is the extracted PDF text:
{pdf_text}

Based on this, explain the meaning or expected value of each of the following fields in JSON format:

{json.dumps(list(form_fields.keys()), indent=2)}

Return your output in the following JSON format:
{{
  "field_name_1": "description",
  "field_name_2": "description",
  ...
}}
"""
    response = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[{"role": "user", "content": prompt}],
    )
    try:
        return response.choices[0].message.content
    except (AttributeError, IndexError, TypeError) as e:
        # Response is missing choices/message; report instead of crashing.
        return f"Failed to analyze fields: {str(e)}"


def analyze_form(pdf_file):
    """Gradio handler: extract a PDF's form fields and have the LLM explain them.

    Args:
        pdf_file: The uploaded file, either as a path or as an object whose
            ``.name`` attribute is the path. A falsy value means no upload.

    Returns:
        A 3-tuple ``(status, fields_json, explanation)``. On any failure the
        status carries the error message and the other two items are ``""``.
    """
    if not pdf_file:
        return "❌ No file provided.", "", ""
    # Top-level UI boundary: surface any failure as a status message.
    try:
        with open(_upload_path(pdf_file), "rb") as f:
            pdf_bytes = f.read()
        fields = extract_form_fields(pdf_bytes)
        pdf_text = get_pdf_text(pdf_bytes)
        explanation = get_field_details(fields, pdf_text)
        return "✅ Analysis complete.", json.dumps(fields, indent=2), explanation
    except Exception as e:
        return f"❌ Error: {str(e)}", "", ""


# Gradio Interface
description = "📄 Upload a tax or registration form PDF. This tool extracts form fields and explains what each one likely means or requires using LLM."

iface = gr.Interface(
    fn=analyze_form,
    inputs=gr.File(label="Upload PDF", type="filepath"),
    outputs=[
        gr.Textbox(label="Status"),
        gr.Code(label="📋 Extracted Form Fields (JSON)", language="json"),
        gr.Code(label="💡 Field Descriptions (JSON)", language="json"),
    ],
    title="📄 Form Field Analyzer",
    description=description,
    theme="default",
)

if __name__ == "__main__":
    iface.launch()