# mehakkhan's picture
# Update app.py
# 5374478 verified
import fitz # PyMuPDF
import json
import gradio as gr
from groq import Groq
from dotenv import load_dotenv
import os
# Read settings from a .env file, then look up the Groq API key.
load_dotenv()
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Module-level Groq client, used by get_field_details for chat completions.
client = Groq(api_key=GROQ_API_KEY)
def extract_form_fields(pdf_bytes):
    """Collect the interactive form fields of a PDF into a dictionary.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        A dict mapping each widget's ``field_name`` to its ``field_value``.
        Falsy values (``None``, ``""``, ``False`` ...) are stored as ``""``.
        When several widgets share a name, the last one visited wins.

    Raises:
        ValueError: If ``pdf_bytes`` is empty/``None``, or if the document
            cannot be opened (the underlying error is chained as the cause).
    """
    if not pdf_bytes:
        raise ValueError("Uploaded file is empty or not a valid PDF.")
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception as e:
        # Chain the original error so the real failure stays in the traceback.
        raise ValueError(f"Failed to open PDF: {e}") from e

    form_fields = {}
    # Close the document as soon as the widgets are read; otherwise every
    # call leaves an open document behind.
    with doc:
        for page in doc:
            for widget in page.widgets():
                form_fields[widget.field_name] = widget.field_value or ""
    return form_fields
def get_pdf_text(pdf_bytes):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        A single string made of each page's ``get_text()`` output, joined
        without any separator.

    Raises:
        ValueError: If ``pdf_bytes`` is empty/``None``, or if the document
            cannot be opened (the underlying error is chained as the cause).
    """
    if not pdf_bytes:
        raise ValueError("Uploaded file is empty or not a valid PDF.")
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception as e:
        # Chain the original error so the real failure stays in the traceback.
        raise ValueError(f"Failed to open PDF: {e}") from e

    # Close the document once the text is extracted; join avoids repeated
    # string concatenation on documents with many pages.
    with doc:
        return "".join(page.get_text() for page in doc)
def get_field_details(form_fields, pdf_text):
    """Ask the Groq chat model to describe each extracted form field.

    Args:
        form_fields: Mapping whose keys are the form-field names to explain.
        pdf_text: Text of the PDF, embedded in the prompt as context.

    Returns:
        The message content of the first completion choice, or a string
        starting with "Failed to analyze fields: " when that content cannot
        be read from the response.
    """
    field_names_json = json.dumps(list(form_fields.keys()), indent=2)
    prompt = f"""
You are an expert at analyzing and auto-filling PDF form fields.
Here is the extracted PDF text:
{pdf_text}
Based on this, explain the meaning or expected value of each of the following fields in JSON format:
{field_names_json}
Return your output in the following JSON format:
{{ "field_name_1": "description", "field_name_2": "description", ... }}
"""
    chat_messages = [{"role": "user", "content": prompt}]
    # NOTE: the request itself is outside the try block, so errors raised by
    # the API call propagate to the caller.
    response = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=chat_messages,
    )
    try:
        return response.choices[0].message.content
    except Exception as exc:
        return f"Failed to analyze fields: {exc}"
def analyze_form(pdf_file):
    """Run the full analysis for one uploaded PDF and format it for the UI.

    Args:
        pdf_file: The uploaded file. May be a filesystem path (``str`` or
            ``os.PathLike``) or any object exposing the path as ``.name``.

    Returns:
        A 3-tuple ``(status, fields_json, explanation)``. On success
        ``fields_json`` is the extracted fields dumped as indented JSON and
        ``explanation`` is the output of ``get_field_details``. On any
        failure the status carries the error message and the other two
        items are empty strings.
    """
    if not pdf_file:
        return "❌ No file provided.", "", ""
    try:
        # Accept plain paths as well as file-like objects with ``.name``.
        # PathLike is checked first because ``Path.name`` is only the
        # basename, not the full path.
        if isinstance(pdf_file, os.PathLike):
            pdf_path = os.fspath(pdf_file)
        else:
            pdf_path = getattr(pdf_file, "name", pdf_file)
        with open(pdf_path, "rb") as f:
            pdf_bytes = f.read()
        fields = extract_form_fields(pdf_bytes)
        pdf_text = get_pdf_text(pdf_bytes)
        explanation = get_field_details(fields, pdf_text)
        return "✅ Analysis complete.", json.dumps(fields, indent=2), explanation
    except Exception as e:
        # UI boundary: report every failure in the status box, never raise.
        return f"❌ Error: {e}", "", ""
# Gradio interface: one PDF upload in; status, extracted fields (JSON) and
# the LLM's field descriptions out. The emoji were mojibake and are restored.
description = "📄 Upload a tax or registration form PDF. This tool extracts form fields and explains what each one likely means or requires using LLM."
iface = gr.Interface(
    fn=analyze_form,
    inputs=gr.File(label="Upload PDF", type="filepath"),
    outputs=[
        gr.Textbox(label="Status"),
        gr.Code(label="📋 Extracted Form Fields (JSON)", language="json"),
        gr.Code(label="💡 Field Descriptions (JSON)", language="json"),
    ],
    title="📄 Form Field Analyzer",
    description=description,
    theme="default",
)

# Start the web UI only when the file is run as a script.
if __name__ == "__main__":
    iface.launch()