# Spaces: mehakkhan / Form-Field-Analyzer
# Status: Sleeping
# File size: 3,074 Bytes

import fitz  # PyMuPDF
import json
import gradio as gr
from groq import Groq
from dotenv import load_dotenv
import os

# Read variables from a .env file (if one exists) into the process environment,
# then pick up the Groq API key from there.
load_dotenv()
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Module-wide Groq client shared by the analysis functions below.
client = Groq(api_key=GROQ_API_KEY)

def extract_form_fields(pdf_bytes):
    """Collect the interactive form fields (widgets) of a PDF.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        A dict mapping each widget's field name to its current value. A
        falsy value (such as an unfilled field) is reported as "". When
        several widgets share a name, the last one visited wins.

    Raises:
        ValueError: If ``pdf_bytes`` is empty/None or the PDF cannot be
            opened.
    """
    if not pdf_bytes:
        raise ValueError("Uploaded file is empty or not a valid PDF.")

    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception as e:
        # Keep the original exception attached for debugging.
        raise ValueError(f"Failed to open PDF: {e}") from e

    # Always release the document, even if reading a page/widget fails.
    try:
        form_fields = {}
        for page in doc:
            for widget in page.widgets():
                form_fields[widget.field_name] = widget.field_value or ""
        return form_fields
    finally:
        doc.close()

def get_pdf_text(pdf_bytes):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        A single string made of each page's ``get_text()`` output joined
        without any separator.

    Raises:
        ValueError: If ``pdf_bytes`` is empty/None or the PDF cannot be
            opened.
    """
    if not pdf_bytes:
        raise ValueError("Uploaded file is empty or not a valid PDF.")

    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception as e:
        # Keep the original exception attached for debugging.
        raise ValueError(f"Failed to open PDF: {e}") from e

    # Always release the document, even if text extraction fails midway.
    try:
        return "".join(page.get_text() for page in doc)
    finally:
        doc.close()

def get_field_details(form_fields, pdf_text, model="llama3-8b-8192"):
    """Ask the LLM to describe what each form field means or expects.

    Args:
        form_fields: Mapping whose keys are the form field names; only the
            keys are sent to the model.
        pdf_text: Text extracted from the PDF, embedded in the prompt as
            context.
        model: Name of the Groq chat model to query.
            NOTE(review): the default model id may have been deprecated by
            Groq - confirm against the provider's current model list.

    Returns:
        The content of the model's first reply choice. If the request or
        the reply extraction raises, a string of the form
        "Failed to analyze fields: <error>" is returned instead of raising.
    """
    field_names = json.dumps(list(form_fields), indent=2)
    prompt = f"""
You are an expert at analyzing and auto-filling PDF form fields. 
Here is the extracted PDF text:
{pdf_text}

Based on this, explain the meaning or expected value of each of the following fields in JSON format:

{field_names}

Return your output in the following JSON format:
{{ "field_name_1": "description", "field_name_2": "description", ... }}
    """

    # The API request is the step that realistically fails (network, auth,
    # rate limit, unknown model), so it must sit inside the guarded region.
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Failed to analyze fields: {e}"

def analyze_form(pdf_file):
    """Run the full analysis for an uploaded PDF and format the UI outputs.

    Args:
        pdf_file: The uploaded file, either as a filesystem path
            (``str`` / ``os.PathLike``) or as an object exposing the path
            through a ``.name`` attribute.

    Returns:
        A 3-tuple ``(status, fields_json, explanation)``. On success,
        ``fields_json`` is the extracted fields as indented JSON and
        ``explanation`` is the text returned by ``get_field_details``.
        When no file is given or any step raises, the last two items are
        empty strings and ``status`` carries the error message.
    """
    if not pdf_file:
        return "❌ No file provided.", "", ""

    try:
        # A path-like value is used as-is; reading ``.name`` from it would
        # fail on a plain str and yield only the basename on pathlib.Path.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = os.fspath(pdf_file)
        else:
            pdf_path = pdf_file.name

        with open(pdf_path, "rb") as f:
            pdf_bytes = f.read()

        fields = extract_form_fields(pdf_bytes)
        pdf_text = get_pdf_text(pdf_bytes)
        explanation = get_field_details(fields, pdf_text)

        return "✅ Analysis complete.", json.dumps(fields, indent=2), explanation
    except Exception as e:
        return f"❌ Error: {str(e)}", "", ""

# Gradio UI: one PDF upload in; status line, extracted fields and LLM
# descriptions out.
description = (
    "📄 Upload a tax or registration form PDF. "
    "This tool extracts form fields and explains what each one likely means "
    "or requires using LLM."
)

iface = gr.Interface(
    fn=analyze_form,
    title="📄 Form Field Analyzer",
    description=description,
    theme="default",
    inputs=gr.File(label="Upload PDF", type="filepath"),
    outputs=[
        gr.Textbox(label="Status"),
        gr.Code(label="📋 Extracted Form Fields (JSON)", language="json"),
        gr.Code(label="💡 Field Descriptions (JSON)", language="json"),
    ],
)

# Start the web app only when executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()