"""Gradio app that extracts PDF form fields and asks a Groq-hosted LLM to describe them."""
import fitz  # PyMuPDF
import json
import gradio as gr
from groq import Groq
from dotenv import load_dotenv
import os

# Run load_dotenv() before the lookup below so any variables it loads are
# already present in the process environment.
load_dotenv()
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Module-wide Groq client; get_field_details() sends its chat request through it.
client = Groq(api_key=GROQ_API_KEY)

def extract_form_fields(pdf_bytes):
    """Collect the interactive form fields (widgets) found in a PDF.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        A dict mapping each widget's ``field_name`` to its ``field_value``.
        Falsy values are stored as ``""``. If several widgets share a name,
        the last one visited wins.

    Raises:
        ValueError: If ``pdf_bytes`` is empty or ``None``, or if the data
            cannot be opened as a PDF.
    """
    if not pdf_bytes:
        raise ValueError("Uploaded file is empty or not a valid PDF.")

    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception as e:
        # Chain the cause so the underlying PyMuPDF error stays in the traceback.
        raise ValueError(f"Failed to open PDF: {e}") from e

    form_fields = {}
    # Close the document as soon as the widgets have been read, even if
    # iterating the pages raises, instead of waiting for garbage collection.
    with doc:
        for page in doc:
            for widget in page.widgets():
                form_fields[widget.field_name] = widget.field_value or ""

    return form_fields

def get_pdf_text(pdf_bytes):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        A single string made of each page's ``get_text()`` output, joined
        without any separator.

    Raises:
        ValueError: If ``pdf_bytes`` is empty or ``None``, or if the data
            cannot be opened as a PDF.
    """
    if not pdf_bytes:
        raise ValueError("Uploaded file is empty or not a valid PDF.")

    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception as e:
        # Chain the cause so the underlying PyMuPDF error stays in the traceback.
        raise ValueError(f"Failed to open PDF: {e}") from e

    # Close the document once the text is extracted, even if a page fails;
    # join the page texts in one pass instead of repeated string +=.
    with doc:
        return "".join(page.get_text() for page in doc)

def get_field_details(form_fields, pdf_text):
    """Ask the chat model to describe each extracted form field.

    Args:
        form_fields: Mapping of field names to values; only the keys are
            sent to the model.
        pdf_text: Text extracted from the PDF, embedded in the prompt as
            context.

    Returns:
        The content of the first choice in the model's response, which the
        prompt asks to be JSON (it is not validated here). If reading that
        content from the response object fails, a
        ``"Failed to analyze fields: ..."`` message is returned instead.

    Note:
        The API request is issued outside the ``try`` block, so any error
        raised by the request itself propagates to the caller.
    """
    field_names_json = json.dumps(list(form_fields.keys()), indent=2)
    prompt = f"""
You are an expert at analyzing and auto-filling PDF form fields. 
Here is the extracted PDF text:
{pdf_text}

Based on this, explain the meaning or expected value of each of the following fields in JSON format:

{field_names_json}

Return your output in the following JSON format:
{{ "field_name_1": "description", "field_name_2": "description", ... }}
    """

    chat_messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=chat_messages,
    )

    # Only unpacking the response is guarded here.
    try:
        first_choice = response.choices[0]
        return first_choice.message.content
    except Exception as err:
        return f"Failed to analyze fields: {err}"

def analyze_form(pdf_file):
    """Gradio callback: read the uploaded PDF, extract its fields and explain them.

    Args:
        pdf_file: The upload as delivered by the ``gr.File`` input. This is
            either a filesystem path (``str`` / ``os.PathLike``), which is
            what ``type="filepath"`` provides, or a file-like wrapper that
            exposes the path as ``.name``.

    Returns:
        A ``(status, fields_json, explanation)`` tuple of strings. On
        success ``fields_json`` is the extracted fields as indented JSON and
        ``explanation`` is the model's answer. On any failure, or when no
        file is given, the last two items are empty strings and ``status``
        carries the error message.
    """
    if not pdf_file:
        return "❌ No file provided.", "", ""

    try:
        # A plain path has no ``.name`` attribute, so resolve the path first
        # rather than assuming a tempfile-style wrapper object.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = pdf_file
        else:
            pdf_path = pdf_file.name

        with open(pdf_path, 'rb') as f:
            pdf_bytes = f.read()

        fields = extract_form_fields(pdf_bytes)
        pdf_text = get_pdf_text(pdf_bytes)
        explanation = get_field_details(fields, pdf_text)

        return "✅ Analysis complete.", json.dumps(fields, indent=2), explanation
    except Exception as e:
        # UI boundary: report every failure in the status box instead of
        # letting the exception escape the callback.
        return f"❌ Error: {str(e)}", "", ""

# Gradio Interface
# The emoji below were mis-decoded multi-byte sequences in the original
# literals; they are restored to the intended characters.
description = "📄 Upload a tax or registration form PDF. This tool extracts form fields and explains what each one likely means or requires using LLM."

# One file input; three outputs matching the (status, fields_json, explanation)
# tuple returned by analyze_form, in that order.
iface = gr.Interface(
    fn=analyze_form,
    inputs=gr.File(label="Upload PDF", type="filepath"),
    outputs=[
        gr.Textbox(label="Status"),
        gr.Code(label="📋 Extracted Form Fields (JSON)", language="json"),
        gr.Code(label="💡 Field Descriptions (JSON)", language="json"),
    ],
    title="📄 Form Field Analyzer",
    description=description,
    theme="default"
)

# Start the web UI only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()