Spaces:
Sleeping
Sleeping
File size: 3,074 Bytes
d177c4e 5374478 d177c4e 65e500b 5374478 65e500b 5374478 d177c4e 65e500b 5374478 d177c4e 5374478 d177c4e 5374478 d177c4e 5374478 65e500b d177c4e 5374478 d177c4e 65e500b 5374478 c1373aa 5374478 d177c4e 5374478 c1373aa 5374478 65e500b 5374478 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
import fitz # PyMuPDF
import json
import gradio as gr
from groq import Groq
from dotenv import load_dotenv
import os
# Load environment variables (python-dotenv; presumably from a local .env
# file) before the API key is read from the process environment.
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")  # None when the variable is unset
# Initialize the module-level Groq client used for the chat-completion call.
client = Groq(api_key=GROQ_API_KEY)
def extract_form_fields(pdf_bytes):
    """Collect the interactive form fields (widgets) of a PDF.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        A dict mapping each widget's ``field_name`` to its ``field_value``,
        with falsy values replaced by ``""``. When several widgets share a
        name, the last one encountered wins.

    Raises:
        ValueError: If ``pdf_bytes`` is empty/None or the data cannot be
            opened as a PDF.
    """
    if not pdf_bytes:
        raise ValueError("Uploaded file is empty or not a valid PDF.")
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception as e:
        # Keep the original exception chained for easier debugging.
        raise ValueError(f"Failed to open PDF: {e}") from e
    try:
        form_fields = {}
        for page in doc:
            for widget in page.widgets():
                form_fields[widget.field_name] = widget.field_value or ""
        return form_fields
    finally:
        # Always release the document, even if widget iteration fails.
        doc.close()
def get_pdf_text(pdf_bytes):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        A single string made of each page's ``get_text()`` output joined
        without a separator.

    Raises:
        ValueError: If ``pdf_bytes`` is empty/None or the data cannot be
            opened as a PDF.
    """
    if not pdf_bytes:
        raise ValueError("Uploaded file is empty or not a valid PDF.")
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception as e:
        # Keep the original exception chained for easier debugging.
        raise ValueError(f"Failed to open PDF: {e}") from e
    try:
        # join() avoids repeated string reallocation on long documents.
        return "".join(page.get_text() for page in doc)
    finally:
        # Always release the document, even if text extraction fails.
        doc.close()
def get_field_details(form_fields, pdf_text):
    """Ask the Groq chat model to describe each extracted form field.

    Args:
        form_fields: Mapping whose keys are the form field names.
        pdf_text: Text extracted from the PDF, embedded in the prompt.

    Returns:
        The message content of the first completion choice, or a
        "Failed to analyze fields: ..." string if reading the response
        raises. Errors raised by the API call itself propagate to the caller.
    """
    field_names_json = json.dumps(list(form_fields.keys()), indent=2)
    # The template lines are flush-left so no indentation leaks into the prompt.
    llm_prompt = f"""
You are an expert at analyzing and auto-filling PDF form fields.
Here is the extracted PDF text:
{pdf_text}
Based on this, explain the meaning or expected value of each of the following fields in JSON format:
{field_names_json}
Return your output in the following JSON format:
{{ "field_name_1": "description", "field_name_2": "description", ... }}
"""
    completion = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[{"role": "user", "content": llm_prompt}],
    )
    try:
        return completion.choices[0].message.content
    except Exception as err:
        return f"Failed to analyze fields: {err!s}"
def analyze_form(pdf_file):
    """Run the full analysis for one uploaded PDF (Gradio callback).

    Args:
        pdf_file: The uploaded file, either a path string or an object
            whose ``.name`` attribute holds the path.

    Returns:
        A 3-tuple ``(status, fields_json, explanation)``. On success
        ``fields_json`` is the extracted fields as indented JSON and
        ``explanation`` is the model's reply; on a missing file or any
        error the last two elements are empty strings and ``status``
        carries the message.
    """
    if not pdf_file:
        return "❌ No file provided.", "", ""
    try:
        # Accept both a plain path string and a file-like upload object
        # that exposes its path as ``.name``.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        with open(pdf_path, "rb") as f:
            pdf_bytes = f.read()
        fields = extract_form_fields(pdf_bytes)
        pdf_text = get_pdf_text(pdf_bytes)
        explanation = get_field_details(fields, pdf_text)
        return "✅ Analysis complete.", json.dumps(fields, indent=2), explanation
    except Exception as e:
        # UI boundary: report any failure in the status box instead of raising.
        return f"❌ Error: {str(e)}", "", ""
# Gradio interface: one PDF upload in; status text, extracted fields (JSON)
# and field descriptions out.
# NOTE(review): the leading "π" / "π‘" glyphs in the strings below look like
# mis-decoded emoji — confirm the intended characters and the file encoding.
description = "π Upload a tax or registration form PDF. This tool extracts form fields and explains what each one likely means or requires using LLM."
iface = gr.Interface(
    fn=analyze_form,
    inputs=gr.File(label="Upload PDF", type="filepath"),
    outputs=[
        # One component per element of the 3-tuple returned by analyze_form.
        gr.Textbox(label="Status"),
        gr.Code(label="π Extracted Form Fields (JSON)", language="json"),
        gr.Code(label="π‘ Field Descriptions (JSON)", language="json"),
    ],
    title="π Form Field Analyzer",
    description=description,
    theme="default"
)
# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|