Spaces:
Sleeping
Sleeping
| import fitz # PyMuPDF | |
| import json | |
| import gradio as gr | |
| from groq import Groq | |
| from dotenv import load_dotenv | |
| import os | |
# Merge variables from a local .env file (if present) into the process
# environment before the API key is looked up.
load_dotenv()
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Module-wide Groq client shared by every LLM request below.
client = Groq(api_key=GROQ_API_KEY)
def extract_form_fields(pdf_bytes: bytes) -> dict:
    """Collect the form widgets of a PDF into a name -> value mapping.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        A dict keyed by each widget's ``field_name``. A widget whose
        ``field_value`` is falsy is stored as ``""``. When several widgets
        share a name, the last one visited wins.

    Raises:
        ValueError: If ``pdf_bytes`` is empty/None, or if PyMuPDF fails to
            open it (the original error is chained as ``__cause__``).
    """
    if not pdf_bytes:
        raise ValueError("Uploaded file is empty or not a valid PDF.")

    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception as e:
        raise ValueError(f"Failed to open PDF: {e}") from e

    # Always release the document, even if widget iteration raises, so a
    # handle is not leaked on every call.
    try:
        form_fields = {}
        for page in doc:
            for widget in page.widgets():
                form_fields[widget.field_name] = widget.field_value or ""
        return form_fields
    finally:
        doc.close()
def get_pdf_text(pdf_bytes: bytes) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        The results of ``page.get_text()`` for each page joined together
        with no separator added between pages.

    Raises:
        ValueError: If ``pdf_bytes`` is empty/None, or if PyMuPDF fails to
            open it (the original error is chained as ``__cause__``).
    """
    if not pdf_bytes:
        raise ValueError("Uploaded file is empty or not a valid PDF.")

    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception as e:
        raise ValueError(f"Failed to open PDF: {e}") from e

    # Always release the document, even if text extraction raises, so a
    # handle is not leaked on every call.
    try:
        return "".join(page.get_text() for page in doc)
    finally:
        doc.close()
def get_field_details(form_fields, pdf_text, model="llama3-8b-8192"):
    """Ask the Groq chat model to describe each extracted form field.

    Args:
        form_fields: Mapping whose keys are the form field names; only the
            keys are sent to the model.
        pdf_text: Text extracted from the PDF, embedded in the prompt as
            context.
        model: Groq model identifier used for the completion.
            NOTE(review): hosted model ids can be retired by the provider;
            confirm the default is still served.

    Returns:
        The content of the first choice of the model's reply, returned
        as-is (it is requested as JSON but is not parsed or validated here).
        If the request or the response handling fails, a string starting
        with ``"Failed to analyze fields: "`` is returned instead of raising.
    """
    prompt = f"""
You are an expert at analyzing and auto-filling PDF form fields.
Here is the extracted PDF text:
{pdf_text}
Based on this, explain the meaning or expected value of each of the following fields in JSON format:
{json.dumps(list(form_fields), indent=2)}
Return your output in the following JSON format:
{{ "field_name_1": "description", "field_name_2": "description", ... }}
"""
    # The request itself is the most likely point of failure (network,
    # authentication, model errors), so it must sit inside the try for the
    # failure message below to be reachable in those cases.
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Failed to analyze fields: {str(e)}"
def analyze_form(pdf_file):
    """Run the full analysis for one uploaded PDF (Gradio callback).

    Args:
        pdf_file: The uploaded file, given either as a filesystem path
            (``str`` / ``os.PathLike``) or as an object exposing its path
            through a ``.name`` attribute. Falsy values mean "no upload".

    Returns:
        A 3-tuple ``(status, fields_json, explanation)``:
        the status message, the extracted fields dumped as indented JSON,
        and the text returned by ``get_field_details``. On a missing file
        or any error, the last two items are empty strings and the status
        carries the message. This function does not raise.
    """
    # NOTE(review): the leading "β" in the status strings looks like a
    # mis-decoded status emoji; confirm the intended glyph before changing.
    if not pdf_file:
        return "β No file provided.", "", ""

    try:
        # A plain path must be used directly: a str has no `.name`, and a
        # pathlib path's `.name` is only the final component, not the path.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = pdf_file
        else:
            pdf_path = pdf_file.name

        with open(pdf_path, 'rb') as f:
            pdf_bytes = f.read()

        fields = extract_form_fields(pdf_bytes)
        pdf_text = get_pdf_text(pdf_bytes)
        explanation = get_field_details(fields, pdf_text)
        return "β Analysis complete.", json.dumps(fields, indent=2), explanation
    except Exception as e:
        # Top-level UI boundary: report every failure in the status box
        # instead of letting the callback raise.
        return f"β Error: {str(e)}", "", ""
# --- Gradio UI -------------------------------------------------------------
# One file input feeding analyze_form, whose three return values map onto the
# three output components in order: status, extracted fields, descriptions.
# NOTE(review): the "π"/"π‘" prefixes in the strings below look like
# mis-decoded emoji; confirm the intended glyphs before changing them.
description = "π Upload a tax or registration form PDF. This tool extracts form fields and explains what each one likely means or requires using LLM."

iface = gr.Interface(
    title="π Form Field Analyzer",
    description=description,
    theme="default",
    fn=analyze_form,
    inputs=gr.File(label="Upload PDF", type="filepath"),
    outputs=[
        gr.Textbox(label="Status"),
        gr.Code(label="π Extracted Form Fields (JSON)", language="json"),
        gr.Code(label="π‘ Field Descriptions (JSON)", language="json"),
    ],
)

# Start the web app only when executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()