mehakkhan committed on
Commit
65e500b
·
verified ·
1 Parent(s): 4b2df89

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -30
app.py CHANGED
@@ -1,27 +1,48 @@
1
  import fitz # PyMuPDF
2
- import os
3
  import json
4
  import gradio as gr
5
  from groq import Groq
 
 
 
 
 
 
6
 
7
- # πŸ” Initialize Groq Client
8
- client = Groq(api_key=os.getenv("GROQ_API_KEY"))
9
 
10
  def extract_form_fields(pdf_bytes):
11
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
 
 
 
 
 
 
 
12
  form_fields = {}
13
  for page in doc:
14
- widgets = page.widgets()
15
- if widgets:
16
- for widget in widgets:
17
- key = widget.field_name
18
- value = widget.field_value if widget.field_value else ""
19
- form_fields[key] = value
20
  return form_fields
21
 
22
  def get_pdf_text(pdf_bytes):
23
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
24
- return "\n".join([page.get_text() for page in doc])
 
 
 
 
 
 
 
 
 
 
25
 
26
  def get_field_details(form_fields, pdf_text):
27
  prompt = f"""
@@ -36,19 +57,26 @@ Based on this, explain the meaning or expected value of each of the following fi
36
  Return your output in the following JSON format:
37
  {{ "field_name_1": "description", "field_name_2": "description", ... }}
38
  """
 
39
  response = client.chat.completions.create(
40
  model="llama3-8b-8192",
41
  messages=[{"role": "user", "content": prompt}]
42
  )
43
 
44
- return response.choices[0].message.content
 
 
 
 
45
 
46
  def analyze_form(pdf_file):
47
  if not pdf_file:
48
  return "❌ No file provided.", "", ""
49
 
50
  try:
51
- pdf_bytes = pdf_file.read()
 
 
52
  fields = extract_form_fields(pdf_bytes)
53
  pdf_text = get_pdf_text(pdf_bytes)
54
  explanation = get_field_details(fields, pdf_text)
@@ -57,19 +85,21 @@ def analyze_form(pdf_file):
57
  except Exception as e:
58
  return f"❌ Error: {str(e)}", "", ""
59
 
60
- # 🎨 Gradio Interface
61
- with gr.Blocks() as demo:
62
- gr.Markdown("## 📄 Form Field Analyzer")
63
- gr.Markdown("Upload a **tax or registration PDF form**. This tool extracts fillable fields and describes what each might represent.")
64
-
65
- with gr.Row():
66
- file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
67
- analyze_btn = gr.Button("Analyze Form")
68
-
69
- status_output = gr.Textbox(label="Status")
70
- fields_output = gr.Code(label="📋 Extracted Fields (JSON)", language="json")
71
- desc_output = gr.Code(label="💡 Field Descriptions (LLM Response)", language="json")
72
-
73
- analyze_btn.click(fn=analyze_form, inputs=[file_input], outputs=[status_output, fields_output, desc_output])
74
-
75
- demo.launch()
 
 
 
1
  import fitz # PyMuPDF
 
2
  import json
3
  import gradio as gr
4
  from groq import Groq
5
+ from dotenv import load_dotenv
6
+ import os
7
+
8
+ # Load environment variables
9
+ load_dotenv()
10
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
11
 
12
+ # Initialize Groq Client
13
+ client = Groq(api_key=GROQ_API_KEY)
14
 
15
  def extract_form_fields(pdf_bytes):
16
+ if not pdf_bytes or len(pdf_bytes) == 0:
17
+ raise ValueError("Uploaded file is empty or not a valid PDF.")
18
+
19
+ try:
20
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
21
+ except Exception as e:
22
+ raise ValueError(f"Failed to open PDF: {e}")
23
+
24
  form_fields = {}
25
  for page in doc:
26
+ for widget in page.widgets():
27
+ key = widget.field_name
28
+ value = widget.field_value if widget.field_value else ""
29
+ form_fields[key] = value
30
+
 
31
  return form_fields
32
 
33
  def get_pdf_text(pdf_bytes):
34
+ if not pdf_bytes or len(pdf_bytes) == 0:
35
+ raise ValueError("Uploaded file is empty or not a valid PDF.")
36
+
37
+ try:
38
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
39
+ except Exception as e:
40
+ raise ValueError(f"Failed to open PDF: {e}")
41
+
42
+ text = ""
43
+ for page in doc:
44
+ text += page.get_text()
45
+ return text
46
 
47
  def get_field_details(form_fields, pdf_text):
48
  prompt = f"""
 
57
  Return your output in the following JSON format:
58
  {{ "field_name_1": "description", "field_name_2": "description", ... }}
59
  """
60
+
61
  response = client.chat.completions.create(
62
  model="llama3-8b-8192",
63
  messages=[{"role": "user", "content": prompt}]
64
  )
65
 
66
+ try:
67
+ explanation = response.choices[0].message.content
68
+ return explanation
69
+ except Exception as e:
70
+ return f"Failed to analyze fields: {str(e)}"
71
 
72
  def analyze_form(pdf_file):
73
  if not pdf_file:
74
  return "❌ No file provided.", "", ""
75
 
76
  try:
77
+ with open(pdf_file.name, 'rb') as f:
78
+ pdf_bytes = f.read()
79
+
80
  fields = extract_form_fields(pdf_bytes)
81
  pdf_text = get_pdf_text(pdf_bytes)
82
  explanation = get_field_details(fields, pdf_text)
 
85
  except Exception as e:
86
  return f"❌ Error: {str(e)}", "", ""
87
 
88
+ # Gradio Interface
89
+ description = "📄 Upload a tax or registration form PDF. This tool extracts form fields and explains what each one likely means or requires using LLM."
90
+
91
+ iface = gr.Interface(
92
+ fn=analyze_form,
93
+ inputs=gr.File(label="Upload PDF", type="filepath"),
94
+ outputs=[
95
+ gr.Textbox(label="Status"),
96
+ gr.Code(label="📋 Extracted Form Fields (JSON)", language="json"),
97
+ gr.Code(label="💡 Field Descriptions (JSON)", language="json"),
98
+ ],
99
+ title="📄 Form Field Analyzer",
100
+ description=description,
101
+ theme="default"
102
+ )
103
+
104
+ if __name__ == "__main__":
105
+ iface.launch()