mehakkhan committed on
Commit
c1373aa
·
verified ·
1 Parent(s): 1db0e5a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -61
app.py CHANGED
@@ -1,44 +1,29 @@
 
1
  import fitz # PyMuPDF
2
  import json
3
- import gradio as gr
4
  from groq import Groq
5
  from dotenv import load_dotenv
6
  import os
7
 
8
- # Load environment variables
9
  load_dotenv()
10
- GROQ_API_KEY = os.getenv("GROQ_API_KEY")
11
 
12
  # Initialize Groq Client
13
- client = Groq(api_key=GROQ_API_KEY)
14
 
15
  def extract_form_fields(pdf_bytes):
16
- if not pdf_bytes or len(pdf_bytes) == 0:
17
- raise ValueError("Uploaded file is empty or not a valid PDF.")
18
-
19
- try:
20
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
21
- except Exception as e:
22
- raise ValueError(f"Failed to open PDF: {e}")
23
-
24
  form_fields = {}
25
  for page in doc:
26
- for widget in page.widgets():
27
- key = widget.field_name
28
- value = widget.field_value if widget.field_value else ""
29
- form_fields[key] = value
30
-
 
31
  return form_fields
32
 
33
  def get_pdf_text(pdf_bytes):
34
- if not pdf_bytes or len(pdf_bytes) == 0:
35
- raise ValueError("Uploaded file is empty or not a valid PDF.")
36
-
37
- try:
38
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
39
- except Exception as e:
40
- raise ValueError(f"Failed to open PDF: {e}")
41
-
42
  text = ""
43
  for page in doc:
44
  text += page.get_text()
@@ -57,53 +42,97 @@ Based on this, explain the meaning or expected value of each of the following fi
57
  Return your output in the following JSON format:
58
  {{ "field_name_1": "description", "field_name_2": "description", ... }}
59
  """
60
-
61
  response = client.chat.completions.create(
62
  model="llama3-8b-8192",
63
  messages=[{"role": "user", "content": prompt}]
64
  )
 
 
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  try:
67
- explanation = response.choices[0].message.content
68
- return explanation
69
- except Exception as e:
70
- return f"Failed to analyze fields: {str(e)}"
 
 
71
 
72
- def analyze_form(pdf_file):
73
- if not pdf_file:
74
- return "❌ No file provided.", "", ""
75
 
 
76
  try:
77
- with open(pdf_file.name, 'rb') as f:
78
- pdf_bytes = f.read()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
- fields = extract_form_fields(pdf_bytes)
81
- pdf_text = get_pdf_text(pdf_bytes)
82
- explanation = get_field_details(fields, pdf_text)
 
83
 
84
- field_output = (
85
- json.dumps(fields, indent=2)
86
- if fields else "⚠️ No form fields detected in this PDF."
 
87
  )
88
- return "✅ Analysis complete.", field_output, explanation
89
- except Exception as e:
90
- return f"❌ Error: {str(e)}", "", ""
91
-
92
- # Gradio Interface
93
- description = "📄 Upload a tax or registration form PDF. This tool extracts form fields and explains what each one likely means or requires using LLM."
94
-
95
- iface = gr.Interface(
96
- fn=analyze_form,
97
- inputs=gr.File(label="Upload PDF", type="filepath"),
98
- outputs=[
99
- gr.Textbox(label="Status"),
100
- gr.Code(label="📋 Extracted Form Fields (JSON)", language="json"),
101
- gr.Code(label="💡 Field Descriptions (JSON)", language="json"),
102
- ],
103
- title="📄 Form Field Analyzer",
104
- description=description,
105
- theme="default"
106
- )
107
 
108
  if __name__ == "__main__":
109
- iface.launch()
 
1
+ import gradio as gr
2
  import fitz # PyMuPDF
3
  import json
 
4
  from groq import Groq
5
  from dotenv import load_dotenv
6
  import os
7
 
 
8
  load_dotenv()
 
9
 
10
  # Initialize Groq Client
11
+ client = Groq(api_key=os.getenv("GROQ_API_KEY"))
12
 
13
  def extract_form_fields(pdf_bytes):
14
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
 
 
 
 
 
 
 
15
  form_fields = {}
16
  for page in doc:
17
+ widgets = page.widgets()
18
+ if widgets:
19
+ for widget in widgets:
20
+ key = widget.field_name
21
+ value = widget.field_value if widget.field_value else ""
22
+ form_fields[key] = value
23
  return form_fields
24
 
25
  def get_pdf_text(pdf_bytes):
26
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
 
 
 
 
 
 
 
27
  text = ""
28
  for page in doc:
29
  text += page.get_text()
 
42
  Return your output in the following JSON format:
43
  {{ "field_name_1": "description", "field_name_2": "description", ... }}
44
  """
 
45
  response = client.chat.completions.create(
46
  model="llama3-8b-8192",
47
  messages=[{"role": "user", "content": prompt}]
48
  )
49
+ explanation = response.choices[0].message.content
50
+ return explanation
51
 
52
+ def fill_pdf_fields(pdf_bytes, field_values):
53
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
54
+ for page in doc:
55
+ widgets = page.widgets()
56
+ if widgets:
57
+ for widget in widgets:
58
+ key = widget.field_name
59
+ if key in field_values:
60
+ widget.field_value = field_values[key]
61
+ widget.update()
62
+ return doc.write()
63
+
64
+ def analyze_pdf(file):
65
+ if file is None:
66
+ return "⚠️ Please upload a valid PDF.", "", {}, None
67
+
68
+ pdf_bytes = file.read()
69
  try:
70
+ fields = extract_form_fields(pdf_bytes)
71
+ text = get_pdf_text(pdf_bytes)
72
+ explanation = get_field_details(fields, text)
73
+
74
+ if not fields:
75
+ return "⚠️ No form fields detected in the PDF.", "", {}, None
76
 
77
+ return json.dumps(fields, indent=2), explanation, fields, pdf_bytes
78
+ except Exception as e:
79
+ return f"Error: {str(e)}", "", {}, None
80
 
81
+ def fill_and_download(field_inputs, original_pdf_bytes):
82
  try:
83
+ filled_pdf_bytes = fill_pdf_fields(original_pdf_bytes, field_inputs)
84
+ return ("filled_form.pdf", filled_pdf_bytes)
85
+ except Exception as e:
86
+ return f"Error filling PDF: {str(e)}"
87
+
88
+ def app():
89
+ with gr.Blocks() as demo:
90
+ gr.Markdown("## 📄 Smart Form Filler & Analyzer")
91
+
92
+ with gr.Row():
93
+ file_input = gr.File(label="Upload a PDF", file_types=[".pdf"])
94
+ analyze_btn = gr.Button("🔍 Analyze PDF")
95
+
96
+ extract_output = gr.Code(label="📋 Extracted Form Fields")
97
+ description_output = gr.Code(label="💡 Field Descriptions")
98
+
99
+ inputs_group = gr.Group(visible=False)
100
+ field_values = {}
101
+ input_boxes = []
102
+
103
+ original_pdf_bytes = gr.State()
104
+ filled_pdf_output = gr.File(label="⬇️ Download Filled PDF")
105
+ fill_btn = gr.Button("📝 Fill and Download PDF")
106
+
107
+ def display_fields_and_inputs(file):
108
+ extracted, desc, fields, pdf_bytes = analyze_pdf(file)
109
+ inputs_group.visible = bool(fields)
110
+ input_boxes.clear()
111
+ if fields:
112
+ for key, value in fields.items():
113
+ box = gr.Textbox(label=key, value=value, interactive=True)
114
+ input_boxes.append(box)
115
+ inputs_group.children = input_boxes
116
+ return extracted, desc, inputs_group.update(visible=True), pdf_bytes
117
+
118
+ analyze_btn.click(
119
+ fn=display_fields_and_inputs,
120
+ inputs=[file_input],
121
+ outputs=[extract_output, description_output, inputs_group, original_pdf_bytes]
122
+ )
123
 
124
+ def get_filled_pdf(*args):
125
+ field_values = {box.label: value for box, value in zip(input_boxes, args[:-1])}
126
+ pdf_bytes = args[-1]
127
+ return fill_and_download(field_values, pdf_bytes)
128
 
129
+ fill_btn.click(
130
+ fn=get_filled_pdf,
131
+ inputs=input_boxes + [original_pdf_bytes],
132
+ outputs=[filled_pdf_output]
133
  )
134
+
135
+ return demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
  if __name__ == "__main__":
138
+ app().launch()