Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import fitz
|
| 3 |
+
from PIL import Image
|
| 4 |
+
import pytesseract
|
| 5 |
+
import gradio as gr
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
# Tesseract options used for every OCR call in this file:
#   --psm 6                    page segmentation mode 6 (treat the image as a
#                              single uniform block of text)
#   tessedit_char_whitelist    restrict recognition to digits and , . - +
config_val = "--psm 6 -c tessedit_char_whitelist=0123456789,.-+"
|
| 10 |
+
|
| 11 |
+
# Field regions for Form 1040, pages 1 and 2.
#
# Every entry is a hand-picked quadrilateral given as four (x, y) vertices;
# get_bounding_rect() collapses it to an axis-aligned box before rendering.
# The ORDER of the entries matters: save_to_csv_flat() maps the OCR results to
# CSV columns purely by position (page 1 entries first, then page 2).
# Coordinates are presumably PDF page points -- TODO confirm.
page1_rects = [
    [(464, 399), (576, 399), (575, 409), (462, 410)],
    [(462, 519), (577, 518), (577, 531), (463, 529)],
    [(225, 517), (340, 518), (339, 530), (224, 530)],
    [(225, 530), (339, 532), (340, 541), (225, 542)],
    [(464, 531), (576, 531), (576, 542), (464, 542)],
    [(464, 589), (578, 589), (577, 602), (464, 602)],
    [(463, 624), (578, 626), (576, 639), (464, 637)],
    [(462, 652), (576, 651), (577, 661), (464, 663)],
    [(463, 661), (578, 664), (578, 676), (462, 674)],
    # NOTE(review): the first vertex below has y=699 where the other entries
    # suggest ~684; the bounding box is the same either way -- confirm.
    [(464, 699), (578, 684), (578, 699), (464, 699)],
]

page2_rects = [
    [(462, 15), (575, 15), (576, 26), (463, 26)],
    [(462, 62), (577, 63), (579, 75), (462, 73)],
    [(463, 98), (576, 98), (578, 110), (462, 110)],
    [(461, 111), (576, 111), (578, 123), (459, 122)],
]

# Field regions on the first page of Schedule 1 (used as-is, no y shift).
schedule1_rects = [
    [(470, 204), (579, 203), (577, 216), (471, 216)],  # Schedule 1 Line 3
    [(470, 228), (577, 229), (576, 240), (470, 240)],  # Schedule 1 Line 5
    [(362, 274), (466, 274), (468, 288), (360, 288)],  # Schedule 1 Line 8
]

# The Form 1040 regions are shifted 23 units down before use.
# NOTE(review): the reason for the 23-unit offset is not recorded in this file.
adjusted_page1_rects, adjusted_page2_rects = (
    [[(x, y + 23) for (x, y) in quad] for quad in table]
    for table in (page1_rects, page2_rects)
)
|
| 39 |
+
|
| 40 |
+
def get_bounding_rect(points):
    """Return the axis-aligned ``fitz.Rect`` that encloses *points*.

    Args:
        points: Sequence of ``(x, y)`` pairs (only indices 0 and 1 are read).

    Returns:
        ``fitz.Rect(min_x, min_y, max_x, max_y)``.
    """
    x_coords = tuple(point[0] for point in points)
    y_coords = tuple(point[1] for point in points)
    left, top = min(x_coords), min(y_coords)
    right, bottom = max(x_coords), max(y_coords)
    return fitz.Rect(left, top, right, bottom)
|
| 44 |
+
|
| 45 |
+
def extract_numeric_values(pdf_file, schedule1_file=None, client_name="Unknown Client"):
    """OCR the fixed Form 1040 fields (and optional Schedule 1 fields) and log them.

    The first two pages of the main PDF are rendered region by region using
    ``adjusted_page1_rects`` / ``adjusted_page2_rects``; if a Schedule 1 PDF is
    supplied, its first page is read using ``schedule1_rects``. The results are
    appended to the CSV via ``save_to_csv_flat``.

    Args:
        pdf_file: Path string, or a seekable binary file object, for the main
            Form 1040 PDF. Must contain at least two pages.
        schedule1_file: Optional path string or file object for Schedule 1.
        client_name: Label stored with the row; blank values fall back to
            "Unknown Client".

    Returns:
        A ``(text, csv_path)`` tuple. On success ``text`` is one line per
        extracted value and ``csv_path`` is the CSV file name. On any failure
        ``text`` is an error message and ``csv_path`` is ``None``.
    """
    if pdf_file is None:
        # Give a readable message instead of an AttributeError on ``.seek``.
        return "Error: Main Form 1040 PDF is required.", None

    # A blank name (presumably what an empty Gradio textbox sends) would
    # otherwise bypass the "Unknown Client" default.
    if not client_name or not str(client_name).strip():
        client_name = "Unknown Client"

    try:
        zoom = fitz.Matrix(2, 2)  # render each clip at 2x scale before OCR

        doc = _open_pdf(pdf_file)
        try:
            if len(doc) < 2:
                return "Error: Main PDF must have at least 2 pages.", None
            page1_values = _ocr_regions(doc[0], adjusted_page1_rects, zoom)
            page2_values = _ocr_regions(doc[1], adjusted_page2_rects, zoom)
        finally:
            # Close on every path, including the early return and OCR errors.
            doc.close()

        all_extracted_values = page1_values + page2_values
        output = [f"1040 Value {i+1}: {val}" for i, val in enumerate(all_extracted_values)]

        schedule1_values = []
        if schedule1_file:
            doc = _open_pdf(schedule1_file)
            try:
                if len(doc) >= 1:
                    schedule1_values = _ocr_regions(doc[0], schedule1_rects, zoom)
            finally:
                doc.close()
            # Form line numbers in the same order as schedule1_rects.
            schedule1_lines = (3, 5, 8)
            output += [
                f"Schedule 1 Line {line_no}: {val}"
                for line_no, val in zip(schedule1_lines, schedule1_values)
            ]

        save_to_csv_flat(all_extracted_values, schedule1_values, client_name=client_name)
        return "\n".join(output), "Client_Output_Data_Form_1040.csv"

    except Exception as e:
        # UI boundary: report the failure in the text output instead of raising.
        return f"Error occurred:\n{str(e)}", None


def _open_pdf(source):
    """Open *source* (a path string or a seekable binary file object) with PyMuPDF."""
    if isinstance(source, str):
        return fitz.open(source)
    source.seek(0)
    return fitz.open(stream=source.read(), filetype="pdf")


def _ocr_regions(page, regions, zoom):
    """Return the cleaned OCR text for each quadrilateral in *regions* on *page*."""
    values = []
    for rect_points in regions:
        rect = get_bounding_rect(rect_points)
        pix = page.get_pixmap(matrix=zoom, clip=rect)
        cropped_img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        w, h = cropped_img.size
        # Only the right-hand 60% of the clip is passed to Tesseract.
        val_img = cropped_img.crop((int(0.4 * w), 0, w, h))
        raw = pytesseract.image_to_string(val_img, config=config_val).strip()
        # Drop anything that is not a digit, comma, period, minus or plus.
        values.append(re.sub(r"[^\d,.\-+]", "", raw))
    return values
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def save_to_csv_flat(all_extracted_values, schedule1_values, client_name="Unknown Client", csv_path="Client_Output_Data_Form_1040.csv"):
    """Append one client's extracted values as a row of the flat CSV.

    Column names are built by joining a two-level header ("<title> - <line>").
    Form 1040 values are placed by position, Schedule 1 values by order
    (Line 3, Line 5, Line 8). Empty strings are stored as ``'0'``; columns
    with no source value (e.g. "Gross Comp - W2 Box 5") are left empty.

    Args:
        all_extracted_values: Form 1040 values ordered page 1 then page 2;
            indices 0-13 are used, missing trailing indices are left empty.
        schedule1_values: Up to three Schedule 1 values; may be empty or None.
        client_name: Value for the "Client Name" column.
        csv_path: CSV file to create or extend.
    """
    # Header components
    header_level_1 = [
        "Client Name", "Gross Comp", "Taxable Wages", "Taxable Interest Income: Sch. B", "Tax- Exempt Interest",
        "Qualified Dividends", "Ordinary Dividends", "Long Term Capital Gain or Loss",
        "Other Adjustments (from Schedule 1)", "Business Income or Loss (Schedule C)",
        "Rent/ Royalty (Schedule E)", "Other Income", "Standard Deduction", "Qualified Business Income Deduction",
        "Taxable Income", "Tax", "", "", "Total Tax",
    ]
    header_level_2 = [
        "", "W2 Box 5", "Line 1", "Line 2b", "Line 2a", "Line 3a", "Line 3b", "Line 7",
        "Line 10", "Schedule 1, Line 3", "Schedule 1, Line 5", "Schedule 1, Line 8",
        "Line 12", "Line 13", "Line 15", "Line 16", "Line 20, Schedule 3", "Line 23, Schedule 2", "Line 24",
    ]

    # Flatten headers for CSV: "h1 - h2" when both exist, otherwise whichever is set.
    flat_columns = [
        f"{h1.strip()} - {h2.strip()}" if h1.strip() and h2.strip()
        else (h1.strip() + h2.strip())
        for h1, h2 in zip(header_level_1, header_level_2)
    ]

    # Column name -> index into all_extracted_values.
    line_mapping = {
        "Taxable Wages - Line 1": 0,
        "Taxable Interest Income: Sch. B - Line 2b": 1,
        "Tax- Exempt Interest - Line 2a": 2,
        "Qualified Dividends - Line 3a": 3,
        "Ordinary Dividends - Line 3b": 4,
        "Long Term Capital Gain or Loss - Line 7": 5,
        "Other Adjustments (from Schedule 1) - Line 10": 6,
        "Standard Deduction - Line 12": 7,
        "Qualified Business Income Deduction - Line 13": 8,
        "Taxable Income - Line 15": 9,
        "Tax - Line 16": 10,
        "Line 20, Schedule 3": 11,
        "Line 23, Schedule 2": 12,
        "Total Tax - Line 24": 13,
    }
    schedule1_columns = (
        "Business Income or Loss (Schedule C) - Schedule 1, Line 3",
        "Rent/ Royalty (Schedule E) - Schedule 1, Line 5",
        "Other Income - Schedule 1, Line 8",
    )

    # Start from an all-empty row, then fill in what was extracted.
    new_row = dict.fromkeys(flat_columns)
    new_row[flat_columns[0]] = client_name

    for key, idx in line_mapping.items():
        if idx < len(all_extracted_values):
            value = all_extracted_values[idx]
            new_row[key] = value if value != '' else '0'

    # zip() tolerates fewer than three Schedule 1 values instead of raising
    # IndexError; columns without a value stay empty.
    for key, value in zip(schedule1_columns, schedule1_values or ()):
        new_row[key] = value if value != '' else '0'

    row_df = pd.DataFrame([new_row], columns=flat_columns)

    if os.path.exists(csv_path) and os.path.getsize(csv_path) > 0:
        # Read existing rows back verbatim: without dtype=str pandas re-infers
        # types on every append and rewrites earlier rows ("1234.50" -> 1234.5,
        # client "007" -> 7, "NA" -> empty).
        existing = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
        df = pd.concat([existing, row_df], ignore_index=True)
    else:
        # Missing or zero-byte file: start a fresh table with the header row.
        df = row_df

    df.to_csv(csv_path, index=False)
    print(f" Data saved to CSV: {csv_path}")
|
| 184 |
+
|
| 185 |
+
# Gradio UI
|
| 186 |
+
iface = gr.Interface(
    fn=extract_numeric_values,
    # Inputs are passed positionally: pdf_file, schedule1_file, client_name.
    inputs=[
        gr.File(label="Upload Main Form 1040 PDF (Required)", file_types=[".pdf"]),
        gr.File(label="Upload Schedule 1 PDF (Optional)", file_types=[".pdf"]),
        gr.Textbox(label="Client Name", placeholder="Enter client name"),
    ],
    # Outputs mirror the (text, csv_path) tuple returned by the function.
    outputs=[
        gr.Textbox(label="Extracted Numeric Values", lines=20),
        # The generated file is a CSV, so the label says so.
        gr.File(label="Download CSV Output"),
    ],
    title="Tax PDF Extractor",
    description="Upload Form 1040 (at least 2 pages). Optionally upload Schedule 1 for extra fields.",
)
|
| 200 |
+
|
| 201 |
+
# with gr.Blocks(title="Tax PDF Extractor") as demo:
|
| 202 |
+
# gr.Markdown("## Tax PDF Extractor")
|
| 203 |
+
# gr.Markdown("Upload Form 1040 (at least 2 pages). Optionally upload Schedule 1 for extra fields.")
|
| 204 |
+
|
| 205 |
+
# client_name = gr.Textbox(label="Client Name (Required)", placeholder="Enter your full name")
|
| 206 |
+
|
| 207 |
+
# form_1040 = gr.File(label="Upload Main Form 1040 PDF (Required)", file_types=[".pdf"])
|
| 208 |
+
|
| 209 |
+
# has_schedule1 = gr.Radio(
|
| 210 |
+
# choices=["Yes", "No"],
|
| 211 |
+
# label="Do you have Schedule 1?",
|
| 212 |
+
# value="No"
|
| 213 |
+
# )
|
| 214 |
+
|
| 215 |
+
# schedule1 = gr.File(label="Upload Schedule 1 PDF (Optional)", file_types=[".pdf"], visible=False)
|
| 216 |
+
|
| 217 |
+
# # Show/hide schedule1 upload box
|
| 218 |
+
# def toggle_schedule1(choice):
|
| 219 |
+
# return gr.update(visible=choice == "Yes")
|
| 220 |
+
|
| 221 |
+
# has_schedule1.change(fn=toggle_schedule1, inputs=has_schedule1, outputs=schedule1)
|
| 222 |
+
|
| 223 |
+
# output_text = gr.Textbox(label="Extracted Numeric Values", lines=20)
|
| 224 |
+
# output_file = gr.File(label="Download Excel Output")
|
| 225 |
+
|
| 226 |
+
# def wrapper_extract(main_pdf, schedule1_pdf, client_name):
|
| 227 |
+
# if not client_name:
|
| 228 |
+
# return "Error: Client name is required.", None
|
| 229 |
+
# return extract_numeric_values(main_pdf, schedule1_pdf)
|
| 230 |
+
|
| 231 |
+
# submit_btn = gr.Button("Extract Data")
|
| 232 |
+
|
| 233 |
+
# submit_btn.click(
|
| 234 |
+
# fn=wrapper_extract,
|
| 235 |
+
# inputs=[form_1040, schedule1, client_name],
|
| 236 |
+
# outputs=[output_text, output_file]
|
| 237 |
+
# )
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
# Start the Gradio app as soon as this module is executed (there is no
# __main__ guard); share=True asks Gradio for a public share link.
iface.launch(share=True)
|