Shami96's picture
Update app.py
1804090 verified
raw
history blame
2.09 kB
import gradio as gr
import os
from docx import Document
from utils import extract_text_from_pdf, parse_pdf_to_dict
from docx.shared import RGBColor
from docx import Document
from docx.shared import RGBColor
from difflib import get_close_matches
def is_red_color(run):
    """Report whether *run* has an explicit, strongly red font colour.

    The colour is read from ``run.font.color.rgb``; its first three items
    are taken as the red, green and blue channels.

    Returns:
        ``True`` when red is above 180 and both green and blue are below
        100. ``False`` when the colour object is ``None``, when its ``rgb``
        value is falsy, or when the channels fall outside that window.
    """
    font_color = run.font.color
    if font_color is None:
        return False

    rgb = font_color.rgb
    if not rgb:
        return False

    red, green, blue = rgb[0], rgb[1], rgb[2]
    # Accept any strong red tint, not just pure FF0000.
    return red > 180 and green < 100 and blue < 100
def replace_red_text_with_data(doc_path, data_dict):
    """Fill the red-coloured runs of a Word template with values from *data_dict*.

    The stripped, lowercased text of every red run (as judged by
    ``is_red_color``) is fuzzy-matched against the lowercased keys of
    *data_dict* using ``difflib.get_close_matches`` with a 0.6 cutoff.
    When a key matches, the run's text is replaced with that key's value.

    Args:
        doc_path: Passed unchanged to ``Document()`` to open the template.
        data_dict: Mapping of field label (string keys) to replacement value.

    Returns:
        The modified ``Document`` object. Nothing is saved here.
    """
    doc = Document(doc_path)

    # Build the case-insensitive lookup once instead of once per run.
    # setdefault keeps the FIRST key when two keys differ only by case,
    # which is the key a front-to-back scan of data_dict would select.
    keys_by_lower = {}
    for key in data_dict:
        keys_by_lower.setdefault(key.lower(), key)
    candidates = list(keys_by_lower)

    # NOTE(review): only doc.paragraphs is walked. In python-docx, paragraphs
    # inside tables, headers and footers are reached through other
    # accessors, so red text there is presumably left untouched; confirm
    # against the templates in use.
    for para in doc.paragraphs:
        for run in para.runs:
            if not is_red_color(run):
                continue

            label = run.text.strip().lower()
            if not label:
                # A whitespace-only red run carries no label to match.
                continue

            matches = get_close_matches(label, candidates, n=1, cutoff=0.6)
            if not matches:
                continue

            value = data_dict[keys_by_lower[matches[0]]]
            if value is None:
                # No data for this field: keep the red placeholder visible
                # rather than writing the literal text "None".
                continue
            # Run text is expected to be a string; parsed values may be
            # numbers or other objects, so coerce instead of failing.
            run.text = value if isinstance(value, str) else str(value)
    return doc
def _uploaded_path(upload):
    """Return the filesystem path behind an uploaded-file value."""
    if isinstance(upload, (str, os.PathLike)):
        return upload
    # Assumption: some Gradio versions pass a temp-file wrapper whose
    # ``name`` attribute is the path; confirm against the installed
    # version. Anything without ``name`` is passed through unchanged.
    return getattr(upload, "name", upload)


def process_files(pdf_file, template_docx):
    """Build a filled Word report from an uploaded PDF and a .docx template.

    Text is extracted from the PDF with ``extract_text_from_pdf``, parsed
    into a dict with ``parse_pdf_to_dict``, and used by
    ``replace_red_text_with_data`` to fill the template. The result is
    saved as ``filled_output.docx`` inside a fresh temporary directory.

    Args:
        pdf_file: Uploaded PDF (a path, or an object exposing ``name``).
        template_docx: Uploaded Word template (same forms as *pdf_file*).

    Returns:
        Path of the saved ``filled_output.docx``.

    Raises:
        ValueError: If either upload is missing (``None``).
    """
    # Function-scope import keeps this block self-contained.
    import tempfile

    if pdf_file is None or template_docx is None:
        raise ValueError(
            "Both a PDF report and a Word template (.docx) must be uploaded."
        )

    pdf_path = _uploaded_path(pdf_file)
    doc_path = _uploaded_path(template_docx)

    # Extract and parse the PDF.
    raw_text = extract_text_from_pdf(pdf_path)
    data_dict = parse_pdf_to_dict(raw_text)

    # Replace red text with the parsed data.
    final_doc = replace_red_text_with_data(doc_path, data_dict)

    # Save into a per-call directory: a single fixed path in the working
    # directory would be overwritten when two requests overlap. The file
    # name itself is kept so the download keeps a stable, readable name.
    out_dir = tempfile.mkdtemp(prefix="audit_report_")
    output_path = os.path.join(out_dir, "filled_output.docx")
    final_doc.save(output_path)
    return output_path
# Gradio front end: two uploads in (PDF report + Word template), one
# generated .docx out, all handled by process_files.
demo = gr.Interface(
    title="Audit Report Generator",
    description=(
        "Upload a PDF and a Word template. This tool will auto-fill "
        "red-highlighted fields with data from the PDF."
    ),
    fn=process_files,
    inputs=[
        gr.File(file_types=[".pdf"], label="Upload PDF Report"),
        gr.File(file_types=[".docx"], label="Upload Word Template (.docx)"),
    ],
    outputs=gr.File(label="Download Filled Report (.docx)"),
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()