File size: 2,128 Bytes
459372e
 
 
 
 
50613d0
 
31d231c
 
 
9e5331a
 
 
 
 
 
 
 
 
 
 
31d231c
 
459372e
 
31d231c
 
9e5331a
31d231c
 
 
 
 
 
459372e
 
 
 
c19a2c1
 
459372e
 
 
 
 
 
 
 
 
 
 
c19a2c1
459372e
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import gradio as gr
import os
from docx import Document
from utils import extract_text_from_pdf, parse_pdf_to_dict

from docx.shared import RGBColor

from docx import Document
from docx.shared import RGBColor

def is_red_color(run):
    """Return True when the font colour of *run* looks red.

    Two colour representations are recognised:

    * An explicit RGB value: accepted when the red channel is at least 200
      and both the green and blue channels are at most 80.
    * A theme colour: accepted when its textual form, or its symbolic
      ``name`` with underscores removed, ends with ``accent2`` or ``red``
      (case-insensitive).

    Args:
        run: Object exposing ``font.color`` with ``rgb`` and ``theme_color``
            attributes (presumably a python-docx ``Run`` -- confirm against
            callers).

    Returns:
        bool: True if the colour is considered red, False otherwise
        (including when no colour information is present).
    """
    red_suffixes = ("accent2", "red")

    color = run.font.color
    if color is None:
        return False

    if color.rgb:
        r, g, b = color.rgb[0], color.rgb[1], color.rgb[2]
        return r >= 200 and g <= 80 and b <= 80  # flexible red check

    theme = color.theme_color  # fallback if theme_color is set instead of RGB
    if not theme:
        return False

    # Plain textual form first, so every value that matched before still does.
    rendered = str(theme).lower()
    if rendered.endswith(red_suffixes):
        return True

    # NOTE(review): python-docx enum members appear to stringify as
    # "ACCENT_2 (6)", which can never end with "accent2". Compare the
    # symbolic name instead, dropping any " (n)" suffix and underscores.
    symbolic = str(getattr(theme, "name", rendered))
    symbolic = symbolic.split(" (", 1)[0].replace("_", "").lower()
    return symbolic.endswith(red_suffixes)

def replace_red_text_with_data(word_path, data_dict):
    """Fill the red placeholder text of a Word document from *data_dict*.

    Each paragraph in ``doc.paragraphs`` is scanned for runs that
    ``is_red_color`` accepts. Adjacent red runs are treated as a single
    placeholder, because one piece of red text may be stored as several
    runs; replacing every fragment separately would repeat the value once
    per fragment. For each placeholder, the paragraph text that precedes it
    is searched (case-insensitively) for the keys of *data_dict*; the first
    key found, in the dict's iteration order, supplies the replacement.
    Placeholders with no matching key are left unchanged.

    Only ``doc.paragraphs`` is walked; any other container is not touched
    by this function.

    Args:
        word_path: Passed straight to ``Document``.
        data_dict: Mapping of label text to replacement text. Keys must
            support ``.lower()``.

    Returns:
        The modified document object. It is not saved here.
    """
    doc = Document(word_path)

    for para in doc.paragraphs:
        runs = para.runs
        index = 0
        while index < len(runs):
            if not is_red_color(runs[index]):
                index += 1
                continue

            # Extend over every directly following red run so the whole
            # placeholder is handled as one unit.
            group_end = index + 1
            while group_end < len(runs) and is_red_color(runs[group_end]):
                group_end += 1

            # Search for the key (label) before this red text.
            preceding_text = ''.join(r.text for r in runs[:index]).lower()
            for key in data_dict:
                if key.lower() in preceding_text:
                    runs[index].text = data_dict[key]
                    # Blank the remaining fragments so the value appears once.
                    for leftover in runs[index + 1:group_end]:
                        leftover.text = ""
                    break

            index = group_end

    return doc

def process_files(pdf_file, template_docx):
    """Build a filled Word report from a PDF and a Word template.

    The PDF is turned into text with ``extract_text_from_pdf``, the text is
    parsed with ``parse_pdf_to_dict``, and the result is handed to
    ``replace_red_text_with_data`` together with the template. The
    resulting document is written to ``filled_output.docx`` (relative to the
    current working directory).

    Args:
        pdf_file: The uploaded PDF, forwarded unchanged to
            ``extract_text_from_pdf``.
        template_docx: The uploaded Word template, forwarded unchanged to
            ``replace_red_text_with_data``.

    Returns:
        str: The path the filled document was saved to.
    """
    # PDF -> raw text -> parsed fields.
    parsed_fields = parse_pdf_to_dict(extract_text_from_pdf(pdf_file))

    # Template + parsed fields -> filled document.
    filled_document = replace_red_text_with_data(template_docx, parsed_fields)

    # Persist the result and report where it went.
    destination = "filled_output.docx"
    filled_document.save(destination)
    return destination

# Gradio front end: two file uploads in (PDF report, Word template), one
# downloadable .docx out, with process_files as the handler.
demo = gr.Interface(
    title="Audit Report Generator",
    description="Upload a PDF and a Word template. This tool will auto-fill red-highlighted fields with data from the PDF.",
    fn=process_files,
    # Order matches the (pdf_file, template_docx) parameters of process_files.
    inputs=[
        gr.File(file_types=[".pdf"], label="Upload PDF Report"),
        gr.File(file_types=[".docx"], label="Upload Word Template (.docx)"),
    ],
    outputs=gr.File(label="Download Filled Report (.docx)"),
)

# Launch the interface only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()