Spaces:
Running
Running
| import gradio as gr | |
| import tempfile | |
| import shutil | |
| from pdf_extractor import extract_text_pdf_raw | |
| from word_extractor import extract_red_text_with_labels, is_red_font | |
| from docx import Document | |
| from docx.shared import RGBColor | |
| import difflib | |
def find_best_match(target, candidates, cutoff=0.5):
    """Return the candidate most similar to ``target``, or ``None``.

    Similarity is scored by ``difflib.get_close_matches``; only the single
    best candidate scoring at least ``cutoff`` is returned.

    Args:
        target: Text to look up.
        candidates: Iterable of candidate strings. A single ``str`` is treated
            as multi-line text and split into its non-blank, stripped lines;
            passed through unchanged, difflib would iterate it character by
            character and compare ``target`` against single letters.
        cutoff: Minimum similarity ratio in ``[0, 1]``. Defaults to 0.5.

    Returns:
        The best-scoring candidate, or ``None`` when nothing reaches
        ``cutoff`` (including when ``candidates`` is empty).
    """
    if isinstance(candidates, str):
        candidates = [
            line.strip() for line in candidates.splitlines() if line.strip()
        ]
    matches = difflib.get_close_matches(target, candidates, n=1, cutoff=cutoff)
    return matches[0] if matches else None
def _replace_red_runs(paragraphs, replacements):
    """Replace the text of each red run in ``paragraphs`` and recolor it black."""
    for para in paragraphs:
        for run in para.runs:
            if not is_red_font(run):
                continue
            raw = run.text
            stripped = raw.strip()
            if not stripped:
                # Whitespace-only red run: nothing meaningful to match.
                continue
            new_text = find_best_match(stripped, replacements)
            if not new_text:
                continue
            # Keep the run's original surrounding whitespace so the
            # replacement does not get glued to neighbouring runs.
            lead_len = len(raw) - len(raw.lstrip())
            trail_len = len(raw) - len(raw.rstrip())
            run.text = raw[:lead_len] + new_text + raw[len(raw) - trail_len:]
            run.font.color.rgb = RGBColor(0, 0, 0)  # Set to black


def replace_red_text_in_doc(doc_path, replacements):
    """Write a copy of a Word document with its red runs replaced.

    Every run that ``is_red_font`` reports as red has its stripped text
    fuzzy-matched against ``replacements`` via ``find_best_match``. When a
    match is found, the run's text becomes that match (surrounding whitespace
    of the original run is kept) and its font color is set to black. Runs
    with no match are left untouched.

    Only ``doc.paragraphs`` and the paragraphs inside the cells of
    ``doc.tables`` are visited.

    Args:
        doc_path: Path of the ``.docx`` file to read.
        replacements: Candidate replacement strings.

    Returns:
        Path to ``updated.docx`` inside a directory freshly created with
        ``tempfile.mkdtemp()``. The directory is not removed here; cleanup is
        left to the caller.
    """
    doc = Document(doc_path)

    # Body paragraphs first, then every paragraph inside table cells.
    _replace_red_runs(doc.paragraphs, replacements)
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                _replace_red_runs(cell.paragraphs, replacements)

    temp_dir = tempfile.mkdtemp()
    updated_path = f"{temp_dir}/updated.docx"
    doc.save(updated_path)
    return updated_path
def process_files(pdf_file, word_file):
    """Replace red text in an uploaded Word file with matching PDF content.

    Red text is collected from the Word document, each distinct value is
    fuzzy-matched against the text extracted from the PDF, and the matches
    are handed to ``replace_red_text_in_doc`` as the replacement candidates.

    Args:
        pdf_file: Uploaded PDF, either an object exposing the path as
            ``.name`` or a plain path string.
        word_file: Uploaded Word document, same forms as ``pdf_file``.

    Returns:
        Path of the updated Word document.

    Raises:
        ValueError: If either upload is missing.
    """
    if pdf_file is None or word_file is None:
        raise ValueError("Both a PDF file and a Word file must be uploaded.")

    # Accept either an uploaded-file object (path in ``.name``) or a bare path.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    word_path = getattr(word_file, "name", word_file)

    pdf_text = extract_text_pdf_raw(pdf_path)
    # NOTE(review): the return type of extract_text_pdf_raw is not visible
    # here. If it is one big string, difflib would iterate it per character,
    # so fall back to matching against its non-blank lines.
    if isinstance(pdf_text, str):
        pdf_text = [
            line.strip() for line in pdf_text.splitlines() if line.strip()
        ]

    word_data = extract_red_text_with_labels(word_path)

    # Flatten the red text entries and dedupe while keeping first-seen order,
    # so results are deterministic between runs.
    red_values = list(
        dict.fromkeys(
            value for values in word_data.values() for value in values
        )
    )

    # Match red values to PDF content; values without a match are dropped.
    replacements = []
    for val in red_values:
        match = find_best_match(val, pdf_text)
        if match:
            replacements.append(match)

    # Replace in Word
    return replace_red_text_in_doc(word_path, replacements)
# Build the web UI: two file uploads in, one downloadable document out.
pdf_upload = gr.File(label="Upload PDF File", type="file")
word_upload = gr.File(label="Upload Word File", type="file")
result_download = gr.File(label="Download Updated Word File")

demo = gr.Interface(
    fn=process_files,
    inputs=[pdf_upload, word_upload],
    outputs=result_download,
    title="Red Text Replacer",
    description="Upload a PDF and Word document. Red-colored text in the Word doc will be replaced by matching content from the PDF.",
)

# Start serving as soon as this module is executed.
demo.launch()