"""Fill red placeholder text in a Word (.docx) template with values from JSON.

The template marks every value to be replaced by colouring it red.  The JSON
document is flattened to a ``{key: value}`` mapping, each red span is matched
against that mapping (exact, case-insensitive, suffix, normalised and finally
fuzzy word matching), and the matched value is written back in black.

NOTE(review): this file was reconstructed from a whitespace-collapsed copy.
Where the original nesting was ambiguous the choice made is called out in a
comment next to the affected statement.
"""

import json
import re
import sys
import traceback

from docx import Document
from docx.shared import RGBColor

# Column-name fragments that identify the vehicle registration table and the
# JSON keys that feed it.
_VEHICLE_COLUMN_HINTS = (
    "registration number",
    "sub-contractor",
    "weight verification",
    "rfs suspension",
)
_PRINT_ACCREDITATION_HINTS = ("print name", "position title")

_BLACK = RGBColor(0, 0, 0)


# ---------------------------------------------------------------------------
# JSON helpers
# ---------------------------------------------------------------------------

def load_json(filepath):
    """Load and return the JSON document stored at *filepath* (UTF-8)."""
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)


def flatten_json(y, prefix=''):
    """Flatten nested dicts into a single ``{key: value}`` mapping.

    Every non-dict value is stored twice: under its dotted path
    (``"section.field"``) and under its bare key (``"field"``).  Bare keys
    from different sections overwrite each other; the last one visited wins.
    """
    out = {}
    for key, val in y.items():
        new_key = f"{prefix}.{key}" if prefix else key
        if isinstance(val, dict):
            out.update(flatten_json(val, new_key))
        else:
            out[new_key] = val
            # NOTE(review): reconstructed as part of the ``else`` branch, so
            # dict values are never stored under their bare key.
            out[key] = val
    return out


def get_value_as_string(value, field_name=""):
    """Convert a JSON value to the text that should replace a placeholder.

    * empty list            -> ``""``
    * single-element list   -> ``str`` of that element
    * longer list           -> elements joined by a space, EXCEPT when
      *field_name* mentions "company number": then the list itself is
      returned so the caller can spread the digits over several cells.
    * anything else         -> ``str(value)``
    """
    if not isinstance(value, list):
        return str(value)
    if not value:
        return ""
    if len(value) == 1:
        return str(value[0])
    # "australian company number" is covered by this broader check as well.
    if "company number" in field_name.lower():
        return value
    return " ".join(str(v) for v in value)


def _as_text(value):
    """Coerce *value* to ``str`` so it can safely be assigned to ``run.text``.

    ``get_value_as_string`` may hand back a list for company numbers; writing
    that list to a run would fail, so lists are space-joined here.
    """
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple)):
        return " ".join(str(item) for item in value)
    return str(value)


def _normalize_label(text):
    """Lower-case *text*, turn punctuation into spaces and squeeze spaces."""
    cleaned = re.sub(r'[^\w\s]', ' ', text.lower()).strip()
    return re.sub(r'\s+', ' ', cleaned)


def _significant_words(text):
    """Return the lower-cased words of *text* longer than two characters."""
    return {word.lower() for word in re.findall(r'\b\w+\b', text) if len(word) > 2}


def find_matching_json_value(field_name, flat_json):
    """Return the value in *flat_json* that best matches *field_name*.

    Strategies are tried in order and the first hit wins:

    1. exact key
    2. case-insensitive key
    3. last dotted component of a key, case-insensitive
    4. keys compared after punctuation/whitespace normalisation
    5. fuzzy word overlap: ``0.6 * Jaccard + 0.4 * coverage`` with a
       minimum score of 0.3

    Returns ``None`` when nothing matches.
    """
    field_name = field_name.strip()

    if field_name in flat_json:
        print(f" ✅ Direct match found for key '{field_name}'")
        return flat_json[field_name]

    lowered_field = field_name.lower()

    for key, value in flat_json.items():
        if key.lower() == lowered_field:
            print(f" ✅ Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
            return value

    for key, value in flat_json.items():
        if '.' in key and key.split('.')[-1].lower() == lowered_field:
            print(f" ✅ Suffix match found for key '{field_name}' with JSON key '{key}'")
            return value

    clean_field = _normalize_label(field_name)
    for key, value in flat_json.items():
        if clean_field == _normalize_label(key):
            print(f" ✅ Clean match found for key '{field_name}' with JSON key '{key}'")
            return value

    field_words = _significant_words(field_name)
    if not field_words:
        return None

    best_match = None
    best_score = 0
    best_key = None
    for key, value in flat_json.items():
        key_words = _significant_words(key)
        if not key_words:
            continue
        common_words = field_words & key_words
        if not common_words:
            continue
        similarity = len(common_words) / len(field_words | key_words)
        coverage = len(common_words) / len(field_words)
        final_score = (similarity * 0.6) + (coverage * 0.4)
        if final_score > best_score:
            best_score = final_score
            best_match = value
            best_key = key

    # Test the key, not the value: a legitimately falsy value (0, "", False)
    # is still a match, exactly as it is for the exact-match strategies.
    if best_key is not None and best_score >= 0.3:
        print(f" ✅ Fuzzy match found for key '{field_name}' with JSON key '{best_key}' (score: {best_score:.2f})")
        return best_match

    print(f" ❌ No match found for '{field_name}'")
    return None


# ---------------------------------------------------------------------------
# Red-text detection and replacement
# ---------------------------------------------------------------------------

def is_red(run):
    """Return a truthy value when *run* is coloured pure red.

    A run counts as red when its explicit RGB colour is ``FF0000`` or its
    ``theme_color`` attribute equals ``1``.
    """
    color = run.font.color
    return color and (color.rgb == RGBColor(255, 0, 0) or getattr(color, "theme_color", None) == 1)


def get_clean_text(cell):
    """Return the concatenated text of every run in *cell*, stripped."""
    return "".join(
        run.text for paragraph in cell.paragraphs for run in paragraph.runs
    ).strip()


def has_red_text(cell):
    """Return ``True`` if *cell* contains a red run with non-blank text."""
    return any(
        is_red(run) and run.text.strip()
        for paragraph in cell.paragraphs
        for run in paragraph.runs
    )


def _red_text_of(cell):
    """Return the concatenated text of all red runs in *cell* (unstripped)."""
    return "".join(
        run.text
        for paragraph in cell.paragraphs
        for run in paragraph.runs
        if is_red(run)
    )


def extract_red_text_segments(cell):
    """Group consecutive red runs of each paragraph in *cell* into segments.

    Returns a list of dicts with keys ``'text'`` (joined run text), ``'runs'``
    (list of ``(paragraph_idx, run_idx, run)``) and ``'paragraph_idx'``.
    A non-red run ends the current segment; segments never span paragraphs.
    """
    red_segments = []
    for para_idx, paragraph in enumerate(cell.paragraphs):
        current_segment = ""
        segment_runs = []

        def flush():
            red_segments.append({
                'text': current_segment,
                'runs': segment_runs.copy(),
                'paragraph_idx': para_idx,
            })

        for run_idx, run in enumerate(paragraph.runs):
            if is_red(run):
                # NOTE(review): reconstructed so that red runs with empty
                # text are skipped rather than recorded.
                if run.text:
                    current_segment += run.text
                    segment_runs.append((para_idx, run_idx, run))
            elif segment_runs:
                flush()
                current_segment = ""
                segment_runs = []

        if segment_runs:
            flush()
    return red_segments


def _write_lines(run, lines):
    """Write *lines* into *run*, separating them with Word line breaks.

    Blank lines after the first are skipped.  If adding a break fails the
    run falls back to all lines joined by single spaces.
    """
    run.text = lines[0]
    try:
        for line in lines[1:]:
            if line.strip():
                run.add_break()
                run.add_text(line.strip())
    except Exception:
        # Best effort: keep the content even if line breaks cannot be added.
        run.text = ' '.join(lines)


def replace_all_red_segments(red_segments, replacement_text):
    """Replace every segment in *red_segments* with one replacement value.

    The first run of the first segment receives the text (``\\n`` becomes a
    line break) and is recoloured black; every other run in every segment is
    emptied.  Returns ``1`` if text was written, otherwise ``0``.
    """
    if not red_segments:
        return 0

    replacement_lines = _as_text(replacement_text).split('\n')
    first_segment_runs = red_segments[0]['runs']
    replacements_made = 0

    if first_segment_runs:
        first_run = first_segment_runs[0][2]  # (para_idx, run_idx, run)
        _write_lines(first_run, replacement_lines)
        first_run.font.color.rgb = _BLACK
        replacements_made = 1
        for _, _, run in first_segment_runs[1:]:
            run.text = ''

    for segment in red_segments[1:]:
        for _, _, run in segment['runs']:
            run.text = ''

    return replacements_made


def replace_red_text_in_cell(cell, replacement_text):
    """Replace all red text in *cell* with *replacement_text*.

    Returns the number of replacements made (0 or 1).
    """
    return replace_all_red_segments(extract_red_text_segments(cell), replacement_text)


def replace_single_segment(segment, replacement_text):
    """Replace one red segment; return ``True`` if it had runs to write to."""
    runs = segment['runs']
    if not runs:
        return False
    first_run = runs[0][2]  # (para_idx, run_idx, run)
    first_run.text = _as_text(replacement_text)
    first_run.font.color.rgb = _BLACK
    for _, _, run in runs[1:]:
        run.text = ''
    return True


def handle_multiple_red_segments_in_cell(cell, flat_json):
    """Match and replace each red segment of *cell* individually.

    If no segment could be replaced on its own and at least one segment had
    no match, the text of all segments joined by spaces is looked up instead
    and, on success, replaces the whole cell.  Returns the replacement count.
    """
    red_segments = extract_red_text_segments(cell)
    if not red_segments:
        return 0

    print(f" 🔍 Found {len(red_segments)} red text segments in cell")
    replacements_made = 0
    unmatched_segments = []

    for i, segment in enumerate(red_segments):
        segment_text = segment['text'].strip()
        if not segment_text:
            continue
        print(f" Segment {i+1}: '{segment_text[:50]}...'")

        json_value = find_matching_json_value(segment_text, flat_json)
        if json_value is None:
            unmatched_segments.append(segment)
            print(f" ⏳ No individual match for segment '{segment_text[:30]}...'")
            continue

        replacement_text = get_value_as_string(json_value, segment_text)
        if isinstance(json_value, list) and len(json_value) > 1:
            replacement_text = "\n".join(str(item) for item in json_value if str(item).strip())

        if replace_single_segment(segment, replacement_text):
            replacements_made += 1
            print(f" ✅ Replaced segment '{segment_text[:30]}...' with '{replacement_text[:30]}...'")

    if unmatched_segments and replacements_made == 0:
        combined_text = " ".join(seg['text'] for seg in red_segments).strip()
        print(f" 🔄 Trying combined text match: '{combined_text[:50]}...'")
        json_value = find_matching_json_value(combined_text, flat_json)
        if json_value is not None:
            replacement_text = get_value_as_string(json_value, combined_text)
            if isinstance(json_value, list) and len(json_value) > 1:
                replacement_text = "\n".join(str(item) for item in json_value if str(item).strip())
            replacements_made = replace_all_red_segments(red_segments, replacement_text)
            print(f" ✅ Replaced combined text with '{replacement_text[:50]}...'")

    return replacements_made


# ---------------------------------------------------------------------------
# Table handling
# ---------------------------------------------------------------------------

def detect_table_type(table):
    """Classify *table* from the text of its first three rows.

    Returns ``"vehicle_registration"`` (at least 3 vehicle indicators),
    ``"print_accreditation"`` (both print indicators) or ``"key_value"``.
    """
    sample_text = ""
    for row in table.rows[:3]:
        for cell in row.cells:
            sample_text += get_clean_text(cell).lower() + " "

    vehicle_score = sum(1 for hint in _VEHICLE_COLUMN_HINTS if hint in sample_text)
    print_score = sum(1 for hint in _PRINT_ACCREDITATION_HINTS if hint in sample_text)

    if vehicle_score >= 3:
        return "vehicle_registration"
    if print_score >= 2:
        return "print_accreditation"
    return "key_value"


def try_context_based_replacement(cell, row, table, flat_json):
    """Replace red segments of *cell* using the row label as extra context.

    The label is the text of the first cell of *row* when the row has more
    than one cell and that first cell holds no red text.  Without a label
    nothing is attempted.  For each segment the queries ``"<label> <red>"``,
    ``"<label>"`` and ``"<red>"`` are tried in that order.  *table* is
    accepted for interface compatibility and is not used.
    """
    replacements_made = 0

    row_context = ""
    if len(row.cells) > 1:
        first_cell_text = get_clean_text(row.cells[0]).strip()
        if first_cell_text and not has_red_text(row.cells[0]):
            row_context = first_cell_text

    for segment in extract_red_text_segments(cell):
        red_text = segment['text'].strip()
        if not red_text or not row_context:
            continue
        for query in (f"{row_context} {red_text}", f"{row_context}", red_text):
            json_value = find_matching_json_value(query, flat_json)
            if json_value is None:
                continue
            replacement_text = get_value_as_string(json_value, query)
            if replace_single_segment(segment, replacement_text):
                replacements_made += 1
                print(f" ✅ Context-based replacement: '{query}' -> '{replacement_text[:30]}...'")
                break

    return replacements_made


def handle_australian_company_number(row, company_numbers):
    """Place one element of *company_numbers* per cell, starting at cell 2.

    Only cells that exist and contain red text are filled.  Returns the
    number of replacements made.
    """
    replacements_made = 0
    for cell_idx, digit in enumerate(company_numbers, start=1):
        if cell_idx >= len(row.cells):
            continue
        cell = row.cells[cell_idx]
        if has_red_text(cell):
            replacements_made += replace_red_text_in_cell(cell, str(digit))
            print(f" -> Placed digit '{digit}' in cell {cell_idx + 1}")
    return replacements_made


def _find_vehicle_section(flat_json):
    """Return a ``{column name: values}`` mapping for the vehicle table.

    A dict stored under a key containing "vehicle registration numbers of
    records examined" is preferred; otherwise the mapping is assembled from
    every key that mentions one of the known column names.  Returns ``None``
    when no data is found.
    """
    section = None
    for key, value in flat_json.items():
        if "vehicle registration numbers of records examined" in key.lower() and isinstance(value, dict):
            section = value
            print(f" ✅ Found vehicle data in key: '{key}'")
            break
    if section:
        return section

    columns = {}
    for key, value in flat_json.items():
        lowered = key.lower()
        if any(hint in lowered for hint in _VEHICLE_COLUMN_HINTS):
            # Last dotted component; the key itself when it has no dot.
            columns[key.split(".")[-1]] = value
    if columns:
        print(f" ✅ Found vehicle data from flattened keys: {list(columns.keys())}")
        return columns
    return None


def _match_header_to_key(header_text, json_keys):
    """Return ``(best_key, score)`` for a table header among *json_keys*.

    Tries exact match (score 1.0), then word overlap (minimum 0.3), then
    substring containment ignoring spaces, hyphens and parentheses
    (minimum 0.6).  ``best_key`` is ``None`` when nothing qualifies.
    """
    def squash(text):
        return text.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")

    normalized_header = header_text.lower().replace("(", " (").replace(")", ") ").strip()
    header_words = {w for w in normalized_header.split() if len(w) > 2}
    header_clean = squash(normalized_header)

    best_match = None
    best_score = 0
    for json_key in json_keys:
        normalized_json = json_key.lower().strip()
        if normalized_header == normalized_json:
            return json_key, 1.0

        json_words = {w for w in normalized_json.split() if len(w) > 2}
        if header_words and json_words:
            common_words = header_words & json_words
            score = len(common_words) / max(len(header_words), len(json_words))
            if score > best_score and score >= 0.3:
                best_score = score
                best_match = json_key

        # NOTE(review): reconstructed at loop level, i.e. attempted even
        # when one of the word sets is empty.
        json_clean = squash(normalized_json)
        if (header_clean in json_clean or json_clean in header_clean) \
                and len(header_clean) > 5 and len(json_clean) > 5:
            substring_score = min(len(header_clean), len(json_clean)) / max(len(header_clean), len(json_clean))
            if substring_score > best_score and substring_score >= 0.6:
                best_score = substring_score
                best_match = json_key

    return best_match, best_score


def handle_vehicle_registration_table(table, flat_json):
    """Fill the vehicle registration table column by column.

    Locates the header row (a row whose text contains both "registration"
    and "number"), maps header cells to JSON columns, appends table rows when
    the JSON lists are longer than the table, and writes one list element per
    row.  Empty cells are filled directly; cells with red text have that text
    replaced.  Returns the number of cells written.
    """
    replacements_made = 0

    vehicle_section = _find_vehicle_section(flat_json)
    if not vehicle_section:
        print(" ❌ Vehicle registration data not found in JSON")
        return 0
    print(f" ✅ Found vehicle registration data with {len(vehicle_section)} columns")

    header_row_idx = -1
    header_row = None
    for row_idx, row in enumerate(table.rows):
        row_text = "".join(get_clean_text(cell).lower() for cell in row.cells)
        if "registration" in row_text and "number" in row_text:
            header_row_idx = row_idx
            header_row = row
            break
    if header_row_idx == -1:
        print(" ❌ Could not find header row in vehicle table")
        return 0
    print(f" ✅ Found header row at index {header_row_idx}")

    column_mapping = {}
    for col_idx, cell in enumerate(header_row.cells):
        header_text = get_clean_text(cell).strip()
        if not header_text or header_text.lower() == "no.":
            continue
        best_match, best_score = _match_header_to_key(header_text, vehicle_section.keys())
        if best_match:
            column_mapping[col_idx] = best_match
            print(f" 📌 Column {col_idx + 1} ('{header_text}') -> '{best_match}' (score: {best_score:.2f})")

    if not column_mapping:
        print(" ❌ No column mappings found")
        return 0

    # The longest JSON list decides how many data rows are required.
    max_data_rows = max(
        (len(data) for data in vehicle_section.values() if isinstance(data, list)),
        default=0,
    )
    print(f" 📌 Need to populate {max_data_rows} data rows")

    for data_row_index in range(max_data_rows):
        table_row_idx = header_row_idx + 1 + data_row_index

        if table_row_idx >= len(table.rows):
            print(f" ⚠️ Row {table_row_idx + 1} doesn't exist - table only has {len(table.rows)} rows")
            print(f" ➕ Adding new row for vehicle {data_row_index + 1}")
            table.add_row()
            print(f" ✅ Successfully added row {len(table.rows)} to the table")

        row = table.rows[table_row_idx]
        print(f" 📌 Processing data row {table_row_idx + 1} (vehicle {data_row_index + 1})")

        for col_idx, json_key in column_mapping.items():
            if col_idx >= len(row.cells):
                continue
            column_data = vehicle_section.get(json_key, [])
            if not isinstance(column_data, list) or data_row_index >= len(column_data):
                continue

            cell = row.cells[col_idx]
            replacement_value = str(column_data[data_row_index])

            if not get_clean_text(cell).strip():
                cell.text = replacement_value
                replacements_made += 1
                print(f" -> Added '{replacement_value}' to empty cell (column '{json_key}')")
            elif has_red_text(cell):
                cell_replacements = replace_red_text_in_cell(cell, replacement_value)
                replacements_made += cell_replacements
                if cell_replacements > 0:
                    print(f" -> Replaced red text with '{replacement_value}' (column '{json_key}')")

    return replacements_made


def handle_print_accreditation_section(table, flat_json):
    """Fill the "Print Name" / "Position Title" row pair.

    Reads a list of at least two items from the key
    ``"print accreditation name.print accreditation name"``; item 0 is the
    name and item 1 the position.  The row following the first header row
    found receives the values in its first two cells, where those contain
    red text.  Returns the number of replacements made.
    """
    replacements_made = 0

    print_data = flat_json.get("print accreditation name.print accreditation name", [])
    if not isinstance(print_data, list) or len(print_data) < 2:
        return 0

    name_value = print_data[0]
    position_value = print_data[1]
    print(f" 📋 Print accreditation data: Name='{name_value}', Position='{position_value}'")

    for row_idx, row in enumerate(table.rows):
        if len(row.cells) < 2:
            continue
        cell1_text = get_clean_text(row.cells[0]).lower()
        cell2_text = get_clean_text(row.cells[1]).lower()
        if "print name" not in cell1_text or "position title" not in cell2_text:
            continue

        print(f" 📍 Found header row {row_idx + 1}: '{cell1_text}' | '{cell2_text}'")

        if row_idx + 1 < len(table.rows):
            data_row = table.rows[row_idx + 1]
            if len(data_row.cells) >= 2:
                if has_red_text(data_row.cells[0]):
                    cell_replacements = replace_red_text_in_cell(data_row.cells[0], name_value)
                    replacements_made += cell_replacements
                    if cell_replacements > 0:
                        print(f" ✅ Replaced Print Name: '{name_value}'")
                if has_red_text(data_row.cells[1]):
                    cell_replacements = replace_red_text_in_cell(data_row.cells[1], position_value)
                    replacements_made += cell_replacements
                    if cell_replacements > 0:
                        print(f" ✅ Replaced Position Title: '{position_value}'")
        break  # Only the first header row is handled.

    return replacements_made


def process_single_column_sections(cell, field_name, flat_json):
    """Replace the red text of *cell* with the JSON value for *field_name*.

    Lists with more than one item are written one item per line.  Returns
    the number of replacements made (0 when there is no match or no red
    text).
    """
    json_value = find_matching_json_value(field_name, flat_json)
    if json_value is None:
        return 0

    replacement_text = get_value_as_string(json_value, field_name)
    if isinstance(json_value, list) and len(json_value) > 1:
        replacement_text = "\n".join(str(item) for item in json_value)

    if not has_red_text(cell):
        return 0

    print(f" ✅ Replacing red text in single-column section: '{field_name}'")
    print(f" ✅ Replacement text:\n{replacement_text}")
    cell_replacements = replace_red_text_in_cell(cell, replacement_text)
    if cell_replacements > 0:
        print(f" -> Replaced with: '{replacement_text[:100]}...'")
    return cell_replacements


def process_tables_segmentwise(document, flat_json):
    """Alternative table pass that matches each red segment on its own.

    This was originally a first definition of ``process_tables`` that the
    later definition shadowed, so it never ran.  It is kept under its own
    name and is not called by ``process_hf``.  Returns the number of
    replacements made.
    """
    replacements_made = 0

    for table_idx, table in enumerate(document.tables):
        print(f"\n🔍 Processing table {table_idx + 1}:")

        table_type = detect_table_type(table)
        print(f" 📋 Detected table type: {table_type}")

        if table_type == "vehicle_registration":
            replacements_made += handle_vehicle_registration_table(table, flat_json)
            continue
        if table_type == "print_accreditation":
            replacements_made += handle_print_accreditation_section(table, flat_json)
            continue

        for row in table.rows:
            for cell in row.cells:
                if not has_red_text(cell):
                    continue
                cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
                replacements_made += cell_replacements
                if cell_replacements == 0:
                    replacements_made += try_context_based_replacement(cell, row, table, flat_json)

    return replacements_made


def _replace_cell_by_own_red_text(cell, flat_json, cell_number=None):
    """Look up the red text of *cell* itself and replace it with the match.

    When *cell_number* is given, a successful replacement is logged with
    that 1-based cell number.  Returns the number of replacements made.
    """
    red_text = _red_text_of(cell).strip()
    if not red_text:
        return 0
    section_value = find_matching_json_value(red_text, flat_json)
    if section_value is None:
        return 0
    section_replacement = get_value_as_string(section_value, red_text)
    cell_replacements = replace_red_text_in_cell(cell, section_replacement)
    if cell_replacements > 0 and cell_number is not None:
        print(f" ✅ Replaced red text '{red_text[:30]}...' with '{section_replacement[:30]}...' in cell {cell_number}")
    return cell_replacements


def _process_key_value_row(table, row_idx, row, flat_json):
    """Handle one row of a key/value table; return replacements made."""
    key_cell = row.cells[0]
    key_text = get_clean_text(key_cell)
    if not key_text:
        return 0

    print(f" 📌 Row {row_idx + 1}: Key = '{key_text}'")
    replacements_made = 0
    json_value = find_matching_json_value(key_text, flat_json)

    if json_value is None:
        # The label is unknown: try the red text itself as the lookup key.
        if len(row.cells) == 1 and has_red_text(key_cell):
            replacements_made += _replace_cell_by_own_red_text(key_cell, flat_json)
        # NOTE(review): reconstructed inside this branch (the original wrote
        # ``else:`` followed by ``if``, not ``elif``).
        for cell_idx, cell in enumerate(row.cells):
            if has_red_text(cell):
                replacements_made += _replace_cell_by_own_red_text(cell, flat_json, cell_idx + 1)
        return replacements_made

    replacement_text = get_value_as_string(json_value, key_text)
    lowered_key = key_text.lower()
    value_cells = row.cells[1:]

    if "company number" in lowered_key and isinstance(json_value, list):
        # One digit per cell (also covers "australian company number").
        replacements_made += handle_australian_company_number(row, json_value)

    elif ("attendance list" in lowered_key or "nature of" in lowered_key) \
            and row_idx + 1 < len(table.rows):
        # Section header: the content lives in the next row.
        print(" ✅ Section header detected, checking next row for content...")
        if isinstance(json_value, list):
            replacement_text = "\n".join(str(item) for item in json_value)
        for cell_idx, cell in enumerate(table.rows[row_idx + 1].cells):
            if not has_red_text(cell):
                continue
            print(f" ✅ Found red text in next row, cell {cell_idx + 1}")
            cell_replacements = replace_red_text_in_cell(cell, replacement_text)
            replacements_made += cell_replacements
            if cell_replacements > 0:
                print(f" -> Replaced section content with: '{replacement_text[:100]}...'")

    elif not any(has_red_text(cell) for cell in value_cells):
        # Single-column row, or no red text right of the key: the key cell
        # itself may hold the placeholder.
        if has_red_text(key_cell):
            replacements_made += process_single_column_sections(key_cell, key_text, flat_json)

    else:
        for cell_idx, value_cell in enumerate(value_cells, start=1):
            if has_red_text(value_cell):
                print(f" ✅ Found red text in column {cell_idx + 1}")
                replacements_made += replace_red_text_in_cell(value_cell, replacement_text)

    return replacements_made


def process_tables(document, flat_json):
    """Process tables to find key-value pairs and replace red values.

    Vehicle registration and print accreditation tables get dedicated
    handlers; every other table is treated as key/value rows whose first
    cell is the key.  Returns the number of replacements made.
    """
    replacements_made = 0

    for table_idx, table in enumerate(document.tables):
        print(f"\n🔍 Processing table {table_idx + 1}:")

        table_type = detect_table_type(table)
        if table_type == "vehicle_registration":
            print(" 🚗 Detected Vehicle Registration table")
            replacements_made += handle_vehicle_registration_table(table, flat_json)
            continue
        if table_type == "print_accreditation":
            print(" 📋 Detected Print Accreditation table")
            replacements_made += handle_print_accreditation_section(table, flat_json)
            continue

        for row_idx, row in enumerate(table.rows):
            if len(row.cells) < 1:
                continue
            replacements_made += _process_key_value_row(table, row_idx, row, flat_json)

    return replacements_made


# ---------------------------------------------------------------------------
# Paragraph handling and entry point
# ---------------------------------------------------------------------------

def process_paragraphs(document, flat_json):
    """Replace red text in body paragraphs; return replacements made.

    The red text of a paragraph is looked up as a whole.  When that fails,
    text containing "AUDITOR SIGNATURE" or "DATE" falls back to the
    "auditor signature" value and text containing "OPERATOR SIGNATURE" to
    "operator signature".
    """
    replacements_made = 0
    print("\n🔍 Processing paragraphs:")

    for para_idx, paragraph in enumerate(document.paragraphs):
        red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
        if not red_runs:
            continue

        red_text_only = "".join(run.text for run in red_runs).strip()
        print(f" 📌 Paragraph {para_idx + 1}: Found red text: '{red_text_only}'")

        json_value = find_matching_json_value(red_text_only, flat_json)
        if json_value is None:
            upper_text = red_text_only.upper()
            if "AUDITOR SIGNATURE" in upper_text or "DATE" in upper_text:
                json_value = find_matching_json_value("auditor signature", flat_json)
            elif "OPERATOR SIGNATURE" in upper_text:
                json_value = find_matching_json_value("operator signature", flat_json)

        if json_value is None:
            continue

        replacement_text = get_value_as_string(json_value)
        print(f" ✅ Replacing red text with: '{replacement_text}'")
        red_runs[0].text = replacement_text
        red_runs[0].font.color.rgb = _BLACK
        for run in red_runs[1:]:
            run.text = ''
        replacements_made += 1

    return replacements_made


def process_hf(json_file, docx_file, output_file):
    """Fill the template *docx_file* from *json_file* and save the result.

    Accepts file-like objects or file paths.
    For Hugging Face: json_file, docx_file, output_file will be file-like
    objects.

    Errors are reported on stdout (with a traceback for unexpected ones)
    instead of being raised; the function always returns ``None``.
    """
    try:
        if hasattr(json_file, "read"):
            json_data = json.load(json_file)
        else:
            with open(json_file, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

        flat_json = flatten_json(json_data)

        print("📄 Available JSON keys (sample):")
        for key in sorted(flat_json)[:10]:
            print(f" - {key}: {flat_json[key]}")
        # Never report a negative remainder for small documents.
        print(f" ... and {max(len(flat_json) - 10, 0)} more keys\n")

        # Document() and Document.save() accept both paths and streams.
        doc = Document(docx_file)

        table_replacements = process_tables(doc, flat_json)
        paragraph_replacements = process_paragraphs(doc, flat_json)
        total_replacements = table_replacements + paragraph_replacements

        doc.save(output_file)

        print(f"\n✅ Document saved as: {output_file}")
        print(f"✅ Total replacements: {total_replacements} ({table_replacements} in tables, {paragraph_replacements} in paragraphs)")

    except FileNotFoundError as e:
        print(f"❌ File not found: {e}")
    except Exception as e:
        print(f"❌ Error: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    # The original trailing call referenced undefined names (main, json_path,
    # docx_path, output_path) and crashed on import; run only as a script.
    if len(sys.argv) != 4:
        print(f"Usage: python {sys.argv[0]} <input.json> <template.docx> <output.docx>")
        sys.exit(1)
    process_hf(sys.argv[1], sys.argv[2], sys.argv[3])