Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Community

Shami96 committed on Aug 22

Commit

4091415

verified ·

1 Parent(s): 6067db5

Update updated_word.py

Browse files

Files changed (1) hide show

updated_word.py +81 -38

updated_word.py CHANGED Viewed

@@ -570,7 +570,7 @@ def handle_attendance_list_table_enhanced(table, flat_json):
     return replacements_made
 def fix_management_summary_details_column(table, flat_json):
-    """Enhanced management summary processing with better data matching"""
     replacements_made = 0
     print(f"    🎯 FIX: Management Summary DETAILS column processing")
@@ -600,45 +600,42 @@ def fix_management_summary_details_column(table, flat_json):
     for mgmt_type in mgmt_types:
         print(f"    ✅ Confirmed {mgmt_type} table processing")
-        # Look for management data in the JSON - FIXED: Use exact key matching
-        mgmt_data = None
-        # Try direct key match first
         if mgmt_type in flat_json:
-            mgmt_data = flat_json[mgmt_type]
-            print(f"    ✅ Found direct match for: '{mgmt_type}'")
-        # FIXED: Look for the actual keys in the JSON data
         if not mgmt_data:
-            # Search for keys that contain the management type
             for key, value in flat_json.items():
-                if mgmt_type.lower().replace(" ", "") in key.lower().replace(" ", ""):
-                    mgmt_data = value
-                    print(f"    ✅ Found data using key: '{key}'")
-                    break
-        # FIXED: Also check for flattened keys like "Mass Management Summary.Std 5. Verification"
         if not mgmt_data:
-            mgmt_data = {}
             for key, value in flat_json.items():
-                if mgmt_type.lower() in key.lower():
-                    # Extract the standard part (after the dot)
-                    if "." in key:
-                        std_key = key.split(".", 1)[1]  # Get everything after first dot
-                        mgmt_data[std_key] = value
-                        print(f"    ✅ Found standard data: '{std_key}' = {value}")
-            if mgmt_data:
-                print(f"    ✅ Collected {len(mgmt_data)} standards for {mgmt_type}")
         if not mgmt_data:
             print(f"    ⚠️ No JSON data found for {mgmt_type}")
             continue
-        # FIXED: Handle both dict and direct value formats
-        if not isinstance(mgmt_data, dict):
-            print(f"    ⚠️ Management data is not a dict for {mgmt_type}: {type(mgmt_data)}")
-            continue
         # Process the table rows
         for row_idx, row in enumerate(table.rows):
@@ -654,47 +651,93 @@ def fix_management_summary_details_column(table, flat_json):
                 if not has_red_text(details_cell):
                     continue
-                # FIXED: Better standard matching
                 replacement_value = None
-                # Look for specific standards with better matching
                 if "std 1" in standard_text and ("daily" in standard_text or "check" in standard_text):
                     replacement_value = find_best_standard_value(mgmt_data, ["Std 1. Daily Check", "Std 1", "Daily Check"])
-                elif "std 5" in standard_text and ("verification" in standard_text or "internal review" in standard_text):
                     if "mass" in mgmt_type.lower():
                         replacement_value = find_best_standard_value(mgmt_data, ["Std 5. Verification", "Std 5", "Verification"])
                     else:
                         replacement_value = find_best_standard_value(mgmt_data, ["Std 5. Internal Review", "Std 5", "Internal Review"])
                 elif "std 6" in standard_text or "internal review" in standard_text:
                     replacement_value = find_best_standard_value(mgmt_data, ["Std 6. Internal Review", "Std 6", "Internal Review"])
                 elif "std 7" in standard_text:
-                    replacement_value = find_best_standard_value(mgmt_data, ["Std 7. Internal Review", "Std 7"])
                 # Apply replacement if found
                 if replacement_value:
-                    replacement_text = get_value_as_string(replacement_value, f"Standard for {mgmt_type}")
                     cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
                     replacements_made += cell_replacements
                     if cell_replacements:
-                        print(f"      ✅ Replaced {standard_text} details in {mgmt_type}")
                 else:
                     print(f"      ⚠️ No replacement found for '{standard_text}' in {mgmt_type}")
     return replacements_made
 def find_best_standard_value(mgmt_data, candidate_keys):
-    """Find the best matching value for a standard from management data"""
     for candidate in candidate_keys:
         if candidate in mgmt_data:
             return mgmt_data[candidate]
-    # Try fuzzy matching
-    for key, value in mgmt_data.items():
-        for candidate in candidate_keys:
             if candidate.lower() in key.lower() or key.lower() in candidate.lower():
                 return value
     return None
 # ============================================================================

     return replacements_made
 def fix_management_summary_details_column(table, flat_json):
+    """FIXED: Enhanced management summary processing that handles both dict and flattened JSON structures"""
     replacements_made = 0
     print(f"    🎯 FIX: Management Summary DETAILS column processing")
     for mgmt_type in mgmt_types:
         print(f"    ✅ Confirmed {mgmt_type} table processing")
+        # FIXED: Build management data dict from multiple sources
+        mgmt_data = {}
+        # Strategy 1: Look for direct nested dict in original JSON (before flattening)
         if mgmt_type in flat_json:
+            direct_data = flat_json[mgmt_type]
+            if isinstance(direct_data, dict):
+                mgmt_data = direct_data
+                print(f"    ✅ Found direct nested dict for: '{mgmt_type}' with {len(mgmt_data)} standards")
+        # Strategy 2: Look for flattened keys like "Mass Management Summary.Std 5. Verification"
         if not mgmt_data:
             for key, value in flat_json.items():
+                if key.startswith(mgmt_type + "."):
+                    # Extract the standard part (after the management type)
+                    std_key = key[len(mgmt_type) + 1:]  # Remove "Mass Management Summary." prefix
+                    mgmt_data[std_key] = value
+                    print(f"    ✅ Found flattened standard: '{std_key}' = {value}")
+            if mgmt_data:
+                print(f"    ✅ Collected {len(mgmt_data)} standards from flattened keys for {mgmt_type}")
+        # Strategy 3: Search for keys that contain the management type
         if not mgmt_data:
             for key, value in flat_json.items():
+                if mgmt_type.lower().replace(" ", "") in key.lower().replace(" ", ""):
+                    if isinstance(value, dict):
+                        mgmt_data = value
+                        print(f"    ✅ Found data using key variation: '{key}'")
+                        break
         if not mgmt_data:
             print(f"    ⚠️ No JSON data found for {mgmt_type}")
             continue
+        print(f"    📋 Processing {mgmt_type} with standards: {list(mgmt_data.keys())}")
         # Process the table rows
         for row_idx, row in enumerate(table.rows):
                 if not has_red_text(details_cell):
                     continue
+                print(f"      🔍 Processing standard: '{standard_text}'")
+                # FIXED: Better standard matching with multiple strategies
                 replacement_value = None
+                # Strategy 1: Direct standard matching
                 if "std 1" in standard_text and ("daily" in standard_text or "check" in standard_text):
                     replacement_value = find_best_standard_value(mgmt_data, ["Std 1. Daily Check", "Std 1", "Daily Check"])
+                    print(f"      🎯 Looking for Std 1 Daily Check")
+                elif "std 5" in standard_text:
                     if "mass" in mgmt_type.lower():
                         replacement_value = find_best_standard_value(mgmt_data, ["Std 5. Verification", "Std 5", "Verification"])
+                        print(f"      🎯 Looking for Std 5 Verification (Mass)")
                     else:
                         replacement_value = find_best_standard_value(mgmt_data, ["Std 5. Internal Review", "Std 5", "Internal Review"])
+                        print(f"      🎯 Looking for Std 5 Internal Review (Fatigue)")
                 elif "std 6" in standard_text or "internal review" in standard_text:
                     replacement_value = find_best_standard_value(mgmt_data, ["Std 6. Internal Review", "Std 6", "Internal Review"])
+                    print(f"      🎯 Looking for Std 6 Internal Review")
                 elif "std 7" in standard_text:
+                    replacement_value = find_best_standard_value(mgmt_data, ["Std 7. Internal Review", "Std 7", "Internal Review"])
+                    print(f"      🎯 Looking for Std 7 Internal Review")
+                # Strategy 2: Fuzzy matching if direct doesn't work
+                if not replacement_value:
+                    print(f"      🔍 No direct match, trying fuzzy matching...")
+                    for std_key, std_value in mgmt_data.items():
+                        std_key_lower = std_key.lower()
+                        if "std" in standard_text:
+                            # Extract std number from both
+                            std_match = re.search(r'std\s*(\d+)', standard_text)
+                            key_match = re.search(r'std\s*(\d+)', std_key_lower)
+                            if std_match and key_match and std_match.group(1) == key_match.group(1):
+                                replacement_value = std_value
+                                print(f"      ✅ Fuzzy matched by std number: {std_key}")
+                                break
                 # Apply replacement if found
                 if replacement_value:
+                    # Handle list values properly
+                    if isinstance(replacement_value, list):
+                        if len(replacement_value) == 1:
+                            replacement_text = str(replacement_value[0])
+                        else:
+                            replacement_text = "\n".join(str(item) for item in replacement_value)
+                    else:
+                        replacement_text = str(replacement_value)
                     cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
                     replacements_made += cell_replacements
                     if cell_replacements:
+                        print(f"      ✅ Replaced '{standard_text}' details in {mgmt_type} with: '{replacement_text[:50]}...'")
                 else:
                     print(f"      ⚠️ No replacement found for '{standard_text}' in {mgmt_type}")
+                    print(f"      📋 Available standards: {list(mgmt_data.keys())}")
     return replacements_made
 def find_best_standard_value(mgmt_data, candidate_keys):
+    """FIXED: Find the best matching value for a standard from management data"""
+    print(f"        🔍 Searching for candidates: {candidate_keys}")
+    print(f"        📋 In available keys: {list(mgmt_data.keys())}")
+    # Direct match
     for candidate in candidate_keys:
         if candidate in mgmt_data:
+            print(f"        ✅ Direct match found: '{candidate}'")
             return mgmt_data[candidate]
+    # Case insensitive match
+    for candidate in candidate_keys:
+        for key, value in mgmt_data.items():
+            if candidate.lower() == key.lower():
+                print(f"        ✅ Case-insensitive match found: '{key}' for '{candidate}'")
+                return value
+    # Partial match
+    for candidate in candidate_keys:
+        for key, value in mgmt_data.items():
             if candidate.lower() in key.lower() or key.lower() in candidate.lower():
+                print(f"        ✅ Partial match found: '{key}' for '{candidate}'")
                 return value
+    print(f"        ❌ No match found for any candidate")
     return None
 # ============================================================================