Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

Shami96 committed on Aug 22

Commit

6067db5

verified ·

1 Parent(s): d9eda51

Update updated_word.py

Browse files

Files changed (1) hide show

updated_word.py +59 -64

updated_word.py CHANGED Viewed

@@ -600,41 +600,44 @@ def fix_management_summary_details_column(table, flat_json):
     for mgmt_type in mgmt_types:
         print(f"    ✅ Confirmed {mgmt_type} table processing")
-        # Look for management data in the JSON
         mgmt_data = None
         # Try direct key match first
         if mgmt_type in flat_json:
             mgmt_data = flat_json[mgmt_type]
-        # Try variations of the key
         if not mgmt_data:
-            for key in flat_json.keys():
-                key_lower = key.lower()
-                mgmt_lower = mgmt_type.lower()
-                if mgmt_lower in key_lower or key_lower in mgmt_lower:
-                    mgmt_data = flat_json[key]
-                    print(f"    ✅ Found data using key variation: '{key}'")
                     break
-        # If still no data, look for individual standard data
         if not mgmt_data:
-            # Collect individual standard entries
             mgmt_data = {}
             for key, value in flat_json.items():
-                key_lower = key.lower()
-                # Look for standard entries related to this management type
-                if ("std " in key_lower and
-                    (("mass" in mgmt_type.lower() and any(term in key_lower for term in ["verification", "internal review"])) or
-                     ("maintenance" in mgmt_type.lower() and any(term in key_lower for term in ["daily check", "internal review"])) or
-                     ("fatigue" in mgmt_type.lower() and any(term in key_lower for term in ["internal review"])))):
-                    mgmt_data[key] = value
             if mgmt_data:
-                print(f"    ✅ Collected individual standard data: {list(mgmt_data.keys())}")
-        if not mgmt_data or not isinstance(mgmt_data, dict):
-            print(f"    ⚠️ No JSON management dict found for {mgmt_type}, skipping this type")
             continue
         # Process the table rows
@@ -648,46 +651,34 @@ def fix_management_summary_details_column(table, flat_json):
                 if "standard" in standard_text or "requirement" in standard_text or "details" in standard_text:
                     continue
-                # Look for specific standards
-                if "std 5" in standard_text or "verification" in standard_text:
-                    if has_red_text(details_cell):
-                        std_val = find_best_standard_value(mgmt_data, ["Std 5. Verification", "Std 5 Verification", "Std 5", "Verification"])
-                        if std_val:
-                            replacement_text = get_value_as_string(std_val, "Std 5. Verification")
-                            cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
-                            replacements_made += cell_replacements
-                            if cell_replacements:
-                                print(f"      ✅ Replaced Std 5. Verification details for {mgmt_type}")
-                elif "std 6" in standard_text or "internal review" in standard_text:
-                    if has_red_text(details_cell):
-                        std_val = find_best_standard_value(mgmt_data, ["Std 6. Internal Review", "Std 6 Internal Review", "Std 6", "Internal Review"])
-                        if std_val:
-                            replacement_text = get_value_as_string(std_val, "Std 6. Internal Review")
-                            cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
-                            replacements_made += cell_replacements
-                            if cell_replacements:
-                                print(f"      ✅ Replaced Std 6. Internal Review details for {mgmt_type}")
-                elif "std 1" in standard_text or "daily check" in standard_text:
-                    if has_red_text(details_cell):
-                        std_val = find_best_standard_value(mgmt_data, ["Std 1. Daily Check", "Std 1 Daily Check", "Std 1", "Daily Check"])
-                        if std_val:
-                            replacement_text = get_value_as_string(std_val, "Std 1. Daily Check")
-                            cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
-                            replacements_made += cell_replacements
-                            if cell_replacements:
-                                print(f"      ✅ Replaced Std 1. Daily Check details for {mgmt_type}")
                 elif "std 7" in standard_text:
-                    if has_red_text(details_cell):
-                        std_val = find_best_standard_value(mgmt_data, ["Std 7. Internal Review", "Std 7 Internal Review", "Std 7"])
-                        if std_val:
-                            replacement_text = get_value_as_string(std_val, "Std 7. Internal Review")
-                            cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
-                            replacements_made += cell_replacements
-                            if cell_replacements:
-                                print(f"      ✅ Replaced Std 7. Internal Review details for {mgmt_type}")
     return replacements_made
@@ -1346,7 +1337,7 @@ def process_paragraphs(document, flat_json):
 def process_headings(document, flat_json):
     """
-    IMPROVED: Better heading processing that avoids mixing company data
     """
     replacements_made = 0
     print(f"\n🔍 Processing headings:")
@@ -1435,7 +1426,7 @@ def process_headings(document, flat_json):
     return replacements_made
 def process_red_text_in_heading_paragraph(paragraph, paragraph_text, flat_json, operator_name):
-    """Process red text found in heading paragraphs"""
     replacements_made = 0
     red_text_segments = []
@@ -1495,23 +1486,25 @@ def process_red_text_in_heading_paragraph(paragraph, paragraph_text, flat_json,
                     print(f"      ✅ Found match with combined query: {kv[0]}")
                     break
-    # Apply the replacement if we found a suitable value
     if replacement_value:
         red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
         if red_runs:
             red_runs[0].text = replacement_value
             red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
             for run in red_runs[1:]:
                 run.text = ''
             replacements_made = 1
-            print(f"      ✅ Replaced with: '{replacement_value}'")
     else:
         print(f"      ❌ No suitable replacement found for: '{combined_red_text}'")
     return replacements_made
 def process_red_text_in_context_paragraph(paragraph, heading_text, flat_json, operator_name):
-    """Process red text found in paragraphs following headings"""
     replacements_made = 0
     red_text_segments = []
@@ -1571,16 +1564,18 @@ def process_red_text_in_context_paragraph(paragraph, heading_text, flat_json, op
                     print(f"      ✅ Found match with combined query: {kv[0]}")
                     break
-    # Apply the replacement if we found a suitable value
     if replacement_value:
         red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
         if red_runs:
             red_runs[0].text = replacement_value
             red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
             for run in red_runs[1:]:
                 run.text = ''
             replacements_made = 1
-            print(f"      ✅ Replaced with: '{replacement_value}'")
     else:
         print(f"      ❌ No suitable replacement found for: '{combined_red_text}'")

     for mgmt_type in mgmt_types:
         print(f"    ✅ Confirmed {mgmt_type} table processing")
+        # Look for management data in the JSON - FIXED: Use exact key matching
         mgmt_data = None
         # Try direct key match first
         if mgmt_type in flat_json:
             mgmt_data = flat_json[mgmt_type]
+            print(f"    ✅ Found direct match for: '{mgmt_type}'")
+        # FIXED: Look for the actual keys in the JSON data
         if not mgmt_data:
+            # Search for keys that contain the management type
+            for key, value in flat_json.items():
+                if mgmt_type.lower().replace(" ", "") in key.lower().replace(" ", ""):
+                    mgmt_data = value
+                    print(f"    ✅ Found data using key: '{key}'")
                     break
+        # FIXED: Also check for flattened keys like "Mass Management Summary.Std 5. Verification"
         if not mgmt_data:
             mgmt_data = {}
             for key, value in flat_json.items():
+                if mgmt_type.lower() in key.lower():
+                    # Extract the standard part (after the dot)
+                    if "." in key:
+                        std_key = key.split(".", 1)[1]  # Get everything after first dot
+                        mgmt_data[std_key] = value
+                        print(f"    ✅ Found standard data: '{std_key}' = {value}")
             if mgmt_data:
+                print(f"    ✅ Collected {len(mgmt_data)} standards for {mgmt_type}")
+        if not mgmt_data:
+            print(f"    ⚠️ No JSON data found for {mgmt_type}")
+            continue
+        # FIXED: Handle both dict and direct value formats
+        if not isinstance(mgmt_data, dict):
+            print(f"    ⚠️ Management data is not a dict for {mgmt_type}: {type(mgmt_data)}")
             continue
         # Process the table rows
                 if "standard" in standard_text or "requirement" in standard_text or "details" in standard_text:
                     continue
+                if not has_red_text(details_cell):
+                    continue
+                # FIXED: Better standard matching
+                replacement_value = None
+                # Look for specific standards with better matching
+                if "std 1" in standard_text and ("daily" in standard_text or "check" in standard_text):
+                    replacement_value = find_best_standard_value(mgmt_data, ["Std 1. Daily Check", "Std 1", "Daily Check"])
+                elif "std 5" in standard_text and ("verification" in standard_text or "internal review" in standard_text):
+                    if "mass" in mgmt_type.lower():
+                        replacement_value = find_best_standard_value(mgmt_data, ["Std 5. Verification", "Std 5", "Verification"])
+                    else:
+                        replacement_value = find_best_standard_value(mgmt_data, ["Std 5. Internal Review", "Std 5", "Internal Review"])
+                elif "std 6" in standard_text or "internal review" in standard_text:
+                    replacement_value = find_best_standard_value(mgmt_data, ["Std 6. Internal Review", "Std 6", "Internal Review"])
                 elif "std 7" in standard_text:
+                    replacement_value = find_best_standard_value(mgmt_data, ["Std 7. Internal Review", "Std 7"])
+                # Apply replacement if found
+                if replacement_value:
+                    replacement_text = get_value_as_string(replacement_value, f"Standard for {mgmt_type}")
+                    cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
+                    replacements_made += cell_replacements
+                    if cell_replacements:
+                        print(f"      ✅ Replaced {standard_text} details in {mgmt_type}")
+                else:
+                    print(f"      ⚠️ No replacement found for '{standard_text}' in {mgmt_type}")
     return replacements_made
 def process_headings(document, flat_json):
     """
+    FIXED: Better heading processing with proper red text replacement
     """
     replacements_made = 0
     print(f"\n🔍 Processing headings:")
     return replacements_made
 def process_red_text_in_heading_paragraph(paragraph, paragraph_text, flat_json, operator_name):
+    """Process red text found in heading paragraphs - FIXED"""
     replacements_made = 0
     red_text_segments = []
                     print(f"      ✅ Found match with combined query: {kv[0]}")
                     break
+    # FIXED: Apply the replacement if we found a suitable value
     if replacement_value:
         red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
         if red_runs:
+            # Replace the first red run with the new text
             red_runs[0].text = replacement_value
             red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
+            # Clear subsequent red runs
             for run in red_runs[1:]:
                 run.text = ''
             replacements_made = 1
+            print(f"      ✅ Replaced heading red text with: '{replacement_value}'")
     else:
         print(f"      ❌ No suitable replacement found for: '{combined_red_text}'")
     return replacements_made
 def process_red_text_in_context_paragraph(paragraph, heading_text, flat_json, operator_name):
+    """Process red text found in paragraphs following headings - FIXED"""
     replacements_made = 0
     red_text_segments = []
                     print(f"      ✅ Found match with combined query: {kv[0]}")
                     break
+    # FIXED: Apply the replacement if we found a suitable value
     if replacement_value:
         red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
         if red_runs:
+            # Replace the first red run with the new text
             red_runs[0].text = replacement_value
             red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
+            # Clear subsequent red runs
             for run in red_runs[1:]:
                 run.text = ''
             replacements_made = 1
+            print(f"      ✅ Replaced context red text with: '{replacement_value}'")
     else:
         print(f"      ❌ No suitable replacement found for: '{combined_red_text}'")