Shami96 committed on
Commit
ab686bc
·
verified ·
1 Parent(s): 5737d0c

Update update_docx_with_pdf.py

Browse files
Files changed (1) hide show
  1. update_docx_with_pdf.py +165 -16
update_docx_with_pdf.py CHANGED
@@ -510,32 +510,149 @@ class NHVASMerger:
510
 
511
  # ───────────────────────────── summary tables (unchanged logic) ─────────────────────────────
512
  def build_summary_maps(self, pdf_json: dict) -> dict:
 
513
  out = {v: {} for v in SUMMARY_SECTIONS.values()}
514
  try:
515
  tables = pdf_json["extracted_data"]["all_tables"]
516
  except Exception:
517
  return out
518
 
519
- for t in tables:
 
 
 
520
  headers = [re.sub(r"\s+", " ", (h or "")).strip().upper() for h in t.get("headers", [])]
521
- if "DETAILS" not in headers:
522
  continue
523
- section_key_raw = next((h for h in headers if h in SUMMARY_SECTIONS), None)
524
- if not section_key_raw:
 
525
  continue
526
- section_name = SUMMARY_SECTIONS[section_key_raw]
527
- for row in t.get("data", []):
528
- if not row: continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
529
  left = str(row[0]) if len(row) >= 1 else ""
530
- right = str(row[1]) if len(row) >= 2 else ""
 
 
 
 
 
 
 
531
  left_norm = self.normalize_std_label(left)
532
- if left_norm and right:
533
- prev = out[section_name].get(left_norm, "")
534
- merged_text = (prev + " " + right).strip() if prev else right.strip()
535
- out[section_name][left_norm] = merged_text
536
-
 
 
 
 
 
537
  for sec in out:
538
  out[sec] = {k: [_smart_space(v)] for k, v in out[sec].items() if v}
 
 
 
 
 
 
 
 
539
  return out
540
 
541
  # ───────────────────────────── NEW: find cell by label in tables ─────────────────────────────
@@ -1269,10 +1386,42 @@ class NHVASMerger:
1269
  if op.get("phone"):
1270
  merged["Operator contact details"]["Operator Telephone Number"] = [_smart_space(op["phone"])]
1271
 
1272
- # Attendance
1273
  if "attendance" in pdf_extracted and "Attendance List (Names and Position Titles)" in merged:
1274
- merged["Attendance List (Names and Position Titles)"]["Attendance List (Names and Position Titles)"] = _clean_list(pdf_extracted["attendance"])
1275
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1276
  # Business summary
1277
  if "business_summary" in pdf_extracted and "Nature of the Operators Business (Summary)" in merged:
1278
  merged["Nature of the Operators Business (Summary)"]["Nature of the Operators Business (Summary):"] = [_smart_space(pdf_extracted["business_summary"])]
 
510
 
511
  # ───────────────────────────── summary tables (unchanged logic) ─────────────────────────────
512
  def build_summary_maps(self, pdf_json: dict) -> dict:
513
+ """Enhanced summary mapping that correctly identifies detailed summary tables."""
514
  out = {v: {} for v in SUMMARY_SECTIONS.values()}
515
  try:
516
  tables = pdf_json["extracted_data"]["all_tables"]
517
  except Exception:
518
  return out
519
 
520
+ self.log_debug(f"Processing {len(tables)} tables total")
521
+
522
+ for i, t in enumerate(tables):
523
+ page = t.get("page", "?")
524
  headers = [re.sub(r"\s+", " ", (h or "")).strip().upper() for h in t.get("headers", [])]
525
+ if not headers:
526
  continue
527
+
528
+ data_rows = t.get("data", [])
529
+ if not data_rows:
530
  continue
531
+
532
+ self.log_debug(f"Table {i} (page {page}): headers = {headers[:5]}")
533
+
534
+ # Check for DETAILS column - but be more flexible
535
+ table_header_text = " ".join(headers).upper()
536
+ has_details_column = "DETAILS" in table_header_text
537
+
538
+ self.log_debug(f" Has DETAILS column: {has_details_column}")
539
+
540
+ if not has_details_column:
541
+ # Check if this might be a summary table without explicit "DETAILS" header
542
+ # Look for management type + detailed content
543
+ has_management_keyword = any(keyword in table_header_text for keyword in
544
+ ["MAINTENANCE MANAGEMENT", "MASS MANAGEMENT", "FATIGUE MANAGEMENT"])
545
+
546
+ if not has_management_keyword:
547
+ self.log_debug(f" Skipping - no DETAILS column and no management keywords")
548
+ continue
549
+ else:
550
+ self.log_debug(f" No DETAILS column but has management keyword - checking content...")
551
+
552
+ # Look for "Std X." patterns in the first column
553
+ has_standards = False
554
+ is_detailed_summary = False
555
+ section_type = None
556
+ sample_content = []
557
+
558
+ for row in data_rows[:3]: # Check first few rows
559
+ if not row:
560
+ continue
561
+ first_cell = str(row[0]).strip()
562
+ if re.match(r"Std\s+\d+\.", first_cell, re.IGNORECASE):
563
+ has_standards = True
564
+ self.log_debug(f" Found standard: {first_cell}")
565
+
566
+ # Check all cells in this row for detailed content
567
+ for i in range(1, len(row)):
568
+ cell_content = str(row[i]).strip()
569
+ if len(cell_content) > 50: # Detailed content is much longer
570
+ sample_content.append(cell_content[:200]) # Store sample for analysis
571
+ if not re.match(r"^[VNC\s]*$", cell_content):
572
+ is_detailed_summary = True
573
+ self.log_debug(f" Found detailed content: {cell_content[:100]}...")
574
+ break
575
+
576
+ self.log_debug(f" Has standards: {has_standards}, Is detailed: {is_detailed_summary}")
577
+
578
+ if not has_standards:
579
+ self.log_debug(f" Skipping - no standards found")
580
+ continue
581
+
582
+ if not is_detailed_summary:
583
+ self.log_debug(f" Skipping - not detailed summary (content too short or just V/NC)")
584
+ continue
585
+
586
+ # Identify management type from headers AND content
587
+ if "MAINTENANCE" in table_header_text:
588
+ section_type = "Maintenance Management Summary"
589
+ elif "MASS" in table_header_text:
590
+ section_type = "Mass Management Summary"
591
+ elif "FATIGUE" in table_header_text:
592
+ section_type = "Fatigue Management Summary"
593
+ else:
594
+ # Fallback: analyze the actual standard content to determine type
595
+ combined_content = " ".join(sample_content).lower()
596
+ self.log_debug(f" Analyzing content keywords: {combined_content[:200]}...")
597
+
598
+ # Identify by content keywords
599
+ if any(keyword in combined_content for keyword in [
600
+ "daily check", "fault", "maintenance", "repair", "service", "workshop"
601
+ ]):
602
+ section_type = "Maintenance Management Summary"
603
+ self.log_debug(f" Identified as Maintenance by content")
604
+ elif any(keyword in combined_content for keyword in [
605
+ "mass", "weight", "verification", "vehicle", "load", "gauge"
606
+ ]):
607
+ section_type = "Mass Management Summary"
608
+ self.log_debug(f" Identified as Mass by content")
609
+ elif any(keyword in combined_content for keyword in [
610
+ "fatigue", "scheduling", "rostering", "duty", "medical", "driver"
611
+ ]):
612
+ section_type = "Fatigue Management Summary"
613
+ self.log_debug(f" Identified as Fatigue by content")
614
+
615
+ if not section_type:
616
+ self.log_debug(f" Could not determine section type for table with headers: {headers[:3]}")
617
+ continue
618
+
619
+ self.log_debug(f" βœ… Processing {section_type} table from page {page}")
620
+
621
+ # Extract the data from the detailed content
622
+ standards_found = 0
623
+ for row in data_rows:
624
+ if not row:
625
+ continue
626
  left = str(row[0]) if len(row) >= 1 else ""
627
+
628
+ # Find the details content (longest cell in the row)
629
+ details_content = ""
630
+ for i in range(1, len(row)):
631
+ cell_content = str(row[i]).strip()
632
+ if len(cell_content) > len(details_content):
633
+ details_content = cell_content
634
+
635
  left_norm = self.normalize_std_label(left)
636
+ if left_norm and details_content and len(details_content) > 50:
637
+ prev = out[section_type].get(left_norm, "")
638
+ merged_text = (prev + " " + details_content).strip() if prev else details_content.strip()
639
+ out[section_type][left_norm] = merged_text
640
+ standards_found += 1
641
+ self.log_debug(f" Added {left_norm}: {details_content[:100]}...")
642
+
643
+ self.log_debug(f" Extracted {standards_found} standards from {section_type}")
644
+
645
+ # Convert to list format as expected by the rest of the code
646
  for sec in out:
647
  out[sec] = {k: [_smart_space(v)] for k, v in out[sec].items() if v}
648
+
649
+ self.log_debug(f"Summary maps built: {list(out.keys())}")
650
+ for section_name, data in out.items():
651
+ if data:
652
+ self.log_debug(f" βœ… {section_name}: {len(data)} standards found - {list(data.keys())[:3]}...")
653
+ else:
654
+ self.log_debug(f" ❌ {section_name}: No data found")
655
+
656
  return out
657
 
658
  # ───────────────────────────── NEW: find cell by label in tables ─────────────────────────────
 
1386
  if op.get("phone"):
1387
  merged["Operator contact details"]["Operator Telephone Number"] = [_smart_space(op["phone"])]
1388
 
1389
+ # Attendance - Modified logic
1390
  if "attendance" in pdf_extracted and "Attendance List (Names and Position Titles)" in merged:
1391
+ # Get expected count from DOCX template
1392
+ docx_attendance = merged["Attendance List (Names and Position Titles)"]["Attendance List (Names and Position Titles)"]
1393
+ expected_count = len(docx_attendance) if isinstance(docx_attendance, list) else 1
1394
+
1395
+ # Get PDF attendance (already processed by existing extraction)
1396
+ pdf_attendance_raw = pdf_extracted["attendance"]
1397
+
1398
+ # The PDF might have combined entries like "Name1 - Position1 Name2 - Position2"
1399
+ # Split them properly
1400
+ separated_attendance = []
1401
+ for entry in pdf_attendance_raw:
1402
+ # Handle combined entries like "Grant Pontifex - Manager Jodie Jones - Auditor"
1403
+ if " - " in entry:
1404
+ # Try to split by pattern: Position followed by Name
1405
+ import re
1406
+ # Look for pattern: Name - Position Name - Position
1407
+ match = re.search(r'([A-Z][a-z]+\s+[A-Z][a-z]+)\s*-\s*([A-Z][a-z]+)\s+([A-Z][a-z]+\s+[A-Z][a-z]+)\s*-\s*([A-Z][a-z]+)', entry)
1408
+ if match:
1409
+ person1 = f"{match.group(1)} - {match.group(2)}"
1410
+ person2 = f"{match.group(3)} - {match.group(4)}"
1411
+ separated_attendance.extend([person1, person2])
1412
+ else:
1413
+ # If pattern doesn't match, keep as single entry
1414
+ separated_attendance.append(entry)
1415
+ else:
1416
+ separated_attendance.append(entry)
1417
+
1418
+ # Limit to expected count
1419
+ final_attendance = separated_attendance[:expected_count]
1420
+
1421
+ self.log_debug(f"Attendance: DOCX expects {expected_count}, PDF has {len(separated_attendance)}, using: {final_attendance}")
1422
+
1423
+ merged["Attendance List (Names and Position Titles)"]["Attendance List (Names and Position Titles)"] = _clean_list(final_attendance)
1424
+
1425
  # Business summary
1426
  if "business_summary" in pdf_extracted and "Nature of the Operators Business (Summary)" in merged:
1427
  merged["Nature of the Operators Business (Summary)"]["Nature of the Operators Business (Summary):"] = [_smart_space(pdf_extracted["business_summary"])]