Update update_docx_with_pdf.py
update_docx_with_pdf.py  CHANGED  +165 -16
@@ -510,32 +510,149 @@ class NHVASMerger:
 
     # ───────────────────────────── summary tables (unchanged logic) ─────────────────────────────
     def build_summary_maps(self, pdf_json: dict) -> dict:
+        """Enhanced summary mapping that correctly identifies detailed summary tables."""
         out = {v: {} for v in SUMMARY_SECTIONS.values()}
         try:
             tables = pdf_json["extracted_data"]["all_tables"]
         except Exception:
             return out
 
-        …
+        self.log_debug(f"Processing {len(tables)} tables total")
+
+        for i, t in enumerate(tables):
+            page = t.get("page", "?")
             headers = [re.sub(r"\s+", " ", (h or "")).strip().upper() for h in t.get("headers", [])]
-            if …
+            if not headers:
                 continue
-            …
-            …
+
+            data_rows = t.get("data", [])
+            if not data_rows:
                 continue
-            …
-            …
-            …
+
+            self.log_debug(f"Table {i} (page {page}): headers = {headers[:5]}")
+
+            # Check for DETAILS column - but be more flexible
+            table_header_text = " ".join(headers).upper()
+            has_details_column = "DETAILS" in table_header_text
+
+            self.log_debug(f"  Has DETAILS column: {has_details_column}")
+
+            if not has_details_column:
+                # Check if this might be a summary table without explicit "DETAILS" header
+                # Look for management type + detailed content
+                has_management_keyword = any(keyword in table_header_text for keyword in
+                                             ["MAINTENANCE MANAGEMENT", "MASS MANAGEMENT", "FATIGUE MANAGEMENT"])
+
+                if not has_management_keyword:
+                    self.log_debug(f"  Skipping - no DETAILS column and no management keywords")
+                    continue
+                else:
+                    self.log_debug(f"  No DETAILS column but has management keyword - checking content...")
+
+            # Look for "Std X." patterns in the first column
+            has_standards = False
+            is_detailed_summary = False
+            section_type = None
+            sample_content = []
+
+            for row in data_rows[:3]:  # Check first few rows
+                if not row:
+                    continue
+                first_cell = str(row[0]).strip()
+                if re.match(r"Std\s+\d+\.", first_cell, re.IGNORECASE):
+                    has_standards = True
+                    self.log_debug(f"  Found standard: {first_cell}")
+
+                    # Check all cells in this row for detailed content
+                    for col in range(1, len(row)):
+                        cell_content = str(row[col]).strip()
+                        if len(cell_content) > 50:  # Detailed content is much longer
+                            sample_content.append(cell_content[:200])  # Store sample for analysis
+                            if not re.match(r"^[VNC\s]*$", cell_content):
+                                is_detailed_summary = True
+                                self.log_debug(f"  Found detailed content: {cell_content[:100]}...")
+                                break
+
+            self.log_debug(f"  Has standards: {has_standards}, Is detailed: {is_detailed_summary}")
+
+            if not has_standards:
+                self.log_debug(f"  Skipping - no standards found")
+                continue
+
+            if not is_detailed_summary:
+                self.log_debug(f"  Skipping - not detailed summary (content too short or just V/NC)")
+                continue
+
+            # Identify management type from headers AND content
+            if "MAINTENANCE" in table_header_text:
+                section_type = "Maintenance Management Summary"
+            elif "MASS" in table_header_text:
+                section_type = "Mass Management Summary"
+            elif "FATIGUE" in table_header_text:
+                section_type = "Fatigue Management Summary"
+            else:
+                # Fallback: analyze the actual standard content to determine type
+                combined_content = " ".join(sample_content).lower()
+                self.log_debug(f"  Analyzing content keywords: {combined_content[:200]}...")
+
+                # Identify by content keywords
+                if any(keyword in combined_content for keyword in [
+                    "daily check", "fault", "maintenance", "repair", "service", "workshop"
+                ]):
+                    section_type = "Maintenance Management Summary"
+                    self.log_debug(f"  Identified as Maintenance by content")
+                elif any(keyword in combined_content for keyword in [
+                    "mass", "weight", "verification", "vehicle", "load", "gauge"
+                ]):
+                    section_type = "Mass Management Summary"
+                    self.log_debug(f"  Identified as Mass by content")
+                elif any(keyword in combined_content for keyword in [
+                    "fatigue", "scheduling", "rostering", "duty", "medical", "driver"
+                ]):
+                    section_type = "Fatigue Management Summary"
+                    self.log_debug(f"  Identified as Fatigue by content")
+
+            if not section_type:
+                self.log_debug(f"  Could not determine section type for table with headers: {headers[:3]}")
+                continue
+
+            self.log_debug(f"  ✅ Processing {section_type} table from page {page}")
+
+            # Extract the data from the detailed content
+            standards_found = 0
+            for row in data_rows:
+                if not row:
+                    continue
                 left = str(row[0]) if len(row) >= 1 else ""
-                …
+
+                # Find the details content (longest cell in the row)
+                details_content = ""
+                for col in range(1, len(row)):
+                    cell_content = str(row[col]).strip()
+                    if len(cell_content) > len(details_content):
+                        details_content = cell_content
+
                 left_norm = self.normalize_std_label(left)
-                if left_norm and …
-                    prev = out[…
-                    merged_text = (prev + " " + …
-                    out[…
-                …
+                if left_norm and details_content and len(details_content) > 50:
+                    prev = out[section_type].get(left_norm, "")
+                    merged_text = (prev + " " + details_content).strip() if prev else details_content.strip()
+                    out[section_type][left_norm] = merged_text
+                    standards_found += 1
+                    self.log_debug(f"    Added {left_norm}: {details_content[:100]}...")
+
+            self.log_debug(f"  Extracted {standards_found} standards from {section_type}")
+
+        # Convert to list format as expected by the rest of the code
         for sec in out:
             out[sec] = {k: [_smart_space(v)] for k, v in out[sec].items() if v}
+
+        self.log_debug(f"Summary maps built: {list(out.keys())}")
+        for section_name, data in out.items():
+            if data:
+                self.log_debug(f"  ✅ {section_name}: {len(data)} standards found - {list(data.keys())[:3]}...")
+            else:
+                self.log_debug(f"  ❌ {section_name}: No data found")
+
         return out
 
     # ───────────────────────────── NEW: find cell by label in tables ─────────────────────────────
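The new detection path above rests on two cheap heuristics: a "Std N." prefix in the first column marks a standards row, and any companion cell longer than 50 characters that is not just V/NC tick marks flags the table as a detailed summary rather than a checklist. A minimal standalone sketch of those checks follows; the row text is invented for illustration, and the real method additionally merges results into out[section_type]:

import re

# Hypothetical row shaped like the extractor's table data: the first cell is the
# standard label, later cells hold either tick marks ("V"/"NC") or detail text.
row = ["Std 1. Daily Check",
       "Drivers complete a daily check sheet for each vehicle and record any "
       "faults found before the vehicle leaves the yard."]

first_cell = str(row[0]).strip()
has_standard = bool(re.match(r"Std\s+\d+\.", first_cell, re.IGNORECASE))

# Longest non-label cell, as build_summary_maps does when picking details_content
details = max((str(c).strip() for c in row[1:]), key=len, default="")
is_detailed = len(details) > 50 and not re.match(r"^[VNC\s]*$", details)

print(has_standard, is_detailed)  # True True -> this row would feed the summary map

# Keyword fallback when the headers name no scheme, using the diff's keyword list
if any(k in details.lower() for k in ["daily check", "fault", "maintenance",
                                      "repair", "service", "workshop"]):
    print("classified as Maintenance Management Summary")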
@@ -1269,10 +1386,42 @@ class NHVASMerger:
         if op.get("phone"):
             merged["Operator contact details"]["Operator Telephone Number"] = [_smart_space(op["phone"])]
 
-        # Attendance
+        # Attendance - Modified logic
         if "attendance" in pdf_extracted and "Attendance List (Names and Position Titles)" in merged:
-            …
-            …
+            # Get expected count from DOCX template
+            docx_attendance = merged["Attendance List (Names and Position Titles)"]["Attendance List (Names and Position Titles)"]
+            expected_count = len(docx_attendance) if isinstance(docx_attendance, list) else 1
+
+            # Get PDF attendance (already processed by existing extraction)
+            pdf_attendance_raw = pdf_extracted["attendance"]
+
+            # The PDF might have combined entries like "Name1 - Position1 Name2 - Position2"
+            # Split them properly
+            separated_attendance = []
+            for entry in pdf_attendance_raw:
+                # Handle combined entries like "Grant Pontifex - Manager Jodie Jones - Auditor"
+                if " - " in entry:
+                    # Try to split the combined string into two "Name - Position" pairs
+                    import re
+                    # Look for pattern: Name - Position Name - Position
+                    match = re.search(r'([A-Z][a-z]+\s+[A-Z][a-z]+)\s*-\s*([A-Z][a-z]+)\s+([A-Z][a-z]+\s+[A-Z][a-z]+)\s*-\s*([A-Z][a-z]+)', entry)
+                    if match:
+                        person1 = f"{match.group(1)} - {match.group(2)}"
+                        person2 = f"{match.group(3)} - {match.group(4)}"
+                        separated_attendance.extend([person1, person2])
+                    else:
+                        # If pattern doesn't match, keep as single entry
+                        separated_attendance.append(entry)
+                else:
+                    separated_attendance.append(entry)
+
+            # Limit to expected count
+            final_attendance = separated_attendance[:expected_count]
+
+            self.log_debug(f"Attendance: DOCX expects {expected_count}, PDF has {len(separated_attendance)}, using: {final_attendance}")
+
+            merged["Attendance List (Names and Position Titles)"]["Attendance List (Names and Position Titles)"] = _clean_list(final_attendance)
+
         # Business summary
         if "business_summary" in pdf_extracted and "Nature of the Operators Business (Summary)" in merged:
             merged["Nature of the Operators Business (Summary)"]["Nature of the Operators Business (Summary):"] = [_smart_space(pdf_extracted["business_summary"])]
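For the attendance change, the splitting regex is the load-bearing piece. A quick check of the pattern against the combined entry named in the diff's own comment; note the pattern assumes two-word names and one-word position titles, so a title like "Transport Manager" would not match and the entry would be kept whole by the else branch:

import re

entry = "Grant Pontifex - Manager Jodie Jones - Auditor"  # example from the diff's comment
pattern = (r'([A-Z][a-z]+\s+[A-Z][a-z]+)\s*-\s*([A-Z][a-z]+)'
           r'\s+([A-Z][a-z]+\s+[A-Z][a-z]+)\s*-\s*([A-Z][a-z]+)')

m = re.search(pattern, entry)
if m:
    people = [f"{m.group(1)} - {m.group(2)}", f"{m.group(3)} - {m.group(4)}"]
    print(people)      # ['Grant Pontifex - Manager', 'Jodie Jones - Auditor']
    print(people[:1])  # trimmed to expected_count, as the merge step then does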