Spaces:

HaiderAUT
/

document_comparison

Sleeping

App Files Files Community

HaiderAUT committed on Jun 7, 2025

Commit

2b4f1a9

verified ·

1 Parent(s): d620dcf

Update app.py

Browse files

Files changed (1) hide show

app.py +385 -1035

app.py CHANGED Viewed

@@ -1,1035 +1,385 @@
-###############################################################################
-#  CAA  ⇄  OneReg   rule-level diff viewer   (section format)                 #
-###############################################################################
-import io
-import os
-import re
-import html
-import traceback
-import difflib
-import platform
-import pandas as pd
-from datetime import datetime
-import fitz                         # PyMuPDF (for future OneReg table OCR)
-from PyPDF2 import PdfReader        # plain text extraction
-import gradio as gr                 # UI
-from dotenv import load_dotenv      # optional .env support
-# from google import genai            # uncomment when OCRing OneReg tables
-# from google.genai import type
-# ─────────────────────────────────────────────────────────────────────────────
-# 0.  ENV / API KEY (Gemini – *not* used yet, but wired for future)
-# ─────────────────────────────────────────────────────────────────────────────
-#load_dotenv()
-#API_KEY = os.getenv("GOOGLE_API_KEY", "")  # intentionally blank‑safe
-#client = genai.Client(api_key=api_key)
-# Compiled pattern for a table caption line such as "Table 3." (case-insensitive,
-# leading whitespace allowed).  Not referenced elsewhere in the visible code.
-_table_caption = re.compile(r'^\s*Table\s+\d+\.', re.I)
-# ═════════════════════════════════════════════════════════════════════════════
-# 1. PDF → TEXT
-# ═════════════════════════════════════════════════════════════════════════════
-def extract_pdf_text(pdf_file) -> str:
-    """
-    Extract plain text from a PDF with PyPDF2.
-    Args:
-        pdf_file: whatever ``PdfReader`` accepts as its source argument.
-    Returns:
-        The text of every page joined with newlines; a page for which
-        ``extract_text()`` returns nothing contributes an empty string.
-    """
-    reader = PdfReader(pdf_file)
-    # The former debug print of the complete document text was removed: it
-    # flooded stdout with the whole PDF on every call.
-    return "\n".join(page.extract_text() or "" for page in reader.pages)
-def extract_pdf_word(pdf_file) -> str:
-    """
-    Extract text from a PDF using PyMuPDF (fitz).
-    This is a fallback if PyPDF2 fails to extract text properly.
-    Args:
-        pdf_file: whatever ``fitz.open`` accepts as its source argument.
-    Returns:
-        The stripped text of every page that produced any text, joined with
-        newlines.
-    """
-    # The context manager closes the document even if a page fails to
-    # extract; previously the handle was never released.
-    with fitz.open(pdf_file) as doc:
-        page_texts = [page.get_text("text") for page in doc]
-    return "\n".join(text.strip() for text in page_texts if text)
-def merge_pdf_wrapped_lines(lines):
-    """
-    Merge runs of isolated date fragments ('Feb. 20,'  '2023,'  'a.m.') that
-    the PDF extractor emits on separate lines into single space-joined lines.
-    Every line whose stripped form matches ``date_piece_pat`` is collected
-    (trailing comma removed); all other lines are copied through unchanged.
-    NOTE: a later module-level function with the same name replaces this one
-    at import time, and ``date_piece_pat`` is not defined in the visible part
-    of the file.
-    """
-    merged = []
-    pending = []          # date fragments waiting to be joined
-    for line in lines:
-        piece = line.strip()
-        if date_piece_pat.match(piece):
-            pending.append(piece.rstrip(','))
-            continue
-        # A non-date line ends the current run of fragments.
-        if pending:
-            merged.append(' '.join(pending))
-            pending = []
-        merged.append(line)
-    if pending:           # fragments left over at end of input
-        merged.append(' '.join(pending))
-    return merged
-import re, html
-# --- NEW helper ------------------------------------------------------------
-# Matches a row that begins with a number 1-10, then whitespace, digits and the
-# unit "m" (e.g. "3 1200 m").  _split_flat_table uses it to keep only real rows.
-_ROW_START = re.compile(r"^\s*(?:[1-9]|10)\s+\d+\s*m\b")
-def _split_flat_table(line: str) -> list[str] | None:
-    """
-    Detect a table that was flattened onto one physical line.
-    Returns ``[caption, row, row, ...]`` when *line* starts with a
-    "Table N. ... ." caption and 3 to 12 rows matching ``_ROW_START`` can be
-    split out of the remainder; otherwise returns None.
-    """
-    found = re.match(r"^(Table\s+\d+\.[^.]+?\.)\s+(.*)$", line, re.I)
-    if found is None:
-        return None
-    caption = found.group(1)
-    # Break the remainder before every stand-alone category number 1-10.
-    broken = re.sub(r"\s(?=[1-9]\b|10\b)", "\n", found.group(2))
-    rows = []
-    for candidate in broken.splitlines():
-        if _ROW_START.match(candidate):
-            rows.append(candidate.strip())
-    return [caption, *rows] if 3 <= len(rows) <= 12 else None
-def render_table_html(lines: list[str]) -> str:
-    """
-    Render a caption plus rows as an HTML ``<table>``.
-    Args:
-        lines: ``lines[0]`` is the caption; ``lines[1:]`` are rows whose
-            cells are separated by two or more whitespace characters.
-    Returns:
-        The table markup, or "" for an empty list.  A caption without rows
-        yields a table with an empty header and body instead of raising
-        IndexError.
-    """
-    if not lines:
-        return ""
-    caption = html.escape(lines[0])
-    rows = lines[1:]
-    # The header is one empty cell followed by the cells of the first row.
-    # NOTE(review): that first row is also emitted as a body row below, as in
-    # the original code - confirm whether the duplication is intended.
-    header_cells = [""]
-    if rows:
-        header_cells += [html.escape(c) for c in re.split(r"\s{2,}", rows[0].strip())]
-    out: list[str] = [
-        '<table border="1" cellpadding="4" style="border-collapse:collapse;">',
-        f'<caption>{caption}</caption>',
-        '<thead><tr>' + "".join(f"<th>{c}</th>" for c in header_cells) + '</tr></thead>',
-        '<tbody>',
-    ]
-    for row in rows:
-        cells = re.split(r"\s{2,}", row.strip())
-        out.append('<tr>' + "".join(f"<td>{html.escape(c)}</td>" for c in cells) + '</tr>')
-    out.append('</tbody></table>')
-    return "\n".join(out)
-def inject_tables(text: str) -> str:
-    """
-    Return *text* as HTML: flattened or multi-line tables become ``<table>``
-    markup, every other line is HTML-escaped, and the pieces are joined with
-    ``<br>``.
-    NOTE: a second ``inject_tables`` defined right after this one replaces it
-    at import time.
-    """
-    rendered = []
-    pending = []          # caption + numeric rows of the table being collected
-    collecting = False
-    for ln in text.splitlines():
-        # B) flattened single-line table
-        flat = _split_flat_table(ln)
-        if flat:
-            rendered.append(render_table_html(flat))
-            continue
-        # A) caption line opens a multi-line table
-        if re.match(r"^\s*Table\s+\d+\.", ln, re.I):
-            collecting, pending = True, [ln]
-            continue
-        if collecting:
-            if re.match(r"^\s*\d+(?:\s+\d+)+", ln):
-                pending.append(ln)
-                continue
-            # first non-numeric line closes the table
-            rendered.append(render_table_html(pending))
-            collecting, pending = False, []
-        rendered.append(html.escape(ln))
-    if collecting:
-        rendered.append(render_table_html(pending))
-    return "<br>".join(rendered)
-def inject_tables(text: str) -> str:
-    """
-    Convert table-looking lines of *text* into HTML tables.
-    Two shapes are recognised:
-      * a table flattened onto one line (see ``_split_flat_table``);
-      * a "Table N." caption line followed by purely numeric rows.
-    All other lines are HTML-escaped.  The pieces are joined with ``<br>``.
-    Fixes over the previous version:
-      * a caption that is not followed by any numeric row is emitted as
-        escaped text instead of being handed to ``render_table_html`` with no
-        rows;
-      * a table that is still being collected is flushed before a new caption
-        or a flattened table starts, so it is neither dropped nor reordered.
-    """
-    out: list[str] = []
-    buf: list[str] = []      # caption + rows of the table being collected
-    def flush() -> None:
-        """Emit the pending caption/rows (if any) and reset the buffer."""
-        if not buf:
-            return
-        if len(buf) > 1:
-            out.append(render_table_html(buf))
-        else:
-            out.append(html.escape(buf[0]))   # caption only: not a table
-        buf.clear()
-    for ln in text.splitlines():
-        # B) flattened table in a single physical line
-        split_result = _split_flat_table(ln)
-        if split_result:
-            flush()
-            out.append(render_table_html(split_result))
-            continue
-        # A) caption line opens a multi-line table
-        if re.match(r'^\s*Table\s+\d+\.', ln, re.I):
-            flush()
-            buf.append(ln)
-            continue
-        # numeric row belonging to the open table
-        if buf and re.match(r'^\s*\d+(?:\s+\d+)+', ln):
-            buf.append(ln)
-            continue
-        # anything else closes the open table and is escaped as plain text
-        flush()
-        out.append(html.escape(ln))
-    flush()                  # input ended inside a table
-    return "<br>".join(out)
-def collapse_leading_repeats(line: str) -> str:
-    """
-    Collapse a repeated leading word into one occurrence.
-    If the first word of *line* occurs again (case-insensitively) with at
-    most five words in between, everything up to and including the second
-    occurrence is replaced by the first word plus one space.
-    Fixes over the previous version: the leading-whitespace prefix is now
-    captured (the old code unpacked two values from a one-group match and
-    always raised ValueError), and the distance test is no longer inverted.
-    Returns the line unchanged when there is no leading word, no repeat, or
-    the repeat is more than five words away.
-    """
-    m = re.match(r'^(\s*)(\w+)\b', line)
-    if not m:
-        return line
-    prefix, first_word = m.groups()
-    pat = rf'\b{re.escape(first_word)}\b'
-    matches = list(re.finditer(pat, line, flags=re.IGNORECASE))
-    if len(matches) < 2:
-        return line
-    between = line[matches[0].end():matches[1].start()]
-    if len(re.findall(r'\w+', between)) > 5:   # too far apart to be a repeat
-        return line
-    rest = line[matches[1].end():]             # text after the second occurrence
-    return f"{prefix}{first_word} {rest.strip()}"
-def strip_stray_numbers(line: str) -> str:
-    """
-    Remove inline "55(a)", "12(ii)", etc., UNLESS they are part of a genuine
-    rule / section / paragraph citation such as "rule 139.21(b)".
-    A candidate is a run of digits followed by a parenthesised group.  It is
-    kept when the text before it ends with "rule(s)", "paragraph(s)" or
-    "section(s)", optionally followed by a dotted number.  Dotted numbers
-    such as "139.21(b)" are never candidates.
-    Fix: the old pattern still matched the minor part ("21(b)") of a dotted
-    number, so "rule 139.21(b)" was mangled to "rule 139.".
-    """
-    def _repl(m: re.Match) -> str:
-        # Keep the match when the preceding text ends with a citation keyword.
-        prefix = line[:m.start()]
-        if re.search(
-            r'\b(?:rules?|paragraphs?|sections?)'
-            r'(?:\s+\d+\.\d+)?\s*$',
-            prefix,
-            flags=re.IGNORECASE,
-        ):
-            return m.group(0)     # preserve it
-        return ''                 # otherwise drop it
-    # Digits followed by (...); the lookbehind rejects digits glued to a word
-    # character or a dot, so neither half of "139.21(b)" can match.
-    pattern = r'(?<![\w.])\d+\s*\([^)]*\)'
-    return re.sub(pattern, _repl, line)
-def merge_pdf_wrapped_lines(raw_text: str) -> list[str]:
-    """
-    Undo hard line wraps introduced by the PDF extractor.
-    Each stripped line is glued onto the previous output line when:
-      * the previous line ends with "rule <digits>." and this one starts
-        with a digit (joined without a space);
-      * the previous line ends with a lower-case letter and this one starts
-        with "(" or a lower-case letter;
-      * this line starts with a dotted number and the previous line ends
-        with "and" / "or" (case-sensitive) or "rule" / "may" (any case).
-    Otherwise the stripped line starts a new output line.
-    """
-    merged = []
-    for physical in raw_text.splitlines():
-        piece = physical.strip()
-        if not merged:
-            merged.append(piece)
-            continue
-        prev = merged[-1]
-        starts_with_rule_no = re.match(r'^\d+\.\d+', piece) is not None
-        if re.search(r'\brule\s+\d+\.$', prev, re.I) and re.match(r'^\d', piece):
-            # "rule 139." + "21(b)": the dot is already there, no space
-            merged[-1] = prev + piece
-        elif re.search(r'[a-z]$', prev) and re.match(r'^[\(a-z]', piece):
-            merged[-1] = f"{prev} {piece}"
-        elif starts_with_rule_no and (
-            re.search(r'\b(?:and|or)$', prev)
-            or re.search(r'\b(?:rule|may)$', prev, re.I)
-        ):
-            merged[-1] = f"{prev} {piece}"
-        else:
-            merged.append(piece)
-    return merged
-# Disabled earlier variant of merge_pdf_wrapped_lines, kept as a bare string
-# literal; it is never executed.
-"""
-def merge_pdf_wrapped_lines(raw_text: str) -> list[str]:
-    Glue back any PDF-wrapped continuation lines so that mid-sentence
-    breaks (e.g. after 'and', 'or', or before a '(2)') get reattached.
-    merged = []
-    for ln in raw_text.splitlines():
-        ln_stripped = ln.strip()
-        if merged:
-            prev = merged[-1]
-            # 1) if prev ends in a lowercase letter (no punctuation),
-            #    and this line starts with '(' or lowercase, glue it:
-            if re.search(r'[a-z]$', prev) and re.match(r'^[\(a-z]', ln_stripped):
-                merged[-1] = prev + ' ' + ln_stripped
-                continue
-            # 2) keep your old rule‐merge logic too:
-            if ( re.search(r'\b(?:rule|may)$', prev, re.IGNORECASE )
-                 and re.match(r'^\d+\.\d+', ln_stripped) ):
-                merged[-1] = prev + ' ' + ln_stripped
-                continue
-        merged.append(ln_stripped)
-    return merged
-"""
-# ═════════════════════════════════════════════════════════════════════════════
-# 2. Helpers to drop OneReg auto IDs & inline rule numbers
-# ═════════════════════════════════════════════════════════════════════════════
-def zap_auto_outline_ids(s: str) -> str:
-    """
-    Delete outline IDs made of three or more "digits." groups (for example
-    "11.3.5. "), together with any whitespace that follows them.
-    """
-    outline_id = re.compile(r'\b(?:\d+\.){3,}\s*')
-    return outline_id.sub('', s)
-def collapse_inner_parens(line: str) -> str:
-    """
-    Drop all but the last "(...)" group of a heading line.
-    - If the line contains any of the words rule, section(s), paragraph(s),
-      under or and (case-insensitive), it is returned unchanged so that
-      citations keep all their brackets.
-    - Otherwise, if there are 2+ non-nested (...) groups, every group except
-      the last is removed; surrounding whitespace is left as is.
-    Fix: groups are now removed by position.  The old ``str.replace`` also
-    deleted the final group whenever an earlier group had the same text
-    (e.g. "(a) ... (a)").
-    """
-    # 1) rule/section/paragraph references are left alone
-    if re.search(r'\b(?:rule|section|sections|paragraph|paragraphs|under|and)\b',
-                 line,
-                 flags=re.IGNORECASE):
-        return line
-    # 2) locate every innermost paren group
-    spans = [m.span() for m in re.finditer(r'\([^()]*\)', line)]
-    if len(spans) <= 1:
-        return line
-    # 3) rebuild the line without the spans of all groups but the last
-    kept = []
-    pos = 0
-    for start, end in spans[:-1]:
-        kept.append(line[pos:start])
-        pos = end
-    kept.append(line[pos:])
-    return "".join(kept)
-def zap_inline_rule_numbers(s: str) -> str:
-    """
-    Remove dotted rule numbers (optionally with a letter suffix) that stand
-    directly before an opening parenthesis, e.g. "139.21 (a) ..." becomes
-    "(a) ...".
-    Numbers preceded by "rule " or "rules " are citations and are kept.
-    Fix: the lookbehind previously guarded only "rules ", so the singular
-    "rule 139.21(b)" lost its number, contrary to the stated intent.
-    """
-    # Two lookbehinds because Python requires each one to be fixed-width.
-    return re.sub(
-        r'(?<!\brule\s)(?<!\brules\s)\b\d+\.\d+(?:[A-Z]?)\s*(?=\()',
-        '',
-        s,
-        flags=re.IGNORECASE
-    )
-def strip_inline_self_ref(text_line: str, tail: str) -> str:
-    """
-    Remove every occurrence of *tail* that is followed by one or more
-    "(...)" groups, repeating until the line stops changing.
-    With an empty *tail* the line is returned untouched; otherwise the result
-    is always whitespace-stripped, even when nothing matched.
-    """
-    if not tail:
-        return text_line
-    self_ref = re.compile(rf'\b{re.escape(tail)}(?:\s*\([^)]+\))+')
-    while True:
-        reduced = self_ref.sub('', text_line).strip()
-        if reduced == text_line:      # fixed point reached
-            return reduced
-        text_line = reduced
-def drop_leading_repeated_title(line: str, title: str) -> str:
-    """
-    Collapse a title repeated two or more times at the start of *line*
-    (case-insensitive, optional whitespace between copies) into a single
-    "<title> " prefix.
-    With an empty *title* the line is returned untouched; otherwise the
-    result is whitespace-stripped, whether or not anything was collapsed.
-    Fix: the replacement is supplied through a function so that backslashes
-    in *title* are inserted literally instead of being parsed as a regex
-    replacement template (which raised or mis-expanded).
-    """
-    if not title:
-        return line
-    pat = rf'^(?:{re.escape(title)}\s*){{2,}}'
-    return re.sub(pat, lambda _m: f'{title} ', line, flags=re.IGNORECASE).strip()
-# Whole-line match for “Appendix A — Title”: group 1 is the appendix letter,
-# group "title" the text after the dash (en dash, em dash or hyphen).
-appendix_pat = re.compile(
-  r'^(?:Appendix)\s+([A-Z])\s*[–—-]\s*(?P<title>.+)$',
-    re.IGNORECASE
-)
-# "Page 3 / 12" style page markers.
-page_pat  = re.compile(r'Page\s+\d+\s*/\s*\d+', re.IGNORECASE)
-# Month name (abbreviated or full) + day + year, optionally followed by
-# ", a.m." / ", p.m.".
-date_pat  = re.compile(
-    r'(?:Jan\.?|Feb\.?|Mar\.?|Apr\.?|May\.?|Jun\.?|Jul\.?|Aug\.?|'
-    r'Sep\.?|Sept\.?|Oct\.?|Nov\.?|Dec\.?|January|February|March|April|May|'
-    r'June|July|August|September|October|November|December)'
-    r'\s+\d{1,2},\s*\d{4}(?:,\s*(?:a\.?m\.?|p\.?m\.?))?',
-    re.IGNORECASE
-)
-# Used by clean_line on OneReg lines: a line that consists only of date/time
-# stamps such as "Feb. 20, 2023, a.m.", optionally preceded by "Purpose" and
-# optionally repeated after "(a)"-style markers.
-header_pat = re.compile(
-    r'^(?:Purpose\s+)?'                    # optional "Purpose"
-    r'(?:[A-Z][a-z]{2}\.)\s+\d{1,2},\s*\d{4},\s*(?:a\.?m\.?|p\.?m\.?)'  # "Feb. 20, 2023, a.m."
-    r'(?:\s*\([a-z]\)\s*[A-Z][a-z]{2}\.\s+\d{1,2},\s*\d{4},\s*(?:a\.?m\.?|p\.?m\.?))*$',  # repeats
-    re.IGNORECASE
-)
-# Building blocks for orphan_date_pat (case-sensitive, unlike the patterns above).
-MONTH = r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\.?'
-TIME  = r'(?:a\.?m\.?|p\.?m\.?)'
-YEAR  = r'\d{4}'
-# A line that is nothing but one date fragment: a month abbreviation, an
-# a.m./p.m. marker, a four-digit year, or a 1-2 digit day with optional comma.
-orphan_date_pat = re.compile(
-    rf'^(?:{MONTH}|{TIME}|{YEAR}|\d{{1,2}},?)$'
-)
-def clean_line(line: str, source: str) -> str:
-    """
-    Strip boilerplate from one line of extracted PDF text.
-    Removes page markers, e-mail addresses, clock times, "Exported:" tails,
-    dates, "Table X-1" references and dotted rule numbers that stand directly
-    before a parenthesis (unless preceded by "rule ").
-    Args:
-        line: one line of extracted text.
-        source: "onereg" enables outline-ID removal and date-header
-            dropping; "caa" removes en/em dashes and the CAA page header.
-    Returns:
-        The cleaned line with runs of whitespace collapsed, or "" when the
-        line is pure boilerplate.
-    Fix: an orphan date fragment used to hit a bare ``return`` and yield
-    None despite the ``-> str`` contract; it now returns "".
-    """
-    # --- Apply OneReg prefix removal FIRST if applicable ---
-    if source == "onereg":
-        line = zap_auto_outline_ids(line) # Remove 11.3.5. etc. prefixes
-        if header_pat.match(line):
-            return ""
-    if source == "caa":
-        line = line.replace('—', '')
-        line = line.replace('–', '')
-    # --- Continue with other cleaning ---
-    line = page_pat.sub('', line)
-    if orphan_date_pat.match(line):
-        return ""
-    if source == "caa" and ("Civil Aviation Rules" in line or "CAA of NZ" in line):
-        # Running page header / footer of the CAA consolidation
-        line = re.sub(
-            r'Civil Aviation Rules\s+Part\s+\d+\s+CAA Consolidation', '', line
-        )
-        line = re.sub(
-            r'^\d{1,2}\s+[A-Za-z]+\s+\d{4}\s*\d*\s*CAA of NZ', '', line
-        ) # Made middle number optional
-        line = re.sub(r'\s{2,}', ' ', line).strip()
-        if not line:
-            return ""
-    if re.search(r'\.{4,}\s*\d+\s*$', line):
-        return "" # Dot leader followed by a page number (table of contents)
-    if page_pat.fullmatch(line.strip()):
-        return ""
-    line = re.sub(r'\S+@\S+', '', line)
-    line = re.sub(r'\b\d{1,2}:\d{2}(?:\s*(?:a\.?m\.?|p\.?m\.?))?', '', line, flags=re.IGNORECASE)
-    line = re.sub(r'Exported:.*$', '', line)
-    line = date_pat.sub('', line)
-    # --- Inline rule number zapping (careful not to zap appendix numbers like A.1) ---
-    # Normalise "rule   139.5" to exactly one space so the lookbehind below works.
-    line = re.sub(r'\b(rule)\s+(\d+\.\d+)', r'\1 \2', line, flags=re.IGNORECASE)
-    # Only zap digit.digit patterns, not Letter.digit
-    line = re.sub(
-         r'(?<!\brule\s)(?<!^[A-Z]\.)\b\d+\.\d+(?:[A-Z]?)\s*(?=\()', # negative lookbehind for Letter.
-        '',
-        line,
-        flags=re.IGNORECASE
-    )
-    # Zap table references like Table B-1
-    line = re.sub(r'\bTable\s+[A-Z]-\d+\b', '', line, flags=re.IGNORECASE)
-    line = page_pat.sub('', line) # Redundant page check just in case
-    return re.sub(r'\s{2,}', ' ', line).strip()
-# ═════════════════════════════════════════════════════════════════════════════
-# 4. CAPTURE ONLY TOP-LEVEL RULES
-# ═════════════════════════════════════════════════════════════════════════════
-#rule_pat_1 = re.compile(r'^(?:\d+\.)*\s*(?P<rule>\d+\.\d+)\s*(?P<title>[A-Z].*)$') #works without A,B,C subparts
-# Heading patterns used by parse_rules, which tries them in the order
-# appendix item -> subpart -> rule.
-# Appendix items such as "A.1 Title" or "C.2.1 (a) Title":
-#   group 1 = appendix letter, group 2 = dotted numbering,
-#   group 3 = optional parenthetical content, group "title" = heading text.
-appendix_item_pat = re.compile(
-    r'^\s*([A-Z])\.(\d+(?:\.\d+)*)'  # Start of line, Letter.Number(s) e.g., "A.1", "C.2.1"
-    r'(?:\s*\(([^)]+)\))?'           # Optional captured parenthetical part e.g., "(a)" or "(1)"
-    r'\s+'                           # Space separator REQUIRED before title
-    r'(?P<title>[A-Za-z0-9].*)$',    # Capture title (must start alphanumeric)
-    re.IGNORECASE
-)
-# Rule headings such as "139.5 Title" or "139.5A (a) ...".
-# NOTE(review): the "parens" group uses the lazy quantifier *? and is followed
-# by \s* and a title of .*, so the regex engine can always succeed with
-# "parens" empty and any "(a)" groups end up at the start of "title".
-rule_pat = re.compile(
-    r'^(?:(?:\d+\.){2,}\s*)?'           # Optional & Non-capturing: OneReg outline prefix
-    r'(?P<base_rule>\d+\.\d+(?:[A-Z]?))'
-    r'(?P<parens>(?:\s*\([^)]+\))*?)'
-    r'\s*'
-    r'(?P<title>.*)$',
-    re.IGNORECASE
-)
-# Subpart headings such as "3. Subpart B — Title": group 1 = one- or two-letter
-# subpart code, group 2 = title.  The separator may be an em dash or a hyphen.
-subpart_pat = re.compile(
-    r'^\s*'
-    r'\d+\.\s*' # Still expect number for subpart heading based on earlier examples
-    r'Subpart\s+'
-    r'([A-Z]{1,2})\s*'
-    r'[—-]\s*'
-    r'(.+)$',
-    re.IGNORECASE
-)
-# Debug leftover: prints the compiled pattern when the module is imported.
-print(rule_pat)
-def natural_key(r):
-    """
-    Sort key for rule ids such as "139.5", "139.5A", "139.10".
-    Returns ``(number, letter)`` taken from the part after the first dot, so
-    that 139.5 < 139.5A < 139.10.  If that part is not "digits + optional
-    capital letter" the key is ``(0, <that part>)``.
-    Fix: an id without a dot used to raise IndexError; it now yields
-    ``(0, r)``.
-    """
-    _, dot, minor = r.partition('.')
-    if not dot:
-        return (0, r)
-    m = re.match(r"^(\d+)([A-Z]?)$", minor)
-    if m:
-        return (int(m.group(1)), m.group(2))   # (5, "") or (5, "A")
-    return (0, minor)
-# Inside parse_rules function loop
-import re
-def collapse_repeated_first_word(line: str) -> str:
-    """
-    Remove a near-by repetition of the line's first word.
-    When the first word occurs again (case-insensitive, whole word) with no
-    more than five words in between, the text after the first word up to and
-    including that second occurrence is cut out.  Leading indentation and
-    everything after the second occurrence are kept verbatim.
-    """
-    lead = re.match(r'^(\s*)(\w+)\b', line)
-    if lead is None:
-        return line
-    indent = lead.group(1)
-    word = lead.group(2)
-    finder = re.compile(rf'\b{re.escape(word)}\b', flags=re.IGNORECASE)
-    hits = [hit.span() for hit in finder.finditer(line)]
-    if len(hits) < 2:
-        return line
-    (_, first_stop), (second_go, second_stop) = hits[0], hits[1]
-    gap_words = re.findall(r'\b\w+\b', line[first_stop:second_go])
-    if len(gap_words) > 5:
-        return line
-    # indent + first word + everything after the second occurrence
-    return indent + word + line[second_stop:]
-def parse_rules(text: str, source: str) -> dict[str, str]:
-    """
-    Split document text into sections keyed by heading identifier.
-    Every line is passed through ``clean_line``.  A line matching
-    ``appendix_item_pat``, ``subpart_pat`` or ``rule_pat`` (tried in that
-    order) opens a new section; any other non-empty line is appended to the
-    section that is currently open.
-    Args:
-        text: newline-separated document text.
-        source: "onereg" or "caa"; forwarded to ``clean_line``.  OneReg lines
-            are additionally passed through ``collapse_inner_parens``.
-    Returns:
-        Mapping of key -> section text (pieces joined with single spaces).
-        Keys look like "A.1" / "A.1(a)" for appendix items, "subpart-B" for
-        subparts, and the rule number plus any captured "(x)" groups for
-        rules.
-    Fixes over the previous version:
-      * appendix keys with a parenthetical were built as "A.1.(a)" (parts
-        joined with "."), not "A.1(a)" as documented;
-      * the "potential missed heading" check in the continuation branch could
-        never fire (a matching line is consumed by the appendix branch first)
-        and was removed together with the locals it alone used.
-    """
-    rules: dict[str, list[str]] = {}
-    current = None   # key of the section currently being filled
-    title = ""       # heading of that section, used to drop repeated titles
-    for raw in text.splitlines():
-        line = clean_line(raw, source)
-        print(f"DEBUG: Checking cleaned line: '{line}'")
-        if not line:
-            continue
-        if source == "onereg":
-            line = collapse_inner_parens(line)
-        # --- Check Order: Appendix Item -> Subpart -> Rule ---
-        # 1. Appendix Item (e.g., "A.1 Title" or "C.2.1 (a) Title")
-        if m_ap_item := appendix_item_pat.match(line):
-            letter = m_ap_item.group(1).upper()
-            numbering = m_ap_item.group(2)
-            paren = m_ap_item.group(3)   # None when there is no "(x)" part
-            # Key: A.1, A.1(a), C.2.1(1) - the parenthetical is appended
-            # directly, without a separating dot.
-            key = f"{letter}.{numbering}"
-            if paren:
-                key += f"({paren.strip()})"
-            current = key
-            title = m_ap_item.group('title').strip()
-            rules.setdefault(current, []).append(title)
-            print(f"Matched Appendix Item: {key} => '{title}'")
-            continue
-        # 2. Subpart Heading
-        if m_sp := subpart_pat.match(line):
-            code = m_sp.group(1).upper()
-            subpart_title = m_sp.group(2).strip()
-            heading = f"Subpart {code} — {subpart_title}"
-            key = f"subpart-{code}"
-            current = key
-            title = heading
-            rules.setdefault(current, []).append(heading)
-            print(f"Matched Subpart: {key} => {heading}")
-            continue
-        # 3. Main Rule Heading
-        if m_rule := rule_pat.match(line):
-            base = m_rule.group('base_rule')
-            parens_str = m_rule.group('parens') or ""
-            title_text = m_rule.group('title').strip()
-            paren_parts = re.findall(r'\(([^)]+)\)', parens_str)
-            key = base + "".join(f"({p.strip()})" for p in paren_parts) # e.g. 139.555(e)(1)
-            # A missing, very short, or "(a)"/"1."-style title is treated as
-            # "number only": the section is opened but no text is stored.
-            is_likely_heading_only = not title_text or \
-                                     re.match(r'^[\[\(]?[a-zA-Z0-9][\)\]\.]', title_text) or \
-                                     len(title_text) < 5
-            current = key
-            title = title_text
-            if not is_likely_heading_only and title_text:
-                rules.setdefault(current, []).append(title_text)
-                print(f"Matched Rule + Title: {key} => '{title_text}'")
-            else:
-                rules.setdefault(current, [])
-                print(f"Matched Rule Heading: {key}")
-            continue
-        # 4. Continuation lines
-        if current:
-            line = drop_leading_repeated_title(line, title)
-            if line:
-                rules[current].append(line)
-        else:
-            print(f"DEBUG: Unmatched line (no current rule): '{line}'")
-    return {k: " ".join(v).strip() for k, v in rules.items()}
-# ═════════════════════════════════════════════════════════════════════════════
-# 5. STRING DIFF  (OneReg deletions / modified insertions)
-# ═════════════════════════════════════════════════════════════════════════════
-def diff_cols(one: str, caa: str) -> tuple[str, str]:
-    """
-    Build two HTML fragments for a side-by-side character diff.
-    Args:
-        one: OneReg text (left column).
-        caa: CAA text (right column).
-    Returns:
-        ``(left, right)``.  Text common to both appears in both fragments;
-        text only in *one* is wrapped in a red-background span in ``left``;
-        text only in *caa* is wrapped in a green-background span in
-        ``right``.
-    Fix: every segment is now passed through ``html.escape`` (as
-    ``diff_unified`` already does), so "<", ">" and "&" in the documents can
-    no longer break or inject markup.
-    """
-    sm = difflib.SequenceMatcher(None, one, caa)
-    left: list[str] = []
-    right: list[str] = []
-    for tag, i1, i2, j1, j2 in sm.get_opcodes():
-        old = html.escape(one[i1:i2])
-        new = html.escape(caa[j1:j2])
-        if tag == "equal":
-            left.append(old)
-            right.append(old)
-            continue
-        # "replace" contributes to both columns
-        if tag in ("delete", "replace"):
-            left.append(f"<span style='background:#f8d4d4'>{old}</span>")
-        if tag in ("insert", "replace"):
-            right.append(f"<span style='background:#d4f8d4'>{new}</span>")
-    return "".join(left), "".join(right)
-# ═════════════════════════════════════════════════════════════════════════════
-# 5. STRING DIFF (Unified Inline View)
-# ═════════════════════════════════════════════════════════════════════════════
-def diff_unified(one: str, caa: str) -> str:
-    """
-    Produce one HTML string with the character diff shown inline.
-    Text only in *one* is wrapped in a red, struck-through ``<del>``; text
-    only in *caa* in a green ``<ins>``; a replacement yields the ``<del>``
-    followed by the ``<ins>``.  All text is HTML-escaped, and the result is
-    wrapped in a ``<span>`` that preserves whitespace and takes its colour
-    from the ``--text`` CSS variable.
-    """
-    del_open = "<del style='background:#fdd; text-decoration: line-through; color: #000;'>"
-    ins_open = "<ins style='background:#dfd; text-decoration: none; color: #000;'>"
-    pieces = []
-    matcher = difflib.SequenceMatcher(None, one, caa)
-    for op, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
-        removed = html.escape(one[a_lo:a_hi])
-        added = html.escape(caa[b_lo:b_hi])
-        if op == "equal":
-            pieces.append(removed)
-            continue
-        if op in ("delete", "replace"):
-            pieces.append(f"{del_open}{removed}</del>")
-        if op in ("insert", "replace"):
-            pieces.append(f"{ins_open}{added}</ins>")
-    body = "".join(pieces)
-    return f"<span style='white-space: pre-wrap; color: var(--text);'>{body}</span>"
-# Near-duplicate of natural_key; differs only in that unparsable ids sort last.
-def natural_sort_key(rule_id: str):
-    """
-    Sort key for rule ids such as "139.5", "139.5A", "139.10".
-    Returns ``(number, letter)`` taken from the part after the first dot,
-    e.g. (5, ""), (5, "A"), (10, "").  Anything that does not fit sorts last
-    via ``(inf, <text>)``.
-    Fix: an id without a dot used to raise IndexError; it now sorts last as
-    ``(inf, rule_id)``.
-    """
-    _, dot, minor = rule_id.partition('.')       # "5", "5A", "10"
-    if not dot:
-        return (float('inf'), rule_id)
-    m = re.match(r'^(\d+)([A-Z]?)$', minor)      # digits + optional letter
-    if m:
-        return (int(m.group(1)), m.group(2))
-    # fallback: put anything weird at the end
-    return (float('inf'), minor)
-# ═════════════════════════════════════════════════════════════════════════════
-# 6. SORTING KEY (Updated for Rule -> Appendix Order)
-# ═════════════════════════════════════════════════════════════════════════════
-def combined_sort_key(key: str):
-    """
-    Sort key that orders section keys as: subparts, main rules, appendix
-    items, then anything unrecognised.
-      1. Subparts ("subpart-A"): by code length, then code, so
-         subpart-A < subpart-B < subpart-AA.
-      2. Main rules ("139.5", "139.5A", "139.5(e)(1)"): by part number, minor
-         number, letter suffix, then each parenthesised sub-part.
-      3. Appendix items ("A.1", "A.1(a)", "B.2.1"): by letter, then each
-         dotted / parenthesised sub-part.
-    Within 2 and 3 every sub-part is encoded as a ``(kind, value)`` pair:
-    numbers first, single letters second, any other text last.
-    Fix: appendix sub-parts used to be stored untagged, so an int could meet
-    a str at the same tuple position (e.g. "(a)" vs "(ii)") and sorting
-    raised TypeError.  They now use the same tagged encoding as rules.
-    ``isdecimal`` replaces ``isdigit`` so that ``int()`` cannot fail, which
-    made the former catch-all ``try``/``except`` unnecessary.
-    """
-    def _token(part: str) -> tuple:
-        """Tag one sub-part so different kinds are never compared directly."""
-        part = part.strip()
-        if part.isdecimal():
-            return (1, int(part))
-        if len(part) == 1 and part.isalpha():
-            return (2, ord(part.lower()))
-        return (3, part.lower())
-    # 1. Subparts
-    if key.startswith("subpart-"):
-        code = key.split('-', 1)[1]
-        return (1, len(code), code)
-    # 2. Main rules; a key that merely starts like a rule number but has
-    #    trailing junk falls through to the fallback below.
-    rule = re.match(r'^(\d+)\.(\d+)([A-Z]?)((?:\([^)]+\))*)$', key)
-    if rule:
-        part_num, minor_num, minor_letter, parens_str = rule.groups()
-        result = (2, int(part_num), int(minor_num), minor_letter)
-        for p in re.findall(r'\(([^)]+)\)', parens_str):
-            result += _token(p)
-        return result
-    # 3. Appendix items
-    if re.match(r'^[A-Z]\.', key):
-        parts = [p for p in re.split(r'[.()]', key) if p]
-        result = (3, parts[0])
-        for part in parts[1:]:
-            result += _token(part)
-        return result
-    # Fallback: unknown keys go last
-    return (float('inf'), key)
-# "Feb. 20, 2023" with an optional a.m./p.m. marker (case-insensitive).
-# Not referenced in the visible part of the file.
-DATE_HDR_RE = re.compile(
-    r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\.\s+\d{1,2},\s+\d{4}'
-    r'(?:\s+[ap]\.?m\.?)?',       # optional a.m./p.m.
-    re.I,
-)
-# --------------------------------------------------------------------
-# Patterns used by merge_and_clean
-# --------------------------------------------------------------------
-# Abbreviated month; the trailing dot is mandatory.
-MONTH_RE = r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\.'
-DATE_RE  = re.compile(
-    rf'\b{MONTH_RE}\s+\d{{1,2}},\s+\d{{4}}'
-    r'(?:,\s*)?(?:[ap]\.?m\.?)?',      # optional “, a.m.” / “, p.m.”
-    re.I
-)
-# Stray leading “Fe” that’s left behind when “Feb.” or “Fe\nb.” is split
-# NOTE(review): this matches "Fe" at the start of ANY word followed by a
-# lower-case letter, so ordinary words such as "Federal" would also lose
-# their first two letters - confirm that this is acceptable.
-FE_CRUMB_RE = re.compile(r'\bFe(?=[a-z])')      # Fevery, Feaccompanied …
-def merge_and_clean(raw: str) -> str:
-    """
-    Flatten *raw* to a single line and remove date noise.
-    Line breaks become spaces, every ``DATE_RE`` and ``FE_CRUMB_RE`` match is
-    deleted, runs of whitespace left behind are collapsed to one space and
-    the result is stripped.
-    """
-    flattened = ' '.join(raw.splitlines())
-    # full dates first, then the 'Fe' crumbs left by badly wrapped dates
-    for junk in (DATE_RE, FE_CRUMB_RE):
-        flattened = junk.sub('', flattened)
-    return re.sub(r'\s{2,}', ' ', flattened).strip()
-def strip_trailing_duplicate_heading(s: str) -> str:
-    """
-    If a line starts with a heading (up to the first '(' or end-of-line)
-    and that identical heading is repeated at the very end, remove the
-    trailing copy.
-    >>> strip_trailing_duplicate_heading(
-    ...   "Changes to certificate holder's organisation (a) … (e) Changes to certificate holder's organisation"
-    ... )
-    "Changes to certificate holder's organisation (a) … (e)"
-    Fixes over the previous version:
-      * a line consisting only of the heading (no separate trailing copy) is
-        returned unchanged instead of being erased;
-      * trailing whitespace no longer shifts the slice, so the copy is
-        removed correctly from "Title (a) x Title  " as well.
-    """
-    # 1) grab the prefix heading (everything before the first '(' or EOL)
-    head = s.split('(', 1)[0].strip()
-    if not head:
-        return s
-    # 2) does the text *end* with that heading, in a copy that does not
-    #    overlap the leading one?
-    body = s.rstrip()
-    cut = len(body) - len(head)
-    lead_end = (len(s) - len(s.lstrip())) + len(head)
-    if body.endswith(head) and cut >= lead_end:
-        # slice it off and tidy spaces
-        return body[:cut].rstrip()
-    return s
-# ═════════════════════════════════════════════════════════════════════════════
-# 6. MAIN COMPARISON FUNCTION
-# ═════════════════════════════════════════════════════════════════════════════
-def compare_regulations_app(part, onereg_pdf, caa_pdf):
-    """
-    Build the rule-by-rule HTML comparison of a OneReg PDF and a CAA PDF.
-    Args:
-        part: Part number prefix (e.g. "139"); only rule ids starting with
-            "<part>.", "subpart-" or a capital letter followed by "." are shown.
-        onereg_pdf: OneReg PDF, passed to extract_pdf_word.
-        caa_pdf: CAA PDF, passed to extract_pdf_text.
-    Returns:
-        A ``(html, dataframe)`` pair: the rendered comparison and a two-column
-        ("Rule", "comment") DataFrame with one blank-comment row per rule.
-        On failure the same pair shape is returned, with an error panel as
-        the HTML and an empty DataFrame, so both Gradio outputs are filled.
-    """
-    try:
-        raw_one = extract_pdf_word(onereg_pdf)
-        raw_caa = extract_pdf_text(caa_pdf)
-        # Debug trace for two specific rules in the raw CAA text.
-        for ln in raw_caa.splitlines():
-            if any(tok in ln for tok in ("139.61", "139.63")):
-                print("[RAW]", ln[:120])
-        lines_one = merge_pdf_wrapped_lines(raw_one)
-        lines_caa = merge_pdf_wrapped_lines(raw_caa)
-        text_one = "\n".join(lines_one)
-        text_caa = "\n".join(lines_caa)
-        one = parse_rules(text_one, "onereg")
-        caa = parse_rules(text_caa, "caa")
-        # Union of rule ids found in either document.
-        all_ids = set(one) | set(caa)
-        rules = [
-            r for r in all_ids
-            if r.startswith(f"{part}.")  # Main rules like 139.5
-               or r.startswith("subpart-")  # Subpart headings like subpart-A
-               or re.match(r'^[A-Z]\.', r)  # Appendix items like A.1, B.2(a)
-        ]
-        print(rules)
-        rules.sort(key=combined_sort_key)
-        print(rules)
-        sections = []
-        df_rows = []
-        for rule in rules:
-            o = one.get(rule, "")
-            # Skip entries whose OneReg text is only a page/date header.
-            if header_pat.match(o) or DATE_HDR_RE.fullmatch(o.strip()):
-                continue
-            o = merge_and_clean(o)
-            o = collapse_repeated_first_word(o)
-            o = strip_trailing_duplicate_heading(o)  # Remove trailing duplicate headings
-            print(o)
-            c = caa.get(rule, "")
-            c_text_tables = inject_tables(c)
-            unified_diff_html = diff_unified(o, c)
-            # The rule id lands in attributes and text; escape it so an odd
-            # id parsed from a PDF cannot break the markup.
-            rule_html = html.escape(rule, quote=True)
-            sections.append(f"""
-                       <div class="rule-section">
-                         <input type="checkbox" id="chk_{rule_html}" name="rule" value="{rule_html}">
-                         <label for="chk_{rule_html}" class="rule-label">{rule_html}</label>
-                         <div class="rule-content">
-                           <strong>Unified Diff (OneReg <del style='background:#fdd;text-decoration:line-through;'>deletions</del> / CAA <ins style='background:#dfd;text-decoration:none;'>additions</ins>)</strong><br>
-                           {unified_diff_html}
-                           <br><br>
-                           {
-            f'''<strong>CAA (Cleaned + Tables)</strong><br>
-                                  {c_text_tables}'''
-            }
-                         </div>
-                       </div>
-                       <hr>
-                       """)
-            df_rows.append([rule, ""])
-        style = """
-        <style>
-        /* ───────────  colour tokens  ─────────── */
-        :root{
-            --bg:                 #ffffff;
-            --text:               #000000;
-            --border:             #cccccc;
-            --rule-label-on:      #ff8a80;  /* light green */
-            --rule-content-on:    #e8f5e9;
-        }
-        @media (prefers-color-scheme: dark){
-            :root{
-                --bg:             #121212;
-                --text:           #e0e0e0;
-                --border:         #444444;
-                --rule-label-on:  #ff8a80;;  /* dark-mode green */
-                --rule-content-on:#1b5e20;
-            }
-        }
-        /* ───────────  global  ─────────── */
-        body{
-            background: var(--bg);
-            color:       var(--text);
-            font-family: Arial, Helvetica, sans-serif;
-            font-size:   .9em;
-        }
-        span{white-space:pre-wrap}
-        hr{
-            border:none;
-            border-top:1px solid var(--border);
-            margin:1.2em 0;
-        }
-        /* ───────────  diff-viewer widgets  ─────────── */
-        .rule-section{
-            padding:.5em;
-            transition:background .2s;
-        }
-        .rule-label{
-            font-weight:bold;
-            margin-left:.5em;
-            padding:.2em .4em;
-            border-radius:4px;
-            cursor:pointer;
-        }
-        .rule-content{
-            margin-left:2em;
-            padding:.5em;
-            border-radius:4px;
-        }
-        /* checked highlights */
-        .rule-section input[type=checkbox]:checked + .rule-label{
-            background:var(--rule-label-on);
-        }
-        .rule-section input[type=checkbox]:checked ~ .rule-content{
-            background:var(--rule-content-on);
-        }
-        /* make links + table borders visible in both modes */
-        a{color:inherit;text-decoration:underline;}
-        table{color:inherit;border-color:var(--border);}
-        th,td{border-color:var(--border);}
-        </style>
-        """
-        html_out=style+"".join(sections)
-        # One row per displayed rule, with an empty comment for the user to fill in.
-        comments_df = pd.DataFrame(df_rows, columns=["Rule", "comment"])
-        return html_out,comments_df
-    except Exception as e:
-        # The click handler is wired to two outputs, so the error path must
-        # also return two values; the message and traceback are escaped
-        # because they can contain '<' and '&'.
-        error_html = ("<div style='color:red'>Error:<br>"
-                      f"{html.escape(str(e))}<br><pre>{html.escape(traceback.format_exc())}</pre></div>")
-        return error_html, pd.DataFrame(columns=["Rule", "comment"])
-def save_comments_to_csv(df: pd.DataFrame):
-    """
-    Write the commented rows of the editable table to a timestamped CSV.
-    Args:
-        df: Table with at least a "comment" column.
-    Returns:
-        Path of the CSV written to the current working directory, which
-        Gradio can offer for download.
-    Raises:
-        gr.Error: if no row has a non-blank comment.
-    """
-    # Untouched cells can be None/NaN rather than ""; NaN is truthy, so it
-    # must be normalised to text before testing for blank comments.
-    comments = df["comment"].fillna("").astype(str).str.strip()
-    # keep only rows where the user actually wrote something
-    df = df[comments != ""]
-    if df.empty:
-        raise gr.Error("You didn’t write any comments yet!")
-    filename = f"rule_comments_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
-    csv_path = os.path.join(os.getcwd(), filename)
-    df.to_csv(csv_path, index=False)
-    return csv_path
-# ═════════════════════════════════════════════════════════════════════════════
-# 7. GRADIO UI
-# ═════════════════════════════════════════════════════════════════════════════
-# Page layout: inputs, a Compare button, the HTML diff, an editable comment
-# table and a CSV export. Components render in the order they are created.
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## CAA ⇄ OneReg — rule-level diff (section format)")
-    part       = gr.Textbox(label="Part Number", value="139")
-    onereg_pdf = gr.File(label="Upload OneReg PDF")
-    caa_pdf    = gr.File(label="Upload CAA PDF")
-    btn_compare        = gr.Button("Compare")
-    out_html   = gr.HTML()
-    # NOTE(review): headers here are lowercase "rule", while
-    # compare_regulations_app builds its DataFrame with a "Rule" column.
-    comment_df = gr.Dataframe(
-        headers=["rule", "comment"],
-        datatype=["str", "str"],
-        interactive=True,
-        label="✍️  Type your comments in the **comment** column, then click “Save to CSV”"
-    )
-    btn_save = gr.Button("💾  Save to CSV")
-    download = gr.File(label="Download your CSV")
-    # Compare fills both the HTML view and the comment table.
-    btn_compare.click(
-        compare_regulations_app,
-        inputs=[part, onereg_pdf, caa_pdf],
-        outputs=[out_html, comment_df],
-    )
-    # Save exports the (possibly edited) comment table as a downloadable file.
-    btn_save.click(
-        save_comments_to_csv,
-        inputs=[comment_df],
-        outputs=[download],
-    )
-if __name__ == "__main__":
-    current_os = platform.system()
-    print(f"Current OS: {current_os}")
-    # Windows binds to localhost; every other platform binds to all interfaces.
-    on_windows = current_os == "Windows"
-    if on_windows:
-        print("Running on Windows")
-    server_name = "localhost" if on_windows else "0.0.0.0"
-    # The port can be overridden through GRADIO_SERVER_PORT (default 7860).
-    port = int(os.environ.get("GRADIO_SERVER_PORT", 7860))
-    demo.launch(server_name=server_name, server_port=port, share=False)

+###############################################################################
+#  CAA  ⇄  OneReg   |  Dual Document Cleaning & Comparison Tool               #
+###############################################################################
+import io
+import os
+import re
+import html
+import json
+import traceback
+import difflib
+import platform
+import pandas as pd
+from datetime import datetime
+import fitz  # PyMuPDF
+from PyPDF2 import PdfReader  # plain text extraction
+import gradio as gr  # UI
+from dotenv import load_dotenv  # optional .env support
+# ─────────────────────────────────────────────────────────────────────────────
+# 1. PDF & TEXT PROCESSING
+# ─────────────────────────────────────────────────────────────────────────────
+def extract_pdf_text(pdf_file) -> str:
+    """
+    Return the text of every page of a PDF, read with PyPDF2.
+    Pages are joined with newlines; a page for which PyPDF2 yields no text
+    contributes an empty string.
+    """
+    page_texts = []
+    for page in PdfReader(pdf_file).pages:
+        # extract_text() may return a falsy value; treat that as "".
+        page_texts.append(page.extract_text() or "")
+    return "\n".join(page_texts)
+def extract_pdf_word(pdf_file) -> str:
+    """
+    Return the plain text of every page of a PDF, read with PyMuPDF (fitz).
+    Pages that yield no text are dropped; the rest are joined with newlines.
+    """
+    # Open through the context manager so the document handle is released
+    # even if text extraction raises part-way through.
+    with fitz.open(pdf_file) as doc:
+        text_blocks = [page.get_text("text") for page in doc]
+    return "\n".join(filter(None, text_blocks))
+def merge_pdf_wrapped_lines(raw_text: str) -> list[str]:
+    """
+    Re-join lines that PDF extraction hard-wrapped in the middle of a sentence.
+    Blank lines are dropped and every line is stripped. A line is glued onto
+    the previous one when any of these holds:
+      * the previous line ends in a lowercase letter and this one starts
+        with "(" or a lowercase letter;
+      * the previous line ends with the word rule/may/and/or (any case) and
+        this one starts with a number like "139.5";
+      * the previous line ends with "rule <digits>." and this one starts
+        with a digit.
+    A joining space is inserted only when the previous line ends in a
+    lowercase letter. Returns the merged lines.
+    """
+    joined: list[str] = []
+    for raw_line in raw_text.splitlines():
+        piece = raw_line.strip()
+        if not piece:
+            continue
+        if not joined:
+            joined.append(piece)
+            continue
+        previous = joined[-1]
+        ends_lower = re.search(r'[a-z]$', previous) is not None
+        is_continuation = (
+            (ends_lower and re.match(r'^[\(a-z]', piece))
+            or (re.search(r'\b(?:rule|may|and|or)$', previous, re.I)
+                and re.match(r'^\d+\.\d+', piece))
+            or (re.search(r'\brule\s+\d+\.$', previous, re.I)
+                and re.match(r'^\d', piece))
+        )
+        if is_continuation:
+            separator = ' ' if ends_lower else ''
+            joined[-1] = f"{previous}{separator}{piece}"
+        else:
+            joined.append(piece)
+    return joined
+# ─────────────────────────────────────────────────────────────────────────────
+# 2. RULE PARSING & CLEANING (Initial Automated Pass)
+# ─────────────────────────────────────────────────────────────────────────────
+# --- Regex for rule structure ---
+# Rule heading: optional outline numbering made of two or more "<digits>."
+# groups, then the rule number "<digits>.<digits>" with an optional single
+# letter suffix (group base_rule), any parenthesised sub-references (group
+# parens), and the rest of the line (group title).
+rule_pat = re.compile(
+    r'^(?:(?:\d+\.){2,}\s*)?(?P<base_rule>\d+\.\d+(?:[A-Z]?))(?P<parens>(?:\s*\([^)]+\))*?)\s*(?P<title>.*)$',
+    re.IGNORECASE
+)
+# Appendix item: a single letter, ".", a dotted number, an optional
+# parenthesised sub-reference, whitespace, then a title that starts with a
+# letter or digit.
+appendix_item_pat = re.compile(
+    r'^\s*([A-Z])\.(\d+(?:\.\d+)*)(?:\s*\(([^)]+)\))?\s+(?P<title>[A-Za-z0-9].*)$',
+    re.IGNORECASE
+)
+# Subpart heading: "<digits>. Subpart <one or two letters>" followed by an
+# em dash or hyphen and the subpart title.
+subpart_pat = re.compile(
+    r'^\s*\d+\.\s*Subpart\s+([A-Z]{1,2})\s*[—-]\s*(.+)$',
+    re.IGNORECASE
+)
+# --- Regex for cleaning ---
+# Page footer of the form "Page N / M".
+page_pat = re.compile(r'Page\s+\d+\s*/\s*\d+', re.IGNORECASE)
+# Month abbreviation (plus any trailing letters or periods), day, optional
+# comma, four-digit year. Not anchored on a word boundary.
+date_pat = re.compile(
+    r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z.]*\s+\d{1,2},?\s+\d{4}',
+    re.IGNORECASE
+)
+# Whole-line header: optional "Purpose", a three-letter word ending in a
+# period, day, comma, year, comma, then anything.
+header_pat = re.compile(
+    r'^(?:Purpose\s+)?(?:[A-Z][a-z]{2}\.)\s+\d{1,2},\s*\d{4},.*$', re.IGNORECASE
+)
+def clean_line(line: str, source: str) -> str:
+    """
+    Strip known boilerplate from one line of extracted text.
+    For ``source == "onereg"`` outline ids of three or more "<digits>."
+    groups are removed first, and a line that then matches header_pat is
+    discarded entirely (returned as ""). For every source, page markers,
+    dates, the CAA consolidation banner, the dated "CAA of NZ" footer and
+    e-mail-like tokens are removed and repeated whitespace is collapsed.
+    """
+    if source == "onereg":
+        line = re.sub(r'\b(?:\d+\.){3,}\s*', '', line)  # outline ids such as 1.2.3.
+        if header_pat.match(line):
+            return ""
+    # Source-independent clean-up, applied in a fixed order.
+    line = date_pat.sub('', page_pat.sub('', line))
+    inline_noise = (
+        (r'Civil Aviation Rules\s+Part\s+\d+\s+CAA Consolidation', re.I),
+        (r'^\d{1,2}\s+[A-Za-z]+\s+\d{4}\s*\d*\s*CAA of NZ', re.I),
+        (r'\S+@\S+', 0),  # email
+    )
+    for pattern, flags in inline_noise:
+        line = re.sub(pattern, '', line, flags=flags)
+    return re.sub(r'\s{2,}', ' ', line).strip()
+def parse_rules(text: str, source: str) -> dict[str, str]:
+    """
+    Split extracted text into ``{rule_id: rule_text}``.
+    Lines are merged with merge_pdf_wrapped_lines and cleaned with
+    clean_line. A line matching appendix_item_pat, subpart_pat or rule_pat
+    (tested in that order) starts a new entry; any other line is appended to
+    the entry currently open, unless it merely repeats that entry's title.
+    Keys look like "139.5(a)" for rules, "subpart-A" for subparts, and
+    letter/number/"(x)" parts joined with "." for appendix items. Text for a
+    key that appears more than once is accumulated under the same key.
+    """
+    collected: dict[str, list[str]] = {}
+    active_key = None
+    active_title = ""
+    for raw_line in merge_pdf_wrapped_lines(text):
+        line = clean_line(raw_line, source)
+        if not line:
+            continue
+        appendix_m = appendix_item_pat.match(line)
+        subpart_m = subpart_pat.match(line)
+        rule_m = rule_pat.match(line)
+        heading = None  # (key, title) when this line opens a new entry
+        if appendix_m:
+            pieces = [appendix_m.group(1).upper(), appendix_m.group(2)]
+            if appendix_m.group(3):
+                pieces.append(f"({appendix_m.group(3).strip()})")
+            heading = (".".join(pieces), appendix_m.group('title').strip())
+        elif subpart_m:
+            letter = subpart_m.group(1).upper()
+            heading = (
+                f"subpart-{letter}",
+                f"Subpart {letter} — {subpart_m.group(2).strip()}",
+            )
+        elif rule_m:
+            paren_groups = re.findall(r'\([^)]+\)', rule_m.group('parens') or "")
+            heading = (
+                rule_m.group('base_rule') + "".join(paren_groups),
+                rule_m.group('title').strip(),
+            )
+        if heading is not None:
+            active_key, active_title = heading
+            bucket = collected.setdefault(active_key, [])
+            if active_title:
+                bucket.append(active_title)
+            continue
+        if not active_key:
+            continue  # text before the first recognised heading is ignored
+        # Skip a body line that only repeats the heading's title.
+        if active_title and line.lower() == active_title.lower():
+            continue
+        collected[active_key].append(line)
+    return {key: " ".join(chunks).strip() for key, chunks in collected.items()}
+# ─────────────────────────────────────────────────────────────────────────────
+# 3. COMPARISON & UI LOGIC
+# ─────────────────────────────────────────────────────────────────────────────
+def diff_unified(one: str, caa: str) -> str:
+    """
+    Render a character-level inline diff of two strings as HTML.
+    Text only in ``one`` is wrapped in a red <del>, text only in ``caa`` in
+    a green <ins>, and shared text is emitted as-is; a replacement produces
+    the <del> followed by the <ins>. All text is HTML-escaped and the result
+    is wrapped in a single pre-wrap <span>.
+    """
+    del_open = "<del style='background:#fdd; text-decoration: line-through; color: #000;'>"
+    ins_open = "<ins style='background:#dfd; text-decoration: none; color: #000;'>"
+    matcher = difflib.SequenceMatcher(None, one, caa, autojunk=False)
+    pieces = []
+    for op, a_start, a_end, b_start, b_end in matcher.get_opcodes():
+        if op == "equal":
+            pieces.append(html.escape(one[a_start:a_end]))
+            continue
+        # "replace" is a deletion immediately followed by an insertion.
+        if op in ("delete", "replace"):
+            pieces.append(f"{del_open}{html.escape(one[a_start:a_end])}</del>")
+        if op in ("insert", "replace"):
+            pieces.append(f"{ins_open}{html.escape(caa[b_start:b_end])}</ins>")
+    body = "".join(pieces)
+    return f"<span style='white-space: pre-wrap; color: var(--text);'>{body}</span>"
+def combined_sort_key(key: str):
+    """
+    Return a sort key that orders subparts, rules, appendix items, then others.
+    * "subpart-..." keys sort first, alphabetically: ``(1, key)``.
+    * Rule ids ("139.5(a)") get category 2 and appendix ids ("A.1") get
+      category 3, followed by one tagged element per part of the id so that
+      numbers compare numerically and never against text:
+      ``(1, int)`` for a number, ``(2, text)`` for a textual part, and
+      ``(3, suffix)`` for the letter suffix of a lettered number.
+    * Anything else sorts last: ``(4, key)``.
+    A lettered rule number such as "139.5A" is split into its number and
+    suffix, so it sorts after "139.5" and its sub-paragraphs but before
+    "139.6", instead of after every numeric rule.
+    """
+    if key.startswith("subpart-"):
+        return (1, key)
+    sortable_tuple = ()
+    if re.match(r'^\d+\.\d+', key):
+        sortable_tuple += (2,)
+    elif re.match(r'^[A-Z]\.', key):
+        sortable_tuple += (3,)
+    else:
+        return (4, key)
+    for part in filter(None, re.split(r'[.()]', key)):
+        if part.isdecimal():
+            sortable_tuple += ((1, int(part)),)
+            continue
+        lettered = re.fullmatch(r'(\d+)([A-Za-z]+)', part)
+        if lettered:
+            # Rank 3 keeps "5A" after "5(a)", i.e. after rule 5's own paragraphs.
+            sortable_tuple += ((1, int(lettered.group(1))), (3, lettered.group(2).lower()))
+        else:
+            sortable_tuple += ((2, part.lower()),)
+    return sortable_tuple
+def save_clean_and_dirty_versions(dirty_one, dirty_caa, clean_one, clean_caa, filename: str) -> str:
+    """
+    Write the original and cleaned text of every rule to a JSON Lines file.
+    For each rule id present in either ``dirty_one`` or ``dirty_caa`` (in
+    combined_sort_key order) two records are written, OneReg first and then
+    CAA, each with the keys rule_id, source, dirty_text and clean_text.
+    Missing texts are stored as "". Returns ``filename``.
+    """
+    rule_ids = list(set(dirty_one.keys()) | set(dirty_caa.keys()))
+    rule_ids.sort(key=combined_sort_key)
+    # (source label, original texts, cleaned texts) in output order.
+    per_source = (
+        ("onereg", dirty_one, clean_one),
+        ("caa", dirty_caa, clean_caa),
+    )
+    with open(filename, 'w', encoding='utf-8') as out:
+        for rule_id in rule_ids:
+            for source, dirty, clean in per_source:
+                record = {
+                    "rule_id": rule_id,
+                    "source": source,
+                    "dirty_text": dirty.get(rule_id, ""),
+                    "clean_text": clean.get(rule_id, ""),
+                }
+                out.write(json.dumps(record) + '\n')
+    return filename
+# --- STAGE 1: Process PDFs and prepare for user review ---
+def stage1_process_and_review(part, onereg_pdf, caa_pdf):
+    """
+    Stage 1: parse both PDFs and fill the editable review table.
+    Args:
+        part: Part number typed by the user (e.g. "139"); surrounding
+            whitespace is ignored.
+        onereg_pdf: Uploaded OneReg file; its ``.name`` path is read with PyMuPDF.
+        caa_pdf: Uploaded CAA file; its ``.name`` path is read with PyPDF2.
+    Returns:
+        A dict of component updates: the parsed ("dirty") rule dicts for the
+        two state holders, the review DataFrame, and the now-visible
+        finalize button.
+    Raises:
+        gr.Error: if a file is missing or processing fails.
+    """
+    if not (onereg_pdf and caa_pdf):
+        raise gr.Error("Please upload both PDF files.")
+    # A stray space in the Part Number box would make the prefix filter
+    # below match nothing and silently produce an empty table.
+    part = str(part or "").strip()
+    try:
+        # Process OneReg PDF
+        raw_one = extract_pdf_word(onereg_pdf.name)
+        one_data = parse_rules(raw_one, "onereg")
+        # Process CAA PDF
+        raw_caa = extract_pdf_text(caa_pdf.name)
+        caa_data = parse_rules(raw_caa, "caa")
+        # Get all rule IDs and sort them
+        all_ids = sorted(
+            set(one_data) | set(caa_data),
+            key=combined_sort_key
+        )
+        # Keep rules of the requested part, subpart headings and appendix items.
+        rules_to_review = [
+            r for r in all_ids
+            if r.startswith(f"{part}.") or r.startswith("subpart-") or re.match(r'^[A-Z]\.', r)
+        ]
+        # One row per rule; a placeholder marks a rule missing from a document.
+        review_rows = [
+            [
+                rule_id,
+                one_data.get(rule_id, "[Rule not found in OneReg]"),
+                caa_data.get(rule_id, "[Rule not found in CAA]"),
+            ]
+            for rule_id in rules_to_review
+        ]
+        df = pd.DataFrame(review_rows, columns=["Rule ID", "OneReg Text (Editable)", "CAA Text (Editable)"])
+        return {
+            original_one_state: one_data,
+            original_caa_state: caa_data,
+            review_df: gr.update(value=df, visible=True),
+            btn_finalize: gr.update(visible=True),
+        }
+    except Exception as e:
+        traceback.print_exc()
+        # Chain the cause so the original failure stays visible in logs.
+        raise gr.Error(f"Failed during initial processing: {e}") from e
+# --- STAGE 2: Take user-cleaned text and perform the final comparison ---
+def stage2_finalize_and_compare(review_df, original_one, original_caa):
+    """
+    Stage 2: save the user-cleaned text and render the final comparison.
+    Args:
+        review_df: The edited review table with the columns "Rule ID",
+            "OneReg Text (Editable)" and "CAA Text (Editable)".
+        original_one: Parsed OneReg rules from stage 1 (the "dirty" text).
+        original_caa: Parsed CAA rules from stage 1 (the "dirty" text).
+    Returns:
+        A dict of component updates: the diff HTML and the path of the
+        JSON Lines export, both made visible.
+    Raises:
+        gr.Error: if the review table is missing or empty.
+    """
+    if review_df is None or review_df.empty:
+        raise gr.Error("No data to compare. Please process the files first.")
+    def _cell_text(value) -> str:
+        # Cleared or newly added table cells may come back as None/NaN
+        # instead of ""; those would crash key sorting and diffing.
+        return "" if value is None or pd.isna(value) else str(value)
+    # Convert the user-edited DataFrame back into dictionaries
+    clean_one_data, clean_caa_data = {}, {}
+    edited_rows = zip(
+        review_df['Rule ID'],
+        review_df['OneReg Text (Editable)'],
+        review_df['CAA Text (Editable)'],
+    )
+    for rule_id, one_text, caa_text in edited_rows:
+        rule_key = _cell_text(rule_id).strip()
+        if not rule_key:
+            continue  # a row without a rule id has nothing to compare
+        clean_one_data[rule_key] = _cell_text(one_text)
+        clean_caa_data[rule_key] = _cell_text(caa_text)
+    # Save the training data file
+    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+    jsonl_filename = f"cleaned_rules_{timestamp}.jsonl"
+    saved_filepath = save_clean_and_dirty_versions(original_one, original_caa, clean_one_data, clean_caa_data,
+                                                   jsonl_filename)
+    # Perform the final comparison
+    all_ids = sorted(
+        set(clean_one_data) | set(clean_caa_data),
+        key=combined_sort_key
+    )
+    sections = []
+    for rule_id in all_ids:
+        one_clean = clean_one_data.get(rule_id, "")
+        caa_clean = clean_caa_data.get(rule_id, "")
+        diff_html = diff_unified(one_clean, caa_clean)
+        # The id column is user-editable, so escape it before embedding.
+        sections.append(f"""
+           <div class="rule-section">
+             <strong class="rule-label">{html.escape(rule_id)}</strong>
+             <div class="rule-content">
+               {diff_html}
+             </div>
+           </div>
+           <hr>
+        """)
+    style = """
+    <style>
+      body { font-family: sans-serif; color: var(--body-text-color); }
+      .rule-label { font-size: 1.1em; background: #f0f0f0; padding: 5px; display: block; border-top-left-radius: 5px; border-top-right-radius: 5px; }
+      .rule-content { padding: 10px; border: 1px solid #f0f0f0; border-top: none; margin-bottom: 1em; white-space: pre-wrap; }
+      hr { border: none; border-top: 1px solid #ccc; margin: 1.5em 0; }
+    </style>
+    """
+    final_html = style + "".join(sections)
+    return {
+        out_html: gr.update(value=final_html, visible=True),
+        download_jsonl: gr.update(value=saved_filepath, visible=True)
+    }
+# ─────────────────────────────────────────────────────────────────────────────
+# 4. GRADIO UI LAYOUT
+# ─────────────────────────────────────────────────────────────────────────────
+# Page layout for the two-step workflow: upload and process, edit the review
+# table, then finalize to get the diff and the JSON Lines export. Components
+# render in the order they are created.
+with gr.Blocks(theme=gr.themes.Soft(), title="Dual Rule Cleaning Tool") as demo:
+    gr.Markdown("## CAA ⇄ OneReg — Dual Document Cleaning & Comparison Tool")
+    # State to hold the original "dirty" data between steps
+    original_one_state = gr.State({})
+    original_caa_state = gr.State({})
+    # --- Stage 1: Inputs and Initial Processing ---
+    with gr.Row():
+        part_num = gr.Textbox(label="Part Number", value="139")
+        onereg_pdf = gr.File(label="Upload OneReg PDF")
+        caa_pdf = gr.File(label="Upload CAA PDF")
+    btn_process = gr.Button("1. Process PDFs & Prepare for Cleaning", variant="secondary")
+    gr.Markdown("---")
+    # --- Stage 2: User Review and Cleaning ---
+    gr.Markdown("### 2. Review and Manually Clean Both Documents")
+    gr.Markdown(
+        "Edit the text in the table below to remove any headers, footers, or other noise from **both** documents. Once you are finished, click the 'Finalize, Compare & Save' button.")
+    # Hidden until stage 1 fills it; the column names must match the ones
+    # read back in stage2_finalize_and_compare.
+    review_df = gr.DataFrame(
+        headers=["Rule ID", "OneReg Text (Editable)", "CAA Text (Editable)"],
+        datatype=["str", "str", "str"],
+        interactive=True,
+        visible=False,
+        wrap=True,
+        row_count=(10, "dynamic")
+    )
+    # Hidden until stage 1 succeeds.
+    btn_finalize = gr.Button("3. Finalize, Compare & Save", variant="primary", visible=False)
+    gr.Markdown("---")
+    # --- Stage 3: Final Comparison Output & Export ---
+    gr.Markdown("### 4. Final Comparison & Export")
+    gr.Markdown(
+        "Deletions from OneReg are in <del style='background:#fdd;'>red</del> and additions from CAA are in <ins style='background:#dfd;'>green</ins>.")
+    # Both outputs stay hidden until stage 2 returns updates for them.
+    out_html = gr.HTML(visible=False)
+    download_jsonl = gr.File(label="Download Cleaned & Dirty Data (.jsonl)", visible=False)
+    # --- Wire up UI events ---
+    # Stage 1 stores the parsed rules in the two State holders and reveals
+    # the review table and the finalize button.
+    btn_process.click(
+        fn=stage1_process_and_review,
+        inputs=[part_num, onereg_pdf, caa_pdf],
+        outputs=[original_one_state, original_caa_state, review_df, btn_finalize]
+    )
+    # Stage 2 reads the edited table plus the stored originals.
+    btn_finalize.click(
+        fn=stage2_finalize_and_compare,
+        inputs=[review_df, original_one_state, original_caa_state],
+        outputs=[out_html, download_jsonl]
+    )
+if __name__ == "__main__":
+    current_os = platform.system()
+    # Linux binds to all interfaces; every other platform binds to loopback.
+    if current_os == "Linux":
+        server_name = "0.0.0.0"
+    else:
+        server_name = "127.0.0.1"
+    # The port can be overridden through GRADIO_SERVER_PORT (default 7860).
+    port = int(os.environ.get("GRADIO_SERVER_PORT", 7860))
+    demo.launch(server_name=server_name, server_port=port)