###############################################################################
# CAA ⇄ OneReg | Dual Document Cleaning & Comparison Tool #
###############################################################################
import os
import re
import html
import json
import traceback
import difflib
import platform
import pandas as pd
from datetime import datetime
import fitz # PyMuPDF
from PyPDF2 import PdfReader # plain text extraction
import gradio as gr # UI
from dotenv import load_dotenv  # optional .env support

load_dotenv()  # pick up optional .env configuration if present
# ─────────────────────────────────────────────────────────────────────────────
# 1. PDF & TEXT PROCESSING
# ─────────────────────────────────────────────────────────────────────────────
def extract_pdf_text(pdf_file) -> str:
"""Extracts text from a PDF file using PyPDF2."""
reader = PdfReader(pdf_file)
# MODIFICATION: Skips the first 4 pages (ToC/List of Rules)
return "\n".join(p.extract_text() or "" for i, p in enumerate(reader.pages) if i >= 4)
def extract_pdf_word(pdf_file) -> str:
"""Extracts text from PDF using PyMuPDF (fitz) for better layout preservation."""
doc = fitz.open(pdf_file)
# MODIFICATION: Skips the first 4 pages (ToC)
text_blocks = [page.get_text("text") for i, page in enumerate(doc) if i >= 4]
return "\n".join(filter(None, text_blocks))
def merge_pdf_wrapped_lines(raw_text: str) -> list[str]:
"""Re-join hard-wrapped lines from PDF extraction based on grammatical context."""
merged = []
for ln in raw_text.splitlines():
ln_stripped = ln.strip()
if not ln_stripped:
continue
if merged:
prev = merged[-1]
# Merge if previous line ends with 'β€”' or lacks closing punctuation,
# and the next line appears to be a continuation.
if prev.endswith('β€”') or \
(not re.search(r'[.:;)]\s*$', prev) and re.match(r'^[a-z\(]', ln_stripped)):
merged[-1] = prev + ' ' + ln_stripped
continue
merged.append(ln_stripped)
return merged
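# Illustrative example: merge_pdf_wrapped_lines("(a) a holder of β€”\nan aviation document; and")
# returns ["(a) a holder of β€” an aviation document; and"], because the first
# line ends with 'β€”' and is therefore re-joined with its continuation.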
# ─────────────────────────────────────────────────────────────────────────────
# 2. RULE PARSING & CLEANING
# ─────────────────────────────────────────────────────────────────────────────
# --- Regex for rule structure ---
rule_pat = re.compile(
r'^(?:(?:\d+\.){2,}\s*)?(?P<base_rule>\d+\.\d+(?:[A-Z]?))(?P<parens>(?:\s*\([^)]+\))*?)\s*(?P<title>.*)$',
re.IGNORECASE
)
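# Illustrative matches for rule_pat:
#   "108.51 Exposition"         -> base_rule '108.51', title 'Exposition'
#   "1.2.3. 108.51 Exposition"  -> leading outline ID skipped, base_rule '108.51'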
appendix_item_pat = re.compile(
r'^\s*([A-Z])\.(\d+(?:\.\d+)*)(?:\s*\(([^)]+)\))?\s+(?P<title>[A-Za-z0-9].*)$',
re.IGNORECASE
)
subpart_pat = re.compile(
r'^\s*\d+\.\s*Subpart\s+([A-Z]{1,2})\s*[β€”-]\s*(.+)$',
re.IGNORECASE
)
# Regex to identify sub-rule paragraph markers such as (a), (1), (i)
sub_rule_pat = re.compile(r'^\s*(\((?:[a-z]{1,2}|[ivx]+|\d+)\))\s*(.*)', re.IGNORECASE)
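# Illustrative matches for sub_rule_pat:
#   "(3) The Director may grant an exemption"  -> marker '(3)', text 'The Director may grant an exemption'
#   "(ii) in writing; and"                     -> marker '(ii)', text 'in writing; and'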
# --- Regex for cleaning ---
page_pat = re.compile(r'Page\s+\d+\s*/\s*\d+', re.IGNORECASE)
date_pat = re.compile(
r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z.]*\s+\d{1,2},?\s+\d{4}',
re.IGNORECASE
)
header_pat = re.compile(
r'^(?:Purpose\s+)?(?:[A-Z][a-z]{2}\.)\s+\d{1,2},\s*\d{4},.*$', re.IGNORECASE
)
def clean_line(line: str, source: str) -> str:
"""Performs a basic, automated cleaning pass on a line of text."""
if source == "onereg":
line = re.sub(r'\b(?:\d+\.){3,}\s*', '', line) # Zap outline IDs 1.2.3.
if header_pat.match(line):
return ""
# Generic cleaning for both
line = page_pat.sub('', line)
line = date_pat.sub('', line)
line = re.sub(r'Civil Aviation Rules\s+Part\s+\d+\s+CAA Consolidation', '', line, flags=re.I)
line = re.sub(r'^\d{1,2}\s+[A-Za-z]+\s+\d{4}\s*\d*\s*CAA of NZ', '', line, flags=re.I)
line = re.sub(r'\S+@\S+', '', line) # email
line = re.sub(r'\s{2,}', ' ', line)
return line.strip()
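# Illustrative example: clean_line("Page 12 / 340  The operator must notify the Director", "caa")
# returns "The operator must notify the Director" (page marker stripped,
# runs of whitespace collapsed).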
def get_rule_level(paren_str: str) -> int:
    """Determines the nesting level of a sub-rule marker: (1) is 1, (a) is 2, (i) is 3."""
    content = paren_str.strip('()').lower()
    if not content:
        return 99
    if content.isdigit():
        return 1  # numeric markers: (1), (2), ...
    # Limit the roman-numeral charset to i/v/x so single letters such as (c),
    # (l) and (m) fall through to the alphabetical level; (i) remains ambiguous
    # and is read as roman, matching the usual CAA sub-rule ordering.
    if all(c in 'ivx' for c in content):
        return 3  # roman numerals: (i), (ii), ...
    if content.isalpha():
        return 2  # alphabetical markers: (a), (b), ...
    return 4  # unknown marker; treat as deeply nested
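# Illustrative levels (doctest-style, not executed):
#   >>> get_rule_level('(1)')   # numeric
#   1
#   >>> get_rule_level('(a)')   # alphabetical
#   2
#   >>> get_rule_level('(ii)')  # roman
#   3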
def parse_rules(text: str, source: str) -> dict[str, str]:
"""
Parses raw text into a dictionary of {rule_id: rule_text}.
This version is stateful and context-aware to handle hierarchies correctly.
"""
rules = {}
parent_parts = [] # Tracks the current rule hierarchy, e.g., ['108.51', '(3)']
lines_buffer = []
def commit_buffer():
"""Saves the buffered lines to the current rule ID."""
if parent_parts and lines_buffer:
rule_id = "".join(parent_parts)
existing_text = rules.get(rule_id, "")
new_text = " ".join(lines_buffer)
rules[rule_id] = (existing_text + " " + new_text).strip()
lines_buffer.clear()
lines = merge_pdf_wrapped_lines(text)
for line in lines:
cleaned = clean_line(line, source)
if not cleaned: continue
m_main = rule_pat.match(cleaned)
m_sub = sub_rule_pat.match(cleaned)
m_sp = subpart_pat.match(cleaned)
if m_sp:
commit_buffer()
parent_parts = [f"subpart-{m_sp.group(1).upper()}"]
rules["".join(parent_parts)] = f"Subpart {m_sp.group(1).upper()} β€” {m_sp.group(2).strip()}"
elif m_main:
new_base_id = m_main.group('base_rule')
current_base_id = parent_parts[0] if parent_parts and not parent_parts[0].startswith("subpart") else None
if new_base_id == current_base_id:
lines_buffer.append(cleaned)
continue
commit_buffer()
parent_parts = [new_base_id]
title = m_main.group('title').strip()
if title:
rules["".join(parent_parts)] = title
elif m_sub and parent_parts and not parent_parts[0].startswith("subpart"):
commit_buffer()
paren_part = m_sub.group(1)
text_part = m_sub.group(2).strip()
new_level = get_rule_level(paren_part)
while len(parent_parts) > 1:
last_part = parent_parts[-1]
last_level = get_rule_level(last_part)
if last_level >= new_level:
parent_parts.pop()
else:
break
parent_parts.append(paren_part)
if text_part:
lines_buffer.append(text_part)
else:
lines_buffer.append(cleaned)
commit_buffer()
return {k: v for k, v in rules.items() if v}
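# Illustrative example (approximate): given the extracted lines
#     108.51 Exposition.
#     (a) Each applicant must provide β€”
# parse_rules yields
#     {"108.51": "Exposition.", "108.51(a)": "Each applicant must provide β€”"}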
# ─────────────────────────────────────────────────────────────────────────────
# 3. COMPARISON & UI LOGIC
# ─────────────────────────────────────────────────────────────────────────────
def diff_unified(one: str, caa: str) -> str:
"""Generates a single HTML string showing differences inline."""
sm = difflib.SequenceMatcher(None, one, caa, autojunk=False)
output = []
for tag, i1, i2, j1, j2 in sm.get_opcodes():
one_segment = html.escape(one[i1:i2])
caa_segment = html.escape(caa[j1:j2])
if tag == "equal":
output.append(one_segment)
elif tag == "delete":
output.append(
f"<del style='background:#fdd; text-decoration: line-through; color: #000;'>{one_segment}</del>")
elif tag == "insert":
output.append(f"<ins style='background:#dfd; text-decoration: none; color: #000;'>{caa_segment}</ins>")
elif tag == "replace":
output.append(
f"<del style='background:#fdd; text-decoration: line-through; color: #000;'>{one_segment}</del>")
output.append(f"<ins style='background:#dfd; text-decoration: none; color: #000;'>{caa_segment}</ins>")
return f"<span style='white-space: pre-wrap; color: var(--text);'>{''.join(output)}</span>"
def combined_sort_key(key: str):
"""Robustly sorts rules, subparts, and appendices."""
if key.startswith("subpart-"):
return (1, key)
sortable_tuple = ()
if re.match(r'^\d+\.\d+', key):
sortable_tuple += (2,)
elif re.match(r'^[A-Z]\.', key):
sortable_tuple += (3,)
else:
return (4, key)
    # Split hierarchical keys such as "108.51(3)(i)" into sortable pieces.
    parts = re.split(r'(\d+\.\d+)|(\([a-zA-Z0-9]+\))', key)
    parts = [p for p in parts if p]
    for part in parts:
        if re.match(r'^\d+\.\d+$', part):
            sortable_tuple += tuple((1, int(x)) for x in part.split('.'))
        else:
            inner = part.strip('()').lower()
            if inner.isdigit():
                # Sort numeric markers numerically so (10) follows (9), not (1).
                sortable_tuple += ((2, 0, int(inner)),)
            else:
                sortable_tuple += ((2, 1, inner),)
return sortable_tuple
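# Illustrative ordering under combined_sort_key:
#   subpart-B < 108.1 < 108.51 < 108.51(3) < 108.51(10) < A.1
# Subparts sort first (group 1), then numeric rules (group 2), then appendices (group 3).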
def save_clean_and_dirty_versions(dirty_one, dirty_caa, clean_one, clean_caa, filename: str) -> str:
"""Saves both original and cleaned versions to a .jsonl file."""
all_ids = sorted(
list(set(dirty_one.keys()) | set(dirty_caa.keys())),
key=combined_sort_key
)
with open(filename, 'w', encoding='utf-8') as f:
for rule_id in all_ids:
# OneReg record
record_one = {
"rule_id": rule_id,
"source": "onereg",
"dirty_text": dirty_one.get(rule_id, ""),
"clean_text": clean_one.get(rule_id, "")
}
f.write(json.dumps(record_one) + '\n')
# CAA record
record_caa = {
"rule_id": rule_id,
"source": "caa",
"dirty_text": dirty_caa.get(rule_id, ""),
"clean_text": clean_caa.get(rule_id, "")
}
f.write(json.dumps(record_caa) + '\n')
return filename
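# Each JSONL line is one record per (rule, source); an illustrative line:
#   {"rule_id": "108.51(a)", "source": "onereg", "dirty_text": "...", "clean_text": "..."}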
def stage1_process_and_review(part, onereg_pdf, caa_pdf):
    """Extracts and parses both PDFs, then fills the review table for manual cleaning."""
if not (onereg_pdf and caa_pdf):
raise gr.Error("Please upload both PDF files.")
try:
# Process OneReg PDF
raw_one = extract_pdf_word(onereg_pdf.name)
one_data = parse_rules(raw_one, "onereg")
# Process CAA PDF
raw_caa = extract_pdf_text(caa_pdf.name)
caa_data = parse_rules(raw_caa, "caa")
# Get all rule IDs and sort them
all_ids = sorted(
list(set(one_data.keys()) | set(caa_data.keys())),
key=combined_sort_key
)
rules_to_review = [
r for r in all_ids
if r.startswith(f"{part}.") or r.startswith("subpart-") or re.match(r'^[A-Z]\.', r)
]
# Prepare DataFrame for user editing with both documents
review_rows = []
for rule_id in rules_to_review:
one_text = one_data.get(rule_id, "[Rule not found in OneReg]")
caa_text = caa_data.get(rule_id, "[Rule not found in CAA]")
review_rows.append([rule_id, one_text, caa_text])
df = pd.DataFrame(review_rows, columns=["Rule ID", "OneReg Text (Editable)", "CAA Text (Editable)"])
return {
original_one_state: one_data,
original_caa_state: caa_data,
review_df: gr.update(value=df, visible=True),
btn_finalize: gr.update(visible=True),
}
except Exception as e:
traceback.print_exc()
raise gr.Error(f"Failed during initial processing: {e}")
def stage2_finalize_and_compare(review_df, original_one, original_caa):
    """Converts the edited table back into dicts, saves the JSONL export, and renders the final diff."""
if review_df is None or review_df.empty:
raise gr.Error("No data to compare. Please process the files first.")
# Convert the user-edited DataFrame back into dictionaries
clean_one_data = pd.Series(review_df['OneReg Text (Editable)'].values, index=review_df['Rule ID']).to_dict()
clean_caa_data = pd.Series(review_df['CAA Text (Editable)'].values, index=review_df['Rule ID']).to_dict()
# Save the training data file
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
jsonl_filename = f"cleaned_rules_{timestamp}.jsonl"
saved_filepath = save_clean_and_dirty_versions(original_one, original_caa, clean_one_data, clean_caa_data,
jsonl_filename)
# Perform the final comparison
all_ids = sorted(
list(set(clean_one_data.keys()) | set(clean_caa_data.keys())),
key=combined_sort_key
)
sections = []
for rule_id in all_ids:
one_clean = clean_one_data.get(rule_id, "")
        caa_clean = clean_caa_data.get(rule_id, "")
diff_html = diff_unified(one_clean, caa_clean)
sections.append(f"""
<div class="rule-section">
<strong class="rule-label">{rule_id}</strong>
<div class="rule-content">
{diff_html}
</div>
</div>
<hr>
""")
style = """
<style>
body { font-family: sans-serif; color: var(--body-text-color); }
    .rule-label { font-size: 1.1em; background: #f0f0f0; color: #000; padding: 5px; display: block; border-top-left-radius: 5px; border-top-right-radius: 5px; }
.rule-content { padding: 10px; border: 1px solid #f0f0f0; border-top: none; margin-bottom: 1em; white-space: pre-wrap; }
hr { border: none; border-top: 1px solid #ccc; margin: 1.5em 0; }
</style>
"""
final_html = style + "".join(sections)
return {
out_html: gr.update(value=final_html, visible=True),
download_jsonl: gr.update(value=saved_filepath, visible=True)
}
# ─────────────────────────────────────────────────────────────────────────────
# 4. GRADIO UI LAYOUT
# ─────────────────────────────────────────────────────────────────────────────
with gr.Blocks(theme=gr.themes.Soft(), title="Dual Rule Cleaning Tool") as demo:
gr.Markdown("## CAA ⇄ OneReg β€” Dual Document Cleaning & Comparison Tool")
# State to hold the original "dirty" data between steps
original_one_state = gr.State({})
original_caa_state = gr.State({})
# --- Stage 1: Inputs and Initial Processing ---
with gr.Row():
part_num = gr.Textbox(label="Part Number", value="139")
onereg_pdf = gr.File(label="Upload OneReg PDF")
caa_pdf = gr.File(label="Upload CAA PDF")
btn_process = gr.Button("1. Process PDFs & Prepare for Cleaning", variant="secondary")
gr.Markdown("---")
# --- Stage 2: User Review and Cleaning ---
gr.Markdown("### 2. Review and Manually Clean Both Documents")
gr.Markdown(
"Edit the text in the table below to remove any headers, footers, or other noise from **both** documents. Once you are finished, click the 'Finalize, Compare & Save' button.")
review_df = gr.DataFrame(
headers=["Rule ID", "OneReg Text (Editable)", "CAA Text (Editable)"],
datatype=["str", "str", "str"],
interactive=True,
visible=False,
wrap=True,
row_count=(10, "dynamic")
)
btn_finalize = gr.Button("3. Finalize, Compare & Save", variant="primary", visible=False)
gr.Markdown("---")
# --- Stage 3: Final Comparison Output & Export ---
gr.Markdown("### 4. Final Comparison & Export")
gr.Markdown(
"Deletions from OneReg are in <del style='background:#fdd;'>red</del> and additions from CAA are in <ins style='background:#dfd;'>green</ins>.")
out_html = gr.HTML(visible=False)
download_jsonl = gr.File(label="Download Cleaned & Dirty Data (.jsonl)", visible=False)
# --- Wire up UI events ---
btn_process.click(
fn=stage1_process_and_review,
inputs=[part_num, onereg_pdf, caa_pdf],
outputs=[original_one_state, original_caa_state, review_df, btn_finalize]
)
btn_finalize.click(
fn=stage2_finalize_and_compare,
inputs=[review_df, original_one_state, original_caa_state],
outputs=[out_html, download_jsonl]
)
if __name__ == "__main__":
current_os = platform.system()
server_name = "0.0.0.0" if current_os == "Linux" else "127.0.0.1"
demo.launch(
server_name=server_name,
server_port=int(os.environ.get("GRADIO_SERVER_PORT", 7860)),
)