Spaces:

HaiderAUT
/

document_comparison

Sleeping

App Files Files Community

document_comparison / app.py

HaiderAUT

Update app.py

999af57 verified 7 months ago

raw

history blame contribute delete

15.7 kB

	###############################################################################
	# CAA ⇄ OneReg | Dual Document Cleaning & Comparison Tool #
	###############################################################################
	import io
	import os
	import re
	import html
	import json
	import traceback
	import difflib
	import platform
	import pandas as pd
	from datetime import datetime

	import fitz # PyMuPDF
	from PyPDF2 import PdfReader # plain text extraction
	import gradio as gr # UI
	from dotenv import load_dotenv # optional .env support


	# ─────────────────────────────────────────────────────────────────────────────
	# 1. PDF & TEXT PROCESSING
	# ─────────────────────────────────────────────────────────────────────────────

	def extract_pdf_text(pdf_file) -> str:
	    """Extract text from a PDF file using PyPDF2.

	    Args:
	        pdf_file: Passed unchanged to ``PdfReader``.

	    Returns:
	        The text of every page joined with newlines. A page whose
	        ``extract_text()`` result is falsy contributes an empty string.
	    """
	    reader = PdfReader(pdf_file)
	    return "\n".join(page.extract_text() or "" for page in reader.pages)


	def extract_pdf_word(pdf_file) -> str:
	    """Extract text from a PDF using PyMuPDF (fitz).

	    Args:
	        pdf_file: Passed unchanged to ``fitz.open``.

	    Returns:
	        The ``get_text("text")`` output of every page joined with newlines;
	        pages that yield an empty string are dropped.
	    """
	    # Open as a context manager so the document handle is released even if
	    # text extraction raises part-way through.
	    with fitz.open(pdf_file) as doc:
	        text_blocks = [page.get_text("text") for page in doc]
	    return "\n".join(filter(None, text_blocks))


	def merge_pdf_wrapped_lines(raw_text: str) -> list[str]:
	    """Re-join hard-wrapped lines from PDF extraction.

	    Blank lines are dropped and every kept line is stripped. A line is glued
	    onto the previous kept line when one of these holds:

	    * the previous line ends in a lowercase letter and this one starts with
	      a lowercase letter or ``(`` (a sentence that was wrapped);
	    * the previous line ends with "rule", "may", "and" or "or" (any case) and
	      this one starts with a number such as ``139.5``;
	    * the previous line ends with "rule <digits>." and this one starts with a
	      digit (a rule number split right after its dot).

	    Args:
	        raw_text: Text as extracted from the PDF.

	    Returns:
	        The merged, stripped, non-empty lines in their original order.
	    """
	    merged: list[str] = []
	    for ln in raw_text.splitlines():
	        ln_stripped = ln.strip()
	        if not ln_stripped:
	            continue
	        if merged:
	            prev = merged[-1]
	            wrapped_sentence = (
	                re.search(r'[a-z]$', prev)
	                and re.match(r'^[\(a-z]', ln_stripped)
	            )
	            wrapped_reference = (
	                re.search(r'\b(?:rule|may|and|or)$', prev, re.I)
	                and re.match(r'^\d+\.\d+', ln_stripped)
	            )
	            split_rule_number = (
	                re.search(r'\brule\s+\d+\.$', prev, re.I)
	                and re.match(r'^\d', ln_stripped)
	            )
	            if wrapped_sentence or wrapped_reference:
	                # Two separate words were split across lines: rejoin with a
	                # space, whatever the case of the previous line's last word.
	                merged[-1] = prev + ' ' + ln_stripped
	                continue
	            if split_rule_number:
	                # "rule 139." + "5": halves of one number, so no space.
	                merged[-1] = prev + ln_stripped
	                continue
	        merged.append(ln_stripped)
	    return merged


	# ─────────────────────────────────────────────────────────────────────────────
	# 2. RULE PARSING & CLEANING (Initial Automated Pass)
	# ─────────────────────────────────────────────────────────────────────────────

	# --- Regex for rule structure ---
	# NOTE(review): the repetition quantifiers in this section (\s*, (...)*, .*)
	# are presumed; confirm them against real OneReg/CAA headings.

	# Rule heading: optional outline prefix ("1.2.3. "), base rule number
	# ("139.5", "139.5A"), any parenthesised sub-references ("(a)(1)"), title.
	rule_pat = re.compile(
	    r'^(?:(?:\d+\.){2,}\s*)?(?P<base_rule>\d+\.\d+(?:[A-Z]?))(?P<parens>(?:\s*\([^)]+\))*)\s*(?P<title>.*)$',
	    re.IGNORECASE
	)
	# Appendix item: letter, dotted number, optional "(x)", then a title that
	# starts with a letter or digit, e.g. "A.1 General" or "B.2.3 (a) Lighting".
	appendix_item_pat = re.compile(
	    r'^\s*([A-Z])\.(\d+(?:\.\d+)*)(?:\s*\(([^)]+)\))?\s+(?P<title>[A-Za-z0-9].*)$',
	    re.IGNORECASE
	)
	# Numbered subpart heading, e.g. "1. Subpart A — General".
	subpart_pat = re.compile(
	    r'^\s*\d+\.\s*Subpart\s+([A-Z]{1,2})\s*[—-]\s*(.+)$',
	    re.IGNORECASE
	)

	# --- Regex for cleaning ---
	# "Page 3 / 10" style page counters.
	page_pat = re.compile(r'Page\s+\d+\s*/\s*\d+', re.IGNORECASE)
	# "Jan 5, 2020" / "January 5 2020" style dates.
	date_pat = re.compile(
	    r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z.]*\s+\d{1,2},?\s+\d{4}',
	    re.IGNORECASE
	)
	# Whole-line header such as "Purpose Jan. 5, 2020, ..." (abbreviated month
	# followed by a dot, day, year and a trailing comma).
	header_pat = re.compile(
	    r'^(?:Purpose\s+)?(?:[A-Z][a-z]{2}\.)\s+\d{1,2},\s*\d{4},.*$', re.IGNORECASE
	)


	def clean_line(line: str, source: str) -> str:
	    """Perform a basic, automated cleaning pass on one line of text.

	    Args:
	        line: A single line of extracted text.
	        source: ``"onereg"`` enables the OneReg-only steps (outline-ID removal
	            and header-line rejection); any other value skips them.

	    Returns:
	        The cleaned, stripped line, or ``""`` when the line is a OneReg header
	        or nothing is left after cleaning.
	    """
	    if source == "onereg":
	        line = re.sub(r'\b(?:\d+\.){3,}\s*', '', line)  # Zap outline IDs 1.2.3.
	        if header_pat.match(line):
	            return ""

	    # Generic cleaning for both
	    line = page_pat.sub('', line)
	    line = date_pat.sub('', line)
	    line = re.sub(r'Civil Aviation Rules\s+Part\s+\d+\s+CAA Consolidation', '', line, flags=re.I)
	    # Footer "<day> <month> <year> <page> CAA of NZ"; the page number may have
	    # any number of digits (or be absent).
	    line = re.sub(r'^\d{1,2}\s+[A-Za-z]+\s+\d{4}\s*\d*\s*CAA of NZ', '', line, flags=re.I)
	    line = re.sub(r'\S+@\S+', '', line)  # email
	    line = re.sub(r'\s{2,}', ' ', line)
	    return line.strip()


	def parse_rules(text: str, source: str) -> dict[str, str]:
	    """Parse raw text into a dictionary of {rule_id: rule_text}.

	    Lines are first re-joined with ``merge_pdf_wrapped_lines`` and cleaned
	    with ``clean_line``. Each remaining line either opens a new entry (when it
	    matches an appendix item, a subpart heading, or a rule heading, tried in
	    that order) or is appended to the entry opened most recently. Lines seen
	    before the first heading are discarded.

	    Keys look like ``"A.1"`` / ``"A.1.(a)"`` for appendix items,
	    ``"subpart-A"`` for subparts, and ``"139.5"`` / ``"139.5(a)"`` for rules.

	    Args:
	        text: Raw text extracted from a PDF.
	        source: Forwarded to ``clean_line`` (``"onereg"`` or ``"caa"``).

	    Returns:
	        Mapping of key to the entry's lines joined with single spaces. When
	        the same key occurs more than once, the text accumulates under it.
	    """
	    rules: dict[str, list[str]] = {}
	    current = None
	    title = ""

	    for raw_line in merge_pdf_wrapped_lines(text):
	        line = clean_line(raw_line, source)
	        if not line:
	            continue

	        new_key = None
	        new_title = ""

	        if m := appendix_item_pat.match(line):
	            key_parts = [m.group(1).upper(), m.group(2)]
	            if m.group(3):
	                key_parts.append(f"({m.group(3).strip()})")
	            new_key = ".".join(key_parts)
	            new_title = m.group('title').strip()
	        elif m := subpart_pat.match(line):
	            letter = m.group(1).upper()
	            new_key = f"subpart-{letter}"
	            new_title = f"Subpart {letter} — {m.group(2).strip()}"
	        elif m := rule_pat.match(line):
	            parens_str = m.group('parens') or ""
	            # Keep each "(x)" group but drop the whitespace between them.
	            new_key = m.group('base_rule') + "".join(re.findall(r'\([^)]+\)', parens_str))
	            new_title = m.group('title').strip()

	        if new_key:
	            current = new_key
	            title = new_title
	            rules.setdefault(current, [])
	            if title:
	                rules[current].append(title)
	        elif current:
	            # Skip a line that merely repeats the current entry's title.
	            if not title or line.lower() != title.lower():
	                rules[current].append(line)

	    return {k: " ".join(v).strip() for k, v in rules.items()}


	# ─────────────────────────────────────────────────────────────────────────────
	# 3. COMPARISON & UI LOGIC
	# ─────────────────────────────────────────────────────────────────────────────

	def diff_unified(one: str, caa: str) -> str:
	    """Generate a single HTML string showing differences inline.

	    The two texts are compared character by character with
	    ``difflib.SequenceMatcher`` (``autojunk`` disabled). Text only in ``one``
	    is wrapped in ``<del>``, text only in ``caa`` in ``<ins>``; a replaced
	    span emits the ``<del>`` followed by the ``<ins>``. All text is
	    HTML-escaped before being placed in the markup.

	    Args:
	        one: The OneReg text (the "before" side).
	        caa: The CAA text (the "after" side).

	    Returns:
	        One ``<span>`` element containing the annotated text.
	    """
	    del_tpl = "<del style='background:#fdd; text-decoration: line-through; color: #000;'>{}</del>"
	    ins_tpl = "<ins style='background:#dfd; text-decoration: none; color: #000;'>{}</ins>"

	    matcher = difflib.SequenceMatcher(None, one, caa, autojunk=False)
	    output = []
	    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
	        if tag == "equal":
	            output.append(html.escape(one[i1:i2]))
	            continue
	        if tag in ("delete", "replace"):
	            output.append(del_tpl.format(html.escape(one[i1:i2])))
	        if tag in ("insert", "replace"):
	            output.append(ins_tpl.format(html.escape(caa[j1:j2])))
	    return f"<span style='white-space: pre-wrap; color: var(--text);'>{''.join(output)}</span>"


	def combined_sort_key(key: str):
	    """Build a sort key that orders subparts, rules, appendices, then the rest.

	    * ``"subpart-..."`` keys come first, ordered by their text.
	    * Keys starting with ``<digits>.<digits>`` (rules) come next.
	    * Keys starting with an uppercase letter and a dot (appendix items) follow.
	    * Anything else comes last, ordered by its text.

	    Rule and appendix keys are split on ``.``, ``(`` and ``)``. A component
	    made of digits with an optional letter suffix is compared numerically and
	    then by suffix, so ``139.5`` < ``139.5A`` < ``139.6`` < ``139.10``. Any
	    other component is compared as lowercase text and sorts after numeric
	    components in the same position.

	    Args:
	        key: A rule identifier.

	    Returns:
	        A tuple suitable for ``sorted(..., key=combined_sort_key)``.
	    """
	    if key.startswith("subpart-"):
	        return (1, key)

	    if re.match(r'^\d+\.\d+', key):
	        category = 2
	    elif re.match(r'^[A-Z]\.', key):
	        category = 3
	    else:
	        return (4, key)

	    components = [category]
	    for part in re.split(r'[.()]', key):
	        if not part:
	            continue
	        numbered = re.fullmatch(r'(\d+)([A-Za-z]*)', part)
	        if numbered:
	            # The leading 1/2 tag keeps numbers and text from ever being
	            # compared with each other at the same position.
	            components.append((1, int(numbered.group(1)), numbered.group(2).lower()))
	        else:
	            components.append((2, part.lower()))
	    return tuple(components)


	def save_clean_and_dirty_versions(dirty_one, dirty_caa, clean_one, clean_caa, filename: str) -> str:
	    """Save both original and cleaned versions to a .jsonl file.

	    For every rule ID present in either *dirty* mapping (ordered with
	    ``combined_sort_key``) two JSON lines are written: the OneReg record, then
	    the CAA record. Each record has the keys ``rule_id``, ``source``,
	    ``dirty_text`` and ``clean_text``; a text missing from a mapping is
	    written as ``""``. IDs that exist only in the clean mappings are not
	    written.

	    Args:
	        dirty_one: ``{rule_id: text}`` parsed from the OneReg document.
	        dirty_caa: ``{rule_id: text}`` parsed from the CAA document.
	        clean_one: ``{rule_id: text}`` for OneReg after manual cleaning.
	        clean_caa: ``{rule_id: text}`` for CAA after manual cleaning.
	        filename: Path of the file to create or overwrite.

	    Returns:
	        ``filename``, unchanged.
	    """
	    all_ids = sorted(set(dirty_one) | set(dirty_caa), key=combined_sort_key)
	    sources = (
	        ("onereg", dirty_one, clean_one),
	        ("caa", dirty_caa, clean_caa),
	    )
	    with open(filename, 'w', encoding='utf-8') as f:
	        for rule_id in all_ids:
	            for source, dirty, clean in sources:
	                record = {
	                    "rule_id": rule_id,
	                    "source": source,
	                    "dirty_text": dirty.get(rule_id, ""),
	                    "clean_text": clean.get(rule_id, ""),
	                }
	                f.write(json.dumps(record) + '\n')
	    return filename


	# --- STAGE 1: Process PDFs and prepare for user review ---
	def stage1_process_and_review(part, onereg_pdf, caa_pdf):
	    """Parse both uploaded PDFs and build the editable review table.

	    The OneReg PDF is read with ``extract_pdf_word`` and the CAA PDF with
	    ``extract_pdf_text``; both texts go through ``parse_rules``. Only IDs that
	    start with ``"<part>."``, start with ``"subpart-"``, or look like an
	    appendix item (uppercase letter followed by a dot) are offered for review.

	    Args:
	        part: Part number typed by the user; surrounding whitespace is ignored.
	        onereg_pdf: Uploaded OneReg file: an object exposing the file path as
	            ``.name``, or the path itself.
	        caa_pdf: Uploaded CAA file, same form as ``onereg_pdf``.

	    Returns:
	        A dict keyed by Gradio components: the two state holders receive the
	        full parsed mappings, the review table receives the DataFrame and is
	        made visible, and the finalize button is made visible.

	    Raises:
	        gr.Error: If a file is missing or anything fails during processing.
	    """
	    if not (onereg_pdf and caa_pdf):
	        raise gr.Error("Please upload both PDF files.")
	    try:
	        part_prefix = f"{str(part or '').strip()}."

	        # Process OneReg PDF
	        raw_one = extract_pdf_word(getattr(onereg_pdf, "name", onereg_pdf))
	        one_data = parse_rules(raw_one, "onereg")

	        # Process CAA PDF
	        raw_caa = extract_pdf_text(getattr(caa_pdf, "name", caa_pdf))
	        caa_data = parse_rules(raw_caa, "caa")

	        # Get all rule IDs and sort them
	        all_ids = sorted(set(one_data) | set(caa_data), key=combined_sort_key)

	        rules_to_review = [
	            r for r in all_ids
	            if r.startswith(part_prefix) or r.startswith("subpart-") or re.match(r'^[A-Z]\.', r)
	        ]

	        # Prepare DataFrame for user editing with both documents
	        review_rows = [
	            [
	                rule_id,
	                one_data.get(rule_id, "[Rule not found in OneReg]"),
	                caa_data.get(rule_id, "[Rule not found in CAA]"),
	            ]
	            for rule_id in rules_to_review
	        ]
	        df = pd.DataFrame(review_rows, columns=["Rule ID", "OneReg Text (Editable)", "CAA Text (Editable)"])

	        return {
	            original_one_state: one_data,
	            original_caa_state: caa_data,
	            review_df: gr.update(value=df, visible=True),
	            btn_finalize: gr.update(visible=True),
	        }
	    except Exception as e:
	        # Top-level UI boundary: log the traceback and surface a readable
	        # message, keeping the original exception as the cause.
	        traceback.print_exc()
	        raise gr.Error(f"Failed during initial processing: {e}") from e


	# --- STAGE 2: Take user-cleaned text and perform the final comparison ---
	def stage2_finalize_and_compare(review_df, original_one, original_caa):
	    """Save the user-cleaned text and render the final inline comparison.

	    Args:
	        review_df: The edited review table, with the columns ``"Rule ID"``,
	            ``"OneReg Text (Editable)"`` and ``"CAA Text (Editable)"``.
	        original_one: ``{rule_id: text}`` parsed from OneReg before editing.
	        original_caa: ``{rule_id: text}`` parsed from CAA before editing.

	    Returns:
	        A dict keyed by Gradio components: the HTML output receives the
	        comparison markup and the file output receives the path of the
	        ``cleaned_rules_<timestamp>.jsonl`` file; both are made visible.

	    Raises:
	        gr.Error: If the table is missing or empty.
	    """
	    if review_df is None or review_df.empty:
	        raise gr.Error("No data to compare. Please process the files first.")

	    def _column_as_dict(column):
	        # Convert one edited column into {rule_id: text}. Missing cells
	        # become "" and rows without a rule ID are skipped, so rows added or
	        # blanked in the table cannot break sorting or diffing.
	        result = {}
	        for rule_id, text in zip(review_df['Rule ID'], review_df[column]):
	            if pd.isna(rule_id) or not str(rule_id).strip():
	                continue
	            result[str(rule_id)] = "" if pd.isna(text) else str(text)
	        return result

	    # Convert the user-edited DataFrame back into dictionaries
	    clean_one_data = _column_as_dict('OneReg Text (Editable)')
	    clean_caa_data = _column_as_dict('CAA Text (Editable)')

	    # Save the training data file
	    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
	    jsonl_filename = f"cleaned_rules_{timestamp}.jsonl"
	    saved_filepath = save_clean_and_dirty_versions(
	        original_one, original_caa, clean_one_data, clean_caa_data, jsonl_filename
	    )

	    # Perform the final comparison
	    all_ids = sorted(set(clean_one_data) | set(clean_caa_data), key=combined_sort_key)

	    sections = []
	    for rule_id in all_ids:
	        diff_html = diff_unified(clean_one_data.get(rule_id, ""), clean_caa_data.get(rule_id, ""))
	        # The rule ID comes from an editable cell, so escape it. The content
	        # div is pre-wrap, so no template whitespace is placed inside it.
	        sections.append(
	            '<div class="rule-section">'
	            f'<strong class="rule-label">{html.escape(rule_id)}</strong>'
	            f'<div class="rule-content">{diff_html}</div>'
	            '</div>'
	            '<hr>'
	        )

	    style = """
	    <style>
	    body { font-family: sans-serif; color: var(--body-text-color); }
	    .rule-label { font-size: 1.1em; background: #f0f0f0; padding: 5px; display: block; border-top-left-radius: 5px; border-top-right-radius: 5px; }
	    .rule-content { padding: 10px; border: 1px solid #f0f0f0; border-top: none; margin-bottom: 1em; white-space: pre-wrap; }
	    hr { border: none; border-top: 1px solid #ccc; margin: 1.5em 0; }
	    </style>
	    """
	    final_html = style + "".join(sections)

	    return {
	        out_html: gr.update(value=final_html, visible=True),
	        download_jsonl: gr.update(value=saved_filepath, visible=True)
	    }


	# ─────────────────────────────────────────────────────────────────────────────
	# 4. GRADIO UI LAYOUT
	# ─────────────────────────────────────────────────────────────────────────────

	# Build the two-stage UI. Every component and both click handlers are
	# created inside the Blocks context so they are registered on `demo`.
	with gr.Blocks(theme=gr.themes.Soft(), title="Dual Rule Cleaning Tool") as demo:
	    gr.Markdown("## CAA ⇄ OneReg — Dual Document Cleaning & Comparison Tool")

	    # State to hold the original "dirty" data between steps
	    original_one_state = gr.State({})
	    original_caa_state = gr.State({})

	    # --- Stage 1: Inputs and Initial Processing ---
	    with gr.Row():
	        part_num = gr.Textbox(label="Part Number", value="139")
	        onereg_pdf = gr.File(label="Upload OneReg PDF")
	        caa_pdf = gr.File(label="Upload CAA PDF")

	    btn_process = gr.Button("1. Process PDFs & Prepare for Cleaning", variant="secondary")

	    gr.Markdown("---")

	    # --- Stage 2: User Review and Cleaning ---
	    gr.Markdown("### 2. Review and Manually Clean Both Documents")
	    gr.Markdown(
	        "Edit the text in the table below to remove any headers, footers, or other noise from both documents. Once you are finished, click the 'Finalize, Compare & Save' button.")

	    # Hidden until stage 1 has produced rows to edit.
	    review_df = gr.DataFrame(
	        headers=["Rule ID", "OneReg Text (Editable)", "CAA Text (Editable)"],
	        datatype=["str", "str", "str"],
	        interactive=True,
	        visible=False,
	        wrap=True,
	        row_count=(10, "dynamic")
	    )

	    btn_finalize = gr.Button("3. Finalize, Compare & Save", variant="primary", visible=False)

	    gr.Markdown("---")

	    # --- Stage 3: Final Comparison Output & Export ---
	    gr.Markdown("### 4. Final Comparison & Export")
	    gr.Markdown(
	        "Deletions from OneReg are in <del style='background:#fdd;'>red</del> and additions from CAA are in <ins style='background:#dfd;'>green</ins>.")

	    # Hidden until stage 2 has produced a comparison and an export file.
	    out_html = gr.HTML(visible=False)
	    download_jsonl = gr.File(label="Download Cleaned & Dirty Data (.jsonl)", visible=False)

	    # --- Wire up UI events ---
	    btn_process.click(
	        fn=stage1_process_and_review,
	        inputs=[part_num, onereg_pdf, caa_pdf],
	        outputs=[original_one_state, original_caa_state, review_df, btn_finalize]
	    )

	    btn_finalize.click(
	        fn=stage2_finalize_and_compare,
	        inputs=[review_df, original_one_state, original_caa_state],
	        outputs=[out_html, download_jsonl]
	    )

	if __name__ == "__main__":
	    # Listen on all interfaces when running on Linux, loopback otherwise.
	    current_os = platform.system()
	    server_name = "0.0.0.0" if current_os == "Linux" else "127.0.0.1"
	    demo.launch(
	        server_name=server_name,
	        # Port comes from GRADIO_SERVER_PORT when set, else 7860.
	        server_port=int(os.environ.get("GRADIO_SERVER_PORT", 7860)),
	    )