Spaces:
Sleeping
Sleeping
| # /// script | |
| # requires-python = ">=3.12" | |
| # dependencies = [ | |
| # "charset-normalizer==3.4.2", | |
| # "great-tables==0.17.0", | |
| # "marimo", | |
| # "pandas==2.3.0", | |
| # ] | |
| # /// | |
import marimo

# Version of marimo that generated this notebook file.
__generated_with = "0.14.6"

# Notebook app: full-width layout, with a title shown in the browser tab.
app = marimo.App(width="full", app_title="LLM Text Preprocessing Checker")
def _():
    # Notebook cell: import marimo under its conventional alias and expose
    # the module to the other cells via the cell's return tuple.
    import marimo as mo
    return (mo,)
def _(mo):
    # Intro cell: renders the notebook's title and description.
    # BUG FIX: corrected the user-facing typo "Additionaly" -> "Additionally".
    mo.md(
        r"""
    # LLM Text Preprocessing Checker
    Checks two files and provides the diff output as well as metrics on deleted and inserted characters.
    Additionally, provides a breakdown by Unicode character class of deletions and insertions.
    Note that this uses a pure-Python Myers diff algorithm for the comparison and may not be performant for larger diffs.
    """
    )
    return
| def _(): | |
| import unicodedata | |
| from typing import List, Dict, Any | |
| from dataclasses import dataclass | |
| from enum import IntEnum | |
| import html as python_html | |
| from great_tables import GT, loc, style | |
| import pandas as pd | |
class Operation(IntEnum):
    """Kinds of edit in a diff script."""

    DELETE = 0
    INSERT = 1
    EQUAL = 2


@dataclass
class Edit:
    """One atomic (single-character) edit produced by the Myers diff.

    The start/end pairs are half-open index ranges into the old and new
    strings; a DELETE has an empty new range and an INSERT an empty old
    range.

    BUG FIX: `Edit` was a plain class with bare annotations, yet it is
    instantiated positionally throughout (e.g.
    ``Edit(Operation.EQUAL, x, x + 1, y, y + 1, a[x], b[y])``), which raises
    TypeError.  The ``@dataclass`` decorator (already imported in this cell)
    generates the required ``__init__``.
    """

    operation: Operation
    old_start: int
    old_end: int
    new_start: int
    new_end: int
    old_text: str = ""
    new_text: str = ""
# Inline CSS snippets used when rendering diff output as HTML.
DEL_STYLE = "background-color:#ffcccc;color:#880000;text-decoration:line-through;"  # deleted text
INS_STYLE = "background-color:#ccffcc;color:#008800;"  # inserted text
EQUAL_STYLE = "color:#666666;"  # unchanged text
# Wrapper <div> style: monospace font and pre-wrap so whitespace/newlines in
# the diff survive HTML rendering.
CONTAINER_STYLE = (
    "font-family: ui-monospace, monospace; "
    "white-space: pre-wrap; "
    "line-height: 1.6; "
    "padding: 20px; "
    "background-color: #f8f9fa; "
    "border-radius: 8px; "
    "border: 1px solid #dee2e6;"
)
def classify_char(char: str) -> str:
    """Classify a character using Unicode categories."""
    if not char:
        return "empty"
    category = unicodedata.category(char)
    # CJK-related scripts get dedicated labels, overriding the general
    # Unicode category.
    script_ranges = (
        ("\u4e00", "\u9fff", "cjk_ideograph"),
        ("\u3040", "\u309f", "hiragana"),
        ("\u30a0", "\u30ff", "katakana"),
        ("\uac00", "\ud7af", "hangul"),
    )
    for low, high, label in script_ranges:
        if low <= char <= high:
            return label
    # Human-readable names for the two-letter Unicode general categories;
    # anything unmapped falls back to the raw category code.
    readable = {
        "Ll": "lowercase",
        "Lu": "uppercase",
        "Lt": "titlecase",
        "Lm": "modifier_letter",
        "Lo": "other_letter",
        "Nd": "decimal_digit",
        "Nl": "letter_number",
        "No": "other_number",
        "Pc": "connector_punctuation",
        "Pd": "dash_punctuation",
        "Ps": "open_punctuation",
        "Pe": "close_punctuation",
        "Pi": "initial_punctuation",
        "Pf": "final_punctuation",
        "Po": "other_punctuation",
        "Sm": "math_symbol",
        "Sc": "currency_symbol",
        "Sk": "modifier_symbol",
        "So": "other_symbol",
        "Zs": "space",
        "Zl": "line_separator",
        "Zp": "paragraph_separator",
        "Cc": "control",
        "Cf": "format",
        "Co": "private_use",
        "Cn": "unassigned",
    }
    return readable.get(category, category)
def _myers_backtrack(trace: List[List[int]], a: str, b: str) -> List[Edit]:
    """Back-tracking helper to materialise the edit script.

    Walks the saved frontier arrays in ``trace`` from the final d-layer back
    to d = 0, emitting one single-character Edit per step.  Edits are
    collected in reverse and flipped once at the end.
    """
    edits: List[Edit] = []
    n, m = len(a), len(b)
    x, y = n, m  # start at the bottom-right corner of the edit grid
    offset = len(trace[0]) // 2  # maps diagonal k onto an index into each v
    # Walk the layers backwards
    for d in range(len(trace) - 1, 0, -1):
        v = trace[d]
        k = x - y
        idx = k + offset
        # Determine the predecessor k'.  Reading trace[d] at idx±1 is
        # equivalent to reading trace[d-1]: within a layer only diagonals of
        # the layer's parity are overwritten, and idx±1 have the other parity.
        if k == -d or (k != d and v[idx - 1] < v[idx + 1]):
            k_prev = k + 1  # came from below (insertion)
        else:
            k_prev = k - 1  # came from right (deletion)
        x_prev = trace[d - 1][k_prev + offset]
        y_prev = x_prev - k_prev
        # Emit the matching "snake"
        while x > x_prev and y > y_prev:
            x -= 1
            y -= 1
            edits.append(Edit(Operation.EQUAL, x, x + 1, y, y + 1, a[x], b[y]))
        # Emit the single edit (INSERT or DELETE) that led to the snake
        if x_prev == x:  # insertion
            y -= 1
            edits.append(Edit(Operation.INSERT, x, x, y, y + 1, "", b[y]))
        else:  # deletion
            x -= 1
            edits.append(Edit(Operation.DELETE, x, x + 1, y, y, a[x], ""))
    # Leading snake (d = 0) – everything matched at the start
    while x > 0 and y > 0:
        x -= 1
        y -= 1
        edits.append(Edit(Operation.EQUAL, x, x + 1, y, y + 1, a[x], b[y]))
    # Any remaining leading insertions / deletions
    while x > 0:
        x -= 1
        edits.append(Edit(Operation.DELETE, x, x + 1, y, y, a[x], ""))
    while y > 0:
        y -= 1
        edits.append(Edit(Operation.INSERT, x, x, y, y + 1, "", b[y]))
    edits.reverse()
    return edits
def myers_diff(a: str, b: str) -> List[Edit]:
    """
    Character-level Myers diff.

    O((N+M)·D) time.  Memory is also O((N+M)·D), not O(N+M): a copy of the
    frontier array is stored in ``trace`` for every d-layer so the edit
    script can be backtracked afterwards.
    Returns a list of Edit objects (DELETE / INSERT / EQUAL).
    """
    n, m = len(a), len(b)
    # Trivial cases: one side empty means a single bulk edit (or no edits).
    if n == 0:
        return [Edit(Operation.INSERT, 0, 0, 0, m, "", b)] if m else []
    if m == 0:
        return [Edit(Operation.DELETE, 0, n, 0, 0, a, "")] if n else []
    max_d = n + m
    offset = max_d  # map k ∈ [-max_d .. +max_d] → index
    v = [0] * (2 * max_d + 1)  # current frontier
    trace = []  # keeps a copy of v for every d
    # Forward phase – build the "trace" that will be backtracked
    for d in range(max_d + 1):
        v_next = v[:]  # copy *once* per layer
        for k in range(-d, d + 1, 2):
            idx = k + offset
            # Choosing the predecessor (insertion vs deletion); v still holds
            # the previous layer's values for the neighbouring diagonals.
            if k == -d or (k != d and v[idx - 1] < v[idx + 1]):
                x = v[idx + 1]  # insertion (move down)
            else:
                x = v[idx - 1] + 1  # deletion (move right)
            y = x - k
            # Greedy snake – march diagonally while chars match
            while x < n and y < m and a[x] == b[y]:
                x += 1
                y += 1
            v_next[idx] = x
            # Reached the end – stop early
            if x >= n and y >= m:
                trace.append(v_next)
                return _myers_backtrack(trace, a, b)
        trace.append(v_next)
        v = v_next  # reuse buffer
    # Should never get here: d = n + m edits always suffice.
    raise RuntimeError("diff failed")
def classify_text(text: str) -> Dict[str, int]:
    """Count characters by classification.

    Returns a plain dict mapping classification name -> occurrence count;
    empty input yields an empty dict.
    """
    from collections import Counter

    # Counter replaces the hand-rolled dict.get(...) + 1 tallying loop;
    # Counter("") is empty, so the empty-text case needs no special branch.
    return dict(Counter(classify_char(char) for char in text))
def classify_edits(edits: List[Edit]) -> Dict[Operation, Dict[str, int]]:
    """
    Classify edit operations by character class.
    Returns a nested dictionary: {operation: {char_class: count}}
    """
    # Collect the changed text per operation; EQUAL runs carry no changes
    # and are skipped entirely.
    texts_by_op: Dict[Operation, List[str]] = {}
    for edit in edits:
        if edit.operation == Operation.DELETE:
            texts_by_op.setdefault(edit.operation, []).append(edit.old_text)
        elif edit.operation == Operation.INSERT:
            texts_by_op.setdefault(edit.operation, []).append(edit.new_text)
    # Classify each operation's concatenated text in a single pass.
    return {
        op: classify_text("".join(parts)) for op, parts in texts_by_op.items()
    }
def calculate_change_metrics(
    original: str,
    edits: List[Edit],
    classifications: Dict[Operation, Dict[str, int]],
) -> Dict[str, Any]:
    """Calculate detailed change metrics including percentages.

    Args:
        original: The pre-processing text; baseline for all percentages.
        edits: Character-level edit script from ``myers_diff``.
        classifications: Per-operation character-class counts from
            ``classify_edits``.

    Returns:
        Dict with totals, whole-text percentages, and a
        ``char_class_metrics`` sub-dict keyed by character class.  An
        insertion percentage of ``inf`` marks a class that did not occur in
        the original text at all.

    Cleanup: the per-class percentage computation previously tested
    ``original_count > 0`` twice in a row; the branches are merged.
    """
    metrics = {
        "total_original_chars": len(original),
        "total_deleted_chars": 0,
        "total_inserted_chars": 0,
        "deletion_percentage": 0.0,
        "insertion_percentage": 0.0,
        "net_change_percentage": 0.0,
        "char_class_metrics": {},
    }
    # Tally raw deleted/inserted character counts from the edit script.
    for edit in edits:
        if edit.operation == Operation.DELETE:
            metrics["total_deleted_chars"] += len(edit.old_text)
        elif edit.operation == Operation.INSERT:
            metrics["total_inserted_chars"] += len(edit.new_text)
    # Whole-text percentages, relative to the original length (guard the
    # empty-original case to avoid ZeroDivisionError).
    total = metrics["total_original_chars"]
    if total > 0:
        metrics["deletion_percentage"] = (
            metrics["total_deleted_chars"] / total
        ) * 100
        metrics["insertion_percentage"] = (
            metrics["total_inserted_chars"] / total
        ) * 100
        net_change = (
            metrics["total_inserted_chars"] - metrics["total_deleted_chars"]
        )
        metrics["net_change_percentage"] = (net_change / total) * 100
    # Character classification of the original text: per-class denominators.
    original_classifications = classify_text(original)
    # Union of classes seen in the original or in any change.
    all_char_classes = set(original_classifications)
    for op_classes in classifications.values():
        all_char_classes.update(op_classes.keys())
    for char_class in all_char_classes:
        original_count = original_classifications.get(char_class, 0)
        deleted_count = classifications.get(Operation.DELETE, {}).get(char_class, 0)
        inserted_count = classifications.get(Operation.INSERT, {}).get(
            char_class, 0
        )
        class_metrics = {
            "original_count": original_count,
            "deleted_count": deleted_count,
            "inserted_count": inserted_count,
            "deletion_percentage": 0.0,
            "insertion_percentage": 0.0,
        }
        if original_count > 0:
            # Both percentages are relative to this class's original count.
            class_metrics["deletion_percentage"] = (
                deleted_count / original_count
            ) * 100
            class_metrics["insertion_percentage"] = (
                inserted_count / original_count
            ) * 100
        elif inserted_count > 0:
            # Class is brand new: flag with infinity rather than divide by 0.
            class_metrics["insertion_percentage"] = float("inf")
        metrics["char_class_metrics"][char_class] = class_metrics
    return metrics
def escape_html(text: str) -> str:
    """Escape HTML and make whitespace visible."""
    # HTML-escape first, then substitute visible stand-ins for whitespace:
    # space → middle dot, tab → arrow, newline → pilcrow (the newline itself
    # is kept so pre-wrap rendering still breaks lines).
    escaped = python_html.escape(text)
    escaped = escaped.replace(" ", "·")
    escaped = escaped.replace("\t", "→ ")
    return escaped.replace("\n", "¶\n")
def generate_html_diff(
    edits: List[Edit], show_equal: bool = True, max_equal_length: int = 100
) -> str:
    """Generate HTML visualization of the diff with performance optimizations.

    Consecutive edits with the same operation are merged into one <span>
    (batched at 100 edits each) to keep the output small; long EQUAL
    stretches are truncated in the middle.
    """
    # Pre-allocate list for better performance
    html_parts = []
    # Group consecutive edits of the same type to reduce HTML tags
    grouped_edits = []
    current_group = []
    current_op = None
    for edit in edits:
        if (
            edit.operation == current_op and len(current_group) < 100
        ):  # Batch up to 100
            current_group.append(edit)
        else:
            # Operation changed (or batch full): flush and start a new group.
            if current_group:
                grouped_edits.append((current_op, current_group))
            current_group = [edit]
            current_op = edit.operation
    if current_group:
        grouped_edits.append((current_op, current_group))
    # Process grouped edits
    for op, group in grouped_edits:
        if op == Operation.DELETE:
            combined_text = "".join(e.old_text for e in group)
            escaped = escape_html(combined_text)
            html_parts.append(
                f'<span style="{DEL_STYLE}" title="Deleted">{escaped}</span>'
            )
        elif op == Operation.INSERT:
            combined_text = "".join(e.new_text for e in group)
            escaped = escape_html(combined_text)
            html_parts.append(
                f'<span style="{INS_STYLE}" title="Added">{escaped}</span>'
            )
        elif op == Operation.EQUAL and show_equal:
            combined_text = "".join(e.old_text for e in group)
            # Truncate very long equal sections: keep head and tail halves,
            # replace the middle with an "omitted" marker.
            if len(combined_text) > max_equal_length:
                start = escape_html(combined_text[: max_equal_length // 2])
                end = escape_html(combined_text[-max_equal_length // 2 :])
                omitted = len(combined_text) - max_equal_length
                html_parts.append(
                    f'<span style="{EQUAL_STYLE}">{start}'
                    f"<em>...{omitted} chars omitted...</em>"
                    f"{end}</span>"
                )
            else:
                escaped = escape_html(combined_text)
                html_parts.append(f'<span style="{EQUAL_STYLE}">{escaped}</span>')
    return f'<div style="{CONTAINER_STYLE}">{"".join(html_parts)}</div>'
def generate_side_by_side_html(edits: List[Edit]) -> str:
    """Generate side-by-side HTML diff view."""
    left: List[str] = []   # "Original" column
    right: List[str] = []  # "Processed" column
    for edit in edits:
        if edit.operation == Operation.EQUAL:
            # Unchanged text appears in both columns, unstyled.
            span = f"<span>{escape_html(edit.old_text)}</span>"
            left.append(span)
            right.append(span)
        elif edit.operation == Operation.DELETE:
            left.append(
                f'<span style="{DEL_STYLE}">{escape_html(edit.old_text)}</span>'
            )
        elif edit.operation == Operation.INSERT:
            right.append(
                f'<span style="{INS_STYLE}">{escape_html(edit.new_text)}</span>'
            )
    return f'''
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px;">
    <div>
        <h4 style="margin: 0 0 10px 0;">Original</h4>
        <div style="{CONTAINER_STYLE}">{"".join(left)}</div>
    </div>
    <div>
        <h4 style="margin: 0 0 10px 0;">Processed</h4>
        <div style="{CONTAINER_STYLE}">{"".join(right)}</div>
    </div>
</div>
'''
def generate_html_diff_fast(edits: List[Edit], context_chars: int = 5) -> str:
    """
    Ultra-fast HTML diff generation showing only changes with context.

    Only edits within ``context_chars`` positions of an actual change are
    rendered; separated ranges are joined with an ellipsis divider.
    """
    html_parts = []
    # Filter to only show changes and surrounding context
    change_indices = [
        i for i, e in enumerate(edits) if e.operation != Operation.EQUAL
    ]
    if not change_indices:
        # BUG FIX: this literal was missing its f-prefix, so the raw text
        # "{CONTAINER_STYLE}" was emitted instead of the CSS.
        return f'<div style="{CONTAINER_STYLE}">No changes found.</div>'
    # Build ranges to show (change + context)
    ranges_to_show = []
    start = max(0, change_indices[0] - context_chars)
    end = min(len(edits), change_indices[0] + context_chars + 1)
    for idx in change_indices[1:]:
        if idx - end <= context_chars * 2:
            # Close enough to the current range: extend it.
            end = min(len(edits), idx + context_chars + 1)
        else:
            # Gap too large: flush the current range and start a new one.
            ranges_to_show.append((start, end))
            start = max(0, idx - context_chars)
            end = min(len(edits), idx + context_chars + 1)
    ranges_to_show.append((start, end))
    # Generate HTML for ranges, separated by an ellipsis divider.
    for i, (start, end) in enumerate(ranges_to_show):
        if i > 0:
            html_parts.append(
                '<div style="color:#999;text-align:center;margin:10px 0;">...</div>'
            )
        for j in range(start, end):
            edit = edits[j]
            if edit.operation == Operation.DELETE:
                escaped = escape_html(edit.old_text)
                html_parts.append(f'<span style="{DEL_STYLE}">{escaped}</span>')
            elif edit.operation == Operation.INSERT:
                escaped = escape_html(edit.new_text)
                html_parts.append(f'<span style="{INS_STYLE}">{escaped}</span>')
            else:  # EQUAL
                escaped = escape_html(edit.old_text)
                html_parts.append(f'<span style="{EQUAL_STYLE}">{escaped}</span>')
    return f'<div style="{CONTAINER_STYLE}">{"".join(html_parts)}</div>'
def generate_side_by_side_html_fast(
    edits: List[Edit], context_chars: int = 5
) -> str:
    """
    Fast side-by-side HTML diff generation showing only changes with context.

    Mirrors ``generate_html_diff_fast`` but renders two columns
    (Original / Processed) instead of one inline stream.
    """
    # Filter to only show changes and surrounding context
    change_indices = [
        i for i, e in enumerate(edits) if e.operation != Operation.EQUAL
    ]
    if not change_indices:
        # BUG FIX: this template was a plain string, so "{CONTAINER_STYLE}"
        # appeared literally in the output; it must be an f-string.
        return f"""
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px;">
    <div>
        <h4 style="margin: 0 0 10px 0;">Original</h4>
        <div style="{CONTAINER_STYLE}">No changes found.</div>
    </div>
    <div>
        <h4 style="margin: 0 0 10px 0;">Processed</h4>
        <div style="{CONTAINER_STYLE}">No changes found.</div>
    </div>
</div>
"""
    # Build ranges to show (change + context)
    ranges_to_show = []
    start = max(0, change_indices[0] - context_chars)
    end = min(len(edits), change_indices[0] + context_chars + 1)
    for idx in change_indices[1:]:
        if idx - end <= context_chars * 2:
            # Close enough to the current range: extend it.
            end = min(len(edits), idx + context_chars + 1)
        else:
            # Gap too large: flush the current range and start a new one.
            ranges_to_show.append((start, end))
            start = max(0, idx - context_chars)
            end = min(len(edits), idx + context_chars + 1)
    ranges_to_show.append((start, end))
    # Generate HTML for ranges
    old_parts = []
    new_parts = []
    for i, (start, end) in enumerate(ranges_to_show):
        if i > 0:
            # Visual gap marker between non-contiguous ranges.
            separator = (
                '<div style="color:#999;text-align:center;margin:10px 0;">...</div>'
            )
            old_parts.append(separator)
            new_parts.append(separator)
        for j in range(start, end):
            edit = edits[j]
            if edit.operation == Operation.DELETE:
                escaped = escape_html(edit.old_text)
                old_parts.append(f'<span style="{DEL_STYLE}">{escaped}</span>')
            elif edit.operation == Operation.INSERT:
                escaped = escape_html(edit.new_text)
                new_parts.append(f'<span style="{INS_STYLE}">{escaped}</span>')
            else:  # EQUAL — shown in both columns
                escaped = escape_html(edit.old_text)
                old_parts.append(f'<span style="{EQUAL_STYLE}">{escaped}</span>')
                new_parts.append(f'<span style="{EQUAL_STYLE}">{escaped}</span>')
    return f'''
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px;">
    <div>
        <h4 style="margin: 0 0 10px 0;">Original</h4>
        <div style="{CONTAINER_STYLE}">{"".join(old_parts)}</div>
    </div>
    <div>
        <h4 style="margin: 0 0 10px 0;">Processed</h4>
        <div style="{CONTAINER_STYLE}">{"".join(new_parts)}</div>
    </div>
</div>
'''
def operation_to_past(op: Operation) -> str:
    """Return the past-tense English word for a diff operation.

    BUG FIX: the previous fallback was ``str(op) + "d"``.  Since Python 3.11
    ``str()`` of an IntEnum member is its numeric value (this script requires
    >= 3.12), so DELETE rendered as "0d" instead of "deleted" in the summary.
    """
    if op == Operation.INSERT:
        return "inserted"
    if op == Operation.DELETE:
        return "deleted"
    # Fallback for any future members: derive from the member name.
    return op.name.lower() + "d"
def format_diff_summary(
    edits: List[Edit],
    classifications: Dict[Operation, Dict[str, int]],
    metrics: Dict[str, Any],
) -> str:
    """Create a human-readable summary of the diff.

    Builds a Markdown document: overall statistics, per-operation
    character-class counts, and per-class change percentages.
    """
    lines = ["## Diff Summary\n"]
    # Overall statistics
    lines.append("### Overall Statistics")
    lines.append(
        f"- **Original text**: {metrics['total_original_chars']:,} characters"
    )
    # Format deletions
    del_pct = format_percentage(metrics["deletion_percentage"])
    lines.append(
        f"- **Deletions**: {metrics['total_deleted_chars']:,} characters ({del_pct})"
    )
    # Format insertions
    ins_pct = format_percentage(metrics["insertion_percentage"])
    lines.append(
        f"- **Insertions**: {metrics['total_inserted_chars']:,} characters ({ins_pct})"
    )
    # Format net change — extra precision when the change is under 0.01%
    net_pct = metrics["net_change_percentage"]
    if abs(net_pct) < 0.01:
        net_pct_str = f"{net_pct:+.3f}%"
    else:
        net_pct_str = f"{net_pct:+.1f}%"
    lines.append(
        f"- **Net change**: {net_pct_str} "
        f"({'increase' if metrics['net_change_percentage'] > 0 else 'decrease' if metrics['net_change_percentage'] < 0 else 'no change'})"
    )
    # Character classifications
    if classifications:
        lines.append("\n### Character Classifications")
        # Show changes by character class
        for op in [Operation.DELETE, Operation.INSERT]:
            if op in classifications and classifications[op]:
                lines.append(f"\n**{operation_to_past(op).title()} Characters:**")
                for char_class, count in sorted(
                    classifications[op].items(), key=lambda x: -x[1]
                ):
                    lines.append(
                        f"- {char_class.replace('_', ' ').title()}: {count}"
                    )
        # Show percentage changes by character class
        lines.append("\n### Change Percentages by Character Class")
        # Sort by most changed (highest deletion or insertion percentage);
        # infinite insertion percentages (brand-new classes) sort as 0 here.
        sorted_classes = sorted(
            metrics["char_class_metrics"].items(),
            key=lambda x: max(
                x[1]["deletion_percentage"],
                0
                if x[1]["insertion_percentage"] == float("inf")
                else x[1]["insertion_percentage"],
            ),
            reverse=True,
        )
        for char_class, class_metrics in sorted_classes:
            if (
                class_metrics["deleted_count"] > 0
                or class_metrics["inserted_count"] > 0
            ):
                class_name = char_class.replace("_", " ").title()
                # Format the line; parts are joined with " | " below.
                line_parts = [f"- **{class_name}**:"]
                if class_metrics["original_count"] > 0:
                    line_parts.append(
                        f"Original: {class_metrics['original_count']}"
                    )
                if class_metrics["deleted_count"] > 0:
                    line_parts.append(
                        f"Deleted: {class_metrics['deleted_count']} "
                        f"({class_metrics['deletion_percentage']:.1f}%)"
                    )
                if class_metrics["inserted_count"] > 0:
                    if class_metrics["insertion_percentage"] == float("inf"):
                        # Class absent from the original: show "(new)".
                        line_parts.append(
                            f"Inserted: {class_metrics['inserted_count']} (new)"
                        )
                    else:
                        line_parts.append(
                            f"Inserted: {class_metrics['inserted_count']} "
                            f"({class_metrics['insertion_percentage']:.1f}%)"
                        )
                lines.append(" | ".join(line_parts))
    return "\n".join(lines)
def format_percentage(value: float, min_decimals: int = 1) -> str:
    """Format percentage with adaptive decimal places.

    Smaller values get more decimals so tiny-but-nonzero percentages stay
    visible.  ``min_decimals`` is retained for interface compatibility but
    is not consulted.
    """
    if value == 0:
        return "0%"
    # (upper bound, decimal places), checked smallest bound first.
    for bound, decimals in ((0.01, 3), (0.1, 2), (1, 1)):
        if value < bound:
            return f"{value:.{decimals}f}%"
    return f"{value:.0f}%"
def classify_edits_with_chars(
    edits: List[Edit],
) -> Dict[Operation, Dict[str, Dict[str, int]]]:
    """
    Classify edit operations by character class and track character frequencies.
    Returns: {operation: {char_class: {char: count}}}
    """
    from collections import defaultdict, Counter

    # operation -> char_class -> Counter of individual characters
    tallies = defaultdict(lambda: defaultdict(Counter))
    for edit in edits:
        if edit.operation == Operation.EQUAL:
            continue  # unchanged text carries no change information
        text = (
            edit.old_text if edit.operation == Operation.DELETE else edit.new_text
        )
        for char in text:
            tallies[edit.operation][classify_char(char)][char] += 1
    return dict(tallies)
def get_top_chars(char_counter: Dict[str, int], n: int = 5) -> str:
    """Get top n characters by frequency, formatted for display."""
    if not char_counter:
        return "-"
    # Visible stand-ins for common whitespace characters.
    glyphs = {" ": "·", "\n": "¶", "\t": "→"}
    # Highest frequency first (stable for ties), truncated to n entries.
    ranked = sorted(char_counter.items(), key=lambda item: -item[1])[:n]
    rendered = []
    for char, _count in ranked:
        if char in glyphs:
            rendered.append(glyphs[char])
        elif ord(char) < 32 or ord(char) == 127:
            # Remaining control characters shown as hex escapes.
            rendered.append(f"\\x{ord(char):02x}")
        else:
            rendered.append(char)
    return " ".join(rendered)
def create_summary_tables(
    edits: List[Edit],
    classifications: Dict[Operation, Dict[str, int]],
    metrics: Dict[str, Any],
) -> Dict[str, GT]:
    """Create great_tables tables for the diff summary.

    Returns three tables keyed "overall", "char_class" (None when no
    per-class changes exist), and "compact".
    """
    # Get detailed character data (per-character frequencies within classes)
    detailed_classifications = classify_edits_with_chars(edits)
    # Table 1: Overall Statistics (unchanged)
    overall_data = pd.DataFrame(
        {
            "Metric": [
                "Original Length",
                "Characters Deleted",
                "Characters Inserted",
                "Net Change",
            ],
            "Count": [
                metrics["total_original_chars"],
                metrics["total_deleted_chars"],
                metrics["total_inserted_chars"],
                metrics["total_inserted_chars"] - metrics["total_deleted_chars"],
            ],
            "Percentage": [
                "-",
                format_percentage(metrics["deletion_percentage"]),
                format_percentage(metrics["insertion_percentage"]),
                # Extra precision when the net change is under 0.01%.
                f"{metrics['net_change_percentage']:+.3f}%"
                if abs(metrics["net_change_percentage"]) < 0.01
                else f"{metrics['net_change_percentage']:+.1f}%",
            ],
        }
    )
    overall_table = (
        GT(overall_data)
        .tab_header(
            title="Text Change Summary",
            subtitle=f"Total edits: {len([e for e in edits if e.operation != Operation.EQUAL])}",
        )
        .fmt_number(columns="Count", decimals=0, use_seps=True)
        .tab_style(
            # Highlight the "Net Change" row (index 3): grey fill + bold.
            style=[style.fill(color="#f0f0f0"), style.text(weight="bold")],
            locations=loc.body(rows=[3]),
        )
        .cols_align(align="center", columns=["Count", "Percentage"])
        .opt_stylize(style=1, color="blue")
    )
    # Table 2: Character Class Changes with top characters
    char_class_data = []
    # Get all character classes
    all_classes = set()
    for op_classes in classifications.values():
        all_classes.update(op_classes.keys())
    all_classes.update(metrics["char_class_metrics"].keys())
    # Build rows
    for char_class in sorted(all_classes):
        class_metrics = metrics["char_class_metrics"].get(char_class, {})
        # Get top characters for this class
        del_chars = detailed_classifications.get(Operation.DELETE, {}).get(
            char_class, {}
        )
        ins_chars = detailed_classifications.get(Operation.INSERT, {}).get(
            char_class, {}
        )
        row = {
            "Character Class": char_class.replace("_", " ").title(),
            "Original": class_metrics.get("original_count", 0),
            "Deleted": class_metrics.get("deleted_count", 0),
            "Top Deleted": get_top_chars(del_chars, 5),
            "Inserted": class_metrics.get("inserted_count", 0),
            "Top Inserted": get_top_chars(ins_chars, 5),
            # "-" when nothing of this class was deleted/inserted; "new"
            # when the class did not exist in the original (inf sentinel).
            "Del %": format_percentage(class_metrics.get("deletion_percentage", 0))
            if class_metrics.get("deletion_percentage", 0) > 0
            else "-",
            "Ins %": (
                "new"
                if class_metrics.get("insertion_percentage", 0) == float("inf")
                else format_percentage(class_metrics.get("insertion_percentage", 0))
                if class_metrics.get("insertion_percentage", 0) > 0
                else "-"
            ),
        }
        # Only include rows with changes
        if row["Deleted"] > 0 or row["Inserted"] > 0:
            char_class_data.append(row)
    if char_class_data:
        char_class_df = pd.DataFrame(char_class_data)
        char_class_table = (
            GT(char_class_df)
            .tab_header(title="Changes by Character Classification")
            .fmt_number(
                columns=["Original", "Deleted", "Inserted"],
                decimals=0,
                use_seps=True,
            )
            .tab_style(
                style=style.fill(color="#ffcccc"),
                locations=loc.body(columns=["Deleted", "Top Deleted"]),
            )
            .tab_style(
                style=style.fill(color="#ccffcc"),
                locations=loc.body(columns=["Inserted", "Top Inserted"]),
            )
            .tab_style(
                # Monospace keeps the character samples legible.
                style=style.text(font="monospace"),
                locations=loc.body(columns=["Top Deleted", "Top Inserted"]),
            )
            .cols_align(
                align="center",
                columns=["Original", "Deleted", "Inserted", "Del %", "Ins %"],
            )
            .cols_align(align="left", columns=["Top Deleted", "Top Inserted"])
            .tab_spanner(
                label="Counts", columns=["Original", "Deleted", "Inserted"]
            )
            .tab_spanner(
                label="Characters", columns=["Top Deleted", "Top Inserted"]
            )
            .tab_spanner(label="Percentages", columns=["Del %", "Ins %"])
            .cols_width(
                {
                    "Character Class": "20%",
                    "Original": "10%",
                    "Deleted": "10%",
                    "Top Deleted": "15%",
                    "Inserted": "10%",
                    "Top Inserted": "15%",
                    "Del %": "10%",
                    "Ins %": "10%",
                }
            )
            .opt_stylize(style=1, color="blue")
        )
    else:
        # No per-class changes: the caller receives None for this table.
        char_class_table = None
    # Table 3: Compact Combined View (unchanged except for percentage formatting)
    compact_data = []
    # Add summary row
    compact_data.append(
        {
            "Type": "Total",
            "Deleted": metrics["total_deleted_chars"],
            "Inserted": metrics["total_inserted_chars"],
            "Net": metrics["total_inserted_chars"] - metrics["total_deleted_chars"],
            "Change": f"{metrics['net_change_percentage']:+.3f}%"
            if abs(metrics["net_change_percentage"]) < 0.01
            else f"{metrics['net_change_percentage']:+.0f}%",
        }
    )
    # Add top character classes (sorted by total change)
    class_changes = []
    for char_class, class_metrics in metrics["char_class_metrics"].items():
        if (
            class_metrics["deleted_count"] > 0
            or class_metrics["inserted_count"] > 0
        ):
            class_changes.append(
                {
                    "Type": char_class.replace("_", " ").title(),
                    "Deleted": class_metrics["deleted_count"],
                    "Inserted": class_metrics["inserted_count"],
                    "Net": class_metrics["inserted_count"]
                    - class_metrics["deleted_count"],
                    # "Change" temporarily holds the total churn used as the
                    # sort key; it is overwritten with a display string below.
                    "Change": class_metrics["deleted_count"]
                    + class_metrics["inserted_count"],
                }
            )
    # Sort by total change and take top 5
    class_changes.sort(key=lambda x: x["Change"], reverse=True)
    for item in class_changes[:5]:
        item["Change"] = f"{item['Net']:+d}" if item["Net"] != 0 else "±0"
        compact_data.append(item)
    compact_df = pd.DataFrame(compact_data)
    compact_table = (
        GT(compact_df)
        .tab_header(title="Edit Summary - Compact View")
        .fmt_number(
            columns=["Deleted", "Inserted", "Net"], decimals=0, use_seps=True
        )
        .tab_style(
            # Emphasise the "Total" summary row (index 0).
            style=[
                style.fill(color="#e8e8e8"),
                style.text(weight="bold"),
                style.borders(sides=["top", "bottom"], color="#666", weight="2px"),
            ],
            locations=loc.body(rows=[0]),
        )
        .tab_style(
            style=style.text(color="#880000"),
            locations=loc.body(columns=["Deleted"]),
        )
        .tab_style(
            style=style.text(color="#008800"),
            locations=loc.body(columns=["Inserted"]),
        )
        .cols_align(
            align="center", columns=["Deleted", "Inserted", "Net", "Change"]
        )
        .cols_width(
            {
                "Type": "40%",
                "Deleted": "15%",
                "Inserted": "15%",
                "Net": "15%",
                "Change": "15%",
            }
        )
        .opt_stylize(style=1, color="cyan")
    )
    return {
        "overall": overall_table,
        "char_class": char_class_table,
        "compact": compact_table,
    }
def create_operation_matrix_table(
    edits: List[Edit], classifications: Dict[Operation, Dict[str, int]]
) -> GT:
    """Create a matrix view of operations by character class.

    Each row is one character class with its deletion count, insertion
    count, and balance (insertions minus deletions), colour-scaled.
    """
    # Get all character classes
    all_classes = set()
    for op_classes in classifications.values():
        all_classes.update(op_classes.keys())
    # Build matrix data
    matrix_data = []
    for char_class in sorted(all_classes):
        row = {
            "Character Type": char_class.replace("_", " ").title(),
            "Deletions": classifications.get(Operation.DELETE, {}).get(
                char_class, 0
            ),
            "Insertions": classifications.get(Operation.INSERT, {}).get(
                char_class, 0
            ),
            # Positive balance = net growth of this class.
            "Balance": (
                classifications.get(Operation.INSERT, {}).get(char_class, 0)
                - classifications.get(Operation.DELETE, {}).get(char_class, 0)
            ),
        }
        matrix_data.append(row)
    # Sort by total changes
    matrix_data.sort(key=lambda x: x["Deletions"] + x["Insertions"], reverse=True)
    # Convert to DataFrame
    matrix_df = pd.DataFrame(matrix_data)
    # Calculate max values for the colour-scale domains; default=1 avoids a
    # degenerate [0, 0] domain when there is no data.
    max_del = max((r["Deletions"] for r in matrix_data), default=1)
    max_ins = max((r["Insertions"] for r in matrix_data), default=1)
    max_balance = max((abs(r["Balance"]) for r in matrix_data), default=1)
    matrix_table = (
        GT(matrix_df)
        .tab_header(title="Operation Matrix by Character Type")
        .fmt_number(columns=["Deletions", "Insertions", "Balance"], decimals=0)
        .data_color(
            columns=["Deletions"],
            palette=["white", "#ffcccc"],
            domain=[0, max_del],
        )
        .data_color(
            columns=["Insertions"],
            palette=["white", "#ccffcc"],
            domain=[0, max_ins],
        )
        .data_color(
            # Symmetric domain so zero balance maps to the white midpoint.
            columns=["Balance"],
            palette=["#ffcccc", "white", "#ccffcc"],
            domain=[-max_balance, max_balance],
        )
        .cols_align(align="center", columns=["Deletions", "Insertions", "Balance"])
        .opt_stylize(style=2, color="gray")
    )
    return matrix_table
def is_long_diff(edits: List[Edit], original: str) -> bool:
    """Return True when the diff is big enough to warrant fast rendering.

    Thresholds: more than 1000 edit operations, or an original text longer
    than 10,000 characters.
    """
    too_many_edits = len(edits) > 1000
    too_long_input = len(original) > 10000
    return too_many_edits or too_long_input
def analyze_text_changes(
    original: str,
    processed: str,
) -> Dict[str, Any]:
    """Run the complete diff pipeline between two texts.

    Computes the Myers edit script, classifies every edit by Unicode
    character class, derives change metrics, and packages all artifacts
    (including pre-built display tables) into a single dict.
    """
    edit_script = myers_diff(original, processed)
    per_op_classes = classify_edits(edit_script)
    change_metrics = calculate_change_metrics(original, edit_script, per_op_classes)
    return {
        "edits": edit_script,
        "classifications": per_op_classes,
        "metrics": change_metrics,
        "summary": format_diff_summary(edit_script, per_op_classes, change_metrics),
        "tables": create_summary_tables(edit_script, per_op_classes, change_metrics),
        "matrix_table": create_operation_matrix_table(edit_script, per_op_classes),
    }
def render_html_diff(
    edits: List[Edit],
    original: str,
    context_chars: int = 5,
    side_by_side: bool = False,
    use_fast_html: bool | None = None,
) -> str:
    """
    Render an edit script as an HTML diff, choosing fast or full rendering.

    Args:
        edits: List of Edit operations to render.
        original: Original text, used only by the auto length heuristic.
        context_chars: Context size forwarded to the fast renderers.
        side_by_side: Render two columns instead of a combined view.
        use_fast_html: Force fast mode on/off; None auto-detects via
            is_long_diff().

    Returns:
        HTML string of the diff.
    """
    fast = is_long_diff(edits, original) if use_fast_html is None else use_fast_html
    if fast and side_by_side:
        return generate_side_by_side_html_fast(edits, context_chars=context_chars)
    if fast:
        return generate_html_diff_fast(edits, context_chars=context_chars)
    if side_by_side:
        # Even outside fast mode, large edit scripts fall back to the
        # fast side-by-side renderer with an output-length cap.
        if len(edits) > 500:
            return generate_side_by_side_html_fast(edits, max_length=50000)
        return generate_side_by_side_html(edits)
    return generate_html_diff(edits, show_equal=True, max_equal_length=200)
| return analyze_text_changes, render_html_diff | |
def _(mo):
    # File-upload widgets for the two inputs, laid out side by side.
    # The returned names wire these widgets into the downstream cells.
    o_file_upload = mo.ui.file(label="Original text", kind="area")
    p_file_upload = mo.ui.file(label="Preprocessed text", kind="area")
    file_stack = mo.hstack([o_file_upload, p_file_upload], widths="equal")
    return file_stack, o_file_upload, p_file_upload
def _(mo):
    # Free-text input areas as an alternative to file upload.
    # The returned names wire these widgets into the downstream cells.
    o_textbox = mo.ui.text_area(label="Original text", full_width=True)
    p_textbox = mo.ui.text_area(label="Preprocessed text", full_width=True)
    text_stack = mo.hstack([o_textbox, p_textbox], widths="equal")
    return o_textbox, p_textbox, text_stack
def _(file_stack, mo, text_stack):
    # Let the user pick between pasting text and uploading files;
    # the tabs widget is this cell's displayed output.
    mo.ui.tabs({"Text": text_stack, "File": file_stack})
    return
def check_text_similarity(text1: str, text2: str, threshold: float = 0.1) -> bool:
    """Heuristic gate: do the texts look similar enough to be worth diffing?

    Requires (a) Jaccard overlap of the character sets >= threshold and
    (b) relative length difference <= 1 - threshold. Empty inputs never pass.
    """
    if not text1 or not text2:
        return False
    chars1, chars2 = set(text1), set(text2)
    jaccard = len(chars1 & chars2) / len(chars1 | chars2)
    length_gap = abs(len(text1) - len(text2)) / max(len(text1), len(text2))
    return jaccard >= threshold and length_gap <= 1 - threshold
def _(mo, o_file_upload, o_textbox, p_file_upload, p_textbox):
    # Resolve the two input texts (file upload beats textbox beats demo text),
    # decode uploads with a detected encoding, and abort the notebook run if
    # the texts are too dissimilar to diff.
    from charset_normalizer import detect

    def detect_encoding(b: bytes) -> str:
        # charset_normalizer can report encoding=None when detection fails;
        # fall back to UTF-8 so .decode() never receives None.
        result = detect(b)
        return result["encoding"] or "utf-8"

    # Demo texts used when the user supplies nothing.
    o_text, p_text = (
        "Example text will be used if none provided!",
        "Example Text will be used, if none provided.",
    )
    try:
        if o_file_upload.contents():
            encoding = detect_encoding(o_file_upload.contents())
            try:
                o_text = o_file_upload.contents().decode(encoding)
            except UnicodeDecodeError:
                o_text = o_file_upload.contents().decode("utf-8")
        elif o_textbox.value:
            o_text = o_textbox.value
        if p_file_upload.contents():
            # Bug fix: detect the encoding of the preprocessed upload itself
            # (previously this inspected the *original* upload's bytes).
            encoding = detect_encoding(p_file_upload.contents())
            try:
                p_text = p_file_upload.contents().decode(encoding)
            except UnicodeDecodeError:
                p_text = p_file_upload.contents().decode("utf-8")
        elif p_textbox.value:
            p_text = p_textbox.value
    except UnicodeDecodeError:
        mo.stop(
            True,
            mo.md("Error decoding files. Please try UTF-8.").callout(kind="danger"),
        )
    # Refuse to compare wildly different texts (guards the pure-Python diff).
    mo.stop(
        not check_text_similarity(o_text, p_text),
        mo.md(
            f"Texts are too dissimilar! Aborting comparison.\n\n{o_text[:50]}\n\n{p_text[:50]}"
        ).callout(kind="danger"),
    )
    return o_text, p_text
def _(analyze_text_changes, o_text, p_text):
    # Run the full diff analysis once; downstream cells consume `results`.
    results = analyze_text_changes(o_text, p_text)
    return (results,)
def _(mo, results):
    # Stack the three pre-built summary tables vertically for display.
    tables = results["tables"]
    results_tables = mo.vstack(
        [
            tables["overall"],
            tables["char_class"],
            tables["compact"],
        ]
    )
    return (results_tables,)
def _(mo, o_text, render_html_diff, results, results_tables):
    # Render the edit script twice (combined and side-by-side) and let the
    # user switch between the two views with tabs.
    combined_html = mo.Html(
        render_html_diff(
            results["edits"],
            o_text,
        )
    )
    side_by_side_html = mo.Html(
        render_html_diff(
            results["edits"],
            o_text,
            side_by_side=True,
        )
    )
    diff_view = mo.ui.tabs(
        {
            "Combined diff": combined_html,
            "Side-by-side diff": side_by_side_html,
        }
    )
    # Markdown output combining the metric tables and the diff tabs.
    mo.md(f"""
    # Results
    {results_tables}
    {diff_view}
    """)
    return
def _():
    # Intentionally empty scratch cell.
    return
| if __name__ == "__main__": | |
| app.run() | |