Spaces:

Text-to-Document-Generation
/

Docgenie-API

Paused

Docgenie-API / docgenie /generation /handwriting_diffusion /generate_handwriting_diffusion_raw.py

Ahadhassan-2003

deploy: update HF Space

dc4e6da about 1 month ago

62 kB

	#!/usr/bin/env python3
	"""
	Diffusion-based handwriting token generator with intelligent word splitting and stitching.

	This script:
	- Reads handwriting bbox JSON files with format: "x1,y1,x2,y2,text,block_no,line_no,word_no"
	- Intelligently splits long words internally based on --split-length parameter
	- Splits numeric sequences within tokens into configurable chunk sizes (default: 2)
	- Generates handwriting using HuggingFace diffusion model with text conditioning
	- Stitches split word segments horizontally with baseline alignment
	- Supports sentence-level reconstruction using line metadata
	- Outputs transparent RGBA images with tight cropping
	- Maintains consistent writer styles per document
	- Supports batched generation for GPU efficiency

	Usage example:
	python scripts/generate_handwriting_diffusion_raw.py \
	--input-dir docvqa-handwritten-sizes4/handwriting_bbox \
	--output-dir docvqa-handwritten-sizes4/handwriting_raw_tokens \
	--run-dir model/experiments/hf_conditional_latent \
	--checkpoint latest.pt \
	--steps 30 --split-length 7 --batch-size 8 --device cuda

	With sentence stitching and custom baseline:
	python scripts/generate_handwriting_diffusion_raw.py \
	--input-dir docvqa-handwritten-sizes4/handwriting_bbox \
	--output-dir docvqa-handwritten-sizes4/handwriting_raw_tokens \
	--run-dir model/experiments/hf_conditional_latent \
	--checkpoint latest.pt \
	--steps 30 --split-length 7 --stitch-sentences \
	--baseline-percentile 85.0 --device cuda

	Install requirements:
	pip install torch diffusers transformers Pillow PyYAML numpy rich

	Mapping file (raw_token_map.json) structure:
	{
	"backend": "diffusion-hf",
	"split_length": 7,
	"entries": [
	{
	"source_json": "example.json",
	"hw_id": "hw0",
	"author_id": "author1",
	"words": [
	{
	"block_no": 22,
	"line_no": 0,
	"word_no": 0,
	"image": "example/hw0_0.png",
	"style_id": 123,
	"width": 250,
	"height": 64,
	"segments": [
	{"token": "genera", "bbox": [x1,y1,x2,y2]},
	{"token": "tion", "bbox": [x1,y1,x2,y2]}
	]
	}
	]
	}
	],
	"file_author_styles": {"example.json": {"author1": {"style_id": 123}}}
	}
	"""

	from __future__ import annotations
	import argparse
	import json
	import math
	import random
	import sys
	from copy import deepcopy
	from datetime import datetime
	from dataclasses import dataclass
	from pathlib import Path
	from typing import Any, Dict, List, Optional, Tuple
	from collections import defaultdict

	from .tokenizer import CharTokenizer
	from .text_encoder import TextEncoder

	try:
	import torch
	import torch.nn as nn
	from diffusers import (
	AutoencoderKL,
	DDPMScheduler,
	DPMSolverMultistepScheduler,
	UNet2DConditionModel,
	)
	from diffusers.training_utils import EMAModel
	import numpy as np
	from PIL import Image
	import yaml
	from rich.progress import Progress
	except Exception as e:
	print(
	"[ERROR] Missing dependencies. Install: torch diffusers transformers Pillow PyYAML",
	file=sys.stderr,
	)
	raise


	# Bounding box as (x1, y1, x2, y2) floats, in the order parse_bbox_record returns them.
	BBox = Tuple[float, float, float, float]


	@dataclass
	class WordSegment:
	    """One piece of a word produced by the splitting helpers.

	    Attributes:
	        token: Text of this piece.
	        bbox: Bounding box attached to this piece.
	        original_index: Position of the piece inside its word
	            (0 = first, 1 = second, ...).
	        space_before: True when a space preceded this piece in the
	            original word text.
	    """

	    token: str
	    bbox: BBox
	    original_index: int
	    space_before: bool = False


	@dataclass
	class WordTask:
	    """A whole word to render, together with the pieces it was split into.

	    Attributes:
	        source_json: Name of the JSON file the word was read from.
	        hw_id: Identifier of the handwriting entry the word belongs to.
	        author_id: Author identifier of that entry.
	        block_no: Block index of the word.
	        line_no: Line index of the word.
	        word_no: Word index of the word.
	        segments: Pieces of the word (one item when it was not split).
	        original_bbox: Bounding box of the word before splitting.
	        include_in_sentence: Whether the word is considered for sentence
	            stitching.
	        sentence_exclusion_reason: Why the word is left out of sentence
	            stitching, when it is.
	    """

	    source_json: str
	    hw_id: str
	    author_id: str
	    block_no: int
	    line_no: int
	    word_no: int
	    segments: List[WordSegment]
	    original_bbox: BBox
	    include_in_sentence: bool = True
	    sentence_exclusion_reason: Optional[str] = None


	# ---------------------------- util ----------------------------


	def list_json_files(p: Path) -> List[Path]:
	    """Return the regular ``*.json`` files directly inside ``p``, sorted."""
	    candidates = (entry for entry in p.glob("*.json") if entry.is_file())
	    return sorted(candidates)


	def load_json(path: Path):
	    """Parse the UTF-8 encoded JSON document stored at ``path``."""
	    return json.loads(path.read_text(encoding="utf-8"))


	def parse_bbox_record(rec: str) -> Tuple[BBox, str, int, int, int]:
	    """Parse a record of the form ``x1,y1,x2,y2,text,block_no,line_no,word_no``.

	    The text field may itself contain commas: the first four fields are the
	    coordinates, the last three are the indices, and everything in between is
	    joined back together as the text.

	    Raises:
	        ValueError: If the record has fewer than 8 comma-separated fields, or
	            a coordinate/index field is not numeric.
	    """
	    fields = rec.split(",")
	    if len(fields) < 8:
	        raise ValueError(f"Invalid bbox record (expected at least 8 parts): {rec}")

	    bbox = tuple(float(value) for value in fields[:4])
	    block_no, line_no, word_no = (int(value) for value in fields[-3:])
	    token = ",".join(fields[4:-3])
	    return bbox, token, block_no, line_no, word_no


	def split_word(word: str, split_length: int) -> List[str]:
	    """
	    Split a word into the fewest near-equal segments of at most ``split_length`` chars.

	    Args:
	        word: The word to split.
	        split_length: Maximum length for each segment. A value <= 0 disables
	            splitting and returns the word unchanged.

	    Returns:
	        List of segments in order. Segment lengths differ by at most one, and
	        the longer segments come first.

	    Examples:
	        split_word("generation", 4)    -> ["gene", "rat", "ion"]     (4, 3, 3)
	        split_word("generation", 5)    -> ["gener", "ation"]         (5, 5)
	        split_word("extraordinary", 7) -> ["extraor", "dinary"]      (7, 6)
	        split_word("extraordinary", 5) -> ["extra", "ordi", "nary"]  (5, 4, 4)
	        split_word("hello", 10)        -> ["hello"]                  (5)

	    Strategy:
	        - Use the minimum number of segments, ceil(len / split_length).
	        - Spread the characters as evenly as possible over those segments.
	    """
	    word_len = len(word)
	    if split_length <= 0 or word_len <= split_length:
	        return [word]

	    # Ceiling division without floats.
	    num_segments = -(-word_len // split_length)

	    # The first `remainder` segments get one extra character. With
	    # num_segments = ceil(len / split_length), even those longer segments
	    # cannot exceed split_length, so no runtime check is needed.
	    base_length, remainder = divmod(word_len, num_segments)

	    segments: List[str] = []
	    start = 0
	    for i in range(num_segments):
	        end = start + base_length + (1 if i < remainder else 0)
	        segments.append(word[start:end])
	        start = end
	    return segments


	def split_token_preserving_digit_chunks(
	    token: str, split_length_words: int, split_length_numeric: int
	) -> List[str]:
	    """
	    Split a token, cutting runs of digits into fixed-size chunks.

	    The token is scanned as alternating runs of digit and non-digit
	    characters. Digit runs are cut into consecutive chunks; non-digit runs
	    are handed to ``split_word``.

	    Args:
	        token: The token to split.
	        split_length_words: Maximum length for each non-digit segment. When
	            positive it also caps the digit chunk size.
	        split_length_numeric: Chunk size for digit runs. A value <= 0 turns
	            the digit handling off and the whole token goes to ``split_word``.

	    Returns:
	        Segments in their original order; ``[token]`` when nothing was produced.
	    """
	    if split_length_numeric <= 0:
	        return split_word(token, split_length_words)

	    # The chunk size is the same for every digit run, so work it out once.
	    digit_chunk = max(1, split_length_numeric)
	    if split_length_words > 0:
	        digit_chunk = min(digit_chunk, split_length_words)

	    pieces: List[str] = []
	    pos = 0
	    end = len(token)
	    while pos < end:
	        run_is_digits = token[pos].isdigit()
	        run_start = pos
	        while pos < end and token[pos].isdigit() == run_is_digits:
	            pos += 1
	        run = token[run_start:pos]

	        if run_is_digits:
	            pieces.extend(
	                run[offset : offset + digit_chunk]
	                for offset in range(0, len(run), digit_chunk)
	            )
	        else:
	            pieces.extend(split_word(run, split_length_words))

	    return pieces if pieces else [token]


	def split_word_with_spaces(
	    word: str, split_length_words: int, split_length_numeric: int
	) -> List[Tuple[str, bool]]:
	    """
	    Split a word at spaces first, then split each part by length.

	    Args:
	        word: The word to split (may contain spaces).
	        split_length_words: Maximum length for each non-digit segment.
	        split_length_numeric: Chunk size for digit runs inside each part
	            (<= 0 disables the digit handling).

	    Returns:
	        List of ``(segment_text, space_before)`` tuples. ``space_before`` is
	        True only for the first segment of a part that follows a space in the
	        original word; this includes a part that follows a leading space.
	        Empty input, or input made only of spaces, yields ``[]``.

	    Examples:
	        split_word_with_spaces("hello world", 10, 2)
	            -> [("hello", False), ("world", True)]
	        split_word_with_spaces("very long phrase", 5, 2)
	            -> [("very", False), ("long", True), ("phr", True), ("ase", False)]
	        split_word_with_spaces("hello", 3, 2)
	            -> [("hel", False), ("lo", False)]
	        split_word_with_spaces("ab 1234", 5, 2)
	            -> [("ab", False), ("12", True), ("34", False)]

	    Strategy:
	        1. Split at spaces.
	        2. Apply length-based splitting (with digit chunking) to each part.
	        3. Flag the first segment of every part after the first.
	    """
	    if not word:
	        return []

	    result: List[Tuple[str, bool]] = []
	    for part_idx, part in enumerate(word.split(" ")):
	        if not part:
	            # Consecutive, leading or trailing spaces produce empty parts.
	            continue

	        length_segments = split_token_preserving_digit_chunks(
	            part, split_length_words, split_length_numeric
	        )
	        for seg_idx, seg in enumerate(length_segments):
	            # part_idx counts empty parts too, so a word that starts with a
	            # space flags its first real segment as well.
	            result.append((seg, part_idx > 0 and seg_idx == 0))

	    return result


	def _build_segments(
	    text: str,
	    bbox: BBox,
	    split_length_words: int,
	    split_length_numeric: int,
	) -> List[WordSegment]:
	    """Split ``text`` and wrap every piece in a WordSegment that carries ``bbox``."""
	    return [
	        WordSegment(
	            token=seg_text,
	            bbox=bbox,
	            original_index=seg_idx,
	            space_before=space_before,
	        )
	        for seg_idx, (seg_text, space_before) in enumerate(
	            split_word_with_spaces(text, split_length_words, split_length_numeric)
	        )
	    ]


	def extract_tasks(
	    json_path: Path,
	    data: List[Dict[str, Any]],
	    split_length_words: int,
	    split_length_numeric: int,
	) -> Tuple[List[WordTask], List[Dict[str, Any]]]:
	    """
	    Extract word tasks from parsed JSON data, splitting long words internally.

	    Entries with a non-empty ``"bboxes"`` list produce one task per bbox
	    record; every segment of a word shares that word's bbox. Entries without
	    bboxes fall back to the whitespace-separated words of ``"text"``: those
	    tasks get a zero bbox, ``block_no``/``line_no`` of -1, a ``word_no`` of
	    ``100000 + n`` (n counted per ``hw_id``) and are excluded from sentence
	    stitching. Entries with neither bboxes nor text are skipped and logged.

	    Args:
	        json_path: Path to the JSON file; only its name is recorded.
	        data: Parsed JSON data (``None`` items are ignored).
	        split_length_words: Maximum word length before splitting.
	        split_length_numeric: Chunk size for digit runs within tokens
	            (<= 0 disables the digit handling).

	    Returns:
	        Tuple of (word tasks, extraction log entries).

	    Raises:
	        ValueError: Propagated from ``parse_bbox_record`` for a malformed
	            bbox record.
	    """
	    tasks: List[WordTask] = []
	    extraction_logs: List[Dict[str, Any]] = []
	    fallback_counters: Dict[str, int] = defaultdict(int)
	    zero_bbox: BBox = (0.0, 0.0, 0.0, 0.0)
	    source_name = json_path.name

	    for obj in data:
	        if obj is None:
	            continue

	        hw_id = obj.get("id")
	        author_id = obj.get("author-id") or obj.get("author_id")
	        bboxes = obj.get("bboxes")
	        text_content = (obj.get("text") or "").strip()

	        if bboxes:
	            for rec in bboxes:
	                bbox, token, block_no, line_no, word_no = parse_bbox_record(rec)
	                # NOTE(review): an empty or space-only token yields a task
	                # with no segments here, while the fallback path below skips
	                # such words. Confirm downstream code copes with that.
	                tasks.append(
	                    WordTask(
	                        source_json=source_name,
	                        hw_id=hw_id,
	                        author_id=author_id,
	                        block_no=block_no,
	                        line_no=line_no,
	                        word_no=word_no,
	                        segments=_build_segments(
	                            token, bbox, split_length_words, split_length_numeric
	                        ),
	                        original_bbox=bbox,
	                    )
	                )
	            continue

	        # No bboxes: fall back to the entry's plain text, if it has any.
	        if not text_content:
	            extraction_logs.append(
	                {
	                    "type": "extraction_skip",
	                    "source_json": source_name,
	                    "hw_id": hw_id,
	                    "reason": "missing_bbox_no_text",
	                }
	            )
	            continue

	        # text_content is stripped and non-empty, so split() always returns
	        # at least one word and no further emptiness check is needed.
	        fallback_words = text_content.split()

	        for raw_word in fallback_words:
	            segments = _build_segments(
	                raw_word, zero_bbox, split_length_words, split_length_numeric
	            )
	            if not segments:
	                continue

	            fallback_counter = fallback_counters[hw_id]
	            fallback_counters[hw_id] += 1
	            tasks.append(
	                WordTask(
	                    source_json=source_name,
	                    hw_id=hw_id,
	                    author_id=author_id,
	                    block_no=-1,
	                    line_no=-1,
	                    word_no=100000 + fallback_counter,
	                    segments=segments,
	                    original_bbox=zero_bbox,
	                    include_in_sentence=False,
	                    sentence_exclusion_reason="missing_bbox",
	                )
	            )

	        extraction_logs.append(
	            {
	                "type": "extraction_notice",
	                "source_json": source_name,
	                "hw_id": hw_id,
	                "reason": "missing_bbox_generated",
	                "num_words": len(fallback_words),
	            }
	        )

	    return tasks, extraction_logs


	def style_id_for_file(json_name: str, author_id: str, seed: int, vocab: int) -> int:
	    """Deterministically derive a style id for a (json_name, author_id) combo.

	    The built-in ``hash()`` is randomised per interpreter process for ``str``
	    (see ``PYTHONHASHSEED``), so it cannot give the same id on two runs. A
	    SHA-256 digest of the composite key is used instead, which makes the
	    result depend only on the arguments.

	    Args:
	        json_name: Name of the source JSON file.
	        author_id: Author identifier within that file.
	        seed: Integer mixed into the digest so different seeds give
	            different assignments.
	        vocab: Number of available style ids; the result is taken modulo it.

	    Returns:
	        ``(digest ^ seed) % vocab``, where ``digest`` is the first 8 bytes of
	        the SHA-256 of ``"{json_name}::{author_id}"`` read as a big-endian
	        integer.
	    """
	    import hashlib  # local import: the module does not import hashlib at file level

	    composite = f"{json_name}::{author_id}"
	    digest = hashlib.sha256(composite.encode("utf-8")).digest()
	    stable_hash = int.from_bytes(digest[:8], "big")
	    return (stable_hash ^ seed) % vocab


	def build_word_filename(task: WordTask) -> str:
	    """Build ``<hw_id>_b<block>_l<line>_w<word>.png`` for a word task.

	    A block or line number of ``None`` is written as ``X`` (``bX`` / ``lX``).
	    """
	    block_part = "bX" if task.block_no is None else f"b{task.block_no}"
	    line_part = "lX" if task.line_no is None else f"l{task.line_no}"
	    return f"{task.hw_id}_{block_part}_{line_part}_w{task.word_no}.png"


	# ------------------------ generation -------------------------


	def load_experiment(
	    run_dir: Path, checkpoint_name: str, device: torch.device
	) -> Dict[str, Any]:
	    """
	    Load all model components needed for sampling from an experiment directory.

	    Reads ``config.yaml`` and ``writer_id_map.json`` from ``run_dir``, builds
	    the tokenizer, text encoder, UNet, DPM-Solver++ scheduler and (in latent
	    mode) the VAE, then restores the text-encoder and UNet weights from the
	    checkpoint. When the checkpoint has an ``"ema"`` entry, the EMA weights
	    are copied into the UNet.

	    Args:
	        run_dir: Experiment directory (``~`` is expanded, path is resolved).
	        checkpoint_name: Checkpoint file, looked up inside ``run_dir`` first
	            and then as a path on its own.
	        device: Device the modules are moved to and the checkpoint is mapped to.

	    Returns:
	        Dict with keys ``tokenizer``, ``text_encoder``, ``unet``,
	        ``noise_scheduler``, ``vae`` (``None`` outside latent mode),
	        ``vae_scale_factor``, ``writer_id_map``, ``device``, ``config``,
	        ``sample_shape`` and ``mode``.

	    Raises:
	        FileNotFoundError: If the run directory, ``config.yaml``,
	            ``writer_id_map.json`` or the checkpoint cannot be found.
	        KeyError: If latent mode is configured without ``model.vae``, or the
	            shape entry for the active mode (``model.latent_shape`` /
	            ``model.image_shape``) is missing.
	    """
	    run_dir = run_dir.expanduser().resolve()
	    if not run_dir.exists():
	        raise FileNotFoundError(f"Run directory {run_dir} does not exist.")

	    config_path = run_dir / "config.yaml"
	    if not config_path.exists():
	        raise FileNotFoundError(f"Expected config at {config_path}.")

	    with open(config_path, "r", encoding="utf-8") as f:
	        config = yaml.safe_load(f)

	    # Tokenizer: a relative vocab path is tried under run_dir first, then
	    # under run_dir's parent.
	    vocab_path = Path(config["data"]["vocab_path"])
	    if not vocab_path.is_absolute():
	        vocab_path = run_dir / vocab_path
	        if not vocab_path.exists():
	            vocab_path = run_dir.parent / config["data"]["vocab_path"]

	    tokenizer = CharTokenizer.load(str(vocab_path))

	    # Writer id map; its size fixes the number of class embeddings of the UNet.
	    writer_map_path = run_dir / "writer_id_map.json"
	    if not writer_map_path.exists():
	        raise FileNotFoundError(f"Expected writer mapping at {writer_map_path}.")
	    with open(writer_map_path, "r", encoding="utf-8") as f:
	        raw_writer_map = json.load(f)
	    writer_id_map = {str(k): int(v) for k, v in raw_writer_map.items()}
	    num_writers = len(writer_id_map)

	    # Text encoder
	    text_cfg = config["model"]["text_encoder"]
	    text_encoder = TextEncoder(
	        vocab_size=len(tokenizer),
	        d_model=text_cfg["d_model"],
	        num_layers=text_cfg["num_layers"],
	        num_heads=text_cfg["num_heads"],
	        d_ff=text_cfg["d_ff"],
	        dropout=text_cfg["dropout"],
	        max_length=text_cfg["max_length"],
	        output_dim=text_cfg.get("output_dim", text_cfg["d_model"]),
	    ).to(device)
	    text_encoder.eval()

	    # UNet: work on a copy so the returned config is left untouched.
	    unet_cfg = deepcopy(config["model"]["unet"])
	    pretrained_path = unet_cfg.pop("pretrained_model_name_or_path", None)

	    # YAML gives lists; convert the sequence-valued entries to tuples.
	    for key in ("down_block_types", "up_block_types", "block_out_channels"):
	        if key in unet_cfg and isinstance(unet_cfg[key], list):
	            unet_cfg[key] = tuple(unet_cfg[key])

	    if "sample_size" in unet_cfg and isinstance(unet_cfg["sample_size"], list):
	        unet_cfg["sample_size"] = tuple(unet_cfg["sample_size"])

	    unet_cfg["num_class_embeds"] = num_writers

	    if pretrained_path:
	        unet = UNet2DConditionModel.from_pretrained(
	            pretrained_path, num_class_embeds=num_writers
	        ).to(device)
	    else:
	        unet = UNet2DConditionModel(**unet_cfg).to(device)

	    unet.eval()

	    # Scheduler: DPM-Solver++ with solver order 3.
	    scheduler_cfg = config["model"]["scheduler"]
	    noise_scheduler = DPMSolverMultistepScheduler(
	        num_train_timesteps=scheduler_cfg["num_train_timesteps"],
	        beta_start=scheduler_cfg["beta_start"],
	        beta_end=scheduler_cfg["beta_end"],
	        beta_schedule=scheduler_cfg["beta_schedule"],
	        prediction_type=scheduler_cfg.get("prediction_type", "epsilon"),
	        algorithm_type="dpmsolver++",
	        solver_order=3,
	        use_karras_sigmas=scheduler_cfg.get("use_karras_sigmas", False),
	    )
	    if "timestep_spacing" in scheduler_cfg:
	        # NOTE(review): this sets the value on the scheduler config after
	        # construction; confirm the installed diffusers version honours it.
	        noise_scheduler.config.timestep_spacing = scheduler_cfg["timestep_spacing"]

	    # VAE, only in latent mode. It is cached under run_dir after first load.
	    mode = config["training"].get("mode", "latent")
	    vae = None
	    vae_scale_factor = 0.18215
	    if mode == "latent":
	        vae_config = config["model"].get("vae")
	        if vae_config is None:
	            raise KeyError("Latent mode requires 'model.vae' configuration.")
	        vae_model_name = vae_config["model_name"]

	        vae_cache_dir = run_dir / "cached_vae"
	        if vae_cache_dir.exists():
	            vae = AutoencoderKL.from_pretrained(vae_cache_dir).to(device)
	        else:
	            vae = AutoencoderKL.from_pretrained(vae_model_name).to(device)
	            vae_cache_dir.mkdir(parents=True, exist_ok=True)
	            vae.save_pretrained(vae_cache_dir)

	        vae.eval()

	    # Checkpoint: resolve the path first, then report the file actually loaded.
	    checkpoint_path = run_dir / checkpoint_name
	    if not checkpoint_path.exists():
	        checkpoint_path = Path(checkpoint_name)
	    if not checkpoint_path.exists():
	        raise FileNotFoundError(f"Checkpoint {checkpoint_name} not found.")
	    print(f"Loading checkpoint: {checkpoint_path}")

	    # SECURITY: torch.load unpickles the file, so only load checkpoints from
	    # a trusted source.
	    checkpoint = torch.load(checkpoint_path, map_location=device)
	    text_encoder.load_state_dict(checkpoint["text_encoder"])

	    # strict=False tolerates key mismatches; report them instead of hiding them.
	    unet_load = unet.load_state_dict(checkpoint["unet"], strict=False)
	    if unet_load.missing_keys or unet_load.unexpected_keys:
	        print(
	            f"[WARNING] UNet checkpoint mismatch: "
	            f"{len(unet_load.missing_keys)} missing key(s), "
	            f"{len(unet_load.unexpected_keys)} unexpected key(s)",
	            file=sys.stderr,
	        )

	    # EMA weights, when stored, replace the UNet weights loaded above.
	    if "ema" in checkpoint:
	        training_cfg = config.get("training", {})
	        ema_model = EMAModel(
	            unet.parameters(),
	            decay=training_cfg.get("ema_decay", 0.9999),
	            use_ema_warmup=training_cfg.get("ema_use_warmup", False),
	            inv_gamma=training_cfg.get("ema_inv_gamma", 1.0),
	            power=training_cfg.get("ema_power", 1.0),
	            min_decay=training_cfg.get("ema_min_decay", 0.0),
	            device=device,
	            model_cls=UNet2DConditionModel,
	            model_config=unet.config,
	        )
	        ema_model.load_state_dict(checkpoint["ema"])
	        ema_model.to(device)
	        ema_model.copy_to(unet.parameters())

	    # Shape of one sample, taken from the entry that matches the mode.
	    shape_key = "latent_shape" if mode == "latent" else "image_shape"
	    raw_shape = config["model"].get(shape_key)
	    if raw_shape is None:
	        raise KeyError(f"Mode '{mode}' requires 'model.{shape_key}' configuration.")
	    sample_shape = tuple(raw_shape)

	    return {
	        "tokenizer": tokenizer,
	        "text_encoder": text_encoder,
	        "unet": unet,
	        "noise_scheduler": noise_scheduler,
	        "vae": vae,
	        "vae_scale_factor": vae_scale_factor,
	        "writer_id_map": writer_id_map,
	        "device": device,
	        "config": config,
	        "sample_shape": sample_shape,
	        "mode": mode,
	    }


	def diffusion_generate_batch(
	    tokens: List[str],
	    style_ids: List[int],
	    components: Dict[str, Any],
	    steps: int,
	    temperature: float = 1.0,
	) -> List[Image.Image]:
	    """
	    Generate one handwriting image per token with the diffusion model.

	    Args:
	        tokens: Text of each image to generate.
	        style_ids: Writer class index for each token, passed to the UNet as
	            ``class_labels``. Must have the same length as ``tokens``.
	        components: Dict as returned by ``load_experiment``.
	        steps: Number of scheduler timesteps to sample with.
	        temperature: Factor the initial Gaussian noise is multiplied by.

	    Returns:
	        One RGBA image per token, binarized with an Otsu threshold, cropped to
	        its ink and with a transparent background. The crop box is stored in
	        ``image.info["crop_box"]``. An empty ``tokens`` list returns ``[]``.

	    Raises:
	        ValueError: If ``tokens`` and ``style_ids`` differ in length.
	    """
	    if not tokens:
	        return []
	    if len(style_ids) != len(tokens):
	        raise ValueError(
	            f"Expected one style id per token: got {len(tokens)} tokens "
	            f"and {len(style_ids)} style ids"
	        )

	    device = components["device"]
	    tokenizer = components["tokenizer"]
	    text_encoder = components["text_encoder"]
	    unet = components["unet"]
	    noise_scheduler = components["noise_scheduler"]
	    sample_shape = components["sample_shape"]
	    mode = components["mode"]
	    vae = components.get("vae")
	    vae_scale_factor = components.get("vae_scale_factor", 0.18215)

	    batch_size = len(tokens)

	    # Encode text
	    encodings = tokenizer.encode_batch(tokens)
	    input_ids = torch.tensor(encodings["input_ids"], device=device, dtype=torch.long)
	    attention_mask = torch.tensor(
	        encodings["attention_mask"], device=device, dtype=torch.float32
	    )

	    # Writer style ids are used directly as class indices.
	    writer_indices = torch.tensor(style_ids, device=device, dtype=torch.long)

	    noise_scheduler.set_timesteps(steps, device=device)
	    timesteps = noise_scheduler.timesteps

	    # Start from Gaussian noise scaled by the temperature.
	    batch_shape = (batch_size,) + tuple(sample_shape)
	    latents = torch.randn(batch_shape, device=device) * temperature

	    # Inference only: keep text encoding, sampling and decoding out of
	    # autograd so the result can be converted to NumPy afterwards.
	    with torch.no_grad():
	        text_features = text_encoder(input_ids, attention_mask=attention_mask)

	        for timestep in timesteps:
	            step_value = int(timestep)
	            t_batch = torch.full(
	                (batch_size,), step_value, device=device, dtype=torch.long
	            )

	            model_output = unet(
	                latents,
	                t_batch,
	                encoder_hidden_states=text_features,
	                encoder_attention_mask=attention_mask,
	                class_labels=writer_indices,
	            )
	            noise_pred = (
	                model_output.sample if hasattr(model_output, "sample") else model_output
	            )

	            latents = noise_scheduler.step(noise_pred, step_value, latents).prev_sample

	        # Decode through the VAE in latent mode; otherwise the samples
	        # already are images.
	        if mode == "latent" and vae is not None:
	            decoded = vae.decode(latents / vae_scale_factor).sample
	        else:
	            decoded = latents

	        # Map from [-1, 1] to [0, 1].
	        images = (decoded / 2 + 0.5).clamp(0.0, 1.0)

	    results: List[Image.Image] = []
	    for arr in images.cpu().numpy():
	        if arr.shape[0] == 1:
	            arr = arr[0]  # drop the channel dimension of single-channel output
	        else:
	            arr = arr.transpose(1, 2, 0)  # CHW -> HWC

	        arr8 = (arr * 255).round().astype("uint8")

	        # Multi-channel output is averaged down to one channel.
	        if arr8.ndim == 3:
	            arr8 = arr8.mean(axis=2).astype("uint8")

	        # Pixels brighter than the Otsu threshold become background (255).
	        thresh = otsu_threshold(arr8)
	        bin_arr = (arr8 > thresh).astype("uint8") * 255

	        cropped, crop_box = crop_to_content(bin_arr)

	        rgba = binary_to_rgba(cropped)
	        rgba.info["crop_box"] = crop_box
	        results.append(rgba)

	    return results


	# ---------------------- binarization utils -------------------


	def otsu_threshold(arr8):
	    """Return the Otsu threshold level (0-255) of an 8-bit array.

	    Each level ``t`` splits the pixels into those with value <= ``t`` and
	    those above. The level with the largest between-class variance wins, and
	    the lowest such level is kept on ties. Returns 0 when no level leaves
	    pixels on both sides (for example a constant or empty array).
	    """
	    histogram = np.bincount(arr8.ravel(), minlength=256).astype(np.float64)
	    n_pixels = arr8.size
	    intensity_sum = (histogram * np.arange(256)).sum()

	    lower_count = 0.0
	    lower_sum = 0.0
	    best_score = -1.0
	    best_level = 0
	    for level in range(256):
	        lower_count += histogram[level]
	        if lower_count == 0:
	            # Nothing at or below this level yet.
	            continue
	        upper_count = n_pixels - lower_count
	        if upper_count == 0:
	            # Every pixel is at or below this level; no further split exists.
	            break
	        lower_sum += level * histogram[level]
	        lower_mean = lower_sum / lower_count
	        upper_mean = (intensity_sum - lower_sum) / upper_count
	        score = lower_count * upper_count * (lower_mean - upper_mean) ** 2
	        if score > best_score:
	            best_score, best_level = score, level
	    return best_level


	# ---------------------- cropping & alpha --------------------


	def crop_to_content(bin_arr: np.ndarray, pad: int = 0):
	    """Crop a binary array (0 = ink, 255 = background) to its ink.

	    Every value below 255 counts as ink. ``pad`` widens the box on all sides,
	    clipped to the array bounds.

	    Returns:
	        ``(cropped_array, (x1, y1, x2, y2))`` with ``x2``/``y2`` exclusive.
	        Without any ink the top-left 1x1 slice and ``(0, 0, 1, 1)`` are
	        returned as a placeholder.
	    """
	    height, width = bin_arr.shape
	    ink = bin_arr < 255
	    if not ink.any():
	        return bin_arr[:1, :1], (0, 0, 1, 1)

	    ink_rows = np.flatnonzero(ink.any(axis=1))
	    ink_cols = np.flatnonzero(ink.any(axis=0))
	    top, bottom = ink_rows[0], ink_rows[-1]
	    left, right = ink_cols[0], ink_cols[-1]

	    if pad:
	        left = max(0, left - pad)
	        top = max(0, top - pad)
	        right = min(width - 1, right + pad)
	        bottom = min(height - 1, bottom + pad)

	    # top/bottom/left/right are inclusive; the reported box is exclusive.
	    box = (int(left), int(top), int(right) + 1, int(bottom) + 1)
	    return bin_arr[top : bottom + 1, left : right + 1], box


	def binary_to_rgba(bin_arr: np.ndarray) -> Image.Image:
	    """Turn a binary array (0 = ink, 255 = background) into an RGBA image.

	    Ink pixels become opaque black; every other pixel is fully transparent.
	    """
	    height, width = bin_arr.shape
	    # All four channels start at zero: black colour, fully transparent.
	    pixels = np.zeros((height, width, 4), dtype="uint8")
	    # Only the alpha channel changes: 255 exactly where the input is ink.
	    pixels[:, :, 3] = (bin_arr == 0).astype("uint8") * 255
	    return Image.fromarray(pixels, mode="RGBA")


	def pad_tokens_to_equal_length(tokens: List[str]) -> List[str]:
	    """Pad tokens to equal length by appending spaces to the shorter ones.

	    Returns the input list itself when it is empty, otherwise a new list in
	    which every token is left-justified to the length of the longest token.
	    Nothing is printed; the padded list is built once.
	    """
	    if not tokens:
	        return tokens
	    max_len = max(map(len, tokens))
	    return [t.ljust(max_len) for t in tokens]


	def calculate_baseline_info(
	    img: Image.Image, baseline_percentile: float = 85.0
	) -> Dict[str, Any]:
	    """
	    Calculate baseline information for an image.

	    A pixel counts as ink when its alpha is above 200. The baseline is the
	    given percentile of the bottom-most ink row of each column that has ink.
	    Input whose array is not H x W x 4 (including 2-D grayscale) is treated
	    as fully opaque instead of raising.

	    Args:
	        img: Image to analyse (RGBA expected).
	        baseline_percentile: Percentile for baseline detection (default: 85.0).

	    Returns:
	        Dictionary with baseline metrics:
	        - baseline_y: Baseline row, in pixels from the top (``height - 1``
	          when there is no ink).
	        - baseline_ratio: ``baseline_y / height``.
	        - height_above: Rows above the baseline.
	        - height_below: Rows below the baseline.
	        - ascender_ratio: ``height_above / height``.
	        - descender_ratio: ``height_below / height``.
	        All ratios are 0.0 for a zero-height image.
	    """
	    arr = np.array(img)
	    height = img.height

	    if arr.ndim == 3 and arr.shape[2] == 4:
	        alpha = arr[:, :, 3]
	    else:
	        alpha = np.full((height, img.width), 255, dtype=np.uint8)

	    ink_mask = alpha > 200
	    inked_cols = ink_mask.any(axis=0)

	    if inked_cols.any():
	        # argmax on the vertically flipped mask finds the first ink pixel
	        # counted from the bottom, i.e. the bottom-most ink row per column.
	        rows_from_bottom = ink_mask[::-1, inked_cols].argmax(axis=0)
	        bottom_rows = (ink_mask.shape[0] - 1) - rows_from_bottom
	        baseline_y = int(np.percentile(bottom_rows, baseline_percentile))
	    else:
	        # No ink: use the bottom row as baseline.
	        baseline_y = height - 1

	    height_above = baseline_y
	    height_below = height - 1 - baseline_y

	    return {
	        "baseline_y": baseline_y,
	        "baseline_ratio": baseline_y / height if height > 0 else 0.0,
	        "height_above": height_above,
	        "height_below": height_below,
	        "ascender_ratio": height_above / height if height > 0 else 0.0,
	        "descender_ratio": height_below / height if height > 0 else 0.0,
	    }


	def concatenate_images_horizontal(
	    images: List[Image.Image],
	    gap: int = 0,
	    baseline_align: bool = True,
	    baseline_percentile: float = 75.0,
	) -> Image.Image:
	    """
	    Horizontally concatenate a list of RGBA images.

	    Args:
	        images: List of RGBA images to concatenate.
	        gap: Spacing between neighbouring images in pixels.
	        baseline_align: If True, align the images on their detected
	            baselines; if False, center them vertically.
	        baseline_percentile: Percentile for baseline detection (default: 75.0).

	    Returns:
	        The concatenated RGBA image on a transparent canvas. A single-image
	        list returns that image object itself, not a copy.

	    Raises:
	        ValueError: If ``images`` is empty.
	    """
	    if not images:
	        raise ValueError("Cannot concatenate empty image list")
	    if len(images) == 1:
	        return images[0]

	    total_width = sum(img.width for img in images) + gap * (len(images) - 1)

	    if baseline_align:
	        # Reuse the shared baseline detector rather than repeating its logic.
	        baselines = [
	            calculate_baseline_info(img, baseline_percentile)["baseline_y"]
	            for img in images
	        ]
	        # The canvas must hold the tallest part above and below the baseline.
	        max_above = max(0, max(baselines))
	        max_below = max(
	            0, max(img.height - 1 - base for img, base in zip(images, baselines))
	        )
	        canvas_height = max_above + 1 + max_below
	        y_offsets = [max_above - base for base in baselines]
	    else:
	        canvas_height = max(img.height for img in images)
	        y_offsets = [(canvas_height - img.height) // 2 for img in images]

	    result = Image.new("RGBA", (total_width, canvas_height), (0, 0, 0, 0))

	    x_offset = 0
	    for img, y_offset in zip(images, y_offsets):
	        # The image is its own mask, so transparent pixels stay transparent.
	        result.paste(img, (x_offset, y_offset), img)
	        x_offset += img.width + gap

	    return result


	def concatenate_segments_with_variable_gaps(
	    images: List[Image.Image],
	    segments: List[WordSegment],
	    segment_gap: int = 2,
	    word_gap: int = 20,
	    baseline_percentile: float = 75.0,
	) -> Image.Image:
	    """
	    Concatenate segment images on a common baseline with per-segment gaps.

	    The gap placed in front of a segment is ``word_gap`` when that segment
	    has ``space_before`` set, and ``segment_gap`` otherwise.

	    Args:
	        images: List of RGBA segment images (same length as ``segments``).
	        segments: List of WordSegment objects with ``space_before`` flags.
	        segment_gap: Gap for length-split segments (no space in original).
	        word_gap: Gap for space-separated segments.
	        baseline_percentile: Percentile for baseline detection.

	    Returns:
	        The concatenated RGBA image on a transparent canvas. A single-image
	        list returns that image object itself, before the length check.

	    Raises:
	        ValueError: If ``images`` is empty, or it holds more than one image
	            and its length differs from ``segments``.
	    """
	    if not images:
	        raise ValueError("Cannot concatenate empty image list")
	    if len(images) == 1:
	        return images[0]
	    if len(images) != len(segments):
	        raise ValueError(f"Mismatch: {len(images)} images but {len(segments)} segments")

	    # Reuse the shared baseline detector rather than repeating its logic.
	    baselines = [
	        calculate_baseline_info(img, baseline_percentile)["baseline_y"]
	        for img in images
	    ]
	    max_above = max(0, max(baselines))
	    max_below = max(
	        0, max(img.height - 1 - base for img, base in zip(images, baselines))
	    )
	    canvas_height = max_above + 1 + max_below

	    # gaps[i] is the space between image i and image i + 1; compute it once
	    # and use it for both the canvas width and the paste positions.
	    gaps = [word_gap if seg.space_before else segment_gap for seg in segments[1:]]
	    total_width = sum(img.width for img in images) + sum(gaps)

	    result = Image.new("RGBA", (total_width, canvas_height), (0, 0, 0, 0))

	    x_offset = 0
	    # The trailing 0 pairs the last image with "no gap after it".
	    for img, baseline, gap_after in zip(images, baselines, gaps + [0]):
	        result.paste(img, (x_offset, max_above - baseline), img)
	        x_offset += img.width + gap_after

	    return result


	# -------------------------- main -----------------------------


	def _word_result_entry(
	    task: "WordTask",
	    relative_image_path: str,
	    style_id: int,
	    size: Tuple[int, int],
	    baseline_info: Any,
	    skipped: bool,
	) -> Dict[str, Any]:
	    """Build the per-word record kept in ``results`` and exported to the mapping file.

	    ``size`` is the ``(width, height)`` of the word image. ``skipped`` is True
	    when an image already on disk was reused instead of being generated.
	    """
	    width, height = size
	    return {
	        "image": relative_image_path,
	        "hw_id": task.hw_id,
	        "author_id": task.author_id,
	        "style_id": style_id,
	        "source_json": task.source_json,
	        "block_no": task.block_no,
	        "line_no": task.line_no,
	        "word_no": task.word_no,
	        "segments": [
	            {
	                "token": seg.token,
	                "bbox": list(seg.bbox),
	                "space_before": seg.space_before,
	            }
	            for seg in task.segments
	        ],
	        "skipped": skipped,
	        "skip_reason": "existing_output" if skipped else None,
	        "include_in_sentence": task.include_in_sentence,
	        "sentence_exclusion_reason": task.sentence_exclusion_reason,
	        "width": width,
	        "height": height,
	        "baseline": baseline_info,
	    }


	def _sentence_exclusion_entry(
	    task: "WordTask", relative_image_path: str
	) -> Dict[str, Any]:
	    """Build the log record for a word that is left out of sentence stitching."""
	    return {
	        "source_json": task.source_json,
	        "hw_id": task.hw_id,
	        "word_no": task.word_no,
	        "block_no": task.block_no,
	        "line_no": task.line_no,
	        "image": relative_image_path,
	        "reason": task.sentence_exclusion_reason or "manual_exclusion",
	    }


	def _stitch_line_image(
	    word_list: List[Dict[str, Any]],
	    output_dir: Path,
	    sent_path: Path,
	    word_gap: int,
	    baseline_percentile: float,
	) -> Optional[Tuple[int, int]]:
	    """Stitch the word images of one line into ``sent_path``.

	    Word records whose image file is missing are ignored. Returns the
	    ``(width, height)`` of the saved sentence image, or None when no word
	    image could be found at all.
	    """
	    word_images = []
	    try:
	        for word_data in word_list:
	            word_img_path = output_dir / word_data["image"]
	            if word_img_path.exists():
	                word_images.append(Image.open(word_img_path))

	        if not word_images:
	            return None

	        # Stitch words together with the larger (word-level) gap
	        sentence_image = concatenate_images_horizontal(
	            word_images,
	            gap=word_gap,
	            baseline_align=True,
	            baseline_percentile=baseline_percentile,
	        )
	        sentence_image.save(sent_path)
	        return sentence_image.size
	    finally:
	        # Release the file handles only after the stitched image is on disk.
	        for img in word_images:
	            img.close()


	def generate_handwriting(
	    input_dir: Path,
	    output_dir: Path,
	    run_dir: Path,
	    checkpoint: str = "latest.pt",
	    progress: Optional[Progress] = None,
	    steps: int = 30,
	    split_length_words: int = 6,
	    split_length_numeric: int = 2,
	    temperature: float = 0.5,
	    seed: int = 42,
	    device: str = "cuda",
	    overwrite: bool = False,
	    mapping_file: Optional[Path] = None,
	    log_file: Optional[Path] = None,
	    batch_size: int = 32,
	    stitch_sentences: bool = True,
	    segment_gap: int = 2,
	    word_gap: int = 20,
	    baseline_percentile: float = 75.0,
	    allowed_writers: Optional[List[str]] = None,
	) -> None:
	    """Generate handwriting images and metadata using configured diffusion models.

	    Word images are written to ``<output_dir>/<json stem>/``; when
	    ``stitch_sentences`` is set, per-line images go to
	    ``<output_dir>/sentences/<json stem>/`` together with ``sentence_map.json``.
	    A consolidated word mapping and a JSON run log are written at the end.

	    Args:
	        input_dir: Directory searched for bbox JSON files.
	        output_dir: Root directory for all outputs; created if missing.
	        run_dir: Experiment directory handed to ``load_experiment``.
	        checkpoint: Checkpoint name handed to ``load_experiment``.
	        progress: Progress tracker to report to; one is created when omitted.
	        steps: Number of diffusion steps passed to the generator.
	        split_length_words: Word split length passed to ``extract_tasks``.
	        split_length_numeric: Digit chunk length passed to ``extract_tasks``.
	        temperature: Sampling temperature passed to the generator.
	        seed: Seeds ``random``, ``torch`` and the writer-selection RNG.
	        device: Requested device; falls back to ``"cpu"`` when CUDA is not
	            available.
	        overwrite: When False, images that already exist on disk are reused.
	        mapping_file: Mapping path (default ``output_dir/raw_token_map.json``).
	        log_file: Log path (default ``output_dir/generation_log.json``).
	        batch_size: Number of word tasks processed between progress updates;
	            generation itself is issued once per word.
	        stitch_sentences: Whether to also build per-line sentence images.
	        segment_gap: Pixel gap between split segments of one word.
	        word_gap: Pixel gap used where a space precedes a segment and between
	            words of a stitched sentence.
	        baseline_percentile: Forwarded to the baseline/stitching helpers.
	        allowed_writers: Optional writer indices (as strings) that styles are
	            restricted to; invalid or out-of-range values are ignored.

	    Raises:
	        SystemExit: If no JSON files are found, or ``allowed_writers`` is given
	            but contains no valid writer index.
	    """
	    import traceback
	    from datetime import timezone

	    random.seed(seed)
	    torch.manual_seed(seed)
	    device_obj = torch.device(
	        device if torch.cuda.is_available() or device == "cpu" else "cpu"
	    )

	    input_dir = Path(input_dir)
	    output_dir = Path(output_dir)
	    run_dir = Path(run_dir)
	    mapping_file = Path(mapping_file) if mapping_file is not None else None
	    log_file = Path(log_file) if log_file is not None else None

	    # Load model components
	    print(f"Loading model from {run_dir}...")
	    components = load_experiment(run_dir, checkpoint, device_obj)
	    print("✓ Model loaded successfully")
	    print(f" Mode: {components['mode']}")
	    print(f" Sample shape: {components['sample_shape']}")
	    print(f" Writers: {len(components['writer_id_map'])}")

	    output_dir.mkdir(parents=True, exist_ok=True)

	    # Load JSON files
	    json_files = list_json_files(input_dir)
	    if not json_files:
	        print("[ERROR] No JSON files found.", file=sys.stderr)
	        sys.exit(1)

	    print(f"Found {len(json_files)} JSON files")

	    # Extract tasks with word splitting
	    tasks: List[WordTask] = []
	    extraction_logs: List[Dict[str, Any]] = []
	    for jf in json_files:
	        data = load_json(jf)
	        extracted_tasks, extracted_log_entries = extract_tasks(
	            jf, data, split_length_words, split_length_numeric
	        )
	        tasks.extend(extracted_tasks)
	        extraction_logs.extend(extracted_log_entries)

	    print(f"Extracted {len(tasks)} word tasks")
	    if split_length_words > 0:
	        total_segments = sum(len(t.segments) for t in tasks)
	        print(
	            f" Split into {total_segments} segments (split_length={split_length_words}, digit_chunk_length={split_length_numeric})"
	        )

	    # Per-file author style mapping
	    file_author_style_ids: Dict[str, Dict[str, int]] = {}
	    writer_id_map = components["writer_id_map"]

	    # Filter to allowed writers if specified
	    allowed_writer_ids = None
	    if allowed_writers is not None:
	        allowed_writer_ids = []
	        for w in allowed_writers:
	            try:
	                writer_id = int(w)
	            except ValueError:
	                print(f"[WARNING] Invalid writer ID '{w}', must be integer, ignoring")
	                continue
	            if 0 <= writer_id < len(writer_id_map):
	                allowed_writer_ids.append(writer_id)
	            else:
	                print(
	                    f"[WARNING] Writer ID {writer_id} out of range (0-{len(writer_id_map) - 1}), ignoring"
	                )

	        if not allowed_writer_ids:
	            print("[ERROR] No valid writer IDs provided in --allowed-writers")
	            sys.exit(1)

	        print(
	            f"Using {len(allowed_writer_ids)} allowed writer(s): {sorted(allowed_writer_ids)}"
	        )

	    # Dedicated RNG so random writer selection is reproducible for a given seed
	    rng = random.Random(seed)

	    for t in tasks:
	        styles_for_file = file_author_style_ids.setdefault(t.source_json, {})
	        if t.author_id in styles_for_file:
	            continue
	        if t.author_id in writer_id_map:
	            # Map author_id to writer index from the model's writer_id_map
	            style_id = writer_id_map[t.author_id]
	            # If this author's style is not allowed, randomly pick an allowed one
	            if allowed_writer_ids is not None and style_id not in allowed_writer_ids:
	                style_id = rng.choice(allowed_writer_ids)
	        elif allowed_writer_ids is not None:
	            # Author not in map: pick from the allowed writers
	            style_id = rng.choice(allowed_writer_ids)
	        else:
	            # Author not in map and no restriction: fall back to hashing
	            style_id = style_id_for_file(
	                t.source_json, t.author_id, seed, len(writer_id_map)
	            )
	        styles_for_file[t.author_id] = style_id

	    results: List[Dict[str, Any]] = []
	    generation_skip_log: List[Dict[str, Any]] = []
	    generation_error_log: List[Dict[str, Any]] = []
	    sentence_exclusion_log: List[Dict[str, Any]] = []
	    total_words = len(tasks)
	    effective_batch_size = max(1, batch_size)
	    # NOTE(review): a Progress created here is never started in this function,
	    # so presumably nothing is rendered unless the caller passes a live one.
	    progress = progress or Progress(transient=True)
	    generation_task_id = progress.add_task("Generating words", total=total_words)

	    for word_idx in range(0, total_words, effective_batch_size):
	        batch_tasks = tasks[word_idx : word_idx + effective_batch_size]

	        # Process each word task
	        for task in batch_tasks:
	            json_stem = Path(task.source_json).stem
	            doc_dir = output_dir / json_stem
	            doc_dir.mkdir(parents=True, exist_ok=True)

	            # Output filename includes block and line numbers to avoid collisions across lines
	            out_name = build_word_filename(task)
	            relative_image_path = f"{json_stem}/{out_name}"
	            out_path = doc_dir / out_name
	            style_id = file_author_style_ids[task.source_json][task.author_id]

	            if out_path.exists() and not overwrite:
	                # Reuse the image on disk; only its metadata is recomputed
	                try:
	                    with Image.open(out_path) as existing_img:
	                        existing_size = existing_img.size
	                        baseline_info = calculate_baseline_info(
	                            existing_img, baseline_percentile=baseline_percentile
	                        )
	                    results.append(
	                        _word_result_entry(
	                            task,
	                            relative_image_path,
	                            style_id,
	                            existing_size,
	                            baseline_info,
	                            skipped=True,
	                        )
	                    )
	                    generation_skip_log.append(
	                        {
	                            "type": "existing_output",
	                            "source_json": task.source_json,
	                            "hw_id": task.hw_id,
	                            "word_no": task.word_no,
	                            "block_no": task.block_no,
	                            "line_no": task.line_no,
	                            "image": relative_image_path,
	                        }
	                    )
	                    if not task.include_in_sentence:
	                        sentence_exclusion_log.append(
	                            _sentence_exclusion_entry(task, relative_image_path)
	                        )
	                except Exception as e:
	                    print(f"[WARN] Could not load existing {out_path}: {e}")
	                # An existing file is never regenerated here, readable or not
	                continue

	            # Generate all segments for this word
	            try:
	                tokens_batch = [seg.token for seg in task.segments]
	                style_ids_batch = [style_id] * len(tokens_batch)

	                segment_images = diffusion_generate_batch(
	                    tokens_batch,
	                    style_ids_batch,
	                    components,
	                    steps,
	                    temperature=temperature,
	                )

	                # Concatenate segments with variable gaps (word-gap for spaces, segment-gap for length splits)
	                if len(segment_images) > 1:
	                    final_image = concatenate_segments_with_variable_gaps(
	                        segment_images,
	                        task.segments,
	                        segment_gap=segment_gap,
	                        word_gap=word_gap,
	                        baseline_percentile=baseline_percentile,
	                    )
	                else:
	                    final_image = segment_images[0]

	                final_image.save(out_path)

	                # Calculate baseline information for alignment
	                baseline_info = calculate_baseline_info(
	                    final_image, baseline_percentile=baseline_percentile
	                )

	                results.append(
	                    _word_result_entry(
	                        task,
	                        relative_image_path,
	                        style_id,
	                        final_image.size,
	                        baseline_info,
	                        skipped=False,
	                    )
	                )
	                if not task.include_in_sentence:
	                    sentence_exclusion_log.append(
	                        _sentence_exclusion_entry(task, relative_image_path)
	                    )
	            except Exception as e:
	                # Best effort: a failed word is logged and the run continues
	                print(
	                    f"[ERROR] Generation failed for {task.hw_id} word {task.word_no}: {e}",
	                    file=sys.stderr,
	                )
	                traceback.print_exc()
	                generation_error_log.append(
	                    {
	                        "type": "generation_error",
	                        "source_json": task.source_json,
	                        "hw_id": task.hw_id,
	                        "word_no": task.word_no,
	                        "block_no": task.block_no,
	                        "line_no": task.line_no,
	                        "reason": str(e),
	                        "traceback": traceback.format_exc(),
	                    }
	                )

	        progress.advance(generation_task_id, len(batch_tasks))

	    # Sentence-level stitching (if requested)
	    if stitch_sentences:
	        print("\nStitching words into sentences...")
	        sentences_dir = output_dir / "sentences"
	        sentences_dir.mkdir(exist_ok=True)

	        # Group results by (source_json, hw_id, block_no, line_no).
	        # Reused ("skipped") words take part too: their images exist on disk,
	        # and leaving them out would stitch a resumed line from only the
	        # newly generated words.
	        line_groups: Dict[Tuple[str, str, int, int], List[Dict[str, Any]]] = {}
	        for r in results:
	            if not r.get("include_in_sentence", True):
	                continue
	            key = (r["source_json"], r["hw_id"], r["block_no"], r["line_no"])
	            line_groups.setdefault(key, []).append(r)

	        # Sort words within each line by word_no
	        for word_list in line_groups.values():
	            word_list.sort(key=lambda x: x["word_no"])

	        sentence_results: List[Dict[str, Any]] = []
	        sentence_task_id = progress.add_task(
	            "Stitching sentences", total=len(line_groups)
	        )

	        for (source_json, hw_id, block_no, line_no), word_list in line_groups.items():
	            json_stem = Path(source_json).stem
	            sent_doc_dir = sentences_dir / json_stem
	            sent_doc_dir.mkdir(parents=True, exist_ok=True)

	            # Output filename: hw{id}_block{block}_line{line}.png
	            sent_name = f"{hw_id}_block{block_no}_line{line_no}.png"
	            sent_relative_path = f"sentences/{json_stem}/{sent_name}"
	            sent_path = sent_doc_dir / sent_name

	            sentence_size: Optional[Tuple[int, int]] = None
	            if sent_path.exists() and not overwrite:
	                # Keep the existing image but still list it, so a rerun does
	                # not drop it from sentence_map.json.
	                try:
	                    with Image.open(sent_path) as existing_sentence:
	                        sentence_size = existing_sentence.size
	                except Exception as e:
	                    print(f"[WARN] Could not load existing {sent_path}: {e}")
	            else:
	                try:
	                    sentence_size = _stitch_line_image(
	                        word_list,
	                        output_dir,
	                        sent_path,
	                        word_gap,
	                        baseline_percentile,
	                    )
	                except Exception as e:
	                    print(
	                        f"[ERROR] Failed to stitch sentence {hw_id} block{block_no} line{line_no}: {e}",
	                        file=sys.stderr,
	                    )

	            if sentence_size is not None:
	                # Collect text for this line.
	                # NOTE(review): segments of one word are joined without a
	                # separator even when space_before is set — confirm intended.
	                line_text = " ".join(
	                    "".join(seg["token"] for seg in w["segments"])
	                    for w in word_list
	                )
	                sentence_results.append(
	                    {
	                        "image": sent_relative_path,
	                        "source_json": source_json,
	                        "hw_id": hw_id,
	                        "block_no": block_no,
	                        "line_no": line_no,
	                        "text": line_text,
	                        "num_words": len(word_list),
	                        "width": sentence_size[0],
	                        "height": sentence_size[1],
	                    }
	                )

	            # Advance on every path so the bar always reaches its total
	            progress.advance(sentence_task_id, 1)

	        # Save sentence mapping
	        sentence_mapping_file = sentences_dir / "sentence_map.json"
	        with sentence_mapping_file.open("w", encoding="utf-8") as f:
	            json.dump(
	                {
	                    "backend": "diffusion-hf-sentences",
	                    "word_gap": word_gap,
	                    "sentences": sentence_results,
	                },
	                f,
	                ensure_ascii=False,
	                indent=2,
	            )

	        print(f"✓ Generated {len(sentence_results)} sentence images")
	        print(f"✓ Sentence mapping saved: {sentence_mapping_file}")

	    # Build mapping structure
	    entries_map: Dict[Tuple[str, str], List[Dict[str, Any]]] = {}
	    for r in results:
	        key = (r["source_json"], r["hw_id"])
	        entries_map.setdefault(key, []).append(r)

	    # Export file author styles
	    file_author_styles_export = {
	        fname: {aid: {"style_id": sid} for aid, sid in inner.items()}
	        for fname, inner in sorted(file_author_style_ids.items())
	    }

	    consolidated = {
	        "backend": "diffusion-hf",
	        "split_length": split_length_words,
	        "digit_chunk_length": split_length_numeric,
	        "temperature": temperature,
	        "steps": steps,
	        "segment_gap": segment_gap,
	        "word_gap": word_gap if stitch_sentences else None,
	        "baseline_percentile": baseline_percentile,
	        "entries": [
	            {
	                "source_json": src,
	                "hw_id": hw,
	                "author_id": words[0]["author_id"] if words else None,
	                "words": [
	                    {
	                        "block_no": w["block_no"],
	                        "line_no": w["line_no"],
	                        "word_no": w["word_no"],
	                        "image": w["image"],
	                        "style_id": w["style_id"],
	                        "width": w["width"],
	                        "height": w["height"],
	                        "baseline": w["baseline"],
	                        "segments": w["segments"],
	                    }
	                    for w in sorted(
	                        words, key=lambda x: (x["block_no"], x["line_no"], x["word_no"])
	                    )
	                ],
	            }
	            for (src, hw), words in sorted(entries_map.items())
	        ],
	        "file_author_styles": file_author_styles_export,
	    }

	    mapping_path = mapping_file or (output_dir / "raw_token_map.json")
	    with mapping_path.open("w", encoding="utf-8") as f:
	        json.dump(consolidated, f, ensure_ascii=False, indent=2)

	    generated_count = sum(1 for r in results if not r["skipped"])
	    reused_count = sum(1 for r in results if r["skipped"])
	    log_file_path = log_file or (output_dir / "generation_log.json")
	    log_payload = {
	        # Same naive-UTC text as before, without the deprecated utcnow()
	        "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
	        + "Z",
	        "summary": {
	            "total_tasks": len(tasks),
	            "extraction_skips": sum(
	                1
	                for entry in extraction_logs
	                if entry.get("type") == "extraction_skip"
	            ),
	            "words_generated": generated_count,
	            "words_reused": reused_count,
	            "generation_errors": len(generation_error_log),
	            "sentence_exclusions": len(sentence_exclusion_log),
	        },
	        "details": {
	            "extraction": extraction_logs,
	            "generation_skips": generation_skip_log,
	            "generation_errors": generation_error_log,
	            "sentence_exclusions": sentence_exclusion_log,
	        },
	    }
	    with log_file_path.open("w", encoding="utf-8") as log_fp:
	        json.dump(log_payload, log_fp, ensure_ascii=False, indent=2)

	    print(f"\n✓ Generated {len(results)} word images")
	    print(f"✓ Mapping saved: {mapping_path}")
	    print(f"✓ Log saved: {log_file_path}")
	    print("[DONE] Freeing up memory..")
	    # Drop this function's references to the model, then release cached GPU
	    # memory. (Deleting a loop variable, as before, released nothing.)
	    del writer_id_map
	    del components
	    torch.cuda.empty_cache()


	def main() -> None:
	    """Parse command-line options and run ``generate_handwriting`` with them.

	    Every option's destination name matches a keyword parameter of
	    ``generate_handwriting``, so the parsed namespace is forwarded unchanged.
	    """
	    ap = argparse.ArgumentParser(
	        description="Diffusion-based handwriting token generator with intelligent word splitting."
	    )
	    ap.add_argument(
	        "--input-dir",
	        type=Path,
	        required=True,
	        help="Directory containing bbox JSON files",
	    )
	    ap.add_argument(
	        "--output-dir",
	        type=Path,
	        required=True,
	        help="Output directory for generated images",
	    )
	    ap.add_argument(
	        "--run-dir",
	        type=Path,
	        required=True,
	        help="Model experiment directory (e.g., model/experiments/hf_conditional_latent)",
	    )
	    ap.add_argument(
	        "--checkpoint", type=str, default="latest.pt", help="Checkpoint filename"
	    )
	    ap.add_argument("--steps", type=int, default=30, help="Number of diffusion steps")
	    ap.add_argument(
	        "--split-length-words",
	        type=int,
	        default=6,
	        help="Maximum word length before splitting (0 = no splitting)",
	    )
	    # Named so that it does not share the "--split-length" prefix: argparse
	    # abbreviation matching would otherwise make "--split-length" ambiguous.
	    ap.add_argument(
	        "--digit-chunk-length",
	        dest="split_length_numeric",
	        type=int,
	        default=2,
	        help="Chunk size for numeric sequences within tokens (default: 2)",
	    )
	    ap.add_argument(
	        "--temperature", type=float, default=0.5, help="Sampling temperature"
	    )
	    ap.add_argument("--seed", type=int, default=42, help="Random seed")
	    ap.add_argument("--device", type=str, default="cuda", help="Device (cuda/cpu)")
	    ap.add_argument(
	        "--overwrite", action="store_true", help="Overwrite existing images"
	    )
	    ap.add_argument(
	        "--mapping-file", type=Path, default=None, help="Output mapping JSON path"
	    )
	    ap.add_argument(
	        "--log-file",
	        type=Path,
	        default=None,
	        help="Optional path for JSON log output (default: output_dir/generation_log.json)",
	    )
	    ap.add_argument(
	        "--batch-size", type=int, default=32, help="Batch size for generation"
	    )
	    ap.add_argument(
	        "--stitch-sentences",
	        dest="stitch_sentences",
	        default=True,
	        action="store_true",
	        help="Generate sentence-level stitched images in separate folder (enabled by default)",
	    )
	    # Stitching defaults to on, so the store_true flag alone could never turn
	    # it off; this companion flag makes the option usable.
	    ap.add_argument(
	        "--no-stitch-sentences",
	        dest="stitch_sentences",
	        action="store_false",
	        help="Do not generate sentence-level stitched images",
	    )
	    ap.add_argument(
	        "--segment-gap",
	        type=int,
	        default=2,
	        help="Gap between word segments (split parts) in pixels",
	    )
	    ap.add_argument(
	        "--word-gap",
	        type=int,
	        default=20,
	        help="Gap between words in sentence stitching in pixels",
	    )
	    ap.add_argument(
	        "--baseline-percentile",
	        type=float,
	        default=75.0,
	        help="Percentile for baseline detection (0-100, default: 75.0)",
	    )
	    ap.add_argument(
	        "--allowed-writers",
	        type=str,
	        nargs="+",
	        default=None,
	        help="List of allowed writer IDs to choose from (e.g., --allowed-writers 0 5 10 25)",
	    )
	    args = ap.parse_args()

	    generate_handwriting(**vars(args))


	# Run the CLI only when executed as a script, not when imported as a module.
	if __name__ == "__main__":
	    main()