Spaces:

hanz245
/

ocr

Running

App Files Files Community

ocr / check_cer.py

hanz245

set up

7111e1a 12 days ago

raw

history blame contribute delete

12.2 kB

	"""
	check_cer.py
	============
	Measures TRUE CER by actually running the model on images.

	Usage:
	python check_cer.py # live CER on val set
	python check_cer.py --saved # old behavior (fast, unreliable)
	python check_cer.py --images test_images/ # run on any image folder
	"""

	import os
	import sys
	import json
	import random
	import cv2
	import numpy as np
	import editdistance
	from pathlib import Path

	# torch is the only hard requirement that gets a friendly message: fail
	# early with an install hint instead of a raw traceback.
	try:
	    import torch
	except ImportError:
	    print("ERROR: torch not installed. Run: pip install torch")
	    # sys.exit is always available; the bare exit() builtin is injected by
	    # the site module and is missing under `python -S` or in frozen builds.
	    sys.exit(1)

	# Command-line flags, read straight from sys.argv (no argparse):
	#   --saved                      report the CER stored inside each checkpoint
	#   --images DIR | --images=DIR  print predictions for every image under DIR
	USE_SAVED = '--saved' in sys.argv
	IMAGE_DIR = None
	for i, arg in enumerate(sys.argv[1:], 1):
	    if arg.startswith('--images='):
	        # "--images=DIR": everything after the first '=' is the folder.
	        IMAGE_DIR = arg.partition('=')[2]
	    elif arg == '--images' and i + 1 < len(sys.argv):
	        # "--images DIR": the folder is the following argument, if any.
	        IMAGE_DIR = sys.argv[i + 1]

	# Checkpoint files to compare: every 10th epoch from 50 through 100.
	CHECKPOINTS = [
	    f'checkpoint_epoch_{epoch}.pth' for epoch in range(50, 101, 10)
	]
	# Directory that each checkpoint file name is joined onto.
	CHECKPOINT_DIR = 'checkpoints'
	# Directory that each annotation's 'image_path' is joined onto.
	VAL_DATA_DIR = 'data/val'
	# JSON annotation file read for the live CER measurement.
	VAL_ANN_FILE = 'data/val_annotations.json'


	class AdaptiveImageNormalizer:
	    """Turn an input image into a fixed-size, binarized canvas.

	    ``normalize`` chains: grayscale conversion, non-local-means denoising,
	    cropping to the non-background region, aspect-preserving resize onto a
	    white ``target_height`` x ``target_width`` canvas, and binarization.
	    ``to_tensor`` then converts the canvas to a 4-D float tensor.
	    """

	    def __init__(self, target_height=64, target_width=512):
	        self.H = target_height
	        self.W = target_width

	    def _crop_to_text(self, gray):
	        """Crop ``gray`` to the padded bounding box of its foreground.

	        A pixel is foreground when its inverted intensity exceeds 20. The
	        box is padded by 15% of its height (at least 4 px) and clamped to
	        the image bounds. Returns ``gray`` unchanged when nothing qualifies.
	        """
	        inverted = cv2.bitwise_not(gray)
	        _, mask = cv2.threshold(inverted, 20, 255, cv2.THRESH_BINARY)
	        ys, xs = np.where(mask > 0)
	        if ys.size == 0:
	            return gray
	        top, bottom = int(ys.min()), int(ys.max())
	        left, right = int(xs.min()), int(xs.max())
	        pad = max(4, int((bottom - top) * 0.15))
	        top = max(0, top - pad)
	        left = max(0, left - pad)
	        bottom = min(gray.shape[0] - 1, bottom + pad)
	        right = min(gray.shape[1] - 1, right + pad)
	        return gray[top:bottom + 1, left:right + 1]

	    def _fit_size(self, h, w):
	        """Return ``(new_h, new_w)`` for an ``h`` x ``w`` image on the canvas.

	        The image is scaled to the target height; if that makes it wider
	        than the canvas it is scaled to the target width instead. Both
	        results are clamped to at least 1 px: for extreme aspect ratios the
	        integer truncation would otherwise give 0, which cv2.resize rejects.
	        """
	        scale = self.H / h
	        new_h, new_w = self.H, int(w * scale)
	        if new_w > self.W:
	            scale = self.W / w
	            new_h, new_w = int(h * scale), self.W
	        return max(1, new_h), max(1, new_w)

	    def _smart_resize_gray(self, gray):
	        """Resize ``gray`` preserving aspect ratio, centred on a white canvas.

	        An image with a zero-length side yields an all-white canvas.
	        """
	        h, w = gray.shape
	        if h == 0 or w == 0:
	            return np.full((self.H, self.W), 255, dtype=np.uint8)
	        new_h, new_w = self._fit_size(h, w)
	        resized = cv2.resize(
	            gray, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
	        canvas = np.full((self.H, self.W), 255, dtype=np.uint8)
	        y_off = (self.H - new_h) // 2
	        x_off = (self.W - new_w) // 2
	        canvas[y_off:y_off + new_h, x_off:x_off + new_w] = resized
	        return canvas

	    def _binarize(self, img):
	        """Binarize with Otsu, falling back to adaptive thresholding.

	        The fallback is used when the Otsu result is less than 30% or more
	        than 97% white.
	        """
	        _, otsu = cv2.threshold(
	            img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
	        white_ratio = np.mean(otsu == 255)
	        if white_ratio < 0.30 or white_ratio > 0.97:
	            return cv2.adaptiveThreshold(
	                img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
	                cv2.THRESH_BINARY, 11, 2)
	        return otsu

	    def normalize(self, img):
	        """Run the full pipeline on ``img`` and return the binarized canvas.

	        A 3-D input is converted with COLOR_BGR2GRAY; any other input is
	        copied and used as the grayscale image directly.
	        """
	        if len(img.shape) == 3:
	            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
	        else:
	            gray = img.copy()
	        gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
	        gray = self._crop_to_text(gray)
	        gray = self._smart_resize_gray(gray)
	        return self._binarize(gray)

	    def to_tensor(self, img):
	        """Scale ``img`` to [0, 1] floats and add two leading size-1 axes."""
	        scaled = img.astype(np.float32) / 255.0
	        return torch.FloatTensor(scaled).unsqueeze(0).unsqueeze(0)


	def greedy_decode(outputs, idx_to_char):
	    """Best-path decode of a batch of model outputs into strings.

	    Args:
	        outputs: 3-D tensor; the argmax is taken over the last axis and the
	            first two axes are swapped before iterating, so it is assumed
	            to be laid out (time, batch, classes) -- TODO confirm against
	            the model.
	        idx_to_char: mapping from class index to character. Indices missing
	            from the mapping are dropped from the output.

	    Returns:
	        One decoded string per batch element.
	    """
	    # A single bulk .tolist() replaces one .item() call per time step.
	    best_paths = torch.argmax(outputs, dim=2).permute(1, 0).tolist()
	    decoded = []
	    for path in best_paths:
	        chars = []
	        prev = -1
	        for idx in path:
	            # Index 0 is skipped (presumably the CTC blank) and a run of
	            # the same index contributes a single character.
	            if idx != 0 and idx != prev and idx in idx_to_char:
	                chars.append(idx_to_char[idx])
	            # Updated on every step, so a skipped index between two equal
	            # labels keeps both of them.
	            prev = idx
	        decoded.append(''.join(chars))
	    return decoded


	def measure_live_cer(model, idx_to_char, img_h, img_w,
	                     ann_file, data_dir, device, max_samples=200):
	    """Run ``model`` on annotated images and compute CER, WER and exact match.

	    Args:
	        model: network called as ``model(tensor)``; its output is decoded
	            with ``greedy_decode``.
	        idx_to_char: mapping from class index to character.
	        img_h, img_w: canvas size passed to ``AdaptiveImageNormalizer``.
	        ann_file: JSON file containing a sequence of records, each with an
	            'image_path' and a 'text' key.
	        data_dir: directory that every 'image_path' is joined onto.
	        device: torch device the input tensors are moved to.
	        max_samples: when there are more annotations than this, a fixed
	            (seed 42) random subset of this size is evaluated instead.

	    Returns:
	        ``(metrics, n_evaluated, error)``. On success ``metrics`` is a dict
	        with 'cer', 'wer' and 'exact_match' (all percentages),
	        'n_evaluated', 'n_skipped' (images that were missing, unreadable or
	        raised during processing) and 'errors' (up to five
	        ``(gt, pred, char_distance)`` tuples with the largest distance), and
	        ``error`` is None. On failure ``metrics`` is None, the count is 0
	        and ``error`` is a message.
	    """
	    if not os.path.exists(ann_file):
	        return None, 0, f"Annotation file not found: {ann_file}"

	    with open(ann_file, 'r', encoding='utf-8') as f:
	        annotations = json.load(f)

	    if len(annotations) > max_samples:
	        # A private generator picks the same seed-42 subset as reseeding the
	        # module-level RNG would, without disturbing global random state.
	        annotations = random.Random(42).sample(annotations, max_samples)

	    normalizer = AdaptiveImageNormalizer(img_h, img_w)
	    model.eval()

	    total_char_dist = 0
	    total_chars = 0
	    total_word_dist = 0
	    total_words = 0
	    n_exact = 0
	    n_evaluated = 0
	    n_skipped = 0
	    worst_errors = []

	    with torch.no_grad():
	        for ann in annotations:
	            img_path = os.path.join(data_dir, ann['image_path'])
	            gt = ann['text']
	            if not os.path.exists(img_path):
	                n_skipped += 1
	                continue
	            try:
	                raw = cv2.imread(img_path)
	                if raw is None:
	                    n_skipped += 1
	                    continue
	                norm = normalizer.normalize(raw)
	                tensor = normalizer.to_tensor(norm).to(device)
	                out = model(tensor)
	                pred = greedy_decode(out.cpu(), idx_to_char)[0]
	                cd = editdistance.eval(pred, gt)
	                wd = editdistance.eval(pred.split(), gt.split())
	            except Exception:
	                # Best effort: one bad sample must not abort the whole run,
	                # but it is counted rather than dropped without trace.
	                n_skipped += 1
	                continue

	            total_char_dist += cd
	            total_chars += len(gt)
	            total_word_dist += wd
	            total_words += len(gt.split())
	            if pred == gt:
	                n_exact += 1
	            if cd > 0:
	                worst_errors.append((gt, pred, cd))
	            n_evaluated += 1

	    if n_evaluated == 0:
	        return None, 0, "No images could be evaluated"

	    cer = (total_char_dist / total_chars * 100) if total_chars > 0 else 0
	    wer = (total_word_dist / total_words * 100) if total_words > 0 else 0
	    acc = n_exact / n_evaluated * 100

	    return {
	        'cer': cer, 'wer': wer, 'exact_match': acc,
	        'n_evaluated': n_evaluated,
	        'n_skipped': n_skipped,
	        'errors': sorted(worst_errors, key=lambda x: x[2], reverse=True)[:5],
	    }, n_evaluated, None


	def run_on_folder(model, idx_to_char, img_h, img_w, folder, device):
	    """Predict text for every image file found recursively under ``folder``.

	    Args:
	        model: network called as ``model(tensor)``; its output is decoded
	            with ``greedy_decode``.
	        idx_to_char: mapping from class index to character.
	        img_h, img_w: canvas size passed to ``AdaptiveImageNormalizer``.
	        folder: directory searched recursively for .jpg/.jpeg/.png/.bmp
	            files (suffix match is case-insensitive).
	        device: torch device the input tensors are moved to.

	    Returns:
	        A list of ``(file_name, prediction)`` tuples in sorted path order.
	        When a file cannot be processed, the prediction is an
	        ``'ERROR: ...'`` string instead.
	    """
	    normalizer = AdaptiveImageNormalizer(img_h, img_w)
	    model.eval()
	    exts = {'.jpg', '.jpeg', '.png', '.bmp'}
	    # is_file() keeps out directories whose name happens to end in an
	    # image suffix.
	    paths = sorted(
	        p for p in Path(folder).rglob('*')
	        if p.is_file() and p.suffix.lower() in exts
	    )
	    results = []
	    with torch.no_grad():
	        for p in paths:
	            try:
	                raw = cv2.imread(str(p))
	                if raw is None:
	                    # imread signals an unreadable file by returning None;
	                    # report that directly instead of letting normalize()
	                    # fail with an opaque attribute error.
	                    results.append((p.name, 'ERROR: could not read image'))
	                    continue
	                norm = normalizer.normalize(raw)
	                tensor = normalizer.to_tensor(norm).to(device)
	                pred = greedy_decode(model(tensor).cpu(), idx_to_char)[0]
	                results.append((p.name, pred))
	            except Exception as e:
	                results.append((p.name, f'ERROR: {e}'))
	    return results


	# ─────────────────────────────────────────────────────────────────────────────
	# MAIN
	# ─────────────────────────────────────────────────────────────────────────────

	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

	# NOTE(security): torch.load(..., weights_only=False) unpickles arbitrary
	# Python objects. Only point this script at checkpoints you trust.

	if USE_SAVED:
	    # --saved: print the metrics stored in each checkpoint; no inference.
	    print("=" * 65)
	    print(" SAVED CER (training-time value — may not reflect real accuracy)")
	    print(" Run without --saved for true live CER.")
	    print("=" * 65)
	    print("{:<8} {:<12} {:<12} {}".format("Epoch", "CER(%)", "WER(%)", "File"))
	    print("-" * 65)
	    best_cer, best_cp = float('inf'), None
	    for cp in CHECKPOINTS:
	        path = os.path.join(CHECKPOINT_DIR, cp)
	        if not os.path.exists(path):
	            continue
	        try:
	            # map_location lets checkpoints saved on a GPU be read on a
	            # CPU-only machine, matching the live branch below.
	            c = torch.load(path, map_location=device, weights_only=False)
	            cer = c.get('val_cer', c.get('val_loss', 0))
	            epoch = c['epoch']
	            history = c.get('history', {})
	            wer_list = history.get('val_wer', [])
	            # The history list is indexed by epoch - 1; the lower bound
	            # keeps epoch 0 from silently reading the last entry.
	            wer = wer_list[epoch - 1] if 0 < epoch <= len(wer_list) else None
	            # A WER of exactly 0 is a real value, not a missing one.
	            wer_s = f"{wer:.4f}%" if wer is not None else 'N/A'
	            marker = ' <-- BEST' if cer < best_cer else ''
	            print("{:<8} {:<12} {:<12} {}{}".format(
	                epoch, f"{cer:.4f}%", wer_s, cp, marker))
	            if cer < best_cer:
	                best_cer, best_cp = cer, cp
	        except Exception as e:
	            print(f" Could not load {cp}: {e}")
	    print("=" * 65)
	    if best_cp is None:
	        print("\nBEST: none (no checkpoint could be loaded)")
	    else:
	        print(f"\nBEST: {best_cp} CER={best_cer:.4f}%")

	else:
	    # Default: load each checkpoint and run it on images.
	    print("=" * 78)
	    print(" LIVE CER — model actually runs on images (true accuracy)")
	    print("=" * 78)
	    print("{:<8} {:<10} {:<10} {:<12} {:<8} {}".format(
	        "Epoch", "CER(%)", "WER(%)", "ExactMatch", "N", "File"))
	    print("-" * 78)

	    best_cer, best_cp, best_metrics = float('inf'), None, None

	    for cp in CHECKPOINTS:
	        cp_path = os.path.join(CHECKPOINT_DIR, cp)
	        if not os.path.exists(cp_path):
	            print(f" (skipping {cp} — not found)")
	            continue
	        try:
	            # Imported inside the try so that a missing crnn_model module
	            # is reported per checkpoint instead of aborting the script.
	            from crnn_model import get_crnn_model
	            c = torch.load(cp_path, map_location=device, weights_only=False)
	            epoch = c['epoch']
	            idx_to_char = c['idx_to_char']
	            config = c.get('config', {})
	            img_h = config.get('img_height', 64)
	            img_w = config.get('img_width', 512)
	            saved_cer = c.get('val_cer', c.get('val_loss', None))

	            model = get_crnn_model(
	                model_type=config.get('model_type', 'standard'),
	                img_height=img_h,
	                # Number of classes is read off the saved output layer.
	                num_chars=c['model_state_dict']['fc.weight'].shape[0],
	                hidden_size=config.get('hidden_size', 128),  # FIXED: was 256
	                num_lstm_layers=config.get('num_lstm_layers', 1),  # FIXED: was 2
	            ).to(device)
	            model.load_state_dict(c['model_state_dict'])

	            if IMAGE_DIR:
	                # --images: just list predictions; no ground truth, no CER.
	                print(f"\nPredictions from {cp}:")
	                for fname, pred in run_on_folder(
	                        model, idx_to_char, img_h, img_w, IMAGE_DIR, device):
	                    print(f" {fname:<35} -> {pred}")
	                continue

	            metrics, n, err = measure_live_cer(
	                model, idx_to_char, img_h, img_w,
	                VAL_ANN_FILE, VAL_DATA_DIR, device)

	            if metrics is None:
	                print(f" Epoch {epoch} SKIP: {err}")
	                continue

	            cer = metrics['cer']
	            marker = ' <-- BEST' if cer < best_cer else ''
	            print("{:<8} {:<10} {:<10} {:<12} {:<8} {}{}".format(
	                epoch,
	                f"{cer:.2f}%",
	                f"{metrics['wer']:.2f}%",
	                f"{metrics['exact_match']:.1f}%",
	                n, cp, marker))

	            # Warn when the stored value and the live value differ by more
	            # than 2 points. A stored value of 0 is still a value, so test
	            # for None rather than truthiness.
	            if saved_cer is not None and abs(cer - saved_cer) > 2.0:
	                print(f" ^ MISMATCH: saved={saved_cer:.2f}% live={cer:.2f}%"
	                      f" diff={abs(cer - saved_cer):.2f}%")
	                print(" Cause: model trained on clean synthetic only.")
	                print(" Fix: regenerate data with fix_data.py + retrain.")

	            if cer < best_cer:
	                best_cer, best_cp, best_metrics = cer, cp, metrics

	        except Exception as e:
	            print(f" Could not evaluate {cp}: {e}")

	    if not IMAGE_DIR:
	        print("=" * 78)
	        if best_cp is None:
	            # Nothing was evaluated: do not print "None" / "inf%" or a copy
	            # command that points at a non-existent file.
	            print("\nBEST CHECKPOINT : none (no checkpoint could be evaluated)")
	        else:
	            print(f"\nBEST CHECKPOINT : {best_cp}")
	            print(f"BEST LIVE CER : {best_cer:.4f}%")

	            if best_metrics and best_metrics['errors']:
	                print("\nWorst predictions (GT -> Predicted):")
	                for gt, pred, dist in best_metrics['errors']:
	                    print(f" [{dist:2d}] '{gt}'")
	                    print(f" '{pred}'")

	            print("\nTo use best model:")
	            print(" import shutil")
	            print(f" shutil.copy('{CHECKPOINT_DIR}/{best_cp}',"
	                  f" '{CHECKPOINT_DIR}/best_model.pth')")
	        print("=" * 78)