In [None]:
import torch
from PIL import Image
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM, AutoProcessor

In [None]:
# Load dataset
# `get_dataset` and `IMG_CACHE` come from the project-local `get_cdli_dataset`
# module; `IMG_CACHE` is used by the evaluation loop below to build the path
# of each example's image file.
from get_cdli_dataset import get_dataset, IMG_CACHE

dataset = get_dataset()
# Only the "test" entry of the dataset is evaluated in this notebook.
test_dataset = dataset["test"]

# Show the test split as a quick sanity check before running the model.
print(test_dataset)

In [None]:
# Load the model and its processor from the same checkpoint

# Checkpoint to evaluate. Other candidate checkpoints (commented out):
# model_path = "PaddlePaddle/PaddleOCR-VL" # base
# model_path = "./outputs/sft"
model_path = "../"

# NOTE: trust_remote_code=True allows Python code shipped with the checkpoint
# to be executed, so only point `model_path` at checkpoints you trust.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
# Move the weights to the GPU, then switch to evaluation mode.
model = model.to("cuda")
model = model.eval()

processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

In [None]:
import pyxdameraulevenshtein as dl

def compute_ter(expected_ids: list[int], predicted_ids: list[int]) -> float:
    """
    Compute the Token Error Rate (TER) of a prediction against the ground truth.

    TER = edit_distance(expected_ids, predicted_ids) / len(expected_ids)

    The edit distance is the Damerau-Levenshtein distance computed by
    ``pyxdameraulevenshtein``: substitutions, deletions, insertions AND
    transpositions of two adjacent tokens each count as one edit.

    TER is better than CER for cuneiform OCR as:
    - Multi-character Unicode signs count as 1 token instead of multiple chars
    - Special tokens like @obverse/@reverse count as 1 token

    Args:
        expected_ids: Token ids of the ground-truth text.
        predicted_ids: Token ids of the model output.

    Returns:
        The edit distance normalised by the ground-truth length. 0.0 means an
        exact match; the value can exceed 1.0 when the prediction needs more
        edits than the ground truth has tokens. When the ground truth is
        empty, returns 0.0 for an empty prediction and 1.0 otherwise.
    """
    # An empty reference cannot be used as a denominator, so it is scored as
    # all-or-nothing instead.
    if len(expected_ids) == 0:
        return 0.0 if len(predicted_ids) == 0 else 1.0

    # Edit distance on the token sequences (not on characters).
    distance = dl.damerau_levenshtein_distance(expected_ids, predicted_ids)

    # The reference is known to be non-empty here, so no zero guard is needed.
    return distance / len(expected_ids)

In [None]:
# Run inference on all test examples and score each prediction with TER.
#
# NOTE: the generation budget below is derived from the ground-truth length,
# so this loop is only suitable for offline evaluation.
results = []
total_ter = 0.0

# Token ids that mark the end of a generation. `eos_token_id` may be a single
# id, a list of ids, or missing, and the model and tokenizer configurations do
# not necessarily agree, so every source is collected.
eos_token_ids = set()
for eos_source in (
    getattr(model, "generation_config", None),
    getattr(model, "config", None),
    processor.tokenizer,
):
    eos_value = getattr(eos_source, "eos_token_id", None)
    if eos_value is None:
        continue
    eos_token_ids.update([eos_value] if isinstance(eos_value, int) else eos_value)

pbar = tqdm(test_dataset, desc="Evaluating on test set")

for idx, example in enumerate(pbar):
    expected = example["unicode"]
    expected_ids = processor.tokenizer.encode(expected, add_special_tokens=False)

    # Load image. The context manager closes the opened file itself; the RGB
    # copy made by convert() lives in memory and stays usable afterwards.
    image_path = IMG_CACHE / f"P{str(example['id']).rjust(6, '0')}.jpg"
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Prepare input
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "OCR:"},
            ],
        },
    ]

    # Follow the model's device instead of hard-coding one.
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # Generate prediction. The budget is 120% of the reference length, but
    # never 0: an empty reference would otherwise request zero new tokens.
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            use_cache=True,
            max_new_tokens=max(1, int(len(expected_ids) * 1.2)),
            repetition_penalty=1.03,
        )

    # Keep only the newly generated tokens (everything after the prompt).
    prompt_length = inputs["input_ids"].shape[1]
    predicted_ids = output_ids[0][prompt_length:].tolist()

    # Drop the final token only when it really is an end-of-sequence marker.
    # When generation stops because the budget ran out, the last token is
    # genuine output and must be scored.
    if predicted_ids and predicted_ids[-1] in eos_token_ids:
        predicted_ids.pop()

    # Compute TER for this example
    ter = compute_ter(expected_ids, predicted_ids)
    total_ter += ter

    pbar.set_postfix_str(f"AVG TER={total_ter / (idx+1):.3f}")

    # Special tokens are kept in the decoded text so they remain visible
    # when inspecting predictions.
    prediction = processor.decode(
        predicted_ids,
        skip_special_tokens=False,
    ).strip()

    # Store results
    results.append(
        {
            "id": example["id"],
            "expected": expected,
            "prediction": prediction,
            "ter": ter,
        }
    )
    tqdm.write(f"\033[94m\nID: {example['id']} | TER: {ter:.4f}\033[0m")
    tqdm.write(f"\033[92mExpected:\033[0m\n{expected}")
    tqdm.write(f"\033[91mPredicted:\033[0m\n{prediction}")

# Compute averages. An empty test set yields NaN instead of a ZeroDivisionError.
average_ter = total_ter / len(results) if results else float("nan")
print(f"\n{'='*60}")
print(f"Average Token Error Rate (TER): {average_ter:.4f} ({average_ter*100:.2f}%)")
print(f"{'='*60}")

In [None]:
# Rank all evaluated examples by TER and print both ends of the ranking.
def _print_ranked(entries):
    """Print id, TER, expected text and predicted text for each entry, numbered from 1."""
    for position, entry in enumerate(entries, start=1):
        print(f"\nExample {position} - ID: {entry['id']} - TER: {entry['ter']:.4f}")
        print(f"Expected:\n{entry['expected']}")
        print(f"Predicted:\n{entry['prediction']}")
        print("-"*60)


# Ascending order: lowest error first.
sorted_results = sorted(results, key=lambda item: item["ter"])

print("="*60)
print("BEST PREDICTIONS (Lowest TER)")
print("="*60)
# Up to ten entries from the low-error end.
_print_ranked(sorted_results[:10])

print("\n" + "="*60)
print("WORST PREDICTIONS (Highest TER)")
print("="*60)
# Up to ten entries from the high-error end, worst first.
_print_ranked(sorted_results[::-1][:10])

In [None]:
# TER distribution statistics over all evaluated examples
import numpy as np

ter_values = [r["ter"] for r in results]

if not ter_values:
    # Every statistic below is undefined on an empty sample, and the ratio
    # lines would divide by zero, so report the situation instead.
    print("No TER values to summarize: `results` is empty.")
else:
    # Compute each statistic once and reuse it for both display formats.
    mean_ter = np.mean(ter_values)
    median_ter = np.median(ter_values)
    min_ter = np.min(ter_values)
    max_ter = np.max(ter_values)
    n_values = len(ter_values)

    print("="*60)
    print("TER (TOKEN ERROR RATE) DISTRIBUTION STATISTICS")
    print("="*60)
    print(f"Mean TER: {mean_ter:.4f} ({mean_ter*100:.2f}%)")
    print(f"Median TER: {median_ter:.4f} ({median_ter*100:.2f}%)")
    # np.std uses its default here, i.e. the population standard deviation.
    print(f"Std Dev: {np.std(ter_values):.4f}")
    print(f"Min TER: {min_ter:.4f} ({min_ter*100:.2f}%)")
    print(f"Max TER: {max_ter:.4f} ({max_ter*100:.2f}%)")
    print("\nPercentiles:")
    for q in (25, 50, 75, 90, 95, 98):
        print(f" {q}th: {np.percentile(ter_values, q):.4f}")

    # Count perfect predictions
    perfect_predictions = sum(1 for value in ter_values if value == 0.0)
    print(f"\nPerfect predictions (TER=0%): {perfect_predictions}/{n_values} ({perfect_predictions/n_values*100:.2f}%)")

    # Count predictions with TER < 0.5 (less than 50% error)
    good_predictions = sum(1 for value in ter_values if value < 0.5)
    print(f"Good predictions (TER<50%): {good_predictions}/{n_values} ({good_predictions/n_values*100:.2f}%)")