emanuelaboros's picture
update
99cd583
import re
import math
from difflib import SequenceMatcher
import gradio as gr
from transformers import AutoTokenizer
try:
from wordfreq import zipf_frequency
except ImportError:
zipf_frequency = None
# Language display name (offered in the UI dropdown) -> language code that is
# passed to the word-frequency lookup in lexical_plausibility.
LANGS = {
    "English": "en",
    "French": "fr",
    "German": "de",
    "Italian": "it",
}
# Tokenizer display name (offered in the UI dropdown) -> model identifier
# handed to AutoTokenizer.from_pretrained by get_tokenizer.
TOKENIZER_MODELS = {
    "GPT-2": "gpt2",
    "XLM-RoBERTa": "xlm-roberta-base",
    "mT5": "google/mt5-small",
}
# Tokenizers that get_tokenizer has already loaded, keyed by display name, so
# each one is only loaded once per process.
_tokenizer_cache = {}
def get_tokenizer(model_name: str):
    """Return the tokenizer for ``model_name``, loading it on first request.

    Args:
        model_name: A key of ``TOKENIZER_MODELS`` (the display name).

    Returns:
        The tokenizer object produced by ``AutoTokenizer.from_pretrained``;
        the same object is returned on every later call with the same name.

    Raises:
        KeyError: If ``model_name`` is not a key of ``TOKENIZER_MODELS``.
    """
    # Fast path: this display name has been loaded before.
    if model_name in _tokenizer_cache:
        return _tokenizer_cache[model_name]

    loaded = AutoTokenizer.from_pretrained(TOKENIZER_MODELS[model_name])
    _tokenizer_cache[model_name] = loaded
    return _tokenizer_cache[model_name]
def tokenize_words(text: str):
    """Extract word-like tokens from ``text``.

    A token is a run of word characters, apostrophes and hyphens that begins
    and ends on a regex word boundary, so ``world's`` and ``well-known`` are
    each kept as a single token.

    Args:
        text: The string to split.

    Returns:
        A list of the matched tokens, in order of appearance.
    """
    word_pattern = re.compile(r"\b[\w'-]+\b", flags=re.UNICODE)
    return word_pattern.findall(text)
def suspicious_char_ratio(text: str):
    """Return the fraction of characters in ``text`` outside the allowed set.

    Allowed characters are whitespace (space, newline, carriage return, tab),
    word characters, the listed Latin-1 accented ranges and a fixed set of
    common punctuation marks. Everything else counts as suspicious.

    Args:
        text: The string to inspect.

    Returns:
        ``1.0`` for empty input, otherwise the number of suspicious characters
        divided by the length of ``text``.
    """
    if not text:
        return 1.0

    outside_allowed = re.compile(
        r"[^ \n\r\t\wÀ-ÖØ-öø-ÿ.,;:!?()'\"%&/\-]", flags=re.UNICODE
    )
    hit_count = sum(1 for _ in outside_allowed.finditer(text))
    return hit_count / max(len(text), 1)
def repeated_punct_ratio(text: str):
    """Return the number of repeated-punctuation runs per character of ``text``.

    A run is two or more consecutive occurrences of the same character from
    ``. , ; : ! ? _ -``. Each run counts once, however long it is.

    Args:
        text: The string to inspect.

    Returns:
        ``0.0`` for empty input, otherwise the run count divided by the length
        of ``text``.
    """
    if not text:
        return 0.0

    run_pattern = re.compile(r"([.,;:!?_\-])\1{1,}")
    run_count = sum(1 for _ in run_pattern.finditer(text))
    return run_count / max(len(text), 1)
def digit_noise_ratio(text: str):
    """Return the share of digit/letter mash-ups relative to the word count.

    A mash-up is a whole token made of digits followed by ASCII letters, or
    ASCII letters followed by digits (for example ``A1`` or ``14th``).

    Args:
        text: The string to inspect.

    Returns:
        ``0.0`` when ``tokenize_words`` finds no words, otherwise the number of
        mash-ups divided by the number of words.
    """
    word_total = len(tokenize_words(text))
    if word_total == 0:
        return 0.0

    mixed_pattern = re.compile(r"\b(?:\d+[A-Za-z]+|[A-Za-z]+\d+)\b")
    mixed_count = sum(1 for _ in mixed_pattern.finditer(text))
    return mixed_count / max(word_total, 1)
def uppercase_ratio(text: str):
    """Return the fraction of alphabetic characters in ``text`` that are uppercase.

    Args:
        text: The string to inspect.

    Returns:
        ``0.0`` when ``text`` has no alphabetic characters, otherwise the
        uppercase count divided by the alphabetic count.
    """
    alpha_total = 0
    upper_total = 0
    for ch in text:
        if not ch.isalpha():
            continue
        alpha_total += 1
        if ch.isupper():
            upper_total += 1

    if alpha_total == 0:
        return 0.0
    return upper_total / alpha_total
def broken_word_ratio(words):
    """Return the fraction of ``words`` that look structurally broken.

    A word longer than one character is counted as broken when it contains
    the same character three times in a row, is longer than 25 characters,
    or mixes digits with letters. Single-character words are never counted
    as broken but still contribute to the denominator.

    Args:
        words: A sequence of word strings.

    Returns:
        ``1.0`` for an empty sequence, otherwise broken count / word count.
    """
    if not words:
        return 1.0

    triple_repeat = re.compile(r"(.)\1\1")
    any_digit = re.compile(r"[0-9]")
    any_letter = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]")

    def looks_broken(token):
        # Too short to judge.
        if len(token) <= 1:
            return False
        if triple_repeat.search(token) or len(token) > 25:
            return True
        return bool(any_digit.search(token) and any_letter.search(token))

    broken_total = sum(1 for token in words if looks_broken(token))
    return broken_total / max(len(words), 1)
def line_length_stability(text: str):
    """Score how uniform the non-blank line lengths of ``text`` are.

    The score is ``1 - (population standard deviation / mean)`` of the
    stripped line lengths, floored at ``0.0``.

    Args:
        text: The string to inspect; blank lines are ignored.

    Returns:
        ``1.0`` when fewer than two non-blank lines exist, otherwise a value
        in ``[0.0, 1.0]`` where higher means more uniform line lengths.
    """
    lengths = []
    for raw_line in text.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            lengths.append(len(trimmed))

    count = len(lengths)
    if count < 2:
        return 1.0

    mean = sum(lengths) / count
    if mean == 0:
        return 1.0

    variance = sum((length - mean) ** 2 for length in lengths) / count
    std = math.sqrt(variance)
    return max(0.0, 1.0 - (std / mean))
def lexical_plausibility(words, lang_code):
    """Estimate how many of ``words`` are plausible words of the language.

    Each word is lowercased and looked up with ``zipf_frequency``. Words of
    one character and purely numeric words are ignored.

    Args:
        words: A sequence of word strings.
        lang_code: Language code passed through to ``zipf_frequency``.

    Returns:
        A ``(ratio, rare_words)`` tuple. ``ratio`` is the share of looked-up
        words with a frequency of at least 3.0; ``rare_words`` holds up to 30
        original-case words whose frequency is below 2.5. Returns
        ``(0.0, [])`` for empty input and ``(0.5, [])`` when the frequency
        lookup is unavailable (``zipf_frequency is None``).
    """
    if not words:
        return 0.0, []
    if zipf_frequency is None:
        # No frequency data available: report a neutral score.
        return 0.5, []

    frequencies = []
    rare_words = []
    for token in words:
        lowered = token.lower()
        if len(lowered) > 1 and not lowered.isdigit():
            frequency = zipf_frequency(lowered, lang_code)
            frequencies.append(frequency)
            if frequency < 2.5:
                rare_words.append(token)

    if not frequencies:
        return 0.0, rare_words[:30]

    plausible_total = sum(1 for frequency in frequencies if frequency >= 3.0)
    return plausible_total / len(frequencies), rare_words[:30]
def tokenizer_fragmentation_metrics(text: str, tokenizer_name: str):
    """Measure how heavily a subword tokenizer splits the words of ``text``.

    Every word found by ``tokenize_words`` is tokenized on its own with the
    tokenizer returned by ``get_tokenizer(tokenizer_name)``.

    Args:
        text: The string to analyse.
        tokenizer_name: Display name understood by ``get_tokenizer``.

    Returns:
        A dict with three floats:
        ``tokens_per_word`` (total pieces / words),
        ``fragmented_word_ratio`` (share of words split into more than three
        pieces) and ``single_char_piece_ratio`` (share of pieces that are one
        character long once ``▁`` / ``Ġ`` are stripped from both ends).
        All three are ``0.0`` when the text contains no words.
    """
    words = tokenize_words(text)
    word_total = len(words)
    if word_total == 0:
        return {
            "tokens_per_word": 0.0,
            "fragmented_word_ratio": 0.0,
            "single_char_piece_ratio": 0.0,
        }

    tokenizer = get_tokenizer(tokenizer_name)
    pieces_per_word = [tokenizer.tokenize(word) for word in words]

    piece_total = sum(len(pieces) for pieces in pieces_per_word)
    # "▁" and "Ġ" are presumably the tokenizers' word-start markers; they are
    # stripped so they do not count towards a piece's length.
    lone_char_total = sum(
        1
        for pieces in pieces_per_word
        for piece in pieces
        if len(piece.strip("▁Ġ")) == 1
    )
    heavily_split_total = sum(1 for pieces in pieces_per_word if len(pieces) > 3)

    return {
        "tokens_per_word": piece_total / word_total,
        "fragmented_word_ratio": heavily_split_total / word_total,
        "single_char_piece_ratio": lone_char_total / max(piece_total, 1),
    }
def classify_score(score: float):
    """Map a numeric quality score to its human-readable band.

    Args:
        score: The quality score to classify.

    Returns:
        ``"Very good"`` (>= 85), ``"Good"`` (>= 70), ``"Medium"`` (>= 50),
        ``"Poor"`` (>= 30) or ``"Very poor"`` for anything lower.
    """
    # Ordered from the highest floor down; the first floor reached wins.
    bands = (
        (85, "Very good"),
        (70, "Good"),
        (50, "Medium"),
        (30, "Poor"),
    )
    for floor, band_name in bands:
        if score >= floor:
            return band_name
    return "Very poor"
def compute_ocr_quality(text: str, language: str, tokenizer_name: str):
    """Compute a reference-free OCR quality score for ``text``.

    The score starts at 100, is reduced by weighted noise, lexical and
    tokenizer-fragmentation penalties, receives a small bonus for stable
    line lengths, and is finally rounded to two decimals and clamped to
    ``[0, 100]``.

    Args:
        text: The OCR output to assess; ``None`` is treated as empty.
        language: A key of ``LANGS``; unknown values fall back to ``"en"``.
        tokenizer_name: Display name of the tokenizer used for the
            fragmentation metrics.

    Returns:
        A dict with ``quality_score`` (float), ``label`` (str), ``details``
        (metric name -> value) and ``bad_words`` (list of rare words). For
        empty or whitespace-only text the score is ``0.0``, the label is
        ``"No text"`` and both ``details`` and ``bad_words`` are empty.
    """
    text = (text or "").strip()
    if not text:
        return {
            "quality_score": 0.0,
            "label": "No text",
            "details": {},
            "bad_words": [],
        }

    lang_code = LANGS.get(language, "en")
    words = tokenize_words(text)

    # Individual signals.
    suspicious = suspicious_char_ratio(text)
    repeated = repeated_punct_ratio(text)
    digit_noise = digit_noise_ratio(text)
    broken = broken_word_ratio(words)
    line_stability = line_length_stability(text)
    upper = uppercase_ratio(text)
    lexical_score, bad_words = lexical_plausibility(words, lang_code)
    frag = tokenizer_fragmentation_metrics(text, tokenizer_name)

    # Penalties are subtracted one by one, in this order, so the floating
    # point result does not depend on how they are grouped.
    penalties = (
        suspicious * 220,
        repeated * 180,
        digit_noise * 45,
        broken * 65,
        max(0.0, 0.55 - lexical_score) * 90,
        max(0.0, frag["tokens_per_word"] - 1.8) * 25,
        frag["fragmented_word_ratio"] * 60,
        frag["single_char_piece_ratio"] * 40,
        max(0.0, upper - 0.35) * 35,
    )
    score = 100.0
    for penalty in penalties:
        score -= penalty
    score += max(0.0, line_stability - 0.5) * 10
    score = max(0.0, min(100.0, round(score, 2)))

    ratio_metrics = {
        "suspicious_char_ratio": suspicious,
        "repeated_punct_ratio": repeated,
        "digit_noise_ratio": digit_noise,
        "broken_word_ratio": broken,
        "lexical_plausibility": lexical_score,
        "line_length_stability": line_stability,
        "uppercase_ratio": upper,
        "tokens_per_word": frag["tokens_per_word"],
        "fragmented_word_ratio": frag["fragmented_word_ratio"],
        "single_char_piece_ratio": frag["single_char_piece_ratio"],
    }
    details = {"word_count": len(words)}
    for metric_name, metric_value in ratio_metrics.items():
        details[metric_name] = round(metric_value, 4)

    return {
        "quality_score": score,
        "label": classify_score(score),
        "details": details,
        "bad_words": bad_words,
    }
def explain_result(result):
    """Render a result dict from ``compute_ocr_quality`` as display strings.

    Args:
        result: A dict with the keys ``quality_score``, ``label``,
            ``details`` and ``bad_words``.

    Returns:
        A ``(summary, metrics, suspicious_words)`` tuple: a Markdown heading
        with label and score, a Markdown bullet list with one line per entry
        of ``details``, and a comma-separated string of at most 30 bad words
        (``"None"`` when there are none).
    """
    score = result["quality_score"]
    label = result["label"]
    details = result["details"]

    summary = f"## OCR quality: **{label}**\n\n**Score:** {score}/100"

    bullet_lines = []
    for k, v in details.items():
        bullet_lines.append(f"- **{k}**: {v}")
    metrics = "\n".join(bullet_lines)

    if result["bad_words"]:
        suspicious_words = ", ".join(result["bad_words"][:30])
    else:
        suspicious_words = "None"

    return summary, metrics, suspicious_words
def analyze_text(text, language, tokenizer_name):
    """Score ``text`` and return the three display strings for the UI.

    Args:
        text: The OCR text entered by the user.
        language: Language display name (a key of ``LANGS``).
        tokenizer_name: Tokenizer display name (a key of ``TOKENIZER_MODELS``).

    Returns:
        The ``(summary, metrics, suspicious_words)`` tuple produced by
        ``explain_result`` for the computed quality result.
    """
    return explain_result(compute_ocr_quality(text, language, tokenizer_name))
# Gradio UI: three inputs (text, language, tokenizer) are passed positionally
# to analyze_text, whose three return values fill the three outputs in order.
demo = gr.Interface(
    fn=analyze_text,
    inputs=[
        gr.Textbox(lines=18, label="OCR text"),
        gr.Dropdown(choices=list(LANGS.keys()), value="English", label="Language"),
        gr.Dropdown(
            choices=list(TOKENIZER_MODELS.keys()),
            value="XLM-RoBERTa",
            label="Tokenizer used for fragmentation score",
        ),
    ],
    outputs=[
        gr.Markdown(label="Summary"),
        gr.Markdown(label="Metrics"),
        gr.Textbox(label="Potentially suspicious / rare words"),
    ],
    title="OCR Quality Detector",
    description=(
        "A lightweight reference-free OCR quality estimator. "
        "It combines OCR-noise heuristics, lexical plausibility, and tokenizer fragmentation."
    ),
    # Each example row supplies values for the three inputs, in input order:
    # a clean transcription followed by a noisy version of the same headline.
    examples=[
        [
            "THE OMAHA DAILY BEE, TUESDAY, JUNE 24, 1890 NEWS ABOUT THE BLUFFS Comparatively Little Damage Done by Sunday Night's Storm.",
            "English",
            "XLM-RoBERTa",
        ],
        [
            "THHJ C M A 14 A1 HAM p 0 _ _ THE OMAHA DAILY BEE , TUEBPAY , JUNE 24 , 1890 , _ _ NEWS ABOUT THE BLUFFS Comparatively Little Damage Done b , Sunday Night's Storm",
            "English",
            "XLM-RoBERTa",
        ],
    ],
)
# Launch the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()