"""Reference-free OCR quality estimator with a small Gradio front end.

The score combines three families of signals computed from the text alone:
character/word-level noise heuristics, lexical plausibility (via the optional
``wordfreq`` package) and sub-word tokenizer fragmentation.
"""

import math
import re
from difflib import SequenceMatcher  # NOTE(review): unused in the code visible here; kept on purpose.

import gradio as gr
from transformers import AutoTokenizer

try:
    from wordfreq import zipf_frequency
except ImportError:
    # wordfreq is optional: lexical_plausibility() returns a neutral score without it.
    zipf_frequency = None

# UI language name -> language code passed to zipf_frequency().
LANGS = {
    "English": "en",
    "French": "fr",
    "German": "de",
    "Italian": "it",
}

# UI tokenizer name -> identifier passed to AutoTokenizer.from_pretrained().
TOKENIZER_MODELS = {
    "GPT-2": "gpt2",
    "XLM-RoBERTa": "xlm-roberta-base",
    "mT5": "google/mt5-small",
}

# Loaded tokenizers keyed by UI name, so each one is loaded at most once.
_tokenizer_cache = {}

# Letter class shared by every "digit mixed with letter" heuristic so that
# accented Latin letters are treated the same way everywhere.
_LETTERS = "A-Za-zÀ-ÖØ-öø-ÿ"

# Patterns are compiled once here because several are applied per word.
_WORD_RE = re.compile(r"\b[\w'-]+\b", flags=re.UNICODE)
_SUSPICIOUS_CHAR_RE = re.compile(
    r"[^ \n\r\t\wÀ-ÖØ-öø-ÿ.,;:!?()'\"%&/\-]",
    flags=re.UNICODE,
)
_REPEATED_PUNCT_RE = re.compile(r"([.,;:!?_\-])\1{1,}")
_DIGIT_LETTER_TOKEN_RE = re.compile(
    rf"\b(?:\d+[{_LETTERS}]+|[{_LETTERS}]+\d+)\b"
)
_TRIPLE_REPEAT_RE = re.compile(r"(.)\1\1")
_ASCII_DIGIT_RE = re.compile(r"[0-9]")
_LETTER_RE = re.compile(rf"[{_LETTERS}]")

# Maximum number of rare words reported back to the caller.
_MAX_BAD_WORDS = 30


def get_tokenizer(model_name: str):
    """Return the tokenizer for a ``TOKENIZER_MODELS`` key, loading it on first use.

    Raises:
        KeyError: if ``model_name`` is not a key of ``TOKENIZER_MODELS``.
    """
    if model_name not in _tokenizer_cache:
        _tokenizer_cache[model_name] = AutoTokenizer.from_pretrained(
            TOKENIZER_MODELS[model_name]
        )
    return _tokenizer_cache[model_name]


def tokenize_words(text: str) -> list:
    """Split ``text`` into word-like tokens (word chars, apostrophes, hyphens)."""
    return _WORD_RE.findall(text)


def suspicious_char_ratio(text: str) -> float:
    """Return the share of characters outside the expected text alphabet.

    Word characters, whitespace and a fixed set of common punctuation marks
    are considered normal; everything else counts as suspicious. Empty text
    yields 1.0.
    """
    if not text:
        return 1.0
    return len(_SUSPICIOUS_CHAR_RE.findall(text)) / max(len(text), 1)


def repeated_punct_ratio(text: str) -> float:
    """Return the number of repeated-punctuation runs (e.g. ``..``, ``__``) per character."""
    if not text:
        return 0.0
    return len(_REPEATED_PUNCT_RE.findall(text)) / max(len(text), 1)


def digit_noise_ratio(text: str) -> float:
    """Return the share of words that glue digits directly to letters.

    Tokens such as ``A1`` or ``14th`` are counted. The letter class includes
    accented Latin letters so the heuristic behaves the same for every
    language offered in ``LANGS`` and agrees with ``broken_word_ratio``.
    """
    words = tokenize_words(text)
    if not words:
        return 0.0
    return len(_DIGIT_LETTER_TOKEN_RE.findall(text)) / max(len(words), 1)


def uppercase_ratio(text: str) -> float:
    """Return the fraction of alphabetic characters that are uppercase."""
    letters = [c for c in text if c.isalpha()]
    if not letters:
        return 0.0
    return sum(1 for c in letters if c.isupper()) / len(letters)


def _is_broken_word(word: str) -> bool:
    """Return True when a single token looks like an OCR artefact."""
    if len(word) <= 1:
        return False
    if len(word) > 25:
        return True
    # Plain numbers such as "1000" legitimately repeat a digit three times;
    # only non-numeric tokens are checked for triple repetitions.
    if not word.isdigit() and _TRIPLE_REPEAT_RE.search(word):
        return True
    return bool(_ASCII_DIGIT_RE.search(word) and _LETTER_RE.search(word))


def broken_word_ratio(words) -> float:
    """Return the share of ``words`` that look broken.

    A word is broken when it is longer than 25 characters, repeats the same
    character three times in a row (pure numbers excepted), or mixes digits
    with letters. One-character tokens are never counted. An empty word list
    yields 1.0.
    """
    if not words:
        return 1.0
    broken = sum(1 for w in words if _is_broken_word(w))
    return broken / max(len(words), 1)


def line_length_stability(text: str) -> float:
    """Return ``1 - std/mean`` of non-blank line lengths, floored at 0.

    Texts with fewer than two non-blank lines are treated as fully stable (1.0).
    """
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    if len(lines) < 2:
        return 1.0

    lengths = [len(ln) for ln in lines]
    mean = sum(lengths) / len(lengths)
    if mean == 0:
        return 1.0

    variance = sum((x - mean) ** 2 for x in lengths) / len(lengths)
    return max(0.0, 1.0 - (math.sqrt(variance) / mean))


def lexical_plausibility(words, lang_code) -> tuple:
    """Score how many words are common in the given language.

    Args:
        words: tokens to evaluate.
        lang_code: language code forwarded to ``zipf_frequency``.

    Returns:
        ``(score, bad_words)``. ``score`` is the fraction of evaluated words
        whose Zipf frequency is at least 3.0; ``bad_words`` lists up to 30
        distinct words (first-seen spelling, compared case-insensitively)
        whose frequency is below 2.5. One-character and all-digit tokens are
        not evaluated. Returns ``(0.0, [])`` when nothing can be evaluated and
        a neutral ``(0.5, [])`` when ``wordfreq`` is not installed.
    """
    if not words:
        return 0.0, []
    if zipf_frequency is None:
        return 0.5, []

    scores = []
    rare = {}  # lowercase form -> first spelling seen; dict keeps insertion order
    for w in words:
        lw = w.lower()
        if len(lw) <= 1 or lw.isdigit():
            continue
        z = zipf_frequency(lw, lang_code)
        scores.append(z)
        if z < 2.5:
            # Report each rare word once so repeats cannot crowd out others.
            rare.setdefault(lw, w)

    if not scores:
        return 0.0, []

    plausible = sum(1 for z in scores if z >= 3.0)
    return plausible / len(scores), list(rare.values())[:_MAX_BAD_WORDS]


def tokenizer_fragmentation_metrics(text: str, tokenizer_name: str) -> dict:
    """Measure how heavily a sub-word tokenizer splits the words of ``text``.

    Each word is tokenized on its own. Returns a dict with:
        ``tokens_per_word``: mean number of pieces per word;
        ``fragmented_word_ratio``: share of words split into more than 3 pieces;
        ``single_char_piece_ratio``: share of pieces that are one character
        long once leading/trailing ``▁``/``Ġ`` marker characters are stripped.
    All three are 0.0 when the text contains no words (no tokenizer is loaded).
    """
    words = tokenize_words(text)
    if not words:
        return {
            "tokens_per_word": 0.0,
            "fragmented_word_ratio": 0.0,
            "single_char_piece_ratio": 0.0,
        }

    tokenizer = get_tokenizer(tokenizer_name)
    token_counts = []
    single_char_pieces = 0
    total_pieces = 0
    for w in words:
        pieces = tokenizer.tokenize(w)
        token_counts.append(len(pieces))
        total_pieces += len(pieces)
        single_char_pieces += sum(
            1 for p in pieces if len(p.strip("▁Ġ")) == 1
        )

    return {
        "tokens_per_word": total_pieces / len(words),
        "fragmented_word_ratio": sum(1 for n in token_counts if n > 3) / len(words),
        "single_char_piece_ratio": single_char_pieces / max(total_pieces, 1),
    }


def classify_score(score: float) -> str:
    """Map a 0-100 quality score to a human-readable label."""
    if score >= 85:
        return "Very good"
    if score >= 70:
        return "Good"
    if score >= 50:
        return "Medium"
    if score >= 30:
        return "Poor"
    return "Very poor"


def compute_ocr_quality(text: str, language: str, tokenizer_name: str) -> dict:
    """Estimate OCR quality of ``text`` without a reference transcription.

    Args:
        text: OCR output; ``None`` or blank text yields a "No text" result.
        language: key of ``LANGS``; unknown names fall back to English.
        tokenizer_name: key of ``TOKENIZER_MODELS``.

    Returns:
        A dict with ``quality_score`` (0-100, two decimals), ``label``,
        ``details`` (the individual metrics, rounded to four decimals) and
        ``bad_words`` (rare words found by ``lexical_plausibility``).
    """
    text = (text or "").strip()
    if not text:
        return {
            "quality_score": 0.0,
            "label": "No text",
            "details": {},
            "bad_words": [],
        }

    lang_code = LANGS.get(language, "en")
    words = tokenize_words(text)

    suspicious = suspicious_char_ratio(text)
    repeated = repeated_punct_ratio(text)
    digit_noise = digit_noise_ratio(text)
    broken = broken_word_ratio(words)
    line_stability = line_length_stability(text)
    upper = uppercase_ratio(text)
    lexical_score, bad_words = lexical_plausibility(words, lang_code)
    frag = tokenizer_fragmentation_metrics(text, tokenizer_name)

    # Start from a perfect score and subtract weighted penalties. Several
    # metrics are only penalised beyond a tolerance threshold.
    score = 100.0
    score -= suspicious * 220
    score -= repeated * 180
    score -= digit_noise * 45
    score -= broken * 65
    score -= max(0.0, 0.55 - lexical_score) * 90
    score -= max(0.0, frag["tokens_per_word"] - 1.8) * 25
    score -= frag["fragmented_word_ratio"] * 60
    score -= frag["single_char_piece_ratio"] * 40
    score -= max(0.0, upper - 0.35) * 35
    # Regular line lengths earn a small bonus.
    score += max(0.0, line_stability - 0.5) * 10
    score = max(0.0, min(100.0, round(score, 2)))

    details = {
        "word_count": len(words),
        "suspicious_char_ratio": round(suspicious, 4),
        "repeated_punct_ratio": round(repeated, 4),
        "digit_noise_ratio": round(digit_noise, 4),
        "broken_word_ratio": round(broken, 4),
        "lexical_plausibility": round(lexical_score, 4),
        "line_length_stability": round(line_stability, 4),
        "uppercase_ratio": round(upper, 4),
        "tokens_per_word": round(frag["tokens_per_word"], 4),
        "fragmented_word_ratio": round(frag["fragmented_word_ratio"], 4),
        "single_char_piece_ratio": round(frag["single_char_piece_ratio"], 4),
    }

    return {
        "quality_score": score,
        "label": classify_score(score),
        "details": details,
        "bad_words": bad_words,
    }


def explain_result(result) -> tuple:
    """Render a ``compute_ocr_quality`` result as three display strings.

    Returns:
        ``(summary_markdown, metrics_markdown, suspicious_words)`` where the
        last item is a comma-separated list of at most 30 words, or ``"None"``.
    """
    score = result["quality_score"]
    label = result["label"]
    details = result["details"]

    summary = f"## OCR quality: **{label}**\n\n**Score:** {score}/100"
    metrics = "\n".join(f"- **{k}**: {v}" for k, v in details.items())
    if result["bad_words"]:
        suspicious_words = ", ".join(result["bad_words"][:_MAX_BAD_WORDS])
    else:
        suspicious_words = "None"
    return summary, metrics, suspicious_words


def analyze_text(text, language, tokenizer_name):
    """Gradio callback: score ``text`` and return the three UI output strings."""
    return explain_result(compute_ocr_quality(text, language, tokenizer_name))


demo = gr.Interface(
    fn=analyze_text,
    inputs=[
        gr.Textbox(lines=18, label="OCR text"),
        gr.Dropdown(choices=list(LANGS.keys()), value="English", label="Language"),
        gr.Dropdown(
            choices=list(TOKENIZER_MODELS.keys()),
            value="XLM-RoBERTa",
            label="Tokenizer used for fragmentation score",
        ),
    ],
    outputs=[
        gr.Markdown(label="Summary"),
        gr.Markdown(label="Metrics"),
        gr.Textbox(label="Potentially suspicious / rare words"),
    ],
    title="OCR Quality Detector",
    description=(
        "A lightweight reference-free OCR quality estimator. "
        "It combines OCR-noise heuristics, lexical plausibility, and tokenizer fragmentation."
    ),
    examples=[
        [
            "THE OMAHA DAILY BEE, TUESDAY, JUNE 24, 1890 NEWS ABOUT THE BLUFFS "
            "Comparatively Little Damage Done by Sunday Night's Storm.",
            "English",
            "XLM-RoBERTa",
        ],
        [
            "THHJ C M A 14 A1 HAM p 0 _ _ THE OMAHA DAILY BEE , TUEBPAY , "
            "JUNE 24 , 1890 , _ _ NEWS ABOUT THE BLUFFS Comparatively Little "
            "Damage Done b , Sunday Night's Storm",
            "English",
            "XLM-RoBERTa",
        ],
    ],
)


if __name__ == "__main__":
    demo.launch()