Spaces:
Sleeping
Sleeping
| import re | |
| import math | |
| from difflib import SequenceMatcher | |
| import gradio as gr | |
| from transformers import AutoTokenizer | |
| try: | |
| from wordfreq import zipf_frequency | |
| except ImportError: | |
| zipf_frequency = None | |
# UI language name -> language code handed to the word-frequency lookup.
LANGS = {"English": "en", "French": "fr", "German": "de", "Italian": "it"}

# UI tokenizer name -> checkpoint identifier given to
# AutoTokenizer.from_pretrained.
TOKENIZER_MODELS = {
    "GPT-2": "gpt2",
    "XLM-RoBERTa": "xlm-roberta-base",
    "mT5": "google/mt5-small",
}

# Tokenizers that have already been loaded, keyed by UI tokenizer name.
_tokenizer_cache = {}
def get_tokenizer(model_name: str):
    """Return the tokenizer registered under *model_name*, loading it once.

    The first request for a name loads the tokenizer with
    ``AutoTokenizer.from_pretrained`` using the checkpoint listed in
    ``TOKENIZER_MODELS`` and stores it in ``_tokenizer_cache``; later
    requests return the stored object.

    Raises:
        KeyError: if *model_name* is not a key of ``TOKENIZER_MODELS``.
    """
    try:
        return _tokenizer_cache[model_name]
    except KeyError:
        pass  # not loaded yet; fall through and load it

    checkpoint = TOKENIZER_MODELS[model_name]
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    _tokenizer_cache[model_name] = tokenizer
    return tokenizer
def tokenize_words(text: str):
    """Return the word-like tokens found in *text*, in order.

    A token is a run of word characters, apostrophes and hyphens that
    starts and ends on a word boundary, so apostrophes or hyphens at the
    very edge of a token are not included.
    """
    word_pattern = re.compile(r"\b[\w'-]+\b", flags=re.UNICODE)
    return word_pattern.findall(text)
def suspicious_char_ratio(text: str):
    """Return the fraction of characters in *text* outside the allowed set.

    Allowed characters are space, newline, carriage return, tab, word
    characters, the listed accented Latin ranges, and the punctuation
    ``. , ; : ! ? ( ) ' " % & / -``. Everything else counts as suspicious.
    Empty input returns 1.0.
    """
    if text:
        odd_chars = re.compile(
            r"[^ \n\r\t\wÀ-ÖØ-öø-ÿ.,;:!?()'\"%&/\-]", flags=re.UNICODE
        )
        odd_total = sum(1 for _ in odd_chars.finditer(text))
        return odd_total / len(text)
    return 1.0
def repeated_punct_ratio(text: str):
    """Return the number of repeated-punctuation runs divided by ``len(text)``.

    A run is two or more consecutive copies of the same character out of
    ``. , ; : ! ? _ -``; each run counts once regardless of its length.
    Empty input returns 0.0.
    """
    if not text:
        return 0.0
    run_pattern = re.compile(r"([.,;:!?_\-])\1{1,}")
    run_total = sum(1 for _ in run_pattern.finditer(text))
    return run_total / len(text)
def digit_noise_ratio(text: str):
    """Return the count of digit/letter-glued tokens per word in *text*.

    The numerator counts boundary-delimited tokens that are digits followed
    by ASCII letters, or ASCII letters followed by digits. The denominator
    is the number of words returned by ``tokenize_words``. Text with no
    words returns 0.0.
    """
    word_total = len(tokenize_words(text))
    if word_total == 0:
        return 0.0
    glued_pattern = re.compile(r"\b(?:\d+[A-Za-z]+|[A-Za-z]+\d+)\b")
    return len(glued_pattern.findall(text)) / word_total
def uppercase_ratio(text: str):
    """Return the share of alphabetic characters in *text* that are uppercase.

    Non-alphabetic characters are ignored; text without any letters
    returns 0.0.
    """
    letter_total = 0
    upper_total = 0
    for ch in text:
        if ch.isalpha():
            letter_total += 1
            if ch.isupper():
                upper_total += 1
    return upper_total / letter_total if letter_total else 0.0
def broken_word_ratio(words):
    """Return the fraction of *words* that look malformed.

    A word longer than one character is counted as broken when it contains
    the same character three times in a row, is longer than 25 characters,
    or mixes a digit with a Latin letter. Single-character words are never
    counted but still contribute to the denominator. An empty *words*
    returns 1.0.
    """
    if not words:
        return 1.0

    triple_repeat = re.compile(r"(.)\1\1")
    has_digit = re.compile(r"[0-9]")
    has_letter = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]")

    def _looks_broken(word):
        if len(word) <= 1:
            return False
        if triple_repeat.search(word):
            return True
        if len(word) > 25:
            return True
        return bool(has_digit.search(word) and has_letter.search(word))

    broken_total = sum(1 for word in words if _looks_broken(word))
    return broken_total / len(words)
def line_length_stability(text: str):
    """Score how uniform the non-blank line lengths of *text* are.

    The result is ``1 - std/mean`` of the stripped line lengths (population
    standard deviation), floored at 0.0. Fewer than two non-blank lines
    returns 1.0.
    """
    lengths = []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if stripped:
            lengths.append(len(stripped))

    count = len(lengths)
    if count < 2:
        return 1.0

    mean = sum(lengths) / count
    if mean == 0:
        return 1.0

    variance = sum((n - mean) ** 2 for n in lengths) / count
    relative_spread = math.sqrt(variance) / mean
    return max(0.0, 1.0 - relative_spread)
def lexical_plausibility(words, lang_code):
    """Estimate how many of *words* are ordinary words of the language.

    Each word is lower-cased; single-character and all-digit words are
    skipped. The remaining words are looked up with ``zipf_frequency``.

    Returns:
        A ``(ratio, rare_words)`` tuple. ``ratio`` is the share of looked-up
        words with frequency >= 3.0. ``rare_words`` holds up to 30 original
        words with frequency < 2.5. Empty *words* gives ``(0.0, [])``; a
        missing ``zipf_frequency`` gives ``(0.5, [])``; no scorable word
        gives a ratio of 0.0.
    """
    if not words:
        return 0.0, []
    if zipf_frequency is None:
        return 0.5, []

    frequencies = []
    rare_words = []
    for word in words:
        lowered = word.lower()
        if len(lowered) > 1 and not lowered.isdigit():
            frequency = zipf_frequency(lowered, lang_code)
            frequencies.append(frequency)
            if frequency < 2.5:
                rare_words.append(word)

    rare_words = rare_words[:30]
    if not frequencies:
        return 0.0, rare_words

    plausible_total = len([f for f in frequencies if f >= 3.0])
    return plausible_total / len(frequencies), rare_words
def tokenizer_fragmentation_metrics(text: str, tokenizer_name: str):
    """Measure how heavily the chosen tokenizer splits the words of *text*.

    Every word from ``tokenize_words`` is tokenized on its own with the
    tokenizer returned by ``get_tokenizer``.

    Returns:
        A dict with ``tokens_per_word`` (mean pieces per word),
        ``fragmented_word_ratio`` (share of words split into more than
        three pieces) and ``single_char_piece_ratio`` (share of pieces that
        are one character long once leading/trailing ``▁``/``Ġ`` markers
        are stripped). All three are 0.0 when *text* has no words; the
        tokenizer is not loaded in that case.
    """
    words = tokenize_words(text)
    if not words:
        return {
            "tokens_per_word": 0.0,
            "fragmented_word_ratio": 0.0,
            "single_char_piece_ratio": 0.0,
        }

    tokenizer = get_tokenizer(tokenizer_name)
    pieces_by_word = [tokenizer.tokenize(word) for word in words]

    piece_total = sum(len(pieces) for pieces in pieces_by_word)
    lone_char_total = sum(
        1
        for pieces in pieces_by_word
        for piece in pieces
        if len(piece.strip("▁Ġ")) == 1
    )
    heavily_split_total = sum(1 for pieces in pieces_by_word if len(pieces) > 3)

    word_total = len(words)
    return {
        "tokens_per_word": piece_total / word_total,
        "fragmented_word_ratio": heavily_split_total / word_total,
        # A tokenizer could return no pieces at all; avoid dividing by zero.
        "single_char_piece_ratio": lone_char_total / max(piece_total, 1),
    }
def classify_score(score: float):
    """Map a numeric quality score to its label.

    Thresholds are inclusive lower bounds: 85 "Very good", 70 "Good",
    50 "Medium", 30 "Poor"; anything lower is "Very poor".
    """
    bands = (
        (85, "Very good"),
        (70, "Good"),
        (50, "Medium"),
        (30, "Poor"),
    )
    for lower_bound, name in bands:
        if score >= lower_bound:
            return name
    return "Very poor"
def compute_ocr_quality(text: str, language: str, tokenizer_name: str):
    """Compute a 0-100 OCR quality score for *text* with its metrics.

    The score starts at 100, loses weighted penalties for each noise
    metric, gains a small bonus for stable line lengths, and is then
    rounded to two decimals and clamped to [0, 100]. An unknown *language*
    falls back to the "en" code.

    Returns:
        A dict with ``quality_score``, ``label``, ``details`` (metrics
        rounded to four decimals, plus ``word_count``) and ``bad_words``.
        Empty, whitespace-only or ``None`` text yields a score of 0.0 with
        the label "No text" and no details.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return {
            "quality_score": 0.0,
            "label": "No text",
            "details": {},
            "bad_words": [],
        }

    lang_code = LANGS.get(language, "en")
    words = tokenize_words(cleaned)

    suspicious = suspicious_char_ratio(cleaned)
    repeated = repeated_punct_ratio(cleaned)
    digit_noise = digit_noise_ratio(cleaned)
    broken = broken_word_ratio(words)
    line_stability = line_length_stability(cleaned)
    upper = uppercase_ratio(cleaned)
    lexical_score, bad_words = lexical_plausibility(words, lang_code)
    frag = tokenizer_fragmentation_metrics(cleaned, tokenizer_name)

    # Penalties are subtracted one by one, in this order, so the floating
    # point result does not depend on how the terms are grouped.
    penalties = (
        suspicious * 220,
        repeated * 180,
        digit_noise * 45,
        broken * 65,
        max(0.0, 0.55 - lexical_score) * 90,
        max(0.0, frag["tokens_per_word"] - 1.8) * 25,
        frag["fragmented_word_ratio"] * 60,
        frag["single_char_piece_ratio"] * 40,
        max(0.0, upper - 0.35) * 35,
    )
    score = 100.0
    for penalty in penalties:
        score -= penalty
    score += max(0.0, line_stability - 0.5) * 10
    score = max(0.0, min(100.0, round(score, 2)))

    measured = (
        ("suspicious_char_ratio", suspicious),
        ("repeated_punct_ratio", repeated),
        ("digit_noise_ratio", digit_noise),
        ("broken_word_ratio", broken),
        ("lexical_plausibility", lexical_score),
        ("line_length_stability", line_stability),
        ("uppercase_ratio", upper),
        ("tokens_per_word", frag["tokens_per_word"]),
        ("fragmented_word_ratio", frag["fragmented_word_ratio"]),
        ("single_char_piece_ratio", frag["single_char_piece_ratio"]),
    )
    details = {"word_count": len(words)}
    details.update((key, round(value, 4)) for key, value in measured)

    return {
        "quality_score": score,
        "label": classify_score(score),
        "details": details,
        "bad_words": bad_words,
    }
def explain_result(result):
    """Format a quality result dict as three display strings.

    Returns:
        ``(summary, metrics, suspicious_words)``: a Markdown headline with
        the label and score, a Markdown bullet list with one line per entry
        of ``result["details"]``, and up to 30 entries of
        ``result["bad_words"]`` joined by commas ("None" when empty).
    """
    score = result["quality_score"]
    label = result["label"]
    summary = f"## OCR quality: **{label}**\n\n**Score:** {score}/100"

    bullet_lines = []
    for k, v in result["details"].items():
        bullet_lines.append(f"- **{k}**: {v}")
    metrics = "\n".join(bullet_lines)

    flagged = result["bad_words"]
    if flagged:
        suspicious_words = ", ".join(flagged[:30])
    else:
        suspicious_words = "None"

    return summary, metrics, suspicious_words
def analyze_text(text, language, tokenizer_name):
    """Score *text* and return the three formatted strings for display.

    Thin wrapper: runs ``compute_ocr_quality`` and passes its result to
    ``explain_result``.
    """
    return explain_result(compute_ocr_quality(text, language, tokenizer_name))
# Input widgets, in the positional order of analyze_text's parameters.
_demo_inputs = [
    gr.Textbox(lines=18, label="OCR text"),
    gr.Dropdown(choices=list(LANGS.keys()), value="English", label="Language"),
    gr.Dropdown(
        choices=list(TOKENIZER_MODELS.keys()),
        value="XLM-RoBERTa",
        label="Tokenizer used for fragmentation score",
    ),
]

# Output widgets, in the order of the tuple returned by analyze_text.
_demo_outputs = [
    gr.Markdown(label="Summary"),
    gr.Markdown(label="Metrics"),
    gr.Textbox(label="Potentially suspicious / rare words"),
]

# Example rows: the same newspaper header once clean and once OCR-garbled.
_demo_examples = [
    [
        "THE OMAHA DAILY BEE, TUESDAY, JUNE 24, 1890 NEWS ABOUT THE BLUFFS Comparatively Little Damage Done by Sunday Night's Storm.",
        "English",
        "XLM-RoBERTa",
    ],
    [
        "THHJ C M A 14 A1 HAM p 0 _ _ THE OMAHA DAILY BEE , TUEBPAY , JUNE 24 , 1890 , _ _ NEWS ABOUT THE BLUFFS Comparatively Little Damage Done b , Sunday Night's Storm",
        "English",
        "XLM-RoBERTa",
    ],
]

demo = gr.Interface(
    fn=analyze_text,
    inputs=_demo_inputs,
    outputs=_demo_outputs,
    title="OCR Quality Detector",
    description=(
        "A lightweight reference-free OCR quality estimator. "
        "It combines OCR-noise heuristics, lexical plausibility, and tokenizer fragmentation."
    ),
    examples=_demo_examples,
)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()