# Source: Hugging Face Spaces - l3ipp/ocr-quality-detector (Space status: Sleeping)
# File size: 8,100 Bytes

import re
import math
from difflib import SequenceMatcher

import gradio as gr
from transformers import AutoTokenizer

try:
    from wordfreq import zipf_frequency
except ImportError:
    zipf_frequency = None


# Display name -> language code. The display names populate the "Language"
# dropdown; the code is what gets passed to ``zipf_frequency`` as ``lang_code``.
LANGS = {
    "English": "en",
    "French": "fr",
    "German": "de",
    "Italian": "it",
}

# Display name -> model identifier handed to ``AutoTokenizer.from_pretrained``.
# The display names populate the tokenizer dropdown.
TOKENIZER_MODELS = {
    "GPT-2": "gpt2",
    "XLM-RoBERTa": "xlm-roberta-base",
    "mT5": "google/mt5-small",
}

# Loaded tokenizers, keyed by the display name used in ``TOKENIZER_MODELS``.
# Filled lazily by ``get_tokenizer``.
_tokenizer_cache = {}


def get_tokenizer(model_name: str):
    """Return the tokenizer registered under *model_name*, loading it on first use.

    Args:
        model_name: A key of ``TOKENIZER_MODELS``.

    Returns:
        The tokenizer object produced by ``AutoTokenizer.from_pretrained`` for
        the mapped model identifier. The object is stored in
        ``_tokenizer_cache`` so later calls with the same name reuse it.

    Raises:
        KeyError: If *model_name* is not a key of ``TOKENIZER_MODELS``.
    """
    try:
        return _tokenizer_cache[model_name]
    except KeyError:
        # Not loaded yet; fall through and load it below.
        pass

    loaded = AutoTokenizer.from_pretrained(TOKENIZER_MODELS[model_name])
    _tokenizer_cache[model_name] = loaded
    return loaded


def tokenize_words(text: str):
    """Split *text* into word-like tokens.

    A token is a run of word characters, apostrophes and hyphens that starts
    and ends on a regex word boundary, so leading/trailing apostrophes and
    hyphens are not part of the token.

    Returns:
        A list of the matched substrings, in order of appearance.
    """
    word_pattern = re.compile(r"\b[\w'-]+\b", flags=re.UNICODE)
    return [match.group(0) for match in word_pattern.finditer(text)]


def suspicious_char_ratio(text: str):
    """Return the fraction of characters in *text* outside the accepted set.

    Accepted characters are whitespace (space, newline, carriage return, tab),
    word characters, the accented Latin ranges listed in the pattern, and the
    punctuation ``. , ; : ! ? ( ) ' " % & / -``. Everything else counts as
    suspicious.

    Returns:
        ``1.0`` for empty input, otherwise suspicious characters divided by
        the total length of *text*.
    """
    if not text:
        return 1.0

    outside_allowed = re.compile(
        r"[^ \n\r\t\wÀ-ÖØ-öø-ÿ.,;:!?()'\"%&/\-]", flags=re.UNICODE
    )
    hit_count = sum(1 for _ in outside_allowed.finditer(text))
    return hit_count / max(len(text), 1)


def repeated_punct_ratio(text: str):
    """Return the number of repeated-punctuation runs per character of *text*.

    A run is two or more consecutive copies of the same character from
    ``. , ; : ! ? _ -``. Each run counts once regardless of its length.

    Returns:
        ``0.0`` for empty input, otherwise run count divided by ``len(text)``.
    """
    if not text:
        return 0.0

    run_count = sum(1 for _ in re.finditer(r"([.,;:!?_\-])\1{1,}", text))
    return run_count / max(len(text), 1)


def digit_noise_ratio(text: str):
    """Return the share of digit/letter hybrids relative to the word count.

    A hybrid is a boundary-delimited token made of digits followed by ASCII
    letters, or ASCII letters followed by digits (e.g. ``A1`` or ``14th``).
    The denominator is the number of tokens produced by ``tokenize_words``.

    Returns:
        ``0.0`` when *text* contains no words, otherwise hybrids / words.
    """
    word_total = len(tokenize_words(text))
    if word_total == 0:
        return 0.0

    hybrid_pattern = re.compile(r"\b(?:\d+[A-Za-z]+|[A-Za-z]+\d+)\b")
    hybrid_total = len(hybrid_pattern.findall(text))
    return hybrid_total / max(word_total, 1)


def uppercase_ratio(text: str):
    """Return the fraction of alphabetic characters in *text* that are uppercase.

    Non-alphabetic characters are ignored entirely.

    Returns:
        ``0.0`` when *text* has no alphabetic characters, otherwise
        uppercase letters divided by all letters.
    """
    letter_total = 0
    capital_total = 0
    for ch in text:
        if not ch.isalpha():
            continue
        letter_total += 1
        if ch.isupper():
            capital_total += 1

    if letter_total == 0:
        return 0.0
    return capital_total / letter_total


def broken_word_ratio(words):
    """Return the fraction of *words* that look structurally broken.

    A word of more than one character is flagged when any of these holds:
    it contains the same character three times in a row, it is longer than
    25 characters, or it mixes a digit with a Latin letter. Single-character
    words are never flagged but still count in the denominator.

    Returns:
        ``1.0`` when *words* is empty, otherwise flagged words / all words.
    """
    if not words:
        return 1.0

    def _looks_broken(token):
        if len(token) <= 1:
            return False
        if len(token) > 25 or re.search(r"(.)\1\1", token):
            return True
        has_digit = re.search(r"[0-9]", token)
        has_letter = re.search(r"[A-Za-zÀ-ÖØ-öø-ÿ]", token)
        return bool(has_digit and has_letter)

    flagged = sum(1 for token in words if _looks_broken(token))
    return flagged / max(len(words), 1)


def line_length_stability(text: str):
    """Score how uniform the non-blank line lengths of *text* are.

    Lines are stripped and blank lines dropped. The score is one minus the
    coefficient of variation (population standard deviation divided by the
    mean) of the remaining line lengths, floored at zero.

    Returns:
        ``1.0`` when fewer than two non-blank lines exist, otherwise a value
        in ``[0.0, 1.0]`` where ``1.0`` means all lines have equal length.
    """
    lengths = []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if stripped:
            lengths.append(len(stripped))

    count = len(lengths)
    if count < 2:
        return 1.0

    mean = sum(lengths) / count
    if mean == 0:
        return 1.0

    variance = sum((n - mean) ** 2 for n in lengths) / count
    relative_spread = math.sqrt(variance) / mean
    return max(0.0, 1.0 - relative_spread)


def lexical_plausibility(words, lang_code):
    """Estimate how many of *words* are common vocabulary in *lang_code*.

    Each word is lowercased; words of length one or consisting only of digits
    are skipped. The remaining words are looked up with ``zipf_frequency``.

    Args:
        words: Sequence of word strings.
        lang_code: Language code forwarded to ``zipf_frequency``.

    Returns:
        A ``(ratio, rare_words)`` tuple. ``ratio`` is the share of scored
        words with a Zipf frequency of at least 3.0; ``rare_words`` holds up
        to 30 original-case words whose frequency is below 2.5. Returns
        ``(0.0, [])`` for empty input, ``(0.5, [])`` when ``zipf_frequency``
        is unavailable, and a ratio of ``0.0`` when no word was scored.
    """
    if not words:
        return 0.0, []
    if zipf_frequency is None:
        return 0.5, []

    scored_pairs = [
        (word, zipf_frequency(word.lower(), lang_code))
        for word in words
        if len(word.lower()) > 1 and not word.lower().isdigit()
    ]
    rare_words = [word for word, zipf in scored_pairs if zipf < 2.5][:30]

    if not scored_pairs:
        return 0.0, rare_words

    plausible_total = sum(1 for _, zipf in scored_pairs if zipf >= 3.0)
    return plausible_total / len(scored_pairs), rare_words


def tokenizer_fragmentation_metrics(text: str, tokenizer_name: str):
    """Measure how heavily a subword tokenizer splits the words of *text*.

    Words come from ``tokenize_words``; each word is tokenized on its own with
    the tokenizer returned by ``get_tokenizer(tokenizer_name)``.

    Returns:
        A dict with three floats:

        * ``tokens_per_word`` - total pieces divided by the word count.
        * ``fragmented_word_ratio`` - share of words split into more than
          three pieces.
        * ``single_char_piece_ratio`` - share of pieces that are exactly one
          character long after stripping ``▁`` / ``Ġ`` marker characters
          from both ends.

        All three are ``0.0`` when *text* contains no words.
    """
    words = tokenize_words(text)
    word_total = len(words)
    if word_total == 0:
        return {
            "tokens_per_word": 0.0,
            "fragmented_word_ratio": 0.0,
            "single_char_piece_ratio": 0.0,
        }

    tokenizer = get_tokenizer(tokenizer_name)
    pieces_by_word = [tokenizer.tokenize(word) for word in words]
    piece_counts = [len(pieces) for pieces in pieces_by_word]
    piece_total = sum(piece_counts)

    lone_char_pieces = sum(
        1
        for pieces in pieces_by_word
        for piece in pieces
        if len(piece.strip("▁Ġ")) == 1
    )
    heavily_split_words = sum(1 for count in piece_counts if count > 3)

    return {
        "tokens_per_word": piece_total / word_total,
        "fragmented_word_ratio": heavily_split_words / word_total,
        "single_char_piece_ratio": lone_char_pieces / max(piece_total, 1),
    }


def classify_score(score: float):
    """Map a numeric quality *score* to a human-readable label.

    Bands (lower bound inclusive): 85 "Very good", 70 "Good", 50 "Medium",
    30 "Poor"; anything lower is "Very poor".
    """
    bands = (
        (85, "Very good"),
        (70, "Good"),
        (50, "Medium"),
        (30, "Poor"),
    )
    # Bands are ordered from highest to lowest, so the first hit wins.
    return next(
        (label for floor, label in bands if score >= floor),
        "Very poor",
    )


def compute_ocr_quality(text: str, language: str, tokenizer_name: str):
    """Compute a reference-free OCR quality score for *text*.

    The score starts at 100, has weighted penalties subtracted for each noise
    heuristic, gains a small bonus for stable line lengths, and is finally
    rounded to two decimals and clamped to ``[0, 100]``.

    Args:
        text: OCR output to evaluate; ``None`` is treated as empty.
        language: Display name looked up in ``LANGS``; unknown names fall
            back to ``"en"``.
        tokenizer_name: Tokenizer display name forwarded to
            ``tokenizer_fragmentation_metrics``.

    Returns:
        A dict with ``quality_score`` (float), ``label`` (str), ``details``
        (metric name -> value) and ``bad_words`` (list of str). For blank
        input the score is ``0.0``, the label ``"No text"`` and both
        collections are empty.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return {
            "quality_score": 0.0,
            "label": "No text",
            "details": {},
            "bad_words": [],
        }

    lang_code = LANGS.get(language, "en")
    words = tokenize_words(cleaned)

    # Individual heuristics, evaluated in a fixed order.
    suspicious = suspicious_char_ratio(cleaned)
    repeated = repeated_punct_ratio(cleaned)
    digit_noise = digit_noise_ratio(cleaned)
    broken = broken_word_ratio(words)
    line_stability = line_length_stability(cleaned)
    upper = uppercase_ratio(cleaned)
    lexical_score, bad_words = lexical_plausibility(words, lang_code)
    frag = tokenizer_fragmentation_metrics(cleaned, tokenizer_name)

    tokens_per_word = frag["tokens_per_word"]
    fragmented_ratio = frag["fragmented_word_ratio"]
    single_char_ratio = frag["single_char_piece_ratio"]

    # Penalties are applied one at a time, in this order, so the floating
    # point result does not depend on how they are grouped.
    penalties = (
        suspicious * 220,
        repeated * 180,
        digit_noise * 45,
        broken * 65,
        max(0.0, 0.55 - lexical_score) * 90,
        max(0.0, tokens_per_word - 1.8) * 25,
        fragmented_ratio * 60,
        single_char_ratio * 40,
        max(0.0, upper - 0.35) * 35,
    )
    score = 100.0
    for penalty in penalties:
        score -= penalty
    score += max(0.0, line_stability - 0.5) * 10

    capped = min(100.0, round(score, 2))
    score = max(0.0, capped)

    rounded_metrics = (
        ("suspicious_char_ratio", suspicious),
        ("repeated_punct_ratio", repeated),
        ("digit_noise_ratio", digit_noise),
        ("broken_word_ratio", broken),
        ("lexical_plausibility", lexical_score),
        ("line_length_stability", line_stability),
        ("uppercase_ratio", upper),
        ("tokens_per_word", tokens_per_word),
        ("fragmented_word_ratio", fragmented_ratio),
        ("single_char_piece_ratio", single_char_ratio),
    )
    details = {"word_count": len(words)}
    for metric_name, metric_value in rounded_metrics:
        details[metric_name] = round(metric_value, 4)

    return {
        "quality_score": score,
        "label": classify_score(score),
        "details": details,
        "bad_words": bad_words,
    }


def explain_result(result):
    """Render a quality *result* dict as three display strings.

    Args:
        result: Mapping with the keys ``quality_score``, ``label``,
            ``details`` and ``bad_words``.

    Returns:
        A ``(summary, metrics, suspicious_words)`` tuple: a Markdown heading
        with label and score, a Markdown bullet list with one line per entry
        of ``details``, and a comma-separated list of at most 30 bad words
        (the string ``"None"`` when there are none).
    """
    score = result["quality_score"]
    label = result["label"]
    metric_lines = [
        f"- **{k}**: {v}" for k, v in result["details"].items()
    ]

    flagged = result["bad_words"]
    if flagged:
        suspicious_words = ", ".join(flagged[:30])
    else:
        suspicious_words = "None"

    return (
        f"## OCR quality: **{label}**\n\n**Score:** {score}/100",
        "\n".join(metric_lines),
        suspicious_words,
    )


def analyze_text(text, language, tokenizer_name):
    """Score *text* and return the three display strings for the UI.

    Runs ``compute_ocr_quality`` with the given arguments and passes its
    result straight to ``explain_result``.
    """
    return explain_result(compute_ocr_quality(text, language, tokenizer_name))


# Gradio UI definition. ``analyze_text`` receives the three inputs in the
# order listed and returns three strings, one for each output component.
demo = gr.Interface(
    fn=analyze_text,
    inputs=[
        gr.Textbox(lines=18, label="OCR text"),
        # Choices are the display names; they are mapped to codes via LANGS.
        gr.Dropdown(choices=list(LANGS.keys()), value="English", label="Language"),
        gr.Dropdown(
            choices=list(TOKENIZER_MODELS.keys()),
            value="XLM-RoBERTa",
            label="Tokenizer used for fragmentation score",
        ),
    ],
    # Order matches the (summary, metrics, suspicious_words) tuple returned
    # by ``explain_result``.
    outputs=[
        gr.Markdown(label="Summary"),
        gr.Markdown(label="Metrics"),
        gr.Textbox(label="Potentially suspicious / rare words"),
    ],
    title="OCR Quality Detector",
    description=(
        "A lightweight reference-free OCR quality estimator. "
        "It combines OCR-noise heuristics, lexical plausibility, and tokenizer fragmentation."
    ),
    # Each example is [text, language, tokenizer]. The second text is a
    # visibly noisier rendering of the same newspaper headline as the first.
    examples=[
        [
            "THE OMAHA DAILY BEE, TUESDAY, JUNE 24, 1890 NEWS ABOUT THE BLUFFS Comparatively Little Damage Done by Sunday Night's Storm.",
            "English",
            "XLM-RoBERTa",
        ],
        [
            "THHJ C M A 14 A1 HAM p 0 _ _ THE OMAHA DAILY BEE , TUEBPAY , JUNE 24 , 1890 , _ _ NEWS ABOUT THE BLUFFS Comparatively Little Damage Done b , Sunday Night's Storm",
            "English",
            "XLM-RoBERTa",
        ],
    ],
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()