Spaces:

l3ipp
/

ocr-quality-detector

Sleeping

App Files Files Community

emanuelaboros committed on Apr 22

Commit

3e9b591

1 Parent(s): 6876bc3

change to gpt

Browse files

Files changed (1) hide show

app.py +130 -50

app.py CHANGED Viewed

@@ -1,7 +1,9 @@
 import re
 import math
 import gradio as gr
-from collections import Counter
 try:
     from wordfreq import zipf_frequency
@@ -16,6 +18,22 @@ LANGS = {
     "Italian": "it",
 }
 def tokenize_words(text: str):
     return re.findall(r"\b[\w'-]+\b", text, flags=re.UNICODE)
@@ -24,7 +42,9 @@ def tokenize_words(text: str):
 def suspicious_char_ratio(text: str):
     if not text:
         return 1.0
-    suspicious = re.findall(r"[^ \n\r\t\wÀ-ÖØ-öø-ÿ.,;:!?()'\"%-]", text, flags=re.UNICODE)
     return len(suspicious) / max(len(text), 1)
@@ -36,10 +56,11 @@ def repeated_punct_ratio(text: str):
 def digit_noise_ratio(text: str):
-    if not text:
         return 0.0
     weird_digit_patterns = re.findall(r"\b(?:\d+[A-Za-z]+|[A-Za-z]+\d+)\b", text)
-    return len(weird_digit_patterns) / max(len(tokenize_words(text)), 1)
 def uppercase_ratio(text: str):
@@ -59,13 +80,26 @@ def broken_word_ratio(words):
             continue
         if re.search(r"(.)\1\1", w):
             broken += 1
-        elif len(w) > 20:
             broken += 1
         elif re.search(r"[0-9]", w) and re.search(r"[A-Za-zÀ-ÖØ-öø-ÿ]", w):
             broken += 1
     return broken / max(len(words), 1)
 def lexical_plausibility(words, lang_code):
     if not words:
         return 0.0, []
@@ -74,6 +108,7 @@ def lexical_plausibility(words, lang_code):
     scored = []
     bad_words = []
     for w in words:
         lw = w.lower()
         if len(lw) <= 1 or lw.isdigit():
@@ -84,30 +119,61 @@ def lexical_plausibility(words, lang_code):
             bad_words.append(w)
     if not scored:
-        return 0.0, bad_words[:20]
     plausible = sum(1 for z in scored if z >= 3.0)
-    return plausible / len(scored), bad_words[:20]
-def line_length_stability(text: str):
-    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
-    if len(lines) < 2:
-        return 1.0
-    lengths = [len(ln) for ln in lines]
-    mean = sum(lengths) / len(lengths)
-    if mean == 0:
-        return 1.0
-    var = sum((x - mean) ** 2 for x in lengths) / len(lengths)
-    std = math.sqrt(var)
-    return max(0.0, 1.0 - (std / mean))
-def compute_ocr_quality(text, language):
     text = (text or "").strip()
     if not text:
         return {
-            "quality_score": 0,
             "label": "No text",
             "details": {},
             "bad_words": [],
@@ -120,42 +186,38 @@ def compute_ocr_quality(text, language):
     repeated = repeated_punct_ratio(text)
     digit_noise = digit_noise_ratio(text)
     broken = broken_word_ratio(words)
-    lex_score, bad_words = lexical_plausibility(words, lang_code)
     line_stability = line_length_stability(text)
     upper = uppercase_ratio(text)
-    # Weighted score
-    score = 100
     score -= suspicious * 220
     score -= repeated * 180
-    score -= digit_noise * 40
-    score -= broken * 60
-    score -= max(0, 0.55 - lex_score) * 90
-    score -= max(0, upper - 0.35) * 40
-    score += max(0, line_stability - 0.5) * 10
-    score = max(0, min(100, round(score, 2)))
-    if score >= 85:
-        label = "Very good"
-    elif score >= 70:
-        label = "Good"
-    elif score >= 50:
-        label = "Medium"
-    elif score >= 30:
-        label = "Poor"
-    else:
-        label = "Very poor"
     details = {
-        "words": len(words),
         "suspicious_char_ratio": round(suspicious, 4),
         "repeated_punct_ratio": round(repeated, 4),
         "digit_noise_ratio": round(digit_noise, 4),
         "broken_word_ratio": round(broken, 4),
-        "lexical_plausibility": round(lex_score, 4),
         "line_length_stability": round(line_stability, 4),
         "uppercase_ratio": round(upper, 4),
     }
     return {
@@ -166,18 +228,25 @@ def compute_ocr_quality(text, language):
     }
-def analyze_text(text, language):
-    result = compute_ocr_quality(text, language)
-    summary = f"### OCR quality: **{result['label']}**\n\n**Score:** {result['quality_score']} / 100"
-    metrics_md = "\n".join(
-        [f"- **{k}**: {v}" for k, v in result["details"].items()]
     )
     suspicious_words = ", ".join(result["bad_words"][:30]) if result["bad_words"] else "None"
-    return summary, metrics_md, suspicious_words
 demo = gr.Interface(
@@ -185,6 +254,11 @@ demo = gr.Interface(
     inputs=[
         gr.Textbox(lines=18, label="OCR text"),
         gr.Dropdown(choices=list(LANGS.keys()), value="English", label="Language"),
     ],
     outputs=[
         gr.Markdown(label="Summary"),
@@ -192,17 +266,23 @@ demo = gr.Interface(
         gr.Textbox(label="Potentially suspicious / rare words"),
     ],
     title="OCR Quality Detector",
-    description="A lightweight reference-free OCR quality estimator based on text heuristics.",
     examples=[
         [
             "THE OMAHA DAILY BEE, TUESDAY, JUNE 24, 1890 NEWS ABOUT THE BLUFFS Comparatively Little Damage Done by Sunday Night's Storm.",
             "English",
         ],
         [
             "THHJ C M A 14 A1 HAM p 0 _ _ THE OMAHA DAILY BEE , TUEBPAY , JUNE 24 , 1890 , _ _ NEWS ABOUT THE BLUFFS Comparatively Little Damage Done b , Sunday Night's Storm",
             "English",
         ],
     ],
 )
 if __name__ == "__main__":

 import re
 import math
+from difflib import SequenceMatcher
 import gradio as gr
+from transformers import AutoTokenizer
 try:
     from wordfreq import zipf_frequency
     "Italian": "it",
 }
+TOKENIZER_MODELS = {
+    "GPT-2": "gpt2",
+    "XLM-RoBERTa": "xlm-roberta-base",
+    "mT5": "google/mt5-small",
+}
+_tokenizer_cache = {}
+def get_tokenizer(model_name: str):
+    if model_name not in _tokenizer_cache:
+        _tokenizer_cache[model_name] = AutoTokenizer.from_pretrained(
+            TOKENIZER_MODELS[model_name]
+        )
+    return _tokenizer_cache[model_name]
 def tokenize_words(text: str):
     return re.findall(r"\b[\w'-]+\b", text, flags=re.UNICODE)
 def suspicious_char_ratio(text: str):
     if not text:
         return 1.0
+    suspicious = re.findall(
+        r"[^ \n\r\t\wÀ-ÖØ-öø-ÿ.,;:!?()'\"%&/\-]", text, flags=re.UNICODE
+    )
     return len(suspicious) / max(len(text), 1)
 def digit_noise_ratio(text: str):
+    words = tokenize_words(text)
+    if not words:
         return 0.0
     weird_digit_patterns = re.findall(r"\b(?:\d+[A-Za-z]+|[A-Za-z]+\d+)\b", text)
+    return len(weird_digit_patterns) / max(len(words), 1)
 def uppercase_ratio(text: str):
             continue
         if re.search(r"(.)\1\1", w):
             broken += 1
+        elif len(w) > 25:
             broken += 1
         elif re.search(r"[0-9]", w) and re.search(r"[A-Za-zÀ-ÖØ-öø-ÿ]", w):
             broken += 1
     return broken / max(len(words), 1)
+def line_length_stability(text: str):
+    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
+    if len(lines) < 2:
+        return 1.0
+    lengths = [len(ln) for ln in lines]
+    mean = sum(lengths) / len(lengths)
+    if mean == 0:
+        return 1.0
+    var = sum((x - mean) ** 2 for x in lengths) / len(lengths)
+    std = math.sqrt(var)
+    return max(0.0, 1.0 - (std / mean))
 def lexical_plausibility(words, lang_code):
     if not words:
         return 0.0, []
     scored = []
     bad_words = []
     for w in words:
         lw = w.lower()
         if len(lw) <= 1 or lw.isdigit():
             bad_words.append(w)
     if not scored:
+        return 0.0, bad_words[:30]
     plausible = sum(1 for z in scored if z >= 3.0)
+    return plausible / len(scored), bad_words[:30]
+def tokenizer_fragmentation_metrics(text: str, tokenizer_name: str):
+    words = tokenize_words(text)
+    if not words:
+        return {
+            "tokens_per_word": 0.0,
+            "fragmented_word_ratio": 0.0,
+            "single_char_piece_ratio": 0.0,
+        }
+    tokenizer = get_tokenizer(tokenizer_name)
+    token_counts = []
+    single_char_pieces = 0
+    total_pieces = 0
+    for w in words:
+        pieces = tokenizer.tokenize(w)
+        n = len(pieces)
+        token_counts.append(n)
+        total_pieces += n
+        single_char_pieces += sum(1 for p in pieces if len(p.strip("▁Ġ")) == 1)
+    tokens_per_word = total_pieces / len(words)
+    fragmented_word_ratio = sum(1 for n in token_counts if n > 3) / len(words)
+    single_char_piece_ratio = single_char_pieces / max(total_pieces, 1)
+    return {
+        "tokens_per_word": tokens_per_word,
+        "fragmented_word_ratio": fragmented_word_ratio,
+        "single_char_piece_ratio": single_char_piece_ratio,
+    }
+def classify_score(score: float):
+    if score >= 85:
+        return "Very good"
+    if score >= 70:
+        return "Good"
+    if score >= 50:
+        return "Medium"
+    if score >= 30:
+        return "Poor"
+    return "Very poor"
+def compute_ocr_quality(text: str, language: str, tokenizer_name: str):
     text = (text or "").strip()
     if not text:
         return {
+            "quality_score": 0.0,
             "label": "No text",
             "details": {},
             "bad_words": [],
     repeated = repeated_punct_ratio(text)
     digit_noise = digit_noise_ratio(text)
     broken = broken_word_ratio(words)
     line_stability = line_length_stability(text)
     upper = uppercase_ratio(text)
+    lexical_score, bad_words = lexical_plausibility(words, lang_code)
+    frag = tokenizer_fragmentation_metrics(text, tokenizer_name)
+    score = 100.0
     score -= suspicious * 220
     score -= repeated * 180
+    score -= digit_noise * 45
+    score -= broken * 65
+    score -= max(0.0, 0.55 - lexical_score) * 90
+    score -= max(0.0, frag["tokens_per_word"] - 1.8) * 25
+    score -= frag["fragmented_word_ratio"] * 60
+    score -= frag["single_char_piece_ratio"] * 40
+    score -= max(0.0, upper - 0.35) * 35
+    score += max(0.0, line_stability - 0.5) * 10
+    score = max(0.0, min(100.0, round(score, 2)))
+    label = classify_score(score)
     details = {
+        "word_count": len(words),
         "suspicious_char_ratio": round(suspicious, 4),
         "repeated_punct_ratio": round(repeated, 4),
         "digit_noise_ratio": round(digit_noise, 4),
         "broken_word_ratio": round(broken, 4),
+        "lexical_plausibility": round(lexical_score, 4),
         "line_length_stability": round(line_stability, 4),
         "uppercase_ratio": round(upper, 4),
+        "tokens_per_word": round(frag["tokens_per_word"], 4),
+        "fragmented_word_ratio": round(frag["fragmented_word_ratio"], 4),
+        "single_char_piece_ratio": round(frag["single_char_piece_ratio"], 4),
     }
     return {
     }
+def explain_result(result):
+    score = result["quality_score"]
+    label = result["label"]
+    details = result["details"]
+    summary = f"## OCR quality: **{label}**\n\n**Score:** {score}/100"
+    metrics = "\n".join(
+        f"- **{k}**: {v}" for k, v in details.items()
     )
     suspicious_words = ", ".join(result["bad_words"][:30]) if result["bad_words"] else "None"
+    return summary, metrics, suspicious_words
+def analyze_text(text, language, tokenizer_name):
+    result = compute_ocr_quality(text, language, tokenizer_name)
+    return explain_result(result)
 demo = gr.Interface(
     inputs=[
         gr.Textbox(lines=18, label="OCR text"),
         gr.Dropdown(choices=list(LANGS.keys()), value="English", label="Language"),
+        gr.Dropdown(
+            choices=list(TOKENIZER_MODELS.keys()),
+            value="XLM-RoBERTa",
+            label="Tokenizer used for fragmentation score",
+        ),
     ],
     outputs=[
         gr.Markdown(label="Summary"),
         gr.Textbox(label="Potentially suspicious / rare words"),
     ],
     title="OCR Quality Detector",
+    description=(
+        "A lightweight reference-free OCR quality estimator. "
+        "It combines OCR-noise heuristics, lexical plausibility, and tokenizer fragmentation."
+    ),
     examples=[
         [
             "THE OMAHA DAILY BEE, TUESDAY, JUNE 24, 1890 NEWS ABOUT THE BLUFFS Comparatively Little Damage Done by Sunday Night's Storm.",
             "English",
+            "XLM-RoBERTa",
         ],
         [
             "THHJ C M A 14 A1 HAM p 0 _ _ THE OMAHA DAILY BEE , TUEBPAY , JUNE 24 , 1890 , _ _ NEWS ABOUT THE BLUFFS Comparatively Little Damage Done b , Sunday Night's Storm",
             "English",
+            "XLM-RoBERTa",
         ],
     ],
+    allow_flagging="never",
 )
 if __name__ == "__main__":