Spaces:

l3ipp
/

ocr-quality-detector

Sleeping

App Files Files Community

emanuelaboros committed on Apr 22

Commit

6876bc3

1 Parent(s): e3379aa

Add application file

Browse files

Files changed (1) hide show

app.py +209 -0

app.py ADDED Viewed

	@@ -0,0 +1,209 @@

+import re
+import math
+import gradio as gr
+from collections import Counter
+try:
+    from wordfreq import zipf_frequency
+except ImportError:
+    zipf_frequency = None
+# Maps the language names offered in the UI dropdown to the language codes
+# handed to zipf_frequency; compute_ocr_quality falls back to "en" for any
+# name that is not listed here.
+LANGS = {
+    "English": "en",
+    "French": "fr",
+    "German": "de",
+    "Italian": "it",
+}
+def tokenize_words(text: str):
+    """Split ``text`` into word tokens.
+
+    A token is a run of word characters, apostrophes and hyphens delimited by
+    word boundaries, so forms such as "Night's" or "well-known" stay whole.
+
+    Returns the tokens as a list of strings in order of appearance.
+    """
+    word_pattern = re.compile(r"\b[\w'-]+\b", flags=re.UNICODE)
+    return word_pattern.findall(text)
+def suspicious_char_ratio(text: str):
+    """Return the share of characters in ``text`` outside the expected set.
+
+    Expected characters are spaces, tabs, line breaks, word characters,
+    Latin-1 accented letters and the punctuation listed in the pattern below.
+    Empty input is reported as 1.0.
+    """
+    if not text:
+        return 1.0
+    unexpected = re.compile(r"[^ \n\r\t\wÀ-ÖØ-öø-ÿ.,;:!?()'\"%-]", flags=re.UNICODE)
+    hits = sum(1 for _ in unexpected.finditer(text))
+    return hits / max(len(text), 1)
+def repeated_punct_ratio(text: str):
+    """Return the number of repeated-punctuation runs per character.
+
+    A run is one of the marks ``. , ; : ! ? _ -`` occurring two or more times
+    in a row; each run counts once regardless of its length. Empty input
+    yields 0.0.
+    """
+    if not text:
+        return 0.0
+    run_count = 0
+    for _ in re.finditer(r"([.,;:!?_\-])\1{1,}", text):
+        run_count += 1
+    return run_count / max(len(text), 1)
+def digit_noise_ratio(text: str):
+    """Return the ratio of letter/digit hybrid tokens to word tokens.
+
+    A hybrid token is a standalone run of digits followed by letters, or of
+    letters followed by digits (for example "A1" or "14th"). Letters include
+    the Latin-1 accented range, the same class ``broken_word_ratio`` uses, so
+    hybrids in French, German or Italian text (e.g. "café12") are counted
+    instead of being silently skipped.
+
+    Returns 0.0 for empty input. The denominator is the number of tokens from
+    ``tokenize_words``, floored at 1.
+    """
+    if not text:
+        return 0.0
+    letter = "[A-Za-zÀ-ÖØ-öø-ÿ]"
+    hybrid = re.compile(rf"\b(?:\d+{letter}+|{letter}+\d+)\b")
+    hybrid_count = len(hybrid.findall(text))
+    return hybrid_count / max(len(tokenize_words(text)), 1)
+def uppercase_ratio(text: str):
+    """Return the fraction of alphabetic characters in ``text`` that are uppercase.
+
+    Non-alphabetic characters are ignored; text without letters gives 0.0.
+    """
+    total_letters = 0
+    capitals = 0
+    for ch in text:
+        if not ch.isalpha():
+            continue
+        total_letters += 1
+        if ch.isupper():
+            capitals += 1
+    if total_letters == 0:
+        return 0.0
+    return capitals / total_letters
+def broken_word_ratio(words):
+    """Return the fraction of ``words`` that look damaged.
+
+    A word of two or more characters is counted when it contains the same
+    character three times in a row, is longer than 20 characters, or mixes
+    digits with letters. One-character words are never counted but stay in
+    the denominator. An empty list yields 1.0.
+    """
+    if not words:
+        return 1.0
+
+    def looks_broken(word):
+        if len(word) <= 1:
+            return False
+        has_triple = re.search(r"(.)\1\1", word) is not None
+        too_long = len(word) > 20
+        mixes_digits = (
+            re.search(r"[0-9]", word) is not None
+            and re.search(r"[A-Za-zÀ-ÖØ-öø-ÿ]", word) is not None
+        )
+        return has_triple or too_long or mixes_digits
+
+    damaged = sum(1 for word in words if looks_broken(word))
+    return damaged / max(len(words), 1)
+def lexical_plausibility(words, lang_code):
+    """Estimate how many of ``words`` are common in the given language.
+
+    Words are lowercased; one-character and all-digit tokens are skipped.
+    Each remaining word is looked up with ``zipf_frequency``.
+
+    Returns a ``(ratio, rare_words)`` tuple: ``ratio`` is the share of
+    looked-up words with a Zipf frequency of at least 3.0, and ``rare_words``
+    holds up to 20 original-case words whose frequency is below 2.5. With no
+    words the result is ``(0.0, [])``; when ``zipf_frequency`` is None (the
+    import failed) it is ``(0.5, [])``.
+    """
+    if not words:
+        return 0.0, []
+    if zipf_frequency is None:
+        return 0.5, []
+    frequencies = []
+    rare = []
+    for original in words:
+        lowered = original.lower()
+        if len(lowered) > 1 and not lowered.isdigit():
+            freq = zipf_frequency(lowered, lang_code)
+            frequencies.append(freq)
+            if freq < 2.5:
+                rare.append(original)
+    rare = rare[:20]
+    if not frequencies:
+        return 0.0, rare
+    common = len([freq for freq in frequencies if freq >= 3.0])
+    return common / len(frequencies), rare
+def line_length_stability(text: str):
+    """Score how uniform the non-blank line lengths of ``text`` are.
+
+    The score is one minus the coefficient of variation (population standard
+    deviation divided by mean) of the stripped line lengths, floored at 0.0.
+    Fewer than two non-blank lines gives 1.0.
+    """
+    lengths = []
+    for raw_line in text.splitlines():
+        stripped = raw_line.strip()
+        if stripped:
+            lengths.append(len(stripped))
+    count = len(lengths)
+    if count < 2:
+        return 1.0
+    mean = sum(lengths) / count
+    if mean == 0:
+        return 1.0
+    squared_deviations = [(length - mean) ** 2 for length in lengths]
+    std = math.sqrt(sum(squared_deviations) / count)
+    return max(0.0, 1.0 - (std / mean))
+def compute_ocr_quality(text, language):
+    """Score OCR output from 0 to 100 using reference-free text heuristics.
+
+    Args:
+        text: OCR text; None or whitespace-only input is treated as empty.
+        language: Display name looked up in ``LANGS``; unknown names fall back
+            to "en".
+
+    Returns:
+        A dict with ``quality_score`` (clamped to 0..100), ``label``,
+        ``details`` (word count plus each metric rounded to 4 places) and
+        ``bad_words`` (the rare words reported by ``lexical_plausibility``).
+    """
+    cleaned = (text or "").strip()
+    if not cleaned:
+        return {
+            "quality_score": 0,
+            "label": "No text",
+            "details": {},
+            "bad_words": [],
+        }
+
+    words = tokenize_words(cleaned)
+    lex_score, bad_words = lexical_plausibility(words, LANGS.get(language, "en"))
+    metrics = {
+        "suspicious_char_ratio": suspicious_char_ratio(cleaned),
+        "repeated_punct_ratio": repeated_punct_ratio(cleaned),
+        "digit_noise_ratio": digit_noise_ratio(cleaned),
+        "broken_word_ratio": broken_word_ratio(words),
+        "lexical_plausibility": lex_score,
+        "line_length_stability": line_length_stability(cleaned),
+        "uppercase_ratio": uppercase_ratio(cleaned),
+    }
+
+    # Weighted penalties, applied one after another starting from 100.
+    # Lexical plausibility only costs points below 0.55 and the uppercase
+    # share only above 0.35.
+    penalties = (
+        metrics["suspicious_char_ratio"] * 220,
+        metrics["repeated_punct_ratio"] * 180,
+        metrics["digit_noise_ratio"] * 40,
+        metrics["broken_word_ratio"] * 60,
+        max(0, 0.55 - lex_score) * 90,
+        max(0, metrics["uppercase_ratio"] - 0.35) * 40,
+    )
+    score = 100
+    for penalty in penalties:
+        score -= penalty
+    # Line-length stability above 0.5 earns a small bonus.
+    score += max(0, metrics["line_length_stability"] - 0.5) * 10
+    score = max(0, min(100, round(score, 2)))
+
+    # First threshold reached (highest first) decides the label.
+    label = "Very poor"
+    for threshold, name in ((85, "Very good"), (70, "Good"), (50, "Medium"), (30, "Poor")):
+        if score >= threshold:
+            label = name
+            break
+
+    details = {"words": len(words)}
+    details.update((key, round(value, 4)) for key, value in metrics.items())
+    return {
+        "quality_score": score,
+        "label": label,
+        "details": details,
+        "bad_words": bad_words,
+    }
+def analyze_text(text, language):
+    """Run the quality analysis and format it for display.
+
+    Returns a 3-tuple: a Markdown summary with label and score, a Markdown
+    bullet list of the detail metrics, and a comma-separated string of the
+    flagged words ("None" when there are none).
+    """
+    result = compute_ocr_quality(text, language)
+    summary = f"### OCR quality: **{result['label']}**\n\n**Score:** {result['quality_score']} / 100"
+    bullet_lines = []
+    for metric, value in result["details"].items():
+        bullet_lines.append(f"- **{metric}**: {value}")
+    metrics_md = "\n".join(bullet_lines)
+    flagged = result["bad_words"]
+    if flagged:
+        suspicious_words = ", ".join(flagged[:30])
+    else:
+        suspicious_words = "None"
+    return summary, metrics_md, suspicious_words
+# Gradio UI: a text box and a language dropdown are passed to analyze_text,
+# whose three return values correspond to the summary, metrics and
+# flagged-words outputs declared below.
+demo = gr.Interface(
+    fn=analyze_text,
+    inputs=[
+        gr.Textbox(lines=18, label="OCR text"),
+        gr.Dropdown(choices=list(LANGS.keys()), value="English", label="Language"),
+    ],
+    outputs=[
+        gr.Markdown(label="Summary"),
+        gr.Markdown(label="Metrics"),
+        gr.Textbox(label="Potentially suspicious / rare words"),
+    ],
+    title="OCR Quality Detector",
+    description="A lightweight reference-free OCR quality estimator based on text heuristics.",
+    # Two renderings of the same newspaper header: a clean one and a noisy one.
+    examples=[
+        [
+            "THE OMAHA DAILY BEE, TUESDAY, JUNE 24, 1890 NEWS ABOUT THE BLUFFS Comparatively Little Damage Done by Sunday Night's Storm.",
+            "English",
+        ],
+        [
+            "THHJ C M A 14 A1 HAM p 0 _ _ THE OMAHA DAILY BEE , TUEBPAY , JUNE 24 , 1890 , _ _ NEWS ABOUT THE BLUFFS Comparatively Little Damage Done b , Sunday Night's Storm",
+            "English",
+        ],
+    ],
+)
+# Launch the app only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()