emanuelaboros committed on
Commit
6876bc3
·
1 Parent(s): e3379aa

Add application file

Browse files
Files changed (1) hide show
  1. app.py +209 -0
app.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import math
3
+ import gradio as gr
4
+ from collections import Counter
5
+
6
+ try:
7
+ from wordfreq import zipf_frequency
8
+ except ImportError:
9
+ zipf_frequency = None
10
+
11
+
12
# Display name shown in the UI -> language code handed to the word-frequency
# lookup. Insertion order is the order of the dropdown choices.
LANGS = dict(
    [
        ("English", "en"),
        ("French", "fr"),
        ("German", "de"),
        ("Italian", "it"),
    ]
)
18
+
19
+
20
def tokenize_words(text: str):
    """Split *text* into word-like tokens.

    A token is a run of word characters, apostrophes and hyphens that starts
    and ends on a word boundary, so forms such as "don't" or "well-known"
    are kept as a single token.

    Returns:
        A list of the matched substrings, in order of appearance.
    """
    token_pattern = r"\b[\w'-]+\b"
    return [hit.group(0) for hit in re.finditer(token_pattern, text, flags=re.UNICODE)]
22
+
23
+
24
def suspicious_char_ratio(text: str):
    """Return the fraction of characters in *text* that fall outside the expected set.

    Expected characters are space, newline, carriage return, tab, word
    characters, the listed accented Latin ranges and the punctuation
    ``.,;:!?()'"%-``. Everything else counts as suspicious.

    Returns:
        Suspicious characters divided by the total character count, or 1.0
        when *text* is empty.
    """
    if text:
        unexpected = re.findall(
            r"[^ \n\r\t\wÀ-ÖØ-öø-ÿ.,;:!?()'\"%-]", text, flags=re.UNICODE
        )
        # text is non-empty here, so the length is at least 1.
        return len(unexpected) / len(text)
    return 1.0
29
+
30
+
31
def repeated_punct_ratio(text: str):
    """Return how many runs of repeated punctuation occur per character of *text*.

    A run is two or more consecutive copies of the same character from
    ``.,;:!?_-``; each run counts once regardless of its length.

    Returns:
        Number of runs divided by the character count, or 0.0 for empty text.
    """
    if text:
        runs = re.findall(r"([.,;:!?_\-])\1{1,}", text)
        # text is non-empty here, so the length is at least 1.
        return len(runs) / len(text)
    return 0.0
36
+
37
+
38
def digit_noise_ratio(text: str):
    """Return the share of tokens that glue ASCII letters and digits together.

    Counts standalone tokens that are digits followed by ASCII letters or
    ASCII letters followed by digits (for example "A1" or "14th"), and divides
    that count by the number of tokens produced by ``tokenize_words``.

    Returns:
        The ratio described above, or 0.0 for empty text.
    """
    if not text:
        return 0.0
    mixed_pattern = r"\b(?:\d+[A-Za-z]+|[A-Za-z]+\d+)\b"
    mixed_count = len(re.findall(mixed_pattern, text))
    token_count = len(tokenize_words(text))
    # Guard against division by zero when the text contains no tokens.
    return mixed_count / max(token_count, 1)
43
+
44
+
45
def uppercase_ratio(text: str):
    """Return the fraction of alphabetic characters in *text* that are uppercase.

    Non-alphabetic characters are ignored entirely.

    Returns:
        Uppercase letters divided by all letters, or 0.0 when *text* has no
        alphabetic characters.
    """
    letter_total = 0
    upper_total = 0
    for ch in text:
        if not ch.isalpha():
            continue
        letter_total += 1
        if ch.isupper():
            upper_total += 1
    if letter_total == 0:
        return 0.0
    return upper_total / letter_total
51
+
52
+
53
def broken_word_ratio(words):
    """Return the fraction of *words* that look malformed.

    A word of two or more characters is considered broken when any of the
    following holds:

    * it contains the same character three times in a row;
    * it is longer than 20 characters;
    * it mixes at least one digit with at least one letter.

    Single-character words are never flagged but still count in the
    denominator.

    Returns:
        Broken words divided by the total number of words, or 1.0 when
        *words* is empty.
    """
    if not words:
        return 1.0

    def looks_broken(word):
        if len(word) <= 1:
            return False
        if re.search(r"(.)\1\1", word) or len(word) > 20:
            return True
        has_digit = re.search(r"[0-9]", word) is not None
        has_letter = re.search(r"[A-Za-zÀ-ÖØ-öø-ÿ]", word) is not None
        return has_digit and has_letter

    broken_count = sum(1 for word in words if looks_broken(word))
    return broken_count / max(len(words), 1)
67
+
68
+
69
def lexical_plausibility(words, lang_code):
    """Estimate how many of *words* are common words in the given language.

    Each word is lowercased and looked up with ``zipf_frequency``; words of
    one character and purely numeric words are skipped. A word scoring at
    least 3.0 counts as plausible, and a word scoring below 2.5 is reported
    as rare.

    Args:
        words: Tokens to evaluate.
        lang_code: Language code passed through to ``zipf_frequency``.

    Returns:
        A ``(ratio, rare_words)`` tuple. ``ratio`` is the plausible share of
        the words that were looked up, and ``rare_words`` holds at most 20
        rare words in their original casing. The result is ``(0.0, [])`` when
        *words* is empty or nothing was eligible for lookup, and
        ``(0.5, [])`` when the frequency lookup is unavailable.
    """
    if not words:
        return 0.0, []
    if zipf_frequency is None:
        # No frequency data available: report a neutral score.
        return 0.5, []

    looked_up = []
    for word in words:
        lowered = word.lower()
        if len(lowered) > 1 and not lowered.isdigit():
            looked_up.append((word, zipf_frequency(lowered, lang_code)))

    if not looked_up:
        return 0.0, []

    rare_words = [word for word, freq in looked_up if freq < 2.5]
    plausible_count = sum(1 for _, freq in looked_up if freq >= 3.0)
    return plausible_count / len(looked_up), rare_words[:20]
91
+
92
+
93
def line_length_stability(text: str):
    """Score how uniform the non-blank line lengths of *text* are.

    Lines are stripped of surrounding whitespace and blank lines are dropped.
    The score is one minus the coefficient of variation (population standard
    deviation divided by the mean) of the remaining line lengths, floored
    at 0.0.

    Returns:
        A float in ``[0.0, 1.0]``; 1.0 when fewer than two non-blank lines
        are present.
    """
    lengths = [len(line) for line in map(str.strip, text.splitlines()) if line]
    count = len(lengths)
    if count < 2:
        return 1.0
    average = sum(lengths) / count
    if average == 0:
        return 1.0
    variance = sum((length - average) ** 2 for length in lengths) / count
    return max(0.0, 1.0 - (math.sqrt(variance) / average))
104
+
105
+
106
def compute_ocr_quality(text, language):
    """Compute a heuristic, reference-free OCR quality score for *text*.

    The score starts at 100, is reduced by weighted penalties for suspicious
    characters, repeated punctuation, letter/digit mixes, broken-looking
    words, low lexical plausibility and heavy uppercase use, and receives a
    small bonus for stable line lengths. It is finally rounded to two
    decimals and clamped to the range 0-100.

    Args:
        text: The OCR output to evaluate; ``None`` is treated as empty.
        language: A key of ``LANGS``; unknown values fall back to ``"en"``.

    Returns:
        A dict with the keys ``quality_score``, ``label``, ``details``
        (the individual metrics, rounded to four decimals) and ``bad_words``.
        Blank input yields a score of 0 with the label ``"No text"``.
    """
    text = (text or "").strip()
    if not text:
        return {
            "quality_score": 0,
            "label": "No text",
            "details": {},
            "bad_words": [],
        }

    words = tokenize_words(text)
    lang_code = LANGS.get(language, "en")

    suspicious = suspicious_char_ratio(text)
    repeated = repeated_punct_ratio(text)
    digit_noise = digit_noise_ratio(text)
    broken = broken_word_ratio(words)
    lex_score, bad_words = lexical_plausibility(words, lang_code)
    line_stability = line_length_stability(text)
    upper = uppercase_ratio(text)

    # Weighted penalties, applied in this order.
    penalties = (
        suspicious * 220,
        repeated * 180,
        digit_noise * 40,
        broken * 60,
        # Only penalise plausibility below 0.55 and uppercase share above 0.35.
        max(0, 0.55 - lex_score) * 90,
        max(0, upper - 0.35) * 40,
    )
    score = 100
    for penalty in penalties:
        score -= penalty
    # Bonus for line-length stability above 0.5.
    score += max(0, line_stability - 0.5) * 10

    score = max(0, min(100, round(score, 2)))

    # Thresholds are checked from best to worst; the first one reached wins.
    label = "Very poor"
    for threshold, name in (
        (85, "Very good"),
        (70, "Good"),
        (50, "Medium"),
        (30, "Poor"),
    ):
        if score >= threshold:
            label = name
            break

    metrics = (
        ("suspicious_char_ratio", suspicious),
        ("repeated_punct_ratio", repeated),
        ("digit_noise_ratio", digit_noise),
        ("broken_word_ratio", broken),
        ("lexical_plausibility", lex_score),
        ("line_length_stability", line_stability),
        ("uppercase_ratio", upper),
    )
    details = {"words": len(words)}
    for key, value in metrics:
        details[key] = round(value, 4)

    return {
        "quality_score": score,
        "label": label,
        "details": details,
        "bad_words": bad_words,
    }
167
+
168
+
169
def analyze_text(text, language):
    """Run the OCR quality estimator and format its result for the UI.

    Args:
        text: The OCR text entered by the user.
        language: The language name selected in the dropdown.

    Returns:
        A ``(summary, metrics_md, suspicious_words)`` tuple: a Markdown
        headline with label and score, a Markdown bullet list with one line
        per metric, and a comma-separated string of up to 30 flagged words
        (``"None"`` when no word was flagged).
    """
    result = compute_ocr_quality(text, language)

    summary = f"### OCR quality: **{result['label']}**\n\n**Score:** {result['quality_score']} / 100"

    metric_lines = []
    for k, v in result["details"].items():
        metric_lines.append(f"- **{k}**: {v}")
    metrics_md = "\n".join(metric_lines)

    flagged = result["bad_words"]
    if flagged:
        suspicious_words = ", ".join(flagged[:30])
    else:
        suspicious_words = "None"

    return summary, metrics_md, suspicious_words
181
+
182
+
183
# Sample inputs offered in the UI: a clean transcription and a noisy OCR
# rendering of the same newspaper header.
_EXAMPLES = [
    [
        "THE OMAHA DAILY BEE, TUESDAY, JUNE 24, 1890 NEWS ABOUT THE BLUFFS Comparatively Little Damage Done by Sunday Night's Storm.",
        "English",
    ],
    [
        "THHJ C M A 14 A1 HAM p 0 _ _ THE OMAHA DAILY BEE , TUEBPAY , JUNE 24 , 1890 , _ _ NEWS ABOUT THE BLUFFS Comparatively Little Damage Done b , Sunday Night's Storm",
        "English",
    ],
]

# Gradio UI: OCR text plus language in; summary, metrics and flagged words out.
demo = gr.Interface(
    fn=analyze_text,
    inputs=[
        gr.Textbox(lines=18, label="OCR text"),
        gr.Dropdown(choices=list(LANGS), value="English", label="Language"),
    ],
    outputs=[
        gr.Markdown(label="Summary"),
        gr.Markdown(label="Metrics"),
        gr.Textbox(label="Potentially suspicious / rare words"),
    ],
    title="OCR Quality Detector",
    description="A lightweight reference-free OCR quality estimator based on text heuristics.",
    examples=_EXAMPLES,
)

# Start the web app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()