emanuelaboros committed on
Commit
3e9b591
·
1 Parent(s): 6876bc3

change to gpt

Browse files
Files changed (1) hide show
  1. app.py +130 -50
app.py CHANGED
@@ -1,7 +1,9 @@
1
  import re
2
  import math
 
 
3
  import gradio as gr
4
- from collections import Counter
5
 
6
  try:
7
  from wordfreq import zipf_frequency
@@ -16,6 +18,22 @@ LANGS = {
16
  "Italian": "it",
17
  }
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  def tokenize_words(text: str):
21
  return re.findall(r"\b[\w'-]+\b", text, flags=re.UNICODE)
@@ -24,7 +42,9 @@ def tokenize_words(text: str):
24
  def suspicious_char_ratio(text: str):
25
  if not text:
26
  return 1.0
27
- suspicious = re.findall(r"[^ \n\r\t\wÀ-ÖØ-öø-ÿ.,;:!?()'\"%-]", text, flags=re.UNICODE)
 
 
28
  return len(suspicious) / max(len(text), 1)
29
 
30
 
@@ -36,10 +56,11 @@ def repeated_punct_ratio(text: str):
36
 
37
 
38
  def digit_noise_ratio(text: str):
39
- if not text:
 
40
  return 0.0
41
  weird_digit_patterns = re.findall(r"\b(?:\d+[A-Za-z]+|[A-Za-z]+\d+)\b", text)
42
- return len(weird_digit_patterns) / max(len(tokenize_words(text)), 1)
43
 
44
 
45
  def uppercase_ratio(text: str):
@@ -59,13 +80,26 @@ def broken_word_ratio(words):
59
  continue
60
  if re.search(r"(.)\1\1", w):
61
  broken += 1
62
- elif len(w) > 20:
63
  broken += 1
64
  elif re.search(r"[0-9]", w) and re.search(r"[A-Za-zÀ-ÖØ-öø-ÿ]", w):
65
  broken += 1
66
  return broken / max(len(words), 1)
67
 
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  def lexical_plausibility(words, lang_code):
70
  if not words:
71
  return 0.0, []
@@ -74,6 +108,7 @@ def lexical_plausibility(words, lang_code):
74
 
75
  scored = []
76
  bad_words = []
 
77
  for w in words:
78
  lw = w.lower()
79
  if len(lw) <= 1 or lw.isdigit():
@@ -84,30 +119,61 @@ def lexical_plausibility(words, lang_code):
84
  bad_words.append(w)
85
 
86
  if not scored:
87
- return 0.0, bad_words[:20]
88
 
89
  plausible = sum(1 for z in scored if z >= 3.0)
90
- return plausible / len(scored), bad_words[:20]
91
 
92
 
93
- def line_length_stability(text: str):
94
- lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
95
- if len(lines) < 2:
96
- return 1.0
97
- lengths = [len(ln) for ln in lines]
98
- mean = sum(lengths) / len(lengths)
99
- if mean == 0:
100
- return 1.0
101
- var = sum((x - mean) ** 2 for x in lengths) / len(lengths)
102
- std = math.sqrt(var)
103
- return max(0.0, 1.0 - (std / mean))
 
 
104
 
 
 
 
 
 
 
105
 
106
- def compute_ocr_quality(text, language):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  text = (text or "").strip()
108
  if not text:
109
  return {
110
- "quality_score": 0,
111
  "label": "No text",
112
  "details": {},
113
  "bad_words": [],
@@ -120,42 +186,38 @@ def compute_ocr_quality(text, language):
120
  repeated = repeated_punct_ratio(text)
121
  digit_noise = digit_noise_ratio(text)
122
  broken = broken_word_ratio(words)
123
- lex_score, bad_words = lexical_plausibility(words, lang_code)
124
  line_stability = line_length_stability(text)
125
  upper = uppercase_ratio(text)
 
 
126
 
127
- # Weighted score
128
- score = 100
129
  score -= suspicious * 220
130
  score -= repeated * 180
131
- score -= digit_noise * 40
132
- score -= broken * 60
133
- score -= max(0, 0.55 - lex_score) * 90
134
- score -= max(0, upper - 0.35) * 40
135
- score += max(0, line_stability - 0.5) * 10
136
-
137
- score = max(0, min(100, round(score, 2)))
138
-
139
- if score >= 85:
140
- label = "Very good"
141
- elif score >= 70:
142
- label = "Good"
143
- elif score >= 50:
144
- label = "Medium"
145
- elif score >= 30:
146
- label = "Poor"
147
- else:
148
- label = "Very poor"
149
 
150
  details = {
151
- "words": len(words),
152
  "suspicious_char_ratio": round(suspicious, 4),
153
  "repeated_punct_ratio": round(repeated, 4),
154
  "digit_noise_ratio": round(digit_noise, 4),
155
  "broken_word_ratio": round(broken, 4),
156
- "lexical_plausibility": round(lex_score, 4),
157
  "line_length_stability": round(line_stability, 4),
158
  "uppercase_ratio": round(upper, 4),
 
 
 
159
  }
160
 
161
  return {
@@ -166,18 +228,25 @@ def compute_ocr_quality(text, language):
166
  }
167
 
168
 
169
- def analyze_text(text, language):
170
- result = compute_ocr_quality(text, language)
 
 
171
 
172
- summary = f"### OCR quality: **{result['label']}**\n\n**Score:** {result['quality_score']} / 100"
173
 
174
- metrics_md = "\n".join(
175
- [f"- **{k}**: {v}" for k, v in result["details"].items()]
176
  )
177
 
178
  suspicious_words = ", ".join(result["bad_words"][:30]) if result["bad_words"] else "None"
179
 
180
- return summary, metrics_md, suspicious_words
 
 
 
 
 
181
 
182
 
183
  demo = gr.Interface(
@@ -185,6 +254,11 @@ demo = gr.Interface(
185
  inputs=[
186
  gr.Textbox(lines=18, label="OCR text"),
187
  gr.Dropdown(choices=list(LANGS.keys()), value="English", label="Language"),
 
 
 
 
 
188
  ],
189
  outputs=[
190
  gr.Markdown(label="Summary"),
@@ -192,17 +266,23 @@ demo = gr.Interface(
192
  gr.Textbox(label="Potentially suspicious / rare words"),
193
  ],
194
  title="OCR Quality Detector",
195
- description="A lightweight reference-free OCR quality estimator based on text heuristics.",
 
 
 
196
  examples=[
197
  [
198
  "THE OMAHA DAILY BEE, TUESDAY, JUNE 24, 1890 NEWS ABOUT THE BLUFFS Comparatively Little Damage Done by Sunday Night's Storm.",
199
  "English",
 
200
  ],
201
  [
202
  "THHJ C M A 14 A1 HAM p 0 _ _ THE OMAHA DAILY BEE , TUEBPAY , JUNE 24 , 1890 , _ _ NEWS ABOUT THE BLUFFS Comparatively Little Damage Done b , Sunday Night's Storm",
203
  "English",
 
204
  ],
205
  ],
 
206
  )
207
 
208
  if __name__ == "__main__":
 
1
  import re
2
  import math
3
+ from difflib import SequenceMatcher
4
+
5
  import gradio as gr
6
+ from transformers import AutoTokenizer
7
 
8
  try:
9
  from wordfreq import zipf_frequency
 
18
  "Italian": "it",
19
  }
20
 
21
+ TOKENIZER_MODELS = {
22
+ "GPT-2": "gpt2",
23
+ "XLM-RoBERTa": "xlm-roberta-base",
24
+ "mT5": "google/mt5-small",
25
+ }
26
+
27
+ _tokenizer_cache = {}
28
+
29
+
30
def get_tokenizer(model_name: str):
    """Return the tokenizer registered under *model_name*, loading it on first use.

    ``model_name`` is looked up in ``TOKENIZER_MODELS`` to obtain the checkpoint
    identifier handed to ``AutoTokenizer.from_pretrained``. The loaded object is
    stored in the module-level ``_tokenizer_cache`` so later calls with the same
    name reuse it. A name missing from ``TOKENIZER_MODELS`` raises ``KeyError``.
    """
    # Fast path: already loaded during an earlier call.
    if model_name in _tokenizer_cache:
        return _tokenizer_cache[model_name]

    checkpoint = TOKENIZER_MODELS[model_name]
    loaded = AutoTokenizer.from_pretrained(checkpoint)
    _tokenizer_cache[model_name] = loaded
    return loaded
36
+
37
 
38
  def tokenize_words(text: str):
39
  return re.findall(r"\b[\w'-]+\b", text, flags=re.UNICODE)
 
42
  def suspicious_char_ratio(text: str):
43
  if not text:
44
  return 1.0
45
+ suspicious = re.findall(
46
+ r"[^ \n\r\t\wÀ-ÖØ-öø-ÿ.,;:!?()'\"%&/\-]", text, flags=re.UNICODE
47
+ )
48
  return len(suspicious) / max(len(text), 1)
49
 
50
 
 
56
 
57
 
58
  def digit_noise_ratio(text: str):
59
+ words = tokenize_words(text)
60
+ if not words:
61
  return 0.0
62
  weird_digit_patterns = re.findall(r"\b(?:\d+[A-Za-z]+|[A-Za-z]+\d+)\b", text)
63
+ return len(weird_digit_patterns) / max(len(words), 1)
64
 
65
 
66
  def uppercase_ratio(text: str):
 
80
  continue
81
  if re.search(r"(.)\1\1", w):
82
  broken += 1
83
+ elif len(w) > 25:
84
  broken += 1
85
  elif re.search(r"[0-9]", w) and re.search(r"[A-Za-zÀ-ÖØ-öø-ÿ]", w):
86
  broken += 1
87
  return broken / max(len(words), 1)
88
 
89
 
90
def line_length_stability(text: str):
    """Score how uniform the non-blank line lengths of *text* are.

    Lines are stripped and blank ones discarded. The result is one minus the
    coefficient of variation (population standard deviation over mean) of the
    remaining line lengths, floored at 0.0. Text with fewer than two non-blank
    lines is reported as perfectly stable (1.0).
    """
    widths = [len(stripped) for stripped in map(str.strip, text.splitlines()) if stripped]
    count = len(widths)
    if count < 2:
        return 1.0

    average = sum(widths) / count
    if average == 0:
        return 1.0

    squared_deviation = sum((width - average) ** 2 for width in widths)
    spread = math.sqrt(squared_deviation / count)
    return max(0.0, 1.0 - (spread / average))
101
+
102
+
103
  def lexical_plausibility(words, lang_code):
104
  if not words:
105
  return 0.0, []
 
108
 
109
  scored = []
110
  bad_words = []
111
+
112
  for w in words:
113
  lw = w.lower()
114
  if len(lw) <= 1 or lw.isdigit():
 
119
  bad_words.append(w)
120
 
121
  if not scored:
122
+ return 0.0, bad_words[:30]
123
 
124
  plausible = sum(1 for z in scored if z >= 3.0)
125
+ return plausible / len(scored), bad_words[:30]
126
 
127
 
128
def tokenizer_fragmentation_metrics(text: str, tokenizer_name: str):
    """Measure how heavily a subword tokenizer splits the words of *text*.

    Each word from ``tokenize_words`` is tokenized on its own with the
    tokenizer returned by ``get_tokenizer(tokenizer_name)``.

    Returns a dict with three floats:
      - ``tokens_per_word``: total pieces divided by the number of words.
      - ``fragmented_word_ratio``: share of words split into more than 3 pieces.
      - ``single_char_piece_ratio``: share of pieces that are one character
        long once leading/trailing "▁" and "Ġ" are stripped (presumably the
        word-boundary markers some tokenizers emit; confirm per tokenizer).

    All three are 0.0 when the text has no words; the tokenizer is not
    loaded in that case.
    """
    words = tokenize_words(text)
    if not words:
        return dict.fromkeys(
            ("tokens_per_word", "fragmented_word_ratio", "single_char_piece_ratio"),
            0.0,
        )

    subword_tokenizer = get_tokenizer(tokenizer_name)
    pieces_by_word = [subword_tokenizer.tokenize(word) for word in words]
    piece_counts = [len(pieces) for pieces in pieces_by_word]

    word_total = len(words)
    piece_total = sum(piece_counts)
    lone_char_pieces = sum(
        1
        for pieces in pieces_by_word
        for piece in pieces
        if len(piece.strip("▁Ġ")) == 1
    )
    heavily_split_words = sum(1 for count in piece_counts if count > 3)

    return {
        "tokens_per_word": piece_total / word_total,
        "fragmented_word_ratio": heavily_split_words / word_total,
        "single_char_piece_ratio": lone_char_pieces / max(piece_total, 1),
    }
158
+
159
+
160
def classify_score(score: float):
    """Map a numeric quality *score* to its textual label.

    Bands are checked from highest to lowest and the first whose lower bound
    is met wins: 85 "Very good", 70 "Good", 50 "Medium", 30 "Poor". Anything
    below 30 is "Very poor".
    """
    bands = (
        (85, "Very good"),
        (70, "Good"),
        (50, "Medium"),
        (30, "Poor"),
    )
    for floor, name in bands:
        if score >= floor:
            return name
    return "Very poor"
170
+
171
+
172
+ def compute_ocr_quality(text: str, language: str, tokenizer_name: str):
173
  text = (text or "").strip()
174
  if not text:
175
  return {
176
+ "quality_score": 0.0,
177
  "label": "No text",
178
  "details": {},
179
  "bad_words": [],
 
186
  repeated = repeated_punct_ratio(text)
187
  digit_noise = digit_noise_ratio(text)
188
  broken = broken_word_ratio(words)
 
189
  line_stability = line_length_stability(text)
190
  upper = uppercase_ratio(text)
191
+ lexical_score, bad_words = lexical_plausibility(words, lang_code)
192
+ frag = tokenizer_fragmentation_metrics(text, tokenizer_name)
193
 
194
+ score = 100.0
 
195
  score -= suspicious * 220
196
  score -= repeated * 180
197
+ score -= digit_noise * 45
198
+ score -= broken * 65
199
+ score -= max(0.0, 0.55 - lexical_score) * 90
200
+ score -= max(0.0, frag["tokens_per_word"] - 1.8) * 25
201
+ score -= frag["fragmented_word_ratio"] * 60
202
+ score -= frag["single_char_piece_ratio"] * 40
203
+ score -= max(0.0, upper - 0.35) * 35
204
+ score += max(0.0, line_stability - 0.5) * 10
205
+
206
+ score = max(0.0, min(100.0, round(score, 2)))
207
+ label = classify_score(score)
 
 
 
 
 
 
 
208
 
209
  details = {
210
+ "word_count": len(words),
211
  "suspicious_char_ratio": round(suspicious, 4),
212
  "repeated_punct_ratio": round(repeated, 4),
213
  "digit_noise_ratio": round(digit_noise, 4),
214
  "broken_word_ratio": round(broken, 4),
215
+ "lexical_plausibility": round(lexical_score, 4),
216
  "line_length_stability": round(line_stability, 4),
217
  "uppercase_ratio": round(upper, 4),
218
+ "tokens_per_word": round(frag["tokens_per_word"], 4),
219
+ "fragmented_word_ratio": round(frag["fragmented_word_ratio"], 4),
220
+ "single_char_piece_ratio": round(frag["single_char_piece_ratio"], 4),
221
  }
222
 
223
  return {
 
228
  }
229
 
230
 
231
def explain_result(result):
    """Render a quality *result* dict as three display strings.

    *result* must provide the keys ``quality_score``, ``label``, ``details``
    (a mapping of metric name to value) and ``bad_words`` (a list of strings).

    Returns a ``(summary, metrics, suspicious_words)`` tuple: a Markdown
    headline with label and score, a Markdown bullet list with one line per
    metric, and a comma-separated list of at most 30 flagged words (or the
    string "None" when there are none).
    """
    score = result["quality_score"]
    label = result["label"]
    details = result["details"]

    headline = f"## OCR quality: **{label}**\n\n**Score:** {score}/100"

    bullet_lines = [f"- **{name}**: {value}" for name, value in details.items()]
    metrics_block = "\n".join(bullet_lines)

    flagged = result["bad_words"]
    if flagged:
        word_list = ", ".join(flagged[:30])
    else:
        word_list = "None"

    return headline, metrics_block, word_list
245
+
246
+
247
def analyze_text(text, language, tokenizer_name):
    """Score *text* and return the formatted result.

    Passes the three arguments straight to ``compute_ocr_quality`` and hands
    the resulting dict to ``explain_result``, whose return value is returned
    unchanged.
    """
    return explain_result(compute_ocr_quality(text, language, tokenizer_name))
250
 
251
 
252
  demo = gr.Interface(
 
254
  inputs=[
255
  gr.Textbox(lines=18, label="OCR text"),
256
  gr.Dropdown(choices=list(LANGS.keys()), value="English", label="Language"),
257
+ gr.Dropdown(
258
+ choices=list(TOKENIZER_MODELS.keys()),
259
+ value="XLM-RoBERTa",
260
+ label="Tokenizer used for fragmentation score",
261
+ ),
262
  ],
263
  outputs=[
264
  gr.Markdown(label="Summary"),
 
266
  gr.Textbox(label="Potentially suspicious / rare words"),
267
  ],
268
  title="OCR Quality Detector",
269
+ description=(
270
+ "A lightweight reference-free OCR quality estimator. "
271
+ "It combines OCR-noise heuristics, lexical plausibility, and tokenizer fragmentation."
272
+ ),
273
  examples=[
274
  [
275
  "THE OMAHA DAILY BEE, TUESDAY, JUNE 24, 1890 NEWS ABOUT THE BLUFFS Comparatively Little Damage Done by Sunday Night's Storm.",
276
  "English",
277
+ "XLM-RoBERTa",
278
  ],
279
  [
280
  "THHJ C M A 14 A1 HAM p 0 _ _ THE OMAHA DAILY BEE , TUEBPAY , JUNE 24 , 1890 , _ _ NEWS ABOUT THE BLUFFS Comparatively Little Damage Done b , Sunday Night's Storm",
281
  "English",
282
+ "XLM-RoBERTa",
283
  ],
284
  ],
285
+ allow_flagging="never",
286
  )
287
 
288
  if __name__ == "__main__":