Update app.py
app.py CHANGED
@@ -14,13 +14,15 @@ import matplotlib.pyplot as plt
 import math
 from datetime import datetime
 import re
-
+from difflib import SequenceMatcher
+from termcolor import colored
+from ctcdecode import CTCBeamDecoder
 # --------- Globals --------- #
 CHARS = string.ascii_letters + string.digits + string.punctuation
-CHAR2IDX = {c: i + 1 for i, c in enumerate(CHARS)}
-CHAR2IDX["<BLANK>"] = 0
-BLANK_IDX = 0
+CHAR2IDX = {c: i + 1 for i, c in enumerate(CHARS)}  # Start from 1
+CHAR2IDX["<BLANK>"] = 0  # CTC blank
 IDX2CHAR = {v: k for k, v in CHAR2IDX.items()}
+BLANK_IDX = 0
 IMAGE_HEIGHT = 32
 IMAGE_WIDTH = 128
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
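Note on the index scheme in this hunk: index 0 is reserved for the CTC blank and real characters start at 1, so CTC decoding can collapse repeats and drop blanks safely. A minimal standalone sketch of the same scheme, shown only for illustration (not part of app.py):

import string

# Same construction as in the hunk: characters start at 1, index 0 is the CTC blank.
CHARS = string.ascii_letters + string.digits + string.punctuation
CHAR2IDX = {c: i + 1 for i, c in enumerate(CHARS)}
CHAR2IDX["<BLANK>"] = 0
BLANK_IDX = 0
IDX2CHAR = {v: k for k, v in CHAR2IDX.items()}

def encode(text):
    # String -> integer label sequence (no blanks inserted).
    return [CHAR2IDX[c] for c in text]

def ctc_greedy_collapse(indices):
    # Collapse repeats, then drop blanks: the standard CTC greedy rule.
    out, prev = [], None
    for idx in indices:
        if idx != prev and idx != BLANK_IDX:
            out.append(IDX2CHAR[idx])
        prev = idx
    return ''.join(out)

print(encode("Hi!"))                               # [34, 9, 63]
print(ctc_greedy_collapse([34, 34, 0, 9, 0, 63]))  # Hi!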
@@ -87,7 +89,10 @@ class OCRModel(nn.Module):
         x, _ = self.rnn(x)
         x = self.fc(x)
         return x
-
+def color_char(c, conf):
+    color_levels = ['\033[31m', '\033[33m', '\033[32m', '\033[36m', '\033[34m', '\033[35m', '\033[0m']
+    idx = min(int(conf * (len(color_levels) - 1)), len(color_levels) - 1)
+    return f"{color_levels[idx]}{c}\033[0m"
 
 def sanitize_filename(name):
     return re.sub(r'[^a-zA-Z0-9_-]', '_', name)
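The color_char added in this hunk is the ANSI/terminal variant; a later hunk defines an HTML variant for the Gradio UI. A quick standalone check of the ANSI version, with made-up per-character confidences:

# color_char copied from the hunk above so this runs on its own.
def color_char(c, conf):
    color_levels = ['\033[31m', '\033[33m', '\033[32m', '\033[36m', '\033[34m', '\033[35m', '\033[0m']
    idx = min(int(conf * (len(color_levels) - 1)), len(color_levels) - 1)
    return f"{color_levels[idx]}{c}\033[0m"

word = "hello"
confs = [0.15, 0.40, 0.65, 0.85, 0.99]  # made-up confidences, one per character

# Indices run red, yellow, green, cyan, blue, magenta; conf == 1.0 maps to the reset code.
print(''.join(color_char(c, k) for c, k in zip(word, confs)))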
@@ -239,31 +244,88 @@ def preprocess_image(image: Image.Image):
     return to_pil_image(padded)
 
 
-
+
+
+# ROYGBIV color ramp (low → high confidence)
+CONFIDENCE_COLORS = [
+    "#FF0000",  # Red
+    "#FF7F00",  # Orange
+    "#FFFF00",  # Yellow
+    "#00FF00",  # Green
+    "#00BFFF",  # Sky Blue
+    "#0000FF",  # Blue
+    "#8B00FF",  # Violet
+]
+
+def confidence_to_color(conf):
+    """
+    Map confidence (0.0–1.0) to a ROYGBIV-style hex color.
+    """
+    index = min(int(conf * (len(CONFIDENCE_COLORS) - 1)), len(CONFIDENCE_COLORS) - 1)
+    return CONFIDENCE_COLORS[index]
+
+def color_char(c, conf):
+    """
+    Wrap character `c` in a span tag with color mapped from `conf`.
+    """
+    color = confidence_to_color(conf)
+    return f'<span style="color:{color}; font-size:32pt; font-weight:bold;">{c}</span>'
+
+
+
+# Build decoder once (outside predict_text)
+decoder = CTCBeamDecoder(
+    labels=[IDX2CHAR[i] for i in range(len(IDX2CHAR))],
+    blank_id=BLANK_IDX,
+    beam_width=10,  # try 10–20 for best results
+    num_processes=4,
+    log_probs_input=True
+)
+
+def predict_text(image: Image.Image, ground_truth: str = None, debug: bool = False):
     if ocr_model is None:
         return "Please load or train a model first."
 
     processed = preprocess_image(image)
-
     transform = transforms.Compose([
         transforms.ToTensor(),
         transforms.Normalize((0.5,), (0.5,))
     ])
-    img_tensor = transform(processed).unsqueeze(0).to(device)
+    img_tensor = transform(processed).unsqueeze(0).to(device)
 
+    ocr_model.eval()
     with torch.no_grad():
         output = ocr_model(img_tensor)  # (B, T, C)
-        log_probs = output.log_softmax(2)
+        log_probs = output.log_softmax(2)  # (B, T, C)
+        output_lengths = torch.full((1,), log_probs.size(1), dtype=torch.int32)
+
+    # Beam decode
+    beam_results, beam_scores, timesteps, out_lens = decoder.decode(log_probs, output_lengths)
+
+    pred_indices = beam_results[0][0][:out_lens[0][0]].cpu().numpy().tolist()
+    pred_chars = [IDX2CHAR.get(idx, "?") for idx in pred_indices]
+
+    # Confidence estimation: mean probability of chosen path
+    avg_conf = torch.exp(beam_scores[0][0] / out_lens[0][0]).item()
 
-
+    # Colorize with fake uniform confidence (we don’t get per-char conf from decoder)
+    colorized_chars = [color_char(c, avg_conf) for c in pred_chars]
+    pretty_output = ''.join(colorized_chars)
 
-
-    max_probs = probs.max(2)[0].squeeze(1)  # (T,)
-    avg_conf = max_probs.mean().item()
+    pred_text = ''.join(pred_chars)
 
-
+    sim_score = ""
+    if ground_truth:
+        similarity = SequenceMatcher(None, ground_truth, pred_text).ratio()
+        sim_score = f"<br><strong>Levenshtein Similarity:</strong> {similarity:.2%}"
 
+    if debug:
+        print("Decoded Beam:", pred_text)
+        print("Beam Confidence:", avg_conf)
+        if ground_truth:
+            print("Ground Truth:", ground_truth)
 
+    return f"<strong>Prediction:</strong> {pretty_output}<br><strong>Confidence:</strong> {avg_conf:.2%}{sim_score}"
 
 
 # New helper function: generate label images grid
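The decoder indexing above takes the top beam of the first batch item (beam_results[0][0], truncated to out_lens[0][0]). A sketch of exercising the new predict_text directly, assuming a trained ocr_model is already loaded; the image path and ground-truth string are hypothetical placeholders:

from PIL import Image

# Hypothetical local test image; any word-strip crop works here.
img = Image.open("samples/word_strip.png")

# ground_truth is optional; when given, predict_text also reports a
# SequenceMatcher ratio between it and the decoded string.
html = predict_text(img, ground_truth="hello", debug=True)
print(html)  # HTML string, rendered by the gr.HTML output in the UI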
@@ -359,7 +421,7 @@ with gr.Blocks(css=custom_css) as demo:
 
     image_input = gr.Image(type="pil", label="Upload word strip")
     predict_btn = gr.Button("Predict")
-    output_text = gr.
+    output_text = gr.HTML(label="Recognized Text")
     model_status = gr.Textbox(label="Model Load Status")
 
     # Refresh dropdown choices
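The button wiring itself is outside this hunk. A sketch of how the new gr.HTML output would typically be connected inside the gr.Blocks context; the ground_truth_input textbox is an assumption and does not appear in the diff:

# Hypothetical wiring, placed inside the `with gr.Blocks(...) as demo:` block.
# predict_text returns an HTML string, so it pairs with the gr.HTML output above.
ground_truth_input = gr.Textbox(label="Ground truth (optional)")  # assumed, not in the diff

predict_btn.click(
    fn=predict_text,
    inputs=[image_input, ground_truth_input],
    outputs=output_text,
)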