Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -14,9 +14,11 @@ import matplotlib.pyplot as plt
|
|
| 14 |
import math
|
| 15 |
from datetime import datetime
|
| 16 |
import re
|
| 17 |
-
from difflib import SequenceMatcher
|
| 18 |
from termcolor import colored
|
| 19 |
-
from
|
|
|
|
|
|
|
|
|
|
| 20 |
# --------- Globals --------- #
|
| 21 |
CHARS = string.ascii_letters + string.digits + string.punctuation
|
| 22 |
CHAR2IDX = {c: i + 1 for i, c in enumerate(CHARS)} # Start from 1
|
|
@@ -28,7 +30,14 @@ IMAGE_WIDTH = 128
|
|
| 28 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 29 |
font_path = None
|
| 30 |
ocr_model = None
|
|
|
|
|
|
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
# --------- Dataset --------- #
|
| 34 |
class OCRDataset(Dataset):
|
|
@@ -273,14 +282,7 @@ def color_char(c, conf):
|
|
| 273 |
|
| 274 |
|
| 275 |
|
| 276 |
-
|
| 277 |
-
decoder = CTCBeamDecoder(
|
| 278 |
-
labels=[IDX2CHAR[i] for i in range(len(IDX2CHAR))],
|
| 279 |
-
blank_id=BLANK_IDX,
|
| 280 |
-
beam_width=10, # try 10–20 for best results
|
| 281 |
-
num_processes=4,
|
| 282 |
-
log_probs_input=True
|
| 283 |
-
)
|
| 284 |
|
| 285 |
def predict_text(image: Image.Image, ground_truth: str = None, debug: bool = False):
|
| 286 |
if ocr_model is None:
|
|
@@ -291,37 +293,32 @@ def predict_text(image: Image.Image, ground_truth: str = None, debug: bool = Fal
|
|
| 291 |
transforms.ToTensor(),
|
| 292 |
transforms.Normalize((0.5,), (0.5,))
|
| 293 |
])
|
| 294 |
-
img_tensor = transform(processed).unsqueeze(0).to(device)
|
| 295 |
|
| 296 |
ocr_model.eval()
|
| 297 |
with torch.no_grad():
|
| 298 |
-
output = ocr_model(img_tensor)
|
| 299 |
-
log_probs = output.log_softmax(2)
|
| 300 |
-
output_lengths = torch.full((1,), log_probs.size(1), dtype=torch.int32)
|
| 301 |
|
| 302 |
-
|
| 303 |
-
beam_results, beam_scores, timesteps, out_lens = decoder.decode(log_probs, output_lengths)
|
| 304 |
|
| 305 |
-
|
| 306 |
-
|
|
|
|
|
|
|
| 307 |
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
# Colorize with fake uniform confidence (we don’t get per-char conf from decoder)
|
| 312 |
-
colorized_chars = [color_char(c, avg_conf) for c in pred_chars]
|
| 313 |
pretty_output = ''.join(colorized_chars)
|
| 314 |
|
| 315 |
-
pred_text = ''.join(pred_chars)
|
| 316 |
-
|
| 317 |
sim_score = ""
|
| 318 |
if ground_truth:
|
| 319 |
similarity = SequenceMatcher(None, ground_truth, pred_text).ratio()
|
| 320 |
sim_score = f"<br><strong>Levenshtein Similarity:</strong> {similarity:.2%}"
|
| 321 |
|
| 322 |
if debug:
|
| 323 |
-
print("Decoded
|
| 324 |
-
print("
|
| 325 |
if ground_truth:
|
| 326 |
print("Ground Truth:", ground_truth)
|
| 327 |
|
|
|
|
| 14 |
import math
|
| 15 |
from datetime import datetime
|
| 16 |
import re
|
|
|
|
| 17 |
from termcolor import colored
|
| 18 |
+
from pyctcdecode import BeamSearchDecoderCTC, Alphabet
|
| 19 |
+
from difflib import SequenceMatcher
|
| 20 |
+
|
| 21 |
+
|
| 22 |
# --------- Globals --------- #
# Character inventory the OCR model can emit: letters, digits, punctuation.
CHARS = string.ascii_letters + string.digits + string.punctuation
# Map every character to a 1-based class index; index 0 stays free for the CTC blank.
CHAR2IDX = {c: i for i, c in enumerate(CHARS, start=1)}
|
|
|
| 30 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
font_path = None   # configured elsewhere at startup; None until then
ocr_model = None   # loaded lazily; predict_text() guards against None

# Vocabulary for the CTC beam-search decoder. The label list must line up
# one-to-one with the model's output class indices, so size it from the
# largest index actually present in IDX2CHAR rather than len(IDX2CHAR):
# IDX2CHAR is built 1-based (see CHAR2IDX), so if the blank at index 0 is
# not stored in the dict, len(IDX2CHAR) undercounts by one and the last
# character would be silently dropped from the vocabulary.
# NOTE(review): IDX2CHAR is defined in lines not visible here — confirm it
# is the inverse of CHAR2IDX (1-based) with the blank at index 0.
# Indices absent from IDX2CHAR (e.g. the CTC blank) map to "".
num_classes = (max(IDX2CHAR) + 1) if IDX2CHAR else 0
labels = [IDX2CHAR.get(i, "") for i in range(num_classes)]

# Wrap the label list in a pyctcdecode Alphabet, then build the decoder.
alphabet = Alphabet.build_alphabet(labels)
decoder = BeamSearchDecoderCTC(alphabet)
| 41 |
|
| 42 |
# --------- Dataset --------- #
|
| 43 |
class OCRDataset(Dataset):
|
|
|
|
| 282 |
|
| 283 |
|
| 284 |
|
| 285 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
|
| 287 |
def predict_text(image: Image.Image, ground_truth: str = None, debug: bool = False):
|
| 288 |
if ocr_model is None:
|
|
|
|
| 293 |
transforms.ToTensor(),
|
| 294 |
transforms.Normalize((0.5,), (0.5,))
|
| 295 |
])
|
| 296 |
+
img_tensor = transform(processed).unsqueeze(0).to(device) # (1, C, H, W)
|
| 297 |
|
| 298 |
ocr_model.eval()
|
| 299 |
with torch.no_grad():
|
| 300 |
+
output = ocr_model(img_tensor) # (1, T, C)
|
| 301 |
+
log_probs = output.log_softmax(2)[0] # (T, C)
|
|
|
|
| 302 |
|
| 303 |
+
pred_text = decoder.decode(log_probs.cpu().numpy()) # Best beam path
|
|
|
|
| 304 |
|
| 305 |
+
# Confidence: mean max prob per timestep
|
| 306 |
+
probs = log_probs.exp()
|
| 307 |
+
max_probs = probs.max(dim=1)[0]
|
| 308 |
+
avg_conf = max_probs.mean().item()
|
| 309 |
|
| 310 |
+
# Color each character (uniform confidence for now)
|
| 311 |
+
colorized_chars = [color_char(c, avg_conf) for c in pred_text]
|
|
|
|
|
|
|
|
|
|
| 312 |
pretty_output = ''.join(colorized_chars)
|
| 313 |
|
|
|
|
|
|
|
| 314 |
sim_score = ""
|
| 315 |
if ground_truth:
|
| 316 |
similarity = SequenceMatcher(None, ground_truth, pred_text).ratio()
|
| 317 |
sim_score = f"<br><strong>Levenshtein Similarity:</strong> {similarity:.2%}"
|
| 318 |
|
| 319 |
if debug:
|
| 320 |
+
print("Decoded Text:", pred_text)
|
| 321 |
+
print("Average Confidence:", avg_conf)
|
| 322 |
if ground_truth:
|
| 323 |
print("Ground Truth:", ground_truth)
|
| 324 |
|