prasanthmj commited on
Commit
99857c5
·
verified ·
1 Parent(s): 44614a0

Initial app: full braille OCR pipeline (YOLOv8 + ByT5)

Browse files
Files changed (3) hide show
  1. README.md +16 -5
  2. app.py +187 -0
  3. requirements.txt +7 -0
README.md CHANGED
@@ -1,12 +1,23 @@
1
  ---
2
  title: Braille Reader
3
- emoji: 🐠
4
- colorFrom: purple
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 6.9.0
8
  app_file: app.py
9
  pinned: false
 
 
 
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
1
  ---
2
  title: Braille Reader
3
+ emoji: 👁️
4
+ colorFrom: blue
5
+ colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: "5.23.0"
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
+ models:
12
+ - prasanthmj/braille-byt5-v3
13
+ - prasanthmj/yolov8-braille
14
  ---
15
 
16
+ # Braille Reader
17
+
18
+ Upload a scanned braille document and get its English translation.
19
+
20
+ - **Stage 1:** YOLOv8 detects braille cells in the image
21
+ - **Stage 2:** ByT5 translates detected braille (Grade 2 contracted) to English
22
+
23
+ Supports Grade 2 (contracted) braille — the form used in 90-95% of real braille documents.
app.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Braille Reader — Upload a braille image, get English text."""
2
+
3
+ import json
4
+ import tempfile
5
+ from pathlib import Path
6
+
7
+ import cv2
8
+ import gradio as gr
9
+ import numpy as np
10
+ import torch
11
+ from huggingface_hub import hf_hub_download
12
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
13
+ from ultralytics import YOLO
14
+
15
# --- Model loading (cached at startup) ---

# Hugging Face Hub repo IDs for the two pipeline stages.
YOLO_REPO = "prasanthmj/yolov8-braille"   # Stage 1: YOLOv8 braille-cell detector
BYT5_REPO = "prasanthmj/braille-byt5-v3"  # Stage 2: ByT5 Grade-2 braille -> English
19
+
20
def load_models():
    """Download weights from the Hub and initialise both pipeline stages.

    Returns:
        Tuple of (yolo_model, dot_to_unicode, tokenizer, byt5_model, device)
        where `dot_to_unicode` maps YOLO class names (dot patterns) to
        braille Unicode characters and `device` is "cuda" or "cpu".
    """
    # Stage 1: YOLOv8 braille-cell detector plus its dots -> Unicode table.
    weights = hf_hub_download(YOLO_REPO, "yolov8_braille.pt")
    mapping = hf_hub_download(YOLO_REPO, "braille_map.json")
    detector = YOLO(weights)
    dot_map = json.loads(Path(mapping).read_text())

    # Stage 2: ByT5 Grade-2 braille interpreter, moved to GPU when available.
    run_device = "cuda" if torch.cuda.is_available() else "cpu"
    tok = AutoTokenizer.from_pretrained(BYT5_REPO)
    translator = AutoModelForSeq2SeqLM.from_pretrained(BYT5_REPO).to(run_device)
    translator.eval()

    return detector, dot_map, tok, translator, run_device
37
+
38
+
39
# Load models once at import time so every Gradio request reuses warm models.
print("Loading models...")
yolo_model, dot_to_unicode, tokenizer, byt5_model, device = load_models()
print(f"Models loaded. Device: {device}")
42
+
43
# --- CLAHE Preprocessing ---

def preprocess_clahe(image_path: str) -> str:
    """Apply CLAHE contrast enhancement for better detection on low-contrast scans.

    Args:
        image_path: Path to the input image on disk.

    Returns:
        Path to an enhanced temporary JPEG, or the original `image_path`
        unchanged when OpenCV cannot read the file.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None for unreadable/corrupt files; fall back to
        # the original path so the pipeline can still attempt detection.
        return image_path
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(gray)
    enhanced_bgr = cv2.cvtColor(enhanced, cv2.COLOR_GRAY2BGR)

    # Close the temp-file handle immediately: the original left it open,
    # leaking a file descriptor per request and blocking cv2.imwrite on
    # Windows (which cannot reopen a file another handle holds open).
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
        out_path = tmp.name
    cv2.imwrite(out_path, enhanced_bgr)
    return out_path
58
+
59
# --- Stage 1: YOLOv8 Detection ---

def detect_braille(image_path: str, confidence: float = 0.15) -> list[list[dict]]:
    """Detect braille cells in an image and group them into reading-order lines.

    Args:
        image_path: Path to the (preprocessed) image file.
        confidence: Minimum YOLO confidence for a detection to be kept.

    Returns:
        A list of lines, top to bottom; each line is a left-to-right list of
        cell dicts with keys "dots" (class name), "unicode" (braille char,
        "?" when the pattern is missing from the map), and "confidence"
        (plain float). Empty list when nothing is detected.
    """
    results = yolo_model.predict(image_path, conf=confidence, verbose=False)
    boxes = results[0].boxes
    if len(boxes) == 0:
        return []

    # Assemble [cx, cy, w, h, conf, cls] per detection. The original copied
    # individual columns with four separate .cpu().numpy() round trips;
    # transfer each tensor from the (possibly GPU) device once instead.
    data = np.column_stack((
        boxes.xywh.cpu().numpy(),
        boxes.conf.cpu().numpy(),
        boxes.cls.cpu().numpy(),
    ))

    # Sort by vertical centre so line splitting sees top-to-bottom order.
    data = data[data[:, 1].argsort()]

    # A vertical gap larger than half the average cell height starts a new line.
    avg_height = np.mean(data[:, 3])
    y_threshold = avg_height / 2
    y_diffs = np.diff(data[:, 1])
    break_indices = np.where(y_diffs > y_threshold)[0]
    raw_lines = np.split(data, break_indices + 1)

    lines = []
    for raw_line in raw_lines:
        # Left-to-right order within a line.
        raw_line = raw_line[raw_line[:, 0].argsort()]
        cells = []
        for row in raw_line:
            dots = yolo_model.names[int(row[5])]
            cells.append({
                "dots": dots,
                "unicode": dot_to_unicode.get(dots, "?"),
                # Cast so callers get a plain float, not a numpy scalar.
                "confidence": float(row[4]),
            })
        lines.append(cells)

    return lines
104
+
105
# --- Stage 2: ByT5 Interpretation ---

def interpret_braille(braille_lines: list[str]) -> list[str]:
    """Translate braille Unicode lines to English with the ByT5 model.

    Blank/whitespace-only lines pass through as empty strings, so the
    output always has one entry per input line.
    """
    translations = []
    for braille in braille_lines:
        if not braille.strip():
            translations.append("")
            continue

        # Task-prefixed prompt, tokenised and moved to the model's device.
        prompt = f"translate Braille to English: {braille}"
        encoded = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        # Inference only — no gradients needed.
        with torch.no_grad():
            generated = byt5_model.generate(**encoded, max_length=512)

        translations.append(tokenizer.decode(generated[0], skip_special_tokens=True))

    return translations
126
+
127
# --- Main pipeline ---

def transcribe(image) -> str:
    """Full pipeline: image -> detection -> interpretation -> English text.

    Args:
        image: RGB numpy array from the Gradio image widget (None when
            nothing was uploaded).

    Returns:
        Human-readable report: English translation, detection details,
        and the raw braille Unicode; or a short message on empty/failed input.
    """
    if image is None:
        return "Please upload an image."

    # Persist the upload so OpenCV/YOLO can read it from disk. Close the
    # handle before cv2.imwrite opens its own — the original left it open,
    # leaking a descriptor per request (and breaking the write on Windows).
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
        image_path = tmp.name
    if isinstance(image, np.ndarray):
        # Gradio delivers RGB; OpenCV expects BGR.
        cv2.imwrite(image_path, cv2.cvtColor(image, cv2.COLOR_RGB2BGR))
    else:
        cv2.imwrite(image_path, image)

    processed_path = image_path
    try:
        # CLAHE preprocessing improves detection on low-contrast scans.
        processed_path = preprocess_clahe(image_path)

        # Stage 1: Detect braille cells grouped into lines.
        lines = detect_braille(processed_path)
        if not lines:
            return "No braille cells detected. Try a clearer image."

        # One Unicode braille string per detected line.
        braille_lines = ["".join(cell["unicode"] for cell in line) for line in lines]

        # Detection stats for the report.
        total_cells = sum(len(line) for line in lines)
        avg_conf = np.mean([cell["confidence"] for line in lines for cell in line])

        # Stage 2: Interpret with ByT5.
        english_lines = interpret_braille(braille_lines)

        braille_text = "\n".join(braille_lines)
        english_text = "\n".join(english_lines)

        return (
            f"{english_text}\n\n"
            "--- Details ---\n"
            f"Cells detected: {total_cells}\n"
            f"Lines: {len(lines)}\n"
            f"Avg confidence: {avg_conf:.1%}\n"
            f"\nBraille Unicode:\n{braille_text}"
        )
    finally:
        # The original never removed its temp files, leaking disk space on
        # every request. Best-effort cleanup of both (set dedups when
        # preprocess_clahe fell back to the original path).
        for stale in {image_path, processed_path}:
            try:
                Path(stale).unlink(missing_ok=True)
            except OSError:
                pass
173
+
174
# --- Gradio UI ---

# Single image in, plain text out; flagging disabled.
_image_input = gr.Image(type="numpy", label="Upload Braille Image")
_text_output = gr.Textbox(label="English Translation", lines=15)

demo = gr.Interface(
    fn=transcribe,
    inputs=_image_input,
    outputs=_text_output,
    title="Braille Reader",
    description="Upload a scanned braille document to get its English translation. Supports Grade 2 (contracted) braille.",
    examples=[],
    flagging_mode="never",
)

if __name__ == "__main__":
    # Start the local Gradio server when run as a script.
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ ultralytics>=8.0.0
2
+ opencv-python-headless>=4.0.0
3
+ torch>=2.0.0
4
+ transformers>=4.30.0
5
+ sentencepiece>=0.1.99
6
+ huggingface_hub>=0.20.0
7
+ numpy>=1.23.0