# app.py
"""
Handwritten -> Text Gradio app for Hugging Face Spaces.
Primary OCR: Microsoft TrOCR (handwritten).
Fallback: EasyOCR (if installed).
Supports upload and webcam captures.
"""

import io
import logging
import traceback
from typing import Optional

import gradio as gr
import torch
from PIL import Image, ImageOps

# Try to import TrOCR (transformers). If transformers or torch not available,
# the Space build will fail and you'll see logs — that's normal.
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

logger = logging.getLogger(__name__)

MODEL_NAME = "microsoft/trocr-small-handwritten"  # small model for faster builds
MAX_IMAGE_DIM = 1600  # longest side (px) allowed before the image is downscaled
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load TrOCR processor + model (this may download the model on first build)
processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME).to(device)

# Try to import EasyOCR as a fallback (optional, may increase build time)
try:
    import easyocr

    # instantiate reader with common languages; add more codes if you need them
    easyocr_reader = easyocr.Reader(["en", "hi"], gpu=torch.cuda.is_available())
    EASYOCR_AVAILABLE = True
except Exception as exc:
    # Deliberately best-effort: the app must still start without the fallback.
    # Log the reason so a missing/broken EasyOCR install is visible in the logs.
    logger.warning("EasyOCR fallback disabled: %s", exc)
    EASYOCR_AVAILABLE = False
    easyocr_reader = None


def preprocess_image(
    pil_image: Optional[Image.Image], max_dim: int = MAX_IMAGE_DIM
) -> Optional[Image.Image]:
    """Standardise an image: orientation, RGB, downscale if extremely large.

    Args:
        pil_image: Image to normalise, or ``None``.
        max_dim: Maximum allowed length of the longest side, in pixels.
            Larger images are scaled down proportionally to save memory/time.

    Returns:
        The normalised RGB image, or ``None`` when ``pil_image`` is ``None``.
    """
    if pil_image is None:
        return None

    # Apply the EXIF orientation before any other transformation so the tag is
    # read from the untouched upload.
    pil_image = ImageOps.exif_transpose(pil_image)
    if pil_image.mode != "RGB":
        pil_image = pil_image.convert("RGB")

    longest_side = max(pil_image.size)
    if longest_side > max_dim:
        scale = max_dim / longest_side
        width, height = pil_image.size
        # Clamp to 1 px: int() truncation would otherwise yield a zero-sized
        # dimension for very elongated images, which resize() rejects.
        new_size = (max(1, int(width * scale)), max(1, int(height * scale)))
        pil_image = pil_image.resize(new_size, Image.LANCZOS)
    return pil_image


def trotocr_recognize(pil_image: Image.Image) -> str:
    """Run Microsoft TrOCR on one image and return the stripped text."""
    pixel_values = processor(images=pil_image, return_tensors="pt").pixel_values.to(device)
    # generation parameters can be tuned
    generated_ids = model.generate(pixel_values, max_length=512)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_text.strip()


def easyocr_recognize(pil_image: Image.Image) -> str:
    """Run EasyOCR if available (fallback).

    Returns:
        The recognised text fragments joined by newlines, or ``""`` when
        EasyOCR is not available.
    """
    if not EASYOCR_AVAILABLE or easyocr_reader is None:
        return ""
    # easyocr expects numpy array; imported lazily because only the fallback
    # path needs it
    import numpy as np

    arr = np.array(pil_image)
    results = easyocr_reader.readtext(arr)
    # results: list of (bbox, text, confidence)
    texts = [r[1] for r in results]
    return "\n".join(texts).strip()


def transcribe(image: Image.Image) -> str:
    """Main wrapper: preprocess -> try TrOCR -> fallback EasyOCR -> return best result.

    Args:
        image: PIL image supplied by the Gradio image component, or ``None``.

    Returns:
        The recognised text, a hint when nothing was recognised, or a
        friendly error message (with traceback) when recognition failed.
    """
    if image is None:
        return "No image provided."
    try:
        img = preprocess_image(image)
        # Primary: TrOCR
        text = trotocr_recognize(img)
        # If TrOCR returns something short/empty and EasyOCR is available, try fallback
        if len(text) < 3 and EASYOCR_AVAILABLE:
            fallback = easyocr_recognize(img)
            if fallback:
                return fallback
        return text if text else "No text recognised. Try a clearer photo or crop the writing."
    except Exception as e:
        # In Spaces it's useful to show a friendly error + a short traceback
        logger.exception("Recognition failed")
        tb = traceback.format_exc()
        return f"Error during recognition:\n{e}\n\nTraceback:\n{tb}"


def _gradio_major_version() -> int:
    """Return Gradio's major version number, or 0 when it cannot be parsed."""
    try:
        return int(str(gr.__version__).split(".", 1)[0])
    except (AttributeError, ValueError):
        return 0


def _build_image_input(label: str):
    """Create the image input using the keyword names of the installed Gradio.

    Gradio 4+ takes ``sources=[...]``, which lets one component offer both
    upload and webcam; the singular ``source=`` and ``tool=`` keywords belong
    to the 3.x API. On 3.x the original upload-only configuration is kept.
    """
    if _gradio_major_version() >= 4:
        return gr.Image(sources=["upload", "webcam"], type="pil", label=label)
    return gr.Image(source="upload", type="pil", tool="editor", label=label)


title = "Handwritten → Text (TrOCR) — Upload or take a photo"
description = """
Upload a photo of handwritten notes or click the camera icon to take a picture.
This app uses Microsoft TrOCR (handwritten model). For some scripts EasyOCR is used as a fallback.
Tip: crop tightly around the writing for better results.
"""

with gr.Blocks(css=".footer {display:none !important;}") as demo:
    gr.Markdown(f"# {title}\n\n{description}")
    with gr.Row():
        img = _build_image_input("Upload or use webcam (choose from dropdown)")
        out = gr.Textbox(label="Recognised text", lines=12)
    with gr.Row():
        btn = gr.Button("Transcribe")
        clear = gr.Button("Clear")
    info = gr.Markdown(
        f"Model: {MODEL_NAME}. EasyOCR fallback: "
        f"{'enabled' if EASYOCR_AVAILABLE else 'not installed'}."
    )
    btn.click(fn=transcribe, inputs=img, outputs=out)
    # Reset both the image input and the output textbox.
    clear.click(fn=lambda: (None, ""), inputs=None, outputs=[img, out])
    gr.Markdown(
        "### Notes\n"
        "- For multi-line pages, crop to a single column of writing when possible.\n"
        "- If your language is not recognised well, consider fine-tuning or using EasyOCR with extra languages.\n"
        "- This Space may be slow on the free tier (CPU only). Consider a smaller model or a paid GPU space."
    )

if __name__ == "__main__":
    demo.launch()