Spaces:
Paused
Paused
| # app.py | |
| import gradio as gr | |
| from PIL import Image | |
| from transformers import VisionEncoderDecoderModel, TrOCRProcessor | |
| import torch | |
print("--- Initializing Solver Service ---")

# Checkpoint id shared by the processor and the model below.
_MODEL_ID = "anuashok/ocr-captcha-v3"

# Run on CUDA when torch reports it as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# --- Heavy objects are built once here, at startup, and reused by every call ---
print("1. Loading TrOCR processor...")
processor = TrOCRProcessor.from_pretrained(_MODEL_ID, use_fast=True)
print(" - Processor loaded.")

print("2. Loading VisionEncoderDecoder model...")
model = VisionEncoderDecoderModel.from_pretrained(_MODEL_ID)
model = model.to(device)  # move the weights to the selected device
print(" - Model loaded.")

print(f"--- Model is running on: {device.upper()} ---")
# --- End of startup loading ---
def solve_captcha(input_image: Image.Image) -> str:
    """Solve a CAPTCHA image using the pre-loaded TrOCR processor and model.

    The image is converted to RGBA, composited onto an opaque white
    background (so transparent regions become white), converted to RGB,
    and passed through the module-level ``processor`` and ``model``.

    Args:
        input_image: PIL image supplied by the Gradio ``Image`` input.
            Gradio passes ``None`` when the request carries no image.

    Returns:
        The first decoded sequence, with special tokens skipped.

    Raises:
        gr.Error: If no image was supplied.
    """
    print("--- Received image for solving ---")

    # Fail with a readable message instead of an AttributeError on `.convert`.
    if input_image is None:
        raise gr.Error("No image provided. Please upload a CAPTCHA image.")

    # 1. Convert the input to RGBA so it can be alpha-composited.
    image = input_image.convert("RGBA")

    # 2. Prepare a fully opaque white background of the same size.
    background = Image.new("RGBA", image.size, (255, 255, 255, 255))

    # 3. Composite the image onto the white background and convert to RGB.
    combined = Image.alpha_composite(background, image).convert("RGB")
    print(" - Image pre-processing complete.")

    # 4. Turn the image into model input tensors on the model's device.
    pixel_values = processor(images=combined, return_tensors="pt").pixel_values.to(device)
    print(" - Image prepared for model.")

    # 5. Run model inference.
    generated_ids = model.generate(pixel_values)
    print(" - Model inference complete.")

    # 6. Decode the result; a single image is sent, so take the first entry.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(f" - Decoding complete. Result: {generated_text}")

    return generated_text
# --- Build the Gradio interface (UI + API endpoint) and start serving ---
captcha_input = gr.Image(type="pil", label="Upload CAPTCHA Image")
result_output = gr.Textbox(label="Result")

demo = gr.Interface(
    fn=solve_captcha,
    inputs=captcha_input,
    outputs=result_output,
    title="TrOCR CAPTCHA Solver (Custom Logic)",
    description="An API for the anuashok/ocr-captcha-v3 model using specific pre-processing.",
)
demo.launch()