Luis J Camargo committed
Commit 2886d21 · 1 Parent(s): 0c82e96

refactor 2

app.py CHANGED
@@ -1,19 +1,18 @@
 import os
 import torch
-from transformers import AutoModelForCausalLM, AutoProcessor
+from transformers import AutoModelForCausalLM, AutoProcessor, TextIteratorStreamer
 from PIL import Image
 import gradio as gr
-from queue import Queue, Empty
-from threading import Event, Thread
-import atexit
-
-CONCURRENCY_LIMIT = 1
-
-
+from threading import Thread
 import logging
 import sys
 
-#
+# --- Configuration ---
+CONCURRENCY_LIMIT = 1
+DEVICE = "cpu"
+DTYPE = torch.float32
+
+# Configure logging
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
@@ -21,7 +20,7 @@ logging.basicConfig(
 )
 logger = logging.getLogger("TachiwinOCR")
 
-
+# Set CPU threads
 torch.set_num_threads(os.cpu_count() or 4)
 
 PROMPTS = {
@@ -31,165 +30,106 @@ PROMPTS = {
     "chart": "Chart Recognition:",
 }
 
-
-class OCRModelManager:
-    def __init__(self, num_workers, model_factory):
-        self._model_factory = model_factory
-        self._queue = Queue()
-        self._workers = []
-        self._model_initialized_event = Event()
-        for _ in range(num_workers):
-            worker = Thread(target=self._worker, daemon=True)
-            worker.start()
-            self._model_initialized_event.wait()
-            self._model_initialized_event.clear()
-            self._workers.append(worker)
-
-    def infer(self, *args, **kwargs):
-        result_queue = Queue(maxsize=1)
-        self._queue.put((args, kwargs, result_queue))
-
-        # Increased timeout to 20 minutes for CPU inference
-        timeout = 1200
-        try:
-            success, payload = result_queue.get(timeout=timeout)
-            if success:
-                return payload
-            else:
-                raise payload
-        except Empty:
-            # Check if workers are still alive
-            alive = any(w.is_alive() for w in self._workers)
-            if not alive:
-                raise RuntimeError("OCR workers have crashed.")
-            raise RuntimeError(f"OCR inference timed out after {timeout} seconds.")
-
-    def close(self):
-        for _ in self._workers:
-            self._queue.put(None)
-        for worker in self._workers:
-            worker.join()
-
-    def _worker(self):
-        model, processor = self._model_factory()
-        self._model_initialized_event.set()
-        while True:
-            item = self._queue.get()
-            if item is None:
-                break
-            args, kwargs, result_queue = item
-            try:
-                img_path = args[0]
-                task = kwargs.get("task", "ocr")
-                min_new_tokens = kwargs.get("min_new_tokens", 1)
-                max_new_tokens = kwargs.get("max_new_tokens", 128)
-                temperature = kwargs.get("temperature", 0.2)
-                min_p = kwargs.get("min_p", 0.1)
-                logger.info(f"--- Starting inference process ---")
-                logger.info(f"Task: {task}, Min New Tokens: {min_new_tokens}, Temperature: {temperature}")
-
-                image = Image.open(img_path).convert("RGB")
+# --- Global Model Loading ---
+# We load the model globally so it persists across requests.
+# No need for a custom Manager class.
+model_path = "tachiwin/PaddleOCR-VL-Tachiwin-BF16"
 
-                messages = [
-                    {"role": "user",
-                     "content": [
-                        {"type": "image"},
-                        {"type": "text", "text": PROMPTS[task]},
-                     ]
-                    }
-                ]
-
-                text_prompt = processor.tokenizer.apply_chat_template(
-                    messages,
-                    tokenize=False,
-                    add_generation_prompt=True
-                )
-
-                logger.info(f"Text prompt: {text_prompt}")
-
-                inputs = processor(
-                    image,
-                    text_prompt,
-                    add_special_tokens=False,
-                    return_tensors="pt",
-                ).to(DEVICE)
-
-                logger.info(f"Inputs prepared (shape: {inputs['input_ids'].shape}). Running model.generate...")
-                logger.info(inputs)
-                outputs = model.generate(
-                    **inputs,
-                    max_new_tokens=max_new_tokens,
-                    min_new_tokens=min_new_tokens,
-                    use_cache=False,
-                    do_sample=True,
-                    temperature=temperature,
-                    min_p=min_p,
-                )
-
-                logger.info("Generation complete. Decoding results...")
-                decoded_outputs = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-                logger.info(f"Inference finished successfully.")
-
-                result_queue.put((True, decoded_outputs))
-            except Exception as e:
-                result_queue.put((False, e))
-            finally:
-                self._queue.task_done()
-
-
-def create_model():
-    """Initialize PaddleOCR-VL with the fine-tuned Tachiwin model using transformers"""
-    model_path = "tachiwin/PaddleOCR-VL-Tachiwin-BF16"
-    logger.info(f"Loading model and processor from {model_path}...")
+try:
+    logger.info(f"Loading processor from {model_path}...")
+    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
 
+    logger.info(f"Loading model from {model_path}...")
     model = AutoModelForCausalLM.from_pretrained(
         model_path,
         trust_remote_code=True,
-        torch_dtype=
+        torch_dtype=DTYPE
    ).to(DEVICE).eval()
-    logger.info(
-
-
-
-    return model, processor
-
-
-# Initialize model manager with 1 worker to save memory on CPU space
-logger.info("Initializing Tachiwin Indigenous Languages OCR model manager...")
-model_manager = OCRModelManager(1, create_model)
-logger.info("Model manager is ready and listening for tasks!")
-
-
-def close_model_manager():
-    model_manager.close()
-
-
-atexit.register(close_model_manager)
-
+    logger.info("Model loaded successfully.")
+except Exception as e:
+    logger.error(f"Failed to load model: {e}")
+    raise e
 
 def inference(img):
-    """
+    """
+    Process image with OCR and Stream the extracted text.
+    """
     if img is None:
-
+        yield "Please upload an image."
+        return
+
+    # Basic cleanup
+    if isinstance(img, str):
+        image = Image.open(img).convert("RGB")
+    else:
+        image = Image.fromarray(img).convert("RGB")
+
+    task = "ocr"
+
+    # Prepare inputs
+    messages = [
+        {"role": "user",
+         "content": [
+            {"type": "image"},
+            {"type": "text", "text": PROMPTS[task]},
+         ]
+        }
+    ]
 
-    gr.Info("Inference started. On CPU, this may take 2-10 minutes depending on image complexity.")
     try:
-
-
-
+        text_prompt = processor.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+
+        inputs = processor(
+            image,
+            text_prompt,
+            add_special_tokens=False,
+            return_tensors="pt",
+        ).to(DEVICE)
+
+        # Initialize Streamer
+        streamer = TextIteratorStreamer(
+            processor.tokenizer,
+            skip_prompt=True,
+            skip_special_tokens=True
+        )
+
+        # Generation Arguments
+        generation_kwargs = dict(
+            **inputs,
+            streamer=streamer,
+            max_new_tokens=256, # Increased slightly
             min_new_tokens=1,
-
-            temperature=1.5,
+            do_sample=True,
+            temperature=1.5, # Adjusted slightly for stability
             min_p=0.1,
+            use_cache=False # Cache helps speed on CPU significantly
         )
-
+
+        # Threading is REQUIRED for streaming
+        # The model generates in a separate thread, while the main thread
+        # yields from the streamer iterator.
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        generated_text = ""
+        for new_text in streamer:
+            generated_text += new_text
+            # Yielding here updates the Gradio textbox in real-time
+            yield generated_text
+
     except Exception as e:
         import traceback
         error_detail = traceback.format_exc()
-
-
+        logger.error(error_detail)
+        yield f"Error during OCR processing:\n\n```\n{error_detail}\n```"
 
 
+# --- Interface Setup ---
+
 title = '🌎 Tachiwin OCR for the Indigenous Languages of Mexico'
 
 description = '''
@@ -198,10 +138,8 @@ description = '''
 This model represents a **world first in tech access and linguistic rights**, specifically trained to recognize
 the diverse character and glyph repertoire of Mexico's 68 indigenous languages.
 
-**How to use:** Simply upload an image containing text in any Mexican indigenous language
-
-
-### Warning: as this free demonstrator space uses only CPU, a small image could take up to 5 minutes, so be patient.
+**How to use:** Simply upload an image containing text in any Mexican indigenous language.
+**Note:** Running on CPU. Streaming is enabled so you can see progress immediately.
 
 🔗 [PaddleOCR Documentation](https://github.com/PaddlePaddle/PaddleOCR)
 '''
@@ -230,12 +168,14 @@ example_labels = """
 
 css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;} .output_markdown {min-height: 30rem !important;}"
 
-gr.Interface(
-
-
-
-
-
+# Note: We replaced gr.Interface with gr.Blocks or used the generator compatible interface
+# But standard Interface supports generators in newer Gradio versions.
+# Just ensuring concurrency_limit is set.
+
+demo = gr.Interface(
+    fn=inference,
+    inputs=gr.Image(type='filepath', label='Input'),
+    outputs=gr.Markdown(label='Output', elem_classes="output_markdown"),
     title=title,
     description=description,
     examples=examples,
@@ -265,4 +205,7 @@ gr.Interface(
 
 Made with ❤️ for linguistic diversity and indigenous rights
 """
-)
+)
+
+if __name__ == "__main__":
+    demo.queue().launch()
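For context on the deleted code path: `OCRModelManager` serialized requests through a shared `Queue` consumed by daemon worker threads, with each `infer` call blocking on a per-request result queue under a timeout. Reduced to its essentials, that pattern looks roughly like the sketch below (names such as `WorkerPool`, `submit`, and `work_fn` are illustrative, not the original API):

```python
from queue import Queue, Empty
from threading import Thread

class WorkerPool:
    """Simplified sketch of the removed queue-and-worker pattern."""

    def __init__(self, num_workers, work_fn):
        self._work_fn = work_fn
        self._queue = Queue()
        self._workers = [Thread(target=self._worker, daemon=True)
                         for _ in range(num_workers)]
        for w in self._workers:
            w.start()

    def submit(self, *args, timeout=60):
        # Each request carries its own one-slot result queue.
        result_queue = Queue(maxsize=1)
        self._queue.put((args, result_queue))
        try:
            ok, payload = result_queue.get(timeout=timeout)
        except Empty:
            raise RuntimeError(f"timed out after {timeout}s")
        if ok:
            return payload
        raise payload  # re-raise the worker-side exception

    def _worker(self):
        while True:
            args, result_queue = self._queue.get()
            try:
                result_queue.put((True, self._work_fn(*args)))
            except Exception as e:
                result_queue.put((False, e))
```

The refactor drops this machinery because a single globally loaded model plus Gradio's own request queue (`CONCURRENCY_LIMIT = 1`) covers the same serialization need, as the new code's comments note.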
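The core of the replacement is the `TextIteratorStreamer` pattern: `model.generate` runs in a background thread and pushes decoded text into the streamer, while the caller iterates over fragments as they arrive. In isolation the pattern looks roughly like this (a minimal sketch using a small text-only model purely for illustration; the Space itself streams from the PaddleOCR-VL checkpoint through its `AutoProcessor`):

```python
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Small text-only model purely for illustration.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Streaming generation works by", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until decoding finishes, so it runs in a worker thread;
# the main thread consumes text fragments as they become available.
thread = Thread(target=model.generate,
                kwargs=dict(**inputs, streamer=streamer, max_new_tokens=30))
thread.start()

for fragment in streamer:
    print(fragment, end="", flush=True)
thread.join()
```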
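Finally, `inference` can `yield` partial strings because recent Gradio releases accept a generator function as the `fn` of a plain `gr.Interface` and re-render the output on each yield, provided the queue is enabled (which the commit does via `demo.queue().launch()`). A tiny self-contained illustration of that behavior, with a hypothetical `count_up` function:

```python
import time

import gradio as gr

def count_up(n):
    # Each yield repaints the output component in place.
    text = ""
    for i in range(int(n)):
        text += f"{i} "
        time.sleep(0.2)
        yield text

demo = gr.Interface(fn=count_up, inputs=gr.Number(value=10), outputs=gr.Markdown())

if __name__ == "__main__":
    demo.queue().launch()
```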