Spaces:
Sleeping
Sleeping
Luis J Camargo committed on
Commit ·
0c82e96
1
Parent(s): cbab00e
new setts2
Browse files
app.py
CHANGED
|
@@ -22,7 +22,6 @@ logging.basicConfig(
|
|
| 22 |
logger = logging.getLogger("TachiwinOCR")
|
| 23 |
|
| 24 |
DEVICE = "cpu"
|
| 25 |
-
# Speed up CPU inference
|
| 26 |
torch.set_num_threads(os.cpu_count() or 4)
|
| 27 |
|
| 28 |
PROMPTS = {
|
|
@@ -82,45 +81,50 @@ class OCRModelManager(object):
|
|
| 82 |
try:
|
| 83 |
img_path = args[0]
|
| 84 |
task = kwargs.get("task", "ocr")
|
| 85 |
-
min_new_tokens = kwargs.get("min_new_tokens",
|
| 86 |
-
max_new_tokens = kwargs.get("max_new_tokens",
|
| 87 |
temperature = kwargs.get("temperature", 0.2)
|
| 88 |
-
|
| 89 |
logger.info(f"--- Starting inference process ---")
|
| 90 |
logger.info(f"Task: {task}, Min New Tokens: {min_new_tokens}, Temperature: {temperature}")
|
| 91 |
|
| 92 |
image = Image.open(img_path).convert("RGB")
|
| 93 |
-
|
| 94 |
messages = [
|
| 95 |
-
{"role": "user",
|
| 96 |
-
|
| 97 |
-
{"type": "image"
|
| 98 |
{"type": "text", "text": PROMPTS[task]},
|
| 99 |
]
|
| 100 |
}
|
| 101 |
]
|
| 102 |
|
| 103 |
-
|
| 104 |
-
messages,
|
| 105 |
-
tokenize=
|
| 106 |
-
add_generation_prompt=True
|
| 107 |
-
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
).to(DEVICE)
|
| 110 |
|
| 111 |
logger.info(f"Inputs prepared (shape: {inputs['input_ids'].shape}). Running model.generate...")
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
min_p=0.1,
|
| 123 |
-
)
|
| 124 |
|
| 125 |
logger.info("Generation complete. Decoding results...")
|
| 126 |
decoded_outputs = processor.batch_decode(outputs, skip_special_tokens=True)[0]
|
|
@@ -138,23 +142,12 @@ def create_model():
|
|
| 138 |
model_path = "tachiwin/PaddleOCR-VL-Tachiwin-BF16"
|
| 139 |
logger.info(f"Loading model and processor from {model_path}...")
|
| 140 |
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
torch_dtype=torch.bfloat16
|
| 148 |
-
).to(DEVICE).eval()
|
| 149 |
-
logger.info(f"Model loaded on {DEVICE} with bfloat16")
|
| 150 |
-
except Exception as e:
|
| 151 |
-
logger.warning(f"Failed to load in bfloat16, falling back to float32: {e}")
|
| 152 |
-
model = AutoModelForCausalLM.from_pretrained(
|
| 153 |
-
model_path,
|
| 154 |
-
trust_remote_code=True,
|
| 155 |
-
torch_dtype=torch.float32
|
| 156 |
-
).to(DEVICE).eval()
|
| 157 |
-
logger.info(f"Model loaded on {DEVICE} with float32")
|
| 158 |
|
| 159 |
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
|
| 160 |
logger.info(f"Processor loaded successfully.")
|
|
@@ -184,26 +177,12 @@ def inference(img):
|
|
| 184 |
return model_manager.infer(
|
| 185 |
img,
|
| 186 |
task="ocr",
|
| 187 |
-
min_new_tokens=
|
| 188 |
-
max_new_tokens=
|
|
|
|
|
|
|
| 189 |
)
|
| 190 |
|
| 191 |
-
# # Now extract text from the serialized structure
|
| 192 |
-
# extracted_texts = []
|
| 193 |
-
|
| 194 |
-
# for page in serialized_result:
|
| 195 |
-
# if isinstance(page, dict) and 'parsing_res_list' in page:
|
| 196 |
-
# for block in page['parsing_res_list']:
|
| 197 |
-
# if isinstance(block, dict) and 'content' in block and block['content']:
|
| 198 |
-
# extracted_texts.append(block['content'])
|
| 199 |
-
|
| 200 |
-
# if not extracted_texts:
|
| 201 |
-
# # return json as string
|
| 202 |
-
# return json.dumps(serialized_result, indent=4)
|
| 203 |
-
|
| 204 |
-
# # Join all text blocks with double newlines
|
| 205 |
-
# return "\n\n".join(extracted_texts)
|
| 206 |
-
|
| 207 |
except Exception as e:
|
| 208 |
import traceback
|
| 209 |
error_detail = traceback.format_exc()
|
|
|
|
| 22 |
logger = logging.getLogger("TachiwinOCR")
|
| 23 |
|
| 24 |
DEVICE = "cpu"
|
|
|
|
| 25 |
torch.set_num_threads(os.cpu_count() or 4)
|
| 26 |
|
| 27 |
PROMPTS = {
|
|
|
|
| 81 |
try:
|
| 82 |
img_path = args[0]
|
| 83 |
task = kwargs.get("task", "ocr")
|
| 84 |
+
min_new_tokens = kwargs.get("min_new_tokens", 1)
|
| 85 |
+
max_new_tokens = kwargs.get("max_new_tokens", 128)
|
| 86 |
temperature = kwargs.get("temperature", 0.2)
|
| 87 |
+
min_p = kwargs.get("min_p", 0.1)
|
| 88 |
logger.info(f"--- Starting inference process ---")
|
| 89 |
logger.info(f"Task: {task}, Min New Tokens: {min_new_tokens}, Temperature: {temperature}")
|
| 90 |
|
| 91 |
image = Image.open(img_path).convert("RGB")
|
| 92 |
+
|
| 93 |
messages = [
|
| 94 |
+
{"role": "user",
|
| 95 |
+
"content": [
|
| 96 |
+
{"type": "image"},
|
| 97 |
{"type": "text", "text": PROMPTS[task]},
|
| 98 |
]
|
| 99 |
}
|
| 100 |
]
|
| 101 |
|
| 102 |
+
text_prompt = processor.tokenizer.apply_chat_template(
|
| 103 |
+
messages,
|
| 104 |
+
tokenize=False,
|
| 105 |
+
add_generation_prompt=True
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
logger.info(f"Text prompt: {text_prompt}")
|
| 109 |
+
|
| 110 |
+
inputs = processor(
|
| 111 |
+
image,
|
| 112 |
+
text_prompt,
|
| 113 |
+
add_special_tokens=False,
|
| 114 |
+
return_tensors="pt",
|
| 115 |
).to(DEVICE)
|
| 116 |
|
| 117 |
logger.info(f"Inputs prepared (shape: {inputs['input_ids'].shape}). Running model.generate...")
|
| 118 |
+
logger.info(inputs)
|
| 119 |
+
outputs = model.generate(
|
| 120 |
+
**inputs,
|
| 121 |
+
max_new_tokens=max_new_tokens,
|
| 122 |
+
min_new_tokens=min_new_tokens,
|
| 123 |
+
use_cache=False,
|
| 124 |
+
do_sample=True,
|
| 125 |
+
temperature=temperature,
|
| 126 |
+
min_p=min_p,
|
| 127 |
+
)
|
|
|
|
|
|
|
| 128 |
|
| 129 |
logger.info("Generation complete. Decoding results...")
|
| 130 |
decoded_outputs = processor.batch_decode(outputs, skip_special_tokens=True)[0]
|
|
|
|
| 142 |
model_path = "tachiwin/PaddleOCR-VL-Tachiwin-BF16"
|
| 143 |
logger.info(f"Loading model and processor from {model_path}...")
|
| 144 |
|
| 145 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 146 |
+
model_path,
|
| 147 |
+
trust_remote_code=True,
|
| 148 |
+
torch_dtype=torch.bfloat16
|
| 149 |
+
).to(DEVICE).eval()
|
| 150 |
+
logger.info(f"Model loaded on {DEVICE} with bfloat16")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
|
| 152 |
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
|
| 153 |
logger.info(f"Processor loaded successfully.")
|
|
|
|
| 177 |
return model_manager.infer(
|
| 178 |
img,
|
| 179 |
task="ocr",
|
| 180 |
+
min_new_tokens=1,
|
| 181 |
+
max_new_tokens=128,
|
| 182 |
+
temperature=1.5,
|
| 183 |
+
min_p=0.1,
|
| 184 |
)
|
| 185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
except Exception as e:
|
| 187 |
import traceback
|
| 188 |
error_detail = traceback.format_exc()
|