Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -717,56 +717,56 @@ def build_unified_summary(front_result: str, back_result: str, mrz_data: dict) -
|
|
| 717 |
def run_step1_extraction(model, processor, image, device, temperature, top_p, top_k, repetition_penalty):
|
| 718 |
"""Step 1: LLM → Raw OCR, original script, NO translation, NO coordinates"""
|
| 719 |
|
| 720 |
-
def _generate(prompt_text):
|
| 721 |
-
|
| 722 |
|
| 723 |
-
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
|
| 727 |
|
| 728 |
-
|
| 729 |
-
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
|
| 747 |
-
|
| 748 |
-
image_inputs, video_inputs = process_vision_info(messages)
|
| 749 |
-
|
| 750 |
-
inputs = processor(
|
| 751 |
-
text=[prompt],
|
| 752 |
-
images=image_inputs,
|
| 753 |
-
videos=video_inputs,
|
| 754 |
-
padding=True,
|
| 755 |
-
return_tensors="pt",
|
| 756 |
-
).to(device)
|
| 757 |
|
| 758 |
-
|
| 759 |
-
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
|
| 763 |
-
|
| 764 |
-
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 770 |
|
| 771 |
|
| 772 |
|
|
|
|
| 717 |
def run_step1_extraction(model, processor, image, device, temperature, top_p, top_k, repetition_penalty):
|
| 718 |
"""Step 1: LLM → Raw OCR, original script, NO translation, NO coordinates"""
|
| 719 |
|
| 720 |
+
def _generate(prompt_text):
    """Run one vision-language generation pass over the enclosing `image`.

    Nested inside `run_step1_extraction`; closes over `model`, `processor`,
    `image`, `device` and the sampling parameters from the enclosing scope.
    Builds a Qwen3VL chat prompt around `prompt_text`, runs `model.generate`,
    and returns the decoded continuation (special tokens stripped).
    """
    from qwen_vl_utils import process_vision_info

    messages = [{"role": "user", "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": prompt_text},
    ]}]

    # Qwen3VL: apply_chat_template with vision content
    try:
        prompt = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        # Verify it's a string — some versions return wrong type
        if not isinstance(prompt, str):
            raise TypeError("template returned non-string")
    except Exception:
        # Fix: was `except (TypeError, Exception)` — the tuple is redundant
        # because Exception already subsumes TypeError. The broad catch is
        # deliberate: ANY template failure falls through to the manual
        # Qwen3VL prompt format below, which is guaranteed to work.
        prompt = (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            "<|im_start|>user\n"
            "<|vision_start|><|image_pad|><|vision_end|>"
            f"{prompt_text}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )

    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(device)

    # Inference only — no gradient bookkeeping needed.
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=600,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
        )
    # Slice off the prompt tokens so only the newly generated tail is decoded.
    gen = out[:, inputs["input_ids"].shape[1]:]
    return processor.batch_decode(gen, skip_special_tokens=True)[0]
|
| 770 |
|
| 771 |
|
| 772 |
|