Chhagan005 committed on
Commit
cb30e22
·
verified ·
1 Parent(s): eaba1fd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -35
app.py CHANGED
@@ -718,25 +718,26 @@ def run_step1_extraction(model, processor, image, device, temperature, top_p, to
718
  """Step 1: LLM β†’ Raw OCR, original script, NO translation, NO coordinates"""
719
 
720
  def _generate(prompt_text):
721
- from qwen_vl_utils import process_vision_info
 
 
 
 
722
 
723
  messages = [{"role": "user", "content": [
724
  {"type": "image", "image": image},
725
  {"type": "text", "text": prompt_text},
726
  ]}]
727
 
728
- # Qwen3VL: apply_chat_template with vision content
729
  try:
730
  prompt = processor.apply_chat_template(
731
- messages,
732
- tokenize=False,
733
- add_generation_prompt=True,
734
  )
735
- # Verify it's a string β€” some versions return wrong type
736
  if not isinstance(prompt, str):
737
- raise TypeError("template returned non-string")
738
- except (TypeError, Exception):
739
- # Manual Qwen3VL format β€” guaranteed to work
740
  prompt = (
741
  "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
742
  "<|im_start|>user\n"
@@ -745,15 +746,50 @@ def run_step1_extraction(model, processor, image, device, temperature, top_p, to
745
  "<|im_start|>assistant\n"
746
  )
747
 
748
- image_inputs, video_inputs = process_vision_info(messages)
749
-
750
- inputs = processor(
751
- text=[prompt],
752
- images=image_inputs,
753
- videos=video_inputs,
754
- padding=True,
755
- return_tensors="pt",
756
- ).to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
757
 
758
  with torch.no_grad():
759
  out = model.generate(
@@ -766,30 +802,34 @@ def run_step1_extraction(model, processor, image, device, temperature, top_p, to
766
  repetition_penalty=repetition_penalty,
767
  )
768
  gen = out[:, inputs['input_ids'].shape[1]:]
769
- return processor.batch_decode(gen, skip_special_tokens=True)[0]
770
-
771
-
 
772
 
773
  result = _generate(STEP1_EXTRACT_PROMPT)
774
 
775
- # Detect coordinate output (Qwen grounding mode triggered) β†’ retry
776
  if re.search(r'\(\d{1,4},\s*\d{1,4}\)', result) or '---TEXT_START---' not in result:
777
- print(" ⚠️ Coordinate output detected, retrying...")
778
- fallback = """Read all text from this document image and write it line by line in plain text.
779
- Do NOT output coordinates or bounding boxes.
780
- Start output with:
781
- PHOTO_PRESENT: yes or no
782
- SIGNATURE_PRESENT: yes or no
783
- MRZ_PRESENT: yes or no
784
- DETECTED_LANGUAGE: name the language(s)
785
- ---TEXT_START---
786
- [all text here exactly as printed]
787
- ---TEXT_END---"""
 
 
788
  result = _generate(fallback)
789
 
790
  return result
791
 
792
 
 
793
  def parse_step1_output(raw_output: str) -> dict:
794
  """Parse Step 1 structured output β†’ metadata + original text"""
795
  result = {
@@ -940,8 +980,6 @@ def run_step2_structure(model, processor, metadata: dict, device,
940
 
941
  return streamer, thread, mrz_data, python_sections
942
 
943
-
944
- return streamer, thread, mrz_data, python_sections
945
 
946
 
947
  # ╔══════════════════════════════════════════╗
 
718
  """Step 1: LLM β†’ Raw OCR, original script, NO translation, NO coordinates"""
719
 
720
  def _generate(prompt_text):
721
+ try:
722
+ from qwen_vl_utils import process_vision_info
723
+ HAS_QWEN_VL_UTILS = True
724
+ except ImportError:
725
+ HAS_QWEN_VL_UTILS = False
726
 
727
  messages = [{"role": "user", "content": [
728
  {"type": "image", "image": image},
729
  {"type": "text", "text": prompt_text},
730
  ]}]
731
 
732
+ # Step A: Build prompt string
733
  try:
734
  prompt = processor.apply_chat_template(
735
+ messages, tokenize=False, add_generation_prompt=True
 
 
736
  )
 
737
  if not isinstance(prompt, str):
738
+ raise TypeError("non-string returned")
739
+ except Exception:
740
+ # Manual Qwen3VL token format β€” universal fallback
741
  prompt = (
742
  "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
743
  "<|im_start|>user\n"
 
746
  "<|im_start|>assistant\n"
747
  )
748
 
749
+ # Step B: Build inputs β€” 3 fallback tiers
750
+ inputs = None
751
+
752
+ # Tier 1: qwen_vl_utils + images/videos kwargs (Qwen3VL standard)
753
+ if HAS_QWEN_VL_UTILS and inputs is None:
754
+ try:
755
+ image_inputs, video_inputs = process_vision_info(messages)
756
+ proc_kwargs = {
757
+ "text": [prompt],
758
+ "padding": True,
759
+ "return_tensors": "pt"
760
+ }
761
+ if image_inputs is not None and len(image_inputs) > 0:
762
+ proc_kwargs["images"] = image_inputs
763
+ if video_inputs is not None and len(video_inputs) > 0:
764
+ proc_kwargs["videos"] = video_inputs
765
+ inputs = processor(**proc_kwargs).to(device)
766
+ print(" βœ… Tier1: qwen_vl_utils")
767
+ except Exception as e:
768
+ print(f" Tier1 failed: {e}")
769
+ inputs = None
770
+
771
+ # Tier 2: Direct PIL image (Qwen2VL style)
772
+ if inputs is None:
773
+ try:
774
+ inputs = processor(
775
+ text=[prompt],
776
+ images=[image],
777
+ padding=True,
778
+ return_tensors="pt",
779
+ ).to(device)
780
+ print(" βœ… Tier2: direct PIL")
781
+ except Exception as e:
782
+ print(f" Tier2 failed: {e}")
783
+ inputs = None
784
+
785
+ # Tier 3: Text-only (last resort)
786
+ if inputs is None:
787
+ print(" ⚠️ Tier3: text-only fallback (no image β€” degraded)")
788
+ inputs = processor(
789
+ text=[prompt],
790
+ padding=True,
791
+ return_tensors="pt",
792
+ ).to(device)
793
 
794
  with torch.no_grad():
795
  out = model.generate(
 
802
  repetition_penalty=repetition_penalty,
803
  )
804
  gen = out[:, inputs['input_ids'].shape[1]:]
805
+ decoded = processor.batch_decode(gen, skip_special_tokens=True)
806
+ if isinstance(decoded, list):
807
+ return decoded[0] if decoded else ""
808
+ return str(decoded) if decoded else ""
809
 
810
  result = _generate(STEP1_EXTRACT_PROMPT)
811
 
812
+ # Coordinate output detect β†’ retry with simpler prompt
813
  if re.search(r'\(\d{1,4},\s*\d{1,4}\)', result) or '---TEXT_START---' not in result:
814
+ print(" ⚠️ Retrying with fallback prompt...")
815
+ fallback = (
816
+ "Read all text from this document image and write it line by line in plain text.\n"
817
+ "Do NOT output coordinates or bounding boxes.\n"
818
+ "Start output with:\n"
819
+ "PHOTO_PRESENT: yes or no\n"
820
+ "SIGNATURE_PRESENT: yes or no\n"
821
+ "MRZ_PRESENT: yes or no\n"
822
+ "DETECTED_LANGUAGE: name the language(s)\n"
823
+ "---TEXT_START---\n"
824
+ "[all text here exactly as printed]\n"
825
+ "---TEXT_END---"
826
+ )
827
  result = _generate(fallback)
828
 
829
  return result
830
 
831
 
832
+
833
  def parse_step1_output(raw_output: str) -> dict:
834
  """Parse Step 1 structured output β†’ metadata + original text"""
835
  result = {
 
980
 
981
  return streamer, thread, mrz_data, python_sections
982
 
 
 
983
 
984
 
985
  # ╔══════════════════════════════════════════╗