Chhagan005 committed on
Commit
cb30e22
·
verified ·
1 Parent(s): eaba1fd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -35
app.py CHANGED
@@ -718,25 +718,26 @@ def run_step1_extraction(model, processor, image, device, temperature, top_p, to
718
  """Step 1: LLM β†’ Raw OCR, original script, NO translation, NO coordinates"""
719
 
720
  def _generate(prompt_text):
721
- from qwen_vl_utils import process_vision_info
 
 
 
 
722
 
723
  messages = [{"role": "user", "content": [
724
  {"type": "image", "image": image},
725
  {"type": "text", "text": prompt_text},
726
  ]}]
727
 
728
- # Qwen3VL: apply_chat_template with vision content
729
  try:
730
  prompt = processor.apply_chat_template(
731
- messages,
732
- tokenize=False,
733
- add_generation_prompt=True,
734
  )
735
- # Verify it's a string β€” some versions return wrong type
736
  if not isinstance(prompt, str):
737
- raise TypeError("template returned non-string")
738
- except (TypeError, Exception):
739
- # Manual Qwen3VL format β€” guaranteed to work
740
  prompt = (
741
  "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
742
  "<|im_start|>user\n"
@@ -745,15 +746,50 @@ def run_step1_extraction(model, processor, image, device, temperature, top_p, to
745
  "<|im_start|>assistant\n"
746
  )
747
 
748
- image_inputs, video_inputs = process_vision_info(messages)
749
-
750
- inputs = processor(
751
- text=[prompt],
752
- images=image_inputs,
753
- videos=video_inputs,
754
- padding=True,
755
- return_tensors="pt",
756
- ).to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
757
 
758
  with torch.no_grad():
759
  out = model.generate(
@@ -766,30 +802,34 @@ def run_step1_extraction(model, processor, image, device, temperature, top_p, to
766
  repetition_penalty=repetition_penalty,
767
  )
768
  gen = out[:, inputs['input_ids'].shape[1]:]
769
- return processor.batch_decode(gen, skip_special_tokens=True)[0]
770
-
771
-
 
772
 
773
  result = _generate(STEP1_EXTRACT_PROMPT)
774
 
775
- # Detect coordinate output (Qwen grounding mode triggered) β†’ retry
776
  if re.search(r'\(\d{1,4},\s*\d{1,4}\)', result) or '---TEXT_START---' not in result:
777
- print(" ⚠️ Coordinate output detected, retrying...")
778
- fallback = """Read all text from this document image and write it line by line in plain text.
779
- Do NOT output coordinates or bounding boxes.
780
- Start output with:
781
- PHOTO_PRESENT: yes or no
782
- SIGNATURE_PRESENT: yes or no
783
- MRZ_PRESENT: yes or no
784
- DETECTED_LANGUAGE: name the language(s)
785
- ---TEXT_START---
786
- [all text here exactly as printed]
787
- ---TEXT_END---"""
 
 
788
  result = _generate(fallback)
789
 
790
  return result
791
 
792
 
 
793
  def parse_step1_output(raw_output: str) -> dict:
794
  """Parse Step 1 structured output β†’ metadata + original text"""
795
  result = {
@@ -940,8 +980,6 @@ def run_step2_structure(model, processor, metadata: dict, device,
940
 
941
  return streamer, thread, mrz_data, python_sections
942
 
943
-
944
- return streamer, thread, mrz_data, python_sections
945
 
946
 
947
  # ╔══════════════════════════════════════════╗
 
718
  """Step 1: LLM β†’ Raw OCR, original script, NO translation, NO coordinates"""
719
 
720
  def _generate(prompt_text):
721
+ try:
722
+ from qwen_vl_utils import process_vision_info
723
+ HAS_QWEN_VL_UTILS = True
724
+ except ImportError:
725
+ HAS_QWEN_VL_UTILS = False
726
 
727
  messages = [{"role": "user", "content": [
728
  {"type": "image", "image": image},
729
  {"type": "text", "text": prompt_text},
730
  ]}]
731
 
732
+ # Step A: Build prompt string
733
  try:
734
  prompt = processor.apply_chat_template(
735
+ messages, tokenize=False, add_generation_prompt=True
 
 
736
  )
 
737
  if not isinstance(prompt, str):
738
+ raise TypeError("non-string returned")
739
+ except Exception:
740
+ # Manual Qwen3VL token format β€” universal fallback
741
  prompt = (
742
  "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
743
  "<|im_start|>user\n"
 
746
  "<|im_start|>assistant\n"
747
  )
748
 
749
+ # Step B: Build inputs β€” 3 fallback tiers
750
+ inputs = None
751
+
752
+ # Tier 1: qwen_vl_utils + images/videos kwargs (Qwen3VL standard)
753
+ if HAS_QWEN_VL_UTILS and inputs is None:
754
+ try:
755
+ image_inputs, video_inputs = process_vision_info(messages)
756
+ proc_kwargs = {
757
+ "text": [prompt],
758
+ "padding": True,
759
+ "return_tensors": "pt"
760
+ }
761
+ if image_inputs is not None and len(image_inputs) > 0:
762
+ proc_kwargs["images"] = image_inputs
763
+ if video_inputs is not None and len(video_inputs) > 0:
764
+ proc_kwargs["videos"] = video_inputs
765
+ inputs = processor(**proc_kwargs).to(device)
766
+ print(" βœ… Tier1: qwen_vl_utils")
767
+ except Exception as e:
768
+ print(f" Tier1 failed: {e}")
769
+ inputs = None
770
+
771
+ # Tier 2: Direct PIL image (Qwen2VL style)
772
+ if inputs is None:
773
+ try:
774
+ inputs = processor(
775
+ text=[prompt],
776
+ images=[image],
777
+ padding=True,
778
+ return_tensors="pt",
779
+ ).to(device)
780
+ print(" βœ… Tier2: direct PIL")
781
+ except Exception as e:
782
+ print(f" Tier2 failed: {e}")
783
+ inputs = None
784
+
785
+ # Tier 3: Text-only (last resort)
786
+ if inputs is None:
787
+ print(" ⚠️ Tier3: text-only fallback (no image β€” degraded)")
788
+ inputs = processor(
789
+ text=[prompt],
790
+ padding=True,
791
+ return_tensors="pt",
792
+ ).to(device)
793
 
794
  with torch.no_grad():
795
  out = model.generate(
 
802
  repetition_penalty=repetition_penalty,
803
  )
804
  gen = out[:, inputs['input_ids'].shape[1]:]
805
+ decoded = processor.batch_decode(gen, skip_special_tokens=True)
806
+ if isinstance(decoded, list):
807
+ return decoded[0] if decoded else ""
808
+ return str(decoded) if decoded else ""
809
 
810
  result = _generate(STEP1_EXTRACT_PROMPT)
811
 
812
+ # Coordinate output detect β†’ retry with simpler prompt
813
  if re.search(r'\(\d{1,4},\s*\d{1,4}\)', result) or '---TEXT_START---' not in result:
814
+ print(" ⚠️ Retrying with fallback prompt...")
815
+ fallback = (
816
+ "Read all text from this document image and write it line by line in plain text.\n"
817
+ "Do NOT output coordinates or bounding boxes.\n"
818
+ "Start output with:\n"
819
+ "PHOTO_PRESENT: yes or no\n"
820
+ "SIGNATURE_PRESENT: yes or no\n"
821
+ "MRZ_PRESENT: yes or no\n"
822
+ "DETECTED_LANGUAGE: name the language(s)\n"
823
+ "---TEXT_START---\n"
824
+ "[all text here exactly as printed]\n"
825
+ "---TEXT_END---"
826
+ )
827
  result = _generate(fallback)
828
 
829
  return result
830
 
831
 
832
+
833
  def parse_step1_output(raw_output: str) -> dict:
834
  """Parse Step 1 structured output β†’ metadata + original text"""
835
  result = {
 
980
 
981
  return streamer, thread, mrz_data, python_sections
982
 
 
 
983
 
984
 
985
  # ╔══════════════════════════════════════════╗