Chhagan005 committed on
Commit
eaba1fd
·
verified ·
1 Parent(s): 74734eb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -47
app.py CHANGED
@@ -717,56 +717,56 @@ def build_unified_summary(front_result: str, back_result: str, mrz_data: dict) -
717
  def run_step1_extraction(model, processor, image, device, temperature, top_p, top_k, repetition_penalty):
718
  """Step 1: LLM → Raw OCR, original script, NO translation, NO coordinates"""
719
 
720
- def _generate(prompt_text):
721
- from qwen_vl_utils import process_vision_info
722
 
723
- messages = [{"role": "user", "content": [
724
- {"type": "image", "image": image},
725
- {"type": "text", "text": prompt_text},
726
- ]}]
727
 
728
- # Qwen3VL: apply_chat_template with vision content
729
- try:
730
- prompt = processor.apply_chat_template(
731
- messages,
732
- tokenize=False,
733
- add_generation_prompt=True,
734
- )
735
- # Verify it's a string — some versions return wrong type
736
- if not isinstance(prompt, str):
737
- raise TypeError("template returned non-string")
738
- except (TypeError, Exception):
739
- # Manual Qwen3VL format — guaranteed to work
740
- prompt = (
741
- "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
742
- "<|im_start|>user\n"
743
- "<|vision_start|><|image_pad|><|vision_end|>"
744
- f"{prompt_text}<|im_end|>\n"
745
- "<|im_start|>assistant\n"
746
- )
747
-
748
- image_inputs, video_inputs = process_vision_info(messages)
749
-
750
- inputs = processor(
751
- text=[prompt],
752
- images=image_inputs,
753
- videos=video_inputs,
754
- padding=True,
755
- return_tensors="pt",
756
- ).to(device)
757
 
758
- with torch.no_grad():
759
- out = model.generate(
760
- **inputs,
761
- max_new_tokens=600,
762
- do_sample=True,
763
- temperature=temperature,
764
- top_p=top_p,
765
- top_k=top_k,
766
- repetition_penalty=repetition_penalty,
767
- )
768
- gen = out[:, inputs['input_ids'].shape[1]:]
769
- return processor.batch_decode(gen, skip_special_tokens=True)[0]
 
 
 
 
 
 
 
 
 
 
770
 
771
 
772
 
 
717
  def run_step1_extraction(model, processor, image, device, temperature, top_p, top_k, repetition_penalty):
718
  """Step 1: LLM → Raw OCR, original script, NO translation, NO coordinates"""
719
 
720
+ def _generate(prompt_text):
721
+ from qwen_vl_utils import process_vision_info
722
 
723
+ messages = [{"role": "user", "content": [
724
+ {"type": "image", "image": image},
725
+ {"type": "text", "text": prompt_text},
726
+ ]}]
727
 
728
+ # Qwen3VL: apply_chat_template with vision content
729
+ try:
730
+ prompt = processor.apply_chat_template(
731
+ messages,
732
+ tokenize=False,
733
+ add_generation_prompt=True,
734
+ )
735
+ # Verify it's a string — some versions return wrong type
736
+ if not isinstance(prompt, str):
737
+ raise TypeError("template returned non-string")
738
+ except (TypeError, Exception):
739
+ # Manual Qwen3VL format — guaranteed to work
740
+ prompt = (
741
+ "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
742
+ "<|im_start|>user\n"
743
+ "<|vision_start|><|image_pad|><|vision_end|>"
744
+ f"{prompt_text}<|im_end|>\n"
745
+ "<|im_start|>assistant\n"
746
+ )
 
 
 
 
 
 
 
 
 
 
747
 
748
+ image_inputs, video_inputs = process_vision_info(messages)
749
+
750
+ inputs = processor(
751
+ text=[prompt],
752
+ images=image_inputs,
753
+ videos=video_inputs,
754
+ padding=True,
755
+ return_tensors="pt",
756
+ ).to(device)
757
+
758
+ with torch.no_grad():
759
+ out = model.generate(
760
+ **inputs,
761
+ max_new_tokens=600,
762
+ do_sample=True,
763
+ temperature=temperature,
764
+ top_p=top_p,
765
+ top_k=top_k,
766
+ repetition_penalty=repetition_penalty,
767
+ )
768
+ gen = out[:, inputs['input_ids'].shape[1]:]
769
+ return processor.batch_decode(gen, skip_special_tokens=True)[0]
770
 
771
 
772