Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -717,56 +717,56 @@ def build_unified_summary(front_result: str, back_result: str, mrz_data: dict) -
|
|
| 717 |
def run_step1_extraction(model, processor, image, device, temperature, top_p, top_k, repetition_penalty):
|
| 718 |
"""Step 1: LLM → Raw OCR, original script, NO translation, NO coordinates"""
|
| 719 |
|
| 720 |
-
def _generate(prompt_text):
|
| 721 |
-
|
| 722 |
|
| 723 |
-
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
|
| 727 |
|
| 728 |
-
|
| 729 |
-
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
|
| 747 |
-
|
| 748 |
-
image_inputs, video_inputs = process_vision_info(messages)
|
| 749 |
-
|
| 750 |
-
inputs = processor(
|
| 751 |
-
text=[prompt],
|
| 752 |
-
images=image_inputs,
|
| 753 |
-
videos=video_inputs,
|
| 754 |
-
padding=True,
|
| 755 |
-
return_tensors="pt",
|
| 756 |
-
).to(device)
|
| 757 |
|
| 758 |
-
|
| 759 |
-
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
|
| 763 |
-
|
| 764 |
-
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 770 |
|
| 771 |
|
| 772 |
|
|
|
|
| 717 |
def run_step1_extraction(model, processor, image, device, temperature, top_p, top_k, repetition_penalty):
|
| 718 |
"""Step 1: LLM → Raw OCR, original script, NO translation, NO coordinates"""
|
| 719 |
|
| 720 |
+
def _generate(prompt_text):
    """Run one vision-language generation pass over the enclosing `image`.

    Nested inside `run_step1_extraction`; closes over `model`, `processor`,
    `image`, `device` and the sampling parameters from the enclosing scope.
    Builds a Qwen3VL chat prompt around `prompt_text`, runs `model.generate`,
    and returns the decoded continuation (special tokens stripped).
    """
    from qwen_vl_utils import process_vision_info

    messages = [{"role": "user", "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": prompt_text},
    ]}]

    # Qwen3VL: apply_chat_template with vision content
    try:
        prompt = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        # Verify it's a string — some versions return wrong type
        if not isinstance(prompt, str):
            raise TypeError("template returned non-string")
    except Exception:
        # Fix: was `except (TypeError, Exception)` — the tuple is redundant
        # because Exception already subsumes TypeError. The broad catch is
        # deliberate: ANY template failure falls through to the manual
        # Qwen3VL prompt format below, which is guaranteed to work.
        prompt = (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            "<|im_start|>user\n"
            "<|vision_start|><|image_pad|><|vision_end|>"
            f"{prompt_text}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )

    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(device)

    # Inference only — no gradient bookkeeping needed.
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=600,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
        )
    # Slice off the prompt tokens so only the newly generated tail is decoded.
    gen = out[:, inputs["input_ids"].shape[1]:]
    return processor.batch_decode(gen, skip_special_tokens=True)[0]
|
| 770 |
|
| 771 |
|
| 772 |
|