Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -718,25 +718,26 @@ def run_step1_extraction(model, processor, image, device, temperature, top_p, to
|
|
| 718 |
"""Step 1: LLM → Raw OCR, original script, NO translation, NO coordinates"""
|
| 719 |
|
| 720 |
def _generate(prompt_text):
|
| 721 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
|
| 723 |
messages = [{"role": "user", "content": [
|
| 724 |
{"type": "image", "image": image},
|
| 725 |
{"type": "text", "text": prompt_text},
|
| 726 |
]}]
|
| 727 |
|
| 728 |
-
#
|
| 729 |
try:
|
| 730 |
prompt = processor.apply_chat_template(
|
| 731 |
-
messages,
|
| 732 |
-
tokenize=False,
|
| 733 |
-
add_generation_prompt=True,
|
| 734 |
)
|
| 735 |
-
# Verify it's a string — some versions return the wrong type
|
| 736 |
if not isinstance(prompt, str):
|
| 737 |
-
raise TypeError("
|
| 738 |
-
except
|
| 739 |
-
# Manual Qwen3VL format β
|
| 740 |
prompt = (
|
| 741 |
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
|
| 742 |
"<|im_start|>user\n"
|
|
@@ -745,15 +746,50 @@ def run_step1_extraction(model, processor, image, device, temperature, top_p, to
|
|
| 745 |
"<|im_start|>assistant\n"
|
| 746 |
)
|
| 747 |
|
| 748 |
-
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
|
| 752 |
-
|
| 753 |
-
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 757 |
|
| 758 |
with torch.no_grad():
|
| 759 |
out = model.generate(
|
|
@@ -766,30 +802,34 @@ def run_step1_extraction(model, processor, image, device, temperature, top_p, to
|
|
| 766 |
repetition_penalty=repetition_penalty,
|
| 767 |
)
|
| 768 |
gen = out[:, inputs['input_ids'].shape[1]:]
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
|
|
|
| 772 |
|
| 773 |
result = _generate(STEP1_EXTRACT_PROMPT)
|
| 774 |
|
| 775 |
-
#
|
| 776 |
if re.search(r'\(\d{1,4},\s*\d{1,4}\)', result) or '---TEXT_START---' not in result:
|
| 777 |
-
print(" β οΈ
|
| 778 |
-
fallback =
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
|
| 782 |
-
|
| 783 |
-
|
| 784 |
-
|
| 785 |
-
|
| 786 |
-
|
| 787 |
-
|
|
|
|
|
|
|
| 788 |
result = _generate(fallback)
|
| 789 |
|
| 790 |
return result
|
| 791 |
|
| 792 |
|
|
|
|
| 793 |
def parse_step1_output(raw_output: str) -> dict:
|
| 794 |
"""Parse Step 1 structured output → metadata + original text"""
|
| 795 |
result = {
|
|
@@ -940,8 +980,6 @@ def run_step2_structure(model, processor, metadata: dict, device,
|
|
| 940 |
|
| 941 |
return streamer, thread, mrz_data, python_sections
|
| 942 |
|
| 943 |
-
|
| 944 |
-
return streamer, thread, mrz_data, python_sections
|
| 945 |
|
| 946 |
|
| 947 |
# ββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 718 |
"""Step 1: LLM → Raw OCR, original script, NO translation, NO coordinates"""
|
| 719 |
|
| 720 |
def _generate(prompt_text):
|
| 721 |
+
try:
|
| 722 |
+
from qwen_vl_utils import process_vision_info
|
| 723 |
+
HAS_QWEN_VL_UTILS = True
|
| 724 |
+
except ImportError:
|
| 725 |
+
HAS_QWEN_VL_UTILS = False
|
| 726 |
|
| 727 |
messages = [{"role": "user", "content": [
|
| 728 |
{"type": "image", "image": image},
|
| 729 |
{"type": "text", "text": prompt_text},
|
| 730 |
]}]
|
| 731 |
|
| 732 |
+
# Step A: Build prompt string
|
| 733 |
try:
|
| 734 |
prompt = processor.apply_chat_template(
|
| 735 |
+
messages, tokenize=False, add_generation_prompt=True
|
|
|
|
|
|
|
| 736 |
)
|
|
|
|
| 737 |
if not isinstance(prompt, str):
|
| 738 |
+
raise TypeError("non-string returned")
|
| 739 |
+
except Exception:
|
| 740 |
+
# Manual Qwen3VL token format — universal fallback
|
| 741 |
prompt = (
|
| 742 |
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
|
| 743 |
"<|im_start|>user\n"
|
|
|
|
| 746 |
"<|im_start|>assistant\n"
|
| 747 |
)
|
| 748 |
|
| 749 |
+
# Step B: Build inputs — 3 fallback tiers
|
| 750 |
+
inputs = None
|
| 751 |
+
|
| 752 |
+
# Tier 1: qwen_vl_utils + images/videos kwargs (Qwen3VL standard)
|
| 753 |
+
if HAS_QWEN_VL_UTILS and inputs is None:
|
| 754 |
+
try:
|
| 755 |
+
image_inputs, video_inputs = process_vision_info(messages)
|
| 756 |
+
proc_kwargs = {
|
| 757 |
+
"text": [prompt],
|
| 758 |
+
"padding": True,
|
| 759 |
+
"return_tensors": "pt"
|
| 760 |
+
}
|
| 761 |
+
if image_inputs is not None and len(image_inputs) > 0:
|
| 762 |
+
proc_kwargs["images"] = image_inputs
|
| 763 |
+
if video_inputs is not None and len(video_inputs) > 0:
|
| 764 |
+
proc_kwargs["videos"] = video_inputs
|
| 765 |
+
inputs = processor(**proc_kwargs).to(device)
|
| 766 |
+
print(" β
Tier1: qwen_vl_utils")
|
| 767 |
+
except Exception as e:
|
| 768 |
+
print(f" Tier1 failed: {e}")
|
| 769 |
+
inputs = None
|
| 770 |
+
|
| 771 |
+
# Tier 2: Direct PIL image (Qwen2VL style)
|
| 772 |
+
if inputs is None:
|
| 773 |
+
try:
|
| 774 |
+
inputs = processor(
|
| 775 |
+
text=[prompt],
|
| 776 |
+
images=[image],
|
| 777 |
+
padding=True,
|
| 778 |
+
return_tensors="pt",
|
| 779 |
+
).to(device)
|
| 780 |
+
print(" β
Tier2: direct PIL")
|
| 781 |
+
except Exception as e:
|
| 782 |
+
print(f" Tier2 failed: {e}")
|
| 783 |
+
inputs = None
|
| 784 |
+
|
| 785 |
+
# Tier 3: Text-only (last resort)
|
| 786 |
+
if inputs is None:
|
| 787 |
+
print(" β οΈ Tier3: text-only fallback (no image β degraded)")
|
| 788 |
+
inputs = processor(
|
| 789 |
+
text=[prompt],
|
| 790 |
+
padding=True,
|
| 791 |
+
return_tensors="pt",
|
| 792 |
+
).to(device)
|
| 793 |
|
| 794 |
with torch.no_grad():
|
| 795 |
out = model.generate(
|
|
|
|
| 802 |
repetition_penalty=repetition_penalty,
|
| 803 |
)
|
| 804 |
gen = out[:, inputs['input_ids'].shape[1]:]
|
| 805 |
+
decoded = processor.batch_decode(gen, skip_special_tokens=True)
|
| 806 |
+
if isinstance(decoded, list):
|
| 807 |
+
return decoded[0] if decoded else ""
|
| 808 |
+
return str(decoded) if decoded else ""
|
| 809 |
|
| 810 |
result = _generate(STEP1_EXTRACT_PROMPT)
|
| 811 |
|
| 812 |
+
# Coordinate output detected → retry with a simpler prompt
|
| 813 |
if re.search(r'\(\d{1,4},\s*\d{1,4}\)', result) or '---TEXT_START---' not in result:
|
| 814 |
+
print(" β οΈ Retrying with fallback prompt...")
|
| 815 |
+
fallback = (
|
| 816 |
+
"Read all text from this document image and write it line by line in plain text.\n"
|
| 817 |
+
"Do NOT output coordinates or bounding boxes.\n"
|
| 818 |
+
"Start output with:\n"
|
| 819 |
+
"PHOTO_PRESENT: yes or no\n"
|
| 820 |
+
"SIGNATURE_PRESENT: yes or no\n"
|
| 821 |
+
"MRZ_PRESENT: yes or no\n"
|
| 822 |
+
"DETECTED_LANGUAGE: name the language(s)\n"
|
| 823 |
+
"---TEXT_START---\n"
|
| 824 |
+
"[all text here exactly as printed]\n"
|
| 825 |
+
"---TEXT_END---"
|
| 826 |
+
)
|
| 827 |
result = _generate(fallback)
|
| 828 |
|
| 829 |
return result
|
| 830 |
|
| 831 |
|
| 832 |
+
|
| 833 |
def parse_step1_output(raw_output: str) -> dict:
|
| 834 |
"""Parse Step 1 structured output → metadata + original text"""
|
| 835 |
result = {
|
|
|
|
| 980 |
|
| 981 |
return streamer, thread, mrz_data, python_sections
|
| 982 |
|
|
|
|
|
|
|
| 983 |
|
| 984 |
|
| 985 |
# ββββββββββββββββββββββββββββββββββββββββββββ
|