Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -718,24 +718,43 @@ def run_step1_extraction(model, processor, image, device, temperature, top_p, to
|
|
| 718 |
"""Step 1: LLM → Raw OCR, original script, NO translation, NO coordinates"""
|
| 719 |
|
| 720 |
def _generate(prompt_text):
|
|
|
|
|
|
|
| 721 |
messages = [{"role": "user", "content": [
|
| 722 |
-
{"type": "image"},
|
| 723 |
-
{"type": "text",
|
| 724 |
]}]
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 730 |
with torch.no_grad():
|
| 731 |
out = model.generate(
|
| 732 |
-
**inputs,
|
| 733 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 734 |
repetition_penalty=repetition_penalty,
|
| 735 |
)
|
| 736 |
gen = out[:, inputs['input_ids'].shape[1]:]
|
| 737 |
return processor.batch_decode(gen, skip_special_tokens=True)[0]
|
| 738 |
|
|
|
|
| 739 |
result = _generate(STEP1_EXTRACT_PROMPT)
|
| 740 |
|
| 741 |
# Detect coordinate output (Qwen grounding mode triggered) → retry
|
|
@@ -848,7 +867,12 @@ def run_step2_structure(model, processor, metadata: dict, device,
|
|
| 848 |
except:
|
| 849 |
prompt = prompt_text
|
| 850 |
|
| 851 |
-
inputs = processor(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 852 |
|
| 853 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
| 854 |
gen_kwargs = {
|
|
|
|
| 718 |
"""Step 1: LLM → Raw OCR, original script, NO translation, NO coordinates"""
|
| 719 |
|
| 720 |
def _generate(prompt_text):
|
| 721 |
+
from qwen_vl_utils import process_vision_info
|
| 722 |
+
|
| 723 |
messages = [{"role": "user", "content": [
|
| 724 |
+
{"type": "image", "image": image}, # ← the PIL image is passed directly here
|
| 725 |
+
{"type": "text", "text": prompt_text},
|
| 726 |
]}]
|
| 727 |
+
|
| 728 |
+
# apply_chat_template → render the messages in Qwen3VL chat format
|
| 729 |
+
prompt = processor.apply_chat_template(
|
| 730 |
+
messages, tokenize=False, add_generation_prompt=True
|
| 731 |
+
)
|
| 732 |
+
|
| 733 |
+
# process_vision_info → required step for Qwen3VL
|
| 734 |
+
image_inputs, video_inputs = process_vision_info(messages)
|
| 735 |
+
|
| 736 |
+
inputs = processor(
|
| 737 |
+
text=[prompt],
|
| 738 |
+
images=image_inputs,
|
| 739 |
+
videos=video_inputs,
|
| 740 |
+
padding=True,
|
| 741 |
+
return_tensors="pt",
|
| 742 |
+
).to(device)
|
| 743 |
+
|
| 744 |
with torch.no_grad():
|
| 745 |
out = model.generate(
|
| 746 |
+
**inputs,
|
| 747 |
+
max_new_tokens=600,
|
| 748 |
+
do_sample=True,
|
| 749 |
+
temperature=temperature,
|
| 750 |
+
top_p=top_p,
|
| 751 |
+
top_k=top_k,
|
| 752 |
repetition_penalty=repetition_penalty,
|
| 753 |
)
|
| 754 |
gen = out[:, inputs['input_ids'].shape[1]:]
|
| 755 |
return processor.batch_decode(gen, skip_special_tokens=True)[0]
|
| 756 |
|
| 757 |
+
|
| 758 |
result = _generate(STEP1_EXTRACT_PROMPT)
|
| 759 |
|
| 760 |
# Detect coordinate output (Qwen grounding mode triggered) → retry
|
|
|
|
| 867 |
except:
|
| 868 |
prompt = prompt_text
|
| 869 |
|
| 870 |
+
inputs = processor(
|
| 871 |
+
text=[prompt],
|
| 872 |
+
padding=True,
|
| 873 |
+
return_tensors="pt",
|
| 874 |
+
).to(device)
|
| 875 |
+
|
| 876 |
|
| 877 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
| 878 |
gen_kwargs = {
|