Chhagan005 committed on
Commit
b3a39ba
·
verified ·
1 Parent(s): ee2bb2f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -10
app.py CHANGED
@@ -718,24 +718,43 @@ def run_step1_extraction(model, processor, image, device, temperature, top_p, to
718
  """Step 1: LLM → Raw OCR, original script, NO translation, NO coordinates"""
719
 
720
  def _generate(prompt_text):
 
 
721
  messages = [{"role": "user", "content": [
722
- {"type": "image"},
723
- {"type": "text", "text": prompt_text},
724
  ]}]
725
- try:
726
- prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
727
- except:
728
- prompt = prompt_text
729
- inputs = processor(text=[prompt], images=[image], return_tensors="pt", padding=True).to(device)
 
 
 
 
 
 
 
 
 
 
 
 
730
  with torch.no_grad():
731
  out = model.generate(
732
- **inputs, max_new_tokens=600, do_sample=True,
733
- temperature=temperature, top_p=top_p, top_k=top_k,
 
 
 
 
734
  repetition_penalty=repetition_penalty,
735
  )
736
  gen = out[:, inputs['input_ids'].shape[1]:]
737
  return processor.batch_decode(gen, skip_special_tokens=True)[0]
738
 
 
739
  result = _generate(STEP1_EXTRACT_PROMPT)
740
 
741
  # Detect coordinate output (Qwen grounding mode triggered) → retry
@@ -848,7 +867,12 @@ def run_step2_structure(model, processor, metadata: dict, device,
848
  except:
849
  prompt = prompt_text
850
 
851
- inputs = processor(text=[prompt], return_tensors="pt", padding=True).to(device)
 
 
 
 
 
852
 
853
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
854
  gen_kwargs = {
 
718
  """Step 1: LLM → Raw OCR, original script, NO translation, NO coordinates"""
719
 
720
  def _generate(prompt_text):
721
+ from qwen_vl_utils import process_vision_info
722
+
723
  messages = [{"role": "user", "content": [
724
+ {"type": "image", "image": image}, # ← PIL image yahan directly
725
+ {"type": "text", "text": prompt_text},
726
  ]}]
727
+
728
+ # apply_chat_template β€” Qwen3VL format
729
+ prompt = processor.apply_chat_template(
730
+ messages, tokenize=False, add_generation_prompt=True
731
+ )
732
+
733
+ # process_vision_info β€” Qwen3VL ke liye zaruri step
734
+ image_inputs, video_inputs = process_vision_info(messages)
735
+
736
+ inputs = processor(
737
+ text=[prompt],
738
+ images=image_inputs,
739
+ videos=video_inputs,
740
+ padding=True,
741
+ return_tensors="pt",
742
+ ).to(device)
743
+
744
  with torch.no_grad():
745
  out = model.generate(
746
+ **inputs,
747
+ max_new_tokens=600,
748
+ do_sample=True,
749
+ temperature=temperature,
750
+ top_p=top_p,
751
+ top_k=top_k,
752
  repetition_penalty=repetition_penalty,
753
  )
754
  gen = out[:, inputs['input_ids'].shape[1]:]
755
  return processor.batch_decode(gen, skip_special_tokens=True)[0]
756
 
757
+
758
  result = _generate(STEP1_EXTRACT_PROMPT)
759
 
760
  # Detect coordinate output (Qwen grounding mode triggered) → retry
 
867
  except:
868
  prompt = prompt_text
869
 
870
+ inputs = processor(
871
+ text=[prompt],
872
+ padding=True,
873
+ return_tensors="pt",
874
+ ).to(device)
875
+
876
 
877
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
878
  gen_kwargs = {