Update models/qwen.py
models/qwen.py CHANGED (+48 -33)
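Moves model loading out of import time: the Qwen2.5-VL model/processor and the Pix2Struct OCR model/processor are now module-level globals that are lazy-loaded on first use inside run_model and extract_all_text_pix2struct.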
@@ -3,31 +3,21 @@ import json
 from PIL import Image
 import torch
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
-from qwen_vl_utils import process_vision_info
 from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
+from qwen_vl_utils import process_vision_info
 
-#
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    "Qwen/Qwen2.5-VL-7B-Instruct",
-    torch_dtype=torch.bfloat16,
-    device_map="auto"
-    #attn_implementation="flash_attention_2"
-)
-
-min_pixels = 256 * 28 * 28
-max_pixels = 1080 * 28 * 28
-processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
+# Globals (lazy-loaded at runtime)
+qwen_model = None
+qwen_processor = None
+ocr_model = None
+ocr_processor = None
 
-# Initialize Pix2Struct OCR model
-ocr_processor = Pix2StructProcessor.from_pretrained("google/pix2struct-textcaps-base")
-ocr_model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")
 
-# Load prompt
 def load_prompt():
-    with open("prompts/prompt.txt", "r") as f:
+    with open("prompts/prompt.txt", "r", encoding="utf-8") as f:
         return f.read()
 
-
+
 def try_extract_json(text):
     try:
         return json.loads(text)
@@ -50,20 +40,25 @@ def try_extract_json(text):
     except json.JSONDecodeError:
         return text
 
-
+
 def extract_all_text_pix2struct(image: Image.Image):
-    inputs = ocr_processor(images=image, return_tensors="pt")
+    global ocr_model, ocr_processor
+
+    if ocr_model is None or ocr_processor is None:
+        ocr_processor = Pix2StructProcessor.from_pretrained("google/pix2struct-textcaps-base")
+        ocr_model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        ocr_model = ocr_model.to(device)
+
+    inputs = ocr_processor(images=image, return_tensors="pt").to(ocr_model.device)
     predictions = ocr_model.generate(**inputs, max_new_tokens=512)
-    return ocr_processor.decode(predictions[0], skip_special_tokens=True)
-
+    return ocr_processor.decode(predictions[0], skip_special_tokens=True).strip()
+
 
-# Assign event/gateway names from OCR text
 def assign_event_gateway_names_from_ocr(json_data: dict, ocr_text: str):
     if not ocr_text or not json_data:
         return json_data
 
-    lines = [line.strip() for line in ocr_text.split('\n') if line.strip()]
-
     def assign_best_guess(obj):
         if not obj.get("name") or obj["name"].strip() == "":
             obj["name"] = "(label unknown)"
@@ -76,9 +71,29 @@ def assign_event_gateway_names_from_ocr(json_data: dict, ocr_text: str):
 
     return json_data
 
-
+
 def run_model(image: Image.Image):
+    global qwen_model, qwen_processor
+
+    if qwen_model is None or qwen_processor is None:
+        qwen_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            "Qwen/Qwen2.5-VL-7B-Instruct",
+            torch_dtype=torch.bfloat16,
+            device_map="auto"
+            # You can enable flash attention here if needed:
+            # attn_implementation="flash_attention_2"
+        )
+
+        min_pixels = 256 * 28 * 28
+        max_pixels = 1080 * 28 * 28
+        qwen_processor = AutoProcessor.from_pretrained(
+            "Qwen/Qwen2.5-VL-7B-Instruct",
+            min_pixels=min_pixels,
+            max_pixels=max_pixels
+        )
+
     prompt = load_prompt()
+
     messages = [
         {
             "role": "user",
@@ -89,21 +104,21 @@ def run_model(image: Image.Image):
         }
     ]
 
-    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    text = qwen_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     image_inputs, video_inputs = process_vision_info(messages)
 
-    inputs = processor(
+    inputs = qwen_processor(
         text=[text],
         images=image_inputs,
         videos=video_inputs,
         padding=True,
         return_tensors="pt"
-    ).to(model.device)
+    ).to(qwen_model.device)
 
-    generated_ids = model.generate(**inputs, max_new_tokens=5000)
+    generated_ids = qwen_model.generate(**inputs, max_new_tokens=5000)
     generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
 
-    output_text = processor.batch_decode(
+    output_text = qwen_processor.batch_decode(
         generated_ids_trimmed,
         skip_special_tokens=True,
         clean_up_tokenization_spaces=False
@@ -111,11 +126,11 @@ def run_model(image: Image.Image):
 
     parsed_json = try_extract_json(output_text)
 
-    #
+    # OCR post-processing
    ocr_text = extract_all_text_pix2struct(image)
    parsed_json = assign_event_gateway_names_from_ocr(parsed_json, ocr_text)
 
    return {
        "json": parsed_json,
        "raw": output_text
-    }
+    }
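For reference, here is a minimal usage sketch of the updated module. It is an illustration, not part of the commit: the import path models.qwen and the input filename diagram.png are assumptions, while run_model, its return shape, and the prompts/prompt.txt dependency come from the file above.

from PIL import Image
from models.qwen import run_model

# The first call lazy-loads Qwen2.5-VL-7B-Instruct and the Pix2Struct OCR
# model into the module-level globals; later calls reuse the cached objects.
image = Image.open("diagram.png")  # hypothetical input image
result = run_model(image)

print(result["json"])  # parsed JSON if the model emitted valid JSON, otherwise the raw string
print(result["raw"])   # the decoded model output before JSON parsing

The point of the change is visible here: importing models.qwen no longer triggers the heavy from_pretrained calls, so the module imports cheaply and pays the model-loading cost only on the first inference.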