VincentGOURBIN committed
Commit d061785 · verified · 1 Parent(s): c29050e

Upload folder using huggingface_hub

Files changed (1)
  1. src/ai/qwen_zerogpu_analyzer.py +52 -40
src/ai/qwen_zerogpu_analyzer.py CHANGED
@@ -1,22 +1,22 @@
 """
-Qwen model with ZeroGPU support for Hugging Face Spaces.
+Qwen3-VL model with ZeroGPU support for Hugging Face Spaces.
 Uses transformers with @spaces.GPU decorator.
 """
 import torch
 from typing import List, Dict
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
 import spaces
 
 
 class QwenZeroGPUAnalyzer:
     """
     Qwen3 model analyzer with ZeroGPU support.
-    Uses Qwen3-4B-Instruct for diagram generation.
+    Uses Qwen3-VL-4B-Instruct for diagram generation.
     """
 
     def __init__(
         self,
-        model_name: str = "Qwen/Qwen3-4B-Instruct"
+        model_name: str = "Qwen/Qwen3-VL-4B-Instruct"
     ):
         """
         Initialize the Qwen ZeroGPU analyzer.
@@ -26,30 +26,28 @@ class QwenZeroGPUAnalyzer:
         """
         self.model_name = model_name
         self.model = None
-        self.tokenizer = None
+        self.processor = None
 
         print(f"✓ Qwen ZeroGPU analyzer initialized (model will load on first inference)")
         print(f"  Model: {self.model_name}")
 
     def _load_model(self):
-        """Load model and tokenizer (called on first inference)."""
+        """Load model and processor (called on first inference)."""
         if self.model is not None:
             return
 
         print(f"Loading model: {self.model_name}...")
 
-        # Load tokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            self.model_name,
-            trust_remote_code=True
+        # Load processor (for Qwen3-VL)
+        self.processor = AutoProcessor.from_pretrained(
+            self.model_name
         )
 
-        # Load model (will be moved to GPU by @spaces.GPU decorator)
-        self.model = AutoModelForCausalLM.from_pretrained(
+        # Load model (Qwen3-VL model)
+        self.model = Qwen3VLForConditionalGeneration.from_pretrained(
             self.model_name,
-            torch_dtype=torch.bfloat16,
-            device_map="auto",
-            trust_remote_code=True
+            torch_dtype="auto",  # Use auto dtype like in official example
+            device_map="auto"
         )
 
         print(f"✓ Model loaded: {self.model_name}")
@@ -70,35 +68,49 @@
         if self.model is None:
             self._load_model()
 
-        # Apply chat template
-        prompt = self.tokenizer.apply_chat_template(
-            conversation,
-            tokenize=False,
-            add_generation_prompt=True
+        # Format conversation for Qwen3-VL (text-only usage)
+        # Build prompt from conversation history
+        messages = []
+        for msg in conversation:
+            role = msg["role"]
+            content = msg["content"]
+
+            # Qwen3-VL expects specific format
+            messages.append({
+                "role": role,
+                "content": [{"type": "text", "text": content}]
+            })
+
+        # Apply chat template (following official example)
+        inputs = self.processor.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_dict=True,
+            return_tensors="pt"
+        )
+        inputs = inputs.to(self.model.device)
+
+        # Generate with ZeroGPU (following official example)
+        generated_ids = self.model.generate(
+            **inputs,
+            max_new_tokens=max_tokens
         )
 
-        # Tokenize
-        inputs = self.tokenizer(prompt, return_tensors="pt")
-        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
-
-        # Generate with ZeroGPU
-        with torch.no_grad():
-            outputs = self.model.generate(
-                **inputs,
-                max_new_tokens=max_tokens,
-                temperature=0.2,  # Low temperature for consistent diagrams
-                do_sample=False,  # Greedy decoding for deterministic output
-                pad_token_id=self.tokenizer.eos_token_id
-            )
-
-        # Decode response (skip input tokens)
-        input_length = inputs["input_ids"].shape[1]
-        response = self.tokenizer.decode(
-            outputs[0][input_length:],
-            skip_special_tokens=True
+        # Trim generated ids (remove input tokens)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):]
+            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+
+        # Decode response
+        output_text = self.processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
        )
 
-        return response.strip()
+        return output_text[0].strip()
 
     def cleanup_model(self):
         """Cleanup (managed by ZeroGPU)."""