fixed 16Gi error
Browse files- model_manager.py +42 -10
model_manager.py
CHANGED
|
@@ -22,27 +22,45 @@ def ensure_model_loaded():
|
|
| 22 |
hf_token = os.getenv("HF_TOKEN")
|
| 23 |
|
| 24 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
if hf_token:
|
|
|
|
| 26 |
style_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 27 |
model_id,
|
| 28 |
-
|
| 29 |
-
torch_dtype=torch.float32,
|
| 30 |
-
device_map="auto",
|
| 31 |
-
low_cpu_mem_usage=True
|
| 32 |
)
|
| 33 |
style_processor = AutoProcessor.from_pretrained(model_id, token=hf_token)
|
| 34 |
else:
|
| 35 |
style_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 36 |
model_id,
|
| 37 |
-
|
| 38 |
-
device_map="auto",
|
| 39 |
-
low_cpu_mem_usage=True
|
| 40 |
)
|
| 41 |
style_processor = AutoProcessor.from_pretrained(model_id)
|
| 42 |
|
| 43 |
-
print(f"Loaded {model_id}")
|
| 44 |
except Exception as e:
|
| 45 |
print(f"Error loading model: {e}")
|
|
|
|
|
|
|
| 46 |
raise
|
| 47 |
|
| 48 |
def generate_chat_response(prompt: str, max_length: int = 512, temperature: float = 0.7, rag_context: Optional[str] = None, system_override: Optional[str] = None, images: Optional[List[str]] = None) -> str:
|
|
@@ -80,7 +98,14 @@ def generate_chat_response(prompt: str, max_length: int = 512, temperature: floa
|
|
| 80 |
return_tensors="pt",
|
| 81 |
)
|
| 82 |
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
temperature = max(0.1, min(1.5, temperature))
|
| 86 |
|
|
@@ -163,7 +188,14 @@ async def generate_chat_response_streaming(prompt: str, max_length: int = 512, t
|
|
| 163 |
return_tensors="pt",
|
| 164 |
)
|
| 165 |
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
temperature = max(0.1, min(1.5, temperature))
|
| 169 |
|
|
|
|
hf_token = os.getenv("HF_TOKEN")

try:
    if torch.cuda.is_available():
        # Prefer bf16 when the GPU supports it (wider dynamic range than fp16).
        dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        # Cap GPU usage below the 16Gi limit; leave headroom for activations.
        max_memory = {0: "14GiB", "cpu": "2GiB"}
        offload_folder = None
    else:
        # NOTE(review): fp16 on CPU halves RAM (the point of this fix), but many
        # PyTorch CPU kernels lack Half support — confirm generation actually
        # runs on a CPU-only host, else fall back to float32.
        dtype = torch.float16
        max_memory = {"cpu": "14GiB"}
        # Spill layers that exceed max_memory to disk via accelerate.
        offload_folder = "/tmp/model_offload"
        os.makedirs(offload_folder, exist_ok=True)

    load_kwargs = {
        "torch_dtype": dtype,
        "device_map": "auto",
        "low_cpu_mem_usage": True,
        "max_memory": max_memory,
    }
    if offload_folder:
        load_kwargs["offload_folder"] = offload_folder
    if hf_token:
        load_kwargs["token"] = hf_token

    # The original branched on hf_token with two byte-identical from_pretrained
    # calls (the token is already folded into load_kwargs above) — one call
    # covers both cases; only the processor still needs the explicit token.
    style_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id,
        **load_kwargs,
    )
    processor_kwargs = {"token": hf_token} if hf_token else {}
    style_processor = AutoProcessor.from_pretrained(model_id, **processor_kwargs)

    print(f"Loaded {model_id} with dtype={dtype}, device_map=auto")
except Exception as e:
    print(f"Error loading model: {e}")
    import traceback
    traceback.print_exc()
    raise
|
| 65 |
|
| 66 |
def generate_chat_response(prompt: str, max_length: int = 512, temperature: float = 0.7, rag_context: Optional[str] = None, system_override: Optional[str] = None, images: Optional[List[str]] = None) -> str:
|
|
|
|
| 98 |
return_tensors="pt",
|
| 99 |
)
|
| 100 |
|
| 101 |
# Move tokenized inputs onto the model's device.  With device_map="auto" the
# model may be sharded across devices; transformers' PreTrainedModel exposes a
# .device property (first parameter's device), so that branch normally wins.
if hasattr(style_model, 'device'):
    device = style_model.device
elif getattr(style_model, 'hf_device_map', None):
    # hf_device_map maps module names -> placements; accelerate may place
    # modules on "disk", which is NOT a valid torch device — picking it with
    # next(iter(...)) would crash .to().  Take the first non-disk placement.
    device = next(
        (d for d in style_model.hf_device_map.values() if d != "disk"),
        torch.device("cpu"),
    )
else:
    device = torch.device("cpu")

inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 109 |
|
| 110 |
temperature = max(0.1, min(1.5, temperature))
|
| 111 |
|
|
|
|
| 188 |
return_tensors="pt",
|
| 189 |
)
|
| 190 |
|
| 191 |
# Move tokenized inputs onto the model's device (streaming path).  With
# device_map="auto" the model may be sharded; transformers' PreTrainedModel
# exposes a .device property, so that branch normally wins.
if hasattr(style_model, 'device'):
    device = style_model.device
elif getattr(style_model, 'hf_device_map', None):
    # hf_device_map values may include "disk" (accelerate offload), which is
    # not a valid torch device and would crash .to() — skip such entries.
    device = next(
        (d for d in style_model.hf_device_map.values() if d != "disk"),
        torch.device("cpu"),
    )
else:
    device = torch.device("cpu")

inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 199 |
|
| 200 |
temperature = max(0.1, min(1.5, temperature))
|
| 201 |
|