Spaces:

Ashok75
/

react

Sleeping

App Files Community

Ashok75 committed on Mar 7

Commit

e944423

verified ·

1 Parent(s): 6b59904

Upload server_runtime.py

Browse files

Files changed (1) hide show

server_runtime.py +43 -15

server_runtime.py CHANGED Viewed

@@ -145,6 +145,9 @@ def create_hf_space_app(config: RuntimeConfig) -> FastAPI:
     join_timeout = float(os.getenv("HF_GENERATION_JOIN_TIMEOUT_SECONDS", "180"))
     max_input_tokens = int(os.getenv("HF_MAX_INPUT_TOKENS", str(config.max_input_tokens)))
     max_new_tokens_limit = int(os.getenv("HF_MAX_NEW_TOKENS", str(config.max_new_tokens)))
     base_dir = os.path.dirname(os.path.abspath(__file__))
@@ -366,32 +369,57 @@ def create_hf_space_app(config: RuntimeConfig) -> FastAPI:
         nonlocal model, tokenizer, worker_tasks, max_workers, device
         logger.info("Loading model %s on %s", config.model_name, device)
-        tokenizer_kwargs: Dict[str, Any] = {"trust_remote_code": True}
         if config.tokenizer_use_fast is not None:
             tokenizer_kwargs["use_fast"] = config.tokenizer_use_fast
-        tokenizer = AutoTokenizer.from_pretrained(config.model_name, **tokenizer_kwargs)
         model_load_kwargs: Dict[str, Any] = {
             "trust_remote_code": True,
             "device_map": "auto" if device == "cuda" else None,
         }
         if device == "cuda":
             model_load_kwargs["dtype"] = "auto"
         else:
             model_load_kwargs["torch_dtype"] = torch.float32
-        try:
-            model = AutoModelForCausalLM.from_pretrained(
-                config.model_name,
-                **model_load_kwargs,
-            )
-        except TypeError:
-            # Backward compatibility for older transformers that do not accept `dtype`.
-            if "dtype" in model_load_kwargs:
-                model_load_kwargs["torch_dtype"] = model_load_kwargs.pop("dtype")
-            model = AutoModelForCausalLM.from_pretrained(
-                config.model_name,
-                **model_load_kwargs,
-            )
         if device != "cuda":
             model = model.to("cpu")

     join_timeout = float(os.getenv("HF_GENERATION_JOIN_TIMEOUT_SECONDS", "180"))
     max_input_tokens = int(os.getenv("HF_MAX_INPUT_TOKENS", str(config.max_input_tokens)))
     max_new_tokens_limit = int(os.getenv("HF_MAX_NEW_TOKENS", str(config.max_new_tokens)))
+    model_load_retries = max(1, int(os.getenv("HF_MODEL_LOAD_RETRIES", "4")))
+    model_load_retry_delay = max(1.0, float(os.getenv("HF_MODEL_LOAD_RETRY_DELAY_SECONDS", "8")))
+    local_files_only = _is_truthy(os.getenv("HF_LOCAL_FILES_ONLY", "0"))
     base_dir = os.path.dirname(os.path.abspath(__file__))
         nonlocal model, tokenizer, worker_tasks, max_workers, device
         logger.info("Loading model %s on %s", config.model_name, device)
+        tokenizer_kwargs: Dict[str, Any] = {
+            "trust_remote_code": True,
+            "local_files_only": local_files_only,
+        }
         if config.tokenizer_use_fast is not None:
             tokenizer_kwargs["use_fast"] = config.tokenizer_use_fast
         model_load_kwargs: Dict[str, Any] = {
             "trust_remote_code": True,
             "device_map": "auto" if device == "cuda" else None,
+            "local_files_only": local_files_only,
         }
         if device == "cuda":
             model_load_kwargs["dtype"] = "auto"
         else:
             model_load_kwargs["torch_dtype"] = torch.float32
+        last_load_error: Optional[Exception] = None
+        for attempt in range(1, model_load_retries + 1):
+            try:
+                tokenizer = AutoTokenizer.from_pretrained(config.model_name, **tokenizer_kwargs)
+                try:
+                    model = AutoModelForCausalLM.from_pretrained(
+                        config.model_name,
+                        **model_load_kwargs,
+                    )
+                except TypeError:
+                    # Backward compatibility for older transformers that do not accept `dtype`.
+                    if "dtype" in model_load_kwargs:
+                        model_load_kwargs["torch_dtype"] = model_load_kwargs.pop("dtype")
+                    model = AutoModelForCausalLM.from_pretrained(
+                        config.model_name,
+                        **model_load_kwargs,
+                    )
+                break
+            except Exception as exc:
+                last_load_error = exc
+                logger.warning(
+                    "Model load attempt %d/%d failed: %s",
+                    attempt,
+                    model_load_retries,
+                    str(exc),
+                )
+                if attempt < model_load_retries:
+                    await asyncio.sleep(model_load_retry_delay)
+                else:
+                    logger.error(
+                        "Model loading failed after %d attempts (local_files_only=%s)",
+                        model_load_retries,
+                        str(local_files_only),
+                    )
+                    raise last_load_error
         if device != "cuda":
             model = model.to("cpu")