Spaces:

Ashok75
/

react

Sleeping

App Files Files Community

Ashok75 committed on Mar 7

Commit

6b59904

verified ·

1 Parent(s): b921a13

Upload server_runtime.py

Browse files

Files changed (1) hide show

server_runtime.py +37 -11

server_runtime.py CHANGED Viewed

@@ -99,7 +99,16 @@ def _is_truthy(value: str) -> bool:
 def _format_sse_event(payload: Dict[str, Any]) -> str:
-    return f"data: {json.dumps(payload)}\n\n"
 def _detect_concurrency(device: str) -> int:
@@ -122,7 +131,8 @@ def _detect_concurrency(device: str) -> int:
         return 3
     cpu_count = os.cpu_count() or 1
-    return max(1, min(4, max(1, cpu_count // 2)))
 def create_hf_space_app(config: RuntimeConfig) -> FastAPI:
@@ -258,9 +268,9 @@ def create_hf_space_app(config: RuntimeConfig) -> FastAPI:
                     break
                 try:
-                    new_text = await asyncio.to_thread(next, stream_iter)
-                except StopIteration:
-                    break
                 except QueueEmpty:
                     if generation_done.is_set():
                         break
@@ -360,12 +370,28 @@ def create_hf_space_app(config: RuntimeConfig) -> FastAPI:
         if config.tokenizer_use_fast is not None:
             tokenizer_kwargs["use_fast"] = config.tokenizer_use_fast
         tokenizer = AutoTokenizer.from_pretrained(config.model_name, **tokenizer_kwargs)
-        model = AutoModelForCausalLM.from_pretrained(
-            config.model_name,
-            trust_remote_code=True,
-            torch_dtype="auto" if device == "cuda" else torch.float32,
-            device_map="auto" if device == "cuda" else None,
-        )
         if device != "cuda":
             model = model.to("cpu")

 def _format_sse_event(payload: Dict[str, Any]) -> str:
+    event_type = str(payload.get("type", "token"))
+    return f"event: {event_type}\ndata: {json.dumps(payload)}\n\n"
+def _read_stream_item(stream_iter) -> tuple[bool, Optional[str]]:
+    """Read one item from streamer iterator without leaking StopIteration across threads."""
+    try:
+        return False, next(stream_iter)
+    except StopIteration:
+        return True, None
 def _detect_concurrency(device: str) -> int:
         return 3
     cpu_count = os.cpu_count() or 1
+    # Conservative CPU default for large models; still within 1..4 range.
+    return max(1, min(4, max(1, cpu_count // 6)))
 def create_hf_space_app(config: RuntimeConfig) -> FastAPI:
                     break
                 try:
+                    stream_finished, new_text = await asyncio.to_thread(_read_stream_item, stream_iter)
+                    if stream_finished:
+                        break
                 except QueueEmpty:
                     if generation_done.is_set():
                         break
         if config.tokenizer_use_fast is not None:
             tokenizer_kwargs["use_fast"] = config.tokenizer_use_fast
         tokenizer = AutoTokenizer.from_pretrained(config.model_name, **tokenizer_kwargs)
+        model_load_kwargs: Dict[str, Any] = {
+            "trust_remote_code": True,
+            "device_map": "auto" if device == "cuda" else None,
+        }
+        if device == "cuda":
+            model_load_kwargs["dtype"] = "auto"
+        else:
+            model_load_kwargs["torch_dtype"] = torch.float32
+        try:
+            model = AutoModelForCausalLM.from_pretrained(
+                config.model_name,
+                **model_load_kwargs,
+            )
+        except TypeError:
+            # Backward compatibility for older transformers that do not accept `dtype`.
+            if "dtype" in model_load_kwargs:
+                model_load_kwargs["torch_dtype"] = model_load_kwargs.pop("dtype")
+            model = AutoModelForCausalLM.from_pretrained(
+                config.model_name,
+                **model_load_kwargs,
+            )
         if device != "cuda":
             model = model.to("cpu")