""" Hugging Face Space server for Nanbeige/Nanbeige4.1-3B. This file uses the shared runtime with: - async queue buffering - worker pool + semaphore concurrency - safe per-request generation thread lifecycle """ try: from .server_runtime import RuntimeConfig, create_hf_space_app except ImportError: # pragma: no cover - direct script execution from server_runtime import RuntimeConfig, create_hf_space_app MODEL_NAME = "Nanbeige/Nanbeige4.1-3B" app = create_hf_space_app( RuntimeConfig( model_name=MODEL_NAME, title="Nanbeige4.1-3B Inference API", description="Streaming chat completion API for Nanbeige4.1-3B", max_input_tokens=32768, eos_token_id=166101, default_temperature=0.6, top_p=0.95, repetition_penalty=1.0, tokenizer_use_fast=False, logger_name=__name__, ) ) if __name__ == "__main__": import uvicorn uvicorn.run(app, host="0.0.0.0", port=7860)