"""
Hugging Face Space server for Nanbeige/Nanbeige4.1-3B.

This file uses the shared runtime with:
- async queue buffering
- worker pool + semaphore concurrency
- safe per-request generation thread lifecycle
"""

# Prefer the package-relative import; if that raises ImportError (for example
# when this file is executed directly rather than imported as part of a
# package), fall back to importing the sibling module by its top-level name.
try:
    from .server_runtime import RuntimeConfig, create_hf_space_app
except ImportError:
    from server_runtime import RuntimeConfig, create_hf_space_app


MODEL_NAME = "Nanbeige/Nanbeige4.1-3B"

# Module-level application object produced by the shared runtime factory.
# It is created at import time so that a server can be pointed at
# ``<module>:app``; the ``__main__`` guard below serves the same object.
app = create_hf_space_app(
    RuntimeConfig(
        model_name=MODEL_NAME,
        title="Nanbeige4.1-3B Inference API",
        description="Streaming chat completion API for Nanbeige4.1-3B",
        max_input_tokens=32768,
        # NOTE(review): model-specific token id; confirm it matches the
        # tokenizer shipped with MODEL_NAME whenever the model is changed.
        eos_token_id=166101,
        default_temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.0,
        tokenizer_use_fast=False,
        logger_name=__name__,
    )
)


if __name__ == "__main__":
    # Imported here so uvicorn is only required when this file is run
    # directly, not when ``app`` is imported by another process.
    import uvicorn

    # Listen on all interfaces, port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)