Spaces:
Running on Zero
Running on Zero
File size: 3,977 Bytes
"""Modal deployment script for HearthNet LLM inference.
Run once to deploy a serverless GPU endpoint on Modal:
modal deploy scripts/modal_deploy.py
Then set MODAL_ENDPOINT in your HF Space / local .env to the printed URL.
Qualifies for: Modal Best Use Of Modal prize ($10k credits).
See docs: https://modal.com/docs/guide/webhooks
"""
from __future__ import annotations
# ── Requirements ──────────────────────────────────────────────────────────────
# pip install modal transformers torch accelerate fastapi
import modal
# ── Modal app definition ──────────────────────────────────────────────────────
# Modal application that groups everything deployed from this script.
app = modal.App("hearthnet-llm")

# Hugging Face Hub repository of the instruction-tuned model that is served.
MODEL_ID = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

# Build a container image with the required packages
image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install(
        "transformers>=4.40",
        "torch>=2.2",
        "accelerate>=0.30",
        "fastapi",
        "uvicorn",
        # HF_HUB_ENABLE_HF_TRANSFER=1 (set below) only selects the accelerated
        # downloader; the hf_transfer package itself must be installed
        # separately, otherwise Hub releases that honour the flag refuse to
        # download model files.
        "hf_transfer",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)
@app.cls(
    gpu="T4",
    image=image,
    scaledown_window=300,
    timeout=300,
)
class HearthNetLLM:
    """Text-generation service for ``MODEL_ID`` exposed through two web endpoints.

    ``load_model`` builds the transformers pipeline when a container starts;
    ``health`` and ``chat_completions`` are the HTTP entry points.
    """

    # NOTE(review): recent Modal releases appear to rename ``web_endpoint`` to
    # ``fastapi_endpoint`` -- confirm against the Modal version used to deploy.

    @modal.enter()
    def load_model(self):
        """Create the text-generation pipeline and keep it on ``self.pipe``."""
        # Imported lazily: transformers is installed in the container image and
        # is not needed at module import time.
        from transformers import pipeline

        self.pipe = pipeline(
            "text-generation",
            model=MODEL_ID,
            device_map="auto",
            torch_dtype="auto",
        )

    @modal.web_endpoint(method="GET", label="hearthnet-llm")
    def health(self) -> dict:
        """Return a static status payload naming the served model."""
        return {"status": "ok", "model": MODEL_ID}

    @modal.web_endpoint(method="POST", label="hearthnet-llm-chat")
    def chat_completions(self, request: dict) -> dict:
        """OpenAI-compatible chat completion endpoint.

        Served as the POST endpoint labelled ``hearthnet-llm-chat``.

        Args:
            request: Parsed JSON body. Recognised keys:
                ``messages`` -- list of ``{"role", "content"}`` dicts; only the
                roles ``system``, ``user`` and ``assistant`` are used, any
                other role is skipped.
                ``max_tokens`` -- maximum number of new tokens (default 512).
                ``temperature`` -- sampling temperature (default 0.7); a value
                of 0 or below selects greedy decoding.

        Returns:
            A ``chat.completion`` shaped dict with a single assistant choice
            and a ``usage`` section holding tokenizer-based token counts.
        """
        # ``or`` / ``is None`` guards: a JSON ``null`` falls back to the default
        # exactly like a missing key does.
        messages = request.get("messages") or []
        max_tokens = request.get("max_tokens")
        if max_tokens is None:
            max_tokens = 512
        temperature = request.get("temperature")
        if temperature is None:
            temperature = 0.7

        # Keep only the roles this endpoint understands and make sure every
        # content value is a string before it reaches the template.
        conversation = []
        for msg in messages:
            role = msg.get("role", "user")
            if role not in ("system", "user", "assistant"):
                continue
            content = msg.get("content")
            conversation.append(
                {"role": role, "content": "" if content is None else str(content)}
            )

        # Render the prompt with the chat template shipped with the model's
        # tokenizer instead of hand-written role markers, so the prompt matches
        # the format the instruct model was tuned on.
        tokenizer = self.pipe.tokenizer
        prompt = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True,
        )

        sampling = temperature > 0
        generation_kwargs = {
            "max_new_tokens": max_tokens,
            "do_sample": sampling,
            "return_full_text": False,
        }
        if sampling:
            # Temperature only applies when sampling; passing it together with
            # do_sample=False is flagged by transformers as an unused setting.
            generation_kwargs["temperature"] = temperature

        result = self.pipe(prompt, **generation_kwargs)
        text = result[0]["generated_text"]

        # Report real token counts rather than whitespace-separated words.
        prompt_tokens = len(tokenizer.encode(prompt))
        completion_tokens = len(tokenizer.encode(text, add_special_tokens=False))

        return {
            "id": "modal-chat-1",
            "object": "chat.completion",
            "model": MODEL_ID,
            "choices": [
                {
                    "index": 0,
                    "message": {"role": "assistant", "content": text},
                    "finish_reason": "stop",
                }
            ],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
        }
# ── Local entrypoint for testing ──────────────────────────────────────────────
@app.local_entrypoint()
def main():
    """Print the model id and the follow-up configuration steps."""
    # Every entry is written as one output line; empty entries are blank lines.
    hints = (
        "Deploying HearthNet LLM to Modal...",
        f"Model: {MODEL_ID}",
        "After deployment, set MODAL_ENDPOINT to the printed web endpoint URL",
        "Then add to HearthNet config.toml:",
        "",
        " [[llm.backends]]",
        " name = 'modal'",
        " endpoint = 'https://YOUR-ORG--hearthnet-llm-chat.modal.run'",
        "",
    )
    for hint in hints:
        print(hint)
|