File size: 3,977 Bytes
31d4f9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495b06b
31d4f9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""Modal deployment script for HearthNet LLM inference.

Run once to deploy a serverless GPU endpoint on Modal:

    modal deploy scripts/modal_deploy.py

Then set MODAL_ENDPOINT in your HF Space / local .env to the printed URL.

Qualifies for: Modal Best Use Of Modal prize ($10k credits).
See docs: https://modal.com/docs/guide/webhooks
"""

from __future__ import annotations

# ── Requirements ──────────────────────────────────────────────────────────────
# pip install modal transformers torch accelerate fastapi

import modal

# ── Modal app definition ──────────────────────────────────────────────────────
app = modal.App("hearthnet-llm")

MODEL_ID = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

# Build a container image with the required packages.
# `hf_transfer` is installed because HF_HUB_ENABLE_HF_TRANSFER=1 is exported
# below: huggingface_hub versions that honour the flag refuse to download
# when the flag is set but the package is missing, which would make the
# model load fail at container start.
image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install(
        "transformers>=4.40",
        "torch>=2.2",
        "accelerate>=0.30",
        "fastapi",
        "uvicorn",
        "hf_transfer",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)


@app.cls(
    gpu="T4",
    image=image,
    scaledown_window=300,
    timeout=300,
)
class HearthNetLLM:
    """Modal class that serves ``MODEL_ID`` through two web endpoints.

    The text-generation pipeline is built once per container in
    ``load_model`` and reused by ``chat_completions``. ``health`` is a
    lightweight probe that reports the configured model id.
    """

    # NOTE(review): newer Modal releases rename `modal.web_endpoint` to
    # `modal.fastapi_endpoint` — confirm against the Modal version in use
    # before upgrading.

    @modal.enter()
    def load_model(self):
        """Create the transformers text-generation pipeline for MODEL_ID."""
        # Imported lazily so the module can be imported where transformers
        # is not installed (it is only pip-installed into the image).
        from transformers import pipeline

        self.pipe = pipeline(
            "text-generation",
            model=MODEL_ID,
            device_map="auto",
            torch_dtype="auto",
        )

    @modal.web_endpoint(method="GET", label="hearthnet-llm")
    def health(self) -> dict:
        """Return a static status payload naming the served model."""
        return {"status": "ok", "model": MODEL_ID}

    @modal.web_endpoint(method="POST", label="hearthnet-llm-chat")
    def chat_completions(self, request: dict) -> dict:
        """OpenAI-compatible /v1/chat/completions endpoint.

        Args:
            request: Parsed JSON body. Reads ``messages`` (list of
                ``{"role", "content"}`` dicts), ``max_tokens`` (default 512)
                and ``temperature`` (default 0.7). A missing or ``null``
                value falls back to the default.

        Returns:
            A ``chat.completion``-shaped dict with a single assistant
            choice and token usage counted with the model's tokenizer.
        """
        import time
        import uuid

        messages = request.get("messages") or []
        max_tokens = request.get("max_tokens")
        if max_tokens is None:
            max_tokens = 512
        temperature = request.get("temperature")
        if temperature is None:
            temperature = 0.7

        # Render the conversation with the model's own chat template rather
        # than hand-written role markers, so the prompt matches the format
        # the instruct model was trained on.
        tokenizer = self.pipe.tokenizer
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        generation_kwargs = {
            "max_new_tokens": max_tokens,
            "return_full_text": False,
        }
        if temperature > 0:
            generation_kwargs["do_sample"] = True
            generation_kwargs["temperature"] = temperature
        else:
            # Greedy decoding: do not forward a non-positive temperature,
            # which is meaningless (and warned about) when not sampling.
            generation_kwargs["do_sample"] = False

        result = self.pipe(prompt, **generation_kwargs)
        text = result[0]["generated_text"]

        # Count real tokens instead of whitespace-separated words. The
        # rendered prompt already contains its special tokens as text.
        prompt_tokens = len(tokenizer.encode(prompt, add_special_tokens=False))
        completion_tokens = len(tokenizer.encode(text, add_special_tokens=False))

        return {
            # Unique per response so clients can tell completions apart.
            "id": f"chatcmpl-{uuid.uuid4().hex}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": MODEL_ID,
            "choices": [
                {
                    "index": 0,
                    "message": {"role": "assistant", "content": text},
                    "finish_reason": "stop",
                }
            ],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
        }


# ── Local entrypoint for testing ──────────────────────────────────────────────
@app.local_entrypoint()
def main():
    """Print the model id and the config snippet to add after deployment."""
    # Each entry is emitted as its own line; empty entries are blank lines.
    banner = (
        "Deploying HearthNet LLM to Modal...",
        f"Model: {MODEL_ID}",
        "After deployment, set MODAL_ENDPOINT to the printed web endpoint URL",
        "Then add to HearthNet config.toml:",
        "",
        "  [[llm.backends]]",
        "  name = 'modal'",
        "  endpoint = 'https://YOUR-ORG--hearthnet-llm-chat.modal.run'",
        "",
    )
    for line in banner:
        print(line)