Spaces:

build-small-hackathon
/

HearthNet

Running on Zero

File size: 3,977 Bytes

"""Modal deployment script for HearthNet LLM inference.

Run once to deploy a serverless GPU endpoint on Modal:

    modal deploy scripts/modal_deploy.py

Then set MODAL_ENDPOINT in your HF Space / local .env to the printed chat
endpoint URL (the one whose label is ``hearthnet-llm-chat``).

Qualifies for: Modal Best Use Of Modal prize ($10k credits).
See docs: https://modal.com/docs/guide/webhooks
"""

from __future__ import annotations

# ── Requirements ──────────────────────────────────────────────────────────────
# pip install modal transformers torch accelerate fastapi

import modal

# ── Modal app definition ──────────────────────────────────────────────────────
app = modal.App("hearthnet-llm")

# Hugging Face Hub id of the model served by the endpoint.
MODEL_ID = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

# Build a container image with the required packages
image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install(
        "transformers>=4.40",
        "torch>=2.2",
        "accelerate>=0.30",
        "fastapi",
        "uvicorn",
        # Needed because HF_HUB_ENABLE_HF_TRANSFER is switched on below:
        # huggingface_hub releases that honour the flag refuse to download
        # when the `hf_transfer` package is not installed.
        "hf_transfer",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)


# Newer Modal clients expose the FastAPI endpoint decorator as
# ``fastapi_endpoint`` (``web_endpoint`` is the older name). Prefer the new
# name and fall back so the script deploys on either client generation.
_web_endpoint = getattr(modal, "fastapi_endpoint", None) or modal.web_endpoint

# Roles forwarded to the model; messages with any other role are ignored.
_CHAT_ROLES = ("system", "user", "assistant")
_DEFAULT_MAX_TOKENS = 512
_DEFAULT_TEMPERATURE = 0.7


def _message_text(content: object) -> str:
    """Flatten an OpenAI-style message ``content`` field into plain text.

    Accepts a plain string, ``None`` (treated as empty), or a list of content
    parts (strings or ``{"type": "text", "text": ...}`` dicts). Non-text parts
    are dropped; any other value is converted with ``str``.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        pieces = []
        for part in content:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict) and part.get("type", "text") == "text":
                pieces.append(str(part.get("text", "")))
        return "".join(pieces)
    return str(content)


@app.cls(
    gpu="T4",
    image=image,
    scaledown_window=300,
    timeout=300,
)
class HearthNetLLM:
    """Serverless GPU class exposing a health check and a chat endpoint."""

    @modal.enter()
    def load_model(self):
        """Load the text-generation pipeline once per container start."""
        # Imported here so the module can be imported locally (for
        # ``modal deploy``) without transformers installed.
        from transformers import pipeline

        self.pipe = pipeline(
            "text-generation",
            model=MODEL_ID,
            device_map="auto",
            torch_dtype="auto",
        )

    @_web_endpoint(method="GET", label="hearthnet-llm")
    def health(self) -> dict:
        """Return a static status payload naming the served model."""
        return {"status": "ok", "model": MODEL_ID}

    @_web_endpoint(method="POST", label="hearthnet-llm-chat")
    def chat_completions(self, request: dict) -> dict:
        """OpenAI-compatible /v1/chat/completions endpoint.

        Args:
            request: JSON body. Reads ``messages`` (list of
                ``{"role", "content"}`` dicts), ``max_tokens`` (default 512)
                and ``temperature`` (default 0.7). Missing or ``null`` values
                fall back to the defaults.

        Returns:
            A ``chat.completion`` dict with a single assistant choice and a
            ``usage`` section counted in tokenizer tokens.
        """
        import time
        import uuid

        # Keep only well-formed messages with a supported role, and normalise
        # ``content`` so list-of-parts / null payloads do not leak their
        # Python repr into the prompt.
        messages = [
            {
                "role": msg.get("role", "user"),
                "content": _message_text(msg.get("content")),
            }
            for msg in (request.get("messages") or [])
            if isinstance(msg, dict) and msg.get("role", "user") in _CHAT_ROLES
        ]

        max_tokens = request.get("max_tokens")
        if max_tokens is None:
            max_tokens = _DEFAULT_MAX_TOKENS
        # generate() needs at least one new token.
        max_tokens = max(1, int(max_tokens))

        temperature = request.get("temperature")
        if temperature is None:
            temperature = _DEFAULT_TEMPERATURE
        temperature = float(temperature)
        do_sample = temperature > 0

        tokenizer = self.pipe.tokenizer
        # Use the chat template shipped with the model instead of hand-written
        # role tags, so the prompt matches the format the model was tuned on.
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        generation_kwargs = {
            "max_new_tokens": max_tokens,
            "do_sample": do_sample,
            "return_full_text": False,
        }
        if do_sample:
            # Only forward temperature when sampling: a zero temperature is
            # not a valid sampling setting and is meaningless for greedy
            # decoding.
            generation_kwargs["temperature"] = temperature

        result = self.pipe(prompt, **generation_kwargs)
        text = result[0]["generated_text"]

        # Count real tokens rather than whitespace-separated words. The
        # completion is re-tokenised from the decoded text, so its count may
        # differ slightly from the number of tokens actually generated.
        prompt_tokens = len(tokenizer(prompt)["input_ids"])
        completion_tokens = len(
            tokenizer(text, add_special_tokens=False)["input_ids"]
        )
        # Heuristic: hitting the token budget means the reply was cut off.
        finish_reason = "length" if completion_tokens >= max_tokens else "stop"

        return {
            "id": f"chatcmpl-{uuid.uuid4().hex}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": MODEL_ID,
            "choices": [
                {
                    "index": 0,
                    "message": {"role": "assistant", "content": text},
                    "finish_reason": finish_reason,
                }
            ],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
        }


# ── Local entrypoint for testing ──────────────────────────────────────────────
@app.local_entrypoint()
def main():
    """Print the model id and the post-deployment configuration hints."""
    # One entry per output line; empty strings produce the blank lines.
    hint_lines = (
        "Deploying HearthNet LLM to Modal...",
        f"Model: {MODEL_ID}",
        "After deployment, set MODAL_ENDPOINT to the printed web endpoint URL",
        "Then add to HearthNet config.toml:",
        "",
        "  [[llm.backends]]",
        "  name = 'modal'",
        "  endpoint = 'https://YOUR-ORG--hearthnet-llm-chat.modal.run'",
        "",
    )
    print("\n".join(hint_lines))