LisaMegaWatts committed on
Commit
2fd3ad2
·
1 Parent(s): eb5e801

Add OpenAI-compatible API server for distil-home-assistant-functiongemma

Browse files
Files changed (3) hide show
  1. Dockerfile +16 -0
  2. app.py +110 -0
  3. requirements.txt +9 -0
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Slim Python base image; the server runs CPU inference unless CUDA is present.
FROM python:3.10-slim

# Create a non-root user (uid 1000) — required by Hugging Face Spaces.
RUN useradd -m -u 1000 user

WORKDIR /app

# Install dependencies before copying the app so this layer is cached
# across source-only changes.
COPY --chown=user requirements.txt .
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user app.py .

# Drop privileges and point the Hugging Face cache at a writable location.
USER user
ENV HOME=/home/user \
    HF_HOME=/home/user/.cache/huggingface

# Spaces routes external HTTP traffic to port 7860.
EXPOSE 7860
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ distil-home-assistant-functiongemma β€” OpenAI/OpenRouter-compatible inference server
3
+
4
+ Endpoints:
5
+ GET / -> health check / API info
6
+ GET /v1/models -> list available models
7
+ POST /v1/chat/completions -> generate text (OpenAI format)
8
+ """
9
+
10
+ import os
11
+ import time
12
+ import uuid
13
+
14
+ import torch
15
+ from fastapi import FastAPI, Request
16
+ from fastapi.middleware.cors import CORSMiddleware
17
+ from fastapi.responses import JSONResponse
18
+ from transformers import AutoModelForCausalLM, AutoTokenizer
19
+
20
+ # ─── Config ───────────────────────────────────────────────────────────────────
21
+
22
+ MODEL_ID = "LisaMegaWatts/distil-home-assistant-functiongemma"
23
+ MODEL_NAME = "functiongemma-270m"
24
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
25
+ DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
26
+
27
+ # ─── Load model at startup ────────────────────────────────────────────────────
28
+
29
+ print(f"Loading {MODEL_ID} on {DEVICE} ...")
30
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
31
+ model = AutoModelForCausalLM.from_pretrained(
32
+ MODEL_ID, torch_dtype=DTYPE, trust_remote_code=True
33
+ ).to(DEVICE)
34
+ model.eval()
35
+
36
+ if tokenizer.pad_token is None:
37
+ tokenizer.pad_token = tokenizer.eos_token
38
+
39
+ MODEL_CREATED_AT = int(time.time())
40
+ print(f"Model ready on {DEVICE}")
41
+
42
+ # ─── FastAPI app ──────────────────────────────────────────────────────────────
43
+
44
+ app = FastAPI(title=f"{MODEL_NAME} API", version="1.0.0")
45
+ app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
46
+
47
+
48
@app.get("/")
def root():
    """Health check: describe the service, its model, and available endpoints."""
    info = {
        "name": MODEL_NAME,
        "version": "1.0.0",
        "description": "FunctionGemma 270m distilled for home assistant function calling",
        "model": MODEL_ID,
        "endpoints": ["/v1/models", "/v1/chat/completions"],
        "compatible_with": ["OpenAI API", "OpenRouter"],
    }
    return info
58
+
59
+
60
@app.get("/v1/models")
def list_models():
    """OpenAI-compatible model listing: a single entry for the local model."""
    entry = {
        "id": MODEL_NAME,
        "object": "model",
        "created": MODEL_CREATED_AT,
        "owned_by": "LisaMegaWatts",
    }
    return {"object": "list", "data": [entry]}
66
+
67
+
68
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """OpenAI-compatible chat completion endpoint (non-streaming).

    Accepts the standard OpenAI request body fields (``messages``,
    ``temperature``, ``max_tokens``, ``top_p``, ``n``) and returns a
    ``chat.completion`` object.

    Fixes over the original version:
    - ``n`` is clamped to 1..8 (it was unbounded, letting a single request
      demand arbitrarily many generations).
    - ``top_p`` is clamped to (0, 1] (out-of-range values crash generate()).
    - Non-numeric parameters or a non-list ``messages`` now return a 400
      invalid_request_error instead of an unhandled 500.
    """
    try:
        body = await request.json()
    except Exception:
        return _error_response("Invalid JSON", "invalid_json")

    try:
        messages = body.get("messages", [])
        if not isinstance(messages, list):
            raise ValueError("'messages' must be a list")
        # Clamp every sampling knob so buggy or hostile clients cannot crash
        # generate() or request unbounded work.
        temperature = max(0.01, min(float(body.get("temperature", 0.7)), 2.0))
        max_tokens = max(1, min(int(body.get("max_tokens", 512)), 2048))
        top_p = max(0.01, min(float(body.get("top_p", 0.9)), 1.0))
        n_completions = max(1, min(int(body.get("n", 1)), 8))
    except (TypeError, ValueError) as exc:
        return _error_response(str(exc), "invalid_parameter")

    prompt = _build_prompt(messages)
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    prompt_tokens = inputs["input_ids"].shape[1]

    choices = []
    total_completion_tokens = 0
    for i in range(n_completions):
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                # Greedy decoding only at the clamp floor (requested temp <= 0.01).
                do_sample=temperature > 0.01,
                pad_token_id=tokenizer.pad_token_id,
            )
        # Strip the prompt: generate() returns prompt + continuation.
        new_tokens = outputs[0][prompt_tokens:]
        text = tokenizer.decode(new_tokens, skip_special_tokens=True)
        completion_tokens = len(new_tokens)
        total_completion_tokens += completion_tokens
        choices.append({
            "index": i,
            "message": {"role": "assistant", "content": text},
            # "length" signals the generation was cut off by the token budget.
            "finish_reason": "length" if completion_tokens >= max_tokens else "stop",
        })

    return {
        "id": f"chatcmpl-{uuid.uuid4()}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": MODEL_NAME,
        "choices": choices,
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": total_completion_tokens,
            "total_tokens": prompt_tokens + total_completion_tokens,
        },
        "system_fingerprint": f"{MODEL_NAME}-v1",
    }


def _error_response(message, code):
    """Build an OpenAI-style 400 invalid_request_error JSON response."""
    return JSONResponse(
        status_code=400,
        content={"error": {"message": message, "type": "invalid_request_error", "code": code}},
    )


def _build_prompt(messages):
    """Render chat messages into a single prompt string.

    Prefers the tokenizer's own chat template when one is defined; otherwise
    falls back to a plain "role: content" transcript ending in "assistant:".
    """
    if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template:
        return tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    parts = [f"{m.get('role','user')}: {m.get('content','')}" for m in messages]
    parts.append("assistant:")
    return "\n".join(parts)
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ torch
4
+ transformers
5
+ accelerate
6
+ safetensors
7
+ sentencepiece
8
+ protobuf
9
+ tokenizers