LisaMegaWatts committed on
Commit
0a315c6
·
verified ·
1 Parent(s): 7759445

Upload server.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. server.py +532 -0
server.py ADDED
@@ -0,0 +1,532 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ server.py — OpenAI-compatible inference server for JuliaSLM-compressed-svd
3
+
4
+ Serves the SVD-90 compressed JuliaSLM model (4.81M params, ~4.5% smaller).
5
+ Downloads checkpoint and tokenizer from HuggingFace on first run.
6
+
7
+ SVD compression: each linear layer W ≈ A @ B (low-rank factorization),
8
+ reducing parameter count while preserving model quality.
9
+
10
+ Endpoints:
11
+ GET / -> health check / API info
12
+ GET /v1/models -> list available models
13
+ POST /v1/chat/completions -> generate text (OpenAI format, streaming supported)
14
+ """
15
+
16
+ import json
17
+ import os
18
+ import regex
19
+ import time
20
+ import uuid
21
+ from http.server import HTTPServer, BaseHTTPRequestHandler
22
+ from threading import Lock
23
+
24
+ import torch
25
+ import torch.nn.functional as F
26
+ from huggingface_hub import hf_hub_download
27
+
28
+ from juliaslm_svd_model import SVDConfig, JuliaSLM_SVD
29
+
30
+ # ═══════════════════════════════════════════════════════════════════
31
+ # Configuration
32
+ # ═══════════════════════════════════════════════════════════════════
33
+
34
# Runtime settings. The two repo ids, the checkpoint file name and the port
# are read from environment variables of the same name, falling back to the
# defaults below; the cache directory and the public model id are fixed.
HF_MODEL_REPO = os.getenv("HF_MODEL_REPO", "LisaMegaWatts/JuliaSLM-compressed-svd")
HF_TOKENIZER_REPO = os.getenv("HF_TOKENIZER_REPO", "LisaMegaWatts/JuliaSLM")
CHECKPOINT_NAME = os.getenv("CHECKPOINT_NAME", "svd_SVD-90_best.pt")
PORT = int(os.getenv("PORT", "7860"))  # raises ValueError if PORT is not an integer
CKPT_DIR = "checkpoints"
MODEL_ID = "juliaslm-compressed-svd-90"
40
+
41
+ # ═══════════════════════════════════════════════════════════════════
42
+ # BPE Tokenizer (vocab.json + merges.txt)
43
+ # ═══════════════════════════════════════════════════════════════════
44
+
45
# Pre-tokenisation pattern: splits text into English contraction suffixes,
# optionally space-prefixed runs of letters, digits or other non-space
# symbols, and runs of whitespace. Requires the third-party ``regex`` module
# because of the \p{L} / \p{N} Unicode property classes.
_GPT2_SPLIT_REGEX = r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
GPT2_PATTERN = regex.compile(_GPT2_SPLIT_REGEX, flags=regex.UNICODE)
49
+
50
+
51
+ def _build_byte_to_unicode():
52
+ bs = list(range(0x21, 0x7F)) + list(range(0xA1, 0xAD)) + list(range(0xAE, 0x100))
53
+ cs = list(bs)
54
+ n = 0
55
+ for b in range(256):
56
+ if b not in bs:
57
+ bs.append(b)
58
+ cs.append(256 + n)
59
+ n += 1
60
+ return {b: chr(c) for b, c in zip(bs, cs)}
61
+
62
+
63
# Forward table (byte value -> character) and its inverse (character -> byte
# value); the inverse is what decode() uses to recover raw bytes.
BYTE_TO_UNICODE = _build_byte_to_unicode()
UNICODE_TO_BYTE = {char: byte for byte, char in BYTE_TO_UNICODE.items()}
65
+
66
+
67
class BPETokenizer:
    """Byte-level BPE tokenizer loaded from ``vocab.json`` and ``merges.txt``.

    Attributes:
        vocab: token string -> id, as read from the vocab file.
        id_to_token: inverse of ``vocab``.
        merges: merge pairs in file order.
        merge_rank: merge pair -> priority (lower merges first).
        cache: memo of pre-token string -> list of ids, bounded by
            ``MAX_CACHE_ENTRIES``.
    """

    # encode() is fed request text, so an unbounded memo would keep growing
    # for the lifetime of the server. Once full, new words are simply
    # re-tokenised on each call instead of being stored.
    MAX_CACHE_ENTRIES = 65536

    def __init__(self, vocab_path: str, merges_path: str):
        """Load the vocabulary (JSON) and the merge list (one pair per line)."""
        with open(vocab_path, "r", encoding="utf-8") as f:
            self.vocab = json.load(f)
        self.id_to_token = {v: k for k, v in self.vocab.items()}

        self.merges = []
        self.merge_rank = {}
        with open(merges_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # NOTE(review): this skips every line starting with "#", not
                # only a "#version" header; a merge whose first symbol starts
                # with "#" would be dropped too. Left unchanged so token ids
                # stay as they are today - confirm against the training
                # tokenizer before altering.
                if not line or line.startswith("#"):
                    continue
                parts = line.split()
                if len(parts) == 2:
                    pair = (parts[0], parts[1])
                    self.merges.append(pair)
                    self.merge_rank[pair] = len(self.merge_rank)

        self.cache = {}

    def _bpe_word(self, chars: list[str]) -> list[str]:
        """Apply merges to one pre-token, always taking the best-ranked pair.

        Args:
            chars: the pre-token as a list of single-symbol strings.

        Returns:
            The list of merged symbols; the input list is not modified.
        """
        tokens = list(chars)
        unranked = float("inf")
        rank_of = self.merge_rank.get
        while len(tokens) >= 2:
            # min() keeps the first pair among equals, i.e. the left-most
            # occurrence of the best-ranked merge.
            best_pair = min(
                zip(tokens, tokens[1:]),
                key=lambda pair: rank_of(pair, unranked),
            )
            if best_pair not in self.merge_rank:
                break  # no adjacent pair is mergeable any more
            a, b = best_pair
            merged = []
            i = 0
            while i < len(tokens):
                if i < len(tokens) - 1 and tokens[i] == a and tokens[i + 1] == b:
                    merged.append(a + b)
                    i += 2
                else:
                    merged.append(tokens[i])
                    i += 1
            tokens = merged
        return tokens

    def encode(self, text: str) -> list[int]:
        """Convert ``text`` to token ids.

        The text is split with ``GPT2_PATTERN``; each piece is mapped to its
        UTF-8 bytes, through ``BYTE_TO_UNICODE``, and merged with BPE.
        Symbols that are not in the vocabulary are silently dropped.
        """
        ids = []
        for m in GPT2_PATTERN.finditer(text):
            word = m.group()
            cached = self.cache.get(word)
            if cached is not None:
                ids.extend(cached)
                continue
            chars = [BYTE_TO_UNICODE[b] for b in word.encode("utf-8")]
            tokens = self._bpe_word(chars)
            word_ids = [self.vocab[t] for t in tokens if t in self.vocab]
            if len(self.cache) < self.MAX_CACHE_ENTRIES:
                self.cache[word] = word_ids
            ids.extend(word_ids)
        return ids

    def decode(self, ids: list[int]) -> str:
        """Convert token ids back to text.

        Unknown ids and characters missing from ``UNICODE_TO_BYTE`` are
        skipped; invalid UTF-8 sequences become U+FFFD.
        """
        text = "".join(self.id_to_token.get(i, "") for i in ids)
        byte_vals = [UNICODE_TO_BYTE[c] for c in text if c in UNICODE_TO_BYTE]
        return bytes(byte_vals).decode("utf-8", errors="replace")
132
+
133
+
134
+ # ═══════════════════════════════════════════════════════════════════
135
+ # Sampling helpers
136
+ # ═══════════════════════════════════════════════════════════════════
137
+
138
+
139
+ def _sample_logits(logits: torch.Tensor, temperature: float, top_k: int,
140
+ top_p: float, vocab_size: int) -> int:
141
+ if temperature <= 0:
142
+ return logits.argmax().item()
143
+
144
+ logits = logits / temperature
145
+
146
+ if 0 < top_k < vocab_size:
147
+ topk_vals, _ = torch.topk(logits, top_k)
148
+ logits[logits < topk_vals[-1]] = float("-inf")
149
+
150
+ if top_p < 1.0:
151
+ sorted_logits, sorted_idx = torch.sort(logits, descending=True)
152
+ cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
153
+ remove = cum_probs - F.softmax(sorted_logits, dim=-1) >= top_p
154
+ sorted_logits[remove] = float("-inf")
155
+ logits = sorted_logits.scatter(0, sorted_idx, sorted_logits)
156
+
157
+ probs = F.softmax(logits, dim=-1)
158
+ return torch.multinomial(probs, 1).item()
159
+
160
+
161
+ # ═══════════════════════════════════════════════════════════════════
162
+ # Text generation with KV cache
163
+ # ═══════════════════════════════════════════════════════════════════
164
+
165
+
166
@torch.inference_mode()
def generate(
    model: JuliaSLM_SVD,
    tokenizer: BPETokenizer,
    prompt: str,
    max_tokens: int = 200,
    temperature: float = 0.8,
    top_k: int = 40,
    top_p: float = 1.0,
) -> tuple[str, int]:
    """Generate a completion for ``prompt`` in one shot.

    The prompt is encoded and truncated to its last ``context_length``
    tokens. Tokens are then sampled one at a time, reusing the key/value
    caches returned by the model, until ``max_tokens`` tokens were produced
    or the context window is full.

    Args:
        model: called as ``model(x)`` / ``model(x, kv_caches)`` and expected
            to return ``(logits, kv_caches)``.
        tokenizer: provides ``encode`` and ``decode``.
        prompt: text to continue.
        max_tokens: upper bound on generated tokens.
        temperature, top_k, top_p: forwarded to ``_sample_logits``.

    Returns:
        ``(text, prompt_len)`` where ``prompt_len`` is the token count of the
        full prompt before truncation. ``text`` is empty when the prompt
        encodes to no tokens or already fills the context window.
    """
    config = model.config
    input_ids = tokenizer.encode(prompt)
    prompt_len = len(input_ids)
    ids = input_ids[-config.context_length:]

    # How many tokens still fit; same bound the per-step check used to apply.
    budget = min(max_tokens, config.context_length - len(ids))
    if not ids or budget <= 0:
        # An empty prompt would give a (1, 0) input with no last position to
        # read logits from; with no budget the prefill would be wasted.
        return "", prompt_len

    x = torch.tensor([ids], dtype=torch.long, device=DEVICE)
    logits, kv_caches = model(x)
    next_logits = logits[0, -1, :].float()

    generated_ids = []
    for step in range(budget):
        idx = _sample_logits(next_logits, temperature, top_k, top_p, config.vocab_size)
        generated_ids.append(idx)
        if step + 1 == budget:
            break  # logits after the final token would never be used

        x = torch.tensor([[idx]], dtype=torch.long, device=DEVICE)
        logits, kv_caches = model(x, kv_caches)
        next_logits = logits[0, -1, :].float()

    return tokenizer.decode(generated_ids), prompt_len
201
+
202
+
203
@torch.inference_mode()
def generate_streaming(
    model: JuliaSLM_SVD,
    tokenizer: BPETokenizer,
    prompt: str,
    max_tokens: int = 200,
    temperature: float = 0.8,
    top_k: int = 40,
    top_p: float = 1.0,
):
    """Generate a completion for ``prompt``, yielding once per sampled token.

    Yields:
        ``(text_piece, prompt_len)`` tuples, exactly one per generated token.
        ``prompt_len`` is the token count of the full prompt before
        truncation. ``text_piece`` may be empty: when a token ends in the
        middle of a multi-byte UTF-8 character its bytes are held back and
        emitted together with the following token(s), so clients do not
        receive U+FFFD for characters split across tokens.

    Nothing is yielded when the prompt encodes to no tokens or already fills
    the context window.
    """
    config = model.config
    input_ids = tokenizer.encode(prompt)
    prompt_len = len(input_ids)
    ids = input_ids[-config.context_length:]

    # How many tokens still fit; same bound the per-step check used to apply.
    budget = min(max_tokens, config.context_length - len(ids))
    if not ids or budget <= 0:
        # An empty prompt would give a (1, 0) input with no last position to
        # read logits from; with no budget the prefill would be wasted.
        return

    x = torch.tensor([ids], dtype=torch.long, device=DEVICE)
    logits, kv_caches = model(x)
    next_logits = logits[0, -1, :].float()

    pending = []  # token ids whose bytes do not yet form complete characters
    for step in range(budget):
        idx = _sample_logits(next_logits, temperature, top_k, top_p, config.vocab_size)
        is_last = step + 1 == budget

        pending.append(idx)
        piece = tokenizer.decode(pending)
        # A UTF-8 character is at most 4 bytes, so never hold back more than
        # a few tokens; on the last step flush whatever is left.
        if piece.endswith("\ufffd") and not is_last and len(pending) < 4:
            piece = ""
        else:
            pending = []

        yield piece, prompt_len

        if is_last:
            break  # logits after the final token would never be used

        x = torch.tensor([[idx]], dtype=torch.long, device=DEVICE)
        logits, kv_caches = model(x, kv_caches)
        next_logits = logits[0, -1, :].float()
236
+
237
+
238
+ # ═══════════════════════════════════════════════════════════════════
239
+ # Download artifacts from HuggingFace
240
+ # ═══════════════════════════════════════════════════════════════════
241
+
242
+
243
def ensure_artifacts():
    """Make sure the checkpoint and tokenizer files exist under ``CKPT_DIR``.

    Files already present are reused; missing ones are fetched with
    ``hf_hub_download`` (checkpoint from ``HF_MODEL_REPO``, ``vocab.json``
    and ``merges.txt`` from ``HF_TOKENIZER_REPO``).

    Returns:
        dict with keys ``"checkpoint"``, ``"vocab.json"`` and
        ``"merges.txt"`` mapping to local file paths.
    """
    os.makedirs(CKPT_DIR, exist_ok=True)

    def _fetch(repo_id, filename):
        # Download ``filename`` from ``repo_id`` unless it is already cached.
        target = os.path.join(CKPT_DIR, filename)
        if not os.path.isfile(target):
            print(f"Downloading {filename} from {repo_id} ...")
            hf_hub_download(repo_id=repo_id, filename=filename, local_dir=CKPT_DIR)
            size_mb = os.path.getsize(target) / (1024 * 1024)
            print(f" -> {target} ({size_mb:.1f} MB)")
        return target

    # Model checkpoint first, then the two tokenizer files.
    paths = {"checkpoint": _fetch(HF_MODEL_REPO, CHECKPOINT_NAME)}
    for tokenizer_file in ("vocab.json", "merges.txt"):
        paths[tokenizer_file] = _fetch(HF_TOKENIZER_REPO, tokenizer_file)
    return paths
267
+
268
+
269
+ # ═══════════════════════════════════════════════════════════════════
270
+ # Load model
271
+ # ═══════════════════════════════════════════════════════════════════
272
+
273
# Module-level start-up: fetch artifacts, build the model and tokenizer once,
# and expose them as globals used by the request handler.
print("Downloading artifacts...")
ARTIFACT_PATHS = ensure_artifacts()

print("\nLoading SVD-compressed model...")
state_dict = torch.load(ARTIFACT_PATHS["checkpoint"], map_location="cpu", weights_only=True)

# Build config from checkpoint (auto-detects ranks per layer)
CONFIG = SVDConfig.from_checkpoint(state_dict)
MODEL = JuliaSLM_SVD(CONFIG)
# strict=False tolerates key mismatches; report them instead of hiding them
# so a wrong or partial checkpoint is visible in the logs.
_load_result = MODEL.load_state_dict(state_dict, strict=False)
if _load_result.missing_keys:
    print(f"WARNING: checkpoint is missing keys: {list(_load_result.missing_keys)}")
if _load_result.unexpected_keys:
    print(f"WARNING: checkpoint has unexpected keys: {list(_load_result.unexpected_keys)}")
MODEL.eval()
DEVICE = torch.device("cpu")

print("Loading tokenizer...")
TOKENIZER = BPETokenizer(
    ARTIFACT_PATHS["vocab.json"],
    ARTIFACT_PATHS["merges.txt"],
)

MODEL_CREATED_AT = int(time.time())
NUM_PARAMS = MODEL.num_parameters
print(
    f"\nSVD-compressed model ready: vocab={CONFIG.vocab_size}, d_model={CONFIG.d_model}, "
    f"layers={CONFIG.n_layers}, heads={CONFIG.n_heads}, "
    f"ctx={CONFIG.context_length}, params={NUM_PARAMS:,}"
)
print("SVD-90 compression: ~4.5% parameter reduction")
print("KV cache enabled: O(1) per-token decoding")

# Held by the handler for the whole duration of a generation request.
MODEL_LOCK = Lock()
303
+
304
+ # ═══════════════════════════════════════════════════════════════════
305
+ # HTTP helpers
306
+ # ═══════════════════════════════════════════════════════════════════
307
+
308
# CORS response headers attached to every JSON, SSE and OPTIONS response:
# any origin, the three methods the server implements, and the request
# headers that API clients typically send.
CORS_HEADERS = dict([
    ("Access-Control-Allow-Origin", "*"),
    ("Access-Control-Allow-Methods", "GET, POST, OPTIONS"),
    ("Access-Control-Allow-Headers", "Content-Type, Authorization"),
])
313
+
314
+
315
def _content_to_text(content):
    """Flatten a message ``content`` value into plain text.

    Strings pass through, ``None`` becomes ``""``, and a list of content
    parts is reduced to its string items and ``"text"`` fields joined by
    newlines. Anything else is converted with ``str()``.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        pieces = []
        for part in content:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                pieces.append(part["text"])
        return "\n".join(pieces)
    return str(content)


def extract_prompt(messages):
    """Return the prompt text for a chat request.

    Uses the content of the most recent message whose role is ``"user"``;
    if there is none, falls back to the content of the last message.

    Args:
        messages: the request's ``messages`` value. It comes straight from
            client JSON, so entries that are not dicts are ignored and a
            value that is not a non-empty list yields ``""``.

    Returns:
        Always a ``str`` (possibly empty), so the tokenizer never receives
        ``None`` or a list of content parts.
    """
    if not isinstance(messages, (list, tuple)) or not messages:
        return ""
    for msg in reversed(messages):
        if isinstance(msg, dict) and msg.get("role") == "user":
            return _content_to_text(msg.get("content", ""))
    last = messages[-1]
    if isinstance(last, dict):
        return _content_to_text(last.get("content", ""))
    return ""
322
+
323
+
324
+ # ═══════════════════════════════════════════════════════════════════
325
+ # Request handler
326
+ # ═══════════════════════════════════════════════════════════════════
327
+
328
+
329
class Handler(BaseHTTPRequestHandler):
    """HTTP handler exposing an OpenAI-style API.

    Routes:
        GET  /                     -> service and model description
        GET  /v1/models            -> model list
        POST /v1/chat/completions  -> completion, optionally streamed as SSE
        OPTIONS *                  -> CORS preflight

    Errors are returned as ``{"error": {"message", "type", "code"}}``.
    """

    def log_message(self, format, *args):
        """Send access-log lines to stdout with a timestamp prefix."""
        print(f"[{self.log_date_time_string()}] {format % args}")

    def _send_json(self, status, body):
        """Serialise ``body`` as JSON and send it with CORS headers."""
        data = json.dumps(body).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        for k, v in CORS_HEADERS.items():
            self.send_header(k, v)
        self.send_header("Content-Length", str(len(data)))
        self.end_headers()
        self.wfile.write(data)

    def _send_api_error(self, status, message, code):
        """Send an error payload in the shape used by every error response."""
        self._send_json(status, {"error": {
            "message": message,
            "type": "invalid_request_error",
            "code": code,
        }})

    @staticmethod
    def _clamped(value, default, lo, hi, cast):
        """Convert ``value`` with ``cast`` and clamp it into ``[lo, hi]``.

        ``None`` (an absent or JSON ``null`` field) selects ``default``.
        Raises whatever ``cast`` raises for an unusable value.
        """
        if value is None:
            value = default
        return max(lo, min(hi, cast(value)))

    def do_OPTIONS(self):
        """Answer CORS preflight requests with 204 and the CORS headers."""
        self.send_response(204)
        for k, v in CORS_HEADERS.items():
            self.send_header(k, v)
        self.end_headers()

    def do_GET(self):
        """Serve the info document and the model list; 404 otherwise."""
        if self.path == "/":
            self._send_json(200, {
                "name": "JuliaSLM-compressed-svd",
                "version": "1.0.0",
                "description": "SVD-compressed JuliaSLM — low-rank factorized weight matrices for efficient inference",
                "architecture": "MHA + RoPE + SwiGLU + RMSNorm + weight tying + SVD compression",
                "compression": {
                    "method": "SVD-90",
                    "original_params": 5_040_000,
                    "compressed_params": NUM_PARAMS,
                    "reduction_pct": round((1 - NUM_PARAMS / 5_040_000) * 100, 1),
                    "val_loss": 3.756,
                    "original_val_loss": 3.552,
                },
                "model": {
                    "vocab_size": CONFIG.vocab_size,
                    "d_model": CONFIG.d_model,
                    "n_layers": CONFIG.n_layers,
                    "n_heads": CONFIG.n_heads,
                    "context_length": CONFIG.context_length,
                    "parameters": NUM_PARAMS,
                },
                "endpoints": ["/v1/models", "/v1/chat/completions"],
                "features": ["streaming", "OpenAI-compatible", "top-k", "top-p", "kv-cache"],
                "compatible_with": ["OpenAI API", "OpenRouter"],
            })
        elif self.path == "/v1/models":
            self._send_json(200, {
                "object": "list",
                "data": [{
                    "id": MODEL_ID,
                    "object": "model",
                    "created": MODEL_CREATED_AT,
                    "owned_by": "juliaslm",
                }],
            })
        else:
            self._send_api_error(404, f"Not found: GET {self.path}", "not_found")

    def do_POST(self):
        """Validate a chat-completion request and dispatch generation.

        Malformed input (bad Content-Length, invalid JSON, a non-object
        body, unusable sampling parameters, no prompt text) is answered
        with HTTP 400 rather than an unhandled exception.
        """
        if self.path != "/v1/chat/completions":
            self._send_api_error(404, f"Not found: POST {self.path}", "not_found")
            return

        try:
            # Parsed inside the try so a non-numeric header is a 400 too; a
            # negative length would make read() wait for EOF.
            content_length = max(0, int(self.headers.get("Content-Length", 0)))
            body = json.loads(self.rfile.read(content_length))
        except (json.JSONDecodeError, ValueError):
            self._send_api_error(400, "Invalid JSON in request body", "invalid_json")
            return

        if not isinstance(body, dict):
            self._send_api_error(400, "Request body must be a JSON object", "invalid_request")
            return

        try:
            temperature = self._clamped(body.get("temperature"), 0.8, 0.0, 2.0, float)
            max_tokens = self._clamped(body.get("max_tokens"), 200, 1, CONFIG.context_length, int)
            top_k_val = self._clamped(body.get("top_k"), 40, 0, CONFIG.vocab_size, int)
            top_p_val = self._clamped(body.get("top_p"), 1.0, 0.0, 1.0, float)
        except (TypeError, ValueError, OverflowError):
            self._send_api_error(
                400,
                "temperature, max_tokens, top_k and top_p must be numbers",
                "invalid_parameter",
            )
            return
        stream = bool(body.get("stream", False))

        try:
            prompt_text = extract_prompt(body.get("messages", []))
        except (AttributeError, TypeError):
            prompt_text = None  # 'messages' had an unexpected shape
        if not isinstance(prompt_text, str) or not prompt_text:
            self._send_api_error(
                400,
                "'messages' must be a list containing a message with non-empty text content",
                "invalid_request",
            )
            return

        completion_id = f"chatcmpl-{uuid.uuid4()}"
        created = int(time.time())

        # One generation at a time: the lock is held until the response
        # (including a full stream) has been written.
        with MODEL_LOCK:
            if stream:
                self._handle_stream(
                    prompt_text, max_tokens, temperature, top_k_val, top_p_val,
                    completion_id, created,
                )
            else:
                self._handle_non_stream(
                    prompt_text, max_tokens, temperature, top_k_val, top_p_val,
                    completion_id, created,
                )

    def _handle_stream(self, prompt_text, max_tokens, temperature, top_k, top_p,
                       completion_id, created):
        """Stream the completion as server-sent events.

        Sends a role chunk, one content chunk per generated token, a final
        chunk carrying ``finish_reason`` and ``usage``, then ``[DONE]``.
        """
        self.send_response(200)
        self.send_header("Content-Type", "text/event-stream")
        self.send_header("Cache-Control", "no-cache")
        self.send_header("X-Accel-Buffering", "no")
        for k, v in CORS_HEADERS.items():
            self.send_header(k, v)
        self.end_headers()

        def sse(data):
            self.wfile.write(f"data: {json.dumps(data)}\n\n".encode())
            self.wfile.flush()

        def chunk(delta, finish_reason=None):
            # Envelope shared by every event of this completion.
            return {
                "id": completion_id,
                "object": "chat.completion.chunk",
                "created": created,
                "model": MODEL_ID,
                "choices": [{"index": 0, "delta": delta, "finish_reason": finish_reason}],
            }

        token_count = 0
        prompt_tokens = 0
        try:
            sse(chunk({"role": "assistant", "content": ""}))

            for token_str, p_len in generate_streaming(
                MODEL, TOKENIZER, prompt_text,
                max_tokens=max_tokens, temperature=temperature,
                top_k=top_k, top_p=top_p,
            ):
                token_count += 1
                prompt_tokens = p_len
                sse(chunk({"content": token_str}))

            final = chunk({}, "length" if token_count >= max_tokens else "stop")
            final["usage"] = {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": token_count,
                "total_tokens": prompt_tokens + token_count,
            }
            sse(final)
            self.wfile.write(b"data: [DONE]\n\n")
            self.wfile.flush()
        except (BrokenPipeError, ConnectionResetError):
            # The client went away mid-stream; stop generating quietly.
            self.log_message("%s", "client disconnected during stream")

    def _handle_non_stream(self, prompt_text, max_tokens, temperature, top_k, top_p,
                           completion_id, created):
        """Generate the whole completion and return it as one JSON document."""
        text, prompt_tokens = generate(
            MODEL, TOKENIZER, prompt_text,
            max_tokens=max_tokens, temperature=temperature,
            top_k=top_k, top_p=top_p,
        )
        # NOTE(review): generate() returns text only, so the token count is
        # recovered by re-encoding. That can differ from the number of tokens
        # actually sampled, which also affects finish_reason.
        completion_tokens = len(TOKENIZER.encode(text))
        finish_reason = "length" if completion_tokens >= max_tokens else "stop"

        self._send_json(200, {
            "id": completion_id,
            "object": "chat.completion",
            "created": created,
            "model": MODEL_ID,
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": text},
                "finish_reason": finish_reason,
            }],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
            "system_fingerprint": "juliaslm-svd90-v1",
        })
517
+
518
+
519
+ # ═══════════════════════════════════════════════════════════════════
520
+ # Start server
521
+ # ═══════════════════════════════════════════════════════════════════
522
+
523
if __name__ == "__main__":
    print(f"\nJuliaSLM-compressed-svd server starting on 0.0.0.0:{PORT} ...")
    print(f" GET http://localhost:{PORT}/")
    print(f" GET http://localhost:{PORT}/v1/models")
    print(f" POST http://localhost:{PORT}/v1/chat/completions")
    print(f" POST http://localhost:{PORT}/v1/chat/completions (stream=true)")
    print()

    # The context manager closes the listening socket on any exit path.
    with HTTPServer(("0.0.0.0", PORT), Handler) as server:
        try:
            server.serve_forever()
        except KeyboardInterrupt:
            # Ctrl-C is the normal way to stop the server; exit without a traceback.
            print("\nShutting down ...")