golyuval committed on
Commit
11f4a01
·
verified ·
1 Parent(s): 827f7d5

Upload 2 files

Browse files
Files changed (2) hide show
  1. handler.py +38 -0
  2. requirements.txt +7 -0
handler.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # handler.py
2
+ from typing import Any, Dict, List
3
+ import os
4
+ from unsloth import FastLanguageModel
5
+
6
+ class EndpointHandler:
7
+ def __init__(self, model_id: str):
8
+ # Called once at endpoint startup with your model repo ID/path
9
+ max_seq = int(os.getenv("MAX_SEQ_LENGTH", 1024))
10
+ self.model, self.tokenizer = FastLanguageModel.from_pretrained(
11
+ model_id,
12
+ max_seq_length = max_seq,
13
+ load_in_4bit = True,
14
+ )
15
+
16
+ def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
17
+ """
18
+ data: {"inputs": "<str>"} or {"inputs": ["<str>", ...]}
19
+ returns: [{"generated_text": "<str>"}, ...]
20
+ """
21
+ inputs = data.get("inputs", data)
22
+ if isinstance(inputs, str):
23
+ prompts = [inputs]
24
+ elif isinstance(inputs, list):
25
+ prompts = inputs
26
+ else:
27
+ raise ValueError(f"Unsupported inputs type: {type(inputs)}")
28
+
29
+ outputs: List[Dict[str, Any]] = []
30
+ for prompt in prompts:
31
+ # generate one response per prompt
32
+ out = self.model.generate(
33
+ prompt,
34
+ max_new_tokens = int(os.getenv("MAX_NEW_TOKENS", 64)),
35
+ pad_token_id = self.tokenizer.eos_token_id,
36
+ )
37
+ outputs.append({"generated_text": out})
38
+ return outputs
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # requirements.txt
2
+ unsloth>=2025.3.19
3
+ transformers>=4.51.3
4
+ torch>=2.6.0
5
+ bitsandbytes>=0.45.5
6
+ accelerate>=1.5.2
7
+ huggingface-hub>=0.30.2