itsjorigo committed on
Commit
256cc53
·
verified ·
1 Parent(s): 3c5d4d3

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +13 -2
handler.py CHANGED
@@ -1,10 +1,17 @@
1
  from transformers import AutoTokenizer, AutoModelForCausalLM
2
  import torch
3
 
 
 
4
  class EndpointHandler:
5
  def __init__(self, path=""):
6
- print(f"Loading tokenizer from {path}...")
7
- self.tokenizer = AutoTokenizer.from_pretrained(path)
 
 
 
 
 
8
  if self.tokenizer.pad_token is None:
9
  self.tokenizer.pad_token = self.tokenizer.eos_token
10
 
@@ -13,7 +20,11 @@ class EndpointHandler:
13
  path,
14
  torch_dtype = torch.float16,
15
  device_map = "auto",
 
16
  )
 
 
 
17
  self.model.eval()
18
  print("Model ready!")
19
 
 
1
  from transformers import AutoTokenizer, AutoModelForCausalLM
2
  import torch
3
 
4
+ TOKENIZER_NAME = "polyglots/Extended-Sinhala-LLaMA"
5
+
6
  class EndpointHandler:
7
  def __init__(self, path=""):
8
+ print(f"Loading extended Sinhala tokenizer from {TOKENIZER_NAME}...")
9
+ # Must load from the original extended tokenizer repo, NOT from path
10
+ # because the model folder doesn't contain the full custom tokenizer
11
+ self.tokenizer = AutoTokenizer.from_pretrained(
12
+ TOKENIZER_NAME,
13
+ trust_remote_code=True,
14
+ )
15
  if self.tokenizer.pad_token is None:
16
  self.tokenizer.pad_token = self.tokenizer.eos_token
17
 
 
20
  path,
21
  torch_dtype = torch.float16,
22
  device_map = "auto",
23
+ trust_remote_code = True,
24
  )
25
+ # Resize to match extended vocab (139,336 tokens)
26
+ self.model.resize_token_embeddings(len(self.tokenizer))
27
+ self.model.config.pad_token_id = self.tokenizer.eos_token_id
28
  self.model.eval()
29
  print("Model ready!")
30