Goekdeniz-Guelmez committed on
Commit
c4a5c4d
·
verified ·
1 Parent(s): 0c243da

Upload inference.py

Browse files
Files changed (1) hide show
  1. inference.py +47 -0
inference.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Smoke-test script: load a local LongcatFlash checkpoint and generate text.

The script reads ``config.json`` and ``model.safetensors`` from ``model_dir``,
builds the model from the project-local modeling code found in that same
directory, loads the tokenizer from it, and prints a short non-sampled
continuation of the prompt "Lorem".
"""

import json
import os
import sys

import torch
from safetensors.torch import load_file
from transformers import AutoTokenizer

model_dir = "/Users/Goekdeniz.Guelmez@computacenter.com/Library/CloudStorage/OneDrive-COMPUTACENTER/Desktop/mlx-lm/dev"

# The modeling/configuration modules live next to the checkpoint, so the
# directory has to be importable *before* the two imports below run.
sys.path.append(model_dir)

from modeling_longcat_flash import LongcatFlashForCausalLM  # noqa: E402
from configuration_longcat_flash import LongcatFlashConfig  # noqa: E402

# Load model
# JSON is UTF-8 by specification; do not rely on the platform default encoding.
with open(os.path.join(model_dir, "config.json"), 'r', encoding="utf-8") as f:
    config_dict = json.load(f)

config = LongcatFlashConfig(**config_dict)
model = LongcatFlashForCausalLM(config)

# Load weights
state_dict = load_file(os.path.join(model_dir, "model.safetensors"))
model.load_state_dict(state_dict)

# A module built through its constructor starts in training mode; switch to
# eval mode so any dropout-style layers are inactive during generation.
model.eval()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_dir)
if tokenizer.pad_token is None:
    # Fall back to the EOS token so padding and `pad_token_id` below are defined.
    tokenizer.pad_token = tokenizer.eos_token

# Test with "Lorem"
text = "Lorem"
inputs = tokenizer(text, return_tensors="pt", padding=True, return_attention_mask=True)

print(f"Input: {text}")

# Generate
# Gradients are not needed for inference; skip tracking them.
with torch.no_grad():
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=10,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
    )

# Decode the first (and only) sequence in the batch.
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print(f"Generated: {generated_text}")