PingVortex committed on
Commit
cf32d24
·
verified ·
1 Parent(s): 8477440

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +42 -18
README.md CHANGED
@@ -15,6 +15,7 @@ pipeline_tag: text-generation
15
  ```python
16
  from transformers import AutoTokenizer, AutoModelForCausalLM
17
  import torch
 
18
 
19
  MODEL_NAME = "VortexIntelligence/VLM-1.1-K1-Preview"
20
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
@@ -23,26 +24,52 @@ model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
23
  tokenizer.add_special_tokens({'additional_special_tokens': ['<|system|>', '<|user|>', '<|assistant|>']})
24
  tokenizer.eos_token = "<|endoftext|>"
25
  model.resize_token_embeddings(len(tokenizer))
 
26
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
27
  model = model.to(device)
28
 
29
-
30
- def generate_response(user_input):
31
  system_prompt = "You are a helpful assistant."
32
  prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{user_input}\n<|assistant|>\n"
33
- inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(model.device)
 
 
 
34
 
35
- outputs = model.generate(
36
- **inputs,
37
- max_new_tokens=128,
38
- do_sample=False,
39
- eos_token_id=tokenizer.eos_token_id,
40
- pad_token_id=tokenizer.pad_token_id,
41
- repetition_penalty=2.1
42
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- generated_text = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
45
- return generated_text.strip()
 
 
 
 
 
 
 
 
 
 
46
 
47
  print("VLM 1.1 Chat - Type 'exit' to quit")
48
  while True:
@@ -50,9 +77,6 @@ while True:
50
  if user_input.lower() == 'exit':
51
  print("Exiting chat. Goodbye!")
52
  break
53
-
54
- assistant_response = generate_response(user_input)
55
-
56
- print(f"VLM: {assistant_response}")
57
-
58
  ```
 
15
  ```python
16
  from transformers import AutoTokenizer, AutoModelForCausalLM
17
  import torch
18
+ import time
19
 
20
  MODEL_NAME = "VortexIntelligence/VLM-1.1-K1-Preview"
21
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 
24
  tokenizer.add_special_tokens({'additional_special_tokens': ['<|system|>', '<|user|>', '<|assistant|>']})
25
  tokenizer.eos_token = "<|endoftext|>"
26
  model.resize_token_embeddings(len(tokenizer))
27
+
28
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
29
  model = model.to(device)
30
 
31
+ def stream_response(user_input):
 
32
  system_prompt = "You are a helpful assistant."
33
  prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{user_input}\n<|assistant|>\n"
34
+ inputs = tokenizer(prompt, return_tensors="pt").to(device)
35
+
36
+ input_len = inputs['input_ids'].shape[-1]
37
+ max_new_tokens = 128
38
 
39
+ start_time = time.time()
40
+
41
+ streamer = []
42
+ for i in range(1, max_new_tokens + 1):
43
+ output = model.generate(
44
+ **inputs,
45
+ max_new_tokens=i,
46
+ do_sample=False,
47
+ eos_token_id=tokenizer.eos_token_id,
48
+ pad_token_id=tokenizer.pad_token_id,
49
+ repetition_penalty=2.1
50
+ )
51
+ output_tokens = output[0][input_len:]
52
+ generated_text = tokenizer.decode(output_tokens, skip_special_tokens=True)
53
+
54
+ if len(streamer) < len(generated_text):
55
+ new_chunk = generated_text[len(streamer):]
56
+ print(new_chunk, end='', flush=True)
57
+ streamer += new_chunk
58
+ if tokenizer.eos_token in tokenizer.decode(output[0]):
59
+ break
60
 
61
+ end_time = time.time()
62
+ duration = end_time - start_time
63
+ total_tokens = len(streamer)
64
+ tps = total_tokens / duration
65
+ tpm = tps * 60
66
+
67
+ print("\n")
68
+ print("-"*20)
69
+ print(f"Time taken: {duration:.2f}s")
70
+ print(f"Total tokens: {total_tokens}")
71
+ print(f"Tokens/sec: {tps:.2f}")
72
+ print(f"Tokens/min: {tpm:.2f}")
73
 
74
  print("VLM 1.1 Chat - Type 'exit' to quit")
75
  while True:
 
77
  if user_input.lower() == 'exit':
78
  print("Exiting chat. Goodbye!")
79
  break
80
+ print("VLM: ", end="", flush=True)
81
+ stream_response(user_input)
 
 
 
82
  ```