File size: 2,121 Bytes
7bedc14 2edcd8e 7bedc14 2edcd8e 7bedc14 cf32d24 7bedc14 8477440 7bedc14 937a475 7bedc14 937a475 7bedc14 cf32d24 7bedc14 cf32d24 7bedc14 cf32d24 7bedc14 cf32d24 937a475 36f9321 937a475 36f9321 937a475 7bedc14 cf32d24 937a475 cf32d24 937a475 cf32d24 7bedc14 cf32d24 7bedc14 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
---
license: mit
language:
- en
pipeline_tag: text-generation
---
<img src="training_plot.png" width="700">
---
# Talk with the model:
- Paste this code into your Python file:
```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time
# Hugging Face model repository to download and load.
MODEL_NAME = "VortexIntelligence/VLM-1.1-K1-Preview"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Register the chat-role markers as special tokens so each tokenizes atomically.
tokenizer.add_special_tokens({'additional_special_tokens': ['<|system|>', '<|user|>', '<|assistant|>']})
tokenizer.eos_token = "<|endoftext|>"
# device_map="auto" lets the accelerate backend place weights (GPU when available).
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
device_map="auto",
low_cpu_mem_usage=True
)
# Grow the embedding matrix to cover the special tokens added above.
model.resize_token_embeddings(len(tokenizer))
# Device the tokenized inputs are moved to; presumably matches where
# device_map="auto" placed the model — NOTE(review): with multi-device
# placement these could disagree, confirm on the target machine.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def stream_response(user_input):
    """Generate and print a reply to *user_input*, then print throughput stats.

    Wraps the input in the <|system|>/<|user|>/<|assistant|> chat template,
    samples up to 128 new tokens, prints only the newly generated text, and
    reports elapsed time, token count, tokens/sec and tokens/min.
    Relies on the module-level ``tokenizer``, ``model`` and ``device``.
    """
    system_prompt = "You are a helpful assistant."
    prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{user_input}\n<|assistant|>\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    input_len = inputs['input_ids'].shape[-1]
    max_new_tokens = 128
    # GPT-style tokenizers often define no pad token; fall back to EOS so
    # generate() pads correctly instead of warning and guessing.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id
    start_time = time.time()
    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=pad_id,
        repetition_penalty=2.1,
        temperature=0.7
    )
    # Strip the prompt tokens so only the model's reply is decoded/printed.
    output_tokens = output[0][input_len:]
    generated_text = tokenizer.decode(output_tokens, skip_special_tokens=True)
    print(generated_text, end='', flush=True)
    end_time = time.time()
    duration = end_time - start_time
    total_tokens = len(output_tokens)
    # Guard against a zero-length interval (degenerate timer resolution).
    tps = total_tokens / duration if duration > 0 else 0.0
    tpm = tps * 60
    print("\n" + "-"*20)
    print(f"Time taken: {duration:.2f}s")
    print(f"Total tokens: {total_tokens}")
    print(f"Tokens/sec: {tps:.2f}")
    print(f"Tokens/min: {tpm:.2f}")
# Simple REPL: read a line, print the model's reply, repeat until the user exits.
print("VLM 1.1 Chat - Type 'exit' to quit")
while True:
    try:
        user_input = input("User: ")
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C ends the chat cleanly instead of with a traceback.
        print("\nExiting chat. Goodbye!")
        break
    # strip() so "exit " or " EXIT" also quits.
    if user_input.strip().lower() == 'exit':
        print("Exiting chat. Goodbye!")
        break
    print("VLM: ", end="", flush=True)
    stream_response(user_input)
```