---
license: mit
language:
- en
pipeline_tag: text-generation
---

<img src="training_plot.png" width="700">

---

# Talk with the model

- Paste this code into a Python file and run it (it assumes the `transformers` and `torch` packages are installed):
```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time

MODEL_NAME = "VortexIntelligence/VLM-1.1-K1-Preview"

# Load the tokenizer and register the chat-role markers as special tokens.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.add_special_tokens({'additional_special_tokens': ['<|system|>', '<|user|>', '<|assistant|>']})
tokenizer.eos_token = "<|endoftext|>"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # generate() needs a valid pad_token_id

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    low_cpu_mem_usage=True
)

# Resize the embedding matrix so the newly added special tokens have rows.
model.resize_token_embeddings(len(tokenizer))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def stream_response(user_input):
    # Build the chat prompt in the model's expected role format.
    system_prompt = "You are a helpful assistant."
    prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{user_input}\n<|assistant|>\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    input_len = inputs['input_ids'].shape[-1]
    max_new_tokens = 128

    start_time = time.time()

    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        repetition_penalty=2.1,
        temperature=0.7
    )

    # Keep only the newly generated tokens, not the echoed prompt.
    output_tokens = output[0][input_len:]
    generated_text = tokenizer.decode(output_tokens, skip_special_tokens=True)
    print(generated_text, end='', flush=True)

    # Report simple throughput statistics.
    end_time = time.time()
    duration = end_time - start_time
    total_tokens = len(output_tokens)
    tps = total_tokens / duration
    tpm = tps * 60

    print("\n" + "-" * 20)
    print(f"Time taken: {duration:.2f}s")
    print(f"Total tokens: {total_tokens}")
    print(f"Tokens/sec: {tps:.2f}")
    print(f"Tokens/min: {tpm:.2f}")


print("VLM 1.1 Chat - Type 'exit' to quit")
while True:
    user_input = input("User: ")
    if user_input.lower() == 'exit':
        print("Exiting chat. Goodbye!")
        break
    print("VLM: ", end="", flush=True)
    stream_response(user_input)
```
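
Note that `stream_response` above prints the reply only after `generate()` has returned, so long answers appear all at once. For token-by-token output, `transformers` provides a `TextStreamer` that prints tokens as they are sampled. A minimal sketch, reusing the `tokenizer`, `model`, and `inputs` from the script above with the same sampling settings:

```python
from transformers import TextStreamer

# Prints each token as soon as it is generated; skip_prompt hides the echoed input.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

output = model.generate(
    **inputs,
    max_new_tokens=128,
    do_sample=True,
    temperature=0.7,
    repetition_penalty=2.1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    streamer=streamer
)
```

With a streamer attached there is no need to decode and print the output manually; the throughput statistics can still be computed from `output` exactly as above.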