File size: 2,121 Bytes
7bedc14
 
 
 
 
 
 
 
 
 
 
 
2edcd8e
7bedc14
2edcd8e
7bedc14
 
cf32d24
7bedc14
8477440
7bedc14
937a475
7bedc14
 
937a475
 
 
 
 
 
 
7bedc14
cf32d24
7bedc14
 
cf32d24
7bedc14
 
cf32d24
 
 
 
7bedc14
cf32d24
 
937a475
 
 
36f9321
937a475
 
36f9321
 
937a475
 
 
 
 
7bedc14
cf32d24
 
937a475
cf32d24
 
 
937a475
cf32d24
 
 
 
7bedc14
 
 
 
 
 
 
cf32d24
 
7bedc14
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
---
license: mit
language:
- en
pipeline_tag: text-generation
---

<img src="training_plot.png" width="700">

---

# Talk with the model:
- Paste the following code into a Python file:
  
```python
# Third-party dependencies: Hugging Face transformers + PyTorch.
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time

# Hugging Face Hub repo id of the checkpoint to load.
MODEL_NAME = "VortexIntelligence/VLM-1.1-K1-Preview"

# Register the chat-role markers as special tokens so each stays a single
# token, and declare the end-of-text marker as the EOS token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.add_special_tokens({'additional_special_tokens': ['<|system|>', '<|user|>', '<|assistant|>']})
tokenizer.eos_token = "<|endoftext|>"

# device_map="auto" lets accelerate place the weights (GPU when available);
# low_cpu_mem_usage avoids materializing a full fp32 copy in RAM first.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    low_cpu_mem_usage=True
)

# Must run AFTER add_special_tokens above: grows the embedding matrix so the
# newly registered token ids have rows.
model.resize_token_embeddings(len(tokenizer))

# Used later to move tokenized inputs; assumes the model ended up on this
# same device — NOTE(review): confirm when device_map shards across devices.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def stream_response(user_input):
    """Generate and print the model's reply to *user_input*.

    Builds a chat-style prompt using the <|system|>/<|user|>/<|assistant|>
    markers, runs a single ``model.generate`` call, prints the decoded
    completion, then prints simple throughput statistics.

    NOTE: despite the name, this does not stream token-by-token — the full
    completion is printed once generation finishes.
    """
    system_prompt = "You are a helpful assistant."
    prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{user_input}\n<|assistant|>\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Prompt length in tokens — used below to slice off the echoed prompt.
    input_len = inputs['input_ids'].shape[-1]
    max_new_tokens = 128

    start_time = time.time()

    # Fall back to EOS as the pad id: many causal-LM tokenizers define no
    # pad token (this script never sets one), and generate() warns or fails
    # when pad_token_id is None.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=pad_id,
        repetition_penalty=2.1,
        temperature=0.7
    )

    # Keep only the newly generated tokens; drop the echoed prompt.
    output_tokens = output[0][input_len:]
    generated_text = tokenizer.decode(output_tokens, skip_special_tokens=True)
    print(generated_text, end='', flush=True)

    end_time = time.time()
    duration = end_time - start_time
    total_tokens = len(output_tokens)
    # Guard against a zero-length timing interval (would divide by zero).
    tps = total_tokens / duration if duration > 0 else float('inf')
    tpm = tps * 60

    print("\n" + "-"*20)
    print(f"Time taken: {duration:.2f}s")
    print(f"Total tokens: {total_tokens}")
    print(f"Tokens/sec: {tps:.2f}")
    print(f"Tokens/min: {tpm:.2f}")

# Simple REPL: read a line, dispatch to the model, repeat until 'exit'.
print("VLM 1.1 Chat - Type 'exit' to quit")
chatting = True
while chatting:
    user_input = input("User: ")
    if user_input.lower() == 'exit':
        print("Exiting chat. Goodbye!")
        chatting = False
    else:
        print("VLM: ", end="", flush=True)
        stream_response(user_input)
```