---
license: mit
language:
- en
pipeline_tag: text-generation
---

# Talk with the model:

- Paste this code in your Python file:

```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time

MODEL_NAME = "VortexIntelligence/VLM-1.1-K1-Preview"

# Load the tokenizer and register the chat role tokens used by the prompt format.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.add_special_tokens({'additional_special_tokens': ['<|system|>', '<|user|>', '<|assistant|>']})
tokenizer.eos_token = "<|endoftext|>"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # avoids a warning when generate() needs a pad token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    low_cpu_mem_usage=True
)
# Resize the embedding matrix so the newly added special tokens have entries.
model.resize_token_embeddings(len(tokenizer))

def stream_response(user_input):
    system_prompt = "You are a helpful assistant."
    prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{user_input}\n<|assistant|>\n"
    # With device_map="auto", move inputs to the device the model was placed on.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_len = inputs['input_ids'].shape[-1]
    max_new_tokens = 128

    start_time = time.time()
    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        repetition_penalty=2.1,
        temperature=0.7
    )

    # Decode only the newly generated tokens, not the echoed prompt.
    output_tokens = output[0][input_len:]
    generated_text = tokenizer.decode(output_tokens, skip_special_tokens=True)
    print(generated_text, end='', flush=True)

    end_time = time.time()
    duration = end_time - start_time
    total_tokens = len(output_tokens)
    tps = total_tokens / duration
    tpm = tps * 60

    print("\n" + "-" * 20)
    print(f"Time taken: {duration:.2f}s")
    print(f"Total tokens: {total_tokens}")
    print(f"Tokens/sec: {tps:.2f}")
    print(f"Tokens/min: {tpm:.2f}")

print("VLM 1.1 Chat - Type 'exit' to quit")
while True:
    user_input = input("User: ")
    if user_input.lower() == 'exit':
        print("Exiting chat. Goodbye!")
        break
    print("VLM: ", end="", flush=True)
    stream_response(user_input)
```
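
The script above decodes and prints the full response only after generation finishes. If you want true token-by-token output, `transformers` provides a `TextStreamer` that can be passed to `generate`. This is a minimal sketch, assuming the `tokenizer`, `model`, and `inputs` objects from the script above:

```python
from transformers import TextStreamer

# Prints each token to stdout as it is generated; skip_prompt hides the echoed
# prompt and skip_special_tokens is forwarded to the decode step.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

output = model.generate(
    **inputs,
    max_new_tokens=128,
    do_sample=True,
    temperature=0.7,
    repetition_penalty=2.1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    streamer=streamer,  # tokens appear incrementally instead of all at once
)
```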
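
If the full-precision weights do not fit in your GPU memory, you can try loading them in half precision. This is a sketch under an assumption not stated in the model card: that the checkpoint is numerically stable in `float16`. Use `torch.bfloat16` or full precision instead if you see degraded output:

```python
# Assumption: float16 works for this checkpoint; adjust the dtype to your hardware.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
```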