---
datasets:
- CreitinGameplays/gemma-r1-test
language:
- en
base_model:
- google/gemma-2-2b-it
pipeline_tag: text-generation
library_name: transformers
---
|
|
|
|
|
Chat template:

```
<start_of_turn>user
{user_prompt}<end_of_turn>
<start_of_turn>model
<think>
```
|
|
|
|
|
Code for testing:

```python
|
|
# test the model |
|
|
import torch |
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer |
|
|
|
|
|
def _build_prompt(user_input):
    """Wrap *user_input* in the Gemma turn markers and open a ``<think>`` block."""
    # NOTE(review): the leading newline is carried over unchanged from the
    # original script; the chat template documented in this README has none.
    # Confirm which form the model was trained on before changing it.
    return (
        "\n"
        "<start_of_turn>user\n"
        f"{user_input}<end_of_turn>\n"
        "<start_of_turn>model\n"
        "<think>\n"
    )


def main():
    """Run an interactive prompt loop that streams the model's replies to stdout.

    Loads the tokenizer and the model (bitsandbytes 4-bit quantization when
    CUDA is available, a plain unquantized load otherwise), then reads prompts
    from stdin until the user types ``exit``/``quit`` or closes the input
    stream (Ctrl-D / Ctrl-C).
    """
    model_id = "CreitinGameplays/gemma-2-2b-it-R1-exp"

    # Load the tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Load the model using bitsandbytes 4-bit quantization if CUDA is available.
    if torch.cuda.is_available():
        # Imported here because it is only needed on the CUDA path. Passing
        # `load_in_4bit=True` directly to `from_pretrained` is deprecated in
        # favour of an explicit quantization config.
        from transformers import BitsAndBytesConfig

        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
            device_map="auto",
        )
    else:
        model = AutoModelForCausalLM.from_pretrained(model_id)

    # Define the generation parameters (shared, never mutated, across turns).
    generation_kwargs = {
        "max_new_tokens": 4096,
        "do_sample": True,
        "temperature": 0.6,
        "top_k": 40,
        "top_p": 0.9,
        "repetition_penalty": 1.1,
        "num_return_sequences": 1,
        "pad_token_id": tokenizer.eos_token_id,
    }

    print("Enter your prompt (type 'exit' to quit):")
    while True:
        # Get user input; a closed stdin or Ctrl-C ends the session instead of
        # surfacing a traceback.
        try:
            user_input = input("Input> ")
        except (EOFError, KeyboardInterrupt):
            print()
            break

        command = user_input.strip().lower()
        if command in ("exit", "quit"):
            break
        if not command:
            # Nothing to ask the model; prompt again.
            continue

        prompt = _build_prompt(user_input)

        # Tokenize via __call__ so an attention mask is produced alongside the
        # ids, and place the tensors on whatever device the model reports
        # rather than a separately tracked one.
        inputs = tokenizer(
            prompt, return_tensors="pt", add_special_tokens=True
        ).to(model.device)

        # Create a new TextStreamer instance for each streamed response.
        streamer = TextStreamer(tokenizer)

        print("\nAssistant Response:")
        # Generate the text (tokens will stream to stdout via the streamer).
        model.generate(**inputs, streamer=streamer, **generation_kwargs)


if __name__ == "__main__":
    main()
|
|
```
|
|
|
|
|
#INeedSomeGPU |