CreitinGameplays committed on
Commit
cd831b6
·
verified ·
1 Parent(s): c54de11

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +119 -0
README.md ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ datasets:
4
+ - CreitinGameplays/Raiden-DeepSeek-R1-llama3.1-v1
5
+ language:
6
+ - en
7
+ base_model:
8
+ - meta-llama/Llama-3.2-3B-Instruct
9
+ pipeline_tag: text-generation
10
+ library_name: transformers
11
+ ---
12
+
13
 + # Llama 3.2 3B R1 Experimental
14
+
15
+ Chat template format:
16
+ ```
17
+ <|start_header_id|>system<|end_header_id|>
18
+ You are focused on providing systematic, well-reasoned responses. Response Structure: - Format: <think>{{reasoning}}</think>{{answer}} - Reasoning: Minimum 6 logical steps only when it required in <think> block - Process: Think first, then answer.
19
+
20
+ You are a helpful AI assistant named Llama, made by Meta AI.
21
+ <|eot_id|><|start_header_id|>user<|end_header_id|>
22
+
23
+ How many r's are in strawberry?<|eot_id|><|start_header_id|>assistant<|end_header_id|><think>
24
+ ```
25
+
26
+ Run this model:
27
+ ```python
28
+ # test the model
29
+ import torch
30
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
31
+
32
+ def main():
33
+ model_id = "CreitinGameplays/Llama-3.2-3B-Instruct-R1-v1"
34
+
35
+ # Load the tokenizer.
36
+ tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)
37
+
38
+ # Load the model using bitsandbytes 8-bit quantization if CUDA is available.
39
+ if torch.cuda.is_available():
40
+ model = AutoModelForCausalLM.from_pretrained(
41
+ model_id,
42
+ load_in_8bit=True,
43
+ device_map="auto"
44
+ )
45
+ device = torch.device("cuda")
46
+ else:
47
+ model = AutoModelForCausalLM.from_pretrained(model_id)
48
+ device = torch.device("cpu")
49
+
50
+ # Define the generation parameters.
51
+ generation_kwargs = {
52
+ "max_new_tokens": 2048,
53
+ "do_sample": True,
54
+ "temperature": 0.5,
55
+ "top_p": 0.9,
56
+ "repetition_penalty": 1.1,
57
+ "num_return_sequences": 1,
58
+ "forced_eos_token_id": tokenizer.eos_token_id,
59
+ "pad_token_id": tokenizer.eos_token_id
60
+ }
61
+
62
+ print("Enter your prompt (type 'exit' to quit):")
63
+ while True:
64
+ # Get user input.
65
+ user_input = input("Input> ")
66
+ if user_input.lower().strip() in ("exit", "quit"):
67
+ break
68
+
69
+ # Construct the prompt in your desired format.
70
+ prompt = f"""
71
+ <|start_header_id|>system<|end_header_id|>
72
+ You are focused on providing systematic, well-reasoned responses. Response Structure: - Format: <think>{{reasoning}}</think>{{answer}} - Reasoning: Minimum 6 logical steps only when it required in <think> block - Process: Think first, then answer.
73
+
74
+ You are a helpful AI assistant named Llama, made by Meta AI.
75
+ <|eot_id|><|start_header_id|>user<|end_header_id|>
76
+
77
+ How many r's are in strawberry?<|eot_id|><|start_header_id|>assistant<|end_header_id|><think>
78
+ """
79
+
80
+ # Tokenize the prompt and send to the selected device.
81
+ input_ids = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=True).to(device)
82
+
83
+ # Create a new TextStreamer instance for streaming responses.
84
+ streamer = TextStreamer(tokenizer)
85
+ generation_kwargs["streamer"] = streamer
86
+
87
+ print("\nAssistant Response:")
88
+ # Generate the text (tokens will stream to stdout via the streamer).
89
+ outputs = model.generate(input_ids, **generation_kwargs)
90
+
91
+ if __name__ == "__main__":
92
+ main()
93
+ ```
94
+
95
+ Or alternatively:
96
+ ```python
97
+ import torch
98
+ from transformers import pipeline
99
+
100
+ model_id = "CreitinGameplays/Llama-3.2-3B-Instruct-R1-v1"
101
+
102
+ pipe = pipeline(
103
+ "text-generation",
104
+ model=model_id,
105
+ torch_dtype=torch.bfloat16,
106
+ device_map="auto"
107
+ )
108
+
109
+ messages = [{"role": "user", "content": "hello there!"}]
110
+
111
+ outputs = pipe(
112
+ messages,
113
+ temperature=0.5,
114
+ repetition_penalty=1.1,
115
+ max_new_tokens=2048
116
+ )
117
+
118
+ print(outputs[0]["generated_text"][-1])
119
+ ```