CreitinGameplays committed on
Commit
159bb3a
·
verified ·
1 Parent(s): d5c77c2

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +83 -1
README.md CHANGED
@@ -8,4 +8,86 @@ base_model:
8
  - meta-llama/Llama-3.1-8B-Instruct
9
  pipeline_tag: text-generation
10
  library_name: transformers
11
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  - meta-llama/Llama-3.1-8B-Instruct
9
  pipeline_tag: text-generation
10
  library_name: transformers
11
+ ---
12
+
13
+ # Llama 3.1 8B R1 Experimental
14
+
15
+ Chat template format:
16
+ ```
17
+ <|start_header_id|>system<|end_header_id|>
18
+
19
+ You are a helpful AI assistant named Llama, made by Meta AI.
20
+ You are focused on providing systematic, well-reasoned responses. Response Structure: - Format: <think>{{reasoning}}</think>{{answer}} - Reasoning: Minimum 6 logical steps only when it required in <think> block - Process: Think first, then answer.<|eot_id|><|start_header_id|>user<|end_header_id|>
21
+
22
+ How many r's are in strawberry?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
23
+ <think>
24
+ ```
25
+
26
+ Run this model:
27
+ ```python
28
# Example inference script for CreitinGameplays/Llama-3.1-8B-R1-experimental.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer

# System prompt expected by the model's chat template.  Kept as a plain
# (non-f) string so the {reasoning}/{answer} braces stay literal, exactly as
# the original f-string's escaped {{...}} rendered them at runtime.
SYSTEM_PROMPT = (
    "You are a helpful AI assistant named Llama, made by Meta AI.\n"
    "You are focused on providing systematic, well-reasoned responses. "
    "Response Structure: - Format: <think>{reasoning}</think>{answer} "
    "- Reasoning: Minimum 6 logical steps only when it required in <think> "
    "block - Process: Think first, then answer."
)


def _build_prompt(user_input: str) -> str:
    """Wrap *user_input* in the Llama 3.1 header/eot chat format.

    Built by explicit concatenation rather than an indented triple-quoted
    f-string, so no leading whitespace from the source indentation leaks
    into the template.  Ends with an open ``<think>`` tag so the model
    begins its answer with the reasoning block.
    """
    return (
        "<|start_header_id|>system<|end_header_id|>\n\n"
        f"{SYSTEM_PROMPT}<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_input}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n"
        "<think>\n"
    )


def main():
    """Interactive REPL: read a prompt from stdin, stream the model's reply.

    Loads the model 8-bit on GPU when available (full precision on CPU),
    then loops until the user types 'exit'/'quit' or closes stdin.
    """
    model_id = "CreitinGameplays/Llama-3.1-8B-R1-experimental"

    # NOTE(review): the original passed add_eos_token=True here.  On
    # tokenizers that honor it, that appends <|end_of_text|> to every
    # prompt and sabotages generation; on the Llama-3 fast tokenizer it is
    # ignored.  Dropped in both cases.
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # 8-bit (bitsandbytes) quantization when CUDA is available.
    # NOTE(review): load_in_8bit= is deprecated in newer transformers
    # releases in favor of quantization_config=BitsAndBytesConfig(...);
    # kept here for compatibility with the original example.
    if torch.cuda.is_available():
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            load_in_8bit=True,
            device_map="auto",
        )
        device = torch.device("cuda")
    else:
        model = AutoModelForCausalLM.from_pretrained(model_id)
        device = torch.device("cpu")

    generation_kwargs = {
        "max_new_tokens": 2048,
        "do_sample": True,
        "temperature": 0.6,
        "top_p": 1.0,
        "repetition_penalty": 1.08,
        "num_return_sequences": 1,
        "forced_eos_token_id": tokenizer.eos_token_id,
        # Llama has no dedicated pad token; reuse EOS.
        "pad_token_id": tokenizer.eos_token_id,
    }

    print("Enter your prompt (type 'exit' to quit):")
    while True:
        # Exit cleanly on Ctrl-C / Ctrl-D instead of crashing mid-loop.
        try:
            user_input = input("Input> ")
        except (EOFError, KeyboardInterrupt):
            break
        if user_input.lower().strip() in ("exit", "quit"):
            break
        if not user_input.strip():
            continue  # skip empty lines rather than prompting the model

        prompt = _build_prompt(user_input)
        # add_special_tokens=True prepends BOS; the template itself carries
        # no <|begin_of_text|>, so this is the only place it is added.
        input_ids = tokenizer.encode(
            prompt, return_tensors="pt", add_special_tokens=True
        ).to(device)

        # Fresh streamer per turn so each response prints from a clean state.
        generation_kwargs["streamer"] = TextStreamer(tokenizer)

        print("\nAssistant Response:")
        # Tokens stream to stdout via the streamer; the return value is
        # not needed, so it is deliberately discarded.
        model.generate(input_ids, **generation_kwargs)


if __name__ == "__main__":
    main()
93
+ ```