---
license: mit
datasets:
- CreitinGameplays/DeepSeek-R1-Distill-Qwen-32B_NUMINA_train_amc_aime-llama3.1
language:
- en
base_model:
- meta-llama/Llama-3.1-8B-Instruct
pipeline_tag: text-generation
library_name: transformers
---

# Llama 3.1 8B R1 Experimental

Chat template format:
```
<|start_header_id|>system<|end_header_id|>

You are a helpful AI assistant named Llama, made by Meta AI.
You are focused on providing systematic, well-reasoned responses. Response Structure: - Format: <think>{{reasoning}}</think>{{answer}} - Reasoning: Minimum 6 logical steps only when it required in <think> block - Process: Think first, then answer.<|eot_id|><|start_header_id|>user<|end_header_id|>

How many r's are in strawberry?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<think>
```
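
If the tokenizer shipped with this repository includes a matching chat template in `tokenizer_config.json`, you can build the same prompt with `apply_chat_template` instead of assembling the special tokens by hand. The sketch below assumes such a template is present; verify the rendered prompt matches the format above before relying on it.

```python
# Minimal sketch: build the prompt via the tokenizer's chat template.
# Assumes the repository ships a chat template matching the format shown above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("CreitinGameplays/Llama-3.1-8B-R1-experimental")

messages = [
    {
        "role": "system",
        "content": (
            "You are a helpful AI assistant named Llama, made by Meta AI.\n"
            "You are focused on providing systematic, well-reasoned responses. "
            "Response Structure: - Format: <think>{reasoning}</think>{answer} "
            "- Reasoning: Minimum 6 logical steps only when it required in <think> block "
            "- Process: Think first, then answer."
        ),
    },
    {"role": "user", "content": "How many r's are in strawberry?"},
]

# add_generation_prompt appends the assistant header so generation starts
# where the <think> block is expected.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```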

Run this model:
```python
# Test the model interactively from the command line.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer

def main():
    model_id = "CreitinGameplays/Llama-3.1-8B-R1-experimental"

    # Load the tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

    # Load the model with bitsandbytes 8-bit quantization if CUDA is available.
    if torch.cuda.is_available():
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            load_in_8bit=True,
            device_map="auto"
        )
        device = torch.device("cuda")
    else:
        model = AutoModelForCausalLM.from_pretrained(model_id)
        device = torch.device("cpu")

    # Define the generation parameters.
    generation_kwargs = {
        "max_new_tokens": 2048,
        "do_sample": True,
        "temperature": 0.6,
        "top_p": 1.0,
        "repetition_penalty": 1.08,
        "num_return_sequences": 1,
        "forced_eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.eos_token_id
    }

    print("Enter your prompt (type 'exit' to quit):")
    while True:
        # Get user input.
        user_input = input("Input> ")
        if user_input.lower().strip() in ("exit", "quit"):
            break

        # Construct the prompt in the model's expected chat format.
        prompt = f"""
<|start_header_id|>system<|end_header_id|>

You are a helpful AI assistant named Llama, made by Meta AI.
You are focused on providing systematic, well-reasoned responses. Response Structure: - Format: <think>{{reasoning}}</think>{{answer}} - Reasoning: Minimum 6 logical steps only when it required in <think> block - Process: Think first, then answer.<|eot_id|><|start_header_id|>user<|end_header_id|>

{user_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<think>
"""

        # Tokenize the prompt and move it to the selected device.
        input_ids = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=True).to(device)

        # Create a new TextStreamer instance so tokens stream to stdout as they are generated.
        streamer = TextStreamer(tokenizer)
        generation_kwargs["streamer"] = streamer

        print("\nAssistant Response:")
        # Generate the text (tokens stream to stdout via the streamer).
        outputs = model.generate(input_ids, **generation_kwargs)

if __name__ == "__main__":
    main()
```
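
Note: recent `transformers` releases deprecate passing `load_in_8bit=True` directly to `from_pretrained` in favour of a `BitsAndBytesConfig`. A minimal sketch of the equivalent 8-bit load under that assumption (it still requires the `bitsandbytes` package and a CUDA GPU):

```python
# Equivalent 8-bit load using BitsAndBytesConfig (newer transformers API).
# Assumes bitsandbytes is installed and a CUDA device is available.
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model_id = "CreitinGameplays/Llama-3.1-8B-R1-experimental"
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)
```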

Or, alternatively, use the high-level `pipeline` API:
```python
import torch
from transformers import pipeline

model_id = "CreitinGameplays/Llama-3.1-8B-R1-experimental"

# Load the model through the text-generation pipeline.
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

messages = [{"role": "user", "content": "hello!"}]

outputs = pipe(
    messages,
    temperature=0.6,
    repetition_penalty=1.08,
    max_new_tokens=2048
)

# The pipeline returns the full chat; the last entry is the assistant's reply.
print(outputs[0]["generated_text"][-1])
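
Because the model wraps its reasoning in `<think>...</think>` before the final answer, you may want to drop that block when you only need the answer. A minimal sketch using Python's standard `re` module, assuming the chat-format pipeline output shown above (the `strip_reasoning` helper is illustrative, not part of this repository):

```python
import re

def strip_reasoning(text: str) -> str:
    """Remove the <think>...</think> reasoning block and return only the final answer."""
    # re.DOTALL lets '.' match newlines inside the reasoning block.
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

# With chat-format input, the last generated message is the assistant's reply dict.
reply = outputs[0]["generated_text"][-1]["content"]
print(strip_reasoning(reply))
```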