Upload 3 files

Files changed:
- combined_tokenized_data.txt +0 -0
- requirements.txt +4 -0
- train_phi3.py +12 -0
combined_tokenized_data.txt
ADDED
The diff for this file is too large to render. See raw diff.
requirements.txt
ADDED
@@ -0,0 +1,4 @@
+transformers==4.28.1
+datasets==2.12.0
+accelerate==0.21.0
+torch==2.0.1
train_phi3.py
ADDED
@@ -0,0 +1,12 @@
+from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
+from datasets import load_dataset
+
+# Load model and tokenizer
+model_name = "microsoft/Phi-3-mini-128k-instruct"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name)
+
+# Load dataset
+dataset = load_dataset("text", data_files="combined_tokenized_data.txt")["train"]
+
+# ... (rest of your code for tokenization, data collator, training arguments, etc.)
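
The committed script stops at a placeholder comment, so the tokenization, data collator, and training setup are not part of this commit. Below is a minimal sketch of what those elided steps might look like for a standard causal-LM fine-tune using DataCollatorForLanguageModeling; every hyperparameter, the max_length, and the output_dir are illustrative placeholders, and loading Phi-3 likely needs a newer transformers release than the 4.28.1 pinned above (or trust_remote_code=True), which the commit's requirements do not reflect.

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset

model_name = "microsoft/Phi-3-mini-128k-instruct"
# Assumption: trust_remote_code=True (and a newer transformers than the
# pinned 4.28.1) is generally needed to load Phi-3; not part of the commit.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

dataset = load_dataset("text", data_files="combined_tokenized_data.txt")["train"]

# Tokenize each text line; truncation length is an illustrative choice.
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, max_length=1024)

tokenized = dataset.map(tokenize, batched=True, remove_columns=["text"])

# Causal-LM collator (mlm=False) pads batches and derives labels from input_ids.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Placeholder hyperparameters, not values taken from the commit.
args = TrainingArguments(
    output_dir="phi3-finetuned",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    learning_rate=2e-5,
    logging_steps=10,
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
    data_collator=collator,
)
trainer.train()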