GenaroCoronel committed on
Commit
f47cc38
verified
1 Parent(s): d03512b

Create train.py

Browse files
Files changed (1) hide show
  1. train.py +37 -0
train.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Fine-tune a causal language model on a Hugging Face dataset with ``Trainer``.

The script loads a text dataset, tokenizes its ``text`` column, trains the
model for a few epochs, and writes the trained model and tokenizer to disk.
"""

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Upper bound on tokens per example. An explicit limit is required because
# truncation without ``max_length`` falls back to the tokenizer's configured
# maximum, which may be unset/huge for some checkpoints.
# NOTE(review): 512 is a conservative default — tune for your data and GPU memory.
MAX_LENGTH = 512

# Load the dataset from the Hugging Face Hub.
dataset = load_dataset("tu_usuario/mi_dataset", split="train")

# Load the model and the tokenizer.
# NOTE(review): "mistral-7b" must resolve to a local directory or a valid Hub
# repo id (Hub ids are normally "org/name", e.g. "mistralai/Mistral-7B-v0.1")
# — confirm before running.
tokenizer = AutoTokenizer.from_pretrained("mistral-7b")
model = AutoModelForCausalLM.from_pretrained("mistral-7b")

# Batching requires a pad token; some causal-LM tokenizers ship without one.
# Reusing EOS is the usual workaround. The collator below masks pad positions
# out of the loss, so padding does not contribute to training.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize a batch of examples.

    Args:
        examples: A batch mapping produced by ``Dataset.map(batched=True)``;
            its ``'text'`` entry holds the raw strings to tokenize.

    Returns:
        The tokenizer output for the batch, truncated to ``MAX_LENGTH``
        tokens. No padding is applied here: the data collator pads each
        batch dynamically to its longest sequence.
    """
    return tokenizer(examples['text'], truncation=True, max_length=MAX_LENGTH)


# Preprocess the data (tokenization). The original columns are dropped so that
# only tokenizer outputs reach the collator.
dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

# With ``mlm=False`` the collator pads each batch and copies ``input_ids``
# into ``labels`` (pad positions set to -100). Without labels the model
# returns no loss and ``Trainer.train()`` fails.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training configuration.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Train the model.
trainer.train()

# Save the trained model and tokenizer to a local directory.
model.save_pretrained("tu_usuario/mi_modelo_entrenado")
tokenizer.save_pretrained("tu_usuario/mi_modelo_entrenado")