Andro0s committed
Commit 4f41a14 (verified)
1 Parent(s): 37edf61

Update app.py

Files changed (1)
  1. app.py +46 -81
app.py CHANGED
@@ -1,104 +1,69 @@
  import os
  import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
+ from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorWithPadding
  from datasets import load_dataset
- from peft import LoraConfig, get_peft_model

- # -------------------------------
- # Configuration
- # -------------------------------
- MODEL_NAME = "codellama/CodeLlama-7b-hf"  # base model
- LORA_DIR = "lora_codellama"  # folder where the LoRA adapter is saved
- DATASET_PATH = "tu_dataset.json"  # your local dataset (JSON)
-
- # Create the folder if it does not exist
- os.makedirs(LORA_DIR, exist_ok=True)
-
- # -------------------------------
- # Load model and tokenizer
- # -------------------------------
- print("Loading base model...")
+ # ==============================
+ # Model configuration
+ # ==============================
+ MODEL_NAME = "bigcode/starcoder"
+ OUTPUT_DIR = "./results"
+
+ # Load tokenizer and model
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
- model = AutoModelForCausalLM.from_pretrained(
-     MODEL_NAME,
-     device_map="auto",
-     torch_dtype=torch.float16
- )
-
- # -------------------------------
- # Configure LoRA
- # -------------------------------
- lora_config = LoraConfig(
-     r=16,
-     lora_alpha=32,
-     target_modules=["q_proj", "v_proj"],
-     lora_dropout=0.05,
-     bias="none",
-     task_type="CAUSAL_LM"
- )
-
- model = get_peft_model(model, lora_config)
-
- # -------------------------------
- # Load dataset
- # -------------------------------
- dataset = load_dataset("json", data_files=DATASET_PATH)
- dataset = dataset["train"]
-
- print("Dataset columns:", dataset.column_names)
-
- # -------------------------------
- # Tokenization function
- # -------------------------------
+
+ # Fix the padding token
+ if tokenizer.pad_token is None:
+     tokenizer.pad_token = tokenizer.eos_token  # use EOS as padding
+ # If you would rather add a new PAD token:
+ # tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+
+ model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+
+ # If you added a new token, resize the embeddings
+ # model.resize_token_embeddings(len(tokenizer))
+
+ # ==============================
+ # Prepare the dataset
+ # ==============================
+ # Example with wikitext (replace with your own dataset)
+ dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:5%]")  # small sample
+
  def tokenize_function(examples):
-     # Detect the columns automatically
-     columns = dataset.column_names
-     if "prompt" in columns and "completion" in columns:
-         texts = [p + "\n" + c for p, c in zip(examples["prompt"], examples["completion"])]
-     elif "text" in columns:
-         texts = examples["text"]
-     else:
-         # If the expected columns are not found, raise an informative error
-         raise ValueError(f"Invalid dataset columns: {columns}")
-
-     return tokenizer(texts, truncation=True, max_length=512)
+     return tokenizer(examples["text"], truncation=True)

- tokenized_datasets = dataset.map(tokenize_function, batched=True)
+ tokenized_dataset = dataset.map(tokenize_function, batched=True)

- data_collator = DataCollatorForLanguageModeling(
-     tokenizer=tokenizer,
-     mlm=False
- )
-
- # -------------------------------
- # Training
- # -------------------------------
+ # ==============================
+ # DataCollator configuration
+ # ==============================
+ data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)
+
+ # ==============================
+ # Trainer configuration
+ # ==============================
  training_args = TrainingArguments(
-     output_dir=LORA_DIR,
-     num_train_epochs=1,  # adjust to your time budget and GPU
-     per_device_train_batch_size=1,
-     save_steps=500,
-     save_total_limit=1,
-     logging_steps=50,
-     learning_rate=2e-4,
-     fp16=True,
-     gradient_accumulation_steps=4,
-     push_to_hub=False
+     output_dir=OUTPUT_DIR,
+     evaluation_strategy="steps",
+     per_device_train_batch_size=2,
+     per_device_eval_batch_size=2,
+     num_train_epochs=1,
+     save_steps=10,
+     save_total_limit=2,
+     logging_steps=5,
+     report_to="none",
  )

  trainer = Trainer(
      model=model,
      args=training_args,
-     train_dataset=tokenized_datasets,
-     data_collator=data_collator
+     train_dataset=tokenized_dataset,
+     eval_dataset=tokenized_dataset,
+     tokenizer=tokenizer,
+     data_collator=data_collator,
  )

- print("Starting LoRA training...")
- trainer.train()
-
- # -------------------------------
- # Save the LoRA adapter
- # -------------------------------
- print("Saving the LoRA adapter to:", LORA_DIR)
- model.save_pretrained(LORA_DIR)
- print("Training complete! The LoRA adapter is ready for production.")
+ # ==============================
+ # Start training
+ # ==============================
+ trainer.train()
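
One note on the new collator choice: DataCollatorWithPadding pads input_ids and attention_mask but does not add a labels field, whereas the DataCollatorForLanguageModeling(mlm=False) that this commit removes also copies the padded input_ids into labels (with pad positions set to -100), which is what Trainer needs to compute a causal-LM loss. If trainer.train() fails for lack of a loss, a minimal sketch of the removed wiring, reusing the tokenizer defined above:

from transformers import DataCollatorForLanguageModeling

# Pads each batch and sets labels from input_ids (pad positions become -100),
# so the Trainer can compute the causal-LM loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)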
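
The new revision also stops at trainer.train(), while the removed one persisted its result. If the final weights should outlive the periodic save_steps checkpoints, a minimal follow-up using the standard Trainer API and the OUTPUT_DIR defined above:

# Write the final model and config to OUTPUT_DIR, plus the tokenizer,
# so the run can be reloaded later with from_pretrained(OUTPUT_DIR).
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)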
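
For reference on the removed flow: its tokenize_function accepted either a text column or a prompt/completion pair joined with a newline, so the local file behind DATASET_PATH would hold records shaped roughly like the following. The records here are illustrative only, not data from this repo:

import json

# Illustrative records: tokenize_function joins prompt and completion with "\n".
examples = [
    {"prompt": "# Reverse a string", "completion": "def reverse(s):\n    return s[::-1]"},
    {"prompt": "# Sum a list", "completion": "def total(xs):\n    return sum(xs)"},
]
with open("tu_dataset.json", "w") as f:
    json.dump(examples, f)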
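
Finally, the LoRA adapter the removed script wrote with model.save_pretrained(LORA_DIR) would be loaded back by attaching it to the base model. A minimal sketch with the PEFT API, using the names from the removed revision:

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "codellama/CodeLlama-7b-hf"  # base model from the removed script
LORA_DIR = "lora_codellama"               # adapter folder it saved

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, device_map="auto", torch_dtype=torch.float16
)
# Attach the saved LoRA weights on top of the frozen base model.
model = PeftModel.from_pretrained(base_model, LORA_DIR)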