Update app.py
Browse files
app.py
CHANGED
|
@@ -77,7 +77,7 @@ def run_training(hf_token, model_name, new_repo_name, lora_r, lora_alpha, lora_d
|
|
| 77 |
def load_all_datasets():
|
| 78 |
streams = []
|
| 79 |
tasks = []
|
| 80 |
-
progress(0.1, desc="Analizando configuraciones
|
| 81 |
|
| 82 |
for ds_name in dataset_list:
|
| 83 |
try:
|
|
@@ -91,7 +91,7 @@ def run_training(hf_token, model_name, new_repo_name, lora_r, lora_alpha, lora_d
|
|
| 91 |
for c in configs:
|
| 92 |
tasks.append((ds_name, c))
|
| 93 |
|
| 94 |
-
progress(0.2, desc=f"Cargando {len(tasks)} fuentes
|
| 95 |
with ThreadPoolExecutor(max_workers=num_workers) as executor:
|
| 96 |
future_to_task = {executor.submit(load_single, d, c): (d, c) for d, c in tasks}
|
| 97 |
for future in as_completed(future_to_task):
|
|
@@ -105,14 +105,17 @@ def run_training(hf_token, model_name, new_repo_name, lora_r, lora_alpha, lora_d
|
|
| 105 |
|
| 106 |
loaded_streams = load_all_datasets()
|
| 107 |
if not loaded_streams:
|
| 108 |
-
return "Error
|
| 109 |
|
| 110 |
def all_samples():
|
| 111 |
return chain.from_iterable(loaded_streams)
|
| 112 |
|
| 113 |
progress(0.3, desc="Cargando Tokenizer...")
|
| 114 |
-
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
def create_text_lines(sample):
|
| 118 |
if isinstance(sample, dict):
|
|
@@ -157,8 +160,11 @@ def run_training(hf_token, model_name, new_repo_name, lora_r, lora_alpha, lora_d
|
|
| 157 |
except:
|
| 158 |
pass
|
| 159 |
|
| 160 |
-
progress(0.4, desc="Cargando Modelo
|
| 161 |
-
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
peft_config = LoraConfig(
|
| 164 |
r=int(lora_r),
|
|
@@ -202,22 +208,22 @@ def run_training(hf_token, model_name, new_repo_name, lora_r, lora_alpha, lora_d
|
|
| 202 |
progress(0.5, desc="Entrenando...")
|
| 203 |
trainer.train()
|
| 204 |
|
| 205 |
-
progress(0.8, desc="Guardando
|
| 206 |
trainer.save_model(output_dir)
|
| 207 |
|
| 208 |
-
progress(0.9, desc="Fusionando
|
| 209 |
ft = PeftModel.from_pretrained(original_model, output_dir, torch_dtype=torch.float32, is_trainable=False, device_map={"": device}).merge_and_unload()
|
| 210 |
|
| 211 |
final_path = "/content/merged_model"
|
| 212 |
ft.save_pretrained(final_path, safe_serialization=True)
|
| 213 |
tokenizer.save_pretrained(final_path)
|
| 214 |
|
| 215 |
-
progress(0.95, desc="Subiendo
|
| 216 |
full_repo = f"{username}/{new_repo_name}"
|
| 217 |
create_repo(full_repo, token=hf_token, exist_ok=True)
|
| 218 |
upload_folder(folder_path=final_path, repo_id=full_repo, token=hf_token)
|
| 219 |
|
| 220 |
-
return f"
|
| 221 |
|
| 222 |
custom_css = """
|
| 223 |
body {background-color: #0b0f19; color: #e0e6ed;}
|
|
@@ -240,8 +246,8 @@ with gr.Blocks(css=custom_css, title="Entrenador LLM Ultimate") as demo:
|
|
| 240 |
|
| 241 |
with gr.Row():
|
| 242 |
with gr.Column(scale=1):
|
| 243 |
-
hf_token_input = gr.Textbox(label="HuggingFace Token
|
| 244 |
-
model_input = gr.Textbox(label="Modelo Base", value="
|
| 245 |
repo_input = gr.Textbox(label="Nombre Nuevo Repo", value="multi-dataset-model-v1")
|
| 246 |
|
| 247 |
with gr.Column(scale=1):
|
|
@@ -258,7 +264,7 @@ with gr.Blocks(css=custom_css, title="Entrenador LLM Ultimate") as demo:
|
|
| 258 |
|
| 259 |
datasets_input = gr.Textbox(label="Fuentes de Datos (Datasets)", value="", placeholder="Pega aquí tus datasets separados por coma o salto de línea.\nEjemplo:\nSalesforce/fineweb_deduplicated\nbigcode/the-stack, v2", lines=12, elem_classes="input-box")
|
| 260 |
|
| 261 |
-
train_btn = gr.Button("🚀 INICIAR ENTRENAMIENTO
|
| 262 |
status_output = gr.Textbox(label="Log del Sistema", interactive=False, lines=3)
|
| 263 |
|
| 264 |
train_btn.click(
|
|
|
|
| 77 |
def load_all_datasets():
|
| 78 |
streams = []
|
| 79 |
tasks = []
|
| 80 |
+
progress(0.1, desc="Analizando configuraciones...")
|
| 81 |
|
| 82 |
for ds_name in dataset_list:
|
| 83 |
try:
|
|
|
|
| 91 |
for c in configs:
|
| 92 |
tasks.append((ds_name, c))
|
| 93 |
|
| 94 |
+
progress(0.2, desc=f"Cargando {len(tasks)} fuentes...")
|
| 95 |
with ThreadPoolExecutor(max_workers=num_workers) as executor:
|
| 96 |
future_to_task = {executor.submit(load_single, d, c): (d, c) for d, c in tasks}
|
| 97 |
for future in as_completed(future_to_task):
|
|
|
|
| 105 |
|
| 106 |
loaded_streams = load_all_datasets()
|
| 107 |
if not loaded_streams:
|
| 108 |
+
return "Error: No se pudo cargar ningún dataset válido."
|
| 109 |
|
| 110 |
def all_samples():
|
| 111 |
return chain.from_iterable(loaded_streams)
|
| 112 |
|
| 113 |
progress(0.3, desc="Cargando Tokenizer...")
|
| 114 |
+
try:
|
| 115 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side="left", add_eos_token=True, add_bos_token=True)
|
| 116 |
+
tokenizer.pad_token = tokenizer.eos_token
|
| 117 |
+
except Exception as e:
|
| 118 |
+
return f"Error cargando tokenizer: {str(e)}"
|
| 119 |
|
| 120 |
def create_text_lines(sample):
|
| 121 |
if isinstance(sample, dict):
|
|
|
|
| 160 |
except:
|
| 161 |
pass
|
| 162 |
|
| 163 |
+
progress(0.4, desc="Cargando Modelo...")
|
| 164 |
+
try:
|
| 165 |
+
original_model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).to(device)
|
| 166 |
+
except Exception as e:
|
| 167 |
+
return f"Error cargando modelo: {str(e)}"
|
| 168 |
|
| 169 |
peft_config = LoraConfig(
|
| 170 |
r=int(lora_r),
|
|
|
|
| 208 |
progress(0.5, desc="Entrenando...")
|
| 209 |
trainer.train()
|
| 210 |
|
| 211 |
+
progress(0.8, desc="Guardando...")
|
| 212 |
trainer.save_model(output_dir)
|
| 213 |
|
| 214 |
+
progress(0.9, desc="Fusionando...")
|
| 215 |
ft = PeftModel.from_pretrained(original_model, output_dir, torch_dtype=torch.float32, is_trainable=False, device_map={"": device}).merge_and_unload()
|
| 216 |
|
| 217 |
final_path = "/content/merged_model"
|
| 218 |
ft.save_pretrained(final_path, safe_serialization=True)
|
| 219 |
tokenizer.save_pretrained(final_path)
|
| 220 |
|
| 221 |
+
progress(0.95, desc="Subiendo...")
|
| 222 |
full_repo = f"{username}/{new_repo_name}"
|
| 223 |
create_repo(full_repo, token=hf_token, exist_ok=True)
|
| 224 |
upload_folder(folder_path=final_path, repo_id=full_repo, token=hf_token)
|
| 225 |
|
| 226 |
+
return f"Completado: https://huggingface.co/{full_repo}"
|
| 227 |
|
| 228 |
custom_css = """
|
| 229 |
body {background-color: #0b0f19; color: #e0e6ed;}
|
|
|
|
| 246 |
|
| 247 |
with gr.Row():
|
| 248 |
with gr.Column(scale=1):
|
| 249 |
+
hf_token_input = gr.Textbox(label="HuggingFace Token", type="password", placeholder="hf_...", value=os.getenv("HF_TOKEN", ""))
|
| 250 |
+
model_input = gr.Textbox(label="Modelo Base", value="", placeholder="Ej: Qwen/Qwen2.5-0.5B")
|
| 251 |
repo_input = gr.Textbox(label="Nombre Nuevo Repo", value="multi-dataset-model-v1")
|
| 252 |
|
| 253 |
with gr.Column(scale=1):
|
|
|
|
| 264 |
|
| 265 |
datasets_input = gr.Textbox(label="Fuentes de Datos (Datasets)", value="", placeholder="Pega aquí tus datasets separados por coma o salto de línea.\nEjemplo:\nSalesforce/fineweb_deduplicated\nbigcode/the-stack, v2", lines=12, elem_classes="input-box")
|
| 266 |
|
| 267 |
+
train_btn = gr.Button("🚀 INICIAR ENTRENAMIENTO", elem_classes="primary-btn")
|
| 268 |
status_output = gr.Textbox(label="Log del Sistema", interactive=False, lines=3)
|
| 269 |
|
| 270 |
train_btn.click(
|