Ignaciohhhhggfgjfrffd commited on
Commit
231ae13
·
verified ·
1 Parent(s): c4e90bf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +194 -115
app.py CHANGED
@@ -16,6 +16,9 @@ import re
16
  import ast
17
  from itertools import islice
18
  from pathlib import Path
 
 
 
19
  import torch
20
  import torch.nn.functional as F
21
  from torch.utils.data import DataLoader
@@ -29,15 +32,14 @@ from langdetect import detect_langs
29
  import textstat
30
  from datasketch import MinHash, MinHashLSH
31
  import gradio as gr
32
- import spaces
33
- from datasets import load_dataset, IterableDataset, Dataset, DatasetDict
34
  from huggingface_hub import login, whoami, create_repo, upload_folder, HfApi
35
  from transformers import (
36
  AutoModelForCausalLM, AutoTokenizer, AutoConfig, TrainingArguments, Trainer,
37
  AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer,
38
- AutoModelForImageClassification,
39
  AutoImageProcessor, AutoModelForAudioClassification, AutoFeatureExtractor, AutoModelForTokenClassification,
40
- DataCollatorForTokenClassification, AutoModelForQuestionAnswering,
41
  AutoProcessor, DataCollatorWithPadding, pipeline, CLIPTextModel, CLIPTokenizer,
42
  DataCollatorForSeq2Seq, AutoModelForSequenceClassification, BitsAndBytesConfig,
43
  LlamaConfig, LlamaForCausalLM, MistralConfig, MistralForCausalLM, GemmaConfig, GemmaForCausalLM, GPT2Config, GPT2LMHeadModel,
@@ -54,7 +56,6 @@ from diffusers import (
54
  )
55
  import evaluate as hf_evaluate
56
  from jinja2 import Template
57
- from collections import defaultdict
58
 
59
  logger = logging.getLogger(__name__)
60
 
@@ -94,7 +95,8 @@ TASK_TO_PIPELINE_MAP = {
94
  "DreamBooth LoRA (Text-to-Image)": "text-to-image",
95
  }
96
 
97
- MODEL_CARD_TEMPLATE = """---
 
98
  language: es
99
  license: apache-2.0
100
  tags:
@@ -132,7 +134,8 @@ Este modelo es una versión afinada de [{base_model}](https://huggingface.co/{ba
132
  - Gradio
133
  """
134
 
135
- DATASET_CARD_TEMPLATE = """---
 
136
  license: mit
137
  ---
138
 
@@ -147,6 +150,7 @@ Este dataset fue creado utilizando la herramienta [AutoTrain-Advanced](https://h
147
  - **Fecha de Creación:** {date}
148
  """
149
 
 
150
  class DebiasingSFTTrainer(SFTTrainer):
151
  def __init__(self, *args, reweighting_terms=None, reweighting_factor=1.0, **kwargs):
152
  super().__init__(*args, **kwargs)
@@ -164,6 +168,54 @@ class DebiasingSFTTrainer(SFTTrainer):
164
  break
165
  return (loss, outputs) if return_outputs else loss
166
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  def hf_login(token):
168
  if not token:
169
  return "Por favor, introduce un token."
@@ -174,6 +226,7 @@ def hf_login(token):
174
  except Exception as e:
175
  return f"❌ Error en la conexión: {e}"
176
 
 
177
  def _clean_text(example, text_col, **kwargs):
178
  text = example.get(text_col, "")
179
  if not isinstance(text, str):
@@ -191,6 +244,7 @@ def _clean_text(example, text_col, **kwargs):
191
  example[text_col] = text
192
  return example
193
 
 
194
  def _apply_quality_filters(example, text_col, min_len, max_len, rep_threshold, exclude_keywords):
195
  text = example.get(text_col, "")
196
  if not isinstance(text, str): return False
@@ -204,6 +258,7 @@ def _apply_quality_filters(example, text_col, min_len, max_len, rep_threshold, e
204
  lower_text = text.lower()
205
  return not any(keyword in lower_text for keyword in exclude_keywords)
206
 
 
207
  def _get_filter_functions(**kwargs):
208
  filters = []
209
  if kwargs.get('enable_quality_filter'):
@@ -252,6 +307,7 @@ def _get_filter_functions(**kwargs):
252
  filters.append(stats_filter)
253
  return filters
254
 
 
255
  def _load_hf_streaming(ids, split="train", probabilities=None):
256
  streams = []
257
  valid_ids = []
@@ -279,10 +335,9 @@ def _load_hf_streaming(ids, split="train", probabilities=None):
279
  if probabilities and len(probabilities) != len(streams):
280
  logger.warning(f"Number of probabilities ({len(probabilities)}) does not match number of valid datasets ({len(streams)}). Ignoring weights.")
281
  probabilities = None
282
-
283
- from datasets import interleave_datasets
284
  return interleave_datasets(streams, probabilities=probabilities)
285
 
 
286
  def _load_uploaded_stream(files):
287
  all_rows = []
288
  for f in files or []:
@@ -304,6 +359,7 @@ def _load_uploaded_stream(files):
304
  random.shuffle(all_rows)
305
  return {"train": all_rows[:-val_size] if val_size > 0 else all_rows, "validation": all_rows[-val_size:] if val_size > 0 else []}
306
 
 
307
  def _guess_columns(sample):
308
  text_col, image_col, audio_col, label_col = "text", "image", "audio", "label"
309
  if not isinstance(sample, dict):
@@ -320,6 +376,7 @@ def _guess_columns(sample):
320
  elif "labels" in keys: label_col = keys["labels"]
321
  return text_col, image_col, audio_col, label_col
322
 
 
323
  def _apply_cda(dataset, text_col, cda_config_str):
324
  try:
325
  swap_groups = json.loads(cda_config_str)
@@ -352,6 +409,7 @@ def _apply_cda(dataset, text_col, cda_config_str):
352
  current_texts.update(next_texts)
353
  return IterableDataset.from_generator(cda_generator)
354
 
 
355
  def _apply_back_translation(dataset, text_col, ratio, model_id, reverse_model_id):
356
  if not ratio or ratio <= 0:
357
  return dataset
@@ -379,6 +437,7 @@ def _apply_back_translation(dataset, text_col, ratio, model_id, reverse_model_id
379
  logger.warning(f"Error en retrotraducción: {e}")
380
  return IterableDataset.from_generator(bt_generator)
381
 
 
382
  def _generate_synthetic_data(original_dataset, text_col, model_id, num_samples, prompt_template):
383
  if not num_samples or num_samples <= 0:
384
  return None
@@ -411,6 +470,7 @@ def _generate_synthetic_data(original_dataset, text_col, model_id, num_samples,
411
  continue
412
  return IterableDataset.from_generator(synthetic_generator)
413
 
 
414
  def _calculate_auto_config(block_size, is_gpt2_like, steps_per_epoch_estimate, batch_size, gradient_accumulation):
415
  safe_steps = int(steps_per_epoch_estimate or 10000)
416
  safe_batch_size = int(batch_size or 1)
@@ -429,6 +489,7 @@ def _calculate_auto_config(block_size, is_gpt2_like, steps_per_epoch_estimate, b
429
  kv_heads = heads if is_gpt2_like else (max(1, heads // 4))
430
  return vocab_size, hidden_size, hidden_size * 2, layers, heads, safe_block_size, False, kv_heads
431
 
 
432
  def _get_eval_dataset(train_ds_id, eval_ds_id, uploaded_val_data, update_logs_fn):
433
  if eval_ds_id:
434
  yield update_logs_fn(f"Cargando dataset de evaluación: {eval_ds_id}", "Evaluación")
@@ -450,6 +511,7 @@ def _get_eval_dataset(train_ds_id, eval_ds_id, uploaded_val_data, update_logs_fn
450
  yield update_logs_fn("No se proporcionó dataset de evaluación. Omitiendo.", "Evaluación")
451
  return None
452
 
 
453
  def _create_training_args(output_dir, repo_id, **kwargs):
454
  neftune_alpha = float(kwargs.get('neftune_noise_alpha', 0.0))
455
  optim_args_dict = {}
@@ -468,11 +530,12 @@ def _create_training_args(output_dir, repo_id, **kwargs):
468
  "save_strategy": "steps",
469
  "logging_steps": int(kwargs.get('logging_steps', 10)),
470
  "save_steps": int(kwargs.get('save_steps', 50)),
 
471
  "eval_steps": int(kwargs.get('save_steps', 50)) if kwargs.get('run_evaluation', False) else None,
472
  "learning_rate": float(kwargs.get('learning_rate', 2e-5)),
473
  "fp16": kwargs.get('mixed_precision') == 'fp16' and device == 'cuda',
474
  "bf16": kwargs.get('mixed_precision') == 'bf16' and device == 'cuda',
475
- "max_grad_norm": float(kwargs.get('max_grad_norm', 0.3)),
476
  "warmup_ratio": float(kwargs.get('warmup_ratio', 0.03)),
477
  "lr_scheduler_type": kwargs.get('scheduler', 'cosine'),
478
  "weight_decay": float(kwargs.get('weight_decay', 0.01)),
@@ -507,15 +570,21 @@ def _create_training_args(output_dir, repo_id, **kwargs):
507
 
508
  return TrainingArguments(**args_dict)
509
 
 
510
  def _generic_model_loader(model_name_or_path, model_class, **kwargs):
511
  quantization_type = kwargs.get('quantization', 'no')
512
  bnb_config = None
513
 
514
  if quantization_type != "no" and device == "cuda":
515
- if quantization_type == "4bit":
516
- bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch_dtype_auto, bnb_4bit_use_double_quant=True)
517
- elif quantization_type == "8bit":
518
- bnb_config = BitsAndBytesConfig(load_in_8bit=True)
 
 
 
 
 
519
  elif quantization_type != "no" and device == "cpu":
520
  logger.warning("La cuantización solo es compatible con GPU CUDA. Se procederá sin cuantización.")
521
 
@@ -556,6 +625,7 @@ def _generic_model_loader(model_name_or_path, model_class, **kwargs):
556
 
557
  return model
558
 
 
559
  def _find_all_linear_names(model, quantization_type):
560
  cls = torch.nn.Linear
561
  if quantization_type != 'no' and device == "cuda":
@@ -581,6 +651,7 @@ def _find_all_linear_names(model, quantization_type):
581
 
582
  return list(lora_module_names.intersection(common_targets)) or list(lora_module_names)
583
 
 
584
  def _conversation_formatting_func(example, tokenizer, **kwargs):
585
  conv_col = ""
586
  for key in ["messages", "conversations", "turns"]:
@@ -592,6 +663,7 @@ def _conversation_formatting_func(example, tokenizer, **kwargs):
592
  except: return ""
593
  return tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)
594
 
 
595
  def _sft_formatting_func(example, text_col, tokenizer, **kwargs):
596
  if kwargs.get('enable_cot_input') or kwargs.get('enable_tool_use_input'):
597
  messages = []
@@ -610,9 +682,11 @@ def _sft_formatting_func(example, text_col, tokenizer, **kwargs):
610
  return "\n".join([m['content'] for m in messages])
611
  return example.get(text_col, "")
612
 
 
613
  def _dpo_formatting_func(example, **kwargs):
614
  return {"prompt": example.get(kwargs.get('prompt_col_input', 'prompt'), ""), "chosen": example.get(kwargs.get('dpo_chosen_col_input', 'chosen'), ""), "rejected": example.get(kwargs.get('dpo_rejected_col_input', 'rejected'), "")}
615
 
 
616
  def _evaluate_perplexity(model, tokenizer, eval_dataset, text_col):
617
  model.eval()
618
  encodings = tokenizer("\n\n".join(ex[text_col] for ex in islice(eval_dataset, 1000)), return_tensors="pt").to(model.device)
@@ -637,6 +711,7 @@ def _evaluate_perplexity(model, tokenizer, eval_dataset, text_col):
637
  ppl = torch.exp(torch.stack(nlls).mean())
638
  return ppl.item()
639
 
 
640
  def _merge_multiple_loras(base_model_id, adapter_ids_str, weights_str, combination_type):
641
  adapter_ids = [s.strip() for s in adapter_ids_str.split(',') if s.strip()]
642
  if not adapter_ids:
@@ -668,6 +743,7 @@ def _merge_multiple_loras(base_model_id, adapter_ids_str, weights_str, combinati
668
  yield f"Fusión de adaptadores completada. El entrenamiento continuará con el modelo fusionado en {temp_dir}."
669
  return temp_dir
670
 
 
671
  def _run_trainer_and_upload(trainer, tokenizer, repo_id, update_logs_fn, model_card_content, **kwargs):
672
  yield update_logs_fn("Iniciando ciclo de entrenamiento...", "Entrenando")
673
  trainer.train(resume_from_checkpoint=kwargs.get('resume_from_checkpoint') or False)
@@ -677,6 +753,7 @@ def _run_trainer_and_upload(trainer, tokenizer, repo_id, update_logs_fn, model_c
677
  eval_logs = [log for log in trainer.state.log_history if 'eval_loss' in log]
678
  if eval_logs:
679
  final_metrics = eval_logs[-1]
 
680
 
681
  yield update_logs_fn("Entrenamiento finalizado.", "Guardando")
682
  output_dir = trainer.args.output_dir
@@ -695,6 +772,7 @@ def _run_trainer_and_upload(trainer, tokenizer, repo_id, update_logs_fn, model_c
695
  torch.cuda.empty_cache()
696
  return output_dir, final_metrics
697
 
 
698
  def train_sft_dpo(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
699
  output_dir = tempfile.mkdtemp()
700
  is_dpo = kwargs.get('training_mode') == "DPO (Direct Preference Optimization)"
@@ -726,13 +804,13 @@ def train_sft_dpo(model_name, train_dataset, repo_id, update_logs_fn, model_card
726
  if kwargs.get('run_evaluation'):
727
  eval_dataset_gen = _get_eval_dataset(kwargs.get('datasets_hf_text').split(","), kwargs.get('eval_dataset_hf'), kwargs.get('uploaded_val_data'), update_logs_fn)
728
  for update in eval_dataset_gen:
729
- if isinstance(update, tuple):
730
  yield update
731
  else:
732
  eval_dataset = update
733
 
734
  TrainerClass = DPOTrainer if is_dpo else (DebiasingSFTTrainer if kwargs.get('enable_loss_reweighting') else SFTTrainer)
735
- trainer_kwargs = {"model": model, "args": training_args, "train_dataset": train_dataset, "eval_dataset": eval_dataset, "peft_config": peft_config, "tokenizer": tokenizer}
736
 
737
  if is_dpo:
738
  trainer_kwargs.update({"beta": 0.1, "max_length": int(kwargs.get('block_size')), "max_prompt_length": int(kwargs.get('block_size')) // 2})
@@ -743,24 +821,15 @@ def train_sft_dpo(model_name, train_dataset, repo_id, update_logs_fn, model_card
743
  trainer_kwargs.update({"formatting_func": lambda ex: _sft_formatting_func(example=ex, tokenizer=tokenizer, text_col=text_col, **sft_kwargs)})
744
  if kwargs.get('enable_loss_reweighting'):
745
  trainer_kwargs.update({'reweighting_terms': kwargs.get('reweighting_terms', '').split(','), 'reweighting_factor': kwargs.get('reweighting_factor', 2.0)})
746
-
747
- try:
748
- trainer = TrainerClass(**trainer_kwargs)
749
- except TypeError as e:
750
- if "unexpected keyword argument 'tokenizer'" in str(e):
751
- logger.warning("Caught TypeError for tokenizer argument. Retrying without it for TRL compatibility.")
752
- trainer_kwargs.pop("tokenizer", None)
753
- trainer = TrainerClass(**trainer_kwargs)
754
- trainer.tokenizer = tokenizer
755
- else:
756
- raise e
757
-
758
  final_model_path, final_metrics = yield from _run_trainer_and_upload(trainer, tokenizer, repo_id, update_logs_fn, model_card_content, **kwargs)
759
  return final_model_path, final_metrics
760
 
761
  except Exception as e:
762
  raise Exception(f"Error en {'DPO' if is_dpo else 'SFT'}: {e}\n{traceback.format_exc()}")
763
 
 
764
  def train_sequence_classification(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
765
  output_dir = tempfile.mkdtemp()
766
  try:
@@ -771,23 +840,26 @@ def train_sequence_classification(model_name, train_dataset, repo_id, update_log
771
  tokenizer_id = kwargs.get('tokenizer_name') or model_name
772
  yield update_logs_fn(f"Cargando tokenizer '{tokenizer_id}'...", "Configuración")
773
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, trust_remote_code=True)
 
 
774
 
775
  yield update_logs_fn(f"Cargando modelo '{model_name}'...", "Configuración")
776
  model = _generic_model_loader(model_name, AutoModelForSequenceClassification, num_labels=len(labels), label2id=label2id, id2label=id2label, **kwargs)
 
777
 
778
  def preprocess(examples):
779
  return tokenizer(examples[kwargs['text_col']], truncation=True, max_length=512)
780
- train_dataset = train_dataset.map(preprocess)
781
 
782
  eval_dataset = None
783
  if kwargs.get('run_evaluation'):
784
  eval_dataset_gen = _get_eval_dataset(kwargs.get('datasets_hf_text').split(","), kwargs.get('eval_dataset_hf'), kwargs.get('uploaded_val_data'), update_logs_fn)
785
  for update in eval_dataset_gen:
786
- if isinstance(update, tuple):
787
  yield update
788
  else:
789
  eval_dataset = update
790
- if eval_dataset: eval_dataset = eval_dataset.map(preprocess)
791
 
792
  metric = hf_evaluate.load("accuracy")
793
  def compute_metrics(eval_pred):
@@ -807,6 +879,7 @@ def train_sequence_classification(model_name, train_dataset, repo_id, update_log
807
  except Exception as e:
808
  raise Exception(f"Error en Sequence Classification: {e}\n{traceback.format_exc()}")
809
 
 
810
  def train_token_classification(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
811
  output_dir = tempfile.mkdtemp()
812
  try:
@@ -843,7 +916,7 @@ def train_token_classification(model_name, train_dataset, repo_id, update_logs_f
843
  if kwargs.get('run_evaluation'):
844
  eval_dataset_gen = _get_eval_dataset(kwargs.get('datasets_hf_text').split(","), kwargs.get('eval_dataset_hf'), kwargs.get('uploaded_val_data'), update_logs_fn)
845
  for update in eval_dataset_gen:
846
- if isinstance(update, tuple):
847
  yield update
848
  else:
849
  eval_dataset = update
@@ -871,6 +944,7 @@ def train_token_classification(model_name, train_dataset, repo_id, update_logs_f
871
  except Exception as e:
872
  raise Exception(f"Error en Token Classification: {e}\n{traceback.format_exc()}")
873
 
 
874
  def train_question_answering(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
875
  output_dir = tempfile.mkdtemp()
876
  try:
@@ -935,7 +1009,7 @@ def train_question_answering(model_name, train_dataset, repo_id, update_logs_fn,
935
  eval_dataset_raw_gen = _get_eval_dataset(kwargs.get('datasets_hf_text').split(","), kwargs.get('eval_dataset_hf'), kwargs.get('uploaded_val_data'), update_logs_fn)
936
  eval_dataset_raw = None
937
  for update in eval_dataset_raw_gen:
938
- if isinstance(update, tuple):
939
  yield update
940
  else:
941
  eval_dataset_raw = update
@@ -955,6 +1029,7 @@ def train_question_answering(model_name, train_dataset, repo_id, update_logs_fn,
955
  except Exception as e:
956
  raise Exception(f"Error en Question Answering: {e}\n{traceback.format_exc()}")
957
 
 
958
  def train_seq2seq(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
959
  output_dir = tempfile.mkdtemp()
960
  try:
@@ -979,7 +1054,7 @@ def train_seq2seq(model_name, train_dataset, repo_id, update_logs_fn, model_card
979
  if kwargs.get('run_evaluation'):
980
  eval_dataset_gen = _get_eval_dataset(kwargs.get('datasets_hf_text').split(","), kwargs.get('eval_dataset_hf'), kwargs.get('uploaded_val_data'), update_logs_fn)
981
  for update in eval_dataset_gen:
982
- if isinstance(update, tuple):
983
  yield update
984
  else:
985
  eval_dataset = update
@@ -1012,6 +1087,7 @@ def train_seq2seq(model_name, train_dataset, repo_id, update_logs_fn, model_card
1012
  except Exception as e:
1013
  raise Exception(f"Error en Seq2Seq: {e}\n{traceback.format_exc()}")
1014
 
 
1015
  def train_text_to_image(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
1016
  if device == 'cpu':
1017
  raise ValueError("El entrenamiento de Text-to-Image solo es compatible con GPU CUDA.")
@@ -1023,22 +1099,30 @@ def train_text_to_image(model_name, train_dataset, repo_id, update_logs_fn, mode
1023
 
1024
  yield update_logs_fn("Configurando componentes de Diffusers...", "Text-to-Image (LoRA)")
1025
  tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder="tokenizer")
1026
- text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder")
1027
- vae = AutoencoderKL.from_pretrained(model_name, subfolder="vae")
1028
- unet = UNet2DConditionModel.from_pretrained(model_name, subfolder="unet")
1029
  noise_scheduler = DDPMScheduler.from_pretrained(model_name, subfolder="scheduler")
1030
 
1031
  vae.requires_grad_(False)
1032
  text_encoder.requires_grad_(False)
1033
  unet.train()
1034
 
1035
- yield update_logs_fn("Agregando adaptadores LoRA al modelo...", "Text-to-Image (LoRA)")
1036
  unet_lora_config = LoraConfig(
1037
  r=int(kwargs.get('lora_r', 16)), lora_alpha=int(kwargs.get('lora_alpha', 32)),
1038
  target_modules=["to_q", "to_k", "to_v", "to_out.0"],
1039
  )
1040
  unet.add_adapter(unet_lora_config)
1041
 
 
 
 
 
 
 
 
 
1042
  yield update_logs_fn("Procesando dataset de imágenes...", "Text-to-Image (LoRA)")
1043
  resolution = int(kwargs.get('diffusion_resolution', 512))
1044
 
@@ -1050,7 +1134,7 @@ def train_text_to_image(model_name, train_dataset, repo_id, update_logs_fn, mode
1050
  ])
1051
 
1052
  def preprocess_train(examples):
1053
- images = [Image.open(image).convert("RGB") for image in examples[kwargs.get('image_col', 'image')]]
1054
  examples["pixel_values"] = [train_transforms(image) for image in images]
1055
  examples["input_ids"] = tokenizer(examples[kwargs.get('text_col', 'text')], max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt").input_ids
1056
  return examples
@@ -1064,14 +1148,17 @@ def train_text_to_image(model_name, train_dataset, repo_id, update_logs_fn, mode
1064
 
1065
  def collate_fn(examples):
1066
  pixel_values = torch.stack([example["pixel_values"] for example in examples])
1067
- input_ids = torch.stack([example["input_ids"] for example in examples])
1068
  return {"pixel_values": pixel_values, "input_ids": input_ids}
1069
 
1070
  train_dataloader = DataLoader(processed_dataset, shuffle=True, collate_fn=collate_fn, batch_size=int(kwargs.get('batch_size', 1)))
 
 
 
 
1071
 
1072
- yield update_logs_fn("Configurando optimizador y planificador...", "Text-to-Image (LoRA)")
1073
  optimizer = torch.optim.AdamW(
1074
- unet.parameters(), lr=float(kwargs.get('learning_rate', 2e-5)),
1075
  betas=(float(kwargs.get('adam_beta1', 0.9)), float(kwargs.get('adam_beta2', 0.999))),
1076
  weight_decay=float(kwargs.get('weight_decay', 0.01)),
1077
  eps=float(kwargs.get('adam_epsilon', 1e-8)),
@@ -1087,36 +1174,34 @@ def train_text_to_image(model_name, train_dataset, repo_id, update_logs_fn, mode
1087
  num_training_steps=max_train_steps,
1088
  )
1089
 
1090
- unet, optimizer, train_dataloader, lr_scheduler, text_encoder, vae = accelerator.prepare(
1091
- unet, optimizer, train_dataloader, lr_scheduler, text_encoder, vae
1092
  )
1093
 
1094
- text_encoder.to(accelerator.device, dtype=torch_dtype_auto)
1095
  vae.to(accelerator.device, dtype=torch_dtype_auto)
1096
 
1097
- yield update_logs_fn("Iniciando bucle de entrenamiento de difusión...", "Text-to-Image (LoRA)")
1098
  global_step = 0
1099
  final_loss = 0
1100
  for epoch in range(num_epochs):
1101
  for step, batch in enumerate(train_dataloader):
1102
  with accelerator.accumulate(unet):
1103
- latents = vae.encode(batch["pixel_values"].to(torch_dtype_auto)).latent_dist.sample()
1104
  latents = latents * vae.config.scaling_factor
1105
  noise = torch.randn_like(latents)
1106
  bsz = latents.shape[0]
1107
- timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
1108
- timesteps = timesteps.long()
1109
-
1110
  noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
1111
- encoder_hidden_states = text_encoder(batch["input_ids"].to(accelerator.device))[0]
1112
-
1113
  noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
1114
  loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")
1115
  final_loss = loss.detach().item()
1116
 
1117
  accelerator.backward(loss)
1118
  if accelerator.sync_gradients:
1119
- accelerator.clip_grad_norm_(unet.parameters(), float(kwargs.get('max_grad_norm', 1.0)))
 
 
 
1120
 
1121
  optimizer.step()
1122
  lr_scheduler.step()
@@ -1124,16 +1209,21 @@ def train_text_to_image(model_name, train_dataset, repo_id, update_logs_fn, mode
1124
 
1125
  if accelerator.is_main_process:
1126
  if global_step % int(kwargs.get('logging_steps', 10)) == 0:
1127
- yield update_logs_fn(f"Epoch {epoch}, Step {step}, Loss: {final_loss}", "Text-to-Image (LoRA)")
1128
  global_step += 1
 
 
 
 
1129
 
1130
- yield update_logs_fn("Entrenamiento completado, guardando modelo...", "Text-to-Image (LoRA)")
1131
  accelerator.wait_for_everyone()
1132
  if accelerator.is_main_process:
1133
- unwrapped_unet = accelerator.unwrap_model(unet)
1134
-
1135
- pipeline = StableDiffusionText2ImagePipeline.from_pretrained(model_name, torch_dtype=torch_dtype_auto)
1136
- pipeline.unet.load_state_dict(unwrapped_unet.state_dict())
 
 
1137
  pipeline.save_pretrained(output_dir)
1138
 
1139
  with open(os.path.join(output_dir, "README.md"), "w", encoding="utf-8") as f:
@@ -1148,7 +1238,7 @@ def train_text_to_image(model_name, train_dataset, repo_id, update_logs_fn, mode
1148
  torch.cuda.empty_cache()
1149
  return output_dir, {"final_loss": final_loss}
1150
 
1151
-
1152
  def train_dreambooth_lora(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
1153
  if device == 'cpu':
1154
  raise ValueError("El entrenamiento de DreamBooth solo es compatible con GPU CUDA.")
@@ -1163,11 +1253,12 @@ def train_dreambooth_lora(model_name, train_dataset, repo_id, update_logs_fn, mo
1163
 
1164
  train_dataset = train_dataset.map(add_prompt)
1165
 
1166
- yield update_logs_fn(f"Usando el prompt de instancia para todas las imágenes: '{dreambooth_prompt}'", "DreamBooth LoRA (Text-to-Image)")
1167
 
1168
  final_model_path, final_metrics = yield from train_text_to_image(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs)
1169
  return final_model_path, final_metrics
1170
 
 
1171
  def _get_data_processing_pipeline(**kwargs):
1172
  hf_ids = [x.strip() for x in (kwargs.get('datasets_hf_text') or "").split(",") if x.strip()]
1173
  if not hf_ids and not kwargs.get('uploads'):
@@ -1194,9 +1285,8 @@ def _get_data_processing_pipeline(**kwargs):
1194
  if train_dataset is None:
1195
  train_dataset = hf_train_dataset
1196
  else:
1197
- from datasets import interleave_datasets
1198
  all_streams = [train_dataset, hf_train_dataset]
1199
- all_probs = [0.5, 0.5] if not probabilities else [probabilities] + probabilities[1:]
1200
  train_dataset = interleave_datasets(all_streams, probabilities=all_probs)
1201
 
1202
  if train_dataset is None:
@@ -1206,7 +1296,8 @@ def _get_data_processing_pipeline(**kwargs):
1206
  text_col, image_col, audio_col, label_col = _guess_columns(first_example)
1207
  kwargs.update({'text_col': text_col, 'image_col': image_col, 'audio_col': audio_col, 'label_col': label_col, 'uploaded_val_data': uploaded_val_data})
1208
 
1209
- if kwargs['training_mode'] not in ["DreamBooth LoRA (Text-to-Image)", "Text-to-Image (LoRA)"]:
 
1210
  if any([kwargs.get('remove_html_tags'), kwargs.get('normalize_whitespace'), kwargs.get('remove_urls_emails'), kwargs.get('redact_pii')]):
1211
  clean_kwargs = {k:v for k,v in kwargs.items() if k in ['remove_html_tags', 'normalize_whitespace', 'remove_urls_emails', 'redact_pii']}
1212
  train_dataset = train_dataset.map(lambda ex: _clean_text(ex, text_col, **clean_kwargs))
@@ -1229,38 +1320,17 @@ def _get_data_processing_pipeline(**kwargs):
1229
 
1230
  dedup_method = kwargs.get('deduplication_method')
1231
  if dedup_method != 'Ninguna':
1232
- base_iterator = train_dataset
1233
- if dedup_method == 'Exacta':
1234
- def dedup_generator_exact():
1235
- seen_texts = set()
1236
- for example in base_iterator:
1237
- text = example.get(text_col, "")
1238
- if not isinstance(text, str) or text not in seen_texts:
1239
- if isinstance(text, str) and text:
1240
- seen_texts.add(text)
1241
- yield example
1242
- train_dataset = IterableDataset.from_generator(dedup_generator_exact)
1243
- elif dedup_method == 'Semántica (MinHash)':
1244
- threshold = kwargs.get('minhash_threshold', 0.85)
1245
- num_perm = int(kwargs.get('minhash_num_perm', 128))
1246
- def dedup_generator_minhash():
1247
- lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
1248
- for i, example in enumerate(base_iterator):
1249
- text = example.get(text_col, "")
1250
- if text and isinstance(text, str) and text.strip():
1251
- m = MinHash(num_perm=num_perm)
1252
- for d in text.split():
1253
- m.update(d.encode('utf8'))
1254
- if not lsh.query(m):
1255
- lsh.insert(f"key_{i}", m)
1256
- yield example
1257
- else:
1258
- yield example
1259
- train_dataset = IterableDataset.from_generator(dedup_generator_minhash)
1260
-
1261
 
1262
  return train_dataset, kwargs
1263
 
 
1264
  def _train_and_upload(**kwargs):
1265
  logs, repo_link, final_model_path, final_metrics = "", "", None, {}
1266
 
@@ -1350,7 +1420,6 @@ def _train_and_upload(**kwargs):
1350
  raise Exception(f"No se pudo cargar el tokenizer base '{tokenizer_id}' para el modelo desde cero: {e}")
1351
  base_model_id_for_training = temp_model_dir
1352
  kwargs["peft"] = False
1353
- kwargs["merge_adapter"] = False
1354
  kwargs['tokenizer_name'] = temp_model_dir
1355
  yield update_logs(f"Modelo {architecture} inicializado en {temp_model_dir}.", "Modelo Cero") + (gr.update(), gr.update())
1356
 
@@ -1363,7 +1432,6 @@ def _train_and_upload(**kwargs):
1363
  os.environ["WANDB_PROJECT"] = kwargs.get('wandb_project_input') or f"{repo_base}"
1364
  os.environ["WANDB_LOG_MODEL"] = "checkpoint"
1365
 
1366
- from datetime import datetime
1367
  model_card_content = MODEL_CARD_TEMPLATE.format(
1368
  repo_id=repo_id, base_model=model_name, base_model_name=model_name.split('/')[-1],
1369
  training_mode=kwargs.get('training_mode'),
@@ -1390,8 +1458,11 @@ def _train_and_upload(**kwargs):
1390
  train_generator = train_func(base_model_id_for_training, train_dataset, repo_id, update_logs, model_card_content, **kwargs)
1391
  while True:
1392
  try:
1393
- update_tuple = next(train_generator)
1394
- yield update_tuple + (gr.update(), gr.update())
 
 
 
1395
  except StopIteration as e:
1396
  final_model_path, final_metrics = e.value
1397
  break
@@ -1405,7 +1476,7 @@ def _train_and_upload(**kwargs):
1405
  eval_dataset_perp = None
1406
  eval_gen = _get_eval_dataset(kwargs.get('datasets_hf_text').split(","), kwargs.get('eval_dataset_hf'), kwargs.get('uploaded_val_data'), lambda m, p: update_logs(m, p))
1407
  for update in eval_gen:
1408
- if isinstance(update, tuple):
1409
  yield update + (gr.update(), gr.update())
1410
  else:
1411
  eval_dataset_perp = update
@@ -1436,6 +1507,7 @@ def _train_and_upload(**kwargs):
1436
  gr.update(visible=False)
1437
  )
1438
 
 
1439
  def run_inference(task_mode, model_id, text_in, context_in, image_in, audio_in, temperature, top_p, max_new_tokens):
1440
  if not model_id: return "Por favor, introduce un ID de modelo del Hub.", model_id, gr.update(), gr.update(), gr.update(), gr.update()
1441
  task_name = TASK_TO_PIPELINE_MAP.get(task_mode)
@@ -1460,6 +1532,7 @@ def run_inference(task_mode, model_id, text_in, context_in, image_in, audio_in,
1460
  return f"Resultado:\n\n{json.dumps(result, indent=2, ensure_ascii=False)}", model_id, gr.update(), gr.update(), gr.update(), gr.update()
1461
  except Exception as e: return f"Error en Inferencia: {e}\n{traceback.format_exc()}", model_id, gr.update(), gr.update(), gr.update(), gr.update()
1462
 
 
1463
  def update_inference_ui(task_mode):
1464
  task_name = TASK_TO_PIPELINE_MAP.get(task_mode, "")
1465
  is_text_gen = task_name == "text-generation"
@@ -1477,6 +1550,7 @@ def update_inference_ui(task_mode):
1477
  gr.update(visible=is_text_gen)
1478
  )
1479
 
 
1480
  def create_and_upload_dataset(hf_token, repo_name, creation_type, synth_model, synth_prompt, synth_num_samples, file_uploads, progress=gr.Progress()):
1481
  if not hf_token:
1482
  return "Error: Se requiere un token de Hugging Face.", ""
@@ -1525,7 +1599,6 @@ def create_and_upload_dataset(hf_token, repo_name, creation_type, synth_model, s
1525
  for item in all_data:
1526
  f.write(json.dumps(item, ensure_ascii=False) + "\n")
1527
 
1528
- from datetime import datetime
1529
  readme_content = DATASET_CARD_TEMPLATE.format(
1530
  repo_id=repo_id,
1531
  creation_type=creation_type,
@@ -1550,10 +1623,12 @@ def create_and_upload_dataset(hf_token, repo_name, creation_type, synth_model, s
1550
  except Exception as e:
1551
  return f"❌ Error fatal durante la creación del dataset: {e}\n{traceback.format_exc()}", ""
1552
 
 
1553
  def gradio_train_wrapper(*args):
1554
  kwargs = dict(zip(all_input_components_dict.keys(), args))
1555
  yield from _train_and_upload(**kwargs)
1556
 
 
1557
  def gradio_preview_data_wrapper(*args):
1558
  kwargs = dict(zip(all_input_components_dict.keys(), args))
1559
  try:
@@ -1563,9 +1638,13 @@ def gradio_preview_data_wrapper(*args):
1563
  dataset, processed_kwargs = _get_data_processing_pipeline(**kwargs)
1564
  text_col = processed_kwargs.get('text_col')
1565
 
 
 
 
 
 
1566
  tokenizer = AutoTokenizer.from_pretrained(
1567
- kwargs.get('tokenizer_name') or kwargs.get('model_base_input') or 'gpt2',
1568
- trust_remote_code=True, use_fast=False
1569
  )
1570
  if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
1571
  if kwargs.get('chat_template_jinja', '').strip(): tokenizer.chat_template = kwargs['chat_template_jinja']
@@ -1574,20 +1653,21 @@ def gradio_preview_data_wrapper(*args):
1574
  for i, example in enumerate(islice(dataset, 5)):
1575
  formatted_text = ""
1576
  if kwargs['training_mode'] == "DPO (Direct Preference Optimization)":
1577
- formatted_text = json.dumps(_dpo_formatting_func(example, **kwargs), indent=2)
1578
  else:
1579
  formatted_text = _sft_formatting_func(example, text_col, tokenizer, **kwargs)
1580
 
1581
  preview_samples.append(f"--- MUESTRA {i+1} ---\n{formatted_text}\n")
1582
 
1583
  preview_text = "\n".join(preview_samples)
1584
- if not preview_text:
1585
- preview_text = "No se pudieron generar muestras. Revisa la configuración del dataset y el formato."
1586
  yield preview_text
1587
 
1588
  except Exception as e:
1589
  yield f"Error al generar la vista previa: {e}\n{traceback.format_exc()}"
1590
 
 
1591
  def toggle_training_mode_ui(is_scratch):
1592
  return (
1593
  gr.update(visible=not is_scratch),
@@ -1598,12 +1678,14 @@ def toggle_training_mode_ui(is_scratch):
1598
  gr.update(visible=is_scratch)
1599
  )
1600
 
 
1601
  def toggle_task_specific_ui(training_mode):
1602
  is_classification = "Classification" in training_mode
1603
  is_dpo = "DPO" in training_mode
1604
  is_sft = "Causal" in training_mode
1605
  is_ner = "Token Classification" in training_mode
1606
  is_diffusion = training_mode in ["Text-to-Image (LoRA)", "DreamBooth LoRA (Text-to-Image)"]
 
1607
 
1608
  return (
1609
  gr.update(visible=is_classification or is_ner),
@@ -1613,13 +1695,15 @@ def toggle_task_specific_ui(training_mode):
1613
  gr.update(visible=training_mode == "DreamBooth LoRA (Text-to-Image)"),
1614
  gr.update(visible=not is_diffusion),
1615
  gr.update(visible=is_diffusion),
1616
- gr.update(visible=not is_diffusion),
 
1617
  )
1618
 
1619
-
1620
def toggle_auto_modules_ui(is_auto):
    """Hide the manual LoRA target-modules control when automatic selection is enabled."""
    show_manual_controls = not is_auto
    return gr.update(visible=show_manual_controls)
1622
 
 
1623
def toggle_dataset_creator_ui(choice):
    """Swap between the synthetic-generation and the file-upload panels.

    Returns two gr.update objects: the first controls the synthetic group,
    the second the upload group; exactly one is visible at a time.
    """
    synth_selected = (choice == "Sintético")
    return (
        gr.update(visible=synth_selected),
        gr.update(visible=not synth_selected),
    )
@@ -1650,7 +1734,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
1650
  dset_file_uploads = gr.File(label="Subir Archivos (.jsonl, .csv, .txt)", file_count="multiple")
1651
  dset_create_button = gr.Button("Crear y Subir Dataset", variant="primary")
1652
  with gr.Column(scale=2):
1653
- dset_status_output = gr.Textbox(label="Estado", lines=10)
1654
  dset_link_output = gr.Markdown()
1655
 
1656
  dset_creation_type.change(toggle_dataset_creator_ui, inputs=[dset_creation_type], outputs=[dset_synth_group, dset_file_group])
@@ -1706,7 +1790,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
1706
  with gr.Accordion("Avanzados", open=False):
1707
  warmup_ratio = gr.Slider(0.0, 0.5, 0.03, label="Ratio de Calentamiento")
1708
  weight_decay = gr.Textbox(label="Decaimiento de Peso", value="0.01")
1709
- max_grad_norm = gr.Textbox(label="Norma Máxima de Gradiente", value="0.3")
1710
  logging_steps = gr.Textbox(label="Pasos de Registro", value="10")
1711
  save_steps = gr.Textbox(label="Pasos de Guardado", value="50")
1712
  save_total_limit = gr.Textbox(label="Límite Total de Guardado", value="1")
@@ -1766,9 +1850,6 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
1766
  diffusion_resolution = gr.Slider(256, 1024, 512, step=64, label="Resolución")
1767
  with gr.Group(visible=False) as dreambooth_ui:
1768
  dreambooth_instance_prompt = gr.Textbox(label="Prompt de Instancia", placeholder="p.ej. 'foto de perro sks'")
1769
- dreambooth_class_prompt = gr.Textbox(label="Prompt de Clase (Opcional)", placeholder="p.ej. 'foto de perro'")
1770
- dreambooth_num_class_images = gr.Slider(0, 1000, 100, step=10, label="Nº de Imágenes de Clase")
1771
- dreambooth_prior_loss_weight = gr.Slider(0.0, 2.0, 1.0, label="Peso de Pérdida a Priori")
1772
  dreambooth_train_text_encoder = gr.Checkbox(label="Entrenar Text Encoder", value=True)
1773
  with gr.Group(visible=False) as classification_labels_ui:
1774
  classification_labels = gr.Textbox(label="Etiquetas de Clasificación (csv)", placeholder="p.ej. positivo,negativo")
@@ -1787,7 +1868,6 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
1787
  enable_cda = gr.Checkbox(label="Habilitar Aumentación Contrafactual (CDA)", value=False)
1788
  cda_json_config = gr.Textbox(label="Configuración CDA (JSON)", placeholder='[["ella", "él"], ["mujer", "hombre"]]')
1789
 
1790
-
1791
  with gr.Accordion("🔌 Integraciones", open=False):
1792
  wandb_api_key_input = gr.Textbox(label="Clave API de W&B", type="password")
1793
  wandb_project_input = gr.Textbox(label="Proyecto W&B")
@@ -1832,8 +1912,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
1832
  "diffusion_resolution": diffusion_resolution, "run_evaluation": run_evaluation, "run_perplexity_evaluation": run_perplexity_evaluation,
1833
  "enable_loss_reweighting": enable_loss_reweighting, "reweighting_terms": reweighting_terms,
1834
  "wandb_api_key_input": wandb_api_key_input, "wandb_project_input": wandb_project_input,
1835
- "dreambooth_instance_prompt": dreambooth_instance_prompt, "dreambooth_class_prompt": dreambooth_class_prompt,
1836
- "dreambooth_num_class_images": dreambooth_num_class_images, "dreambooth_prior_loss_weight": dreambooth_prior_loss_weight,
1837
  "dreambooth_train_text_encoder": dreambooth_train_text_encoder
1838
  }
1839
 
@@ -1905,4 +1984,4 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
1905
  outputs=[inf_text_out, inf_model_id, inf_text_in, inf_context_in, inf_image_in, inf_audio_in]
1906
  )
1907
 
1908
- demo.queue().launch(server_name="0.0.0.0", server_port=7860)
 
16
  import ast
17
  from itertools import islice
18
  from pathlib import Path
19
+ from collections import defaultdict
20
+ from datetime import datetime
21
+
22
  import torch
23
  import torch.nn.functional as F
24
  from torch.utils.data import DataLoader
 
32
  import textstat
33
  from datasketch import MinHash, MinHashLSH
34
  import gradio as gr
35
+ from datasets import load_dataset, IterableDataset, Dataset, DatasetDict, interleave_datasets, Audio
 
36
  from huggingface_hub import login, whoami, create_repo, upload_folder, HfApi
37
  from transformers import (
38
  AutoModelForCausalLM, AutoTokenizer, AutoConfig, TrainingArguments, Trainer,
39
  AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer,
40
+ SpeechT5ForTextToSpeech, SpeechT5Processor, SpeechT5HifiGan, AutoModelForImageClassification,
41
  AutoImageProcessor, AutoModelForAudioClassification, AutoFeatureExtractor, AutoModelForTokenClassification,
42
+ DataCollatorForTokenClassification, AutoModelForQuestionAnswering, AutoModelForSpeechSeq2Seq,
43
  AutoProcessor, DataCollatorWithPadding, pipeline, CLIPTextModel, CLIPTokenizer,
44
  DataCollatorForSeq2Seq, AutoModelForSequenceClassification, BitsAndBytesConfig,
45
  LlamaConfig, LlamaForCausalLM, MistralConfig, MistralForCausalLM, GemmaConfig, GemmaForCausalLM, GPT2Config, GPT2LMHeadModel,
 
56
  )
57
  import evaluate as hf_evaluate
58
  from jinja2 import Template
 
59
 
60
  logger = logging.getLogger(__name__)
61
 
 
95
  "DreamBooth LoRA (Text-to-Image)": "text-to-image",
96
  }
97
 
98
+ MODEL_CARD_TEMPLATE = """
99
+ ---
100
  language: es
101
  license: apache-2.0
102
  tags:
 
134
  - Gradio
135
  """
136
 
137
+ DATASET_CARD_TEMPLATE = """
138
+ ---
139
  license: mit
140
  ---
141
 
 
150
  - **Fecha de Creación:** {date}
151
  """
152
 
153
+ @spaces.GPU()
154
  class DebiasingSFTTrainer(SFTTrainer):
155
  def __init__(self, *args, reweighting_terms=None, reweighting_factor=1.0, **kwargs):
156
  super().__init__(*args, **kwargs)
 
168
  break
169
  return (loss, outputs) if return_outputs else loss
170
 
171
class DeduplicatedIterableDataset(IterableDataset):
    """Streaming wrapper that drops duplicate rows while iterating a dataset.

    The strategy is selected by ``method``:
      - 'Exacta': exact string match on ``text_col`` (keeps a set of texts seen).
      - 'Semántica (MinHash)': near-duplicate detection via MinHash LSH at the
        given Jaccard ``threshold`` with ``num_perm`` permutations.
    Any other value passes the wrapped dataset through unchanged.

    NOTE(review): removed the ``@spaces.GPU()`` decorator this commit added —
    ``spaces.GPU`` wraps *callables* for ZeroGPU scheduling; applying it to a
    class definition breaks ``isinstance`` checks and the datasets
    IterableDataset machinery, and deduplication here is CPU-only anyway.
    """

    def __init__(self, dataset, text_col, method, threshold=0.85, num_perm=128):
        # NOTE(review): passing ``iter([])`` as ex_iterable leans on internal
        # datasets API — confirm it is accepted by the installed datasets version.
        super().__init__(ex_iterable=iter([]))
        self.dataset = dataset
        self.text_col = text_col
        self.method = method
        self.threshold = threshold
        self.num_perm = num_perm
        # Propagate dataset metadata so downstream consumers can still read it.
        if hasattr(dataset, '_info'):
            self._info = dataset._info
        elif hasattr(dataset, 'info'):
            self._info = dataset.info

    def __iter__(self):
        if self.method == 'Exacta':
            return self._exact_iter()
        elif self.method == 'Semántica (MinHash)':
            return self._minhash_iter()
        else:
            return iter(self.dataset)

    def _exact_iter(self):
        # Exact dedup: O(1) membership test per row; memory grows with the
        # number of distinct texts. Empty or non-string texts pass through.
        seen_texts = set()
        for example in self.dataset:
            text = example.get(self.text_col, "")
            if text and isinstance(text, str):
                if text not in seen_texts:
                    seen_texts.add(text)
                    yield example
            else:
                yield example

    def _minhash_iter(self):
        # Near-duplicate dedup: a row is emitted only when no previously
        # indexed row falls within the LSH similarity threshold.
        lsh = MinHashLSH(threshold=self.threshold, num_perm=self.num_perm)
        for i, example in enumerate(self.dataset):
            text = example.get(self.text_col, "")
            if text and isinstance(text, str) and text.strip():
                m = MinHash(num_perm=self.num_perm)
                for token in text.split():
                    m.update(token.encode('utf8'))
                if not lsh.query(m):
                    lsh.insert(f"key_{i}", m)
                    yield example
            else:
                yield example
217
+
218
+ @spaces.GPU()
219
  def hf_login(token):
220
  if not token:
221
  return "Por favor, introduce un token."
 
226
  except Exception as e:
227
  return f"❌ Error en la conexión: {e}"
228
 
229
+ @spaces.GPU()
230
  def _clean_text(example, text_col, **kwargs):
231
  text = example.get(text_col, "")
232
  if not isinstance(text, str):
 
244
  example[text_col] = text
245
  return example
246
 
247
+ @spaces.GPU()
248
  def _apply_quality_filters(example, text_col, min_len, max_len, rep_threshold, exclude_keywords):
249
  text = example.get(text_col, "")
250
  if not isinstance(text, str): return False
 
258
  lower_text = text.lower()
259
  return not any(keyword in lower_text for keyword in exclude_keywords)
260
 
261
+ @spaces.GPU()
262
  def _get_filter_functions(**kwargs):
263
  filters = []
264
  if kwargs.get('enable_quality_filter'):
 
307
  filters.append(stats_filter)
308
  return filters
309
 
310
+ @spaces.GPU()
311
  def _load_hf_streaming(ids, split="train", probabilities=None):
312
  streams = []
313
  valid_ids = []
 
335
  if probabilities and len(probabilities) != len(streams):
336
  logger.warning(f"Number of probabilities ({len(probabilities)}) does not match number of valid datasets ({len(streams)}). Ignoring weights.")
337
  probabilities = None
 
 
338
  return interleave_datasets(streams, probabilities=probabilities)
339
 
340
+ @spaces.GPU()
341
  def _load_uploaded_stream(files):
342
  all_rows = []
343
  for f in files or []:
 
359
  random.shuffle(all_rows)
360
  return {"train": all_rows[:-val_size] if val_size > 0 else all_rows, "validation": all_rows[-val_size:] if val_size > 0 else []}
361
 
362
+ @spaces.GPU()
363
  def _guess_columns(sample):
364
  text_col, image_col, audio_col, label_col = "text", "image", "audio", "label"
365
  if not isinstance(sample, dict):
 
376
  elif "labels" in keys: label_col = keys["labels"]
377
  return text_col, image_col, audio_col, label_col
378
 
379
+ @spaces.GPU()
380
  def _apply_cda(dataset, text_col, cda_config_str):
381
  try:
382
  swap_groups = json.loads(cda_config_str)
 
409
  current_texts.update(next_texts)
410
  return IterableDataset.from_generator(cda_generator)
411
 
412
+ @spaces.GPU()
413
  def _apply_back_translation(dataset, text_col, ratio, model_id, reverse_model_id):
414
  if not ratio or ratio <= 0:
415
  return dataset
 
437
  logger.warning(f"Error en retrotraducción: {e}")
438
  return IterableDataset.from_generator(bt_generator)
439
 
440
+ @spaces.GPU()
441
  def _generate_synthetic_data(original_dataset, text_col, model_id, num_samples, prompt_template):
442
  if not num_samples or num_samples <= 0:
443
  return None
 
470
  continue
471
  return IterableDataset.from_generator(synthetic_generator)
472
 
473
+ @spaces.GPU()
474
  def _calculate_auto_config(block_size, is_gpt2_like, steps_per_epoch_estimate, batch_size, gradient_accumulation):
475
  safe_steps = int(steps_per_epoch_estimate or 10000)
476
  safe_batch_size = int(batch_size or 1)
 
489
  kv_heads = heads if is_gpt2_like else (max(1, heads // 4))
490
  return vocab_size, hidden_size, hidden_size * 2, layers, heads, safe_block_size, False, kv_heads
491
 
492
+ @spaces.GPU()
493
  def _get_eval_dataset(train_ds_id, eval_ds_id, uploaded_val_data, update_logs_fn):
494
  if eval_ds_id:
495
  yield update_logs_fn(f"Cargando dataset de evaluación: {eval_ds_id}", "Evaluación")
 
511
  yield update_logs_fn("No se proporcionó dataset de evaluación. Omitiendo.", "Evaluación")
512
  return None
513
 
514
+ @spaces.GPU()
515
  def _create_training_args(output_dir, repo_id, **kwargs):
516
  neftune_alpha = float(kwargs.get('neftune_noise_alpha', 0.0))
517
  optim_args_dict = {}
 
530
  "save_strategy": "steps",
531
  "logging_steps": int(kwargs.get('logging_steps', 10)),
532
  "save_steps": int(kwargs.get('save_steps', 50)),
533
+ "evaluation_strategy": "steps" if kwargs.get('run_evaluation', False) else "no",
534
  "eval_steps": int(kwargs.get('save_steps', 50)) if kwargs.get('run_evaluation', False) else None,
535
  "learning_rate": float(kwargs.get('learning_rate', 2e-5)),
536
  "fp16": kwargs.get('mixed_precision') == 'fp16' and device == 'cuda',
537
  "bf16": kwargs.get('mixed_precision') == 'bf16' and device == 'cuda',
538
+ "max_grad_norm": float(kwargs.get('max_grad_norm', 1.0)),
539
  "warmup_ratio": float(kwargs.get('warmup_ratio', 0.03)),
540
  "lr_scheduler_type": kwargs.get('scheduler', 'cosine'),
541
  "weight_decay": float(kwargs.get('weight_decay', 0.01)),
 
570
 
571
  return TrainingArguments(**args_dict)
572
 
573
+ @spaces.GPU()
574
  def _generic_model_loader(model_name_or_path, model_class, **kwargs):
575
  quantization_type = kwargs.get('quantization', 'no')
576
  bnb_config = None
577
 
578
  if quantization_type != "no" and device == "cuda":
579
+ try:
580
+ import bitsandbytes as bnb
581
+ if quantization_type == "4bit":
582
+ bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch_dtype_auto, bnb_4bit_use_double_quant=True)
583
+ elif quantization_type == "8bit":
584
+ bnb_config = BitsAndBytesConfig(load_in_8bit=True)
585
+ except ImportError:
586
+ logger.warning("bitsandbytes no está instalado. No se puede cargar en 4bit/8bit.")
587
+
588
  elif quantization_type != "no" and device == "cpu":
589
  logger.warning("La cuantización solo es compatible con GPU CUDA. Se procederá sin cuantización.")
590
 
 
625
 
626
  return model
627
 
628
+ @spaces.GPU()
629
  def _find_all_linear_names(model, quantization_type):
630
  cls = torch.nn.Linear
631
  if quantization_type != 'no' and device == "cuda":
 
651
 
652
  return list(lora_module_names.intersection(common_targets)) or list(lora_module_names)
653
 
654
+ @spaces.GPU()
655
  def _conversation_formatting_func(example, tokenizer, **kwargs):
656
  conv_col = ""
657
  for key in ["messages", "conversations", "turns"]:
 
663
  except: return ""
664
  return tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)
665
 
666
+ @spaces.GPU()
667
  def _sft_formatting_func(example, text_col, tokenizer, **kwargs):
668
  if kwargs.get('enable_cot_input') or kwargs.get('enable_tool_use_input'):
669
  messages = []
 
682
  return "\n".join([m['content'] for m in messages])
683
  return example.get(text_col, "")
684
 
685
+ @spaces.GPU()
686
  def _dpo_formatting_func(example, **kwargs):
687
  return {"prompt": example.get(kwargs.get('prompt_col_input', 'prompt'), ""), "chosen": example.get(kwargs.get('dpo_chosen_col_input', 'chosen'), ""), "rejected": example.get(kwargs.get('dpo_rejected_col_input', 'rejected'), "")}
688
 
689
+ @spaces.GPU()
690
  def _evaluate_perplexity(model, tokenizer, eval_dataset, text_col):
691
  model.eval()
692
  encodings = tokenizer("\n\n".join(ex[text_col] for ex in islice(eval_dataset, 1000)), return_tensors="pt").to(model.device)
 
711
  ppl = torch.exp(torch.stack(nlls).mean())
712
  return ppl.item()
713
 
714
+ @spaces.GPU()
715
  def _merge_multiple_loras(base_model_id, adapter_ids_str, weights_str, combination_type):
716
  adapter_ids = [s.strip() for s in adapter_ids_str.split(',') if s.strip()]
717
  if not adapter_ids:
 
743
  yield f"Fusión de adaptadores completada. El entrenamiento continuará con el modelo fusionado en {temp_dir}."
744
  return temp_dir
745
 
746
+ @spaces.GPU()
747
  def _run_trainer_and_upload(trainer, tokenizer, repo_id, update_logs_fn, model_card_content, **kwargs):
748
  yield update_logs_fn("Iniciando ciclo de entrenamiento...", "Entrenando")
749
  trainer.train(resume_from_checkpoint=kwargs.get('resume_from_checkpoint') or False)
 
753
  eval_logs = [log for log in trainer.state.log_history if 'eval_loss' in log]
754
  if eval_logs:
755
  final_metrics = eval_logs[-1]
756
+ final_metrics = {k.replace('eval_', ''): v for k, v in final_metrics.items()}
757
 
758
  yield update_logs_fn("Entrenamiento finalizado.", "Guardando")
759
  output_dir = trainer.args.output_dir
 
772
  torch.cuda.empty_cache()
773
  return output_dir, final_metrics
774
 
775
+ @spaces.GPU()
776
  def train_sft_dpo(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
777
  output_dir = tempfile.mkdtemp()
778
  is_dpo = kwargs.get('training_mode') == "DPO (Direct Preference Optimization)"
 
804
  if kwargs.get('run_evaluation'):
805
  eval_dataset_gen = _get_eval_dataset(kwargs.get('datasets_hf_text').split(","), kwargs.get('eval_dataset_hf'), kwargs.get('uploaded_val_data'), update_logs_fn)
806
  for update in eval_dataset_gen:
807
+ if isinstance(update, dict):
808
  yield update
809
  else:
810
  eval_dataset = update
811
 
812
  TrainerClass = DPOTrainer if is_dpo else (DebiasingSFTTrainer if kwargs.get('enable_loss_reweighting') else SFTTrainer)
813
+ trainer_kwargs = {"model": model, "args": training_args, "train_dataset": train_dataset, "eval_dataset": eval_dataset, "peft_config": peft_config}
814
 
815
  if is_dpo:
816
  trainer_kwargs.update({"beta": 0.1, "max_length": int(kwargs.get('block_size')), "max_prompt_length": int(kwargs.get('block_size')) // 2})
 
821
  trainer_kwargs.update({"formatting_func": lambda ex: _sft_formatting_func(example=ex, tokenizer=tokenizer, text_col=text_col, **sft_kwargs)})
822
  if kwargs.get('enable_loss_reweighting'):
823
  trainer_kwargs.update({'reweighting_terms': kwargs.get('reweighting_terms', '').split(','), 'reweighting_factor': kwargs.get('reweighting_factor', 2.0)})
824
+
825
+ trainer = TrainerClass(**trainer_kwargs)
 
 
 
 
 
 
 
 
 
 
826
  final_model_path, final_metrics = yield from _run_trainer_and_upload(trainer, tokenizer, repo_id, update_logs_fn, model_card_content, **kwargs)
827
  return final_model_path, final_metrics
828
 
829
  except Exception as e:
830
  raise Exception(f"Error en {'DPO' if is_dpo else 'SFT'}: {e}\n{traceback.format_exc()}")
831
 
832
+ @spaces.GPU()
833
  def train_sequence_classification(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
834
  output_dir = tempfile.mkdtemp()
835
  try:
 
840
  tokenizer_id = kwargs.get('tokenizer_name') or model_name
841
  yield update_logs_fn(f"Cargando tokenizer '{tokenizer_id}'...", "Configuración")
842
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, trust_remote_code=True)
843
+ if tokenizer.pad_token is None:
844
+ tokenizer.pad_token = tokenizer.eos_token
845
 
846
  yield update_logs_fn(f"Cargando modelo '{model_name}'...", "Configuración")
847
  model = _generic_model_loader(model_name, AutoModelForSequenceClassification, num_labels=len(labels), label2id=label2id, id2label=id2label, **kwargs)
848
+ model.config.pad_token_id = tokenizer.pad_token_id
849
 
850
  def preprocess(examples):
851
  return tokenizer(examples[kwargs['text_col']], truncation=True, max_length=512)
852
+ train_dataset = train_dataset.map(preprocess, batched=True)
853
 
854
  eval_dataset = None
855
  if kwargs.get('run_evaluation'):
856
  eval_dataset_gen = _get_eval_dataset(kwargs.get('datasets_hf_text').split(","), kwargs.get('eval_dataset_hf'), kwargs.get('uploaded_val_data'), update_logs_fn)
857
  for update in eval_dataset_gen:
858
+ if isinstance(update, dict):
859
  yield update
860
  else:
861
  eval_dataset = update
862
+ if eval_dataset: eval_dataset = eval_dataset.map(preprocess, batched=True)
863
 
864
  metric = hf_evaluate.load("accuracy")
865
  def compute_metrics(eval_pred):
 
879
  except Exception as e:
880
  raise Exception(f"Error en Sequence Classification: {e}\n{traceback.format_exc()}")
881
 
882
+ @spaces.GPU()
883
  def train_token_classification(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
884
  output_dir = tempfile.mkdtemp()
885
  try:
 
916
  if kwargs.get('run_evaluation'):
917
  eval_dataset_gen = _get_eval_dataset(kwargs.get('datasets_hf_text').split(","), kwargs.get('eval_dataset_hf'), kwargs.get('uploaded_val_data'), update_logs_fn)
918
  for update in eval_dataset_gen:
919
+ if isinstance(update, dict):
920
  yield update
921
  else:
922
  eval_dataset = update
 
944
  except Exception as e:
945
  raise Exception(f"Error en Token Classification: {e}\n{traceback.format_exc()}")
946
 
947
+ @spaces.GPU()
948
  def train_question_answering(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
949
  output_dir = tempfile.mkdtemp()
950
  try:
 
1009
  eval_dataset_raw_gen = _get_eval_dataset(kwargs.get('datasets_hf_text').split(","), kwargs.get('eval_dataset_hf'), kwargs.get('uploaded_val_data'), update_logs_fn)
1010
  eval_dataset_raw = None
1011
  for update in eval_dataset_raw_gen:
1012
+ if isinstance(update, dict):
1013
  yield update
1014
  else:
1015
  eval_dataset_raw = update
 
1029
  except Exception as e:
1030
  raise Exception(f"Error en Question Answering: {e}\n{traceback.format_exc()}")
1031
 
1032
+ @spaces.GPU()
1033
  def train_seq2seq(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
1034
  output_dir = tempfile.mkdtemp()
1035
  try:
 
1054
  if kwargs.get('run_evaluation'):
1055
  eval_dataset_gen = _get_eval_dataset(kwargs.get('datasets_hf_text').split(","), kwargs.get('eval_dataset_hf'), kwargs.get('uploaded_val_data'), update_logs_fn)
1056
  for update in eval_dataset_gen:
1057
+ if isinstance(update, dict):
1058
  yield update
1059
  else:
1060
  eval_dataset = update
 
1087
  except Exception as e:
1088
  raise Exception(f"Error en Seq2Seq: {e}\n{traceback.format_exc()}")
1089
 
1090
+ @spaces.GPU()
1091
  def train_text_to_image(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
1092
  if device == 'cpu':
1093
  raise ValueError("El entrenamiento de Text-to-Image solo es compatible con GPU CUDA.")
 
1099
 
1100
  yield update_logs_fn("Configurando componentes de Diffusers...", "Text-to-Image (LoRA)")
1101
  tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder="tokenizer")
1102
+ text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder", torch_dtype=torch_dtype_auto)
1103
+ vae = AutoencoderKL.from_pretrained(model_name, subfolder="vae", torch_dtype=torch_dtype_auto)
1104
+ unet = UNet2DConditionModel.from_pretrained(model_name, subfolder="unet", torch_dtype=torch_dtype_auto)
1105
  noise_scheduler = DDPMScheduler.from_pretrained(model_name, subfolder="scheduler")
1106
 
1107
  vae.requires_grad_(False)
1108
  text_encoder.requires_grad_(False)
1109
  unet.train()
1110
 
1111
+ yield update_logs_fn("Agregando adaptadores LoRA al UNet...", "Text-to-Image (LoRA)")
1112
  unet_lora_config = LoraConfig(
1113
  r=int(kwargs.get('lora_r', 16)), lora_alpha=int(kwargs.get('lora_alpha', 32)),
1114
  target_modules=["to_q", "to_k", "to_v", "to_out.0"],
1115
  )
1116
  unet.add_adapter(unet_lora_config)
1117
 
1118
+ if kwargs.get('dreambooth_train_text_encoder', False):
1119
+ yield update_logs_fn("Agregando adaptadores LoRA al Text Encoder...", "DreamBooth LoRA")
1120
+ text_encoder_lora_config = LoraConfig(
1121
+ r=int(kwargs.get('lora_r', 16)), lora_alpha=int(kwargs.get('lora_alpha', 32)),
1122
+ target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
1123
+ )
1124
+ text_encoder.add_adapter(text_encoder_lora_config)
1125
+
1126
  yield update_logs_fn("Procesando dataset de imágenes...", "Text-to-Image (LoRA)")
1127
  resolution = int(kwargs.get('diffusion_resolution', 512))
1128
 
 
1134
  ])
1135
 
1136
  def preprocess_train(examples):
1137
+ images = [image.convert("RGB") for image in examples[kwargs.get('image_col', 'image')]]
1138
  examples["pixel_values"] = [train_transforms(image) for image in images]
1139
  examples["input_ids"] = tokenizer(examples[kwargs.get('text_col', 'text')], max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt").input_ids
1140
  return examples
 
1148
 
1149
  def collate_fn(examples):
1150
  pixel_values = torch.stack([example["pixel_values"] for example in examples])
1151
+ input_ids = torch.stack([e["input_ids"][0] for e in examples])
1152
  return {"pixel_values": pixel_values, "input_ids": input_ids}
1153
 
1154
  train_dataloader = DataLoader(processed_dataset, shuffle=True, collate_fn=collate_fn, batch_size=int(kwargs.get('batch_size', 1)))
1155
+
1156
+ params_to_optimize = list(unet.parameters())
1157
+ if kwargs.get('dreambooth_train_text_encoder', False):
1158
+ params_to_optimize += list(text_encoder.parameters())
1159
 
 
1160
  optimizer = torch.optim.AdamW(
1161
+ params_to_optimize, lr=float(kwargs.get('learning_rate', 2e-5)),
1162
  betas=(float(kwargs.get('adam_beta1', 0.9)), float(kwargs.get('adam_beta2', 0.999))),
1163
  weight_decay=float(kwargs.get('weight_decay', 0.01)),
1164
  eps=float(kwargs.get('adam_epsilon', 1e-8)),
 
1174
  num_training_steps=max_train_steps,
1175
  )
1176
 
1177
+ unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
1178
+ unet, text_encoder, optimizer, train_dataloader, lr_scheduler
1179
  )
1180
 
 
1181
  vae.to(accelerator.device, dtype=torch_dtype_auto)
1182
 
 
1183
  global_step = 0
1184
  final_loss = 0
1185
  for epoch in range(num_epochs):
1186
  for step, batch in enumerate(train_dataloader):
1187
  with accelerator.accumulate(unet):
1188
+ latents = vae.encode(batch["pixel_values"].to(dtype=torch_dtype_auto)).latent_dist.sample()
1189
  latents = latents * vae.config.scaling_factor
1190
  noise = torch.randn_like(latents)
1191
  bsz = latents.shape[0]
1192
+ timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device).long()
 
 
1193
  noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
1194
+ encoder_hidden_states = text_encoder(batch["input_ids"])[0]
 
1195
  noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
1196
  loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")
1197
  final_loss = loss.detach().item()
1198
 
1199
  accelerator.backward(loss)
1200
  if accelerator.sync_gradients:
1201
+ params_to_clip = list(unet.parameters())
1202
+ if kwargs.get('dreambooth_train_text_encoder', False):
1203
+ params_to_clip += list(text_encoder.parameters())
1204
+ accelerator.clip_grad_norm_(params_to_clip, float(kwargs.get('max_grad_norm', 1.0)))
1205
 
1206
  optimizer.step()
1207
  lr_scheduler.step()
 
1209
 
1210
  if accelerator.is_main_process:
1211
  if global_step % int(kwargs.get('logging_steps', 10)) == 0:
1212
+ yield update_logs_fn(f"Epoch {epoch}, Step {step}, Loss: {final_loss:.4f}", "Entrenando Difusión")
1213
  global_step += 1
1214
+ if global_step >= max_train_steps:
1215
+ break
1216
+ if global_step >= max_train_steps:
1217
+ break
1218
 
 
1219
  accelerator.wait_for_everyone()
1220
  if accelerator.is_main_process:
1221
+ pipeline = StableDiffusionText2ImagePipeline.from_pretrained(
1222
+ model_name,
1223
+ unet=accelerator.unwrap_model(unet),
1224
+ text_encoder=accelerator.unwrap_model(text_encoder),
1225
+ torch_dtype=torch_dtype_auto,
1226
+ )
1227
  pipeline.save_pretrained(output_dir)
1228
 
1229
  with open(os.path.join(output_dir, "README.md"), "w", encoding="utf-8") as f:
 
1238
  torch.cuda.empty_cache()
1239
  return output_dir, {"final_loss": final_loss}
1240
 
1241
+ @spaces.GPU()
1242
  def train_dreambooth_lora(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
1243
  if device == 'cpu':
1244
  raise ValueError("El entrenamiento de DreamBooth solo es compatible con GPU CUDA.")
 
1253
 
1254
  train_dataset = train_dataset.map(add_prompt)
1255
 
1256
+ yield update_logs_fn(f"Usando el prompt de instancia para todas las imágenes: '{dreambooth_prompt}'", "DreamBooth LoRA")
1257
 
1258
  final_model_path, final_metrics = yield from train_text_to_image(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs)
1259
  return final_model_path, final_metrics
1260
 
1261
+ @spaces.GPU()
1262
  def _get_data_processing_pipeline(**kwargs):
1263
  hf_ids = [x.strip() for x in (kwargs.get('datasets_hf_text') or "").split(",") if x.strip()]
1264
  if not hf_ids and not kwargs.get('uploads'):
 
1285
  if train_dataset is None:
1286
  train_dataset = hf_train_dataset
1287
  else:
 
1288
  all_streams = [train_dataset, hf_train_dataset]
1289
+ all_probs = [0.5, 0.5]
1290
  train_dataset = interleave_datasets(all_streams, probabilities=all_probs)
1291
 
1292
  if train_dataset is None:
 
1296
  text_col, image_col, audio_col, label_col = _guess_columns(first_example)
1297
  kwargs.update({'text_col': text_col, 'image_col': image_col, 'audio_col': audio_col, 'label_col': label_col, 'uploaded_val_data': uploaded_val_data})
1298
 
1299
+ is_text_task = kwargs['training_mode'] not in ["DreamBooth LoRA (Text-to-Image)", "Text-to-Image (LoRA)", "Image Classification (Vision)", "Audio Classification (Speech)"]
1300
+ if is_text_task:
1301
  if any([kwargs.get('remove_html_tags'), kwargs.get('normalize_whitespace'), kwargs.get('remove_urls_emails'), kwargs.get('redact_pii')]):
1302
  clean_kwargs = {k:v for k,v in kwargs.items() if k in ['remove_html_tags', 'normalize_whitespace', 'remove_urls_emails', 'redact_pii']}
1303
  train_dataset = train_dataset.map(lambda ex: _clean_text(ex, text_col, **clean_kwargs))
 
1320
 
1321
  dedup_method = kwargs.get('deduplication_method')
1322
  if dedup_method != 'Ninguna':
1323
+ train_dataset = DeduplicatedIterableDataset(
1324
+ dataset=train_dataset,
1325
+ text_col=text_col,
1326
+ method=dedup_method,
1327
+ threshold=kwargs.get('minhash_threshold', 0.85),
1328
+ num_perm=int(kwargs.get('minhash_num_perm', 128))
1329
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1330
 
1331
  return train_dataset, kwargs
1332
 
1333
+ @spaces.GPU()
1334
  def _train_and_upload(**kwargs):
1335
  logs, repo_link, final_model_path, final_metrics = "", "", None, {}
1336
 
 
1420
  raise Exception(f"No se pudo cargar el tokenizer base '{tokenizer_id}' para el modelo desde cero: {e}")
1421
  base_model_id_for_training = temp_model_dir
1422
  kwargs["peft"] = False
 
1423
  kwargs['tokenizer_name'] = temp_model_dir
1424
  yield update_logs(f"Modelo {architecture} inicializado en {temp_model_dir}.", "Modelo Cero") + (gr.update(), gr.update())
1425
 
 
1432
  os.environ["WANDB_PROJECT"] = kwargs.get('wandb_project_input') or f"{repo_base}"
1433
  os.environ["WANDB_LOG_MODEL"] = "checkpoint"
1434
 
 
1435
  model_card_content = MODEL_CARD_TEMPLATE.format(
1436
  repo_id=repo_id, base_model=model_name, base_model_name=model_name.split('/')[-1],
1437
  training_mode=kwargs.get('training_mode'),
 
1458
  train_generator = train_func(base_model_id_for_training, train_dataset, repo_id, update_logs, model_card_content, **kwargs)
1459
  while True:
1460
  try:
1461
+ update = next(train_generator)
1462
+ if isinstance(update, tuple) and len(update) == 4:
1463
+ yield update + (gr.update(), gr.update())
1464
+ else:
1465
+ pass
1466
  except StopIteration as e:
1467
  final_model_path, final_metrics = e.value
1468
  break
 
1476
  eval_dataset_perp = None
1477
  eval_gen = _get_eval_dataset(kwargs.get('datasets_hf_text').split(","), kwargs.get('eval_dataset_hf'), kwargs.get('uploaded_val_data'), lambda m, p: update_logs(m, p))
1478
  for update in eval_gen:
1479
+ if isinstance(update, dict):
1480
  yield update + (gr.update(), gr.update())
1481
  else:
1482
  eval_dataset_perp = update
 
1507
  gr.update(visible=False)
1508
  )
1509
 
1510
+ @spaces.GPU()
1511
  def run_inference(task_mode, model_id, text_in, context_in, image_in, audio_in, temperature, top_p, max_new_tokens):
1512
  if not model_id: return "Por favor, introduce un ID de modelo del Hub.", model_id, gr.update(), gr.update(), gr.update(), gr.update()
1513
  task_name = TASK_TO_PIPELINE_MAP.get(task_mode)
 
1532
  return f"Resultado:\n\n{json.dumps(result, indent=2, ensure_ascii=False)}", model_id, gr.update(), gr.update(), gr.update(), gr.update()
1533
  except Exception as e: return f"Error en Inferencia: {e}\n{traceback.format_exc()}", model_id, gr.update(), gr.update(), gr.update(), gr.update()
1534
 
1535
+ @spaces.GPU()
1536
  def update_inference_ui(task_mode):
1537
  task_name = TASK_TO_PIPELINE_MAP.get(task_mode, "")
1538
  is_text_gen = task_name == "text-generation"
 
1550
  gr.update(visible=is_text_gen)
1551
  )
1552
 
1553
+ @spaces.GPU()
1554
  def create_and_upload_dataset(hf_token, repo_name, creation_type, synth_model, synth_prompt, synth_num_samples, file_uploads, progress=gr.Progress()):
1555
  if not hf_token:
1556
  return "Error: Se requiere un token de Hugging Face.", ""
 
1599
  for item in all_data:
1600
  f.write(json.dumps(item, ensure_ascii=False) + "\n")
1601
 
 
1602
  readme_content = DATASET_CARD_TEMPLATE.format(
1603
  repo_id=repo_id,
1604
  creation_type=creation_type,
 
1623
  except Exception as e:
1624
  return f"❌ Error fatal durante la creación del dataset: {e}\n{traceback.format_exc()}", ""
1625
 
1626
def gradio_train_wrapper(*args):
    """Adapter between Gradio's positional callback arguments and the
    keyword-based training entry point.

    Gradio passes component values positionally in the order of the
    ``inputs=`` list; we zip them back onto the keys of
    ``all_input_components_dict`` (which must preserve that same order)
    and stream every progress update from ``_train_and_upload``.

    NOTE(review): the ``@spaces.GPU()`` decorator was removed here on
    purpose — ``_train_and_upload`` already carries it, and stacking the
    decorator on this thin generator wrapper makes a ZeroGPU Space
    request the GPU twice (once per decorated frame) for a single
    training run. Confirm against the Space's ZeroGPU configuration.
    """
    # dict() of zip() relies on insertion order of all_input_components_dict
    # matching the Gradio inputs= list built from it.
    kwargs = dict(zip(all_input_components_dict.keys(), args))
    # Re-yield each (logs, link, ...) tuple so the UI streams live updates.
    yield from _train_and_upload(**kwargs)
1630
 
1631
+ @spaces.GPU()
1632
  def gradio_preview_data_wrapper(*args):
1633
  kwargs = dict(zip(all_input_components_dict.keys(), args))
1634
  try:
 
1638
  dataset, processed_kwargs = _get_data_processing_pipeline(**kwargs)
1639
  text_col = processed_kwargs.get('text_col')
1640
 
1641
+ model_id_for_tokenizer = kwargs.get('model_base_input')
1642
+ if not model_id_for_tokenizer:
1643
+ raise ValueError("Se necesita un ID de modelo base para cargar el tokenizer para la vista previa.")
1644
+
1645
+ tokenizer_id = kwargs.get('tokenizer_name') or model_id_for_tokenizer
1646
  tokenizer = AutoTokenizer.from_pretrained(
1647
+ tokenizer_id, trust_remote_code=True, use_fast=False
 
1648
  )
1649
  if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
1650
  if kwargs.get('chat_template_jinja', '').strip(): tokenizer.chat_template = kwargs['chat_template_jinja']
 
1653
  for i, example in enumerate(islice(dataset, 5)):
1654
  formatted_text = ""
1655
  if kwargs['training_mode'] == "DPO (Direct Preference Optimization)":
1656
+ formatted_text = json.dumps(_dpo_formatting_func(example, **kwargs), indent=2, ensure_ascii=False)
1657
  else:
1658
  formatted_text = _sft_formatting_func(example, text_col, tokenizer, **kwargs)
1659
 
1660
  preview_samples.append(f"--- MUESTRA {i+1} ---\n{formatted_text}\n")
1661
 
1662
  preview_text = "\n".join(preview_samples)
1663
+ if not preview_samples:
1664
+ preview_text = "No se pudieron generar muestras. Revisa la configuración del dataset, los filtros y el formato."
1665
  yield preview_text
1666
 
1667
  except Exception as e:
1668
  yield f"Error al generar la vista previa: {e}\n{traceback.format_exc()}"
1669
 
1670
+ @spaces.GPU()
1671
  def toggle_training_mode_ui(is_scratch):
1672
  return (
1673
  gr.update(visible=not is_scratch),
 
1678
  gr.update(visible=is_scratch)
1679
  )
1680
 
1681
+ @spaces.GPU()
1682
  def toggle_task_specific_ui(training_mode):
1683
  is_classification = "Classification" in training_mode
1684
  is_dpo = "DPO" in training_mode
1685
  is_sft = "Causal" in training_mode
1686
  is_ner = "Token Classification" in training_mode
1687
  is_diffusion = training_mode in ["Text-to-Image (LoRA)", "DreamBooth LoRA (Text-to-Image)"]
1688
+ is_streaming = not is_diffusion
1689
 
1690
  return (
1691
  gr.update(visible=is_classification or is_ner),
 
1695
  gr.update(visible=training_mode == "DreamBooth LoRA (Text-to-Image)"),
1696
  gr.update(visible=not is_diffusion),
1697
  gr.update(visible=is_diffusion),
1698
+ gr.update(visible=not is_streaming),
1699
+ gr.update(visible=is_streaming)
1700
  )
1701
 
1702
def toggle_auto_modules_ui(is_auto):
    """Show the manual LoRA target-modules field only when auto-detection
    is disabled.

    Pure UI callback: it only builds a ``gr.update`` and performs no
    tensor/model work, so it must NOT be decorated with ``@spaces.GPU()``
    — on a ZeroGPU Space that decorator queues for and briefly holds a
    GPU on every checkbox toggle, wasting quota and adding latency.

    Args:
        is_auto: value of the "auto modules" checkbox.

    Returns:
        A ``gr.update`` making the manual field visible iff auto is off.
    """
    return gr.update(visible=not is_auto)
1705
 
1706
def toggle_dataset_creator_ui(choice):
    """Swap between the synthetic-generation panel and the file-upload
    panel of the dataset-creator tab.

    Pure UI callback (no GPU work), so the ``@spaces.GPU()`` decorator is
    deliberately omitted: on a ZeroGPU Space it would allocate a GPU for
    a simple radio-button change.

    Args:
        choice: selected creation type; the literal ``"Sintético"``
            selects the synthetic path.

    Returns:
        Two ``gr.update`` objects: (synthetic group visibility,
        file-upload group visibility) — always mutually exclusive.
    """
    is_synth = choice == "Sintético"
    return gr.update(visible=is_synth), gr.update(visible=not is_synth)
 
1734
  dset_file_uploads = gr.File(label="Subir Archivos (.jsonl, .csv, .txt)", file_count="multiple")
1735
  dset_create_button = gr.Button("Crear y Subir Dataset", variant="primary")
1736
  with gr.Column(scale=2):
1737
+ dset_status_output = gr.Textbox(label="Estado", lines=10, interactive=False)
1738
  dset_link_output = gr.Markdown()
1739
 
1740
  dset_creation_type.change(toggle_dataset_creator_ui, inputs=[dset_creation_type], outputs=[dset_synth_group, dset_file_group])
 
1790
  with gr.Accordion("Avanzados", open=False):
1791
  warmup_ratio = gr.Slider(0.0, 0.5, 0.03, label="Ratio de Calentamiento")
1792
  weight_decay = gr.Textbox(label="Decaimiento de Peso", value="0.01")
1793
+ max_grad_norm = gr.Textbox(label="Norma Máxima de Gradiente", value="1.0")
1794
  logging_steps = gr.Textbox(label="Pasos de Registro", value="10")
1795
  save_steps = gr.Textbox(label="Pasos de Guardado", value="50")
1796
  save_total_limit = gr.Textbox(label="Límite Total de Guardado", value="1")
 
1850
  diffusion_resolution = gr.Slider(256, 1024, 512, step=64, label="Resolución")
1851
  with gr.Group(visible=False) as dreambooth_ui:
1852
  dreambooth_instance_prompt = gr.Textbox(label="Prompt de Instancia", placeholder="p.ej. 'foto de perro sks'")
 
 
 
1853
  dreambooth_train_text_encoder = gr.Checkbox(label="Entrenar Text Encoder", value=True)
1854
  with gr.Group(visible=False) as classification_labels_ui:
1855
  classification_labels = gr.Textbox(label="Etiquetas de Clasificación (csv)", placeholder="p.ej. positivo,negativo")
 
1868
  enable_cda = gr.Checkbox(label="Habilitar Aumentación Contrafactual (CDA)", value=False)
1869
  cda_json_config = gr.Textbox(label="Configuración CDA (JSON)", placeholder='[["ella", "él"], ["mujer", "hombre"]]')
1870
 
 
1871
  with gr.Accordion("🔌 Integraciones", open=False):
1872
  wandb_api_key_input = gr.Textbox(label="Clave API de W&B", type="password")
1873
  wandb_project_input = gr.Textbox(label="Proyecto W&B")
 
1912
  "diffusion_resolution": diffusion_resolution, "run_evaluation": run_evaluation, "run_perplexity_evaluation": run_perplexity_evaluation,
1913
  "enable_loss_reweighting": enable_loss_reweighting, "reweighting_terms": reweighting_terms,
1914
  "wandb_api_key_input": wandb_api_key_input, "wandb_project_input": wandb_project_input,
1915
+ "dreambooth_instance_prompt": dreambooth_instance_prompt,
 
1916
  "dreambooth_train_text_encoder": dreambooth_train_text_encoder
1917
  }
1918
 
 
1984
  outputs=[inf_text_out, inf_model_id, inf_text_in, inf_context_in, inf_image_in, inf_audio_in]
1985
  )
1986
 
1987
# Enable request queuing (required for generator/streaming callbacks),
# then start the app; debug=True surfaces tracebacks in the UI/logs.
demo.queue()
demo.launch(debug=True)