diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -1,6 +1,8 @@ import os -os.system("pip install -U transformers peft accelerate trl bitsandbytes datasets diffusers") -os.system("pip install spaces-0.1.0-py3-none-any.whl") +os.system("pip install -U gradio") +os.system("pip install -U bitsandbytes diffusers torchaudio torchvision torch transformers peft accelerate trl datasets") +os.system("pip install spaces") + import io import json import tempfile @@ -40,36 +42,34 @@ from huggingface_hub import login, whoami, create_repo, upload_folder, HfApi, hf from transformers import ( AutoModelForCausalLM, AutoTokenizer, AutoConfig, TrainingArguments, Trainer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, - SpeechT5ForTextToSpeech, SpeechT5Processor, SpeechT5HifiGan, AutoModelForImageClassification, + AutoModelForImageClassification, AutoImageProcessor, AutoModelForAudioClassification, AutoFeatureExtractor, AutoModelForTokenClassification, DataCollatorForTokenClassification, AutoModelForQuestionAnswering, AutoModelForSpeechSeq2Seq, - AutoProcessor, DataCollatorWithPadding, pipeline, CLIPTextModel, CLIPTokenizer, - DataCollatorForSeq2Seq, AutoModelForSequenceClassification, BitsAndBytesConfig, + AutoProcessor, DataCollatorWithPadding, pipeline, + DataCollatorForSeq2Seq, AutoModelForSequenceClassification, LlamaConfig, LlamaForCausalLM, MistralConfig, MistralForCausalLM, GemmaConfig, GemmaForCausalLM, GPT2Config, GPT2LMHeadModel, PhiConfig, PhiForCausalLM, Qwen2Config, Qwen2ForCausalLM, - DataCollatorForLanguageModeling, DefaultDataCollator, Adafactor, TrainerCallback + DataCollatorForLanguageModeling, DefaultDataCollator, Adafactor ) -from peft import LoraConfig, get_peft_model, PeftModel, prepare_model_for_kbit_training, AdaLoraConfig, PeftConfig +from peft import LoraConfig, get_peft_model, PeftModel from trl import SFTTrainer, DPOTrainer -from diffusers import ( - UNet2DConditionModel, DDPMScheduler, AutoencoderKL, DiffusionPipeline, - get_scheduler as get_diffusers_scheduler, StableDiffusionPipeline as StableDiffusionText2ImagePipeline, - StableDiffusionImg2ImgPipeline as StableDiffusionImage2ImagePipeline, - get_cosine_schedule_with_warmup -) import evaluate as hf_evaluate from jinja2 import Template import spaces from tqdm.auto import tqdm +from diffusers import ( + UNet2DConditionModel, DDPMScheduler, AutoencoderKL, + get_scheduler as get_diffusers_scheduler, StableDiffusionPipeline as StableDiffusionText2ImagePipeline, + StableDiffusionImg2ImgPipeline as StableDiffusionImage2ImagePipeline +) logger = logging.getLogger(__name__) +torch_dtype_auto = torch.float32 -if torch.cuda.is_available(): - device = "cuda" - torch_dtype_auto = torch.float16 -else: - device = "cpu" - torch_dtype_auto = torch.float32 +def _sanitize_model_name_for_yaml(model_name): + name = model_name.split('/')[-1] if '/' in model_name else model_name + sanitized = re.sub(r'[^a-zA-Z0-9\-_\.]', '-', name) + return sanitized if sanitized else "model" ARCHITECTURE_MAP = {"Llama": (LlamaConfig, LlamaForCausalLM), "Mistral": (MistralConfig, MistralForCausalLM), "Gemma": (GemmaConfig, GemmaForCausalLM), "GPT2": (GPT2Config, GPT2LMHeadModel), "Phi": (PhiConfig, PhiForCausalLM), "Qwen2": (Qwen2Config, Qwen2ForCausalLM)} SCRATCH_TOKENIZER_MAP = {"Llama": "meta-llama/Llama-2-7b-hf", "Mistral": "mistralai/Mistral-7B-v0.1", "Gemma": "google/gemma-2b", "GPT2": "gpt2", "Phi": "microsoft/phi-2", "Qwen2": "Qwen/Qwen2-0.5B"} @@ -79,8 +79,7 @@ TRAINING_MODES = [ "Question Answering (Text)", "Token 
Classification (NER)", "Sequence Classification (Text)", - "Text-to-Image (LoRA)", - "DreamBooth LoRA (Text-to-Image)", + "Text-to-Image Generation", "Image Classification (Vision)", "Audio Classification (Speech)", "ASR (Speech-to-Text)", @@ -96,9 +95,9 @@ TASK_TO_PIPELINE_MAP = { "Audio Classification (Speech)": "audio-classification", "ASR (Speech-to-Text)": "automatic-speech-recognition", "Text2Text Generation": "text2text-generation", - "Text-to-Image (LoRA)": "text-to-image", - "DreamBooth LoRA (Text-to-Image)": "text-to-image", + "Text-to-Image Generation": "text-to-image", } + MODEL_CARD_TEMPLATE = """--- language: es license: apache-2.0 @@ -110,7 +109,7 @@ widget: - text: "Hola, ¿cómo estás?" --- # {repo_id} -Este modelo es una versión afinada de [{base_model}](https://huggingface.co/{base_model}) entrenado con la herramienta AutoTrain-Advanced. +Este modelo es una versión afinada de [{base_model}](https://huggingface.co/{base_model}) entrenado con la herramienta [AutoTrain-Advanced](https://huggingface.co/spaces/autotrain-projects/autotrain-advanced). ## Detalles del Entrenamiento - **Modo de Entrenamiento:** {training_mode} - **Modelo Base:** `{base_model}` @@ -119,68 +118,80 @@ Este modelo es una versión afinada de [{base_model}](https://huggingface.co/{ba ### Hiperparámetros de Entrenamiento ```json {hyperparameters}``` +### Frameworks Utilizados +- Transformers +- PEFT +- Accelerate +- TRL +- Gradio """ DATASET_CARD_TEMPLATE = """--- license: mit --- # {repo_id} -Dataset creado con AutoTrain-Advanced. -## Detalles -- **Tipo:** {creation_type} -- **Modelo Generador:** `{generation_model}` -- **Fecha:** {date} +Este dataset fue creado utilizando la herramienta [AutoTrain-Advanced](https://huggingface.co/spaces/autotrain-projects/autotrain-advanced). 
+## Detalles del Dataset +- **Tipo de Creación:** {creation_type} +- **Modelo de Generación (si aplica):** `{generation_model}` +- **Fecha de Creación:** {date} """ - -class GradioLogCallback(TrainerCallback): - def __init__(self, log_function): - self.log_function = log_function - - def on_log(self, args, state, control, logs=None, **kwargs): - if logs: - msg = f"Step {state.global_step}: {logs}" - self.log_function(msg, "Entrenando") - -@spaces.GPU() +_tox_pipe_singleton = None class DebiasingSFTTrainer(SFTTrainer): def __init__(self, *args, reweighting_terms=None, reweighting_factor=1.0, **kwargs): super().__init__(*args, **kwargs) self.reweighting_terms = [term.strip().lower() for term in reweighting_terms] if reweighting_terms else [] self.reweighting_factor = reweighting_factor - - def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None): - if hasattr(super(), "compute_loss") and "num_items_in_batch" in super().compute_loss.__code__.co_varnames: - loss, outputs = super().compute_loss(model, inputs, return_outputs=True, num_items_in_batch=num_items_in_batch) - else: - loss, outputs = super().compute_loss(model, inputs, return_outputs=True) - + + # num_items_in_batch se acepta por compatibilidad con las versiones de transformers que lo pasan a compute_loss. + def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None): + loss, outputs = super().compute_loss(model, inputs, return_outputs=True) if self.reweighting_terms and self.reweighting_factor > 1.0: input_ids = inputs.get("input_ids") decoded_texts = self.tokenizer.batch_decode(input_ids, skip_special_tokens=True) - multiplier = 1.0 for text in decoded_texts: if any(term in text.lower() for term in self.reweighting_terms): - multiplier = self.reweighting_factor + loss *= self.reweighting_factor break - loss *= multiplier return (loss, outputs) if return_outputs else loss +class DeduplicatedIterableDataset(IterableDataset): + def __init__(self, dataset, text_col, method, threshold=0.85, num_perm=128): + super().__init__(ex_iterable=iter([])) + self.dataset = dataset + self.text_col = text_col + self.method = method + self.threshold = threshold + self.num_perm = num_perm + if hasattr(dataset, '_info'): + self._info = dataset._info + elif hasattr(dataset, 'info'): + self._info = dataset.info -def _deduplication_generator(dataset, text_col, method, threshold, num_perm): - if method == 'Exacta': + def __iter__(self): + if self.method == 'Exacta': + return self._exact_iter() + elif self.method == 'Semántica (MinHash)': + return self._minhash_iter() + else: + return iter(self.dataset) + + def _exact_iter(self): seen_texts = set() - for example in dataset: - text = example.get(text_col, "") + for example in self.dataset: + text = example.get(self.text_col, "") if text and isinstance(text, str): if text not in seen_texts: seen_texts.add(text) yield example else: yield example - elif method == 'Semántica (MinHash)': - lsh = MinHashLSH(threshold=threshold, num_perm=num_perm) - for i, example in enumerate(dataset): - text = example.get(text_col, "") + + def _minhash_iter(self): + lsh = MinHashLSH(threshold=self.threshold, num_perm=self.num_perm) + for i, example in enumerate(self.dataset): + text = example.get(self.text_col, "") if text and isinstance(text, str) and text.strip(): - m = MinHash(num_perm=num_perm) + m = MinHash(num_perm=self.num_perm) for d in text.split(): m.update(d.encode('utf8')) if not lsh.query(m): @@ -188,22 +199,8 @@ def _deduplication_generator(dataset, text_col, method, threshold, num_perm): yield example else: yield example - else: - yield from dataset - -def
_create_deduplicated_iterable_dataset(dataset, text_col, method, threshold=0.85, num_perm=128): - return IterableDataset.from_generator( - _deduplication_generator, - gen_kwargs={ - "dataset": dataset, - "text_col": text_col, - "method": method, - "threshold": threshold, - "num_perm": num_perm, - } - ) -@spaces.GPU() +@spaces.GPU def hf_login(token): if not token: return "Por favor, introduce un token." @@ -213,8 +210,7 @@ def hf_login(token): return f"✅ Conectado como: {user['name']}" except Exception as e: return f"❌ Error en la conexión: {e}" - -@spaces.GPU() +@spaces.GPU def _clean_text(example, text_col, **kwargs): text = example.get(text_col, "") if not isinstance(text, str): @@ -223,16 +219,15 @@ text = BeautifulSoup(text, "html.parser").get_text() if kwargs.get('remove_urls_emails'): text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) - text = re.sub(r'\S+@\S+', '', text) if kwargs.get('normalize_whitespace'): text = ' '.join(text.split()) if kwargs.get('redact_pii'): + text = re.sub(r'\S+@\S+', '', text) text = re.sub(r'(\d{1,4}[-.\s]?){7,}|(\+\d{1,3}\s?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}', '', text) text = re.sub(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', '', text) example[text_col] = text return example - -@spaces.GPU() +@spaces.GPU def _apply_quality_filters(example, text_col, min_len, max_len, rep_threshold, exclude_keywords): text = example.get(text_col, "") if not isinstance(text, str): return False @@ -245,8 +240,73 @@ if not word_counts or (max(word_counts.values()) / len(words)) > rep_threshold: return False lower_text = text.lower() return not any(keyword in lower_text for keyword in exclude_keywords) - -@spaces.GPU() +@spaces.GPU +def _apply_coherence_filter(example, text_col, char_rep_threshold, ngram_rep_threshold, entropy_threshold): + text = example.get(text_col, "") + if not isinstance(text, str) or not text: + return False + char_repetition_ratio = 0 + if len(text) > 0: + for char in set(text): + if char.isalnum() or char in '.,;:!?': + char_count = text.count(char) + char_ratio = char_count / len(text) + char_repetition_ratio = max(char_repetition_ratio, char_ratio) + if char_repetition_ratio > char_rep_threshold: + return False + text_lower = text.lower() + ngram_counts = {} + for n in [3, 4, 5]: + if len(text_lower) >= n: + for i in range(len(text_lower) - n + 1): + ngram = text_lower[i:i+n] + if ngram.isalpha(): + ngram_counts[ngram] = ngram_counts.get(ngram, 0) + 1 + if ngram_counts: + highly_repeated_ngrams = {ng for ng, count in ngram_counts.items() if count > 3} + if highly_repeated_ngrams: + covered_positions = set() + for i in range(len(text_lower)): + for n in [3, 4, 5]: + if i + n <= len(text_lower): + ngram = text_lower[i:i+n] + if ngram in highly_repeated_ngrams: + for j in range(i, i+n): + covered_positions.add(j) + repetition_coverage = len(covered_positions) / len(text_lower) + if repetition_coverage > ngram_rep_threshold: + return False + if len(text) > 10: + char_freq = {} + for char in text: + char_freq[char] = char_freq.get(char, 0) + 1 + entropy = 0 + for count in char_freq.values(): + p = count / len(text) + if p > 0: + entropy -= p * math.log2(p) + max_entropy = math.log2(len(char_freq)) if len(char_freq) > 0 else 1 + normalized_entropy = entropy / max_entropy if max_entropy > 0 else 0 + if normalized_entropy < entropy_threshold: + return False + if len(text) > 0: + alnum_count = sum(1 for c in text if c.isalnum() or c.isspace()) + alnum_ratio = alnum_count / len(text) + if alnum_ratio < 0.7: + return False + scripts = { + 'greek': sum(1 for c in text if '\u0370' <= c <= '\u03FF'), + 'cyrillic': sum(1 for c in text if '\u0400' <= c <= '\u04FF'), + 'arabic': sum(1 for c in text if '\u0600' <= c <= '\u06FF'), + 'chinese': sum(1 for c in text if '\u4E00' <= c <= '\u9FFF'), + } + non_latin_chars = sum(scripts.values()) + # Conteo por carácter: se consideran latinos los alfabéticos ASCII. + latin_chars = sum(1 for c in text if c.isalpha() and c.isascii()) + if non_latin_chars > 2 and latin_chars > 10: + return False + return True +@spaces.GPU def _get_filter_functions(**kwargs): filters = [] if kwargs.get('enable_quality_filter'): @@ -267,15 +327,25 @@ if kwargs.get('enable_toxicity_filter'): tox_threshold = kwargs.get('toxicity_threshold', 0.8) def tox_filter(ex): + global _tox_pipe_singleton + if _tox_pipe_singleton is None: + logger.info("Initializing toxicity filter pipeline...") + _tox_pipe_singleton = pipeline("text-classification", model="unitary/toxic-bert") text = ex.get(kwargs['text_col'], "") if not text or not isinstance(text, str): return True try: - return True + results = _tox_pipe_singleton(text[:512], truncation=True) + return not (results[0]['label'] == 'toxic' and results[0]['score'] > tox_threshold) except Exception: return True filters.append(tox_filter) + if kwargs.get('enable_coherence_filter'): + char_rep_thresh = kwargs.get('coherence_char_repetition_threshold', 0.4) + ngram_rep_thresh = kwargs.get('coherence_ngram_repetition_threshold', 0.3) + entropy_thresh = kwargs.get('coherence_entropy_threshold', 0.5) + filters.append(lambda ex: _apply_coherence_filter(ex, kwargs['text_col'], char_rep_thresh, ngram_rep_thresh, entropy_thresh)) if any([kwargs.get('enable_readability_filter'), kwargs.get('enable_stopword_filter'), kwargs.get('enable_uniqueness_filter')]): - stop_words = set(textstat.DEFAULT_stopwords) + stop_words = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'should', 'could', 'can', 'may', 'might', 'must', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'what', 'which', 'who', 'when', 'where', 'why', 'how']) def stats_filter(ex): text = ex.get(kwargs['text_col'], "") if not isinstance(text, str) or not text: return True @@ -283,17 +353,20 @@ num_words = len(words) if num_words == 0: return True if kwargs.get('enable_readability_filter'): - score = textstat.flesch_reading_ease(text) - if not (kwargs['min_readability'] <= score <= kwargs['max_readability']): return False + try: + score = textstat.flesch_reading_ease(text) + if not (kwargs['min_readability'] <= score <= kwargs['max_readability']): return False + except Exception: + pass if kwargs.get('enable_stopword_filter'): - if (textstat.stopword_count(text) / num_words) > kwargs['max_stopword_ratio']: return False + stopword_count = sum(1 for word in words if word.lower() in stop_words) + if num_words > 0 and (stopword_count / num_words) > kwargs['max_stopword_ratio']: return False if kwargs.get('enable_uniqueness_filter'): if (len(set(words)) / num_words) < kwargs['min_uniqueness_ratio']: return False return True filters.append(stats_filter) return filters - -@spaces.GPU() +@spaces.GPU def _load_hf_streaming(ids, split="train", probabilities=None): streams = []
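+    # Cada fuente se carga en modo streaming; las que tienen el split pedido se combinan al final con interleave_datasets.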
valid_ids = [] @@ -313,16 +386,16 @@ if split_found: valid_ids.append(ident) else: - logger.warning(f"Split '{split}' not found in dataset {ident}. Excluding.") + logger.warning(f"Split '{split}' not found in dataset {ident}. Excluding from this source.") except Exception as e: - logger.error(f"Error loading dataset {ident} split {split}: {e}. Excluding.") + logger.error(f"Error loading dataset {ident} split {split}: {e}. Excluding from this source.") if not streams: return None if probabilities and len(probabilities) != len(streams): + logger.warning(f"Number of probabilities ({len(probabilities)}) does not match number of valid datasets ({len(streams)}). Ignoring weights.") probabilities = None return interleave_datasets(streams, probabilities=probabilities) - -@spaces.GPU() +@spaces.GPU def _load_uploaded_stream(files): all_rows = [] for f in files or []: @@ -343,8 +416,7 @@ val_size = max(1, int(len(all_rows) * 0.01)) random.shuffle(all_rows) return {"train": all_rows[:-val_size] if val_size > 0 else all_rows, "validation": all_rows[-val_size:] if val_size > 0 else []} - -@spaces.GPU() +@spaces.GPU def _guess_columns(sample): text_col, image_col, audio_col, label_col = "text", "image", "audio", "label" if not isinstance(sample, dict): @@ -360,8 +432,7 @@ if "label" in keys: label_col = keys["label"] elif "labels" in keys: label_col = keys["labels"] return text_col, image_col, audio_col, label_col - -@spaces.GPU() +@spaces.GPU def _apply_cda(dataset, text_col, cda_config_str): try: swap_groups = json.loads(cda_config_str) @@ -393,14 +464,14 @@ next_texts.add(new_text) current_texts.update(next_texts) return IterableDataset.from_generator(cda_generator) - -@spaces.GPU() +@spaces.GPU def _apply_back_translation(dataset, text_col, ratio, model_id, reverse_model_id): if not ratio or ratio <= 0: return dataset + logger.info(f"Aplicando retrotraducción al {ratio*100}% del dataset.") try: - pipe_to = pipeline("translation", model=model_id, device=0 if device == 'cuda' else -1) - pipe_from = pipeline("translation", model=reverse_model_id, device=0 if device == 'cuda' else -1) + pipe_to = pipeline("translation", model=model_id) + pipe_from = pipeline("translation", model=reverse_model_id) except Exception as e: logger.error(f"No se pudieron cargar los modelos de traducción: {e}") return dataset @@ -418,19 +489,21 @@ new_example[text_col] = back_translated yield new_example except Exception as e: - pass + logger.warning(f"Error en retrotraducción: {e}") return IterableDataset.from_generator(bt_generator) - -@spaces.GPU() +@spaces.GPU def _generate_synthetic_data(original_dataset, text_col, model_id, num_samples, prompt_template): if not num_samples or num_samples <= 0: return None + logger.info(f"Iniciando generación de {num_samples} muestras sintéticas con el modelo {model_id}.") try: - generator = pipeline("text-generation", model=model_id, torch_dtype=torch_dtype_auto, device=0 if device == 'cuda' else -1) + generator = pipeline("text-generation", model=model_id) except Exception as e: + logger.error(f"No se pudo cargar el modelo generador sintético: {e}") return None seed_examples = list(islice(original_dataset, 200)) if not seed_examples: + logger.warning("Dataset original vacío, no se pueden generar datos sintéticos.") return None
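+    # A partir de las semillas anteriores, el generador produce num_samples ejemplos nuevos usando prompt_template.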
def synthetic_generator(): for i in range(num_samples): @@ -447,10 +520,9 @@ def _generate_synthetic_data(original_dataset, text_col, model_id, num_samples, new_example[text_col] = cleaned_text yield new_example except Exception as e: + logger.warning(f"Error generando una muestra sintética: {e}") continue return IterableDataset.from_generator(synthetic_generator) - -@spaces.GPU() def _calculate_auto_config(block_size, is_gpt2_like, steps_per_epoch_estimate, batch_size, gradient_accumulation): safe_steps = int(steps_per_epoch_estimate or 10000) safe_batch_size = int(batch_size or 1) @@ -468,8 +540,7 @@ def _calculate_auto_config(block_size, is_gpt2_like, steps_per_epoch_estimate, b layers = max(8, min(32, 8 + int(log_size * 1.5))) kv_heads = heads if is_gpt2_like else (max(1, heads // 4)) return vocab_size, hidden_size, hidden_size * 2, layers, heads, safe_block_size, False, kv_heads - -@spaces.GPU() +@spaces.GPU def _get_eval_dataset(train_ds_id, eval_ds_id, uploaded_val_data, update_logs_fn): if eval_ds_id: yield update_logs_fn(f"Cargando dataset de evaluación: {eval_ds_id}", "Evaluación") @@ -490,8 +561,6 @@ def _get_eval_dataset(train_ds_id, eval_ds_id, uploaded_val_data, update_logs_fn return None yield update_logs_fn("No se proporcionó dataset de evaluación. Omitiendo.", "Evaluación") return None - -@spaces.GPU() def _create_training_args(output_dir, repo_id, **kwargs): neftune_alpha = float(kwargs.get('neftune_noise_alpha', 0.0)) optim_args_dict = {} @@ -511,15 +580,12 @@ def _create_training_args(output_dir, repo_id, **kwargs): "save_steps": int(kwargs.get('save_steps', 50)), "eval_steps": int(kwargs.get('save_steps', 50)) if kwargs.get('run_evaluation', False) else None, "learning_rate": float(kwargs.get('learning_rate', 2e-5)), - "fp16": kwargs.get('mixed_precision') == 'fp16' and device == 'cuda', - "bf16": kwargs.get('mixed_precision') == 'bf16' and device == 'cuda', "max_grad_norm": float(kwargs.get('max_grad_norm', 1.0)), "warmup_ratio": float(kwargs.get('warmup_ratio', 0.03)), "lr_scheduler_type": kwargs.get('scheduler', 'cosine'), "weight_decay": float(kwargs.get('weight_decay', 0.01)), "load_best_model_at_end": kwargs.get('run_evaluation', False), "save_total_limit": int(kwargs.get('save_total_limit', 1)), - "gradient_checkpointing": not kwargs.get('disable_gradient_checkpointing', False) and device == 'cuda', "push_to_hub": True, "hub_model_id": repo_id, "hub_strategy": kwargs.get('hub_strategy', 'every_save'), @@ -527,47 +593,25 @@ def _create_training_args(output_dir, repo_id, **kwargs): "report_to": "wandb" if kwargs.get('wandb_api_key_input') else "none", "remove_unused_columns": False, "group_by_length": kwargs.get('group_by_length', False), - "packing": kwargs.get('packing', False), "metric_for_best_model": kwargs.get('metric_for_best_model', 'loss') if kwargs.get('run_evaluation') else None, "greater_is_better": kwargs.get('greater_is_better', False), "neftune_noise_alpha": neftune_alpha if neftune_alpha > 0 else None, "adam_beta1": float(kwargs.get('adam_beta1', 0.9)), "adam_beta2": float(kwargs.get('adam_beta2', 0.999)), "adam_epsilon": float(kwargs.get('adam_epsilon', 1e-8)), - "no_cuda": device == 'cpu' } if kwargs.get('early_stopping_patience', 0) > 0 and kwargs.get('run_evaluation', False): args_dict['early_stopping_patience'] = int(kwargs['early_stopping_patience']) args_dict['load_best_model_at_end'] = True - is_diffusion_task = kwargs.get('training_mode', '') in ["Text-to-Image (LoRA)", "DreamBooth LoRA (Text-to-Image)"] - if is_diffusion_task: - 
args_dict["num_train_epochs"] = float(kwargs.get('epochs', 1.0)) + + max_steps_val = int(kwargs.get('max_steps', -1)) + if max_steps_val > 0: + args_dict["max_steps"] = max_steps_val else: - max_steps_val = int(kwargs.get('max_steps', -1)) - if max_steps_val > 0: - args_dict["max_steps"] = max_steps_val - else: - raise ValueError("Para datasets en streaming se requiere un valor positivo para 'Máximos Pasos de Entrenamiento'.") + raise ValueError("Para datasets en streaming se requiere un valor positivo para 'Máximos Pasos de Entrenamiento'.") return TrainingArguments(**args_dict) - -@spaces.GPU() +@spaces.GPU def _generic_model_loader(model_name_or_path, model_class, **kwargs): - quantization_type = kwargs.get('quantization', 'no') - bnb_config = None - if quantization_type != "no" and device == "cuda": - try: - import bitsandbytes as bnb - if quantization_type == "4bit": - bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch_dtype_auto, bnb_4bit_use_double_quant=True) - elif quantization_type == "8bit": - bnb_config = BitsAndBytesConfig(load_in_8bit=True) - except ImportError: - logger.warning("bitsandbytes no está instalado.") - elif quantization_type != "no" and device == "cpu": - logger.warning("La cuantización solo es compatible con GPU CUDA.") - attn_implementation = kwargs.get('attn_implementation', 'eager') - if attn_implementation == "flash_attention_2" and device != 'cuda': - attn_implementation = "eager" config_kwargs = {"trust_remote_code": True} if kwargs.get('label2id'): config_kwargs.update({"label2id": kwargs['label2id'], "id2label": kwargs['id2label']}) @@ -575,34 +619,17 @@ def _generic_model_loader(model_name_or_path, model_class, **kwargs): if kwargs.get('attention_dropout', 0) > 0: config.attention_dropout = kwargs['attention_dropout'] if kwargs.get('hidden_dropout', 0) > 0: config.hidden_dropout = kwargs['hidden_dropout'] model_kwargs = { - "trust_remote_code": True, "config": config, "attn_implementation": attn_implementation, - "torch_dtype": torch_dtype_auto, "quantization_config": bnb_config, + "trust_remote_code": True, + "config": config, + "torch_dtype": torch.float32, } - if device == "cuda" and bnb_config is None: - model_kwargs["device_map"] = "auto" - elif device == "cpu": - model_kwargs["device_map"] = "cpu" if kwargs.get('num_labels'): model_kwargs.update({"num_labels": kwargs['num_labels'], "ignore_mismatched_sizes": True}) model = model_class.from_pretrained(model_name_or_path, **model_kwargs) - if device == 'cpu' and hasattr(model, 'to'): - model.to(device) - if quantization_type != "no" and device == "cuda": - model = prepare_model_for_kbit_training(model) return model - -@spaces.GPU() -def _find_all_linear_names(model, quantization_type): +@spaces.GPU +def _find_all_linear_names(model): cls = torch.nn.Linear - if quantization_type != 'no' and device == "cuda": - try: - import bitsandbytes as bnb - if quantization_type == '4bit': - cls = bnb.nn.Linear4bit - elif quantization_type == '8bit': - cls = bnb.nn.Linear8bitLt - except ImportError: - pass lora_module_names = set() for name, module in model.named_modules(): if isinstance(module, cls): @@ -612,8 +639,7 @@ def _find_all_linear_names(model, quantization_type): lora_module_names.remove('lm_head') common_targets = {'q_proj', 'v_proj', 'k_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'} return list(lora_module_names.intersection(common_targets)) or list(lora_module_names) - -@spaces.GPU() +@spaces.GPU def _sft_formatting_func(example, 
text_col, tokenizer, **kwargs): if kwargs.get('sft_format_style') == "Conversacional": conv_col = "" @@ -633,7 +659,7 @@ if kwargs.get('enable_cot_input') and example.get(kwargs.get('reasoning_col_input', 'reasoning')): response_parts.append(f"{example[kwargs.get('reasoning_col_input', 'reasoning')]}") if kwargs.get('enable_tool_use_input') and example.get(kwargs.get('tool_use_col_input', 'tools')): - response_parts.append(f"{example[kwargs.get('tool_use_col_input', 'tools')]}") + response_parts.append(f"{example.get(kwargs.get('tool_use_col_input', 'tools'))}") if example.get(kwargs.get('response_col_input', 'response')): response_parts.append(example.get(kwargs.get('response_col_input', 'response'))) if response_parts: @@ -642,18 +668,17 @@ try: return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False) except Exception as e: + logger.error(f"Error aplicando la plantilla de chat: {e}.") return "\n".join([m['content'] for m in messages]) return "" return example.get(text_col, "") - -@spaces.GPU() +@spaces.GPU def _dpo_formatting_func(example, **kwargs): return {"prompt": example.get(kwargs.get('prompt_col_input', 'prompt'), ""), "chosen": example.get(kwargs.get('dpo_chosen_col_input', 'chosen'), ""), "rejected": example.get(kwargs.get('dpo_rejected_col_input', 'rejected'), "")} - -@spaces.GPU() +@spaces.GPU def _evaluate_perplexity(model, tokenizer, eval_dataset, text_col): model.eval() - encodings = tokenizer("\n\n".join(ex[text_col] for ex in islice(eval_dataset, 1000)), return_tensors="pt").to(model.device) + encodings = tokenizer("\n\n".join(ex[text_col] for ex in islice(eval_dataset, 1000)), return_tensors="pt") max_length = model.config.max_position_embeddings stride = 512 seq_len = encodings.input_ids.size(1) @@ -674,54 +699,47 @@ break ppl = torch.exp(torch.stack(nlls).mean()) return ppl.item() - -@spaces.GPU() +@spaces.GPU def _merge_multiple_loras(base_model_id, adapter_ids_str, weights_str, combination_type): adapter_ids = [s.strip() for s in adapter_ids_str.split(',') if s.strip()] if not adapter_ids: - yield "No se proporcionaron IDs de adaptadores válidos." + yield "No se proporcionaron IDs de adaptadores válidos. Omitiendo la fusión múltiple." return base_model_id try: weights = [float(w.strip()) for w in weights_str.split(',')] except Exception: weights = [1.0] * len(adapter_ids) - yield f"Cargando modelo base {base_model_id}..." - model = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=torch_dtype_auto, trust_remote_code=True, device_map=device) - try: - model = PeftModel.from_pretrained(model, adapter_ids[0]) - for i, adapter_id in enumerate(adapter_ids[1:]): - model.load_adapter(adapter_id, adapter_name=f"adapter_{i+1}") - model.add_weighted_adapter( - adapters=[f"adapter_{i}" if i > 0 else "default" for i in range(len(adapter_ids))], - weights=weights, - adapter_name="merged", - combination_type=combination_type - ) - model.set_adapter("merged") - model = model.merge_and_unload() - except Exception as e: - yield f"Error merging: {e}" - return base_model_id + if len(weights) != len(adapter_ids): + weights = [1.0] * len(adapter_ids) + yield "Pesos de adaptadores inválidos, usando 1.0 para todos." + yield f"Cargando modelo base {base_model_id} para fusión múltiple..."
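+    # Secuencia de fusión: cargar todos los adaptadores sobre el modelo base, combinarlos con add_weighted_adapter y fusionarlos con merge_and_unload.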
+    model = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=torch.float32, trust_remote_code=True) +    yield f"Cargando adaptador 1: {adapter_ids[0]}" +    # add_weighted_adapter es un método de PeftModel, así que el primer adaptador se carga envolviendo el modelo base. +    model = PeftModel.from_pretrained(model, adapter_ids[0], adapter_name="adapter_0") +    for i, adapter_id in enumerate(adapter_ids[1:], start=1): +        yield f"Cargando adaptador {i+1}: {adapter_id}" +        model.load_adapter(adapter_id, adapter_name=f"adapter_{i}") + adapter_names = [f"adapter_{i}" for i in range(len(adapter_ids))] + yield f"Combinando adaptadores: {adapter_names} con pesos: {weights} y tipo: {combination_type}" + model.add_weighted_adapter(adapters=adapter_names, weights=weights, adapter_name="combined", combination_type=combination_type) + model.set_adapter("combined") + yield "Fusionando combinación de adaptadores en el modelo base..." + merged_model = model.merge_and_unload() temp_dir = tempfile.mkdtemp() - yield f"Guardando fusionado en {temp_dir}" - model.save_pretrained(temp_dir) + yield f"Guardando modelo fusionado en {temp_dir}" + merged_model.save_pretrained(temp_dir) tokenizer = AutoTokenizer.from_pretrained(base_model_id) tokenizer.save_pretrained(temp_dir) - yield f"Listo. {temp_dir}" + yield f"Fusión de adaptadores completada. El entrenamiento continuará con el modelo fusionado en {temp_dir}." return temp_dir - -@spaces.GPU() +@spaces.GPU def _run_trainer_and_upload(trainer, tokenizer, repo_id, update_logs_fn, model_card_content, **kwargs): yield update_logs_fn("Iniciando ciclo de entrenamiento...", "Entrenando") - trainer.add_callback(GradioLogCallback(lambda msg, phase: update_logs_fn(msg, phase))) trainer.train(resume_from_checkpoint=kwargs.get('resume_from_checkpoint') or False) final_metrics = {} if kwargs.get('run_evaluation'): - try: - metrics = trainer.evaluate() - final_metrics.update(metrics) - except Exception as e: - logger.warning(f"Error en evaluación final: {e}") + eval_logs = [log for log in trainer.state.log_history if 'eval_loss' in log] + if eval_logs: + final_metrics = eval_logs[-1] + final_metrics = {k.replace('eval_', ''): v for k, v in final_metrics.items()} yield update_logs_fn("Entrenamiento finalizado.", "Guardando") output_dir = trainer.args.output_dir trainer.save_model(output_dir) @@ -731,19 +749,14 @@ f.write(model_card_content) yield update_logs_fn("Subiendo al Hub...", "Subiendo") upload_folder(folder_path=output_dir, repo_id=repo_id, commit_message="Fin de entrenamiento") - del trainer - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() return output_dir, final_metrics - -@spaces.GPU() +@spaces.GPU def train_sft_dpo(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs): output_dir = tempfile.mkdtemp() is_dpo = kwargs.get('training_mode') == "DPO (Direct Preference Optimization)" text_col = kwargs.get('text_col') try: - tokenizer_id = kwargs.get('tokenizer_name') or model_name + tokenizer_id = kwargs.get('tokenizer_name_input') or model_name yield update_logs_fn(f"Cargando tokenizer '{tokenizer_id}'...", "Configuración") tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, trust_remote_code=True, use_fast=False) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token @@ -752,8 +765,8 @@ model = _generic_model_loader(model_name, AutoModelForCausalLM, **kwargs) peft_config = None if kwargs.get('peft'): - target_modules = kwargs.get('target_modules').split(",") if not kwargs.get('auto_find_target_modules') else _find_all_linear_names(model, kwargs.get('quantization')) - yield
update_logs_fn(f"Módulos LoRA: {target_modules}", "Configuración") + target_modules = kwargs.get('target_modules').split(",") if not kwargs.get('auto_find_target_modules') else _find_all_linear_names(model) + yield update_logs_fn(f"Módulos LoRA detectados/especificados: {target_modules}", "Configuración") peft_config = LoraConfig( r=int(kwargs.get('lora_r')), lora_alpha=int(kwargs.get('lora_alpha')), lora_dropout=float(kwargs.get('lora_dropout')), target_modules=target_modules, bias="none", task_type="CAUSAL_LM", use_dora=kwargs.get('use_dora', False), @@ -765,10 +778,12 @@ def train_sft_dpo(model_name, train_dataset, repo_id, update_logs_fn, model_card if kwargs.get('run_evaluation'): eval_dataset_gen = _get_eval_dataset(kwargs.get('datasets_hf_text').split(","), kwargs.get('eval_dataset_hf'), kwargs.get('uploaded_val_data'), update_logs_fn) for update in eval_dataset_gen: - if isinstance(update, dict): yield update - else: eval_dataset = update + if isinstance(update, dict): + yield update + else: + eval_dataset = update TrainerClass = DPOTrainer if is_dpo else (DebiasingSFTTrainer if kwargs.get('enable_loss_reweighting') else SFTTrainer) - trainer_kwargs = {"model": model, "args": training_args, "train_dataset": train_dataset, "eval_dataset": eval_dataset, "tokenizer": tokenizer, "peft_config": peft_config} + trainer_kwargs = {"model": model, "args": training_args, "train_dataset": train_dataset, "eval_dataset": eval_dataset, "peft_config": peft_config} if is_dpo: trainer_kwargs.update({"beta": 0.1, "max_length": int(kwargs.get('block_size')), "max_prompt_length": int(kwargs.get('block_size')) // 2}) if train_dataset: @@ -778,7 +793,9 @@ def train_sft_dpo(model_name, train_dataset, repo_id, update_logs_fn, model_card trainer_kwargs.update({"train_dataset": train_dataset, "eval_dataset": eval_dataset}) else: sft_kwargs = kwargs.copy() - trainer_kwargs.update({"formatting_func": lambda ex: _sft_formatting_func(example=ex, tokenizer=tokenizer, text_col=text_col, **sft_kwargs), "max_seq_length": int(kwargs.get('block_size'))}) + if 'text_col' in sft_kwargs: + del sft_kwargs['text_col'] + trainer_kwargs.update({"formatting_func": lambda ex: _sft_formatting_func(ex, text_col=text_col, tokenizer=tokenizer, **sft_kwargs)}) if kwargs.get('enable_loss_reweighting'): trainer_kwargs.update({'reweighting_terms': kwargs.get('reweighting_terms', '').split(','), 'reweighting_factor': float(kwargs.get('reweighting_factor', 2.0))}) trainer = TrainerClass(**trainer_kwargs) @@ -786,15 +803,14 @@ def train_sft_dpo(model_name, train_dataset, repo_id, update_logs_fn, model_card return final_model_path, final_metrics except Exception as e: raise Exception(f"Error en {'DPO' if is_dpo else 'SFT'}: {e}\n{traceback.format_exc()}") - -@spaces.GPU() +@spaces.GPU def train_sequence_classification(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs): output_dir = tempfile.mkdtemp() try: labels = [s.strip() for s in kwargs['classification_labels'].split(',')] label2id = {l: i for i, l in enumerate(labels)} id2label = {i: l for i, l in enumerate(labels)} - tokenizer_id = kwargs.get('tokenizer_name') or model_name + tokenizer_id = kwargs.get('tokenizer_name_input') or model_name yield update_logs_fn(f"Cargando tokenizer '{tokenizer_id}'...", "Configuración") tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, trust_remote_code=True) if tokenizer.pad_token is None: @@ -809,8 +825,10 @@ def train_sequence_classification(model_name, train_dataset, repo_id, update_log if 
kwargs.get('run_evaluation'): eval_dataset_gen = _get_eval_dataset(kwargs.get('datasets_hf_text').split(","), kwargs.get('eval_dataset_hf'), kwargs.get('uploaded_val_data'), update_logs_fn) for update in eval_dataset_gen: - if isinstance(update, dict): yield update - else: eval_dataset = update + if isinstance(update, dict): + yield update + else: + eval_dataset = update if eval_dataset: eval_dataset = eval_dataset.map(preprocess, batched=True) metric = hf_evaluate.load("accuracy") def compute_metrics(eval_pred): @@ -827,15 +845,14 @@ return final_model_path, final_metrics except Exception as e: raise Exception(f"Error en Sequence Classification: {e}\n{traceback.format_exc()}") - -@spaces.GPU() +@spaces.GPU def train_token_classification(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs): output_dir = tempfile.mkdtemp() try: labels = [s.strip() for s in kwargs['classification_labels'].split(',')] label2id = {l: i for i, l in enumerate(labels)} id2label = {i: l for i, l in enumerate(labels)} - tokenizer_id = kwargs.get('tokenizer_name') or model_name + tokenizer_id = kwargs.get('tokenizer_name_input') or model_name yield update_logs_fn(f"Cargando tokenizer '{tokenizer_id}'...", "Configuración") tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, trust_remote_code=True, add_prefix_space=True) yield update_logs_fn(f"Cargando modelo '{model_name}'...", "Configuración") @@ -861,8 +878,10 @@ if kwargs.get('run_evaluation'): eval_dataset_gen = _get_eval_dataset(kwargs.get('datasets_hf_text').split(","), kwargs.get('eval_dataset_hf'), kwargs.get('uploaded_val_data'), update_logs_fn) for update in eval_dataset_gen: - if isinstance(update, dict): yield update - else: eval_dataset = update + if isinstance(update, dict): + yield update + else: + eval_dataset = update if eval_dataset: eval_dataset = eval_dataset.map(tokenize_and_align_labels, batched=True) metric = hf_evaluate.load("seqeval") def compute_metrics(p): @@ -883,12 +902,11 @@ return final_model_path, final_metrics except Exception as e: raise Exception(f"Error en Token Classification: {e}\n{traceback.format_exc()}") - -@spaces.GPU() +@spaces.GPU def train_question_answering(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs): output_dir = tempfile.mkdtemp() try: - tokenizer_id = kwargs.get('tokenizer_name') or model_name + tokenizer_id = kwargs.get('tokenizer_name_input') or model_name yield update_logs_fn(f"Cargando tokenizer '{tokenizer_id}'...", "Configuración") tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, trust_remote_code=True) yield update_logs_fn(f"Cargando modelo '{model_name}'...", "Configuración") @@ -945,8 +963,10 @@ eval_dataset_raw_gen = _get_eval_dataset(kwargs.get('datasets_hf_text').split(","), kwargs.get('eval_dataset_hf'), kwargs.get('uploaded_val_data'), update_logs_fn) eval_dataset_raw = None for update in eval_dataset_raw_gen: - if isinstance(update, dict): yield update - else: eval_dataset_raw = update + if isinstance(update, dict): + yield update + else: + eval_dataset_raw = update if eval_dataset_raw: eval_dataset = eval_dataset_raw.map(prepare_train_features, batched=True, remove_columns=list(next(iter(eval_dataset_raw)).keys()))
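+        # Con los features de entrenamiento y evaluación preparados, se construyen los argumentos compartidos y el Trainer de QA.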
training_args = _create_training_args(output_dir, repo_id, **kwargs) @@ -959,12 +979,11 @@ def train_question_answering(model_name, train_dataset, repo_id, update_logs_fn, return final_model_path, final_metrics except Exception as e: raise Exception(f"Error en Question Answering: {e}\n{traceback.format_exc()}") - -@spaces.GPU() +@spaces.GPU def train_seq2seq(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs): output_dir = tempfile.mkdtemp() try: - tokenizer_id = kwargs.get('tokenizer_name') or model_name + tokenizer_id = kwargs.get('tokenizer_name_input') or model_name yield update_logs_fn(f"Cargando tokenizer '{tokenizer_id}'...", "Configuración") tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, trust_remote_code=True) yield update_logs_fn(f"Cargando modelo '{model_name}'...", "Configuración") @@ -982,8 +1001,10 @@ def train_seq2seq(model_name, train_dataset, repo_id, update_logs_fn, model_card if kwargs.get('run_evaluation'): eval_dataset_gen = _get_eval_dataset(kwargs.get('datasets_hf_text').split(","), kwargs.get('eval_dataset_hf'), kwargs.get('uploaded_val_data'), update_logs_fn) for update in eval_dataset_gen: - if isinstance(update, dict): yield update - else: eval_dataset = update + if isinstance(update, dict): + yield update + else: + eval_dataset = update if eval_dataset: eval_dataset = eval_dataset.map(preprocess_function, batched=True) metric = hf_evaluate.load("sacrebleu") def compute_metrics(eval_preds): @@ -1009,150 +1030,158 @@ def train_seq2seq(model_name, train_dataset, repo_id, update_logs_fn, model_card return final_model_path, final_metrics except Exception as e: raise Exception(f"Error en Seq2Seq: {e}\n{traceback.format_exc()}") - -@spaces.GPU() -def train_text_to_image(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs): - if device == 'cpu': - raise ValueError("El entrenamiento de Text-to-Image solo es compatible con GPU CUDA.") +@spaces.GPU +def train_text_to_image(model_name, train_dataset, repo_id, update_logs, model_card_content, **kwargs): output_dir = tempfile.mkdtemp() - accelerator = accelerate.Accelerator( - gradient_accumulation_steps=int(kwargs.get('gradient_accumulation', 8)), - mixed_precision=kwargs.get('mixed_precision', 'no') - ) - yield update_logs_fn("Configurando componentes de Diffusers...", "Text-to-Image (LoRA)") - tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder="tokenizer") - text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder", torch_dtype=torch_dtype_auto) - vae = AutoencoderKL.from_pretrained(model_name, subfolder="vae", torch_dtype=torch_dtype_auto) - unet = UNet2DConditionModel.from_pretrained(model_name, subfolder="unet", torch_dtype=torch_dtype_auto) - noise_scheduler = DDPMScheduler.from_pretrained(model_name, subfolder="scheduler") - vae.requires_grad_(False) - text_encoder.requires_grad_(False) - unet.train() - yield update_logs_fn("Agregando adaptadores LoRA al UNet...", "Text-to-Image (LoRA)") - unet_lora_config = LoraConfig( - r=int(kwargs.get('lora_r', 16)), lora_alpha=int(kwargs.get('lora_alpha', 32)), - target_modules=["to_q", "to_k", "to_v", "to_out.0"], - ) - unet.add_adapter(unet_lora_config) - if kwargs.get('dreambooth_train_text_encoder', False): - yield update_logs_fn("Agregando adaptadores LoRA al Text Encoder...", "DreamBooth LoRA") - text_encoder_lora_config = LoraConfig( - r=int(kwargs.get('lora_r', 16)), lora_alpha=int(kwargs.get('lora_alpha', 32)), - target_modules=["q_proj", "k_proj", "v_proj", 
"out_proj"], + try: + yield update_logs(f"Iniciando entrenamiento Text-to-Image con modelo base '{model_name}'...", "Configuración") + from transformers import CLIPTextModel, CLIPTokenizer + yield update_logs("Cargando componentes del modelo de difusión...", "Configuración") + tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder="tokenizer") + text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder") + vae = AutoencoderKL.from_pretrained(model_name, subfolder="vae") + unet = UNet2DConditionModel.from_pretrained(model_name, subfolder="unet") + noise_scheduler = DDPMScheduler.from_pretrained(model_name, subfolder="scheduler") + yield update_logs("Componentes del modelo cargados exitosamente.", "Configuración") + vae.requires_grad_(False) + text_encoder.requires_grad_(False) + unet.train() + learning_rate = float(kwargs.get('learning_rate', 1e-5)) + optimizer = torch.optim.AdamW( + unet.parameters(), + lr=learning_rate, + betas=(float(kwargs.get('adam_beta1', 0.9)), float(kwargs.get('adam_beta2', 0.999))), + weight_decay=float(kwargs.get('weight_decay', 0.01)), + eps=float(kwargs.get('adam_epsilon', 1e-8)) ) - text_encoder.add_adapter(text_encoder_lora_config) - yield update_logs_fn("Procesando dataset de imágenes...", "Text-to-Image (LoRA)") - resolution = int(kwargs.get('diffusion_resolution', 512)) - train_transforms = transforms.Compose([ - transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BILINEAR), - transforms.CenterCrop(resolution), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ]) - def preprocess_train(examples): - images = [image.convert("RGB") for image in examples[kwargs.get('image_col', 'image')]] - examples["pixel_values"] = [train_transforms(image) for image in images] - examples["input_ids"] = tokenizer(examples[kwargs.get('text_col', 'text')], max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt").input_ids - return examples - with accelerator.main_process_first(): - processed_dataset = train_dataset.map( - function=preprocess_train, - batched=True, - remove_columns=[col for col in next(iter(train_dataset)).keys() if col not in ['pixel_values', 'input_ids']], + yield update_logs("Optimizador configurado.", "Configuración") + text_col = kwargs.get('text_col', 'text') + image_col = kwargs.get('image_col', 'image') + image_transforms = transforms.Compose([ + transforms.Resize(512, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(512), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ]) + def preprocess_train(examples): + images = [image.convert("RGB") for image in examples[image_col]] + examples["pixel_values"] = [image_transforms(image) for image in images] + examples["input_ids"] = tokenizer( + examples[text_col], + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt" + ).input_ids + return examples + yield update_logs("Preprocesando dataset...", "Datos") + train_dataset = train_dataset.map(preprocess_train, batched=True, remove_columns=[image_col]) + batch_size = int(kwargs.get('batch_size', 1)) + gradient_accumulation_steps = int(kwargs.get('gradient_accumulation', 4)) + max_steps = int(kwargs.get('max_steps', 1000)) + num_epochs = int(kwargs.get('num_epochs', 1)) + train_dataloader = DataLoader( + train_dataset, + batch_size=batch_size, + shuffle=True, + num_workers=2 ) - def collate_fn(examples): - pixel_values = torch.stack([example["pixel_values"] for 
example in examples]) - input_ids = torch.stack([e["input_ids"][0] for e in examples]) - return {"pixel_values": pixel_values, "input_ids": input_ids} - train_dataloader = DataLoader(processed_dataset, shuffle=True, collate_fn=collate_fn, batch_size=int(kwargs.get('batch_size', 1))) - params_to_optimize = list(unet.parameters()) - if kwargs.get('dreambooth_train_text_encoder', False): - params_to_optimize += list(text_encoder.parameters()) - optimizer = torch.optim.AdamW( - params_to_optimize, lr=float(kwargs.get('learning_rate', 2e-5)), - betas=(float(kwargs.get('adam_beta1', 0.9)), float(kwargs.get('adam_beta2', 0.999))), - weight_decay=float(kwargs.get('weight_decay', 0.01)), - eps=float(kwargs.get('adam_epsilon', 1e-8)), - ) - num_epochs = int(kwargs.get('epochs', 1)) - num_update_steps_per_epoch = math.ceil(len(train_dataloader) / int(kwargs.get('gradient_accumulation', 8))) - max_train_steps = num_epochs * num_update_steps_per_epoch - lr_scheduler = get_cosine_schedule_with_warmup( - optimizer=optimizer, - num_warmup_steps=int(max_train_steps * float(kwargs.get('warmup_ratio', 0.03))), - num_training_steps=max_train_steps, - ) - unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - unet, text_encoder, optimizer, train_dataloader, lr_scheduler - ) - vae.to(accelerator.device, dtype=torch_dtype_auto) - global_step = 0 - final_loss = 0 - for epoch in range(num_epochs): - for step, batch in enumerate(train_dataloader): - with accelerator.accumulate(unet): - latents = vae.encode(batch["pixel_values"].to(dtype=torch_dtype_auto)).latent_dist.sample() - latents = latents * vae.config.scaling_factor + from diffusers.optimization import get_scheduler as get_diffusers_lr_scheduler + lr_scheduler = get_diffusers_lr_scheduler( + kwargs.get('scheduler', 'cosine'), + optimizer=optimizer, + num_warmup_steps=int(max_steps * float(kwargs.get('warmup_ratio', 0.03))), + num_training_steps=max_steps + ) + yield update_logs(f"Iniciando entrenamiento: {max_steps} pasos, batch_size={batch_size}", "Entrenando") + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + unet = unet.to(device) + vae = vae.to(device) + text_encoder = text_encoder.to(device) + global_step = 0 + progress_bar = tqdm(range(max_steps), desc="Entrenando") + for epoch in range(num_epochs): + for step, batch in enumerate(train_dataloader): + if global_step >= max_steps: + break + pixel_values = torch.stack(batch["pixel_values"]).to(device) + with torch.no_grad(): + latents = vae.encode(pixel_values).latent_dist.sample() + latents = latents * vae.config.scaling_factor noise = torch.randn_like(latents) - bsz = latents.shape[0] - timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device).long() + timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (latents.shape[0],), device=device).long() noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - encoder_hidden_states = text_encoder(batch["input_ids"])[0] + input_ids = batch["input_ids"].to(device) + with torch.no_grad(): + encoder_hidden_states = text_encoder(input_ids)[0] noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample - loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean") - final_loss = loss.detach().item() - accelerator.backward(loss) - if accelerator.sync_gradients: - params_to_clip = list(unet.parameters()) - if kwargs.get('dreambooth_train_text_encoder', False): - params_to_clip += list(text_encoder.parameters()) 
- accelerator.clip_grad_norm_(params_to_clip, float(kwargs.get('max_grad_norm', 1.0))) - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad() - if accelerator.is_main_process: - if global_step % int(kwargs.get('logging_steps', 10)) == 0: - yield update_logs_fn(f"Epoch {epoch}, Step {step}, Loss: {final_loss:.4f}", "Entrenando Difusión") - global_step += 1 - if global_step >= max_train_steps: + loss = F.mse_loss(noise_pred, noise, reduction="mean") + loss = loss / gradient_accumulation_steps + loss.backward() + if (step + 1) % gradient_accumulation_steps == 0: + torch.nn.utils.clip_grad_norm_(unet.parameters(), float(kwargs.get('max_grad_norm', 1.0))) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + global_step += 1 + progress_bar.update(1) + if global_step % int(kwargs.get('logging_steps', 10)) == 0: + yield update_logs(f"Paso {global_step}/{max_steps} - Loss: {loss.item():.4f}", "Entrenando") + if global_step % int(kwargs.get('save_steps', 500)) == 0: + yield update_logs(f"Guardando checkpoint en paso {global_step}...", "Guardando") + checkpoint_dir = os.path.join(output_dir, f"checkpoint-{global_step}") + os.makedirs(checkpoint_dir, exist_ok=True) + unet.save_pretrained(os.path.join(checkpoint_dir, "unet")) + if kwargs.get('hub_strategy') == 'every_save': + try: + upload_folder( + folder_path=checkpoint_dir, + repo_id=repo_id, + commit_message=f"Checkpoint paso {global_step}" + ) + except Exception as e: + yield update_logs(f"Advertencia: No se pudo subir checkpoint: {e}", "Guardando") + if global_step >= max_steps: + break + if global_step >= max_steps: break - if global_step >= max_train_steps: - break - accelerator.wait_for_everyone() - if accelerator.is_main_process: - pipeline = StableDiffusionText2ImagePipeline.from_pretrained( - model_name, - unet=accelerator.unwrap_model(unet), - text_encoder=accelerator.unwrap_model(text_encoder), - torch_dtype=torch_dtype_auto, + progress_bar.close() + yield update_logs("Entrenamiento completado. 
Guardando modelo final...", "Guardando") + final_output_dir = os.path.join(output_dir, "final_model") + os.makedirs(final_output_dir, exist_ok=True) + pipeline = StableDiffusionText2ImagePipeline( + text_encoder=text_encoder, + vae=vae, + unet=unet, + tokenizer=tokenizer, + scheduler=noise_scheduler, + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False ) - pipeline.save_pretrained(output_dir) - with open(os.path.join(output_dir, "README.md"), "w", encoding="utf-8") as f: + pipeline.save_pretrained(final_output_dir) + with open(os.path.join(final_output_dir, "README.md"), "w", encoding="utf-8") as f: f.write(model_card_content) - yield update_logs_fn("Subiendo al Hub...", "Subiendo") - upload_folder(folder_path=output_dir, repo_id=repo_id, commit_message="Fin de entrenamiento de difusión") - del unet, vae, text_encoder, optimizer, train_dataloader, lr_scheduler, pipeline - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - return output_dir, {"final_loss": final_loss} - -@spaces.GPU() -def train_dreambooth_lora(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs): - if device == 'cpu': - raise ValueError("El entrenamiento de DreamBooth solo es compatible con GPU CUDA.") - dreambooth_prompt = kwargs.get('dreambooth_instance_prompt') - if not dreambooth_prompt: - raise ValueError("Se requiere un 'instance prompt' para el entrenamiento de DreamBooth.") - def add_prompt(example): - example[kwargs.get('text_col', 'text')] = dreambooth_prompt - return example - train_dataset = train_dataset.map(add_prompt) - yield update_logs_fn(f"Usando el prompt de instancia para todas las imágenes: '{dreambooth_prompt}'", "DreamBooth LoRA") - final_model_path, final_metrics = yield from train_text_to_image(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs) - return final_model_path, final_metrics - -@spaces.GPU() + yield update_logs("Modelo guardado. 
Subiendo al Hub...", "Subiendo") + upload_folder( + folder_path=final_output_dir, + repo_id=repo_id, + commit_message="Entrenamiento Text-to-Image completado" + ) + yield update_logs(f"✅ Modelo subido exitosamente a {repo_id}", "Completado") + final_metrics = { + "final_loss": loss.item(), + "total_steps": global_step, + "epochs_completed": epoch + 1 + } + del unet, vae, text_encoder, pipeline + gc.collect() + torch.cuda.empty_cache() if torch.cuda.is_available() else None + return final_output_dir, final_metrics + except Exception as e: + yield update_logs(f"❌ Error en entrenamiento Text-to-Image: {str(e)}", "Error") + raise Exception(f"Error en Text-to-Image: {e}\n{traceback.format_exc()}") +@spaces.GPU def _get_data_processing_pipeline(**kwargs): hf_ids = [x.strip() for x in (kwargs.get('datasets_hf_text') or "").split(",") if x.strip()] if not hf_ids and not kwargs.get('uploads'): @@ -1187,7 +1216,7 @@ def _get_data_processing_pipeline(**kwargs): raise ValueError("El dataset de entrenamiento está vacío después del procesamiento.") text_col, image_col, audio_col, label_col = _guess_columns(first_example) kwargs.update({'text_col': text_col, 'image_col': image_col, 'audio_col': audio_col, 'label_col': label_col, 'uploaded_val_data': uploaded_val_data}) - is_text_task = kwargs['training_mode'] not in ["DreamBooth LoRA (Text-to-Image)", "Text-to-Image (LoRA)", "Image Classification (Vision)", "Audio Classification (Speech)"] + is_text_task = kwargs['training_mode'] not in ["Image Classification (Vision)", "Audio Classification (Speech)"] if is_text_task: if any([kwargs.get('remove_html_tags'), kwargs.get('normalize_whitespace'), kwargs.get('remove_urls_emails'), kwargs.get('redact_pii')]): clean_kwargs = {k:v for k,v in kwargs.items() if k in ['remove_html_tags', 'normalize_whitespace', 'remove_urls_emails', 'redact_pii']} @@ -1206,7 +1235,7 @@ def _get_data_processing_pipeline(**kwargs): train_dataset = _apply_cda(train_dataset, text_col, kwargs['cda_json_config']) dedup_method = kwargs.get('deduplication_method') if dedup_method != 'Ninguna': - train_dataset = _create_deduplicated_iterable_dataset( + train_dataset = DeduplicatedIterableDataset( dataset=train_dataset, text_col=text_col, method=dedup_method, @@ -1214,10 +1243,10 @@ def _get_data_processing_pipeline(**kwargs): num_perm=int(kwargs.get('minhash_num_perm', 128)) ) return train_dataset, kwargs - -@spaces.GPU() -def _train_and_upload(**kwargs): +@spaces.GPU +def _train_and_upload(progress=gr.Progress(), **kwargs): logs, repo_link, final_model_path, final_metrics = "", "", None, {} + progress(0, desc="Iniciando...") yield ( "Iniciando...", "Inicio", @@ -1229,6 +1258,7 @@ def _train_and_upload(**kwargs): def update_logs(new_msg, phase_msg): nonlocal logs, repo_link, final_metrics logs += f"[{phase_msg}] {new_msg}\n" + progress(0, desc=f"[{phase_msg}] {new_msg}") return ( logs, phase_msg, @@ -1279,13 +1309,24 @@ def _train_and_upload(**kwargs): config_class, model_class = ARCHITECTURE_MAP[architecture] if kwargs.get('auto_config_scratch'): vocab_size, hidden_size, intermediate_size, layers, heads, block_size_val, tie_word_embeddings, kv_heads = _calculate_auto_config(kwargs.get('block_size'), architecture == "GPT2", kwargs.get('steps_per_epoch_estimate'), kwargs.get('batch_size'), kwargs.get('gradient_accumulation')) + config = config_class(vocab_size=vocab_size, hidden_size=hidden_size, intermediate_size=intermediate_size, num_hidden_layers=layers, num_attention_heads=heads, num_key_value_heads=kv_heads, 
+            config = config_class(vocab_size=vocab_size, hidden_size=hidden_size, intermediate_size=intermediate_size, num_hidden_layers=layers, num_attention_heads=heads, num_key_value_heads=kv_heads, max_position_embeddings=block_size_val, tie_word_embeddings=tie_word_embeddings)
+            model = model_class(config)
+        elif kwargs.get('manual_config_scratch'):
+            vocab_size = int(kwargs.get('scratch_vocab_size', 32000))
+            hidden_size = int(kwargs.get('scratch_hidden_size', 1024))
+            intermediate_size = int(kwargs.get('scratch_intermediate_size', 2048))
+            layers = int(kwargs.get('scratch_layers', 8))
+            heads = int(kwargs.get('scratch_heads', 8))
+            kv_heads = int(kwargs.get('scratch_kv_heads', 8))
+            block_size_val = int(kwargs.get('scratch_block_size', 1024))
+            tie_word_embeddings = kwargs.get('scratch_tie_word_embeddings', False)
+            config = config_class(vocab_size=vocab_size, hidden_size=hidden_size, intermediate_size=intermediate_size, num_hidden_layers=layers, num_attention_heads=heads, num_key_value_heads=kv_heads, max_position_embeddings=block_size_val, tie_word_embeddings=tie_word_embeddings)
+            model = model_class(config)
         else:
-            vocab_size, hidden_size, intermediate_size, layers, heads, kv_heads, tie_word_embeddings = 32000, 1024, 2048, 8, 8, 8, False
-            config = config_class(vocab_size=vocab_size, hidden_size=hidden_size, intermediate_size=intermediate_size, num_hidden_layers=layers, num_attention_heads=heads, num_key_value_heads=kv_heads, max_position_embeddings=int(kwargs.get('block_size', 1024)), tie_word_embeddings=tie_word_embeddings)
-            model = model_class(config)
+            raise ValueError("Debe seleccionar auto-configuración o configuración manual para entrenar desde cero.")
         temp_model_dir = tempfile.mkdtemp()
         model.save_pretrained(temp_model_dir)
-        tokenizer_id = kwargs.get('tokenizer_name') or SCRATCH_TOKENIZER_MAP.get(architecture, "gpt2")
+        tokenizer_id = kwargs.get('tokenizer_name_input') or SCRATCH_TOKENIZER_MAP.get(architecture, "gpt2")
         try:
             tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
             tokenizer.save_pretrained(temp_model_dir)
@@ -1304,7 +1345,7 @@ def _train_and_upload(**kwargs):
         os.environ["WANDB_PROJECT"] = kwargs.get('wandb_project_input') or f"{repo_base}"
         os.environ["WANDB_LOG_MODEL"] = "checkpoint"
         model_card_content = MODEL_CARD_TEMPLATE.format(
-            repo_id=repo_id, base_model=model_name, base_model_name=model_name.split('/')[-1],
+            repo_id=repo_id, base_model=model_name, base_model_name=_sanitize_model_name_for_yaml(model_name),
             training_mode=kwargs.get('training_mode'),
             datasets=', '.join([x.strip() for x in (kwargs.get('datasets_hf_text') or "").split(",") if x.strip()]) or "Archivos locales",
             hyperparameters=json.dumps({k: v for k, v in kwargs.items() if isinstance(v, (str, int, float, bool)) and 'token' not in k and 'key' not in k and v is not None}, indent=2),
@@ -1318,8 +1359,7 @@
             "Sequence Classification (Text)": train_sequence_classification,
             "Token Classification (NER)": train_token_classification,
             "Text2Text Generation": train_seq2seq,
-            "Text-to-Image (LoRA)": train_text_to_image,
-            "DreamBooth LoRA (Text-to-Image)": train_dreambooth_lora,
+            "Text-to-Image Generation": train_text_to_image,
         }
         train_func = training_function_map.get(training_mode)
         if train_func:
@@ -1328,7 +1368,7 @@
             try:
                 update = next(train_generator)
                 if isinstance(update, tuple) and len(update) == 4:
-                    yield update + (gr.update(), gr.update())
+                    yield update + (gr.update(), gr.update())
                 else:
                     pass
             except StopIteration as e:
@@ -1338,7 +1378,7 @@
             raise ValueError(f"El modo de entrenamiento '{training_mode}' no está implementado.")
         if kwargs.get('run_perplexity_evaluation') and final_model_path and training_mode in ["Causal Language Modeling (SFT/LoRA)", "DPO (Direct Preference Optimization)"]:
             yield update_logs("Iniciando evaluación de perplejidad...", "Evaluación Final") + (gr.update(), gr.update())
-            model = AutoModelForCausalLM.from_pretrained(final_model_path, torch_dtype=torch_dtype_auto, device_map=device)
+            model = AutoModelForCausalLM.from_pretrained(final_model_path, torch_dtype=torch.float32)
             tokenizer = AutoTokenizer.from_pretrained(final_model_path)
             eval_dataset_perp = None
             eval_gen = _get_eval_dataset(kwargs.get('datasets_hf_text').split(","), kwargs.get('eval_dataset_hf'), kwargs.get('uploaded_val_data'), lambda m, p: update_logs(m, p))
@@ -1371,14 +1411,13 @@
         gr.update(value="Iniciar Entrenamiento", interactive=True),
         gr.update(visible=False)
     )
-
-@spaces.GPU()
+@spaces.GPU
 def run_inference(task_mode, model_id, text_in, context_in, image_in, audio_in, temperature, top_p, max_new_tokens):
     if not model_id: return "Por favor, introduce un ID de modelo del Hub.", model_id, gr.update(), gr.update(), gr.update(), gr.update()
     task_name = TASK_TO_PIPELINE_MAP.get(task_mode)
     if not task_name: return f"La inferencia para el modo '{task_mode}' no está soportada.", model_id, gr.update(), gr.update(), gr.update(), gr.update()
     try:
-        pipe = pipeline(task_name, model=model_id, torch_dtype=torch_dtype_auto, trust_remote_code=True, device=0 if device == 'cuda' else -1)
+        pipe = pipeline(task_name, model=model_id, torch_dtype=torch.float32, trust_remote_code=True)
         result = None
         if task_name == "text-generation":
             if not text_in: return "Por favor, introduce un prompt de texto.", model_id, gr.update(), gr.update(), gr.update(), gr.update()
@@ -1395,8 +1434,6 @@ def run_inference(task_mode, model_id, text_in, context_in, image_in, audio_in,
             result = pipe(input_data)
         return f"Resultado:\n\n{json.dumps(result, indent=2, ensure_ascii=False)}", model_id, gr.update(), gr.update(), gr.update(), gr.update()
     except Exception as e: return f"Error en Inferencia: {e}\n{traceback.format_exc()}", model_id, gr.update(), gr.update(), gr.update(), gr.update()
-
-@spaces.GPU()
 def update_inference_ui(task_mode):
     task_name = TASK_TO_PIPELINE_MAP.get(task_mode, "")
     is_text_gen = task_name == "text-generation"
@@ -1412,8 +1449,7 @@ def update_inference_ui(task_mode):
         gr.update(visible=show_audio),
         gr.update(visible=is_text_gen)
     )
-
-@spaces.GPU()
+@spaces.GPU
 def create_and_upload_dataset(hf_token, repo_name, creation_type, synth_model, synth_prompt, synth_num_samples, file_uploads, progress=gr.Progress()):
     if not hf_token:
         return "Error: Se requiere un token de Hugging Face.", ""
@@ -1431,7 +1467,7 @@ def create_and_upload_dataset(hf_token, repo_name, creation_type, synth_model, s
         if not synth_model or not synth_prompt or not synth_num_samples:
             return "Error: Para la generación sintética se requiere un modelo, un prompt y un número de muestras.", ""
         progress(0, desc="Cargando modelo generador...")
-        generator = pipeline("text-generation", model=synth_model, torch_dtype=torch_dtype_auto, device=0 if device == 'cuda' else -1)
+        generator = pipeline("text-generation", model=synth_model)
         for i in progress.tqdm(range(int(synth_num_samples)), desc="Generando muestras"):
             try:
                 generated_output = generator(synth_prompt, max_new_tokens=256, num_return_sequences=1, do_sample=True, temperature=0.9, top_p=0.95)
@@ -1474,29 +1510,34 @@
         return f"✅ Dataset creado y subido exitosamente a {repo_id}", f"### ✅ [Dataset Disponible: Visita el Repositorio]({dataset_link})"
f"### ✅ [Dataset Disponible: Visita el Repositorio]({dataset_link})" except Exception as e: return f"❌ Error fatal durante la creación del dataset: {e}\n{traceback.format_exc()}", "" - -@spaces.GPU() +@spaces.GPU def gradio_train_wrapper(*args): kwargs = dict(zip(all_input_components_dict.keys(), args)) yield from _train_and_upload(**kwargs) - -@spaces.GPU() +@spaces.GPU def gradio_preview_data_wrapper(*args): kwargs = dict(zip(all_input_components_dict.keys(), args)) try: preview_text = "Procesando vista previa...\n" yield preview_text - dataset, processed_kwargs = _get_data_processing_pipeline(**kwargs) - text_col = processed_kwargs.get('text_col') model_id_for_tokenizer = kwargs.get('model_base_input') - if not model_id_for_tokenizer: + if not model_id_for_tokenizer and not kwargs.get('train_from_scratch'): raise ValueError("Se necesita un ID de modelo base para cargar el tokenizer para la vista previa.") - tokenizer_id = kwargs.get('tokenizer_name') or model_id_for_tokenizer + + dataset, processed_kwargs = _get_data_processing_pipeline(**kwargs) + text_col = processed_kwargs.get('text_col') + + if kwargs.get('train_from_scratch'): + tokenizer_id = SCRATCH_TOKENIZER_MAP.get(kwargs.get('scratch_architecture'), 'gpt2') + else: + tokenizer_id = kwargs.get('tokenizer_name_input') or model_id_for_tokenizer + tokenizer = AutoTokenizer.from_pretrained( tokenizer_id, trust_remote_code=True, use_fast=False ) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token if kwargs.get('chat_template_jinja', '').strip(): tokenizer.chat_template = kwargs['chat_template_jinja'] + preview_samples = [] for i, example in enumerate(islice(dataset, 5)): formatted_text = "" @@ -1507,61 +1548,59 @@ def gradio_preview_data_wrapper(*args): else: formatted_text = str(example) preview_samples.append(f"--- MUESTRA {i+1} ---\n{formatted_text}\n") + preview_text = "\n".join(preview_samples) if not preview_samples: preview_text = "No se pudieron generar muestras. Revisa la configuración del dataset, los filtros y el formato." 
yield preview_text except Exception as e: yield f"Error al generar la vista previa: {e}\n{traceback.format_exc()}" - -@spaces.GPU() def toggle_training_mode_ui(is_scratch): return ( - gr.update(visible=not is_scratch), - gr.update(visible=not is_scratch), - gr.update(visible=not is_scratch), - gr.update(visible=not is_scratch), - gr.update(visible=is_scratch), - gr.update(visible=is_scratch) + gr.update(visible=not is_scratch), + gr.update(visible=not is_scratch), + gr.update(visible=not is_scratch), + gr.update(visible=not is_scratch), + gr.update(visible=is_scratch), + gr.update(visible=is_scratch), + gr.update(visible=is_scratch), + gr.update(visible=is_scratch), + gr.update(visible=is_scratch), + gr.update(visible=is_scratch), + gr.update(visible=is_scratch), + gr.update(visible=is_scratch), + gr.update(visible=is_scratch), + gr.update(visible=is_scratch), + gr.update(visible=is_scratch), + gr.update(visible=is_scratch), + gr.update(visible=is_scratch), + gr.update(visible=is_scratch), ) - -@spaces.GPU() def toggle_task_specific_ui(training_mode): is_classification = "Classification" in training_mode is_dpo = "DPO" in training_mode is_sft = "Causal" in training_mode is_ner = "Token Classification" in training_mode - is_diffusion = training_mode in ["Text-to-Image (LoRA)", "DreamBooth LoRA (Text-to-Image)"] - is_streaming = not is_diffusion + is_diffusion = "Image Generation" in training_mode return ( gr.update(visible=is_classification or is_ner), gr.update(visible=is_dpo), gr.update(visible=is_sft), gr.update(visible=is_diffusion), - gr.update(visible=training_mode == "DreamBooth LoRA (Text-to-Image)"), - gr.update(visible=not is_diffusion), - gr.update(visible=is_diffusion), - gr.update(visible=is_streaming), - gr.update(visible=not is_streaming), + gr.update(visible=not is_diffusion) ) - -@spaces.GPU() def toggle_sft_format_ui(format_style): is_tool = format_style == "Razonamiento/Herramientas" return gr.update(visible=is_tool) - -@spaces.GPU() def toggle_auto_modules_ui(is_auto): return gr.update(visible=not is_auto) - -@spaces.GPU() def toggle_dataset_creator_ui(choice): is_synth = choice == "Sintético" return gr.update(visible=is_synth), gr.update(visible=not is_synth) with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo: gr.Markdown("# 🚀 AutoTrain-Advanced: Tu Plataforma de Entrenamiento de Modelos") - gr.Markdown("### Una interfaz completa para fine-tuning, PEFT (LoRA, QLoRA), y despliegue de modelos en Hugging Face.") + gr.Markdown("### Una interfaz completa para fine-tuning y PEFT (LoRA).") with gr.Tab("1. Autenticación"): gr.Markdown("#### Conecta tu cuenta de Hugging Face para guardar y cargar modelos.") @@ -1603,9 +1642,22 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo: model_base_input = gr.Textbox(label="ID del Modelo Base", placeholder="p.ej. 'mistralai/Mistral-7B-v0.1'") tokenizer_name_input = gr.Textbox(label="ID del Tokenizer (opcional)", placeholder="p.ej. si el modelo no tiene tokenizer") repo_name_input = gr.Textbox(label="Nombre del Repositorio de Destino", placeholder="p.ej. 'mi-modelo-afinado'") + private_repo = gr.Checkbox(label="Repositorio Privado", value=False) train_from_scratch = gr.Checkbox(label="Entrenar desde Cero", value=False) auto_config_scratch = gr.Checkbox(label="Auto-Configuración", value=True, visible=False) + manual_config_scratch = gr.Checkbox(label="Configuración Manual", value=False, visible=False) scratch_architecture = gr.Textbox(label="Arquitectura (p.ej. 
Llama, Mistral)", value="Llama", visible=False) + scratch_vocab_size = gr.Number(label="Tamaño de Vocabulario", value=32000, visible=False) + scratch_hidden_size = gr.Number(label="Tamaño Oculto", value=1024, visible=False) + scratch_intermediate_size = gr.Number(label="Tamaño Intermedio", value=2048, visible=False) + scratch_layers = gr.Number(label="Número de Capas", value=8, visible=False) + scratch_heads = gr.Number(label="Cabezas de Atención", value=8, visible=False) + scratch_kv_heads = gr.Number(label="Cabezas KV", value=8, visible=False) + scratch_block_size = gr.Number(label="Tamaño de Bloque", value=1024, visible=False) + scratch_tie_word_embeddings = gr.Checkbox(label="Enlazar Embeddings de Palabras", value=False, visible=False) + steps_per_epoch_estimate = gr.Number(label="Estimación de Pasos por Época (para auto-config)", value=1000, visible=False) + attention_dropout = gr.Slider(0.0, 0.5, 0.0, label="Dropout de Atención", visible=False) + hidden_dropout = gr.Slider(0.0, 0.5, 0.0, label="Dropout Oculto", visible=False) with gr.Accordion("🔄 Fusión de Múltiples Adaptadores (Avanzado)", open=False) as multi_adapter_accordion: enable_multi_adapter_merge = gr.Checkbox(label="Habilitar Fusión Múltiple", value=False) multi_adapter_model_ids = gr.Textbox(label="IDs de Adaptadores (csv)", placeholder="org/adapter1,org/adapter2") @@ -1625,14 +1677,10 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo: gradient_accumulation = gr.Textbox(label="Acumulación de Gradiente", value="8") with gr.Row(): block_size = gr.Textbox(label="Longitud de Secuencia", value="1024") - with gr.Group(visible=True) as max_steps_ui: - max_steps = gr.Textbox(label="Máximos Pasos de Entrenamiento", value="100") - with gr.Group(visible=False) as epochs_ui: - epochs = gr.Textbox(label="Épocas", value="1") + max_steps = gr.Textbox(label="Máximos Pasos de Entrenamiento", value="100") with gr.Row(): optimizer = gr.Dropdown(["adamw_torch", "adafactor", "sgd", "adagrad"], label="Optimizador", value="adamw_torch") scheduler = gr.Dropdown(["cosine", "linear", "constant"], label="Planificador LR", value="cosine") - mixed_precision = gr.Radio(["no", "fp16", "bf16"], label="Precisión Mixta", value="no") with gr.Accordion("Avanzados", open=False): warmup_ratio = gr.Slider(0.0, 0.5, 0.03, label="Ratio de Calentamiento") weight_decay = gr.Textbox(label="Decaimiento de Peso", value="0.01") @@ -1646,14 +1694,11 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo: adam_beta1 = gr.Textbox(label="Adam Beta1", value="0.9") adam_beta2 = gr.Textbox(label="Adam Beta2", value="0.999") adam_epsilon = gr.Textbox(label="Adam Epsilon", value="1e-8") - disable_gradient_checkpointing = gr.Checkbox(label="Deshabilitar Gradient Checkpointing", value=False) group_by_length = gr.Checkbox(label="Agrupar por Longitud", value=False) neftune_noise_alpha = gr.Textbox(label="NEFTune Ruido Alfa (0 para desactivar)", value="0") optim_args = gr.Textbox(label="Argumentos del Optimizador (formato dict)", placeholder="ej: betas=(0.9,0.995)") - attn_implementation = gr.Dropdown(["eager", "flash_attention_2"], label="Implementación de Atención", value="eager") - with gr.Accordion("🦋 PEFT (LoRA / QLoRA)", open=True) as peft_accordion: + with gr.Accordion("🦋 PEFT (LoRA)", open=True) as peft_accordion: peft = gr.Checkbox(label="Habilitar PEFT/LoRA", value=True) - quantization = gr.Dropdown(["no", "4bit", "8bit"], label="Cuantización", value="no") with gr.Row(): lora_r = gr.Textbox(label="LoRA r", value="16") lora_alpha = 
gr.Textbox(label="LoRA alpha", value="32") @@ -1677,6 +1722,23 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo: max_len_input = gr.Slider(100, 5000, 2000, label="Longitud Máxima (palabras)") rep_threshold_input = gr.Slider(0, 1, 0.2, label="Umbral de Repetición") exclude_keywords_input = gr.Textbox(label="Palabras Clave a Excluir (csv)") + bias_keywords_input = gr.Textbox(label="Palabras Clave de Sesgo (csv)", placeholder="p.ej. discriminación,prejuicio") + enable_language_filter = gr.Checkbox(label="Habilitar Filtro de Idioma", value=False) + allowed_languages = gr.Textbox(label="Idiomas Permitidos (csv)", value="es,en", placeholder="es,en") + language_detection_threshold = gr.Slider(0.5, 1.0, 0.95, label="Umbral de Detección de Idioma") + enable_toxicity_filter = gr.Checkbox(label="Habilitar Filtro de Toxicidad", value=False) + toxicity_threshold = gr.Slider(0.5, 1.0, 0.8, label="Umbral de Toxicidad") + enable_coherence_filter = gr.Checkbox(label="Habilitar Filtro de Coherencia (Anti-Gibberish)", value=True) + coherence_char_repetition_threshold = gr.Slider(0.1, 0.8, 0.4, label="Umbral de Repetición de Caracteres", info="Máximo ratio de caracteres repetidos permitido") + coherence_ngram_repetition_threshold = gr.Slider(0.1, 0.8, 0.3, label="Umbral de Repetición de N-gramas", info="Máximo ratio de patrones repetidos permitido") + coherence_entropy_threshold = gr.Slider(0.1, 0.9, 0.5, label="Umbral de Entropía", info="Mínima entropía normalizada requerida") + enable_readability_filter = gr.Checkbox(label="Habilitar Filtro de Legibilidad", value=False) + min_readability = gr.Slider(0, 100, 30, label="Legibilidad Mínima (Flesch)") + max_readability = gr.Slider(0, 100, 100, label="Legibilidad Máxima (Flesch)") + enable_stopword_filter = gr.Checkbox(label="Habilitar Filtro de Palabras Vacías", value=False) + max_stopword_ratio = gr.Slider(0.0, 1.0, 0.5, label="Ratio Máxima de Palabras Vacías") + enable_uniqueness_filter = gr.Checkbox(label="Habilitar Filtro de Unicidad", value=False) + min_uniqueness_ratio = gr.Slider(0.0, 1.0, 0.3, label="Ratio Mínima de Unicidad") with gr.Tab("Deduplicación"): deduplication_method = gr.Radio(["Ninguna", "Exacta", "Semántica (MinHash)"], label="Método de Deduplicación", value="Ninguna") minhash_threshold = gr.Slider(0.7, 0.99, 0.85, label="Umbral MinHash") @@ -1685,10 +1747,12 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo: enable_back_translation = gr.Checkbox(label="Habilitar Retrotraducción", value=False) bt_model_id = gr.Textbox(label="Modelo de Traducción", value="Helsinki-NLP/opus-mt-en-de") bt_reverse_model_id = gr.Textbox(label="Modelo Inverso", value="Helsinki-NLP/opus-mt-de-en") + bt_augmentation_ratio = gr.Slider(0.0, 1.0, 0.1, label="Ratio de Aumentación BT") with gr.Tab("Generación Sintética"): enable_synthetic_data = gr.Checkbox(label="Habilitar Datos Sintéticos", value=False) synthetic_model_id = gr.Textbox(label="ID del Modelo Generador", placeholder="p.ej. 
+        gr.Markdown("#### 2. DATA PROCESSING & AUGMENTATION")
+        gr.Markdown("""
+* Streaming Datasets: Uses the `datasets` streaming mode to handle very large datasets without loading them fully into RAM.
+* Data Cleaning: Removes HTML tags, normalizes whitespace, redacts PII, and strips URLs/emails.
+* Advanced Filtering: Includes optional filters for text length, word repetition, language detection, and basic toxicity detection (via `unitary/toxic-bert`).
+* Data Augmentation: Supports **Back-Translation (BT)** for introducing paraphrasing variations and **Counterfactual Data Augmentation (CDA)** for controlled bias testing (e.g., swapping gendered pronouns).
+* Synthetic Data Generation: Uses a specified LLM to generate new training examples from a prompt template.
+* Deduplication: Implements both **Exact** and **Semantic (MinHash LSH)** deduplication to prevent data contamination during iterative fine-tuning; see the sketch below.
+""")
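+        gr.Markdown("""
+A sketch of semantic deduplication with MinHash LSH. It assumes the `datasketch` package; the texts and the 0.85 threshold are illustrative, and MinHash is probabilistic, so the result is hedged:
+```python
+from datasketch import MinHash, MinHashLSH
+
+def minhash_of(text, num_perm=128):
+    m = MinHash(num_perm=num_perm)
+    for token in set(text.lower().split()):  # shingle by unique words
+        m.update(token.encode("utf-8"))
+    return m
+
+docs = {
+    "a": "the quick brown fox jumps over the lazy dog",
+    "b": "so the quick brown fox jumps over the lazy dog",  # near-duplicate of "a"
+    "c": "completely unrelated sentence about training models",
+}
+lsh = MinHashLSH(threshold=0.85, num_perm=128)
+kept = []
+for key, text in docs.items():
+    m = minhash_of(text)
+    if not lsh.query(m):  # keep only if no near-duplicate was kept before
+        lsh.insert(key, m)
+        kept.append(key)
+print(kept)  # likely ['a', 'c']: 'b' collides with 'a' above the threshold
+```
+""")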
"scratch_block_size": scratch_block_size, "scratch_tie_word_embeddings": scratch_tie_word_embeddings, + "steps_per_epoch_estimate": steps_per_epoch_estimate, "attention_dropout": attention_dropout, + "hidden_dropout": hidden_dropout, "enable_multi_adapter_merge": enable_multi_adapter_merge, "multi_adapter_model_ids": multi_adapter_model_ids, "multi_adapter_weights": multi_adapter_weights, "multi_adapter_combination_type": multi_adapter_combination_type, "datasets_hf_text": datasets_hf_text, "uploads": uploads, "dataset_weights": dataset_weights, "eval_dataset_hf": eval_dataset_hf, - "learning_rate": learning_rate, "epochs": epochs, "max_steps": max_steps, "batch_size": batch_size, "gradient_accumulation": gradient_accumulation, + "learning_rate": learning_rate, "max_steps": max_steps, "batch_size": batch_size, "gradient_accumulation": gradient_accumulation, "block_size": block_size, "optimizer": optimizer, "scheduler": scheduler, - "mixed_precision": mixed_precision, "warmup_ratio": warmup_ratio, "weight_decay": weight_decay, "max_grad_norm": max_grad_norm, + "warmup_ratio": warmup_ratio, "weight_decay": weight_decay, "max_grad_norm": max_grad_norm, "logging_steps": logging_steps, "save_steps": save_steps, "save_total_limit": save_total_limit, "resume_from_checkpoint": resume_from_checkpoint, "adam_beta1": adam_beta1, "adam_beta2": adam_beta2, "adam_epsilon": adam_epsilon, - "disable_gradient_checkpointing": disable_gradient_checkpointing, "group_by_length": group_by_length, - "neftune_noise_alpha": neftune_noise_alpha, "optim_args": optim_args, "attn_implementation": attn_implementation, + "group_by_length": group_by_length, + "neftune_noise_alpha": neftune_noise_alpha, "optim_args": optim_args, "early_stopping_patience": early_stopping_patience, - "peft": peft, "quantization": quantization, "lora_r": lora_r, "lora_alpha": lora_alpha, + "peft": peft, "lora_r": lora_r, "lora_alpha": lora_alpha, "lora_dropout": lora_dropout, "auto_find_target_modules": auto_find_target_modules, "target_modules": target_modules, "modules_to_save": modules_to_save, "use_dora": use_dora, "use_rslora": use_rslora, "init_lora_weights": init_lora_weights, "remove_html_tags": remove_html_tags, "normalize_whitespace": normalize_whitespace, "remove_urls_emails": remove_urls_emails, "redact_pii": redact_pii, "enable_quality_filter": enable_quality_filter, "min_len_input": min_len_input, "max_len_input": max_len_input, "rep_threshold_input": rep_threshold_input, "exclude_keywords_input": exclude_keywords_input, + "bias_keywords_input": bias_keywords_input, "enable_language_filter": enable_language_filter, + "allowed_languages": allowed_languages, "language_detection_threshold": language_detection_threshold, + "enable_toxicity_filter": enable_toxicity_filter, "toxicity_threshold": toxicity_threshold, + "enable_coherence_filter": enable_coherence_filter, "coherence_char_repetition_threshold": coherence_char_repetition_threshold, + "coherence_ngram_repetition_threshold": coherence_ngram_repetition_threshold, "coherence_entropy_threshold": coherence_entropy_threshold, + "enable_readability_filter": enable_readability_filter, "min_readability": min_readability, "max_readability": max_readability, + "enable_stopword_filter": enable_stopword_filter, "max_stopword_ratio": max_stopword_ratio, + "enable_uniqueness_filter": enable_uniqueness_filter, "min_uniqueness_ratio": min_uniqueness_ratio, "deduplication_method": deduplication_method, "minhash_threshold": minhash_threshold, "minhash_num_perm": minhash_num_perm, 
"enable_cda": enable_cda, "cda_json_config": cda_json_config, "enable_back_translation": enable_back_translation, "bt_model_id": bt_model_id, - "bt_reverse_model_id": bt_reverse_model_id, "enable_synthetic_data": enable_synthetic_data, + "bt_reverse_model_id": bt_reverse_model_id, "bt_augmentation_ratio": bt_augmentation_ratio, + "enable_synthetic_data": enable_synthetic_data, "synthetic_model_id": synthetic_model_id, "num_synthetic_samples": num_synthetic_samples, + "synthetic_prompt_template": synthetic_prompt_template, "sft_format_style": sft_format_style, "chat_template_jinja": chat_template_jinja, "enable_cot_input": enable_cot_input, "enable_tool_use_input": enable_tool_use_input, "prompt_col_input": prompt_col_input, "response_col_input": response_col_input, "reasoning_col_input": reasoning_col_input, "tool_use_col_input": tool_use_col_input, "dpo_prompt_col_input": dpo_prompt_col_input, "dpo_chosen_col_input": dpo_chosen_col_input, "dpo_rejected_col_input": dpo_rejected_col_input, "classification_labels": classification_labels, - "diffusion_resolution": diffusion_resolution, "run_evaluation": run_evaluation, "run_perplexity_evaluation": run_perplexity_evaluation, + "run_evaluation": run_evaluation, "metric_for_best_model": metric_for_best_model, + "greater_is_better": greater_is_better, "run_perplexity_evaluation": run_perplexity_evaluation, "enable_loss_reweighting": enable_loss_reweighting, "reweighting_terms": reweighting_terms, "reweighting_factor": reweighting_factor, - "wandb_api_key_input": wandb_api_key_input, "wandb_project_input": wandb_project_input, - "dreambooth_instance_prompt": dreambooth_instance_prompt, - "dreambooth_train_text_encoder": dreambooth_train_text_encoder + "hub_strategy": hub_strategy, "wandb_api_key_input": wandb_api_key_input, "wandb_project_input": wandb_project_input, } all_input_components_list = list(all_input_components_dict.values()) all_output_components = [training_logs, training_phase, repo_link_output, final_eval_results, start_training_button, stop_training_button] @@ -1779,12 +1858,16 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo: train_from_scratch.change( toggle_training_mode_ui, inputs=[train_from_scratch], - outputs=[model_base_input, tokenizer_name_input, multi_adapter_accordion, peft_accordion, auto_config_scratch, scratch_architecture] + outputs=[model_base_input, tokenizer_name_input, multi_adapter_accordion, peft_accordion, + auto_config_scratch, scratch_architecture, manual_config_scratch, scratch_vocab_size, + scratch_hidden_size, scratch_intermediate_size, scratch_layers, scratch_heads, + scratch_kv_heads, scratch_block_size, scratch_tie_word_embeddings, + steps_per_epoch_estimate, attention_dropout, hidden_dropout] ) training_mode.change( toggle_task_specific_ui, inputs=[training_mode], - outputs=[classification_labels_ui, dpo_ui, sft_ui, diffusion_ui, dreambooth_ui, peft_accordion, epochs_ui, max_steps_ui, peft_accordion] + outputs=[classification_labels_ui, dpo_ui, sft_ui, diffusion_ui, peft_accordion] ) sft_format_style.change( toggle_sft_format_ui, @@ -1833,6 +1916,49 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo: inputs=[inf_task_mode, inf_model_id, inf_text_in, inf_context_in, inf_image_in, inf_audio_in, inf_temperature, inf_top_p, inf_max_new_tokens], outputs=[inf_text_out, inf_model_id, inf_text_in, inf_context_in, inf_image_in, inf_audio_in] ) + with gr.Tab("5. 
+        gr.Markdown("#### 5. OUTPUT & DEPLOYMENT")
+        gr.Markdown("""
+* Hugging Face Hub Integration: All trained artifacts (full model or LoRA adapter) are automatically pushed to the specified repository on the HF Hub using the provided token; see the sketch below.
+* Model Card Generation: Automatically generates a `README.md` detailing training parameters and model provenance.
+* Inference Tab: A separate UI for easily testing the trained model with various inputs and generation parameters.
+""")
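+        gr.Markdown("""
+A sketch of the Hub push step; the token, repo name, and folder are placeholders:
+```python
+from huggingface_hub import login, create_repo, upload_folder
+
+login(token="hf_...")  # placeholder token
+repo_id = create_repo("mi-usuario/mi-modelo-afinado", exist_ok=True).repo_id
+upload_folder(
+    folder_path="./final_model",  # directory written by save_pretrained(...)
+    repo_id=repo_id,
+    commit_message="Entrenamiento completado",
+)
+```
+""")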
OUTPUT & DEPLOYMENT") + gr.Markdown(""" +* Hugging Face Hub Integration: All trained artifacts (full model/LoRA adapter) are automatically pushed to a specified repository on the HF Hub using the provided token. +* Model Card Generation: Automatically generates a `README.md` detailing training parameters and model provenance. +* Inference Tab: A separate UI for easily testing the trained model with various inputs and generation parameters. +""") if __name__ == "__main__": - demo.launch(debug=True, share=True) \ No newline at end of file + demo.queue().launch(debug=True, share=True) \ No newline at end of file