Train_xd

Runtime error

App Files Files Community

Ignaciohhhhggfgjfrffd committed 16 days ago

Commit

2879d3c

verified ·

1 Parent(s): 3c40d28

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -2

app.py CHANGED Viewed

@@ -1,9 +1,8 @@
 import os
-#os.system("pip install spaces-0.1.0-py3-none-any.whl")
 os.system("pip install -U gradio")
 os.system("pip install -U bitsandbytes diffusers torchaudio torchvision torch transformers peft accelerate trl datasets")
 os.system("pip install gradio_huggingfacehub_search packaging torchao llmcompressor")
-os.system("pip install spaces-0.1.0-py3-none-any.whl")
 import io
 import json
@@ -172,6 +171,7 @@ MAP_QUANT_TYPE_TO_CONFIG = {
 _tox_pipe_singleton = None
 class DebiasingSFTTrainer(SFTTrainer):
     def __init__(self, *args, reweighting_terms=None, reweighting_factor=1.0, **kwargs):
         super().__init__(*args, **kwargs)
@@ -189,6 +189,7 @@ class DebiasingSFTTrainer(SFTTrainer):
                     break
         return (loss, outputs) if return_outputs else loss
 class DeduplicatedIterableDataset(IterableDataset):
     def __init__(self, dataset, text_col, method, threshold=0.85, num_perm=128):
         super().__init__(ex_iterable=iter([]))
@@ -235,6 +236,7 @@ class DeduplicatedIterableDataset(IterableDataset):
             else:
                 yield example
 def hf_login(token):
     if not token:
         return "Por favor, introduce un token."
@@ -245,6 +247,7 @@ def hf_login(token):
     except Exception as e:
         return f"❌ Error en la conexión: {e}"
 def _clean_text(example, text_col, **kwargs):
     text = example.get(text_col, "")
     if not isinstance(text, str):
@@ -262,6 +265,7 @@ def _clean_text(example, text_col, **kwargs):
     example[text_col] = text
     return example
 def _apply_quality_filters(example, text_col, min_len, max_len, rep_threshold, exclude_keywords):
     text = example.get(text_col, "")
     if not isinstance(text, str): return False
@@ -275,6 +279,7 @@ def _apply_quality_filters(example, text_col, min_len, max_len, rep_threshold, e
     lower_text = text.lower()
     return not any(keyword in lower_text for keyword in exclude_keywords)
 def _apply_coherence_filter(example, text_col, char_rep_threshold, ngram_rep_threshold, entropy_threshold):
     text = example.get(text_col, "")
     if not isinstance(text, str) or not text:
@@ -341,6 +346,7 @@ def _apply_coherence_filter(example, text_col, char_rep_threshold, ngram_rep_thr
         return False
     return True
 def _get_filter_functions(**kwargs):
     filters = []
     if kwargs.get('enable_quality_filter'):
@@ -401,6 +407,7 @@ def _get_filter_functions(**kwargs):
         filters.append(stats_filter)
     return filters
 def _load_hf_streaming(ids, split="train", probabilities=None):
     streams = []
     valid_ids = []
@@ -430,6 +437,7 @@ def _load_hf_streaming(ids, split="train", probabilities=None):
         probabilities = None
     return interleave_datasets(streams, probabilities=probabilities)
 def _load_uploaded_stream(files):
     all_rows = []
     for f in files or []:
@@ -451,6 +459,7 @@ def _load_uploaded_stream(files):
     random.shuffle(all_rows)
     return {"train": all_rows[:-val_size] if val_size > 0 else all_rows, "validation": all_rows[-val_size:] if val_size > 0 else []}
 def _guess_columns(sample):
     text_col, image_col, audio_col, label_col = "text", "image", "audio", "label"
     if not isinstance(sample, dict):
@@ -467,6 +476,7 @@ def _guess_columns(sample):
     elif "labels" in keys: label_col = keys["labels"]
     return text_col, image_col, audio_col, label_col
 def _apply_cda(dataset, text_col, cda_config_str):
     try:
         swap_groups = json.loads(cda_config_str)
@@ -499,6 +509,7 @@ def _apply_cda(dataset, text_col, cda_config_str):
                 current_texts.update(next_texts)
     return IterableDataset.from_generator(cda_generator)
 def _apply_back_translation(dataset, text_col, ratio, model_id, reverse_model_id):
     if not ratio or ratio <= 0:
         return dataset
@@ -526,6 +537,7 @@ def _apply_back_translation(dataset, text_col, ratio, model_id, reverse_model_id
                         logger.warning(f"Error en retrotraducción: {e}")
     return IterableDataset.from_generator(bt_generator)
 def _generate_synthetic_data(original_dataset, text_col, model_id, num_samples, prompt_template):
     if not num_samples or num_samples <= 0:
         return None
@@ -576,6 +588,7 @@ def _calculate_auto_config(block_size, is_gpt2_like, steps_per_epoch_estimate, b
     kv_heads = heads if is_gpt2_like else (max(1, heads // 4))
     return vocab_size, hidden_size, hidden_size * 2, layers, heads, safe_block_size, False, kv_heads
 def _get_eval_dataset(train_ds_id, eval_ds_id, uploaded_val_data, update_logs_fn):
     if eval_ds_id:
         yield update_logs_fn(f"Cargando dataset de evaluación: {eval_ds_id}", "Evaluación")
@@ -647,6 +660,7 @@ def _create_training_args(output_dir, repo_id, **kwargs):
         raise ValueError("Para datasets en streaming se requiere un valor positivo para 'Máximos Pasos de Entrenamiento'.")
     return TrainingArguments(**args_dict)
 def _generic_model_loader(model_name_or_path, model_class, **kwargs):
     config_kwargs = {"trust_remote_code": True}
     if kwargs.get('label2id'):
@@ -664,6 +678,7 @@ def _generic_model_loader(model_name_or_path, model_class, **kwargs):
     model = model_class.from_pretrained(model_name_or_path, **model_kwargs)
     return model
 def _find_all_linear_names(model):
     cls = torch.nn.Linear
     lora_module_names = set()
@@ -676,6 +691,7 @@ def _find_all_linear_names(model):
     common_targets = {'q_proj', 'v_proj', 'k_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'}
     return list(lora_module_names.intersection(common_targets)) or list(lora_module_names)
 def _sft_formatting_func(example, text_col, tokenizer, **kwargs):
     if kwargs.get('sft_format_style') == "Conversacional":
         conv_col = ""
@@ -709,9 +725,11 @@ def _sft_formatting_func(example, text_col, tokenizer, **kwargs):
         return ""
     return example.get(text_col, "")
 def _dpo_formatting_func(example, **kwargs):
     """Map one dataset example to a ``prompt``/``chosen``/``rejected`` dict.

     The source column for each output field is taken from ``kwargs``:
     ``prompt_col_input`` (default ``'prompt'``), ``dpo_chosen_col_input``
     (default ``'chosen'``) and ``dpo_rejected_col_input`` (default
     ``'rejected'``). A column missing from ``example`` yields ``""``.

     NOTE(review): a kwarg that is present but ``None``/empty is used as the
     lookup key as-is (the default only applies when the kwarg is absent), so
     the field silently becomes ``""`` — confirm callers never pass blanks.
     """
     return {"prompt": example.get(kwargs.get('prompt_col_input', 'prompt'), ""), "chosen": example.get(kwargs.get('dpo_chosen_col_input', 'chosen'), ""), "rejected": example.get(kwargs.get('dpo_rejected_col_input', 'rejected'), "")}
 def _evaluate_perplexity(model, tokenizer, eval_dataset, text_col):
     model.eval()
     encodings = tokenizer("\n\n".join(ex[text_col] for ex in islice(eval_dataset, 1000)), return_tensors="pt")
@@ -736,6 +754,7 @@ def _evaluate_perplexity(model, tokenizer, eval_dataset, text_col):
     ppl = torch.exp(torch.stack(nlls).mean())
     return ppl.item()
 def _merge_multiple_loras(base_model_id, adapter_ids_str, weights_str, combination_type):
     adapter_ids = [s.strip() for s in adapter_ids_str.split(',') if s.strip()]
     if not adapter_ids:
@@ -767,6 +786,7 @@ def _merge_multiple_loras(base_model_id, adapter_ids_str, weights_str, combinati
     yield f"Fusión de adaptadores completada. El entrenamiento continuará con el modelo fusionado en {temp_dir}."
     return temp_dir
 def _run_trainer_and_upload(trainer, tokenizer, repo_id, update_logs_fn, model_card_content, **kwargs):
     yield update_logs_fn("Iniciando ciclo de entrenamiento...", "Entrenando")
     trainer.train(resume_from_checkpoint=kwargs.get('resume_from_checkpoint') or False)
@@ -787,6 +807,7 @@ def _run_trainer_and_upload(trainer, tokenizer, repo_id, update_logs_fn, model_c
     upload_folder(folder_path=output_dir, repo_id=repo_id, commit_message="Fin de entrenamiento")
     return output_dir, final_metrics
 def train_sft_dpo(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
     output_dir = tempfile.mkdtemp()
     is_dpo = kwargs.get('training_mode') == "DPO (Direct Preference Optimization)"
@@ -840,6 +861,7 @@ def train_sft_dpo(model_name, train_dataset, repo_id, update_logs_fn, model_card
     except Exception as e:
         raise Exception(f"Error en {'DPO' if is_dpo else 'SFT'}: {e}\n{traceback.format_exc()}")
 def train_sequence_classification(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
     output_dir = tempfile.mkdtemp()
     try:
@@ -882,6 +904,7 @@ def train_sequence_classification(model_name, train_dataset, repo_id, update_log
     except Exception as e:
         raise Exception(f"Error en Sequence Classification: {e}\n{traceback.format_exc()}")
 def train_token_classification(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
     output_dir = tempfile.mkdtemp()
     try:
@@ -939,6 +962,7 @@ def train_token_classification(model_name, train_dataset, repo_id, update_logs_f
     except Exception as e:
         raise Exception(f"Error en Token Classification: {e}\n{traceback.format_exc()}")
 def train_question_answering(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
     output_dir = tempfile.mkdtemp()
     try:
@@ -1016,6 +1040,7 @@ def train_question_answering(model_name, train_dataset, repo_id, update_logs_fn,
     except Exception as e:
         raise Exception(f"Error en Question Answering: {e}\n{traceback.format_exc()}")
 def train_seq2seq(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
     output_dir = tempfile.mkdtemp()
     try:
@@ -1067,6 +1092,7 @@ def train_seq2seq(model_name, train_dataset, repo_id, update_logs_fn, model_card
     except Exception as e:
         raise Exception(f"Error en Seq2Seq: {e}\n{traceback.format_exc()}")
 def train_text_to_image(model_name, train_dataset, repo_id, update_logs, model_card_content, **kwargs):
     output_dir = tempfile.mkdtemp()
     try:
@@ -1218,6 +1244,7 @@ def train_text_to_image(model_name, train_dataset, repo_id, update_logs, model_c
         yield update_logs(f"❌ Error en entrenamiento Text-to-Image: {str(e)}", "Error")
         raise Exception(f"Error en Text-to-Image: {e}\n{traceback.format_exc()}")
 def _get_data_processing_pipeline(**kwargs):
     hf_ids = [x.strip() for x in (kwargs.get('datasets_hf_text') or "").split(",") if x.strip()]
     if not hf_ids and not kwargs.get('uploads'):
@@ -1489,6 +1516,7 @@ def update_inference_ui(task_mode):
         gr.update(visible=is_text_gen)
     )
 def create_and_upload_dataset(hf_token, repo_name, creation_type, synth_model, synth_prompt, synth_num_samples, file_uploads, progress=gr.Progress()):
     if not hf_token:
         return "Error: Se requiere un token de Hugging Face.", ""

 import os
 os.system("pip install -U gradio")
 os.system("pip install -U bitsandbytes diffusers torchaudio torchvision torch transformers peft accelerate trl datasets")
+#os.system("pip install spaces")
 os.system("pip install gradio_huggingfacehub_search packaging torchao llmcompressor")
 import io
 import json
 _tox_pipe_singleton = None
+@spaces.GPU
 class DebiasingSFTTrainer(SFTTrainer):
     def __init__(self, *args, reweighting_terms=None, reweighting_factor=1.0, **kwargs):
         super().__init__(*args, **kwargs)
                     break
         return (loss, outputs) if return_outputs else loss
+@spaces.GPU
 class DeduplicatedIterableDataset(IterableDataset):
     def __init__(self, dataset, text_col, method, threshold=0.85, num_perm=128):
         super().__init__(ex_iterable=iter([]))
             else:
                 yield example
+@spaces.GPU
 def hf_login(token):
     if not token:
         return "Por favor, introduce un token."
     except Exception as e:
         return f"❌ Error en la conexión: {e}"
+@spaces.GPU
 def _clean_text(example, text_col, **kwargs):
     text = example.get(text_col, "")
     if not isinstance(text, str):
     example[text_col] = text
     return example
+@spaces.GPU
 def _apply_quality_filters(example, text_col, min_len, max_len, rep_threshold, exclude_keywords):
     text = example.get(text_col, "")
     if not isinstance(text, str): return False
     lower_text = text.lower()
     return not any(keyword in lower_text for keyword in exclude_keywords)
+@spaces.GPU
 def _apply_coherence_filter(example, text_col, char_rep_threshold, ngram_rep_threshold, entropy_threshold):
     text = example.get(text_col, "")
     if not isinstance(text, str) or not text:
         return False
     return True
+@spaces.GPU
 def _get_filter_functions(**kwargs):
     filters = []
     if kwargs.get('enable_quality_filter'):
         filters.append(stats_filter)
     return filters
+@spaces.GPU
 def _load_hf_streaming(ids, split="train", probabilities=None):
     streams = []
     valid_ids = []
         probabilities = None
     return interleave_datasets(streams, probabilities=probabilities)
+@spaces.GPU
 def _load_uploaded_stream(files):
     all_rows = []
     for f in files or []:
     random.shuffle(all_rows)
     return {"train": all_rows[:-val_size] if val_size > 0 else all_rows, "validation": all_rows[-val_size:] if val_size > 0 else []}
+@spaces.GPU
 def _guess_columns(sample):
     text_col, image_col, audio_col, label_col = "text", "image", "audio", "label"
     if not isinstance(sample, dict):
     elif "labels" in keys: label_col = keys["labels"]
     return text_col, image_col, audio_col, label_col
+@spaces.GPU
 def _apply_cda(dataset, text_col, cda_config_str):
     try:
         swap_groups = json.loads(cda_config_str)
                 current_texts.update(next_texts)
     return IterableDataset.from_generator(cda_generator)
+@spaces.GPU
 def _apply_back_translation(dataset, text_col, ratio, model_id, reverse_model_id):
     if not ratio or ratio <= 0:
         return dataset
                         logger.warning(f"Error en retrotraducción: {e}")
     return IterableDataset.from_generator(bt_generator)
+@spaces.GPU
 def _generate_synthetic_data(original_dataset, text_col, model_id, num_samples, prompt_template):
     if not num_samples or num_samples <= 0:
         return None
     kv_heads = heads if is_gpt2_like else (max(1, heads // 4))
     return vocab_size, hidden_size, hidden_size * 2, layers, heads, safe_block_size, False, kv_heads
+@spaces.GPU
 def _get_eval_dataset(train_ds_id, eval_ds_id, uploaded_val_data, update_logs_fn):
     if eval_ds_id:
         yield update_logs_fn(f"Cargando dataset de evaluación: {eval_ds_id}", "Evaluación")
         raise ValueError("Para datasets en streaming se requiere un valor positivo para 'Máximos Pasos de Entrenamiento'.")
     return TrainingArguments(**args_dict)
+@spaces.GPU
 def _generic_model_loader(model_name_or_path, model_class, **kwargs):
     config_kwargs = {"trust_remote_code": True}
     if kwargs.get('label2id'):
     model = model_class.from_pretrained(model_name_or_path, **model_kwargs)
     return model
+@spaces.GPU
 def _find_all_linear_names(model):
     cls = torch.nn.Linear
     lora_module_names = set()
     common_targets = {'q_proj', 'v_proj', 'k_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'}
     return list(lora_module_names.intersection(common_targets)) or list(lora_module_names)
+@spaces.GPU
 def _sft_formatting_func(example, text_col, tokenizer, **kwargs):
     if kwargs.get('sft_format_style') == "Conversacional":
         conv_col = ""
         return ""
     return example.get(text_col, "")
+@spaces.GPU
 def _dpo_formatting_func(example, **kwargs):
     return {"prompt": example.get(kwargs.get('prompt_col_input', 'prompt'), ""), "chosen": example.get(kwargs.get('dpo_chosen_col_input', 'chosen'), ""), "rejected": example.get(kwargs.get('dpo_rejected_col_input', 'rejected'), "")}
+@spaces.GPU
 def _evaluate_perplexity(model, tokenizer, eval_dataset, text_col):
     model.eval()
     encodings = tokenizer("\n\n".join(ex[text_col] for ex in islice(eval_dataset, 1000)), return_tensors="pt")
     ppl = torch.exp(torch.stack(nlls).mean())
     return ppl.item()
+@spaces.GPU
 def _merge_multiple_loras(base_model_id, adapter_ids_str, weights_str, combination_type):
     adapter_ids = [s.strip() for s in adapter_ids_str.split(',') if s.strip()]
     if not adapter_ids:
     yield f"Fusión de adaptadores completada. El entrenamiento continuará con el modelo fusionado en {temp_dir}."
     return temp_dir
+@spaces.GPU
 def _run_trainer_and_upload(trainer, tokenizer, repo_id, update_logs_fn, model_card_content, **kwargs):
     yield update_logs_fn("Iniciando ciclo de entrenamiento...", "Entrenando")
     trainer.train(resume_from_checkpoint=kwargs.get('resume_from_checkpoint') or False)
     upload_folder(folder_path=output_dir, repo_id=repo_id, commit_message="Fin de entrenamiento")
     return output_dir, final_metrics
+@spaces.GPU
 def train_sft_dpo(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
     output_dir = tempfile.mkdtemp()
     is_dpo = kwargs.get('training_mode') == "DPO (Direct Preference Optimization)"
     except Exception as e:
         raise Exception(f"Error en {'DPO' if is_dpo else 'SFT'}: {e}\n{traceback.format_exc()}")
+@spaces.GPU
 def train_sequence_classification(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
     output_dir = tempfile.mkdtemp()
     try:
     except Exception as e:
         raise Exception(f"Error en Sequence Classification: {e}\n{traceback.format_exc()}")
+@spaces.GPU
 def train_token_classification(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
     output_dir = tempfile.mkdtemp()
     try:
     except Exception as e:
         raise Exception(f"Error en Token Classification: {e}\n{traceback.format_exc()}")
+@spaces.GPU
 def train_question_answering(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
     output_dir = tempfile.mkdtemp()
     try:
     except Exception as e:
         raise Exception(f"Error en Question Answering: {e}\n{traceback.format_exc()}")
+@spaces.GPU
 def train_seq2seq(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
     output_dir = tempfile.mkdtemp()
     try:
     except Exception as e:
         raise Exception(f"Error en Seq2Seq: {e}\n{traceback.format_exc()}")
+@spaces.GPU
 def train_text_to_image(model_name, train_dataset, repo_id, update_logs, model_card_content, **kwargs):
     output_dir = tempfile.mkdtemp()
     try:
         yield update_logs(f"❌ Error en entrenamiento Text-to-Image: {str(e)}", "Error")
         raise Exception(f"Error en Text-to-Image: {e}\n{traceback.format_exc()}")
+@spaces.GPU
 def _get_data_processing_pipeline(**kwargs):
     hf_ids = [x.strip() for x in (kwargs.get('datasets_hf_text') or "").split(",") if x.strip()]
     if not hf_ids and not kwargs.get('uploads'):
         gr.update(visible=is_text_gen)
     )
+@spaces.GPU
 def create_and_upload_dataset(hf_token, repo_name, creation_type, synth_model, synth_prompt, synth_num_samples, file_uploads, progress=gr.Progress()):
     if not hf_token:
         return "Error: Se requiere un token de Hugging Face.", ""