Ignaciohhhhggfgjfrffd committed on
Commit
2879d3c
·
verified ·
1 Parent(s): 3c40d28

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -2
app.py CHANGED
@@ -1,9 +1,8 @@
1
  import os
2
- #os.system("pip install spaces-0.1.0-py3-none-any.whl")
3
  os.system("pip install -U gradio")
4
  os.system("pip install -U bitsandbytes diffusers torchaudio torchvision torch transformers peft accelerate trl datasets")
 
5
  os.system("pip install gradio_huggingfacehub_search packaging torchao llmcompressor")
6
- os.system("pip install spaces-0.1.0-py3-none-any.whl")
7
 
8
  import io
9
  import json
@@ -172,6 +171,7 @@ MAP_QUANT_TYPE_TO_CONFIG = {
172
 
173
  _tox_pipe_singleton = None
174
 
 
175
  class DebiasingSFTTrainer(SFTTrainer):
176
  def __init__(self, *args, reweighting_terms=None, reweighting_factor=1.0, **kwargs):
177
  super().__init__(*args, **kwargs)
@@ -189,6 +189,7 @@ class DebiasingSFTTrainer(SFTTrainer):
189
  break
190
  return (loss, outputs) if return_outputs else loss
191
 
 
192
  class DeduplicatedIterableDataset(IterableDataset):
193
  def __init__(self, dataset, text_col, method, threshold=0.85, num_perm=128):
194
  super().__init__(ex_iterable=iter([]))
@@ -235,6 +236,7 @@ class DeduplicatedIterableDataset(IterableDataset):
235
  else:
236
  yield example
237
 
 
238
  def hf_login(token):
239
  if not token:
240
  return "Por favor, introduce un token."
@@ -245,6 +247,7 @@ def hf_login(token):
245
  except Exception as e:
246
  return f"❌ Error en la conexión: {e}"
247
 
 
248
  def _clean_text(example, text_col, **kwargs):
249
  text = example.get(text_col, "")
250
  if not isinstance(text, str):
@@ -262,6 +265,7 @@ def _clean_text(example, text_col, **kwargs):
262
  example[text_col] = text
263
  return example
264
 
 
265
  def _apply_quality_filters(example, text_col, min_len, max_len, rep_threshold, exclude_keywords):
266
  text = example.get(text_col, "")
267
  if not isinstance(text, str): return False
@@ -275,6 +279,7 @@ def _apply_quality_filters(example, text_col, min_len, max_len, rep_threshold, e
275
  lower_text = text.lower()
276
  return not any(keyword in lower_text for keyword in exclude_keywords)
277
 
 
278
  def _apply_coherence_filter(example, text_col, char_rep_threshold, ngram_rep_threshold, entropy_threshold):
279
  text = example.get(text_col, "")
280
  if not isinstance(text, str) or not text:
@@ -341,6 +346,7 @@ def _apply_coherence_filter(example, text_col, char_rep_threshold, ngram_rep_thr
341
  return False
342
  return True
343
 
 
344
  def _get_filter_functions(**kwargs):
345
  filters = []
346
  if kwargs.get('enable_quality_filter'):
@@ -401,6 +407,7 @@ def _get_filter_functions(**kwargs):
401
  filters.append(stats_filter)
402
  return filters
403
 
 
404
  def _load_hf_streaming(ids, split="train", probabilities=None):
405
  streams = []
406
  valid_ids = []
@@ -430,6 +437,7 @@ def _load_hf_streaming(ids, split="train", probabilities=None):
430
  probabilities = None
431
  return interleave_datasets(streams, probabilities=probabilities)
432
 
 
433
  def _load_uploaded_stream(files):
434
  all_rows = []
435
  for f in files or []:
@@ -451,6 +459,7 @@ def _load_uploaded_stream(files):
451
  random.shuffle(all_rows)
452
  return {"train": all_rows[:-val_size] if val_size > 0 else all_rows, "validation": all_rows[-val_size:] if val_size > 0 else []}
453
 
 
454
  def _guess_columns(sample):
455
  text_col, image_col, audio_col, label_col = "text", "image", "audio", "label"
456
  if not isinstance(sample, dict):
@@ -467,6 +476,7 @@ def _guess_columns(sample):
467
  elif "labels" in keys: label_col = keys["labels"]
468
  return text_col, image_col, audio_col, label_col
469
 
 
470
  def _apply_cda(dataset, text_col, cda_config_str):
471
  try:
472
  swap_groups = json.loads(cda_config_str)
@@ -499,6 +509,7 @@ def _apply_cda(dataset, text_col, cda_config_str):
499
  current_texts.update(next_texts)
500
  return IterableDataset.from_generator(cda_generator)
501
 
 
502
  def _apply_back_translation(dataset, text_col, ratio, model_id, reverse_model_id):
503
  if not ratio or ratio <= 0:
504
  return dataset
@@ -526,6 +537,7 @@ def _apply_back_translation(dataset, text_col, ratio, model_id, reverse_model_id
526
  logger.warning(f"Error en retrotraducción: {e}")
527
  return IterableDataset.from_generator(bt_generator)
528
 
 
529
  def _generate_synthetic_data(original_dataset, text_col, model_id, num_samples, prompt_template):
530
  if not num_samples or num_samples <= 0:
531
  return None
@@ -576,6 +588,7 @@ def _calculate_auto_config(block_size, is_gpt2_like, steps_per_epoch_estimate, b
576
  kv_heads = heads if is_gpt2_like else (max(1, heads // 4))
577
  return vocab_size, hidden_size, hidden_size * 2, layers, heads, safe_block_size, False, kv_heads
578
 
 
579
  def _get_eval_dataset(train_ds_id, eval_ds_id, uploaded_val_data, update_logs_fn):
580
  if eval_ds_id:
581
  yield update_logs_fn(f"Cargando dataset de evaluación: {eval_ds_id}", "Evaluación")
@@ -647,6 +660,7 @@ def _create_training_args(output_dir, repo_id, **kwargs):
647
  raise ValueError("Para datasets en streaming se requiere un valor positivo para 'Máximos Pasos de Entrenamiento'.")
648
  return TrainingArguments(**args_dict)
649
 
 
650
  def _generic_model_loader(model_name_or_path, model_class, **kwargs):
651
  config_kwargs = {"trust_remote_code": True}
652
  if kwargs.get('label2id'):
@@ -664,6 +678,7 @@ def _generic_model_loader(model_name_or_path, model_class, **kwargs):
664
  model = model_class.from_pretrained(model_name_or_path, **model_kwargs)
665
  return model
666
 
 
667
  def _find_all_linear_names(model):
668
  cls = torch.nn.Linear
669
  lora_module_names = set()
@@ -676,6 +691,7 @@ def _find_all_linear_names(model):
676
  common_targets = {'q_proj', 'v_proj', 'k_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'}
677
  return list(lora_module_names.intersection(common_targets)) or list(lora_module_names)
678
 
 
679
  def _sft_formatting_func(example, text_col, tokenizer, **kwargs):
680
  if kwargs.get('sft_format_style') == "Conversacional":
681
  conv_col = ""
@@ -709,9 +725,11 @@ def _sft_formatting_func(example, text_col, tokenizer, **kwargs):
709
  return ""
710
  return example.get(text_col, "")
711
 
 
712
  def _dpo_formatting_func(example, **kwargs):
713
  return {"prompt": example.get(kwargs.get('prompt_col_input', 'prompt'), ""), "chosen": example.get(kwargs.get('dpo_chosen_col_input', 'chosen'), ""), "rejected": example.get(kwargs.get('dpo_rejected_col_input', 'rejected'), "")}
714
 
 
715
  def _evaluate_perplexity(model, tokenizer, eval_dataset, text_col):
716
  model.eval()
717
  encodings = tokenizer("\n\n".join(ex[text_col] for ex in islice(eval_dataset, 1000)), return_tensors="pt")
@@ -736,6 +754,7 @@ def _evaluate_perplexity(model, tokenizer, eval_dataset, text_col):
736
  ppl = torch.exp(torch.stack(nlls).mean())
737
  return ppl.item()
738
 
 
739
  def _merge_multiple_loras(base_model_id, adapter_ids_str, weights_str, combination_type):
740
  adapter_ids = [s.strip() for s in adapter_ids_str.split(',') if s.strip()]
741
  if not adapter_ids:
@@ -767,6 +786,7 @@ def _merge_multiple_loras(base_model_id, adapter_ids_str, weights_str, combinati
767
  yield f"Fusión de adaptadores completada. El entrenamiento continuará con el modelo fusionado en {temp_dir}."
768
  return temp_dir
769
 
 
770
  def _run_trainer_and_upload(trainer, tokenizer, repo_id, update_logs_fn, model_card_content, **kwargs):
771
  yield update_logs_fn("Iniciando ciclo de entrenamiento...", "Entrenando")
772
  trainer.train(resume_from_checkpoint=kwargs.get('resume_from_checkpoint') or False)
@@ -787,6 +807,7 @@ def _run_trainer_and_upload(trainer, tokenizer, repo_id, update_logs_fn, model_c
787
  upload_folder(folder_path=output_dir, repo_id=repo_id, commit_message="Fin de entrenamiento")
788
  return output_dir, final_metrics
789
 
 
790
  def train_sft_dpo(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
791
  output_dir = tempfile.mkdtemp()
792
  is_dpo = kwargs.get('training_mode') == "DPO (Direct Preference Optimization)"
@@ -840,6 +861,7 @@ def train_sft_dpo(model_name, train_dataset, repo_id, update_logs_fn, model_card
840
  except Exception as e:
841
  raise Exception(f"Error en {'DPO' if is_dpo else 'SFT'}: {e}\n{traceback.format_exc()}")
842
 
 
843
  def train_sequence_classification(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
844
  output_dir = tempfile.mkdtemp()
845
  try:
@@ -882,6 +904,7 @@ def train_sequence_classification(model_name, train_dataset, repo_id, update_log
882
  except Exception as e:
883
  raise Exception(f"Error en Sequence Classification: {e}\n{traceback.format_exc()}")
884
 
 
885
  def train_token_classification(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
886
  output_dir = tempfile.mkdtemp()
887
  try:
@@ -939,6 +962,7 @@ def train_token_classification(model_name, train_dataset, repo_id, update_logs_f
939
  except Exception as e:
940
  raise Exception(f"Error en Token Classification: {e}\n{traceback.format_exc()}")
941
 
 
942
  def train_question_answering(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
943
  output_dir = tempfile.mkdtemp()
944
  try:
@@ -1016,6 +1040,7 @@ def train_question_answering(model_name, train_dataset, repo_id, update_logs_fn,
1016
  except Exception as e:
1017
  raise Exception(f"Error en Question Answering: {e}\n{traceback.format_exc()}")
1018
 
 
1019
  def train_seq2seq(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
1020
  output_dir = tempfile.mkdtemp()
1021
  try:
@@ -1067,6 +1092,7 @@ def train_seq2seq(model_name, train_dataset, repo_id, update_logs_fn, model_card
1067
  except Exception as e:
1068
  raise Exception(f"Error en Seq2Seq: {e}\n{traceback.format_exc()}")
1069
 
 
1070
  def train_text_to_image(model_name, train_dataset, repo_id, update_logs, model_card_content, **kwargs):
1071
  output_dir = tempfile.mkdtemp()
1072
  try:
@@ -1218,6 +1244,7 @@ def train_text_to_image(model_name, train_dataset, repo_id, update_logs, model_c
1218
  yield update_logs(f"❌ Error en entrenamiento Text-to-Image: {str(e)}", "Error")
1219
  raise Exception(f"Error en Text-to-Image: {e}\n{traceback.format_exc()}")
1220
 
 
1221
  def _get_data_processing_pipeline(**kwargs):
1222
  hf_ids = [x.strip() for x in (kwargs.get('datasets_hf_text') or "").split(",") if x.strip()]
1223
  if not hf_ids and not kwargs.get('uploads'):
@@ -1489,6 +1516,7 @@ def update_inference_ui(task_mode):
1489
  gr.update(visible=is_text_gen)
1490
  )
1491
 
 
1492
  def create_and_upload_dataset(hf_token, repo_name, creation_type, synth_model, synth_prompt, synth_num_samples, file_uploads, progress=gr.Progress()):
1493
  if not hf_token:
1494
  return "Error: Se requiere un token de Hugging Face.", ""
 
1
  import os
 
2
  os.system("pip install -U gradio")
3
  os.system("pip install -U bitsandbytes diffusers torchaudio torchvision torch transformers peft accelerate trl datasets")
4
+ #os.system("pip install spaces")
5
  os.system("pip install gradio_huggingfacehub_search packaging torchao llmcompressor")
 
6
 
7
  import io
8
  import json
 
171
 
172
  _tox_pipe_singleton = None
173
 
174
+ @spaces.GPU
175
  class DebiasingSFTTrainer(SFTTrainer):
176
  def __init__(self, *args, reweighting_terms=None, reweighting_factor=1.0, **kwargs):
177
  super().__init__(*args, **kwargs)
 
189
  break
190
  return (loss, outputs) if return_outputs else loss
191
 
192
+ @spaces.GPU
193
  class DeduplicatedIterableDataset(IterableDataset):
194
  def __init__(self, dataset, text_col, method, threshold=0.85, num_perm=128):
195
  super().__init__(ex_iterable=iter([]))
 
236
  else:
237
  yield example
238
 
239
+ @spaces.GPU
240
  def hf_login(token):
241
  if not token:
242
  return "Por favor, introduce un token."
 
247
  except Exception as e:
248
  return f"❌ Error en la conexión: {e}"
249
 
250
+ @spaces.GPU
251
  def _clean_text(example, text_col, **kwargs):
252
  text = example.get(text_col, "")
253
  if not isinstance(text, str):
 
265
  example[text_col] = text
266
  return example
267
 
268
+ @spaces.GPU
269
  def _apply_quality_filters(example, text_col, min_len, max_len, rep_threshold, exclude_keywords):
270
  text = example.get(text_col, "")
271
  if not isinstance(text, str): return False
 
279
  lower_text = text.lower()
280
  return not any(keyword in lower_text for keyword in exclude_keywords)
281
 
282
+ @spaces.GPU
283
  def _apply_coherence_filter(example, text_col, char_rep_threshold, ngram_rep_threshold, entropy_threshold):
284
  text = example.get(text_col, "")
285
  if not isinstance(text, str) or not text:
 
346
  return False
347
  return True
348
 
349
+ @spaces.GPU
350
  def _get_filter_functions(**kwargs):
351
  filters = []
352
  if kwargs.get('enable_quality_filter'):
 
407
  filters.append(stats_filter)
408
  return filters
409
 
410
+ @spaces.GPU
411
  def _load_hf_streaming(ids, split="train", probabilities=None):
412
  streams = []
413
  valid_ids = []
 
437
  probabilities = None
438
  return interleave_datasets(streams, probabilities=probabilities)
439
 
440
+ @spaces.GPU
441
  def _load_uploaded_stream(files):
442
  all_rows = []
443
  for f in files or []:
 
459
  random.shuffle(all_rows)
460
  return {"train": all_rows[:-val_size] if val_size > 0 else all_rows, "validation": all_rows[-val_size:] if val_size > 0 else []}
461
 
462
+ @spaces.GPU
463
  def _guess_columns(sample):
464
  text_col, image_col, audio_col, label_col = "text", "image", "audio", "label"
465
  if not isinstance(sample, dict):
 
476
  elif "labels" in keys: label_col = keys["labels"]
477
  return text_col, image_col, audio_col, label_col
478
 
479
+ @spaces.GPU
480
  def _apply_cda(dataset, text_col, cda_config_str):
481
  try:
482
  swap_groups = json.loads(cda_config_str)
 
509
  current_texts.update(next_texts)
510
  return IterableDataset.from_generator(cda_generator)
511
 
512
+ @spaces.GPU
513
  def _apply_back_translation(dataset, text_col, ratio, model_id, reverse_model_id):
514
  if not ratio or ratio <= 0:
515
  return dataset
 
537
  logger.warning(f"Error en retrotraducción: {e}")
538
  return IterableDataset.from_generator(bt_generator)
539
 
540
+ @spaces.GPU
541
  def _generate_synthetic_data(original_dataset, text_col, model_id, num_samples, prompt_template):
542
  if not num_samples or num_samples <= 0:
543
  return None
 
588
  kv_heads = heads if is_gpt2_like else (max(1, heads // 4))
589
  return vocab_size, hidden_size, hidden_size * 2, layers, heads, safe_block_size, False, kv_heads
590
 
591
+ @spaces.GPU
592
  def _get_eval_dataset(train_ds_id, eval_ds_id, uploaded_val_data, update_logs_fn):
593
  if eval_ds_id:
594
  yield update_logs_fn(f"Cargando dataset de evaluación: {eval_ds_id}", "Evaluación")
 
660
  raise ValueError("Para datasets en streaming se requiere un valor positivo para 'Máximos Pasos de Entrenamiento'.")
661
  return TrainingArguments(**args_dict)
662
 
663
+ @spaces.GPU
664
  def _generic_model_loader(model_name_or_path, model_class, **kwargs):
665
  config_kwargs = {"trust_remote_code": True}
666
  if kwargs.get('label2id'):
 
678
  model = model_class.from_pretrained(model_name_or_path, **model_kwargs)
679
  return model
680
 
681
+ @spaces.GPU
682
  def _find_all_linear_names(model):
683
  cls = torch.nn.Linear
684
  lora_module_names = set()
 
691
  common_targets = {'q_proj', 'v_proj', 'k_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'}
692
  return list(lora_module_names.intersection(common_targets)) or list(lora_module_names)
693
 
694
+ @spaces.GPU
695
  def _sft_formatting_func(example, text_col, tokenizer, **kwargs):
696
  if kwargs.get('sft_format_style') == "Conversacional":
697
  conv_col = ""
 
725
  return ""
726
  return example.get(text_col, "")
727
 
728
+ @spaces.GPU
729
  def _dpo_formatting_func(example, **kwargs):
730
  return {"prompt": example.get(kwargs.get('prompt_col_input', 'prompt'), ""), "chosen": example.get(kwargs.get('dpo_chosen_col_input', 'chosen'), ""), "rejected": example.get(kwargs.get('dpo_rejected_col_input', 'rejected'), "")}
731
 
732
+ @spaces.GPU
733
  def _evaluate_perplexity(model, tokenizer, eval_dataset, text_col):
734
  model.eval()
735
  encodings = tokenizer("\n\n".join(ex[text_col] for ex in islice(eval_dataset, 1000)), return_tensors="pt")
 
754
  ppl = torch.exp(torch.stack(nlls).mean())
755
  return ppl.item()
756
 
757
+ @spaces.GPU
758
  def _merge_multiple_loras(base_model_id, adapter_ids_str, weights_str, combination_type):
759
  adapter_ids = [s.strip() for s in adapter_ids_str.split(',') if s.strip()]
760
  if not adapter_ids:
 
786
  yield f"Fusión de adaptadores completada. El entrenamiento continuará con el modelo fusionado en {temp_dir}."
787
  return temp_dir
788
 
789
+ @spaces.GPU
790
  def _run_trainer_and_upload(trainer, tokenizer, repo_id, update_logs_fn, model_card_content, **kwargs):
791
  yield update_logs_fn("Iniciando ciclo de entrenamiento...", "Entrenando")
792
  trainer.train(resume_from_checkpoint=kwargs.get('resume_from_checkpoint') or False)
 
807
  upload_folder(folder_path=output_dir, repo_id=repo_id, commit_message="Fin de entrenamiento")
808
  return output_dir, final_metrics
809
 
810
+ @spaces.GPU
811
  def train_sft_dpo(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
812
  output_dir = tempfile.mkdtemp()
813
  is_dpo = kwargs.get('training_mode') == "DPO (Direct Preference Optimization)"
 
861
  except Exception as e:
862
  raise Exception(f"Error en {'DPO' if is_dpo else 'SFT'}: {e}\n{traceback.format_exc()}")
863
 
864
+ @spaces.GPU
865
  def train_sequence_classification(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
866
  output_dir = tempfile.mkdtemp()
867
  try:
 
904
  except Exception as e:
905
  raise Exception(f"Error en Sequence Classification: {e}\n{traceback.format_exc()}")
906
 
907
+ @spaces.GPU
908
  def train_token_classification(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
909
  output_dir = tempfile.mkdtemp()
910
  try:
 
962
  except Exception as e:
963
  raise Exception(f"Error en Token Classification: {e}\n{traceback.format_exc()}")
964
 
965
+ @spaces.GPU
966
  def train_question_answering(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
967
  output_dir = tempfile.mkdtemp()
968
  try:
 
1040
  except Exception as e:
1041
  raise Exception(f"Error en Question Answering: {e}\n{traceback.format_exc()}")
1042
 
1043
+ @spaces.GPU
1044
  def train_seq2seq(model_name, train_dataset, repo_id, update_logs_fn, model_card_content, **kwargs):
1045
  output_dir = tempfile.mkdtemp()
1046
  try:
 
1092
  except Exception as e:
1093
  raise Exception(f"Error en Seq2Seq: {e}\n{traceback.format_exc()}")
1094
 
1095
+ @spaces.GPU
1096
  def train_text_to_image(model_name, train_dataset, repo_id, update_logs, model_card_content, **kwargs):
1097
  output_dir = tempfile.mkdtemp()
1098
  try:
 
1244
  yield update_logs(f"❌ Error en entrenamiento Text-to-Image: {str(e)}", "Error")
1245
  raise Exception(f"Error en Text-to-Image: {e}\n{traceback.format_exc()}")
1246
 
1247
+ @spaces.GPU
1248
  def _get_data_processing_pipeline(**kwargs):
1249
  hf_ids = [x.strip() for x in (kwargs.get('datasets_hf_text') or "").split(",") if x.strip()]
1250
  if not hf_ids and not kwargs.get('uploads'):
 
1516
  gr.update(visible=is_text_gen)
1517
  )
1518
 
1519
+ @spaces.GPU
1520
  def create_and_upload_dataset(hf_token, repo_name, creation_type, synth_model, synth_prompt, synth_num_samples, file_uploads, progress=gr.Progress()):
1521
  if not hf_token:
1522
  return "Error: Se requiere un token de Hugging Face.", ""