Spaces:

AntonioCGF
/

Summarization_Spanish_Text

Sleeping

App Files Files Community

AntonioCGF committed 9 days ago

Commit

acfcd3b

verified ·

1 Parent(s): 2f76d83

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -16

app.py CHANGED Viewed

@@ -23,6 +23,7 @@ DATASET_SPLITS = {
 DATASET_URL = "hf://datasets/somosnlp/NoticIA-it/"
 BASE_MODEL_NAME = "josmunpen/mt5-small-spanish-summarization"
 DEFAULT_OUTPUT_DIR = "mt5-resumenes-es-final"
 SAMPLE_SIZE = 256
 MAX_INPUT_LENGTH = 256
 MAX_TARGET_LENGTH = 64
@@ -220,22 +221,27 @@ def main():
     df = load_dataframe()
     train_df, val_df, test_df = prepare_splits(df)
-    if output_dir.exists() and not args.retrain:
-        tokenizer = AutoTokenizer.from_pretrained(output_dir)
-        model = AutoModelForSeq2SeqLM.from_pretrained(output_dir)
-        train_tokenized, val_tokenized, test_tokenized = tokenize_datasets(tokenizer, train_df, val_df, test_df)
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        model.to(device)
-        data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
-        train_loss = float("nan")
-        test_loss = float("nan")
-        test_perplexity = float("nan")
-    else:
-        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
-        model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_NAME)
-        train_tokenized, val_tokenized, test_tokenized = tokenize_datasets(tokenizer, train_df, val_df, test_df)
-        device, train_loss, test_loss, test_perplexity, data_collator = train_model(model, tokenizer, train_tokenized, test_tokenized)
-        save_model(model, tokenizer, output_dir)
     metrics_df = compute_metrics(model, tokenizer, test_tokenized, data_collator, device)
     metrics_df["valor"] = metrics_df["valor"].apply(lambda value: round(value, 4) if isinstance(value, (float, np.floating)) and np.isfinite(value) else value)

 DATASET_URL = "hf://datasets/somosnlp/NoticIA-it/"
 BASE_MODEL_NAME = "josmunpen/mt5-small-spanish-summarization"
 DEFAULT_OUTPUT_DIR = "mt5-resumenes-es-final"
+DEFAULT_BUCKET = "hf://buckets/AntonioCGF/statetensor_TECP"
 SAMPLE_SIZE = 256
 MAX_INPUT_LENGTH = 256
 MAX_TARGET_LENGTH = 64
     df = load_dataframe()
     train_df, val_df, test_df = prepare_splits(df)
+    # if output_dir.exists() and not args.retrain:
+    #     tokenizer = AutoTokenizer.from_pretrained(output_dir)
+    #     model = AutoModelForSeq2SeqLM.from_pretrained(output_dir)
+    #     train_tokenized, val_tokenized, test_tokenized = tokenize_datasets(tokenizer, train_df, val_df, test_df)
+    #     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    #     model.to(device)
+    #     data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
+    #     train_loss = float("nan")
+    #     test_loss = float("nan")
+    #     test_perplexity = float("nan")
+    # else:
+    #     tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
+    #     model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_NAME)
+    #     train_tokenized, val_tokenized, test_tokenized = tokenize_datasets(tokenizer, train_df, val_df, test_df)
+    #     device, train_loss, test_loss, test_perplexity, data_collator = train_model(model, tokenizer, train_tokenized, test_tokenized)
+    #     save_model(model, tokenizer, output_dir)
+    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
+    model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_NAME)
+    train_tokenized, val_tokenized, test_tokenized = tokenize_datasets(tokenizer, train_df, val_df, test_df)
+    device, train_loss, test_loss, test_perplexity, data_collator = train_model(model, tokenizer, train_tokenized, test_tokenized)
     metrics_df = compute_metrics(model, tokenizer, test_tokenized, data_collator, device)
     metrics_df["valor"] = metrics_df["valor"].apply(lambda value: round(value, 4) if isinstance(value, (float, np.floating)) and np.isfinite(value) else value)