Spaces:

jjuarez
/

test2

Runtime error

App Files Community

jjuarez committed on Apr 10, 2023

Commit

ecbf4e6

1 Parent(s): 6846b58

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -28

app.py CHANGED Viewed

@@ -8,17 +8,17 @@ import nltk
 nltk.download("punkt")
 raw_dataset = load_dataset("scientific_papers", "pubmed")
 metric = evaluate.load("rouge")
-model_checkpoint = "t5-small"
 tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
-if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
     prefix = "summarize: "
 else:
     prefix = ""
 # preprocessing function
-max_input_length = 256
-max_target_length = 64
 def preprocess_function(examples):
     inputs = [prefix + doc for doc in examples["article"]]
     model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
@@ -31,23 +31,23 @@ def preprocess_function(examples):
     return model_inputs
 for split in ["train", "validation", "test"]:
-    raw_dataset[split] = raw_dataset[split].select([n for n in np.random.randint(0, len(raw_dataset[split]) - 1, 200)])
 tokenized_dataset = raw_dataset.map(preprocess_function, batched=True)
 model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
-batch_size = 4
 args = Seq2SeqTrainingArguments(
     f"{model_checkpoint}-scientific_papers",
     evaluation_strategy="epoch",
-    learning_rate=3e-5,
     per_device_train_batch_size=batch_size,
     per_device_eval_batch_size=batch_size,
     weight_decay=0.01,
     save_total_limit=3,
-    num_train_epochs=0.5,
     predict_with_generate=True,
     # fp16=True,
     push_to_hub=False,
@@ -69,40 +69,35 @@ def compute_metrics(eval_pred):
     result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
     # Extract a few results
     result = {key: value * 100 for key, value in result.items()}
-    # Add mean generated length
     prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
     result["gen_len"] = np.mean(prediction_lens)
     return {k: round(v, 4) for k, v in result.items()}
-# Define the training and evaluation datasets
-train_dataset = tokenized_dataset["train"]
-eval_dataset = tokenized_dataset["validation"]
-# Create the trainer object
 trainer = Seq2SeqTrainer(
-    model=model,
-    args=args,
-    train_dataset=train_dataset,
-    eval_dataset=eval_dataset,
-    data_collator=data_collator,
-    compute_metrics=compute_metrics,
 )
-# Train the model
 trainer.train()
 # Define the input and output interface of the app
 def summarizer(input_text):
     inputs = [prefix + input_text]
     model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
     summary_ids = model.generate(
         input_ids=model_inputs["input_ids"],
         attention_mask=model_inputs["attention_mask"],
-        num_beams=6,
-        length_penalty=2.5,
         max_length=max_target_length + 2,  # +2 from original because we start at step=1 and stop before max_length
-        repetition_penalty=3.5,
         early_stopping=True,
         use_cache=True
     )
@@ -119,4 +114,3 @@ iface = gr.Interface(
     theme="gray"
 )
 iface.launch()

 nltk.download("punkt")
 raw_dataset = load_dataset("scientific_papers", "pubmed")
 metric = evaluate.load("rouge")
+model_checkpoint = "google/flan-t5-small"
 tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+if model_checkpoint in ["google/flan-t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
     prefix = "summarize: "
 else:
     prefix = ""
 # preprocessing function
+max_input_length = 512
+max_target_length = 128
 def preprocess_function(examples):
     inputs = [prefix + doc for doc in examples["article"]]
     model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
     return model_inputs
 for split in ["train", "validation", "test"]:
+    raw_dataset[split] = raw_dataset[split].select([n for n in np.random.randint(0, len(raw_dataset[split]) - 1, 1_000)])
 tokenized_dataset = raw_dataset.map(preprocess_function, batched=True)
 model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+batch_size = 8
 args = Seq2SeqTrainingArguments(
     f"{model_checkpoint}-scientific_papers",
     evaluation_strategy="epoch",
+    learning_rate=2e-5,
     per_device_train_batch_size=batch_size,
     per_device_eval_batch_size=batch_size,
     weight_decay=0.01,
     save_total_limit=3,
+    num_train_epochs=1,
     predict_with_generate=True,
     # fp16=True,
     push_to_hub=False,
     result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
     # Extract a few results
     result = {key: value * 100 for key, value in result.items()}
+        # Add mean generated length
     prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
     result["gen_len"] = np.mean(prediction_lens)
     return {k: round(v, 4) for k, v in result.items()}
 trainer = Seq2SeqTrainer(
+model,
+args,
+train_dataset=tokenized_dataset["train"],
+eval_dataset=tokenized_dataset["validation"],
+data_collator=data_collator,
+tokenizer=tokenizer,
+compute_metrics=compute_metrics
 )
 trainer.train()
 # Define the input and output interface of the app
+import gradio as gr
 def summarizer(input_text):
     inputs = [prefix + input_text]
     model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
     summary_ids = model.generate(
         input_ids=model_inputs["input_ids"],
         attention_mask=model_inputs["attention_mask"],
+        num_beams=4,
+        length_penalty=2.0,
         max_length=max_target_length + 2,  # +2 from original because we start at step=1 and stop before max_length
+        repetition_penalty=2.0,
         early_stopping=True,
         use_cache=True
     )
     theme="gray"
 )
 iface.launch()