# fine_tune.py
from datasets import load_dataset
from transformers import BartTokenizer, BartForConditionalGeneration, DataCollatorForSeq2Seq, Trainer, TrainingArguments
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
dataset = load_dataset("marcov/scientific_papers_arxiv_promptsource")
# Small subset for quick everyday testing
dataset["train"] = dataset["train"].select(range(1000))
dataset["validation"] = dataset["validation"].select(range(200))
max_input_length = 1024
max_output_length = 200
def preprocess_function(batch):
    # Tokenize the source articles (encoder inputs) and the reference summaries (labels),
    # truncating both to the configured maximum lengths.
    inputs = tokenizer(batch["article"], max_length=max_input_length, truncation=True)
    outputs = tokenizer(batch["summary"], max_length=max_output_length, truncation=True)
    batch["input_ids"] = inputs["input_ids"]
    batch["attention_mask"] = inputs["attention_mask"]
    batch["labels"] = outputs["input_ids"]
    return batch
tokenized_train = dataset["train"].map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
tokenized_val = dataset["validation"].map(preprocess_function, batched=True, remove_columns=dataset["validation"].column_names)
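# Optional sanity check (a minimal sketch): confirm that truncation kept one
# tokenized example within the configured input and label limits.
sample = tokenized_train[0]
print("input tokens:", len(sample["input_ids"]), "| label tokens:", len(sample["labels"]))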
training_args = TrainingArguments(
    output_dir="./bart-finetuned-arxiv-hub",
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    learning_rate=3e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=False,  # set to True when training on a GPU
    logging_dir="./logs",
    logging_steps=100,
    push_to_hub=True,
    hub_model_id="username/bart-finetuned-arxiv",  # Hub repo the trainer pushes to
)
# Pad dynamically per batch; DataCollatorForSeq2Seq also pads labels with -100
# so padded positions are ignored by the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()
trainer.push_to_hub(commit_message="Fine-tune BART on arXiv summaries")  # pushes to hub_model_id set above
print("Fine-tuning complete.")