student2222333051 committed
Commit 887395f · verified · 1 Parent(s): 56e71ab

Update fine_tune.py

Files changed (1)
  1. fine_tune.py +12 -31
fine_tune.py CHANGED
@@ -1,23 +1,21 @@
  # fine_tune.py
- from datasets import load_dataset, load_metric
+ from datasets import load_dataset
  from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
+ import os
+
+ model_name = "facebook/bart-large-cnn"
+ tokenizer = BartTokenizer.from_pretrained(model_name)
+ model = BartForConditionalGeneration.from_pretrained(model_name)
 
- # 1️⃣ Load the dataset (Marcov ArXiv)
  dataset = load_dataset("marcov/scientific_papers_arxiv_promptsource")
 
- # Small subset (for testing)
+ # Small subset for everyday testing
  dataset["train"] = dataset["train"].select(range(1000))
  dataset["validation"] = dataset["validation"].select(range(200))
 
- # 2️⃣ Tokenizer and model
- model_name = "facebook/bart-large-cnn"
- tokenizer = BartTokenizer.from_pretrained(model_name)
- model = BartForConditionalGeneration.from_pretrained(model_name)
-
  max_input_length = 1024
  max_output_length = 200
 
- # 3️⃣ Tokenization
  def preprocess_function(batch):
      inputs = tokenizer(batch["article"], max_length=max_input_length, truncation=True)
      outputs = tokenizer(batch["summary"], max_length=max_output_length, truncation=True)
@@ -26,20 +24,9 @@ def preprocess_function(batch):
      batch["labels"] = outputs["input_ids"]
      return batch
 
- tokenized_train = dataset["train"].map(preprocess_function, batched=True)
- tokenized_val = dataset["validation"].map(preprocess_function, batched=True)
-
- # 4️⃣ ROUGE metric
- rouge = load_metric("rouge")
+ tokenized_train = dataset["train"].map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
+ tokenized_val = dataset["validation"].map(preprocess_function, batched=True, remove_columns=dataset["validation"].column_names)
 
- def compute_metrics(eval_pred):
-     predictions, labels = eval_pred
-     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
-     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
-     result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
-     return {key: value.mid.fmeasure * 100 for key, value in result.items()}
-
- # 5️⃣ TrainingArguments
  training_args = TrainingArguments(
      output_dir="./bart-finetuned-arxiv-hub",
      evaluation_strategy="steps",
@@ -51,26 +38,20 @@ training_args = TrainingArguments(
      per_device_eval_batch_size=2,
      num_train_epochs=3,
      weight_decay=0.01,
-     fp16=True,
+     fp16=False,  # set to True if a GPU is available
      logging_dir="./logs",
      logging_steps=100,
-     push_to_hub=True  # push the fine-tuned model to the HuggingFace Hub
+     push_to_hub=True
  )
 
- # 6️⃣ Trainer
  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=tokenized_train,
      eval_dataset=tokenized_val,
      tokenizer=tokenizer,
-     compute_metrics=compute_metrics,
  )
 
- # 7️⃣ Start fine-tuning
  trainer.train()
-
- # 8️⃣ Push the model to the HuggingFace Hub
  trainer.push_to_hub("username/bart-finetuned-arxiv")
-
- print("Fine-tuning finished! Model pushed to the Hub.")
+ print("Fine-tuning complete.")
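
This commit drops the ROUGE evaluation together with the `load_metric` import, which is consistent with `datasets.load_metric` being deprecated and removed in recent `datasets` releases. If the metric is still wanted, a minimal sketch using the maintained `evaluate` package instead (assumes `evaluate` and `rouge_score` are installed, reuses the script's `tokenizer`, and expects predictions as token ids, e.g. from `Seq2SeqTrainer` with `predict_with_generate=True`):

import numpy as np
import evaluate

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Trainer pads labels with -100; restore a real token id before decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    # evaluate's rouge returns plain floats, not the old .mid.fmeasure objects
    return {key: round(score * 100, 2) for key, score in result.items()}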
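
Separately, `preprocess_function` tokenizes without padding, and the plain `Trainer` default collator does not pad `labels`, so stacking variable-length examples into a batch can fail. The usual fix is `DataCollatorForSeq2Seq`; a minimal sketch reusing the script's own `tokenizer`, `model`, and datasets:

from transformers import DataCollatorForSeq2Seq

# Pads input_ids/attention_mask with the pad token and labels with -100, per batch
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)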
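
One caveat on the last line of the script: in recent transformers releases, `Trainer.push_to_hub()` takes a commit message as its first argument rather than a repo id, so "username/bart-finetuned-arxiv" would end up as the commit message. The target repo is normally named via `TrainingArguments`; a sketch using only the arguments visible in this diff, with the script's placeholder repo id:

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./bart-finetuned-arxiv-hub",
    evaluation_strategy="steps",
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=False,
    logging_dir="./logs",
    logging_steps=100,
    push_to_hub=True,
    hub_model_id="username/bart-finetuned-arxiv",  # placeholder repo id from the script
)

# after trainer.train():
trainer.push_to_hub(commit_message="Fine-tuning complete")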