Files changed (1) hide show
  1. 訓練模型 +34 -0
訓練模型 ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Fine-tune GPT-2 on a Hong Kong IRD tax dataset (CSV from data.gov.hk).

Fixes vs. the original script:
  * the dataset URL was a bare, unquoted token (SyntaxError); it is now a
    string constant loaded via ``datasets.load_dataset("csv", ...)`` —
    ``Dataset.from_dict`` expects a dict of columns, not a URL.
  * GPT-2 ships without a pad token, so ``padding='max_length'`` would raise;
    the EOS token is reused as the pad token (standard GPT-2 practice).
  * causal-LM fine-tuning needs a ``labels`` field for the loss;
    ``DataCollatorForLanguageModeling(mlm=False)`` supplies it.
"""

from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

# Load the data: business-registration-fee dataset published by the
# Hong Kong Inland Revenue Department.
DATA_URL = "https://www.ird.gov.hk/datagovhk/brfee.csv"
dataset = load_dataset("csv", data_files=DATA_URL, split="train")

# Load the pre-trained model and tokenizer.
# NOTE: "gpt-3" (suggested in the original comment) is not a public
# Hugging Face checkpoint; valid choices are "gpt2", "gpt2-medium", etc.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# GPT-2 has no pad token by default; reuse EOS so padding works.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id


def preprocess_function(examples):
    """Tokenize the 'text' column, truncating/padding to the model max length.

    NOTE(review): assumes the CSV exposes a 'text' column — brfee.csv is
    tabular fee data, so a text column may need to be built from the raw
    fields first; confirm against the actual file schema.
    """
    return tokenizer(examples['text'], truncation=True, padding='max_length')


tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Collator builds the `labels` tensor for the causal-LM loss (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training configuration.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()