from transformers import Trainer, TrainingArguments
from datasets import Dataset
import json
from modeling_tinytransformer import TinyTransformerModel
from configuration_tinytransformer import TinyTransformerConfig
from tokenization_tinytransformer import TinyTokenizer
# Load the training data: one JSON object per line (JSONL),
# each expected to carry "text" and "label" fields (consumed below).
data = []
with open("data/train_data.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline) that would
        # otherwise crash json.loads with a JSONDecodeError.
        if line.strip():
            data.append(json.loads(line))
dataset = Dataset.from_list(data)

# Simple tokenizer (project-local implementation).
tokenizer = TinyTokenizer()
def preprocess(examples):
    """Tokenize a batch of texts and attach the labels for the Trainer.

    `examples` is a batched dict from `Dataset.map` with "text" and
    "label" columns; returns the tokenizer encodings plus a "labels" key.
    """
    batch = tokenizer(
        examples["text"],
        truncation=True,
        max_length=64,
        padding="max_length",
    )
    batch["labels"] = examples["label"]
    return batch
# Tokenize the whole dataset up front (batched for speed).
tokenized_dataset = dataset.map(preprocess, batched=True)

# Build a fresh tiny transformer for 2-class classification.
# (`config` and `model` are reused below when saving.)
config = TinyTransformerConfig(vocab_size=tokenizer.vocab_size, num_labels=2)
model = TinyTransformerModel(config)

# Training setup: 3 epochs, tiny batches, log every step,
# no checkpoint saving and no experiment reporting.
train_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    logging_steps=1,
    save_strategy="no",
    report_to="none",
)

trainer = Trainer(model=model, args=train_args, train_dataset=tokenized_dataset)
trainer.train()
# Persist everything needed to reload the fine-tuned model later.
save_dir = "./tiny-sentiment-model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)  # this also writes vocab.json
config.save_pretrained(save_dir)