"""Train a tiny sentiment classifier built on a custom TinyTransformer.

Reads JSONL training data (one ``{"text": ..., "label": ...}`` object per
line), tokenizes it to fixed-length sequences, fine-tunes the model with the
Hugging Face ``Trainer``, and saves the model, tokenizer, and config to
``./tiny-sentiment-model``.
"""

import json

from datasets import Dataset
from transformers import Trainer, TrainingArguments

from configuration_tinytransformer import TinyTransformerConfig
from modeling_tinytransformer import TinyTransformerModel
from tokenization_tinytransformer import TinyTokenizer


def load_jsonl_dataset(path):
    """Load a JSONL file (one JSON object per line) into a ``Dataset``."""
    with open(path, "r", encoding="utf-8") as f:
        records = [json.loads(line) for line in f]
    return Dataset.from_list(records)


def build_preprocess_fn(tokenizer):
    """Return a batched ``map`` function that tokenizes text and attaches labels."""

    def preprocess(examples):
        # Pad/truncate every example to exactly 64 tokens so batches stack
        # without a dynamic-padding collator.
        encodings = tokenizer(
            examples["text"],
            truncation=True,
            max_length=64,
            padding="max_length",
        )
        encodings["labels"] = examples["label"]
        return encodings

    return preprocess


def main():
    """Run the full load → tokenize → train → save pipeline."""
    # Load data
    dataset = load_jsonl_dataset("data/train_data.jsonl")

    # Simple tokenization
    tokenizer = TinyTokenizer()
    tokenized_dataset = dataset.map(build_preprocess_fn(tokenizer), batched=True)

    # Initialize the model (binary sentiment: num_labels=2)
    config = TinyTransformerConfig(vocab_size=tokenizer.vocab_size, num_labels=2)
    model = TinyTransformerModel(config)

    # Training setup: tiny demo run — 3 epochs, no checkpoints, no reporting.
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        logging_steps=1,
        save_strategy="no",
        report_to="none",
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )
    trainer.train()

    # Save all artifacts next to each other so the directory is directly
    # loadable with from_pretrained().
    model.save_pretrained("./tiny-sentiment-model")
    tokenizer.save_pretrained("./tiny-sentiment-model")  # also writes vocab.json
    config.save_pretrained("./tiny-sentiment-model")


if __name__ == "__main__":
    main()