| |
| """ |
| 中文情感分析模型訓練腳本 |
| 這個腳本展示如何創建一個可推理的 Hugging Face 模型 |
| """ |
|
|
| import torch |
| import pandas as pd |
| from transformers import ( |
| BertTokenizer, BertForSequenceClassification, |
| TrainingArguments, Trainer, pipeline |
| ) |
| from torch.utils.data import Dataset |
| import numpy as np |
| from sklearn.metrics import accuracy_score |
|
|
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one (text, label) pair per lookup.

    Args:
        texts: Indexable collection of raw texts; each entry is passed
            through ``str()`` before tokenization.
        labels: Indexable collection of integer class labels, aligned with
            ``texts`` by position.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=...,
            padding=..., max_length=..., return_tensors='pt')`` whose result
            can be indexed by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every sequence is padded/truncated to.
    """

    # Keys copied from the tokenizer output into each sample, in order.
    _ENCODED_KEYS = ('input_ids', 'attention_mask')

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of samples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of tensors.

        The dict holds ``'input_ids'`` and ``'attention_mask'`` (each
        flattened to one dimension) plus ``'labels'`` as a ``torch.long``
        scalar tensor.
        """
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer returns a leading batch dimension because of
        # return_tensors='pt'; flatten it away for a single sample.
        sample = {key: encoded[key].flatten() for key in self._ENCODED_KEYS}
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample
|
|
def create_demo_model():
    """Fine-tune a small demo Chinese sentiment classifier and return it.

    Loads the ``bert-base-chinese`` checkpoint with a two-class head
    (0 = NEGATIVE, 1 = POSITIVE), trains it for one epoch on six hard-coded
    example sentences, and hands back the trained objects.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    checkpoint = "bert-base-chinese"

    # Keep both label maps in sync by deriving one from the other.
    id_to_label = {0: "NEGATIVE", 1: "POSITIVE"}
    label_to_id = {name: index for index, name in id_to_label.items()}

    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    model = BertForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=len(id_to_label),
        id2label=id_to_label,
        label2id=label_to_id,
    )

    # Tiny hand-written training set: (sentence, label) with 1 = positive.
    samples = [
        ("這個產品真的很棒!我非常滿意。", 1),
        ("質量很差,完全不推薦。", 0),
        ("服務態度很好,值得信賴。", 1),
        ("價格太貴了,性價比不高。", 0),
        ("非常棒的體驗,會再次購買。", 1),
        ("完全浪費錢,後悔購買。", 0),
    ]
    texts = [sentence for sentence, _ in samples]
    labels = [label for _, label in samples]
    train_set = SentimentDataset(texts, labels, tokenizer)

    # NOTE(review): six samples at batch size 2 for one epoch yields only a
    # few optimizer steps, fewer than warmup_steps=10, so the learning rate
    # presumably never finishes warming up -- confirm this is intended.
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir='./results',
            num_train_epochs=1,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            warmup_steps=10,
            weight_decay=0.01,
            logging_dir='./logs',
            save_strategy="no",  # no intermediate checkpoints are written
        ),
        train_dataset=train_set,
        tokenizer=tokenizer,
    )
    trainer.train()

    return model, tokenizer
|
|
def save_model_for_huggingface(model, tokenizer, save_directory):
    """Save the model and tokenizer in Hugging Face format and list the files.

    Args:
        model: Object exposing ``save_pretrained(directory)``.
        tokenizer: Object exposing ``save_pretrained(directory)``.
        save_directory: Target directory handed to both ``save_pretrained``
            calls and listed afterwards.

    Returns:
        None. The saved location and the directory contents are printed.
    """
    import os

    # Model first, then tokenizer, both into the same directory.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(save_directory)

    print(f"模型已保存到: {save_directory}")
    print("包含的檔案:")
    for entry in os.listdir(save_directory):
        print(f" - {entry}")
|
|
def test_inference(model_directory):
    """Load the saved model into a pipeline and print sample predictions.

    Args:
        model_directory: Location passed as both ``model`` and ``tokenizer``
            to the ``text-classification`` pipeline.

    Returns:
        None. Each sample text and its raw pipeline result are printed.
    """
    # NOTE(review): `return_all_scores` is reported as deprecated in newer
    # transformers releases in favour of `top_k=None`; left unchanged here
    # because switching alters the result shape -- confirm against the
    # installed version.
    classifier = pipeline(
        "text-classification",
        model=model_directory,
        tokenizer=model_directory,
        return_all_scores=True
    )

    test_texts = [
        "這個手機真的很好用!",
        "服務態度太差了。",
        "質量不錯,值得推薦。"
    ]

    # Use a real newline escape: the previous "\\n" printed a literal
    # backslash followed by "n" instead of a blank line before the header.
    print("\n=== 模型推理測試 ===")
    for text in test_texts:
        result = classifier(text)
        print(f"文本: {text}")
        print(f"結果: {result}")
        print("-" * 50)
|
|
if __name__ == "__main__":
    # Script entry point: train the demo model, save it, then smoke-test
    # inference from the saved directory.
    print("開始創建中文情感分析模型...")

    model, tokenizer = create_demo_model()

    save_directory = "./my-sentiment-model"
    save_model_for_huggingface(model, tokenizer, save_directory)

    test_inference(save_directory)

    # Use a real newline escape: the previous "\\n" printed a literal
    # backslash followed by "n" in front of the completion message.
    print("\n✅ 模型創建完成!")
    print("現在你可以:")
    print("1. 將模型檔案推送到 Hugging Face")
    print("2. 讓其他人使用 transformers 載入你的模型")
    print("3. 使用 Inference API 進行線上推理")
|
|