zeng981 committed on
Commit
af249ba
·
verified ·
1 Parent(s): 5a5d524

Upload 1 (4).py

Browse files
Files changed (1) hide show
  1. 1 (4).py +100 -0
1 (4).py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import torch
3
+ import numpy as np
4
+ from sklearn.model_selection import train_test_split
5
+ from sklearn.metrics import classification_report
6
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
7
+ from torch.utils.data import Dataset
8
+ from rouge_score import rouge_scorer
9
+
10
# Step 1: load the data
# The JSON file is read as a list of records; each record is indexed with
# the keys "input" (the text) and "output" (its class label).
with open("train_data1(2).json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

texts = [d["input"] for d in raw_data]
labels = [d["output"] for d in raw_data]

# Sorting the distinct labels makes the label -> id assignment deterministic
# from run to run for the same data.
label2id = {label: i for i, label in enumerate(sorted(set(labels)))}
id2label = {i: label for label, i in label2id.items()}
y_numeric = [label2id[label] for label in labels]

# Hold out 20% of the samples for validation; the fixed seed keeps the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    texts, y_numeric, test_size=0.2, random_state=42
)
22
+
23
# Step 2: custom dataset class
class MedicalDataset(Dataset):
    """Dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer class ids, aligned with
            ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` that
            returns a mapping with ``input_ids`` and ``attention_mask``
            tensors.
        max_len: Length every sequence is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors and label.

        Returns:
            Dict with ``input_ids``, ``attention_mask`` (leading dimension
            removed) and ``labels`` (a scalar ``torch.long`` tensor).
        """
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        return {
            # squeeze(0) drops the leading dimension of the single-text
            # encoding so that batching adds the batch axis itself.
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            # Force int64: classification losses expect long targets, and
            # labels may arrive as e.g. numpy int32 scalars.
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }
47
+
48
# Step 3: load the local model
# "ClinicalBERT" is passed as-is to from_pretrained; the classification head
# is sized to the number of distinct labels found in the data.
model_path = "ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(
    model_path, num_labels=len(label2id)
)

train_dataset = MedicalDataset(X_train, y_train, tokenizer)
val_dataset = MedicalDataset(X_val, y_val, tokenizer)
55
+
56
# Step 4: configure the training arguments (no evaluation_strategy)
# No checkpoints are written (save_strategy="no"); evaluation is run
# explicitly after training rather than on a schedule.
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="no",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
)
68
+
69
# Step 5: train
# NOTE(review): newer transformers releases appear to deprecate the
# `tokenizer=` argument in favour of `processing_class=` - confirm against the
# installed version before changing it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

trainer.train()
# Run one evaluation pass over the validation set once training has finished.
trainer.evaluate()
80
+
81
# Step 6: classification evaluation
preds = trainer.predict(val_dataset).predictions
# argmax over axis 1 picks the highest-scoring class id for each sample.
pred_ids = np.argmax(preds, axis=1)
# Map numeric ids back to the original label strings for the report.
true_labels = [id2label[i] for i in y_val]
pred_labels = [id2label[i] for i in pred_ids]

print(" 分类报告:")
print(classification_report(true_labels, pred_labels))
89
+
90
# Step 7: ROUGE score computation
# Each true label string is scored against the predicted label string.
# NOTE(review): ROUGE is applied to class-label text here; if the labels are
# not English words the default tokenizer/stemmer may produce empty token
# lists and zero scores - confirm this metric is meaningful for the label set.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
rouge1_scores, rougeL_scores = [], []

for ref, pred in zip(true_labels, pred_labels):
    score = scorer.score(ref, pred)
    rouge1_scores.append(score['rouge1'].fmeasure)
    rougeL_scores.append(score['rougeL'].fmeasure)

print(f"\n Avg ROUGE-1 F1: {np.mean(rouge1_scores):.4f}")
print(f" Avg ROUGE-L F1: {np.mean(rougeL_scores):.4f}")