Upload 33 files
Browse files- .gitattributes +4 -35
- .gitignore +2 -0
- README.md +26 -3
- cl.py +77 -0
- data/IEMOCAP/dev_data_roberta.json.feature +3 -0
- data/IEMOCAP/dev_data_roberta_mm.json.feature +3 -0
- data/IEMOCAP/label_vocab.pkl +0 -0
- data/IEMOCAP/speaker_vocab.pkl +0 -0
- data/IEMOCAP/test_data_roberta.json.feature +3 -0
- data/IEMOCAP/test_data_roberta_mm.json.feature +3 -0
- data/IEMOCAP/train_data_roberta.json.feature +3 -0
- data/IEMOCAP/train_data_roberta_mm.json.feature +3 -0
- data/MELD/dev_data_roberta.json.feature +3 -0
- data/MELD/dev_data_roberta_mm.json.feature +3 -0
- data/MELD/label_vocab.pkl +0 -0
- data/MELD/speaker_vocab.pkl +0 -0
- data/MELD/test_data_roberta.json.feature +3 -0
- data/MELD/test_data_roberta_mm.json.feature +3 -0
- data/MELD/train_data_roberta.json.feature +3 -0
- data/MELD/train_data_roberta_mm.json.feature +3 -0
- dataloader.py +81 -0
- dataset.py +230 -0
- evaluate.py +199 -0
- model.py +1199 -0
- model_utils.py +507 -0
- requirements.txt +7 -0
- run.py +239 -0
- saved_models/IEMOCAP/README.txt +2 -0
- saved_models/MELD/README.txt +2 -0
- saved_models/README.txt +2 -0
- similarity_matrix.py +101 -0
- trainer.py +171 -0
- utils.py +16 -0
.gitattributes
CHANGED
|
@@ -1,35 +1,4 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
# Auto detect text files and perform LF normalization
|
| 2 |
+
* text=auto
|
| 3 |
+
*.feature filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.json.feature filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.feature
|
| 2 |
+
*.pkl
|
README.md
CHANGED
|
@@ -1,3 +1,26 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Long-Short Distance Graph Neural Networks and Improved Curriculum Learning for Emotion Recognition in Conversation (Accepted by ECAI2025)
|
| 2 |
+
|
| 3 |
+
Emotion Recognition in Conversation (ERC) is a practical and challenging task. This paper proposes a novel multimodal approach, the Long-Short Distance Graph Neural Network (LSDGNN). Based on the Directed Acyclic Graph (DAG), it constructs a long-distance graph neural network and a short-distance graph neural network to obtain multimodal features of distant and nearby utterances, respectively. To ensure that long- and short-distance features are as distinct as possible in representation while enabling mutual influence between the two modules, we employ a Differential Regularizer and incorporate a BiAffine Module to facilitate feature interaction. In addition, we propose an Improved Curriculum Learning (ICL) to address the challenge of data imbalance. By computing the similarity between different emotions to emphasize the shifts in similar emotions, we design a "weighted emotional shift" metric and develop a difficulty measurer, enabling a training process that prioritizes learning easy samples before harder ones. Experimental results on the IEMOCAP and MELD datasets demonstrate that our model outperforms existing benchmarks.
|
| 4 |
+
|
| 5 |
+
## Requirements
|
| 6 |
+
Python 3.11
|
| 7 |
+
CUDA 12.2
|
| 8 |
+
|
| 9 |
+
After configuring the Python environment and CUDA, you can use `pip install -r requirements.txt` to install the following libraries.
|
| 10 |
+
|
| 11 |
+
torch==2.0.0+cu117
|
| 12 |
+
transformers==4.46.3
|
| 13 |
+
numpy==1.24.2
|
| 14 |
+
pandas==2.1.4
|
| 15 |
+
matplotlib==3.7.1
|
| 16 |
+
scikit-learn==1.2.2
|
| 17 |
+
tqdm==4.67.1
|
| 18 |
+
|
| 19 |
+
### Training
|
| 20 |
+
GPU NVIDIA GeForce RTX 3090
|
| 21 |
+
|
| 22 |
+
for IEMOCAP:
|
| 23 |
+
`python run.py --dataset_name IEMOCAP --gnn_layers 4 --lr 0.0005 --batch_size 16 --epochs 30 --dropout 0.4 --emb_dim 2948 --windowpl 5 --diffloss 0.1 --curriculum --bucket_number 5`
|
| 24 |
+
|
| 25 |
+
for MELD:
|
| 26 |
+
`python run.py --dataset_name MELD --gnn_layers 2 --lr 0.00001 --batch_size 64 --epochs 30 --dropout 0.1 --emb_dim 1666 --windowpl 5 --diffloss 0.2 --curriculum --bucket_number 12`
|
cl.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import similarity_matrix
|
| 3 |
+
|
class Dialog:
    """One conversation with per-utterance labels, speakers and features.

    On construction, computes a curriculum-learning difficulty score from
    similarity-weighted emotion shifts (see cc()).
    """

    def __init__(self, utterances, labels, speakers, features, dataset):
        self.utterances = utterances
        self.labels = labels            # int label ids; -1 means "no label"
        self.speakers = speakers
        self.features = features
        self.dataset = dataset          # 'MELD' or IEMOCAP-style; picks the label map
        self.numberofemotionshifts = 0
        self.numberofspeakers = 0
        self.numberofutterances = 0
        self.difficulty = 0
        self.emotion_variance = 0       # emotion-variation measure (currently unused)
        self.emotion_shift_weighted = 0 # similarity-weighted emotion-shift total
        self.cc()

    def __getitem__(self, item):
        # Dict-style access used by the Dataset's __getitem__.
        if item == 'utterances':
            return self.utterances
        elif item == 'labels':
            return self.labels
        elif item == 'speakers':
            return self.speakers
        elif item == 'features':
            return self.features
        # Fix: the original silently returned None for unknown keys,
        # which hides caller typos; fail loudly instead.
        raise KeyError(item)

    # Measure the difficulty of a dialog (curriculum learning).
    def cc(self):
        # Label-id -> emotion-name mapping; the id assignment differs per dataset.
        if self.dataset == 'MELD':
            emotion_map = {-1: 'null', 0: 'neutral', 1: 'surprise', 2: 'fear',
                           3: 'sadness', 4: 'joy', 5: 'disgust', 6: 'anger'}
        else:
            emotion_map = {-1: 'null', 0: 'excitement', 1: 'neutral', 2: 'frustration',
                           3: 'sadness', 4: 'happiness', 5: 'anger'}
        self.numberofutterances = len(self.utterances)

        # Group each speaker's emotion sequence in utterance order.
        speaker_emo = {}
        for i in range(len(self.labels)):
            speaker_emo.setdefault(self.speakers[i], []).append(emotion_map[self.labels[i]])

        # Emotion-similarity matrix used to weight the shifts.
        matrix, emotion_to_index = similarity_matrix.get_similarity_matrix(self.dataset)
        # Linear scaling of the similarity score.
        # With k > 0: a smaller score means a larger emotional gap, so shifts
        # between *similar* emotions are emphasized; with k < 0 the emphasis
        # is reversed (shifts between very different emotions dominate).
        k = 1
        b = 0.4
        for emotions in speaker_emo.values():
            for current_emo, next_emo in zip(emotions, emotions[1:]):
                if current_emo != next_emo and current_emo != 'null' and next_emo != 'null':
                    self.numberofemotionshifts += 1
                    current_emo_index = emotion_to_index[current_emo]
                    next_emo_index = emotion_to_index[next_emo]
                    similarity_score = abs(matrix[current_emo_index][next_emo_index]) * k + b
                    self.emotion_shift_weighted += similarity_score

        self.numberofspeakers = len(set(self.speakers))
        # "Weighted emotional shift" difficulty: weighted shifts plus speaker
        # count, normalized by dialog length plus speaker count.
        self.difficulty = (self.emotion_shift_weighted + self.numberofspeakers) / \
                          (self.numberofutterances + self.numberofspeakers)
data/IEMOCAP/dev_data_roberta.json.feature
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c8c1835251e4c85af23e65d1a312990b411d36574f9bd1063f6ca3d20d8f2eda
|
| 3 |
+
size 32689394
|
data/IEMOCAP/dev_data_roberta_mm.json.feature
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6bd2b4d3fd2a092c2f7b390a0c5115533bf4fb5b0724f49e76a84d98bf054bdb
|
| 3 |
+
size 62364912
|
data/IEMOCAP/label_vocab.pkl
ADDED
|
Binary file (98 Bytes). View file
|
|
|
data/IEMOCAP/speaker_vocab.pkl
ADDED
|
Binary file (7.61 kB). View file
|
|
|
data/IEMOCAP/test_data_roberta.json.feature
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6f7c81d1690f997cd02672b3dd1f077744c997c0446b9ce6599a54f37db4950f
|
| 3 |
+
size 51258384
|
data/IEMOCAP/test_data_roberta_mm.json.feature
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5bcf5c03d714bbf27105e72141a6e78f1d58278d794e46be82cee53beddcfab1
|
| 3 |
+
size 103033265
|
data/IEMOCAP/train_data_roberta.json.feature
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6299f2656d0d4e7a7c2e1f91c7e0f811088d500c6320b741ce742a72d6993c99
|
| 3 |
+
size 151409987
|
data/IEMOCAP/train_data_roberta_mm.json.feature
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cdc380726af732912b539b335233053a465237d29c7369477851bf8dc62e5f28
|
| 3 |
+
size 307542106
|
data/MELD/dev_data_roberta.json.feature
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2fe9a5f3b0464ff597061a833fa62cfe2337311b53699d1878bba1827106a0a8
|
| 3 |
+
size 29797677
|
data/MELD/dev_data_roberta_mm.json.feature
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:01930f39300566f37d433a60c323eedc2a8cfd53486d5f5cbda56371e85f8aff
|
| 3 |
+
size 42149762
|
data/MELD/label_vocab.pkl
ADDED
|
Binary file (128 Bytes). View file
|
|
|
data/MELD/speaker_vocab.pkl
ADDED
|
Binary file (5.08 kB). View file
|
|
|
data/MELD/test_data_roberta.json.feature
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0a60fe91031fe79749fa813336309ae5a1804561ff70dd5a6320029a4fdee0f0
|
| 3 |
+
size 70132507
|
data/MELD/test_data_roberta_mm.json.feature
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0cead63677c699daef4d382047e5592066922c6659d5bca9bc5ce57367dd65a7
|
| 3 |
+
size 99010213
|
data/MELD/train_data_roberta.json.feature
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:05d4c1be081d894d9f2baa44e3a86882ccaa77cd0783e9c9092b39ad31d3bc81
|
| 3 |
+
size 267972749
|
data/MELD/train_data_roberta_mm.json.feature
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:40b44a9d69baccc275fd3dcbb95aaf079ac32199d32b072c652e7691a5b46bf8
|
| 3 |
+
size 364368202
|
dataloader.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataset import *
|
| 2 |
+
import pickle
|
| 3 |
+
from torch.utils.data.sampler import SubsetRandomSampler
|
| 4 |
+
from torch.utils.data import DataLoader
|
| 5 |
+
import os
|
| 6 |
+
import argparse
|
| 7 |
+
import numpy as np
|
| 8 |
+
from transformers import BertTokenizer
|
| 9 |
+
|
| 10 |
+
def get_train_valid_sampler(trainset):
    """Return a random sampler covering every index of *trainset*."""
    return SubsetRandomSampler(list(range(len(trainset))))
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def load_vocab(dataset_name):
    """Load the speaker and label vocabularies for *dataset_name*.

    Reads data/<dataset_name>/speaker_vocab.pkl and label_vocab.pkl.
    Returns (speaker_vocab, label_vocab, person_vec); person_vec is always
    None — the personality-vector loading is disabled in this release.
    """
    # Fix: the original used pickle.load(open(...)), leaking file handles;
    # context managers close the files deterministically.
    with open('data/%s/speaker_vocab.pkl' % dataset_name, 'rb') as f:
        speaker_vocab = pickle.load(f)
    with open('data/%s/label_vocab.pkl' % dataset_name, 'rb') as f:
        label_vocab = pickle.load(f)
    person_vec = None
    return speaker_vocab, label_vocab, person_vec
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def get_IEMOCAP_loaders(dataset_name = 'IEMOCAP', batch_size=32, num_workers=0, pin_memory=False, args = None):
    """Build the dev and test DataLoaders plus the dataset vocabularies.

    Returns (valid_loader, test_loader, speaker_vocab, label_vocab, person_vec).
    """
    print('building vocab.. ')
    speaker_vocab, label_vocab, person_vec = load_vocab(dataset_name)

    print('building datasets..')
    devset = IEMOCAPDataset(dataset_name, 'dev', speaker_vocab, label_vocab, args)
    testset = IEMOCAPDataset(dataset_name, 'test', speaker_vocab, label_vocab, args)

    # Dev loader draws samples in random order; test loader keeps file order.
    valid_loader = DataLoader(
        devset,
        batch_size=batch_size,
        sampler=get_train_valid_sampler(devset),
        collate_fn=devset.collate_fn,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )
    test_loader = DataLoader(
        testset,
        batch_size=batch_size,
        collate_fn=testset.collate_fn,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )

    return valid_loader, test_loader, speaker_vocab, label_vocab, person_vec
|
| 55 |
+
# babystep_index is only meaningful when curriculum learning is enabled.
def get_train_loader(dataset_name = 'IEMOCAP', batch_size=32, num_workers=0, pin_memory=False, args = None, babystep_index = None):
    """Build the training DataLoader.

    When ``args.curriculum`` is set, ``babystep_index`` selects how many of
    the curriculum "buckets" of the difficulty-sorted training data to use.
    """
    print('building vocab.. ')
    speaker_vocab, label_vocab, person_vec = load_vocab(dataset_name)
    print('building datasets..')
    # Fix: both branches previously built identical DataLoaders; only the
    # dataset construction differs, so build the dataset once and share the
    # loader setup.
    if args.curriculum:
        trainset = IEMOCAPDataset(dataset_name, 'train', speaker_vocab, label_vocab, args, None, babystep_index)
    else:
        trainset = IEMOCAPDataset(dataset_name, 'train', speaker_vocab, label_vocab, args)
    train_sampler = get_train_valid_sampler(trainset)
    train_loader = DataLoader(trainset,
                              batch_size=batch_size,
                              sampler=train_sampler,
                              collate_fn=trainset.collate_fn,
                              num_workers=num_workers,
                              pin_memory=pin_memory)
    return train_loader
|
dataset.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from torch.utils.data import Dataset
|
| 3 |
+
from torch.nn.utils.rnn import pad_sequence
|
| 4 |
+
import pickle, pandas as pd
|
| 5 |
+
import json
|
| 6 |
+
import numpy as np
|
| 7 |
+
import random
|
| 8 |
+
from pandas import DataFrame
|
| 9 |
+
|
| 10 |
+
import cl
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class IEMOCAPDataset(Dataset):
    """Dialog-level dataset for IEMOCAP / MELD multimodal roberta features.

    When ``args.curriculum`` is set and ``split == 'train'``, dialogs are
    sorted from easy to hard and only the first ``babystep_index`` buckets
    are kept (baby-step curriculum learning).
    """

    def __init__(self, dataset_name = 'IEMOCAP', split = 'train', speaker_vocab=None, label_vocab=None, args = None, tokenizer = None, babystep_index = None):
        self.speaker_vocab = speaker_vocab  # speaker -> id vocabulary
        self.label_vocab = label_vocab      # emotion label -> id vocabulary
        self.args = args                    # run configuration
        self.data = self.read(dataset_name, split, tokenizer)
        if args.curriculum and split == 'train':
            # Baby-step curriculum: keep only the first babystep_index buckets.
            self.data = self.babystep(self.getbuckets(self.data, args.bucket_number), babystep_index)
            print(len(self.data))

        self.len = len(self.data)

    def read(self, dataset_name, split, tokenizer):
        """Load one split and wrap every conversation in a ``cl.Dialog``."""
        with open('data/%s/%s_data_roberta_mm.json.feature'%(dataset_name, split), encoding='utf-8') as f:
            raw_data = json.load(f)

        dialogs = []
        for d in raw_data:
            utterances, labels, speakers, features = [], [], [], []
            for u in d:
                utterances.append(u['text'])
                # -1 marks an unlabeled utterance.
                labels.append(self.label_vocab['stoi'][u['label']] if 'label' in u.keys() else -1)
                speakers.append(self.speaker_vocab['stoi'][u['speaker']])
                # Concatenation of the three modality [CLS] feature vectors.
                features.append(u['cls'][0] + u['cls'][1] + u['cls'][2])
            dialogs.append(cl.Dialog(utterances, labels, speakers, features, self.args.dataset_name))

        if self.args.curriculum and split == 'train':
            # Easy-to-hard ordering for curriculum learning.
            dialogs.sort(key=lambda dlg: dlg.difficulty)
        else:
            random.shuffle(dialogs)
        return dialogs

    def getbuckets(self, dialogs, num_buckets):
        """Split the (sorted) dialog list into ``num_buckets`` near-equal chunks."""
        chunk = (len(dialogs) + num_buckets - 1) // num_buckets
        buckets = [dialogs[start:start + chunk] for start in range(0, len(dialogs), chunk)]
        print('bucket')
        print(len(buckets))
        return buckets

    def babystep(self, buckets, index):
        """Concatenate the first ``index`` buckets into one training list."""
        data = []
        for i in range(index):
            data += buckets[i]
        return data

    def __getitem__(self, index):
        """Return (features, labels, speakers, length, utterances) for one dialog."""
        dialog = self.data[index]
        return (torch.FloatTensor(dialog['features']),
                torch.LongTensor(dialog['labels']),
                dialog['speakers'],
                len(dialog['labels']),
                dialog['utterances'])

    def __len__(self):
        return self.len

    def get_adj(self, speakers, max_dialog_len):
        """Adjacency linking each utterance to its nearest same-speaker and
        nearest other-speaker predecessor.

        :param speakers: (B, N)
        :return: (B, N, N); adj[:, i, :] marks the direct predecessors of node i.
        """
        adj = []
        for speaker in speakers:
            a = torch.zeros(max_dialog_len, max_dialog_len)
            for i, s in enumerate(speaker):
                found_local = False   # nearest predecessor by the same speaker
                found_global = False  # nearest predecessor by a different speaker
                for j in range(i - 1, -1, -1):
                    if speaker[j] == s and not found_local:
                        found_local = True
                        a[i, j] = 1
                    elif speaker[j] != s and not found_global:
                        found_global = True
                        a[i, j] = 1
                    if found_local and found_global:
                        break
            adj.append(a)
        return torch.stack(adj)

    def get_adj_v1(self, speakers, max_dialog_len):
        """Adjacency linking each utterance to every predecessor back to its
        ``args.windowps``-th previous same-speaker utterance (short window).

        :param speakers: (B, N)
        :return: (B, N, N); adj[:, i, :] marks the direct predecessors of node i.
        """
        adj = []
        for speaker in speakers:
            a = torch.zeros(max_dialog_len, max_dialog_len)
            for i, s in enumerate(speaker):
                same_speaker_hits = 0
                for j in range(i - 1, -1, -1):
                    a[i, j] = 1
                    if speaker[j] == s:
                        same_speaker_hits += 1
                        if same_speaker_hits == self.args.windowps:
                            break
            adj.append(a)
        return torch.stack(adj)

    def get_adj_v2(self, speakers, max_dialog_len):
        """Same construction as get_adj_v1 but windowed by ``args.windowpl``
        (long window).

        :param speakers: (B, N)
        :return: (B, N, N); adj[:, i, :] marks the direct predecessors of node i.
        """
        adj = []
        for speaker in speakers:
            a = torch.zeros(max_dialog_len, max_dialog_len)
            for i, s in enumerate(speaker):
                same_speaker_hits = 0
                for j in range(i - 1, -1, -1):
                    a[i, j] = 1  # every earlier utterance is a predecessor
                    if speaker[j] == s:
                        same_speaker_hits += 1
                        if same_speaker_hits == self.args.windowpl:
                            break
            adj.append(a)
        return torch.stack(adj)

    def get_s_mask(self, speakers, max_dialog_len):
        """Same-speaker masks.

        :return: s_mask (B, N, N) long tensor with 1 where the two utterances
                 share a speaker, and s_mask_onehot (B, N, N, 2), its one-hot
                 encoding (channel 1 = same speaker, channel 0 = different).
        """
        s_mask, s_mask_onehot = [], []
        for speaker in speakers:
            s = torch.zeros(max_dialog_len, max_dialog_len, dtype = torch.long)
            s_onehot = torch.zeros(max_dialog_len, max_dialog_len, 2)
            for i in range(len(speaker)):
                for j in range(len(speaker)):
                    if speaker[i] == speaker[j]:
                        s[i, j] = 1
                        s_onehot[i, j, 1] = 1
                    else:
                        s_onehot[i, j, 0] = 1
            s_mask.append(s)
            s_mask_onehot.append(s_onehot)
        return torch.stack(s_mask), torch.stack(s_mask_onehot)

    def collate_fn(self, data):
        """Pad a batch of dialogs.

        :param data: list of (features, labels, speakers, length, utterances)
        :return: features (B, N, D), labels (B, N) padded with -1,
                 adj_1 and adj_2 (B, N, N), s_mask (B, N, N),
                 s_mask_onehot (B, N, N, 2), lengths (B,),
                 speakers (B, N) padded with -1, utterances (list, not a tensor).
        """
        max_dialog_len = max(d[3] for d in data)
        features = pad_sequence([d[0] for d in data], batch_first = True)  # (B, N, D)
        labels = pad_sequence([d[1] for d in data], batch_first = True, padding_value = -1)  # (B, N)
        adj_1 = self.get_adj_v1([d[2] for d in data], max_dialog_len)
        adj_2 = self.get_adj_v2([d[2] for d in data], max_dialog_len)
        s_mask, s_mask_onehot = self.get_s_mask([d[2] for d in data], max_dialog_len)
        lengths = torch.LongTensor([d[3] for d in data])
        speakers = pad_sequence([torch.LongTensor(d[2]) for d in data], batch_first = True, padding_value = -1)
        utterances = [d[4] for d in data]

        return features, labels, adj_1, adj_2, s_mask, s_mask_onehot, lengths, speakers, utterances
|
evaluate.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
|
| 3 |
+
import numpy as np, argparse, time, pickle, random
|
| 4 |
+
import torch
|
| 5 |
+
import matplotlib
|
| 6 |
+
import torch.nn as nn
|
| 7 |
+
import torch.optim as optim
|
| 8 |
+
from dataloader import IEMOCAPDataset
|
| 9 |
+
from model import *
|
| 10 |
+
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, classification_report, \
|
| 11 |
+
precision_recall_fscore_support, ConfusionMatrixDisplay
|
| 12 |
+
import matplotlib.pyplot as plt
|
| 13 |
+
from trainer import train_or_eval_model, save_badcase
|
| 14 |
+
from dataset import IEMOCAPDataset
|
| 15 |
+
from dataloader import get_IEMOCAP_loaders
|
| 16 |
+
from transformers import AdamW
|
| 17 |
+
import copy
|
| 18 |
+
|
| 19 |
+
# We use seed = 100 for reproduction of the results reported in the paper.
seed = 100


def seed_everything(seed=seed):
    """Seed python, numpy and torch (CPU and CUDA) RNGs for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Force deterministic cuDNN kernels (reproducible, possibly slower).
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def evaluate(model, dataloader, cuda, args, speaker_vocab, label_vocab):
    """Run ``model`` over ``dataloader`` and print test accuracy / F1 scores.

    For IEMOCAP / MELD / EmoryNLP a weighted F1 is reported; otherwise
    (DailyDialog) micro-F1 over classes 1..6 (excluding the neutral class 0)
    and macro-F1 are reported. Padding positions (label == -1) are dropped
    before scoring. Prints the metrics and returns None.

    :param model: trained network; called as model(features, adj, s_mask,
        s_mask_onehot, lengths) and expected to return (B, N, C) scores.
    :param dataloader: yields (features, label, adj, s_mask, s_mask_onehot,
        lengths, speaker, utterances) batches.
    :param cuda: move batch tensors to the GPU when True.
    :param args: parsed CLI arguments; only ``dataset_name`` is used here.
    :param speaker_vocab: unused here; kept for interface compatibility.
    :param label_vocab: unused here; kept for interface compatibility.
    """
    preds, labels = [], []
    dialogs = []
    speakers = []

    model.eval()

    # Inference only: no_grad avoids building the autograd graph and cuts
    # memory use during evaluation.
    with torch.no_grad():
        for data in dataloader:
            features, label, adj, s_mask, s_mask_onehot, lengths, speaker, utterances = data
            if cuda:
                features = features.cuda()
                label = label.cuda()
                adj = adj.cuda()
                s_mask_onehot = s_mask_onehot.cuda()
                s_mask = s_mask.cuda()
                lengths = lengths.cuda()

            log_prob = model(features, adj, s_mask, s_mask_onehot, lengths)  # (B, N, C)

            preds += torch.argmax(log_prob, dim=2).cpu().numpy().tolist()  # (B, N)
            labels += label.cpu().numpy().tolist()  # (B, N)
            dialogs += utterances
            speakers += speaker

    if not preds:
        # Empty dataloader: nothing to score.
        return

    # Flatten per-dialog predictions, dropping padded positions (label == -1).
    new_preds, new_labels = [], []
    for dialog_labels, dialog_preds in zip(labels, preds):
        for gold, pred in zip(dialog_labels, dialog_preds):
            if gold != -1:
                new_labels.append(gold)
                new_preds.append(pred)

    avg_accuracy = round(accuracy_score(new_labels, new_preds) * 100, 2)
    if args.dataset_name in ['IEMOCAP', 'MELD', 'EmoryNLP']:
        avg_fscore = round(f1_score(new_labels, new_preds, average='weighted') * 100, 2)
        print('test_accuracy', avg_accuracy)
        print('test_f1', avg_fscore)
        return
    else:
        # DailyDialog convention: micro-F1 excludes the dominant neutral
        # class (label 0), hence labels=1..6.
        avg_micro_fscore = round(f1_score(new_labels, new_preds, average='micro', labels=list(range(1, 7))) * 100, 2)
        avg_macro_fscore = round(f1_score(new_labels, new_preds, average='macro') * 100, 2)
        print('test_accuracy', avg_accuracy)
        print('test_micro_f1', avg_micro_fscore)
        print('test_macro_f1', avg_macro_fscore)
        return
|
| 109 |
+
|
| 110 |
+
if __name__ == '__main__':
    # Evaluation entry point: parse CLI options, build the model, load a
    # trained checkpoint, and score it on the test split.

    parser = argparse.ArgumentParser()
    parser.add_argument('--bert_model_dir', type=str, default='')
    parser.add_argument('--bert_tokenizer_dir', type=str, default='')

    parser.add_argument('--state_dict_file', type=str, default='')

    parser.add_argument('--bert_dim', type=int, default=1024)
    parser.add_argument('--hidden_dim', type=int, default=300)
    parser.add_argument('--mlp_layers', type=int, default=2, help='Number of output mlp layers.')
    parser.add_argument('--gnn_layers', type=int, default=2, help='Number of gnn layers.')
    parser.add_argument('--emb_dim', type=int, default=1024, help='Feature size.')

    parser.add_argument('--attn_type', type=str, default='rgcn', choices=['dotprod', 'linear', 'bilinear', 'rgcn'], help='Feature size.')
    parser.add_argument('--no_rel_attn', action='store_true', default=False, help='no relation for edges')

    parser.add_argument('--max_sent_len', type=int, default=200,
                        help='max content length for each text, if set to 0, then the max length has no constrain')

    parser.add_argument('--no_cuda', action='store_true', default=False, help='does not use GPU')

    parser.add_argument('--dataset_name', default='IEMOCAP', type=str, help='dataset name, IEMOCAP or MELD or DailyDialog')

    parser.add_argument('--windowp', type=int, default=1,
                        help='context window size for constructing edges in graph model for past utterances')

    parser.add_argument('--windowf', type=int, default=0,
                        help='context window size for constructing edges in graph model for future utterances')

    parser.add_argument('--max_grad_norm', type=float, default=5.0, help='Gradient clipping.')

    parser.add_argument('--lr', type=float, default=1e-3, metavar='LR', help='learning rate')

    parser.add_argument('--dropout', type=float, default=0, metavar='dropout', help='dropout rate')

    parser.add_argument('--batch_size', type=int, default=8, metavar='BS', help='batch size')

    parser.add_argument('--epochs', type=int, default=20, metavar='E', help='number of epochs')

    parser.add_argument('--tensorboard', action='store_true', default=False, help='Enables tensorboard log')

    parser.add_argument('--nodal_att_type', type=str, default=None, choices=['global', 'past'],
                        help='type of nodal attention')

    parser.add_argument('--curriculum', action='store_true', default=False, help='Enables curriculum learning')

    parser.add_argument('--bucket_number', type=int, default=0, help='Number of buckets using')

    args = parser.parse_args()
    print(args)

    seed_everything()

    args.cuda = torch.cuda.is_available() and not args.no_cuda

    if args.cuda:
        print('Running on GPU')
    else:
        print('Running on CPU')

    if args.tensorboard:
        from tensorboardX import SummaryWriter

        writer = SummaryWriter()

    cuda = args.cuda
    n_epochs = args.epochs
    batch_size = args.batch_size

    # Build the evaluation dataloaders and vocabularies for the chosen dataset.
    valid_loader, test_loader, speaker_vocab, label_vocab, person_vec = get_IEMOCAP_loaders(
        dataset_name=args.dataset_name, batch_size=batch_size, num_workers=0, args=args)
    n_classes = len(label_vocab['itos'])

    print('building model..')
    model = DAGERC_fushion(args, n_classes)

    if torch.cuda.device_count() > 1:
        print('Multi-GPU...........')
        model = nn.DataParallel(model, device_ids=range(torch.cuda.device_count()))
    if cuda:
        model.cuda()

    # map_location lets a GPU-saved checkpoint load on a CPU-only machine;
    # with CUDA available, tensors are restored to their saved devices.
    state_dict = torch.load(args.state_dict_file,
                            map_location=None if cuda else torch.device('cpu'))
    model.load_state_dict(state_dict)
    evaluate(model, test_loader, cuda, args, speaker_vocab, label_vocab)
|
model.py
ADDED
|
@@ -0,0 +1,1199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
import numpy as np, itertools, random, copy, math
|
| 5 |
+
from transformers import BertModel, BertConfig
|
| 6 |
+
from transformers import AutoTokenizer, AutoModelWithLMHead
|
| 7 |
+
from model_utils import *
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class BertERC(nn.Module):
    """Baseline ERC classifier: a pre-trained BERT encoder with an MLP head.

    The pooled BERT output for each input sequence is passed through dropout
    and a small feed-forward classifier to produce per-utterance class scores.
    """

    def __init__(self, args, num_class):
        super().__init__()
        self.args = args

        self.dropout = nn.Dropout(args.dropout)

        # Pre-trained BERT encoder, configured from the on-disk config file.
        self.bert_config = BertConfig.from_json_file(args.bert_model_dir + 'config.json')
        self.bert = BertModel.from_pretrained(args.home_dir + args.bert_model_dir, config=self.bert_config)

        # Classification head: Linear+ReLU stack ending in a class projection.
        hidden = args.hidden_dim
        head = [nn.Linear(args.bert_dim, hidden), nn.ReLU()]
        for _ in range(args.mlp_layers - 1):
            head.append(nn.Linear(hidden, hidden))
            head.append(nn.ReLU())
        head.append(nn.Linear(hidden, num_class))
        self.out_mlp = nn.Sequential(*head)

    def forward(self, content_ids, token_types, utterance_len, seq_len):
        """Return (N, num_class) scores for the batch of token id sequences.

        ``token_types``, ``utterance_len`` and ``seq_len`` are accepted for
        interface compatibility but not used: the encoder is deliberately
        called without token_type_ids.
        """
        pooled = self.bert(content_ids)[1]  # pooled [CLS] representation, (N, D)
        pooled = self.dropout(pooled)
        return self.out_mlp(pooled)  # (N, num_class)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class DAGERC(nn.Module):
    """DAG-structured ERC model: per-layer GAT aggregation over preceding
    utterances combined with a GRU update, followed by an MLP classifier
    over the concatenation of all layer representations and the raw features.
    """

    def __init__(self, args, num_class):
        """Build attention modules, per-layer GRU cells, and the output MLP.

        :param args: parsed CLI arguments (hidden_dim, gnn_layers, attn_type,
            no_rel_attn, emb_dim, mlp_layers, dropout, ...).
        :param num_class: number of emotion classes to predict.
        """
        super().__init__()
        self.args = args
        # gcn layer

        self.dropout = nn.Dropout(args.dropout)

        self.gnn_layers = args.gnn_layers

        # Relation-aware attention embeds the binary same-speaker mask.
        if not args.no_rel_attn:
            self.rel_emb = nn.Embedding(2,args.hidden_dim)
            self.rel_attn = True
        else:
            self.rel_attn = False

        # One attention (GAT) module per GNN layer.
        if self.args.attn_type == 'linear':
            gats = []
            for _ in range(args.gnn_layers):
                gats += [GatLinear(args.hidden_dim) if args.no_rel_attn else GatLinear_rel(args.hidden_dim)]
            self.gather = nn.ModuleList(gats)
        else:
            # NOTE(review): sibling class DAGERC_fushion uses `GatDot`/`GatDot_rel`
            # from model_utils; the lowercase `Gatdot`/`Gatdot_rel` here may be a
            # stale name — confirm against model_utils before relying on this branch.
            gats = []
            for _ in range(args.gnn_layers):
                gats += [Gatdot(args.hidden_dim) if args.no_rel_attn else Gatdot_rel(args.hidden_dim)]
            self.gather = nn.ModuleList(gats)

        # One GRU cell per GNN layer, updating node states with aggregated info.
        grus = []
        for _ in range(args.gnn_layers):
            grus += [nn.GRUCell(args.hidden_dim, args.hidden_dim)]
        self.grus = nn.ModuleList(grus)

        # Projects input utterance features into the hidden dimension.
        self.fc1 = nn.Linear(args.emb_dim, args.hidden_dim)

        # Classifier input: all (gnn_layers + 1) hidden states + raw features.
        in_dim = args.hidden_dim * (args.gnn_layers + 1) + args.emb_dim
        # output mlp layers
        layers = [nn.Linear(in_dim, args.hidden_dim), nn.ReLU()]
        for _ in range(args.mlp_layers - 1):
            layers += [nn.Linear(args.hidden_dim, args.hidden_dim), nn.ReLU()]
        layers += [nn.Linear(args.hidden_dim, num_class)]

        self.out_mlp = nn.Sequential(*layers)

    def forward(self, features, adj,s_mask):
        '''
        Run the layered DAG recurrence and classify every utterance.

        :param features: (B, N, D) utterance feature vectors
        :param adj: (B, N, N) DAG adjacency (edges from past utterances)
        :param s_mask: (B, N, N) binary same-speaker mask between utterances
        :return: (B, N, num_class) classification logits
        '''
        num_utter = features.size()[1]
        if self.rel_attn:
            rel_ft = self.rel_emb(s_mask) # (B, N, N, D)

        H0 = F.relu(self.fc1(features)) # (B, N, D)
        H = [H0]
        for l in range(self.args.gnn_layers):
            # Seed the layer with the first utterance's GRU update (no predecessors).
            H1 = self.grus[l](H[l][:,0,:]).unsqueeze(1) # (B, 1, D)
            for i in range(1, num_utter):
                # Aggregate information M from all preceding utterances (< i)
                # via attention over the already-updated states H1.
                if not self.rel_attn:
                    _, M = self.gather[l](H[l][:,i,:], H1, H1, adj[:,i,:i])
                else:
                    _, M = self.gather[l](H[l][:, i, :], H1, H1, adj[:, i, :i], rel_ft[:, i, :i, :])
                # GRU-update utterance i with M and append it to this layer's states.
                H1 = torch.cat((H1 , self.grus[l](H[l][:,i,:], M).unsqueeze(1)), dim = 1)
                # print('H1', H1.size())
                # print('----------------------------------------------------')
            H.append(H1)
            H0 = H1
        H.append(features)
        # Concatenate every layer's states plus the raw features per utterance.
        H = torch.cat(H, dim = 2) #(B, N, l*D)
        logits = self.out_mlp(H)
        return logits
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class DAGERC_fushion(nn.Module):
    """DAG-ERC model with fused node updates: each utterance state is the sum
    of a context GRU update (grus_c) and a complementary update with swapped
    input/state roles (grus_p). Representations from every GNN layer plus the
    raw features are concatenated, optionally re-weighted by nodal attention,
    and classified by an MLP.
    """

    def __init__(self, args, num_class):
        """Build per-layer attention modules, paired GRU cells, and the
        classification head.

        :param args: parsed CLI arguments (hidden_dim, gnn_layers, attn_type,
            no_rel_attn, emb_dim, mlp_layers, dropout, nodal_att_type, ...).
        :param num_class: number of emotion classes to predict.
        """
        super().__init__()
        self.args = args
        # gcn layer

        self.dropout = nn.Dropout(args.dropout)

        self.gnn_layers = args.gnn_layers

        if not args.no_rel_attn:
            self.rel_attn = True
        else:
            self.rel_attn = False

        # One attention (GAT) module per GNN layer, variant chosen by attn_type.
        if self.args.attn_type == 'linear':
            gats = []
            for _ in range(args.gnn_layers):
                gats += [GatLinear(args.hidden_dim) if args.no_rel_attn else GatLinear_rel(args.hidden_dim)]
            self.gather = nn.ModuleList(gats)
        elif self.args.attn_type == 'dotprod':
            gats = []
            for _ in range(args.gnn_layers):
                gats += [GatDot(args.hidden_dim) if args.no_rel_attn else GatDot_rel(args.hidden_dim)]
            self.gather = nn.ModuleList(gats)
        elif self.args.attn_type == 'rgcn':
            gats = []
            for _ in range(args.gnn_layers):
                # gats += [GAT_dialoggcn(args.hidden_dim)]
                gats += [GAT_dialoggcn_v1(args.hidden_dim)]
            self.gather = nn.ModuleList(gats)

        # Context GRUs: update node state from aggregated neighbor info M.
        grus_c = []
        for _ in range(args.gnn_layers):
            grus_c += [nn.GRUCell(args.hidden_dim, args.hidden_dim)]
        self.grus_c = nn.ModuleList(grus_c)

        # Paired GRUs with swapped roles: M as input, node feature as state.
        grus_p = []
        for _ in range(args.gnn_layers):
            grus_p += [nn.GRUCell(args.hidden_dim, args.hidden_dim)]
        self.grus_p = nn.ModuleList(grus_p)

        # Fusion linears (currently unused in forward — the C+P sum is used
        # instead; kept registered, see commented alternatives in forward).
        fcs = []
        for _ in range(args.gnn_layers):
            fcs += [nn.Linear(args.hidden_dim * 2, args.hidden_dim)]
        self.fcs = nn.ModuleList(fcs)

        # Projects input utterance features into the hidden dimension.
        self.fc1 = nn.Linear(args.emb_dim, args.hidden_dim)

        self.nodal_att_type = args.nodal_att_type

        # Classifier input: all (gnn_layers + 1) hidden states + raw features.
        in_dim = args.hidden_dim * (args.gnn_layers + 1) + args.emb_dim

        # output mlp layers
        layers = [nn.Linear(in_dim, args.hidden_dim), nn.ReLU()]
        for _ in range(args.mlp_layers - 1):
            layers += [nn.Linear(args.hidden_dim, args.hidden_dim), nn.ReLU()]
        layers += [self.dropout]
        layers += [nn.Linear(args.hidden_dim, num_class)]

        self.out_mlp = nn.Sequential(*layers)

        self.attentive_node_features = attentive_node_features(in_dim)

    def forward(self, features, adj,s_mask,s_mask_onehot, lengths):
        '''
        Run the layered DAG recurrence with fused (C + P) node updates and
        classify every utterance.

        :param features: (B, N, D) utterance feature vectors
        :param adj: (B, N, N) DAG adjacency (edges from past utterances)
        :param s_mask: (B, N, N) binary same-speaker mask
        :param s_mask_onehot: (B, N, N, 2) one-hot form of s_mask (only used
            by the commented-out aggregation variant)
        :param lengths: (B,) true dialogue lengths for nodal attention masking
        :return: (B, N, num_class) classification logits
        '''
        num_utter = features.size()[1]

        H0 = F.relu(self.fc1(features))
        # H0 = self.dropout(H0)
        H = [H0]
        for l in range(self.args.gnn_layers):
            # First utterance has no predecessors: GRU update with no input,
            # and a zero aggregation vector M for the paired update P.
            C = self.grus_c[l](H[l][:,0,:]).unsqueeze(1)
            M = torch.zeros_like(C).squeeze(1)
            # P = M.unsqueeze(1)
            P = self.grus_p[l](M, H[l][:,0,:]).unsqueeze(1)
            #H1 = F.relu(self.fcs[l](torch.cat((C,P) , dim = 2)))
            #H1 = F.relu(C+P)
            H1 = C+P
            for i in range(1, num_utter):
                # print(i,num_utter)
                # Aggregate info M from the already-updated states of all
                # preceding utterances (< i) via the layer's attention module.
                if self.args.attn_type == 'rgcn':
                    _, M = self.gather[l](H[l][:,i,:], H1, H1, adj[:,i,:i], s_mask[:,i,:i])
                    # _, M = self.gather[l](H[l][:,i,:], H1, H1, adj[:,i,:i], s_mask_onehot[:,i,:i,:])
                else:
                    if not self.rel_attn:
                        _, M = self.gather[l](H[l][:,i,:], H1, H1, adj[:,i,:i])
                    else:
                        _, M = self.gather[l](H[l][:,i,:], H1, H1, adj[:,i,:i], s_mask[:, i, :i])

                # C: node state updated by M; P: same pair with roles swapped.
                C = self.grus_c[l](H[l][:,i,:], M).unsqueeze(1)
                P = self.grus_p[l](M, H[l][:,i,:]).unsqueeze(1)
                # P = M.unsqueeze(1)
                #H_temp = F.relu(self.fcs[l](torch.cat((C,P) , dim = 2)))
                #H_temp = F.relu(C+P)
                H_temp = C+P
                H1 = torch.cat((H1 , H_temp), dim = 1)
                # print('H1', H1.size())
                # print('----------------------------------------------------')
            H.append(H1)
        H.append(features)

        # Concatenate every layer's states plus the raw features per utterance.
        H = torch.cat(H, dim = 2)

        # Optional nodal attention over the dialogue (type from args; may be None).
        H = self.attentive_node_features(H,lengths,self.nodal_att_type)

        logits = self.out_mlp(H)

        return logits
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
#仅仅使用最后一层的short和long,concat;只用过去特征
|
| 248 |
+
#Only use the final layer's short and long features, concatenated; use only past features.
|
| 249 |
+
class DAGERC_new_1(nn.Module):
|
| 250 |
+
|
| 251 |
+
def __init__(self, args, num_class):
|
| 252 |
+
super().__init__()
|
| 253 |
+
self.args = args
|
| 254 |
+
# gcn layer
|
| 255 |
+
|
| 256 |
+
self.dropout = nn.Dropout(args.dropout)
|
| 257 |
+
|
| 258 |
+
self.gnn_layers = args.gnn_layers
|
| 259 |
+
|
| 260 |
+
if not args.no_rel_attn:
|
| 261 |
+
self.rel_attn = True
|
| 262 |
+
else:
|
| 263 |
+
self.rel_attn = False
|
| 264 |
+
|
| 265 |
+
if self.args.attn_type == 'linear':
|
| 266 |
+
gats = []
|
| 267 |
+
for _ in range(args.gnn_layers):
|
| 268 |
+
gats += [GatLinear(args.hidden_dim) if args.no_rel_attn else GatLinear_rel(args.hidden_dim)]
|
| 269 |
+
self.gather = nn.ModuleList(gats)
|
| 270 |
+
elif self.args.attn_type == 'dotprod':
|
| 271 |
+
gats = []
|
| 272 |
+
for _ in range(args.gnn_layers):
|
| 273 |
+
gats += [GatDot(args.hidden_dim) if args.no_rel_attn else GatDot_rel(args.hidden_dim)]
|
| 274 |
+
self.gather = nn.ModuleList(gats)
|
| 275 |
+
elif self.args.attn_type == 'rgcn':
|
| 276 |
+
#短距离
|
| 277 |
+
gats_short = []
|
| 278 |
+
gats_long = []
|
| 279 |
+
for _ in range(args.gnn_layers):
|
| 280 |
+
gats_short += [GAT_dialoggcn_v1(args.hidden_dim)]
|
| 281 |
+
for _ in range(args.gnn_layers):
|
| 282 |
+
gats_long += [GAT_dialoggcn_v1(args.hidden_dim)]
|
| 283 |
+
self.gather_short = nn.ModuleList(gats_short)
|
| 284 |
+
self.gather_long = nn.ModuleList(gats_long)
|
| 285 |
+
|
| 286 |
+
# 近距离 GRU
|
| 287 |
+
grus_c_short = []
|
| 288 |
+
for _ in range(args.gnn_layers):
|
| 289 |
+
grus_c_short += [nn.GRUCell(args.hidden_dim, args.hidden_dim)]
|
| 290 |
+
self.grus_c_short = nn.ModuleList(grus_c_short)
|
| 291 |
+
|
| 292 |
+
# 远距离 GRU
|
| 293 |
+
grus_c_long = []
|
| 294 |
+
for _ in range(args.gnn_layers):
|
| 295 |
+
grus_c_long += [nn.GRUCell(args.hidden_dim, args.hidden_dim)]
|
| 296 |
+
self.grus_c_long = nn.ModuleList(grus_c_long)
|
| 297 |
+
|
| 298 |
+
grus_p_short = []
|
| 299 |
+
for _ in range(args.gnn_layers):
|
| 300 |
+
grus_p_short += [nn.GRUCell(args.hidden_dim, args.hidden_dim)]
|
| 301 |
+
self.grus_p_short = nn.ModuleList(grus_p_short)
|
| 302 |
+
|
| 303 |
+
grus_p_long = []
|
| 304 |
+
for _ in range(args.gnn_layers):
|
| 305 |
+
grus_p_long += [nn.GRUCell(args.hidden_dim, args.hidden_dim)]
|
| 306 |
+
self.grus_p_long = nn.ModuleList(grus_p_long)
|
| 307 |
+
|
| 308 |
+
#近距离全链接层
|
| 309 |
+
fcs_short = []
|
| 310 |
+
for _ in range(args.gnn_layers):
|
| 311 |
+
fcs_short += [nn.Linear(args.hidden_dim * 2, args.hidden_dim)]
|
| 312 |
+
self.fcs_short = nn.ModuleList(fcs_short)
|
| 313 |
+
|
| 314 |
+
# 远距离全连接层
|
| 315 |
+
fcs_long = []
|
| 316 |
+
for _ in range(args.gnn_layers):
|
| 317 |
+
fcs_long += [nn.Linear(args.hidden_dim * 2, args.hidden_dim)]
|
| 318 |
+
self.fcs_long = nn.ModuleList(fcs_long)
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
self.fc1 = nn.Linear(args.emb_dim, args.hidden_dim)
|
| 322 |
+
|
| 323 |
+
self.nodal_att_type = args.nodal_att_type
|
| 324 |
+
|
| 325 |
+
in_dim = ((args.hidden_dim*2)+ args.emb_dim)
|
| 326 |
+
# print(in_dim)
|
| 327 |
+
# output mlp layers
|
| 328 |
+
layers = [nn.Linear(in_dim, args.hidden_dim), nn.ReLU()]
|
| 329 |
+
for _ in range(args.mlp_layers - 1):
|
| 330 |
+
layers += [nn.Linear(args.hidden_dim, args.hidden_dim), nn.ReLU()]
|
| 331 |
+
layers += [self.dropout]
|
| 332 |
+
layers += [nn.Linear(args.hidden_dim, num_class)]
|
| 333 |
+
|
| 334 |
+
self.out_mlp = nn.Sequential(*layers)
|
| 335 |
+
|
| 336 |
+
self.attentive_node_features = attentive_node_features(in_dim)
|
| 337 |
+
|
| 338 |
+
self.affine1 = nn.Parameter(torch.empty(size=((args.hidden_dim) , (args.hidden_dim) )))
|
| 339 |
+
nn.init.xavier_uniform_(self.affine1.data, gain=1.414)
|
| 340 |
+
self.affine2 = nn.Parameter(torch.empty(size=((args.hidden_dim) , (args.hidden_dim) )))
|
| 341 |
+
nn.init.xavier_uniform_(self.affine2.data, gain=1.414)
|
| 342 |
+
|
| 343 |
+
self.diff_loss = DiffLoss(args)
|
| 344 |
+
self.beta = args.diffloss
|
| 345 |
+
|
| 346 |
+
def forward(self, features, adj_1, adj_2 ,s_mask, s_mask_onehot, lengths):
|
| 347 |
+
# 检查 H1 和 H2 是否完全相等
|
| 348 |
+
are_equal = all(torch.equal(h1, h2) for h1, h2 in zip(adj_1, adj_2))
|
| 349 |
+
# print("adj1 和 adj2 是否完全相等:", are_equal)
|
| 350 |
+
# print('adj1',adj_1)
|
| 351 |
+
# print('----------------------------------------------------')
|
| 352 |
+
|
| 353 |
+
# print('adj2',adj_2)
|
| 354 |
+
# print('----------------------------------------------------')
|
| 355 |
+
|
| 356 |
+
num_utter = features.size()[1]
|
| 357 |
+
|
| 358 |
+
H0 = F.relu(self.fc1(features))
|
| 359 |
+
#print('H0', H0.size())
|
| 360 |
+
# H0 = self.dropout(H0)
|
| 361 |
+
H = [H0]
|
| 362 |
+
H_combined_short_list = []
|
| 363 |
+
#对短距离特征进行处理
|
| 364 |
+
for l in range(self.args.gnn_layers):
|
| 365 |
+
C = self.grus_c_short[l](H[l][:,0,:]).unsqueeze(1) #针对每一层的第一个节点,使用 GRU 单元更新节点特征并聚合信息。
|
| 366 |
+
M = torch.zeros_like(C).squeeze(1) #初始化一个聚合信息张量 M(全零张量),并使用它与节点特征结合生成额外的特征 P。
|
| 367 |
+
# P = M.unsqueeze(1)
|
| 368 |
+
P = self.grus_p_short[l](M, H[l][:,0,:]).unsqueeze(1) #使用 M(全零张量)和第一个节点的特征 H[l][:, 0, :] 作为输入,得到额外特征 P,形状为 (B, D)
|
| 369 |
+
#H1 = F.relu(self.fcs[l](torch.cat((C,P) , dim = 2)))
|
| 370 |
+
#H1 = F.relu(C+P)
|
| 371 |
+
H1 = C+P#将更新后的特征 C 与额外特征 P 相加,生成新的节点特征 H1,为后续层的计算做准备。
|
| 372 |
+
for i in range(1, num_utter):
|
| 373 |
+
# print(i,num_utter)
|
| 374 |
+
if self.args.attn_type == 'rgcn':
|
| 375 |
+
#将 H[l][:, i, :](当前节点特征),H1(之前节点的特征聚合结果),adj[:, i, :i](当前节点与之前节点的邻接矩阵)
|
| 376 |
+
#s_mask[:, i, :i](当前节点的掩码),得到聚合结果 M
|
| 377 |
+
_, M = self.gather_short[l](H[l][:,i,:], H1, H1, adj_1[:,i,:i], s_mask[:,i,:i])
|
| 378 |
+
# _, M = self.gather[l](H[l][:,i,:], H1, H1, adj[:,i,:i], s_mask_onehot[:,i,:i,:])
|
| 379 |
+
else:
|
| 380 |
+
if not self.rel_attn:
|
| 381 |
+
_, M = self.gather_short[l](H[l][:,i,:], H1, H1, adj_1[:,i,:i])
|
| 382 |
+
else:
|
| 383 |
+
_, M = self.gather_short[l](H[l][:,i,:], H1, H1, adj_1[:,i,:i], s_mask[:, i, :i])
|
| 384 |
+
|
| 385 |
+
#使用 GRU 单元 self.grus_c[l] 来处理当前节点的特征 H[l][:, i, :] 和聚合后的特征 M,得到新的特��� C。
|
| 386 |
+
# 这表明当前节点的特征更新与其邻居的聚合信息有关。
|
| 387 |
+
C = self.grus_c_short[l](H[l][:,i,:], M).unsqueeze(1)
|
| 388 |
+
#使用另一个 GRU 单元 self.grus_p[l] 来处理聚合特征 M 和当前节点的特征 H[l][:, i, :],得到额外的特征 P。
|
| 389 |
+
P = self.grus_p_short[l](M, H[l][:,i,:]).unsqueeze(1)
|
| 390 |
+
# P = M.unsqueeze(1)
|
| 391 |
+
#H_temp = F.relu(self.fcs[l](torch.cat((C,P) , dim = 2)))
|
| 392 |
+
#H_temp = F.relu(C+P)
|
| 393 |
+
H_temp = C+P#将更新后的特征 C 和额外特征 P 进行相加,生成新的节点特征 H_temp
|
| 394 |
+
H1 = torch.cat((H1 , H_temp), dim = 1) #将当前节点的特征 H_temp 拼接到 H1 中。
|
| 395 |
+
# print('H1', H1.size())
|
| 396 |
+
#print('----------------------------------------------------')
|
| 397 |
+
H.append(H1)
|
| 398 |
+
H_combined_short_list.append(H[l+1])
|
| 399 |
+
'''
|
| 400 |
+
下面对长距离特征进行处理 The following processes the long-distance features.
|
| 401 |
+
'''
|
| 402 |
+
H_long = [H0] # 初始化 H_long
|
| 403 |
+
H_combined_long_list = [] # 存储长距离处理的结果
|
| 404 |
+
|
| 405 |
+
# 对长距离特征进行处理
|
| 406 |
+
for l in range(self.args.gnn_layers):
|
| 407 |
+
C_long = self.grus_c_long[l](H_long[l][:,0,:]).unsqueeze(1) # 使用 GRU 更新长距离的第一个节点
|
| 408 |
+
M_long = torch.zeros_like(C_long).squeeze(1) # 初始化长距离的聚合信息张量 M_long
|
| 409 |
+
P_long = self.grus_p_long[l](M_long, H_long[l][:,0,:]).unsqueeze(1) # 生成额外的特征 P_long
|
| 410 |
+
|
| 411 |
+
H1_long = C_long + P_long # 生成新的长距离节点特征 H1_long
|
| 412 |
+
for i in range(1, num_utter):
|
| 413 |
+
# 依据不同的 attention 类型,进行特征聚合
|
| 414 |
+
if self.args.attn_type == 'rgcn':
|
| 415 |
+
_, M_long = self.gather_long[l](H_long[l][:,i,:], H1_long, H1_long, adj_2[:,i,:i], s_mask[:,i,:i])
|
| 416 |
+
else:
|
| 417 |
+
if not self.rel_attn:
|
| 418 |
+
_, M_long = self.gather_long[l](H_long[l][:,i,:], H1_long, H1_long, adj_2[:,i,:i])
|
| 419 |
+
else:
|
| 420 |
+
_, M_long = self.gather_long[l](H_long[l][:,i,:], H1_long, H1_long, adj_2[:,i,:i], s_mask[:,i,:i])
|
| 421 |
+
|
| 422 |
+
# 使用 GRU 更新当前节点的特征 C_long 和 M_long
|
| 423 |
+
C_long = self.grus_c_long[l](H_long[l][:,i,:], M_long).unsqueeze(1)
|
| 424 |
+
P_long = self.grus_p_long[l](M_long, H_long[l][:,i,:]).unsqueeze(1)
|
| 425 |
+
|
| 426 |
+
H_temp_long = C_long + P_long # 将更新后的特征 C_long 和 P_long 相加生成新特征
|
| 427 |
+
H1_long = torch.cat((H1_long, H_temp_long), dim=1) # 将特征拼接到 H1_long 中
|
| 428 |
+
H_long.append(H1_long) # 更新 H_long 列表
|
| 429 |
+
H_combined_long_list.append(H_long[l+1])
|
| 430 |
+
|
| 431 |
+
'''
|
| 432 |
+
两个通道特征都提取完毕! Both short- and long-distance channel features have been extracted!
|
| 433 |
+
'''
|
| 434 |
+
# print('H_combined_short_list',H_combined_short_list)
|
| 435 |
+
# print('H_combined_long_list',H_combined_long_list)
|
| 436 |
+
# are_equal = all(torch.equal(h1, h2) for h1, h2 in zip(H_combined_short_list, H_combined_long_list))
|
| 437 |
+
# print("H_combined_short_list 和 H_combined_long_list 是否完全相等:", are_equal)
|
| 438 |
+
# for idx, tensor in enumerate(H_combined_short_list):
|
| 439 |
+
# print(f"H_combined_short_list[{idx}] shape: {tensor.shape}")
|
| 440 |
+
H_final = []
|
| 441 |
+
# print("H2 shape:", H2.shape)
|
| 442 |
+
# 计算差异正则化损失
|
| 443 |
+
diff_loss = 0
|
| 444 |
+
for l in range(self.args.gnn_layers):
|
| 445 |
+
# print('周期:', l)
|
| 446 |
+
HShort_prime = H_combined_short_list[l]
|
| 447 |
+
HLong_prime = H_combined_long_list[l]
|
| 448 |
+
# print("HShort_prime:", HShort_prime)
|
| 449 |
+
# print("HLong_prime:", HLong_prime)
|
| 450 |
+
# print("HShort_prime shape:", HShort_prime.shape)
|
| 451 |
+
# print("HLong_prime shape:", HLong_prime.shape)
|
| 452 |
+
diff_loss = self.diff_loss(HShort_prime, HLong_prime) + diff_loss
|
| 453 |
+
# print("diff_loss:", diff_loss)
|
| 454 |
+
# print(diff_loss.item())
|
| 455 |
+
# 互交叉注意力机制
|
| 456 |
+
A1 = F.softmax(torch.bmm(torch.matmul(HShort_prime, self.affine1), torch.transpose(HLong_prime, 1, 2)), dim=2)
|
| 457 |
+
A2 = F.softmax(torch.bmm(torch.matmul(HLong_prime, self.affine2), torch.transpose(HShort_prime, 1, 2)), dim=2)
|
| 458 |
+
|
| 459 |
+
HShort_prime_new = torch.bmm(A1, HLong_prime) # 更新的短时特征
|
| 460 |
+
HLong_prime_new = torch.bmm(A2, HShort_prime) # 更新的长时特征
|
| 461 |
+
|
| 462 |
+
HShort_prime_out = self.dropout(HShort_prime_new) if l < self.args.gnn_layers - 1 else HShort_prime_new
|
| 463 |
+
HLong_prime_out = self.dropout(HLong_prime_new) if l <self.args.gnn_layers - 1 else HLong_prime_new
|
| 464 |
+
|
| 465 |
+
H_final.append(HShort_prime_out)
|
| 466 |
+
H_final.append(HLong_prime_out)
|
| 467 |
+
H_final.append(features)
|
| 468 |
+
|
| 469 |
+
H_final = torch.cat([H_final[-3],H_final[-2],H_final[-1]], dim = 2)
|
| 470 |
+
# print("H shape:", H.shape)
|
| 471 |
+
# print("H:", H.shape)
|
| 472 |
+
# print("H_final shape after cat:", H_final.shape)
|
| 473 |
+
H_final = self.attentive_node_features(H_final,lengths,self.nodal_att_type)
|
| 474 |
+
# print("H_final shape after attentive_node_features:", H_final.shape)
|
| 475 |
+
logits = self.out_mlp(H_final)
|
| 476 |
+
# print(diff_loss)
|
| 477 |
+
return logits, self.beta * diff_loss
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
#仅仅使用最后一层的short和long,concat;使用了过去和未来双特征
|
| 481 |
+
#Only the final-layer short and long features are used and concatenated; both past and future features are utilized.
|
| 482 |
+
class DAGERC_new_2(nn.Module):
    """Dual-channel (short-/long-distance) DAG-ERC variant, bidirectional.

    Only the FINAL GNN layer's short- and long-distance features are used and
    concatenated; each channel is run in both the forward (past->future) and
    reversed (future->past) utterance order, so both past and future context
    contribute ("dual features").
    """

    def __init__(self, args, num_class):
        super().__init__()
        self.args = args
        # gcn layer

        self.dropout = nn.Dropout(args.dropout)

        self.gnn_layers = args.gnn_layers

        # Relation-aware attention is on unless explicitly disabled.
        if not args.no_rel_attn:
            self.rel_attn = True
        else:
            self.rel_attn = False

        if self.args.attn_type == 'linear':
            gats = []
            for _ in range(args.gnn_layers):
                gats += [GatLinear(args.hidden_dim) if args.no_rel_attn else GatLinear_rel(args.hidden_dim)]
            self.gather = nn.ModuleList(gats)
        elif self.args.attn_type == 'dotprod':
            gats = []
            for _ in range(args.gnn_layers):
                gats += [GatDot(args.hidden_dim) if args.no_rel_attn else GatDot_rel(args.hidden_dim)]
            self.gather = nn.ModuleList(gats)
        elif self.args.attn_type == 'rgcn':
            # Separate GAT stacks for the short- and long-distance channels.
            gats_short = []
            gats_long = []
            for _ in range(args.gnn_layers):
                gats_short += [GAT_dialoggcn_v1(args.hidden_dim)]
            for _ in range(args.gnn_layers):
                gats_long += [GAT_dialoggcn_v1(args.hidden_dim)]
            self.gather_short = nn.ModuleList(gats_short)
            self.gather_long = nn.ModuleList(gats_long)

        # Short-distance context-update GRU cells (one per GNN layer).
        grus_c_short = []
        for _ in range(args.gnn_layers):
            grus_c_short += [nn.GRUCell(args.hidden_dim, args.hidden_dim)]
        self.grus_c_short = nn.ModuleList(grus_c_short)

        # Long-distance context-update GRU cells.
        grus_c_long = []
        for _ in range(args.gnn_layers):
            grus_c_long += [nn.GRUCell(args.hidden_dim, args.hidden_dim)]
        self.grus_c_long = nn.ModuleList(grus_c_long)

        # Short-distance "parent" GRU cells (aggregate M drives the update).
        grus_p_short = []
        for _ in range(args.gnn_layers):
            grus_p_short += [nn.GRUCell(args.hidden_dim, args.hidden_dim)]
        self.grus_p_short = nn.ModuleList(grus_p_short)

        # Long-distance "parent" GRU cells.
        grus_p_long = []
        for _ in range(args.gnn_layers):
            grus_p_long += [nn.GRUCell(args.hidden_dim, args.hidden_dim)]
        self.grus_p_long = nn.ModuleList(grus_p_long)

        # Short-distance fully connected layers.
        # NOTE(review): not referenced in forward(); kept so parameter names /
        # checkpoints stay compatible.
        fcs_short = []
        for _ in range(args.gnn_layers):
            fcs_short += [nn.Linear(args.hidden_dim * 2, args.hidden_dim)]
        self.fcs_short = nn.ModuleList(fcs_short)

        # Long-distance fully connected layers (same note as above).
        fcs_long = []
        for _ in range(args.gnn_layers):
            fcs_long += [nn.Linear(args.hidden_dim * 2, args.hidden_dim)]
        self.fcs_long = nn.ModuleList(fcs_long)

        self.fc1 = nn.Linear(args.emb_dim, args.hidden_dim)

        self.nodal_att_type = args.nodal_att_type

        # Final feature = [short (2*hidden) ; long (2*hidden) ; raw embedding].
        in_dim = ((args.hidden_dim*2)*2 + args.emb_dim)
        # output mlp layers
        layers = [nn.Linear(in_dim, args.hidden_dim), nn.ReLU()]
        for _ in range(args.mlp_layers - 1):
            layers += [nn.Linear(args.hidden_dim, args.hidden_dim), nn.ReLU()]
        layers += [self.dropout]
        layers += [nn.Linear(args.hidden_dim, num_class)]

        self.out_mlp = nn.Sequential(*layers)

        self.attentive_node_features = attentive_node_features(in_dim)

        # Bilinear maps for the mutual cross-attention between the channels.
        self.affine1 = nn.Parameter(torch.empty(size=((args.hidden_dim*2), (args.hidden_dim*2))))
        nn.init.xavier_uniform_(self.affine1.data, gain=1.414)
        self.affine2 = nn.Parameter(torch.empty(size=((args.hidden_dim*2), (args.hidden_dim*2))))
        nn.init.xavier_uniform_(self.affine2.data, gain=1.414)

        # Difference (orthogonality) regularizer between the two channels.
        self.diff_loss = DiffLoss(args)
        self.beta = args.diffloss

    def forward(self, features, adj_1, adj_2, s_mask, s_mask_onehot, lengths):
        """Run the bidirectional dual-channel pass.

        Args:
            features: utterance features, indexed below as
                (batch, num_utter, emb_dim).
            adj_1: short-distance adjacency, sliced as ``adj_1[:, i, :i]``.
            adj_2: long-distance adjacency, sliced the same way.
            s_mask: speaker mask, sliced as ``s_mask[:, i, :i]``.
            s_mask_onehot: unused here; kept for interface compatibility.
            lengths: dialogue lengths, forwarded to attentive_node_features.

        Returns:
            (logits, beta * diff_loss)
        """
        num_utter = features.size()[1]

        H0 = F.relu(self.fc1(features))
        H = [H0]
        H_combined_short_list = []

        # ---- Short-distance channel, forward (past -> future) direction ----
        for l in range(self.args.gnn_layers):
            # First node has no predecessors, so the aggregate M starts as zeros.
            C = self.grus_c_short[l](H[l][:, 0, :]).unsqueeze(1)
            M = torch.zeros_like(C).squeeze(1)
            # Extra feature P from the zero aggregate and the first node.
            P = self.grus_p_short[l](M, H[l][:, 0, :]).unsqueeze(1)
            H1 = C + P
            for i in range(1, num_utter):
                if self.args.attn_type == 'rgcn':
                    # Aggregate predecessors 0..i-1 of node i through the GAT.
                    _, M = self.gather_short[l](H[l][:, i, :], H1, H1, adj_1[:, i, :i], s_mask[:, i, :i])
                else:
                    if not self.rel_attn:
                        _, M = self.gather_short[l](H[l][:, i, :], H1, H1, adj_1[:, i, :i])
                    else:
                        _, M = self.gather_short[l](H[l][:, i, :], H1, H1, adj_1[:, i, :i], s_mask[:, i, :i])

                # Update node i from its own feature and the aggregate M.
                C = self.grus_c_short[l](H[l][:, i, :], M).unsqueeze(1)
                P = self.grus_p_short[l](M, H[l][:, i, :]).unsqueeze(1)
                H_temp = C + P
                H1 = torch.cat((H1, H_temp), dim=1)
            H.append(H1)

        # ---- Short-distance channel, reversed (future -> past) direction ----
        features_reversed = torch.flip(features, dims=[1])
        adj_reversed = torch.flip(adj_1, dims=[1, 2])
        s_mask_reversed = torch.flip(s_mask, dims=[1, 2])

        H0_reversed = F.relu(self.fc1(features_reversed))
        H_reversed = [H0_reversed]

        for l in range(self.args.gnn_layers):
            C = self.grus_c_short[l](H_reversed[l][:, 0, :]).unsqueeze(1)
            M = torch.zeros_like(C).squeeze(1)
            P = self.grus_p_short[l](M, H_reversed[l][:, 0, :]).unsqueeze(1)
            H1_reversed = C + P

            for i in range(1, num_utter):
                if self.args.attn_type == 'rgcn':
                    _, M = self.gather_short[l](H_reversed[l][:, i, :], H1_reversed, H1_reversed, adj_reversed[:, i, :i], s_mask_reversed[:, i, :i])
                else:
                    if not self.rel_attn:
                        _, M = self.gather_short[l](H_reversed[l][:, i, :], H1_reversed, H1_reversed, adj_reversed[:, i, :i])
                    else:
                        _, M = self.gather_short[l](H_reversed[l][:, i, :], H1_reversed, H1_reversed, adj_reversed[:, i, :i], s_mask_reversed[:, i, :i])

                C = self.grus_c_short[l](H_reversed[l][:, i, :], M).unsqueeze(1)
                P = self.grus_p_short[l](M, H_reversed[l][:, i, :]).unsqueeze(1)
                H_temp_reversed = C + P
                H1_reversed = torch.cat((H1_reversed, H_temp_reversed), dim=1)
            H_reversed.append(H1_reversed)
            # NOTE(review): H_reversed[l+1] is in reversed utterance order and
            # is concatenated with the forward-order H[l+1] WITHOUT being
            # flipped back along dim=1 -- confirm this is intentional.
            H_combined = torch.cat((H[l+1], H_reversed[l+1]), dim=2)
            H_combined_short_list.append(H_combined)

        # ---- Long-distance channel, forward direction ----
        H_long = [H0]
        H_combined_long_list = []

        for l in range(self.args.gnn_layers):
            C_long = self.grus_c_long[l](H_long[l][:, 0, :]).unsqueeze(1)
            M_long = torch.zeros_like(C_long).squeeze(1)
            P_long = self.grus_p_long[l](M_long, H_long[l][:, 0, :]).unsqueeze(1)

            H1_long = C_long + P_long
            for i in range(1, num_utter):
                if self.args.attn_type == 'rgcn':
                    _, M_long = self.gather_long[l](H_long[l][:, i, :], H1_long, H1_long, adj_2[:, i, :i], s_mask[:, i, :i])
                else:
                    if not self.rel_attn:
                        _, M_long = self.gather_long[l](H_long[l][:, i, :], H1_long, H1_long, adj_2[:, i, :i])
                    else:
                        _, M_long = self.gather_long[l](H_long[l][:, i, :], H1_long, H1_long, adj_2[:, i, :i], s_mask[:, i, :i])

                C_long = self.grus_c_long[l](H_long[l][:, i, :], M_long).unsqueeze(1)
                P_long = self.grus_p_long[l](M_long, H_long[l][:, i, :]).unsqueeze(1)

                H_temp_long = C_long + P_long
                H1_long = torch.cat((H1_long, H_temp_long), dim=1)
            H_long.append(H1_long)

        # ---- Long-distance channel, reversed direction ----
        features_reversed_long = torch.flip(features, dims=[1])
        adj_reversed_long = torch.flip(adj_2, dims=[1, 2])
        s_mask_reversed_long = torch.flip(s_mask, dims=[1, 2])

        H0_reversed_long = F.relu(self.fc1(features_reversed_long))
        H_reversed_long = [H0_reversed_long]

        for l in range(self.args.gnn_layers):
            C_long = self.grus_c_long[l](H_reversed_long[l][:, 0, :]).unsqueeze(1)
            M_long = torch.zeros_like(C_long).squeeze(1)
            P_long = self.grus_p_long[l](M_long, H_reversed_long[l][:, 0, :]).unsqueeze(1)
            H1_reversed_long = C_long + P_long

            for i in range(1, num_utter):
                if self.args.attn_type == 'rgcn':
                    _, M_long = self.gather_long[l](H_reversed_long[l][:, i, :], H1_reversed_long, H1_reversed_long, adj_reversed_long[:, i, :i], s_mask_reversed_long[:, i, :i])
                else:
                    if not self.rel_attn:
                        _, M_long = self.gather_long[l](H_reversed_long[l][:, i, :], H1_reversed_long, H1_reversed_long, adj_reversed_long[:, i, :i])
                    else:
                        _, M_long = self.gather_long[l](H_reversed_long[l][:, i, :], H1_reversed_long, H1_reversed_long, adj_reversed_long[:, i, :i], s_mask_reversed_long[:, i, :i])

                C_long = self.grus_c_long[l](H_reversed_long[l][:, i, :], M_long).unsqueeze(1)
                P_long = self.grus_p_long[l](M_long, H_reversed_long[l][:, i, :]).unsqueeze(1)
                H_temp_reversed_long = C_long + P_long
                H1_reversed_long = torch.cat((H1_reversed_long, H_temp_reversed_long), dim=1)
            H_reversed_long.append(H1_reversed_long)

            # Concatenate forward and reversed long-distance features.
            H_combined_long = torch.cat((H_long[l+1], H_reversed_long[l+1]), dim=2)
            H_combined_long_list.append(H_combined_long)

        # ---- Both channel features extracted: fuse them ----
        H_final = []
        diff_loss = 0
        for l in range(self.args.gnn_layers):
            HShort_prime = H_combined_short_list[l]
            HLong_prime = H_combined_long_list[l]
            # Accumulate the difference regularizer across layers.
            diff_loss = self.diff_loss(HShort_prime, HLong_prime) + diff_loss

            # Mutual cross-attention between the two channels.
            A1 = F.softmax(torch.bmm(torch.matmul(HShort_prime, self.affine1), torch.transpose(HLong_prime, 1, 2)), dim=2)
            A2 = F.softmax(torch.bmm(torch.matmul(HLong_prime, self.affine2), torch.transpose(HShort_prime, 1, 2)), dim=2)

            HShort_prime_new = torch.bmm(A1, HLong_prime)   # updated short features
            HLong_prime_new = torch.bmm(A2, HShort_prime)   # updated long features

            # Dropout on all but the last GNN layer.
            HShort_prime_out = self.dropout(HShort_prime_new) if l < self.args.gnn_layers - 1 else HShort_prime_new
            HLong_prime_out = self.dropout(HLong_prime_new) if l < self.args.gnn_layers - 1 else HLong_prime_new

            H_final.append(HShort_prime_out)
            H_final.append(HLong_prime_out)
        H_final.append(features)

        # Only the LAST layer's short/long outputs plus the raw features are used.
        H_final = torch.cat([H_final[-3], H_final[-2], H_final[-1]], dim=2)
        H_final = self.attentive_node_features(H_final, lengths, self.nodal_att_type)
        logits = self.out_mlp(H_final)
        return logits, self.beta * diff_loss
|
| 775 |
+
|
| 776 |
+
|
| 777 |
+
#使用所有层的short和long,使用sum加每一层,不使用双特征融合技术
|
| 778 |
+
#All-layer short and long features are used, with a sum over each layer; dual-feature fusion is not applied.
|
| 779 |
+
class DAGERC_new_3(nn.Module):
    """Dual-channel DAG-ERC variant, summed layers, no dual-feature fusion.

    ALL layers' short- and long-distance features are used: each channel's
    layer outputs are concatenated along the feature dim, the two channels
    are summed element-wise, and the raw embeddings are appended. No
    cross-attention or difference loss is applied.
    """

    def __init__(self, args, num_class):
        super().__init__()
        self.args = args
        # gcn layer

        self.dropout = nn.Dropout(args.dropout)

        self.gnn_layers = args.gnn_layers

        # Relation-aware attention is on unless explicitly disabled.
        if not args.no_rel_attn:
            self.rel_attn = True
        else:
            self.rel_attn = False

        if self.args.attn_type == 'linear':
            gats = []
            for _ in range(args.gnn_layers):
                gats += [GatLinear(args.hidden_dim) if args.no_rel_attn else GatLinear_rel(args.hidden_dim)]
            self.gather = nn.ModuleList(gats)
        elif self.args.attn_type == 'dotprod':
            gats = []
            for _ in range(args.gnn_layers):
                gats += [GatDot(args.hidden_dim) if args.no_rel_attn else GatDot_rel(args.hidden_dim)]
            self.gather = nn.ModuleList(gats)
        elif self.args.attn_type == 'rgcn':
            # Separate GAT stacks for the short- and long-distance channels.
            gats_short = []
            gats_long = []
            for _ in range(args.gnn_layers):
                gats_short += [GAT_dialoggcn_v1(args.hidden_dim)]
            for _ in range(args.gnn_layers):
                gats_long += [GAT_dialoggcn_v1(args.hidden_dim)]
            self.gather_short = nn.ModuleList(gats_short)
            self.gather_long = nn.ModuleList(gats_long)

        # Short-distance context-update GRU cells (one per GNN layer).
        grus_c_short = []
        for _ in range(args.gnn_layers):
            grus_c_short += [nn.GRUCell(args.hidden_dim, args.hidden_dim)]
        self.grus_c_short = nn.ModuleList(grus_c_short)

        # Long-distance context-update GRU cells.
        grus_c_long = []
        for _ in range(args.gnn_layers):
            grus_c_long += [nn.GRUCell(args.hidden_dim, args.hidden_dim)]
        self.grus_c_long = nn.ModuleList(grus_c_long)

        # Short-distance "parent" GRU cells.
        grus_p_short = []
        for _ in range(args.gnn_layers):
            grus_p_short += [nn.GRUCell(args.hidden_dim, args.hidden_dim)]
        self.grus_p_short = nn.ModuleList(grus_p_short)

        # Long-distance "parent" GRU cells.
        grus_p_long = []
        for _ in range(args.gnn_layers):
            grus_p_long += [nn.GRUCell(args.hidden_dim, args.hidden_dim)]
        self.grus_p_long = nn.ModuleList(grus_p_long)

        # Short-distance fully connected layers.
        # NOTE(review): not referenced in forward(); kept so parameter names /
        # checkpoints stay compatible.
        fcs_short = []
        for _ in range(args.gnn_layers):
            fcs_short += [nn.Linear(args.hidden_dim * 2, args.hidden_dim)]
        self.fcs_short = nn.ModuleList(fcs_short)

        # Long-distance fully connected layers (same note as above).
        fcs_long = []
        for _ in range(args.gnn_layers):
            fcs_long += [nn.Linear(args.hidden_dim * 2, args.hidden_dim)]
        self.fcs_long = nn.ModuleList(fcs_long)

        self.fc1 = nn.Linear(args.emb_dim, args.hidden_dim)

        self.nodal_att_type = args.nodal_att_type

        # Final feature = [hidden per layer (incl. H0) summed over channels ; raw emb].
        in_dim = (args.hidden_dim * (args.gnn_layers + 1)) + args.emb_dim
        # output mlp layers
        layers = [nn.Linear(in_dim, args.hidden_dim), nn.ReLU()]
        for _ in range(args.mlp_layers - 1):
            layers += [nn.Linear(args.hidden_dim, args.hidden_dim), nn.ReLU()]
        layers += [self.dropout]
        layers += [nn.Linear(args.hidden_dim, num_class)]

        self.out_mlp = nn.Sequential(*layers)

        self.attentive_node_features = attentive_node_features(in_dim)

    def forward(self, features, adj_1, adj_2, s_mask, s_mask_onehot, lengths):
        """Run the two unidirectional channels and sum their layer stacks.

        Args:
            features: utterance features, indexed below as
                (batch, num_utter, emb_dim).
            adj_1: short-distance adjacency, sliced as ``adj_1[:, i, :i]``.
            adj_2: long-distance adjacency, sliced the same way.
            s_mask: speaker mask, sliced as ``s_mask[:, i, :i]``.
            s_mask_onehot: unused here; kept for interface compatibility.
            lengths: dialogue lengths, forwarded to attentive_node_features.

        Returns:
            logits (no auxiliary loss in this variant).
        """
        num_utter = features.size()[1]

        H0 = F.relu(self.fc1(features))
        H = [H0]

        # ---- Short-distance channel ----
        for l in range(self.args.gnn_layers):
            # First node has no predecessors, so the aggregate M starts as zeros.
            C = self.grus_c_short[l](H[l][:, 0, :]).unsqueeze(1)
            M = torch.zeros_like(C).squeeze(1)
            P = self.grus_p_short[l](M, H[l][:, 0, :]).unsqueeze(1)
            H1 = C + P
            for i in range(1, num_utter):
                if self.args.attn_type == 'rgcn':
                    # Aggregate predecessors 0..i-1 of node i through the GAT.
                    _, M = self.gather_short[l](H[l][:, i, :], H1, H1, adj_1[:, i, :i], s_mask[:, i, :i])
                else:
                    if not self.rel_attn:
                        _, M = self.gather_short[l](H[l][:, i, :], H1, H1, adj_1[:, i, :i])
                    else:
                        _, M = self.gather_short[l](H[l][:, i, :], H1, H1, adj_1[:, i, :i], s_mask[:, i, :i])

                # Update node i from its own feature and the aggregate M.
                C = self.grus_c_short[l](H[l][:, i, :], M).unsqueeze(1)
                P = self.grus_p_short[l](M, H[l][:, i, :]).unsqueeze(1)
                H_temp = C + P
                H1 = torch.cat((H1, H_temp), dim=1)
            H.append(H1)

        # ---- Long-distance channel ----
        H_long = [H0]

        for l in range(self.args.gnn_layers):
            C_long = self.grus_c_long[l](H_long[l][:, 0, :]).unsqueeze(1)
            M_long = torch.zeros_like(C_long).squeeze(1)
            P_long = self.grus_p_long[l](M_long, H_long[l][:, 0, :]).unsqueeze(1)

            H1_long = C_long + P_long
            for i in range(1, num_utter):
                if self.args.attn_type == 'rgcn':
                    _, M_long = self.gather_long[l](H_long[l][:, i, :], H1_long, H1_long, adj_2[:, i, :i], s_mask[:, i, :i])
                else:
                    if not self.rel_attn:
                        _, M_long = self.gather_long[l](H_long[l][:, i, :], H1_long, H1_long, adj_2[:, i, :i])
                    else:
                        _, M_long = self.gather_long[l](H_long[l][:, i, :], H1_long, H1_long, adj_2[:, i, :i], s_mask[:, i, :i])

                C_long = self.grus_c_long[l](H_long[l][:, i, :], M_long).unsqueeze(1)
                P_long = self.grus_p_long[l](M_long, H_long[l][:, i, :]).unsqueeze(1)

                H_temp_long = C_long + P_long
                H1_long = torch.cat((H1_long, H_temp_long), dim=1)
            H_long.append(H1_long)

        # Stack every layer's output per channel, then sum the channels.
        H_combined = torch.cat(H, dim=2)
        H_long_combined = torch.cat(H_long, dim=2)
        sum_features = H_combined + H_long_combined
        H_combined_final = torch.cat((sum_features, features), dim=2)

        H_final = self.attentive_node_features(H_combined_final, lengths, self.nodal_att_type)
        logits = self.out_mlp(H_final)
        return logits
|
| 965 |
+
|
| 966 |
+
#使用过去的所有层的short和long,每一层都concat,使用特征融合技术。
|
| 967 |
+
#All past-layer short and long features are used; features from each layer are concatenated, and feature fusion techniques are applied.
|
| 968 |
+
class DAGERC_new_4(nn.Module):
|
| 969 |
+
|
| 970 |
+
def __init__(self, args, num_class):
|
| 971 |
+
super().__init__()
|
| 972 |
+
self.args = args
|
| 973 |
+
# gcn layer
|
| 974 |
+
|
| 975 |
+
self.dropout = nn.Dropout(args.dropout)
|
| 976 |
+
|
| 977 |
+
self.gnn_layers = args.gnn_layers
|
| 978 |
+
|
| 979 |
+
if not args.no_rel_attn:
|
| 980 |
+
self.rel_attn = True
|
| 981 |
+
else:
|
| 982 |
+
self.rel_attn = False
|
| 983 |
+
|
| 984 |
+
if self.args.attn_type == 'linear':
|
| 985 |
+
gats = []
|
| 986 |
+
for _ in range(args.gnn_layers):
|
| 987 |
+
gats += [GatLinear(args.hidden_dim) if args.no_rel_attn else GatLinear_rel(args.hidden_dim)]
|
| 988 |
+
self.gather = nn.ModuleList(gats)
|
| 989 |
+
elif self.args.attn_type == 'dotprod':
|
| 990 |
+
gats = []
|
| 991 |
+
for _ in range(args.gnn_layers):
|
| 992 |
+
gats += [GatDot(args.hidden_dim) if args.no_rel_attn else GatDot_rel(args.hidden_dim)]
|
| 993 |
+
self.gather = nn.ModuleList(gats)
|
| 994 |
+
elif self.args.attn_type == 'rgcn':
|
| 995 |
+
gats_short = []
|
| 996 |
+
gats_long = []
|
| 997 |
+
for _ in range(args.gnn_layers):
|
| 998 |
+
gats_short += [GAT_dialoggcn_v1(args.hidden_dim)]
|
| 999 |
+
for _ in range(args.gnn_layers):
|
| 1000 |
+
gats_long += [GAT_dialoggcn_v1(args.hidden_dim)]
|
| 1001 |
+
self.gather_short = nn.ModuleList(gats_short)
|
| 1002 |
+
self.gather_long = nn.ModuleList(gats_long)
|
| 1003 |
+
|
| 1004 |
+
# 近距离 GRU
|
| 1005 |
+
# short distance GRU
|
| 1006 |
+
grus_c_short = []
|
| 1007 |
+
for _ in range(args.gnn_layers):
|
| 1008 |
+
grus_c_short += [nn.GRUCell(args.hidden_dim, args.hidden_dim)]
|
| 1009 |
+
self.grus_c_short = nn.ModuleList(grus_c_short)
|
| 1010 |
+
|
| 1011 |
+
# 远距离 GRU
|
| 1012 |
+
# long distance GRU
|
| 1013 |
+
grus_c_long = []
|
| 1014 |
+
for _ in range(args.gnn_layers):
|
| 1015 |
+
grus_c_long += [nn.GRUCell(args.hidden_dim, args.hidden_dim)]
|
| 1016 |
+
self.grus_c_long = nn.ModuleList(grus_c_long)
|
| 1017 |
+
|
| 1018 |
+
grus_p_short = []
|
| 1019 |
+
for _ in range(args.gnn_layers):
|
| 1020 |
+
grus_p_short += [nn.GRUCell(args.hidden_dim, args.hidden_dim)]
|
| 1021 |
+
self.grus_p_short = nn.ModuleList(grus_p_short)
|
| 1022 |
+
|
| 1023 |
+
grus_p_long = []
|
| 1024 |
+
for _ in range(args.gnn_layers):
|
| 1025 |
+
grus_p_long += [nn.GRUCell(args.hidden_dim, args.hidden_dim)]
|
| 1026 |
+
self.grus_p_long = nn.ModuleList(grus_p_long)
|
| 1027 |
+
|
| 1028 |
+
#近距离全链接层
|
| 1029 |
+
#Fully Connected Layer for Short-Range Features
|
| 1030 |
+
fcs_short = []
|
| 1031 |
+
for _ in range(args.gnn_layers):
|
| 1032 |
+
fcs_short += [nn.Linear(args.hidden_dim * 2, args.hidden_dim)]
|
| 1033 |
+
self.fcs_short = nn.ModuleList(fcs_short)
|
| 1034 |
+
|
| 1035 |
+
# 远距离全连接层
|
| 1036 |
+
# Fully Connected Layer for Long-Range Features
|
| 1037 |
+
fcs_long = []
|
| 1038 |
+
for _ in range(args.gnn_layers):
|
| 1039 |
+
fcs_long += [nn.Linear(args.hidden_dim * 2, args.hidden_dim)]
|
| 1040 |
+
self.fcs_long = nn.ModuleList(fcs_long)
|
| 1041 |
+
|
| 1042 |
+
|
| 1043 |
+
self.fc1 = nn.Linear(args.emb_dim, args.hidden_dim)
|
| 1044 |
+
|
| 1045 |
+
self.nodal_att_type = args.nodal_att_type
|
| 1046 |
+
|
| 1047 |
+
in_dim = (((args.hidden_dim*2))*(args.gnn_layers + 1) + args.emb_dim)
|
| 1048 |
+
|
| 1049 |
+
# output mlp layers
|
| 1050 |
+
layers = [nn.Linear(in_dim, args.hidden_dim), nn.ReLU()]
|
| 1051 |
+
for _ in range(args.mlp_layers - 1):
|
| 1052 |
+
layers += [nn.Linear(args.hidden_dim, args.hidden_dim), nn.ReLU()]
|
| 1053 |
+
layers += [self.dropout]
|
| 1054 |
+
layers += [nn.Linear(args.hidden_dim, num_class)]
|
| 1055 |
+
|
| 1056 |
+
self.out_mlp = nn.Sequential(*layers)
|
| 1057 |
+
|
| 1058 |
+
self.attentive_node_features = attentive_node_features(in_dim)
|
| 1059 |
+
|
| 1060 |
+
self.affine1 = nn.Parameter(torch.empty(size=((args.hidden_dim) , (args.hidden_dim) )))
|
| 1061 |
+
nn.init.xavier_uniform_(self.affine1.data, gain=1.414)
|
| 1062 |
+
self.affine2 = nn.Parameter(torch.empty(size=((args.hidden_dim) , (args.hidden_dim) )))
|
| 1063 |
+
nn.init.xavier_uniform_(self.affine2.data, gain=1.414)
|
| 1064 |
+
|
| 1065 |
+
self.diff_loss = DiffLoss(args)
|
| 1066 |
+
self.beta = args.diffloss
|
| 1067 |
+
|
| 1068 |
+
    def forward(self, features, adj_1, adj_2 ,s_mask,s_mask_onehot, lengths):
        """Dual-channel (short-range / long-range) graph encoding of a dialogue.

        Each channel runs `gnn_layers` rounds of per-utterance GRU updates with
        attention-based message gathering, then the two channels are fused via
        mutual cross-attention; a difference-regularization loss keeps them from
        collapsing onto each other.

        :param features: utterance features — assumed (B, N, emb_dim); TODO confirm
        :param adj_1: adjacency for the short-range channel
        :param adj_2: adjacency for the long-range channel
        :param s_mask: speaker relation mask — assumed (B, N, N); TODO confirm
        :param s_mask_onehot: one-hot speaker relation mask (unused in this path)
        :param lengths: true dialogue lengths, used by attentive node pooling
        :return: (logits, beta-weighted difference-regularization loss)
        """
        # Debug leftover: checks whether the two adjacency inputs coincide.
        # The result is never used.
        are_equal = all(torch.equal(h1, h2) for h1, h2 in zip(adj_1, adj_2))

        num_utter = features.size()[1]

        # Initial node features shared by both channels.
        H0 = F.relu(self.fc1(features))
        H = [H0]
        H_combined_short_list = []
        # ---- Short-range channel ----
        for l in range(self.args.gnn_layers):
            # First utterance of each layer: GRU update with a zero message.
            C = self.grus_c_short[l](H[l][:,0,:]).unsqueeze(1)
            M = torch.zeros_like(C).squeeze(1)
            P = self.grus_p_short[l](M, H[l][:,0,:]).unsqueeze(1)
            H1 = C+P
            for i in range(1, num_utter):
                # Gather a message M for utterance i from all previous
                # utterances, restricted by the short-range adjacency.
                if self.args.attn_type == 'rgcn':
                    _, M = self.gather_short[l](H[l][:,i,:], H1, H1, adj_1[:,i,:i], s_mask[:,i,:i])
                else:
                    if not self.rel_attn:
                        _, M = self.gather_short[l](H[l][:,i,:], H1, H1, adj_1[:,i,:i])
                    else:
                        _, M = self.gather_short[l](H[l][:,i,:], H1, H1, adj_1[:,i,:i], s_mask[:, i, :i])

                # C: GRU update of node i driven by the gathered message;
                # P: complementary GRU update with roles of input/state swapped.
                C = self.grus_c_short[l](H[l][:,i,:], M).unsqueeze(1)
                P = self.grus_p_short[l](M, H[l][:,i,:]).unsqueeze(1)
                # New node feature is the sum of the two GRU outputs.
                H_temp = C+P
                # Append the updated node to the running sequence.
                H1 = torch.cat((H1 , H_temp), dim = 1)
            H.append(H1)
            H_combined_short_list.append(H[l+1])

        # ---- Long-range channel (same procedure, adj_2 + separate GRUs) ----
        H_long = [H0]
        H_combined_long_list = []
        for l in range(self.args.gnn_layers):
            C_long = self.grus_c_long[l](H_long[l][:,0,:]).unsqueeze(1)
            M_long = torch.zeros_like(C_long).squeeze(1)
            P_long = self.grus_p_long[l](M_long, H_long[l][:,0,:]).unsqueeze(1)

            H1_long = C_long + P_long
            for i in range(1, num_utter):
                # Message gathering, choosing the attention flavor as above.
                if self.args.attn_type == 'rgcn':
                    _, M_long = self.gather_long[l](H_long[l][:,i,:], H1_long, H1_long, adj_2[:,i,:i], s_mask[:,i,:i])
                else:
                    if not self.rel_attn:
                        _, M_long = self.gather_long[l](H_long[l][:,i,:], H1_long, H1_long, adj_2[:,i,:i])
                    else:
                        _, M_long = self.gather_long[l](H_long[l][:,i,:], H1_long, H1_long, adj_2[:,i,:i], s_mask[:,i,:i])

                C_long = self.grus_c_long[l](H_long[l][:,i,:], M_long).unsqueeze(1)
                P_long = self.grus_p_long[l](M_long, H_long[l][:,i,:]).unsqueeze(1)

                H_temp_long = C_long + P_long
                H1_long = torch.cat((H1_long, H_temp_long), dim=1)
            H_long.append(H1_long)
            H_combined_long_list.append(H_long[l+1])

        # ---- Fusion of the two channels ----
        H_final = []
        # Layer-0 features are duplicated so every entry has width 2*hidden_dim.
        H_0_final = torch.cat([H0, H0], dim=2)
        H_final.append(H_0_final)
        # Difference-regularization loss accumulated over layers.
        diff_loss = 0
        for l in range(self.args.gnn_layers):
            HShort_prime = H_combined_short_list[l]
            HLong_prime = H_combined_long_list[l]
            diff_loss = self.diff_loss(HShort_prime, HLong_prime) + diff_loss

            # Mutual cross-attention between the two channels.
            A1 = F.softmax(torch.bmm(torch.matmul(HShort_prime, self.affine1), torch.transpose(HLong_prime, 1, 2)), dim=2)
            A2 = F.softmax(torch.bmm(torch.matmul(HLong_prime, self.affine2), torch.transpose(HShort_prime, 1, 2)), dim=2)

            HShort_prime_new = torch.bmm(A1, HLong_prime)  # short attends to long
            HLong_prime_new = torch.bmm(A2, HShort_prime)  # long attends to short

            # Dropout on all but the last layer.
            HShort_prime_out = self.dropout(HShort_prime_new) if l < self.args.gnn_layers - 1 else HShort_prime_new
            HLong_prime_out = self.dropout(HLong_prime_new) if l <self.args.gnn_layers - 1 else HLong_prime_new

            H_layer = torch.cat([HShort_prime_out, HLong_prime_out], dim=2)
            H_final.append(H_layer)
        # Concatenate every layer's fused features plus the raw input features.
        H_final = torch.cat(H_final, dim=2)
        H_final = torch.cat([H_final, features], dim=2)

        # Optional nodal attention pooling, then the output MLP.
        H_final = self.attentive_node_features(H_final,lengths,self.nodal_att_type)
        logits = self.out_mlp(H_final)
        return logits, self.beta * diff_loss
|
model_utils.py
ADDED
|
@@ -0,0 +1,507 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
from torch.autograd import Variable
|
| 5 |
+
from torch.nn.utils.rnn import pad_sequence
|
| 6 |
+
import numpy as np, itertools, random, copy, math
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class DiffLoss(nn.Module):
    """Similarity penalty between two feature channels.

    Both inputs are flattened per sample, centered over the batch, and
    L2-normalized; the loss is the reciprocal of the mean pairwise L2
    distance, so it grows as the two representations become alike.
    """

    def __init__(self, args):
        # `args` is accepted for interface compatibility but not used.
        super(DiffLoss, self).__init__()

    def forward(self, input1, input2):
        """
        :param input1: (B, N, D) tensor
        :param input2: (B, N, D) tensor
        :return: scalar loss; +inf when the normalized inputs coincide
        """
        batch = input1.size(0)
        flat1 = input1.view(batch, -1)  # (B, N*D)
        flat2 = input2.view(batch, -1)  # (B, N*D)

        # Center each feature over the batch dimension.
        flat1 = flat1 - torch.mean(flat1, dim=0, keepdim=True)
        flat2 = flat2 - torch.mean(flat2, dim=0, keepdim=True)

        # Row-wise L2 normalization; epsilon guards all-zero rows.
        unit1 = flat1.div(torch.norm(flat1, p=2, dim=1, keepdim=True).expand_as(flat1) + 1e-6)
        unit2 = flat2.div(torch.norm(flat2, p=2, dim=1, keepdim=True).expand_as(flat2) + 1e-6)

        mean_dist = torch.mean(torch.norm(unit1 - unit2, p=2, dim=1))
        if mean_dist.item() == 0:
            # Identical (or both degenerate) inputs: infinite penalty.
            return torch.tensor(float('inf'), device=input1.device)
        return 1.0 / mean_dist
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class MaskedNLLLoss(nn.Module):
    """Negative log-likelihood loss over padded sequences.

    Padded positions (mask == 0) contribute nothing; the summed loss is
    normalized by the number of real tokens (or by their class weights).
    """

    def __init__(self, weight=None):
        super(MaskedNLLLoss, self).__init__()
        self.weight = weight
        self.loss = nn.NLLLoss(weight=weight, reduction='sum')

    def forward(self, pred, target, mask):
        """
        pred -> batch*seq_len, n_classes (log-probabilities)
        target -> batch*seq_len
        mask -> batch, seq_len
        """
        flat_mask = mask.view(-1, 1)  # batch*seq_len, 1
        masked_pred = pred * flat_mask
        if self.weight is None:
            # Unweighted: average over the number of unmasked tokens.
            return self.loss(masked_pred, target) / torch.sum(mask)
        # Weighted: normalize by the total class weight of unmasked tokens.
        denom = torch.sum(self.weight[target] * flat_mask.squeeze())
        return self.loss(masked_pred, target) / denom
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class MaskedMSELoss(nn.Module):
    """Mean squared error averaged over unmasked positions only."""

    def __init__(self):
        super(MaskedMSELoss, self).__init__()
        self.loss = nn.MSELoss(reduction='sum')

    def forward(self, pred, target, mask):
        """
        pred -> batch*seq_len
        target -> batch*seq_len
        mask -> batch*seq_len
        """
        # Zero out padded predictions, sum squared errors, then average
        # over the count of real positions.
        return self.loss(pred * mask, target) / torch.sum(mask)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
class UnMaskedWeightedNLLLoss(nn.Module):
    """NLL loss without padding mask; optionally class-weighted.

    With weights, the summed loss is normalized by the total weight of the
    targets rather than by the sample count.
    """

    def __init__(self, weight=None):
        super(UnMaskedWeightedNLLLoss, self).__init__()
        self.weight = weight
        self.loss = nn.NLLLoss(weight=weight, reduction='sum')

    def forward(self, pred, target):
        """
        pred -> batch*seq_len, n_classes (log-probabilities)
        target -> batch*seq_len
        """
        if self.weight is None:
            return self.loss(pred, target)
        return self.loss(pred, target) / torch.sum(self.weight[target])
|
| 108 |
+
|
| 109 |
+
class GatedSelection(nn.Module):
|
| 110 |
+
def __init__(self, hidden_size):
|
| 111 |
+
super().__init__()
|
| 112 |
+
self.context_trans = nn.Linear(hidden_size, hidden_size)
|
| 113 |
+
self.linear1 = nn.Linear(hidden_size, hidden_size)
|
| 114 |
+
self.linear2 = nn.Linear(hidden_size, hidden_size)
|
| 115 |
+
self.fc = nn.Linear(hidden_size, hidden_size)
|
| 116 |
+
self.sigmoid = nn.Sigmoid()
|
| 117 |
+
self.relu = nn.ReLU()
|
| 118 |
+
|
| 119 |
+
def forward(self, x1, x2):
|
| 120 |
+
x2 = self.context_trans(x2)
|
| 121 |
+
s = self.sigmoid(self.linear1(x1)+self.linear2(x2))
|
| 122 |
+
h = s * x1 + (1 - s) * x2
|
| 123 |
+
return self.relu(self.fc(h))
|
| 124 |
+
|
| 125 |
+
def mask_logic(alpha, adj):
    '''
    Apply an adjacency mask to raw attention scores: positions with
    adj == 0 are pushed to roughly -1e30 so a subsequent softmax gives
    them effectively zero weight.
    :param alpha: raw attention scores
    :param adj: 0/1 adjacency mask, broadcastable to alpha
    :return: masked scores
    '''
    penalty = (1 - adj) * 1e30
    return alpha - penalty
|
| 133 |
+
|
| 134 |
+
class GatLinear(nn.Module):
    """Information gatherer using concat-based ("linear") attention."""

    def __init__(self, hidden_size):
        super().__init__()
        # Scores the concatenation [query; key] -> scalar logit.
        self.linear = nn.Linear(hidden_size * 2, 1)

    def forward(self, Q, K, V, adj):
        '''
        :param Q: (B, D) query utterance
        :param K: (B, N, D) context
        :param V: (B, N, D) context
        :param adj: (B, N) adjacency row of the query node
        :return: (attn_weight (B, 1, N), attn_sum (B, D))
        '''
        ctx_len = K.size(1)
        q_rep = Q.unsqueeze(1).expand(-1, ctx_len, -1)       # (B, N, D)
        paired = torch.cat((q_rep, K), dim=2)                # (B, N, 2D)
        scores = self.linear(paired).permute(0, 2, 1)        # (B, 1, N)
        # Hide non-neighbors before the softmax.
        scores = mask_logic(scores, adj.unsqueeze(1))        # (B, 1, N)
        attn_weight = F.softmax(scores, dim=2)               # (B, 1, N)
        attn_sum = torch.bmm(attn_weight, V).squeeze(1)      # (B, D)
        return attn_weight, attn_sum
|
| 171 |
+
|
| 172 |
+
class GatDot(nn.Module):
    """Information gatherer using projected dot-product attention."""

    def __init__(self, hidden_size):
        super().__init__()
        self.linear1 = nn.Linear(hidden_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)

    def forward(self, Q, K, V, adj):
        '''
        :param Q: (B, D) query utterance
        :param K: (B, N, D) context
        :param V: (B, N, D) context
        :param adj: (B, N) adjacency row of the query node
        :return: (attn_weight (B, 1, N), attn_sum (B, D))
        '''
        q_proj = self.linear1(Q).unsqueeze(2)                    # (B, D, 1)
        k_proj = self.linear2(K)                                 # (B, N, D)
        scores = torch.bmm(k_proj, q_proj).permute(0, 2, 1)      # (B, 1, N)
        # Hide non-neighbors before the softmax.
        scores = mask_logic(scores, adj.unsqueeze(1))            # (B, 1, N)
        attn_weight = F.softmax(scores, dim=2)                   # (B, 1, N)
        attn_sum = torch.bmm(attn_weight, V).squeeze(1)          # (B, D)
        return attn_weight, attn_sum
|
| 204 |
+
|
| 205 |
+
class GatLinear_rel(nn.Module):
    """Concat-based attention gatherer conditioned on a speaker relation.

    The 0/1 relation mask is embedded and concatenated with [query; key]
    before scoring.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.linear = nn.Linear(hidden_size * 3, 1)
        self.rel_emb = nn.Embedding(2, hidden_size)

    def forward(self, Q, K, V, adj, s_mask):
        '''
        :param Q: (B, D) query utterance
        :param K: (B, N, D) context
        :param V: (B, N, D) context
        :param adj: (B, N) adjacency row of the query node
        :param s_mask: (B, N) speaker-relation indices (0/1)
        :return: (attn_weight (B, 1, N), attn_sum (B, D))
        '''
        rel_features = self.rel_emb(s_mask)                     # (B, N, D)
        ctx_len = K.size(1)
        q_rep = Q.unsqueeze(1).expand(-1, ctx_len, -1)          # (B, N, D)
        triple = torch.cat((q_rep, K, rel_features), dim=2)     # (B, N, 3D)
        scores = self.linear(triple).permute(0, 2, 1)           # (B, 1, N)
        # Hide non-neighbors before the softmax.
        scores = mask_logic(scores, adj.unsqueeze(1))           # (B, 1, N)
        attn_weight = F.softmax(scores, dim=2)                  # (B, 1, N)
        attn_sum = torch.bmm(attn_weight, V).squeeze(1)         # (B, D)
        return attn_weight, attn_sum
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
class GatDot_rel(nn.Module):
    """Dot-product attention gatherer with an additive relation bias."""

    def __init__(self, hidden_size):
        super().__init__()
        self.linear1 = nn.Linear(hidden_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, 1)
        self.rel_emb = nn.Embedding(2, hidden_size)

    def forward(self, Q, K, V, adj, s_mask):
        '''
        :param Q: (B, D) query utterance
        :param K: (B, N, D) context
        :param V: (B, N, D) context
        :param adj: (B, N) adjacency row of the query node
        :param s_mask: (B, N) speaker-relation indices (0/1)
        :return: (attn_weight (B, 1, N), attn_sum (B, D))
        '''
        rel_features = self.rel_emb(s_mask)                      # (B, N, D)
        q_proj = self.linear1(Q).unsqueeze(2)                    # (B, D, 1)
        k_proj = self.linear2(K)                                 # (B, N, D)
        rel_bias = self.linear3(rel_features)                    # (B, N, 1)
        # Dot-product score plus a learned per-relation bias.
        scores = (torch.bmm(k_proj, q_proj) + rel_bias).permute(0, 2, 1)  # (B, 1, N)
        scores = mask_logic(scores, adj.unsqueeze(1))            # (B, 1, N)
        attn_weight = F.softmax(scores, dim=2)                   # (B, 1, N)
        attn_sum = torch.bmm(attn_weight, V).squeeze(1)          # (B, D)
        return attn_weight, attn_sum
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
class GAT_dialoggcn(nn.Module):
    '''
    Relation-aware GAT layer with a full (D, D) transform per relation:
        H_i = alpha_ij (W_r H_j)
        alpha_ij = attention(H_i, H_j)
    Materializes a per-edge (D, D) matrix, so it is memory-heavy; see
    GAT_dialoggcn_v1 for the cheaper variant.
    '''
    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        # Scores the concatenation [query; key] -> scalar attention logit.
        self.linear = nn.Linear(hidden_size * 2, 1)
        # One (D, D) relation matrix per relation type (2 relations).
        self.rel_emb = nn.Parameter(torch.randn(2, hidden_size, hidden_size))

    def forward(self, Q, K, V, adj, s_mask_onehot):
        '''
        Information gatherer with linear (concat-based) attention.
        :param Q: (B, D) query utterance
        :param K: (B, N, D) context
        :param V: (B, N, D) context
        :param adj: (B, N) adjacency row of the i-th node
        :param s_mask_onehot: (B, N, 2) one-hot relation indicator per edge
        :return: (attn_weight (B, 1, N), attn_sum (B, D))
        '''
        B = K.size()[0]
        N = K.size()[1]
        Q = Q.unsqueeze(1).expand(-1, N, -1) # (B, N, D)
        X = torch.cat((Q,K), dim = 2) # (B, N, 2D)
        alpha = self.linear(X).permute(0,2,1) # (B, 1, N)
        adj = adj.unsqueeze(1)
        # Push scores of non-neighbors to -inf before the softmax.
        alpha = mask_logic(alpha, adj) # (B, 1, N)

        attn_weight = F.softmax(alpha, dim = 2) # (B, 1, N)

        D = self.rel_emb.size()[2]
        # Select each edge's relation matrix by multiplying the one-hot
        # relation indicator with the flattened relation matrices.
        rel_emb = self.rel_emb.unsqueeze(0).expand(B,-1,-1,-1)

        rel_emb = rel_emb.reshape((B, 2, D*D))
        Wr = torch.bmm(s_mask_onehot, rel_emb).reshape((B, N, D, D)) # (B, N, D, D)

        Wr = Wr.reshape((B*N, D, D))

        # Apply the per-edge transform: each value vector becomes a (1, D)
        # row and is right-multiplied by its relation matrix.
        V = V.unsqueeze(2).reshape((B*N, 1, -1)) # (B*N, 1, D)
        # NOTE(review): the unsqueeze(1) here looks redundant — the reshape
        # below flattens the extra dimension anyway; confirm before changing.
        V = V.unsqueeze(1) if False else torch.bmm(V, Wr).unsqueeze(1)
        V = V.reshape((B,N,-1))

        attn_sum = torch.bmm(attn_weight, V).squeeze(1) # (B, D)

        return attn_weight, attn_sum
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
class GAT_dialoggcn_v1(nn.Module):
    '''
    Relation-aware GAT layer using two shared linear maps (one per speaker
    relation) instead of per-edge (D, D) matrices, to avoid OOM:
        H_i = alpha_ij (W_r H_j)
        alpha_ij = attention(H_i, H_j)
    '''
    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.linear = nn.Linear(hidden_size * 2, 1)
        self.Wr0 = nn.Linear(hidden_size, hidden_size, bias=False)
        self.Wr1 = nn.Linear(hidden_size, hidden_size, bias=False)

    def forward(self, Q, K, V, adj, s_mask):
        '''
        :param Q: (B, D) query utterance
        :param K: (B, N, D) context
        :param V: (B, N, D) context
        :param adj: (B, N) adjacency row of the query node
        :param s_mask: (B, N) 1 where context speaker == query speaker
        :return: (attn_weight (B, 1, N), attn_sum (B, D))
        '''
        ctx_len = K.size(1)
        q_rep = Q.unsqueeze(1).expand(-1, ctx_len, -1)            # (B, N, D)
        scores = self.linear(torch.cat((q_rep, K), dim=2))        # (B, N, 1)
        scores = scores.permute(0, 2, 1)                          # (B, 1, N)
        # Hide non-neighbors before the softmax.
        scores = mask_logic(scores, adj.unsqueeze(1))             # (B, 1, N)
        attn_weight = F.softmax(scores, dim=2)                    # (B, 1, N)

        # Relation-specific value transform: same-speaker edges go through
        # Wr0, different-speaker edges through Wr1.
        same = s_mask.unsqueeze(2).float()                        # (B, N, 1)
        values = self.Wr0(V) * same + self.Wr1(V) * (1 - same)

        attn_sum = torch.bmm(attn_weight, values).squeeze(1)      # (B, D)
        return attn_weight, attn_sum
|
| 406 |
+
|
| 407 |
+
|
| 408 |
+
class GAT_dialoggcn_v2(nn.Module):
    '''
    Like GAT_dialoggcn_v1 but the attention score is also conditioned on
    the speaker relation (its embedding is concatenated with query/key):
        H_i = alpha_ij (W_r H_j)
        alpha_ij = attention(H_i, H_j, rel)
    '''
    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.linear = nn.Linear(hidden_size * 3, 1)
        self.Wr0 = nn.Linear(hidden_size, hidden_size, bias=False)
        self.Wr1 = nn.Linear(hidden_size, hidden_size, bias=False)
        self.rel_emb = nn.Embedding(2, hidden_size)

    def forward(self, Q, K, V, adj, s_mask):
        '''
        :param Q: (B, D) query utterance
        :param K: (B, N, D) context
        :param V: (B, N, D) context
        :param adj: (B, N) adjacency row of the query node
        :param s_mask: (B, N) speaker-relation indices (0/1)
        :return: (attn_weight (B, 1, N), attn_sum (B, D))
        '''
        rel_features = self.rel_emb(s_mask)                       # (B, N, D)
        ctx_len = K.size(1)
        q_rep = Q.unsqueeze(1).expand(-1, ctx_len, -1)            # (B, N, D)
        triple = torch.cat((q_rep, K, rel_features), dim=2)       # (B, N, 3D)
        scores = self.linear(triple).permute(0, 2, 1)             # (B, 1, N)
        # Hide non-neighbors before the softmax.
        scores = mask_logic(scores, adj.unsqueeze(1))             # (B, 1, N)
        attn_weight = F.softmax(scores, dim=2)                    # (B, 1, N)

        # Relation-specific value transform (shared maps per relation).
        same = s_mask.unsqueeze(2).float()                        # (B, N, 1)
        values = self.Wr0(V) * same + self.Wr1(V) * (1 - same)

        attn_sum = torch.bmm(attn_weight, values).squeeze(1)      # (B, D)
        return attn_weight, attn_sum
|
| 462 |
+
|
| 463 |
+
|
| 464 |
+
class attentive_node_features(nn.Module):
    '''
    Attentive pooling over graph-convoluted node features.

    Each node attends over the other nodes (optionally restricted to the
    past and always restricted to unpadded positions) and is replaced by
    the attention-weighted sum of the node features.
    '''
    def __init__(self, hidden_size):
        super().__init__()
        self.transform = nn.Linear(hidden_size, hidden_size)

    def forward(self, features, lengths, nodal_att_type):
        '''
        features : (B, N, V) node features
        lengths  : (B,) true (unpadded) sequence lengths
        nodal_att_type : None (pass-through), 'global', or 'past'
        raises ValueError for any other nodal_att_type
        '''
        # Identity comparison is the correct None check (was `== None`).
        if nodal_att_type is None:
            return features

        max_seq_len = features.size(1)
        # (B, N) mask: 1 on real positions, 0 on padding.
        padding_mask = [l * [1] + (max_seq_len - l) * [0] for l in lengths]
        padding_mask = torch.tensor(padding_mask).to(features)
        # (1, N, N) lower-triangular mask restricting attention to the past.
        causal_mask = torch.tril(torch.ones(max_seq_len, max_seq_len).to(features)).unsqueeze(0)

        if nodal_att_type == 'global':
            mask = padding_mask.unsqueeze(1)
        elif nodal_att_type == 'past':
            mask = padding_mask.unsqueeze(1) * causal_mask
        else:
            # Previously an unknown type fell through to a NameError on
            # `mask`; fail loudly and clearly instead.
            raise ValueError(f"unknown nodal_att_type: {nodal_att_type!r}")

        x = self.transform(features)                               # (B, N, V)
        scores = torch.bmm(x, features.permute(0, 2, 1))           # (B, N, N)
        alpha = F.softmax(torch.tanh(scores), dim=2)               # (B, N, N)
        alpha_masked = alpha * mask                                # (B, N, N)

        # Renormalize over the visible (unmasked) nodes.
        alpha_sum = torch.sum(alpha_masked, dim=2, keepdim=True)   # (B, N, 1)
        alpha = alpha_masked / alpha_sum                           # (B, N, N)
        return torch.bmm(alpha, features)                          # (B, N, V)
|
| 506 |
+
|
| 507 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch==2.0.0+cu117
|
| 2 |
+
transformers==4.46.3
|
| 3 |
+
numpy==1.24.2
|
| 4 |
+
pandas==2.1.4
|
| 5 |
+
matplotlib==3.7.1
|
| 6 |
+
scikit-learn==1.2.2
|
| 7 |
+
tqdm==4.67.1
|
run.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = '1'
|
| 3 |
+
import numpy as np, argparse, time, pickle, random
|
| 4 |
+
import torch
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
import torch.optim as optim
|
| 7 |
+
from dataloader import IEMOCAPDataset, get_train_loader
|
| 8 |
+
from model import *
|
| 9 |
+
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, classification_report, \
|
| 10 |
+
precision_recall_fscore_support
|
| 11 |
+
from trainer import train_or_eval_model, save_badcase
|
| 12 |
+
from dataset import IEMOCAPDataset
|
| 13 |
+
from dataloader import get_IEMOCAP_loaders
|
| 14 |
+
from transformers import AdamW
|
| 15 |
+
import copy
|
| 16 |
+
|
| 17 |
+
# We use seed = 100 for reproduction of the results reported in the paper.
|
| 18 |
+
seed = 100
|
| 19 |
+
|
| 20 |
+
import logging
|
| 21 |
+
|
| 22 |
+
def get_logger(filename, verbosity=1, name=None):
    """Create a logger that writes to *filename* and mirrors records to stderr.

    Parameters
    ----------
    filename : str
        Path of the log file; opened with mode ``"w"``, so an existing file
        is truncated.
    verbosity : int
        0 -> DEBUG, 1 -> INFO, 2 -> WARNING (any other value raises KeyError).
    name : str or None
        Name passed to ``logging.getLogger``; ``None`` selects the root logger.

    Returns
    -------
    logging.Logger
        The configured logger instance.
    """
    level_dict = {0: logging.DEBUG, 1: logging.INFO, 2: logging.WARNING}
    formatter = logging.Formatter(
        "[%(asctime)s][%(filename)s][line:%(lineno)d][%(levelname)s] %(message)s"
    )
    logger = logging.getLogger(name)
    logger.setLevel(level_dict[verbosity])

    # logging.getLogger returns the same object for the same name, so a
    # second call used to stack a second pair of handlers and every record
    # was then emitted twice.  Drop any handlers from a previous call first.
    for handler in list(logger.handlers):
        logger.removeHandler(handler)

    fh = logging.FileHandler(filename, "w")
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    sh = logging.StreamHandler()
    sh.setFormatter(formatter)
    logger.addHandler(sh)

    return logger
|
| 39 |
+
|
| 40 |
+
def seed_everything(seed=100):
    """Seed every RNG used in the project for reproducibility.

    The default of 100 matches the seed used for the results reported in
    the paper.  (The original default was the module-level ``seed`` global,
    which is bound at definition time anyway; a literal default is
    behaviorally identical and removes the fragile global dependency.)

    Seeds Python's ``random``, NumPy, and PyTorch (CPU plus all CUDA
    devices), and switches cuDNN into deterministic, non-benchmarking mode.

    :param seed: integer seed applied to all RNGs.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # The CUDA seeding calls are safe no-ops on CPU-only machines.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Deterministic cuDNN kernels: reproducible results at some speed cost.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
| 48 |
+
|
| 49 |
+
|
| 50 |
+
if __name__ == '__main__':

    # Root directory for checkpoints and per-dataset logs.
    path = './saved_models/'

    parser = argparse.ArgumentParser()
    parser.add_argument('--bert_model_dir', type=str, default='')
    parser.add_argument('--bert_tokenizer_dir', type=str, default='')

    # Model dimensions.
    parser.add_argument('--bert_dim', type = int, default=1024)
    parser.add_argument('--hidden_dim', type = int, default=300)
    parser.add_argument('--mlp_layers', type=int, default=2, help='Number of output mlp layers.')
    parser.add_argument('--gnn_layers', type=int, default=2, help='Number of gnn layers.')
    parser.add_argument('--emb_dim', type=int, default=1024, help='Feature size.')

    parser.add_argument('--attn_type', type=str, default='rgcn', choices=['dotprod','linear','bilinear', 'rgcn'], help='Feature size.')
    parser.add_argument('--no_rel_attn', action='store_true', default=False, help='no relation for edges' )

    parser.add_argument('--max_sent_len', type=int, default=200,
                        help='max content length for each text, if set to 0, then the max length has no constrain')

    parser.add_argument('--no_cuda', action='store_true', default=False, help='does not use GPU')

    parser.add_argument('--dataset_name', default='IEMOCAP', type= str, help='dataset name, IEMOCAP or MELD or DailyDialog')

    # Context-window sizes for graph edge construction.
    parser.add_argument('--windowps', type=int, default=1,
                        help='context window size for constructing edges in graph model for past utterances for short')
    parser.add_argument('--windowpl', type=int, default=5,
                        help='context window size for constructing edges in graph model for past utterances for long')

    parser.add_argument('--windowf', type=int, default=0,
                        help='context window size for constructing edges in graph model for future utterances')

    parser.add_argument('--max_grad_norm', type=float, default=5.0, help='Gradient clipping.')

    parser.add_argument('--lr', type=float, default=1e-3, metavar='LR', help='learning rate')

    parser.add_argument('--dropout', type=float, default=0, metavar='dropout', help='dropout rate')

    parser.add_argument('--batch_size', type=int, default=16, metavar='BS', help='batch size')

    parser.add_argument('--epochs', type=int, default=20, metavar='E', help='number of epochs')

    parser.add_argument('--tensorboard', action='store_true', default=False, help='Enables tensorboard log')

    parser.add_argument('--nodal_att_type', type=str, default=None, choices=['global','past'], help='type of nodal attention')

    # Curriculum-learning ("baby step") options; bucket_number controls how
    # many incremental training buckets exist (0 disables the schedule).
    parser.add_argument('--curriculum', action='store_true', default=False, help='Enables curriculum learning')

    parser.add_argument('--bucket_number', type=int, default=0)

    parser.add_argument('--max_epoch_per_baby_step', type=int, default=0)

    parser.add_argument('--diffloss', type=float , default=0.1, help='diffloss beta')

    args = parser.parse_args()
    print(args)

    seed_everything()

    args.cuda = torch.cuda.is_available() and not args.no_cuda

    if args.cuda:
        print('Running on GPU')
    else:
        print('Running on CPU')

    if args.tensorboard:
        from tensorboardX import SummaryWriter

        writer = SummaryWriter()

    # Log file lives under ./saved_models/<dataset>/; the directory must
    # already exist or FileHandler creation fails.
    logger = get_logger(path + args.dataset_name + '/logging.log')
    logger.info('start training on GPU {}!'.format(os.environ["CUDA_VISIBLE_DEVICES"]))
    logger.info(args)

    cuda = args.cuda
    n_epochs = args.epochs
    batch_size = args.batch_size
    # The train loader is (re)built every epoch below to support the
    # curriculum schedule; only valid/test loaders are created here.
    valid_loader, test_loader, speaker_vocab, label_vocab, person_vec = get_IEMOCAP_loaders(dataset_name=args.dataset_name, batch_size=batch_size, num_workers=0, args = args)
    n_classes = len(label_vocab['itos'])

    print('building model..')
    model = DAGERC_new_4(args, n_classes)
    # Class-name order used only for the per-class F1 report; must match the
    # label vocabulary order.  NOTE(review): verify against label_vocab['itos'].
    if args.dataset_name == 'IEMOCAP':
        class_labels = ['excitement', 'neutral', 'frustration', 'sadness', 'happiness', 'anger']
    else:
        class_labels = ['Neutral', 'Surprise', 'Fear', 'Sadness', 'Joy', 'Disgust', 'Anger']

    if torch.cuda.device_count() > 1:
        print('Multi-GPU...........')
        model = nn.DataParallel(model,device_ids = range(torch.cuda.device_count()))
    if cuda:
        model.cuda()

    # ignore_index=-1 skips padded utterance positions in the loss.
    loss_function = nn.CrossEntropyLoss(ignore_index=-1)
    optimizer = AdamW(model.parameters() , lr=args.lr)

    best_fscore,best_acc, best_loss, best_label, best_pred, best_mask = None,None, None, None, None, None
    all_fscore, all_acc, all_loss = [], [], []
    best_acc = 0.
    best_fscore = 0.
    best_epoch = 0
    best_model = None
    for e in range(n_epochs):
        start_time = time.time()
        #for curiculum learning
        # Baby-step schedule: epoch k trains on buckets 1..k until all
        # bucket_number buckets are included, then on the full set.
        if e + 1 < args.bucket_number:
            train_loader = get_train_loader(dataset_name=args.dataset_name, batch_size=batch_size, num_workers=0,
                                            args=args, babystep_index=e + 1)
        else:
            train_loader = get_train_loader(dataset_name=args.dataset_name, batch_size=batch_size, num_workers=0,
                                            args=args, babystep_index=args.bucket_number)
        if args.dataset_name == 'DailyDialog':
            # DailyDialog is evaluated with micro/macro F1 (6-value returns
            # from train_or_eval_model).
            train_loss, train_acc, _, _, train_micro_fscore, train_macro_fscore = train_or_eval_model(model,
                                                                                                      loss_function,
                                                                                                      train_loader, e,
                                                                                                      cuda,
                                                                                                      args, optimizer,
                                                                                                      True)
            valid_loss, valid_acc, _, _, valid_micro_fscore, valid_macro_fscore = train_or_eval_model(model,
                                                                                                      loss_function,
                                                                                                      valid_loader, e,
                                                                                                      cuda, args)
            test_loss, test_acc, test_label, test_pred, test_micro_fscore, test_macro_fscore = train_or_eval_model(
                model, loss_function, test_loader, e, cuda, args)

            all_fscore.append([valid_micro_fscore, test_micro_fscore, valid_macro_fscore, test_macro_fscore])

            # NOTE(review): this branch never updates best_fscore/best_model,
            # so the torch.save below would save None for DailyDialog runs.
            logger.info( 'Epoch: {}, train_loss: {}, train_acc: {}, train_micro_fscore: {}, train_macro_fscore: {}, valid_loss: {}, valid_acc: {}, valid_micro_fscore: {}, valid_macro_fscore: {}, test_loss: {}, test_acc: {}, test_micro_fscore: {}, test_macro_fscore: {}, time: {} sec'. \
                format(e + 1, train_loss, train_acc, train_micro_fscore, train_macro_fscore, valid_loss, valid_acc, valid_micro_fscore, valid_macro_fscore, test_loss, test_acc,
                       test_micro_fscore, test_macro_fscore, round(time.time() - start_time, 2)))

        else:
            # IEMOCAP / MELD path: weighted F1 plus per-class F1 and macro F1
            # (7-value returns from train_or_eval_model).
            train_loss, train_acc, _, _, train_fscore, _ , _ = train_or_eval_model(model, loss_function,
                                                                                   train_loader, e, cuda,
                                                                                   args, optimizer, True)
            valid_loss, valid_acc, _, _, valid_fscore, _ , _= train_or_eval_model(model, loss_function,
                                                                                  valid_loader, e, cuda, args)
            test_loss, test_acc, test_label, test_pred, test_fscore, test_f1_per_class, avg_macro_fscore= train_or_eval_model(model,loss_function, test_loader, e, cuda, args)

            all_fscore.append([valid_fscore, test_fscore])

            logger.info(
                'Epoch: {}, train_loss: {}, train_acc: {}, train_fscore: {}, valid_loss: {}, valid_acc: {}, valid_fscore: {}, test_loss: {}, test_acc: {}, test_fscore: {}, avg_macro_fscore: {}, time: {} sec'. \
                format(e + 1, train_loss, train_acc, train_fscore, valid_loss, valid_acc, valid_fscore, test_loss,
                       test_acc,
                       test_fscore, avg_macro_fscore, round(time.time() - start_time, 2)))

            f1_with_labels = {label: f1 for label, f1 in zip(class_labels, test_f1_per_class)}

            logger.info(f"Test F1 per class: {f1_with_labels}")

            # NOTE(review): checkpoint selection is driven by *test* F1, not
            # validation F1 — confirm this is intentional.
            if (test_fscore > best_fscore):
                best_fscore = test_fscore
                best_model = copy.deepcopy(model.state_dict())
                # print(test_fscore)
                # print(best_model)
                best_epoch = e + 1
                # torch.save(model.state_dict(), path + args.dataset_name + '/model_' + str(e) + '_' + str(test_acc)+ '.pkl')

        # NOTE(review): no effect — the for statement rebinds e each iteration.
        e += 1
    #save model
    torch.save(best_model, path + args.dataset_name + '/model_' + str(best_epoch) + '_' + str(best_fscore) + '_' + str(
        args.gnn_layers) + '.pkl')
    # print(best_model)
    if args.tensorboard:
        writer.close()

    logger.info('finish training!')

    #print('Test performance..')
    # Sort epoch records by validation score (then test score) descending.
    all_fscore = sorted(all_fscore, key=lambda x: (x[0],x[1]), reverse=True)
    #print('Best F-Score based on validation:', all_fscore[0][1])
    #print('Best F-Score based on test:', max([f[1] for f in all_fscore]))

    #logger.info('Test performance..')
    #logger.info('Best F-Score based on validation:{}'.format(all_fscore[0][1]))
    #logger.info('Best F-Score based on test:{}'.format(max([f[1] for f in all_fscore])))

    if args.dataset_name=='DailyDialog':
        logger.info('Best micro/macro F-Score based on validation:{}/{}'.format(all_fscore[0][1],all_fscore[0][3]))
        all_fscore = sorted(all_fscore, key=lambda x: x[1], reverse=True)
        logger.info('Best micro/macro F-Score based on test:{}/{}'.format(all_fscore[0][1],all_fscore[0][3]))
    else:
        logger.info('Best F-Score based on validation:{}'.format(all_fscore[0][1]))
        logger.info('Best F-Score based on test:{}'.format(max([f[1] for f in all_fscore])))

    #save_badcase(best_model, test_loader, cuda, args, speaker_vocab, label_vocab)
|
| 239 |
+
|
saved_models/IEMOCAP/README.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
请在此处存放训练后IEMOCAP的模型。
|
| 2 |
+
Please store the trained IEMOCAP model here.
|
saved_models/MELD/README.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
请在此处存放训练后MELD的模型。
|
| 2 |
+
Please store the trained MELD model here.
|
saved_models/README.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
请在此处的IEMOCAP和MELD文件夹中存放训练后的模型。
|
| 2 |
+
Please store the trained models in the IEMOCAP and MELD folders here.
|
similarity_matrix.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
# Coordinates of emotion labels on a 2-D circumplex: x is valence
# (negative = unpleasant), y is arousal.  Most labels sit on the unit
# circle at odd multiples of pi/20; near-synonyms (happiness/joy,
# pride/elation, surprise/interest, anger/irritation,
# disappointment/frustration) deliberately share a position.
# "boredom" is placed inside the circle and "neutral" at the origin.
# NOTE(review): layout resembles a Scherer-style emotion wheel — confirm
# the intended reference.
emotion_positions = {
    "pleasure": (np.cos(np.pi / 20), np.sin(np.pi / 20)),
    "happiness": (np.cos(3 * np.pi / 20), np.sin(3 * np.pi / 20)),
    "joy": (np.cos(3 * np.pi / 20), np.sin(3 * np.pi / 20)),
    "pride": (np.cos(5 * np.pi / 20), np.sin(5 * np.pi / 20)),
    "elation": (np.cos(5 * np.pi / 20), np.sin(5 * np.pi / 20)),
    "excitement": (np.cos(7 * np.pi / 20), np.sin(7 * np.pi / 20)),
    "surprise": (np.cos(9 * np.pi / 20), np.sin(9 * np.pi / 20)),
    "interest": (np.cos(9 * np.pi / 20), np.sin(9 * np.pi / 20)),
    "anger": (-np.cos(9 * np.pi / 20), np.sin(9 * np.pi / 20)),
    "irritation": (-np.cos(9 * np.pi / 20), np.sin(9 * np.pi / 20)),
    "hate": (-np.cos(7 * np.pi / 20), np.sin(7 * np.pi / 20)),
    "contempt": (-np.cos(5 * np.pi / 20), np.sin(5 * np.pi / 20)),
    "disgust": (-np.cos(3 * np.pi / 20), np.sin(3 * np.pi / 20)),
    "fear": (-np.cos(np.pi / 20), np.sin(np.pi / 20)),
    "boredom": (-0.5, 0),
    "disappointment": (-np.cos(np.pi / 20), -np.sin(np.pi / 20)),
    "frustration": (-np.cos(np.pi / 20), -np.sin(np.pi / 20)),
    "shame": (-np.cos(3 * np.pi / 20), -np.sin(3 * np.pi / 20)),
    "regret": (-np.cos(5 * np.pi / 20), -np.sin(5 * np.pi / 20)),
    "guilt": (-np.cos(7 * np.pi / 20), -np.sin(7 * np.pi / 20)),
    "sadness": (-np.cos(9 * np.pi / 20), -np.sin(9 * np.pi / 20)),
    "compassion": (np.cos(9 * np.pi / 20), -np.sin(9 * np.pi / 20)),
    "relief": (np.cos(7 * np.pi / 20), -np.sin(7 * np.pi / 20)),
    "admiration": (np.cos(5 * np.pi / 20), -np.sin(5 * np.pi / 20)),
    "love": (np.cos(3 * np.pi / 20), -np.sin(3 * np.pi / 20)),
    "contentment": (np.cos(np.pi / 20), -np.sin(np.pi / 20)),
    "neutral": (0, 0)
}
|
| 33 |
+
|
| 34 |
+
# 计算两个情感标签之间的余弦相似度 Compute the cosine similarity between two sentiment labels.
|
| 35 |
+
def cosine_similarity(p1, p2):
    """Cosine similarity between two position vectors.

    Returns 0.0 when either vector has zero length (e.g. the "neutral"
    label at the origin), otherwise dot(p1, p2) / (|p1| * |p2|).
    """
    length_a = np.linalg.norm(p1)
    length_b = np.linalg.norm(p2)
    if length_a == 0 or length_b == 0:
        return 0.0
    return np.dot(p1, p2) / (length_a * length_b)
|
| 43 |
+
|
| 44 |
+
# 计算情感标签之间的相似度矩阵 Compute the similarity matrix between sentiment labels.
|
| 45 |
+
def compute_similarity_matrix(emotion_positions, n_dataset):
    """Build a pairwise similarity matrix over the emotion labels.

    Similarity rules, with "valence" meaning the x coordinate of a label's
    position:
      * opposite valence polarity   -> 0
      * valence product exactly 0   -> 1 / n_dataset
      * same polarity               -> cosine similarity clamped to >= 0
    The "neutral" label is special-cased afterwards: its similarity to every
    other label is fixed at 1 / n_dataset.  The diagonal is left at 0.

    :param emotion_positions: dict mapping label -> (x, y) position
    :param n_dataset: total number of emotion classes in the target dataset (N)
    :return: (n x n numpy array, dict mapping label -> row/column index)
    """
    names = list(emotion_positions.keys())
    n = len(names)
    emotion_to_index = {name: idx for idx, name in enumerate(names)}
    neutral_idx = names.index("neutral")
    sim = np.zeros((n, n))

    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            p_i = emotion_positions[names[i]]
            p_j = emotion_positions[names[j]]
            valence_product = p_i[0] * p_j[0]
            if valence_product < 0:
                # Opposite valence polarity: treat as completely dissimilar.
                sim[i][j] = 0
            elif valence_product == 0:
                # At least one label has zero valence.
                sim[i][j] = 1 / n_dataset
            else:
                # Same polarity: cosine similarity, clamped to be non-negative.
                sim[i][j] = max(cosine_similarity(np.array(p_i), np.array(p_j)), 0)

    # "neutral" is equally (weakly) similar to every other label.
    for k in range(n):
        if k != neutral_idx:
            sim[neutral_idx][k] = 1 / n_dataset
            sim[k][neutral_idx] = 1 / n_dataset

    return sim, emotion_to_index
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def get_similarity_matrix(dataset):
    """Return (similarity_matrix, emotion_to_index) for a dataset.

    N — the number of emotion classes used for the 1/N similarity of
    valence-neutral pairs — is 6 for IEMOCAP and 7 for any other dataset
    (e.g. MELD).  Entries near 1 mean similar emotions; entries near 0
    mean dissimilar ones.
    """
    num_classes = 6 if dataset == 'IEMOCAP' else 7
    return compute_similarity_matrix(emotion_positions, num_classes)
|
trainer.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np, argparse, time, pickle, random
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.optim as optim
|
| 5 |
+
from torch.utils.data import DataLoader
|
| 6 |
+
from torch.utils.data.sampler import SubsetRandomSampler
|
| 7 |
+
from dataloader import IEMOCAPDataset
|
| 8 |
+
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, classification_report, \
|
| 9 |
+
precision_recall_fscore_support
|
| 10 |
+
from utils import person_embed
|
| 11 |
+
from tqdm import tqdm
|
| 12 |
+
import json
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def train_or_eval_model(model, loss_function, dataloader,epoch, cuda, args, optimizer=None, train=False):
    """Run one epoch of training (train=True) or evaluation over dataloader.

    Returns, for IEMOCAP/MELD/EmoryNLP:
        (avg_loss, avg_accuracy, labels, preds, weighted_f1, f1_per_class, macro_f1)
    otherwise (e.g. DailyDialog):
        (avg_loss, avg_accuracy, labels, preds, micro_f1, macro_f1)
    Positions with label -1 (padding) are excluded from all metrics.
    """
    losses, preds, labels = [], [], []
    scores, vids = [], []


    # Training requires an optimizer.
    assert not train or optimizer != None
    if train:
        model.train()
        # dataloader = tqdm(dataloader)
    else:
        model.eval()

    cnt = 0
    for data in dataloader:
        if train:
            optimizer.zero_grad()
        # text_ids, text_feature, speaker_ids, labels, umask = [d.cuda() for d in data] if cuda else data
        # Batch layout produced by the loader: utterance features, labels,
        # two adjacency matrices, speaker masks, lengths, speakers, raw text.
        features, label, adj_1, adj_2, s_mask, s_mask_onehot,lengths, speakers, utterances = data
        # speaker_vec = person_embed(speaker_ids, person_vec)
        if cuda:
            features = features.cuda()
            label = label.cuda()
            adj_1 = adj_1.cuda()
            adj_2 = adj_2.cuda()
            s_mask = s_mask.cuda()
            s_mask_onehot = s_mask_onehot.cuda()
            lengths = lengths.cuda()


        # print(speakers)
        # The model returns per-utterance class scores plus an auxiliary
        # difference loss that is added to the CE loss below.
        log_prob, diff_loss = model(features, adj_1, adj_2, s_mask, s_mask_onehot, lengths) # (B, N, C)
        # print(label)
        # CrossEntropyLoss expects (B, C, N), hence the permute.
        loss = loss_function(log_prob.permute(0,2,1), label)+ diff_loss
        '''
        # print(speakers)
        log_prob = model(features, adj_1, adj_2, s_mask, s_mask_onehot, lengths) # (B, N, C)
        # print(label)
        loss = loss_function(log_prob.permute(0,2,1), label)
        '''
        label = label.cpu().numpy().tolist()
        pred = torch.argmax(log_prob, dim = 2).cpu().numpy().tolist()
        preds += pred
        labels += label
        losses.append(loss.item())


        if train:
            loss_val = loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            if args.tensorboard:
                # NOTE(review): `writer` is not defined in this module; with
                # args.tensorboard enabled this raises NameError unless a
                # global writer is injected — confirm.
                for param in model.named_parameters():
                    writer.add_histogram(param[0], param[1].grad, epoch)
            optimizer.step()

    # Flatten batched label/pred lists, dropping padded positions (-1).
    if preds != []:
        new_preds = []
        new_labels = []
        for i,label in enumerate(labels):
            for j,l in enumerate(label):
                if l != -1:
                    new_labels.append(l)
                    new_preds.append(preds[i][j])
    else:
        # NOTE(review): this 10-tuple does not match the 7- or 6-value
        # success returns below — callers unpacking it would fail; confirm.
        return float('nan'), float('nan'), [], [], float('nan'), [], [], [], [], []

    # print(preds.tolist())
    # print(labels.tolist())
    avg_loss = round(np.sum(losses) / len(losses), 4)
    avg_accuracy = round(accuracy_score(new_labels, new_preds) * 100, 2)
    if args.dataset_name in ['IEMOCAP', 'MELD', 'EmoryNLP']:
        avg_fscore = round(f1_score(new_labels, new_preds, average='weighted') * 100, 2)
        f1_per_class = f1_score(new_labels, new_preds, average=None) # List of F1 scores for each class
        avg_macro_fscore = round(f1_score(new_labels, new_preds, average='macro') * 100, 2)
        return avg_loss, avg_accuracy, labels, preds, avg_fscore, f1_per_class, avg_macro_fscore
    else:
        # Micro F1 over labels 1..6 only (class 0 excluded — presumably the
        # DailyDialog "neutral" class; verify against the label vocabulary).
        avg_micro_fscore = round(f1_score(new_labels, new_preds, average='micro', labels=list(range(1, 7))) * 100, 2)
        avg_macro_fscore = round(f1_score(new_labels, new_preds, average='macro') * 100, 2)
        return avg_loss, avg_accuracy, labels, preds, avg_micro_fscore, avg_macro_fscore
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def save_badcase(model, dataloader, cuda, args, speaker_vocab, label_vocab):
    """Evaluate *model* on *dataloader* and dump per-utterance predictions.

    Writes badcase/<dataset_name>.json — a list of dialogs, each a list of
    {'text', 'speaker', 'label', 'pred'} dicts — then prints summary F1.
    NOTE(review): the 'badcase/' directory must already exist, and the
    model here is called with 5 arguments (single adjacency, no diff loss)
    unlike the 6-argument call in train_or_eval_model — confirm which model
    interface this function targets.
    """
    preds, labels = [], []
    scores, vids = [], []
    dialogs = []
    speakers = []

    model.eval()

    for data in dataloader:

        # text_ids, text_feature, speaker_ids, labels, umask = [d.cuda() for d in data] if cuda else data
        # Single-adjacency batch layout (differs from train_or_eval_model).
        features, label, adj,s_mask, s_mask_onehot,lengths, speaker, utterances = data
        # speaker_vec = person_embed(speaker_ids, person_vec)
        if cuda:
            features = features.cuda()
            label = label.cuda()
            adj = adj.cuda()
            s_mask_onehot = s_mask_onehot.cuda()
            s_mask = s_mask.cuda()
            lengths = lengths.cuda()


        # print(speakers)
        log_prob = model(features, adj,s_mask, s_mask_onehot, lengths) # (B, N, C)


        label = label.cpu().numpy().tolist() # (B, N)
        pred = torch.argmax(log_prob, dim = 2).cpu().numpy().tolist() # (B, N)
        preds += pred
        labels += label
        dialogs += utterances
        speakers += speaker

    # finished here

    # Flatten, dropping padded positions (-1), for metric computation.
    if preds != []:
        new_preds = []
        new_labels = []
        for i,label in enumerate(labels):
            for j,l in enumerate(label):
                if l != -1:
                    new_labels.append(l)
                    new_preds.append(preds[i][j])
    else:
        return

    # Rebuild human-readable records per dialog/utterance.
    cases = []
    for i,d in enumerate(dialogs):
        case = []
        for j,u in enumerate(d):
            case.append({
                'text': u,
                'speaker': speaker_vocab['itos'][speakers[i][j]],
                'label': label_vocab['itos'][labels[i][j]] if labels[i][j] != -1 else 'none',
                'pred': label_vocab['itos'][preds[i][j]]
            })
        cases.append(case)

    with open('badcase/%s.json'%(args.dataset_name), 'w', encoding='utf-8') as f:
        json.dump(cases,f)

    # print(preds.tolist())
    # print(labels.tolist())
    avg_accuracy = round(accuracy_score(new_labels, new_preds) * 100, 2)
    if args.dataset_name in ['IEMOCAP', 'MELD', 'EmoryNLP']:
        avg_fscore = round(f1_score(new_labels, new_preds, average='weighted') * 100, 2)
        print('badcase saved')
        print('test_f1', avg_fscore)
        return
    else:
        # Micro F1 over labels 1..6 only, mirroring train_or_eval_model.
        avg_micro_fscore = round(f1_score(new_labels, new_preds, average='micro', labels=list(range(1, 7))) * 100, 2)
        avg_macro_fscore = round(f1_score(new_labels, new_preds, average='macro') * 100, 2)
        print('badcase saved')
        print('test_micro_f1', avg_micro_fscore)
        print('test_macro_f1', avg_macro_fscore)
        return
|
utils.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
|
| 4 |
+
def person_embed(speaker_ids, person_vec):
    '''
    Look up a fixed person embedding for every speaker id.

    :param speaker_ids: torch.Tensor (T, B) of integer speaker ids;
        -1 marks a padded position and maps to an all-zero vector
    :param person_vec: numpy array (num_speakers, 100) of embeddings
    :return:
        speaker_vec: torch.FloatTensor (T, B, 100)
    '''
    zero_vec = [0] * 100
    rows = [
        [person_vec[int(sid)].tolist() if sid != -1 else zero_vec for sid in step]
        for step in speaker_ids
    ]
    return torch.FloatTensor(rows)
|