zhanyil2 committed on
Commit
e5a4e3d
·
1 Parent(s): 9a0dcdf

Upload 12 files

Browse files
AT.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18d18fb859fd20f85510eb20f667bf7766438b9072c446365fa074bdf4f7b325
3
+ size 3228
NN.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ from torch.utils.data import Dataset, DataLoader
3
+
4
class OffensiveLanguageDataset(Dataset):
    """Map-style dataset exposing pre-vectorized samples and their labels.

    The containers are stored as-is (no copying or preprocessing);
    indexing yields ``(sample, label)`` tuples.
    """

    def __init__(self, data, labels):
        """Keep references to the feature container and the label container."""
        self.data = data
        self.labels = labels

    def __len__(self):
        """The dataset size equals the number of feature rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Fetch the sample/label pair stored at position *idx*."""
        sample = self.data[idx]
        label = self.labels[idx]
        return sample, label
15
+
16
class OffensiveLanguageClassifier(nn.Module):
    """Bidirectional-LSTM text classifier.

    Args:
        vocab_size: dimensionality of each input step (one-hot vocabulary size).
        hidden_size: LSTM hidden state size per direction.
        output_size: number of target classes.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability applied between LSTM layers.
    """

    def __init__(self, vocab_size, hidden_size, output_size, num_layers, dropout):
        super(OffensiveLanguageClassifier, self).__init__()
        # Bidirectional, so the LSTM output carries 2 * hidden_size features.
        self.bilstm = nn.LSTM(input_size=vocab_size, hidden_size=hidden_size, num_layers=num_layers, bidirectional=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size * 2, output_size)
        # Kept so existing checkpoints/state_dicts still load.
        self.fc1 = nn.Linear(hidden_size * 2, output_size)
        # BUGFIX: forward() used self.relu without ever defining it.
        self.relu = nn.ReLU()

    def forward(self, input):
        """Classify a batch of sequences.

        input: (seq_len, batch, vocab_size) float tensor (batch_first=False).
        Returns: (batch, output_size) unnormalized class logits.
        """
        # BUGFIX: the original forward referenced undefined self.fc2 and
        # self.relu (AttributeError on first call) and never ran the LSTM.
        lstm_out, _ = self.bilstm(input)
        # Use the final timestep's representation for classification.
        logits = self.fc(lstm_out[-1])
        return logits
28
+
29
+
NT.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7856cd610493a88652fad0b09002919e2a21f4bcc4627b485676b3e7b0f877ea
3
+ size 29094
__pycache__/NN.cpython-37.pyc ADDED
Binary file (1.63 kB). View file
 
__pycache__/NN.cpython-39.pyc ADDED
Binary file (1.42 kB). View file
 
__pycache__/process_data.cpython-37.pyc ADDED
Binary file (1.82 kB). View file
 
__pycache__/process_data.cpython-39.pyc ADDED
Binary file (1.52 kB). View file
 
labeled_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
main.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Training entry point: fits OffensiveLanguageClassifier on the training
# split produced by process_data (importing `train` runs that pipeline
# as a side effect).
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader
from NN import OffensiveLanguageClassifier, OffensiveLanguageDataset

# Set the device to use for training
from process_data import train

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Hyperparameters. NOTE(review): vocab_size is hard-coded; it must match
# the CountVectorizer vocabulary built in process_data — verify.
batch_size = 2
vocab_size = 23885
hidden_size = 128
output_size = 3
num_layers = 2
num_epochs = 2

# Create the model and move it to the device
model = OffensiveLanguageClassifier(vocab_size, hidden_size, output_size, num_layers, dropout = 0.3)
model.to(device)

# Define the loss function and the optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Create the DataLoader
# NOTE(review): train[0] selects the column labeled 0 and train["class"]
# the label column — confirm these are the intended features/labels.
train_dataset = OffensiveLanguageDataset(train[0], train["class"])
#print(train_dataset.shape)
#print(train_dataset.head(10))

dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
print(type(dataloader))
# Train the model
for epoch in range(num_epochs):
    #print(dataloader)
    #train_features, train_labels = next(iter(dataloader)
    for data , labels in dataloader:
        #print(data)
        #print(labels)
        #data, labels = data.to(device), labels.to(device)

        # Forward pass
        #print(type(data[0]))
        # NOTE(review): assumes each batch item is a tensor of equal shape;
        # verify what the default collate produces for this dataset.
        data = torch.stack(data)
        logits = model(data)
        loss = loss_fn(logits, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print the loss and accuracy at the end of each epoch
    # NOTE(review): only the last batch's loss is printed; no accuracy yet.
    print(f'Epoch {epoch+1}: loss = {loss:.4f}')
58
+
59
+
60
+
61
+
process_data.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ import nltk as nltk
4
+ import numpy as np
5
+ import pandas as pd
6
+ from gensim.models import Word2Vec
7
+ from sklearn.feature_extraction.text import CountVectorizer
8
+
9
# Load the raw labeled tweets; column 6 is assumed to hold the tweet
# text — TODO confirm against labeled_data.csv's header.
df = pd.read_csv("./labeled_data.csv")
print("Finished loading data from labeled_data.csv")

# Data cleansing: strip @mentions, URLs, non-letters, and retweet markers.
tweets = df.iloc[:,6]
texts = []
for iterrow in tweets.items():
    text = iterrow[1]  # Series.items() yields (index, value) pairs
    text = re.sub(r'\@.*\:', "",text)  # drop "@user ... :" prefixes
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', "", text, flags=re.MULTILINE)  # drop URLs
    text = re.sub(r'[^A-Za-z ]+', "",text)  # keep letters and spaces only
    text = re.sub(r'RT', "",text)  # drop retweet markers
    texts.append(text)

# Vectorize the cleaned text with 1..5-gram counts, English stop words removed.
df_1 = df.iloc[:,:6]  # the original metadata/label columns
df_2 = pd.DataFrame(texts)
print(df_2)
count = CountVectorizer()  # NOTE(review): dead assignment, overwritten on the next line
count = CountVectorizer(stop_words='english', ngram_range=(1,5))
count.fit(df_2[0])
X_train_vectorizer=count.transform(df_2[0])
# NOTE(review): toarray() densifies the sparse matrix — may be very large
# for a 1..5-gram vocabulary.
df_2 = pd.DataFrame(X_train_vectorizer.toarray())
df_cleaned = pd.concat([df_1,df_2],axis=1)
32
+
33
+ # Data splitting
34
def train_validate_test_split(df_local, train_percent=.6, validate_percent=.2, seed=None):
    """Randomly partition *df_local* into train/validate/test frames.

    The split is 60/20/20 by default; the test slice receives whatever
    remains after the first two cuts. Pass *seed* for reproducibility.
    Returns the three DataFrames as ``(train, validate, test)``.
    """
    np.random.seed(seed)
    shuffled = np.random.permutation(df_local.index)
    n_rows = len(df_local.index)
    cut_train = int(train_percent * n_rows)
    cut_validate = int(validate_percent * n_rows) + cut_train
    train_part = df_local.iloc[shuffled[:cut_train]]
    validate_part = df_local.iloc[shuffled[cut_train:cut_validate]]
    test_part = df_local.iloc[shuffled[cut_validate:]]
    return train_part, validate_part, test_part
+
45
# Split 60/20/20, then drop NaN rows and renumber each frame from 0.
train, validate, test = train_validate_test_split(df_cleaned)
train = train.dropna(axis=0).reset_index(drop=True)
validate = validate.dropna(axis=0).reset_index(drop=True)
test = test.dropna(axis=0).reset_index(drop=True)

# Construct a dictionary
# 1. Traverse each word in the dataset, store them in a dictionary
#    the dictionary will be used for one-hot encoding
# 2. Calculate the maximum number of words that a sentence contains
train_tweets = train.iloc[:,6]
word_set = set()

max_len = 0
curr_len = 0
for line in train_tweets.items():
    # NOTE(review): the PREVIOUS tweet's word count is compared here, so
    # the last tweet's length is never checked against max_len (off-by-one).
    if curr_len > max_len:
        max_len = curr_len
    curr_len = 0
    for word in line[1].split():
        word_set.add(word)
        curr_len += 1

dictionary = list(word_set)
# max_len: 33
# len(dictionary):


# # Load the word2vec model
# model = Word2Vec.load("word2vec.model")
#
# # Convert the text to a list of words
# words = nltk.word_tokenize(text)
#
# # Convert the words to word vectors using the word2vec model
# vectors = [model.wv[word] for word in words]
read_graph.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Prepare the graph data.
import osmnx as ox


def get_graph():
    """Load and return the road-network graph stored in dataset/G_new.graphml."""
    G = ox.load_graphml('dataset/G_new.graphml')
    # Add lon/lat features to the nodes and edges
    # nodes, edges = assign_edge_attr(G)
    # Read the current road network
    # import networkx as nx
    # G_new = nx.Graph()
    # import tqdm
    # print("start reading nodes")
    # pos_location = {}
    # for node_id, row in nodes.iterrows():
    #     G_new.add_node(node_id, y=row['y'], x=row['x'])  # node id, node lat/lon
    #     pos_location[node_id] = (row['y'], row['x'])
    # e_cnt = 0
    # print("start reading edges")
    # for node_id_1, node_id_2, _ in G.edges:
    #     G_new.add_edge(node_id_1, node_id_2)  # edge: node ids
    #
    #
    # # Remove self-loops
    # G_new.remove_edges_from(nx.selfloop_edges(G_new))
    return G
word2tensor.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+
3
+ import torch
4
+ import numpy as np
5
+ from torch.utils.data import Dataset, DataLoader
6
+ from torch.nn.functional import one_hot,softmax
7
+ import matplotlib.pyplot as plt
8
+ import random
9
+ import torch.utils.data as data
10
+ teacher_forcing_ratio = 0.5
11
+
12
+
13
def get_data_set(mode="train"):
    """Assemble trajectory tensors for anomaly classification.

    Loads normal trajectories (NT.pkl) and anomalous ones (AT.pkl), pads
    every trajectory to the longest observed length, one-hot encodes each
    point's node id against the road-network node vocabulary, and returns
    ``(one-hot tensor, node-index tensor, label tensor, max length)``.

    mode: "train" uses only the normal trajectories (all labeled 1); any
    other value mixes normal (1) and anomalous (0) trajectories.
    """
    NT,AT = None,None
    # NOTE(review): pickle.load is unsafe on untrusted files — these are
    # assumed to be locally produced artifacts.
    with open(r'dataset/'+"NT.pkl","rb") as f1:
        NT = pickle.load(f1)

    with open(r'dataset/'+"AT.pkl","rb") as f2:
        AT = pickle.load(f2)


    # NOTE(review): this definition is shadowed by the second onehot_encode
    # defined further below and is never used.
    def onehot_encode(char, vocab):
        # one hot encode a given text
        encoded = [0 for _ in range(len(vocab))] #[0,0,1,0,000]
        encoded[vocab.index(char)] = 1
        return encoded

    # Build the node vocabulary from the road network and cache it on disk.
    from read_graph import get_graph
    import networkx as nx
    G_new = get_graph()
    voc = list(G_new)
    with open('nodes.pkl', 'wb') as f:
        pickle.dump(voc, f, pickle.HIGHEST_PROTOCOL)

    voc = None
    with open('nodes.pkl', 'rb') as f:
        voc = pickle.load(f)

    voc.append(0)    # padding symbol
    voc.append('s')  # START
    voc.append('e')  # EOF

    total_word_count = len(voc)


    # Trajectory labels
    samples = []
    labels = []
    if mode=="train":
        for tr in NT:
            samples.append(tr)
            labels.append(1)  # normal
    else:
        for tr in NT:
            samples.append(tr)
            labels.append(1)  # normal
        for tr in AT:
            samples.append(tr)
            labels.append(0)  # anomalous

    def padding(x,max_length):
        # Truncate, or right-pad with [0,0] points, to exactly max_length.
        if len(x) > max_length:
            text = x[:max_length]
        else:
            text = x + [[0,0]] * (max_length - len(x))
        return text


    # Find the longest trajectory (floor of 10)
    max_len = 10
    for tr in samples:
        max_len = max(max_len,len(tr))
    samples_padded = []

    # Pad every trajectory to max_len
    for tr in samples:
        tr = padding(tr,max_len)
        samples_padded.append(tr)

    # One hot
    def onehot_encode(char, vocab):
        # one hot encode a given text; the pad symbol 0 encodes as all zeros
        encoded = [0 for _ in range(len(vocab))]
        if char != 0:
            encoded[vocab.index(char)] = 1
        return encoded

    samples_one_hot = []
    samples_index = []
    for tr in samples_padded:
        tr_rep = []
        tr_rep_index = []
        for pt in tr:
            # pt appears to be a [node_id, timestamp] pair — TODO confirm.
            spatial = onehot_encode(pt[0], voc)
            temporal = int(pt[1])  # NOTE(review): computed but never used
            tr_rep.append(spatial)
            tr_rep_index.append(voc.index(pt[0]))
        samples_one_hot.append(tr_rep)
        samples_index.append(tr_rep_index)

    sampletensor = torch.Tensor(samples_one_hot)
    sampletensor_index = torch.Tensor(samples_index)
    labeltensor = torch.Tensor(labels)
    # print("sampletensor.shape",sampletensor.shape)
    # print("labeltensor.shape",labeltensor.shape)
    return sampletensor,sampletensor_index,labeltensor,max_len
107
+
108
+ global device
109
+
110
+ if torch.cuda.is_available():
111
+ torch.backends.cudnn.enabled = False
112
+ device = torch.device("cuda:0")
113
+ torch.cuda.set_device(0)
114
+ import os
115
+ os.environ['CUDA_VISIBLE_DEVICES']='0'
116
+ print("Working on GPU")
117
+ torch.cuda.empty_cache()
118
+ else:
119
+ device = torch.device("cpu")
120
+
121
+ import torch.nn as nn
122
+ # from VAE import AE,RNN
123
+
124
+ if __name__ == '__main__':
125
+ sampletensor,sampletensor_index,labeltensor,max_len = get_data_set("train")
126
+
127
+ batch_size = 2
128
+ train_set = data.TensorDataset(sampletensor, sampletensor_index,labeltensor)
129
+ train_iter = data.DataLoader(train_set, batch_size, shuffle=False, drop_last=False)
130
+
131
+ # rnn = RNN(input_size=2694,hidden_size=64,batch_size=2,maxlen=max_len)
132
+ # loss = nn.CrossEntropyLoss()
133
+ # optimizer = torch.optim.Adamax(rnn.parameters(),lr=1e-2)
134
+ #
135
+ # net = rnn.to(device)
136
+ # num_epochs = 120
137
+ #
138
+ # h_hat_avg = None
139
+ #
140
+ # from tqdm import tqdm
141
+ # for epoch in tqdm(range(num_epochs)):
142
+ # epoch_total_loss = 0
143
+ # for x, x_label,y in train_iter:
144
+ # # RNN
145
+ # xhat,kld,h_hat = net(x,x,"train",None)
146
+ # # print(xhat.shape)
147
+ # # print(x_label.shape)
148
+ # len_all = (x_label.shape[0])*(x_label.shape[1])
149
+ # xhat = xhat.reshape(len_all,-1)
150
+ # x_label = x_label.reshape(len_all).long().to(device)
151
+ # # print(x_label)
152
+ # # print("xhat",xhat.shape)
153
+ # # print("x_label",x_label.shape)
154
+ # l = loss(xhat,x_label)
155
+ # # print("reconstruction loss:",l,"kld loss:",kld)
156
+ # total_loss = l + kld
157
+ # epoch_total_loss += total_loss
158
+ # optimizer.zero_grad()
159
+ # total_loss.backward()
160
+ # optimizer.step()
161
+ # if epoch == num_epochs - 1:
162
+ # if h_hat_avg is None:
163
+ # h_hat_avg = h_hat/ torch.full(h_hat.shape,len(sampletensor)).to(device)
164
+ # else:
165
+ # h_hat_avg += h_hat / torch.full(h_hat.shape, len(sampletensor)).to(device)
166
+ # print(">>> h_hat_avg",h_hat_avg.shape)
167
+ # print(" epoch_total_loss = ",epoch_total_loss)
168
+ #
169
+ # print("training ends")
170
+ # torch.save(net,"LSTM-VAE.pth")
171
+ # torch.save(h_hat_avg, 'h_hat_avg.pt')
172
+ #
173
+ #
174
+ #
175
+ #
176
+ #
177
+ #
178
+ #
179
+ #
180
+ #
181
+ #
182
+ #
183
+ #