zhanyil2 committed on
Commit
e5a4e3d
·
1 Parent(s): 9a0dcdf

Upload 12 files

Browse files
AT.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18d18fb859fd20f85510eb20f667bf7766438b9072c446365fa074bdf4f7b325
3
+ size 3228
NN.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ from torch.utils.data import Dataset, DataLoader
3
+
4
class OffensiveLanguageDataset(Dataset):
    """Map-style dataset exposing pre-vectorized samples and their labels.

    The containers are stored as-is (no copying or preprocessing);
    indexing yields ``(sample, label)`` tuples.
    """

    def __init__(self, data, labels):
        """Keep references to the feature container and the label container."""
        self.data = data
        self.labels = labels

    def __len__(self):
        """The dataset size equals the number of feature rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Fetch the sample/label pair stored at position *idx*."""
        sample = self.data[idx]
        label = self.labels[idx]
        return sample, label
15
+
16
class OffensiveLanguageClassifier(nn.Module):
    """Bidirectional-LSTM text classifier.

    Args:
        vocab_size: dimensionality of each input step (one-hot vocabulary size).
        hidden_size: LSTM hidden state size per direction.
        output_size: number of target classes.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability applied between LSTM layers.
    """

    def __init__(self, vocab_size, hidden_size, output_size, num_layers, dropout):
        super(OffensiveLanguageClassifier, self).__init__()
        # Bidirectional, so the LSTM output carries 2 * hidden_size features.
        self.bilstm = nn.LSTM(input_size=vocab_size, hidden_size=hidden_size, num_layers=num_layers, bidirectional=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size * 2, output_size)
        # Kept so existing checkpoints/state_dicts still load.
        self.fc1 = nn.Linear(hidden_size * 2, output_size)
        # BUGFIX: forward() used self.relu without ever defining it.
        self.relu = nn.ReLU()

    def forward(self, input):
        """Classify a batch of sequences.

        input: (seq_len, batch, vocab_size) float tensor (batch_first=False).
        Returns: (batch, output_size) unnormalized class logits.
        """
        # BUGFIX: the original forward referenced undefined self.fc2 and
        # self.relu (AttributeError on first call) and never ran the LSTM.
        lstm_out, _ = self.bilstm(input)
        # Use the final timestep's representation for classification.
        logits = self.fc(lstm_out[-1])
        return logits
28
+
29
+
NT.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7856cd610493a88652fad0b09002919e2a21f4bcc4627b485676b3e7b0f877ea
3
+ size 29094
__pycache__/NN.cpython-37.pyc ADDED
Binary file (1.63 kB). View file
 
__pycache__/NN.cpython-39.pyc ADDED
Binary file (1.42 kB). View file
 
__pycache__/process_data.cpython-37.pyc ADDED
Binary file (1.82 kB). View file
 
__pycache__/process_data.cpython-39.pyc ADDED
Binary file (1.52 kB). View file
 
labeled_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
main.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Training entry point: fits OffensiveLanguageClassifier on the training
# split produced by process_data (importing `train` runs that pipeline
# as a side effect).
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader
from NN import OffensiveLanguageClassifier, OffensiveLanguageDataset

# Set the device to use for training
from process_data import train

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Hyperparameters. NOTE(review): vocab_size is hard-coded; it must match
# the CountVectorizer vocabulary built in process_data — verify.
batch_size = 2
vocab_size = 23885
hidden_size = 128
output_size = 3
num_layers = 2
num_epochs = 2

# Create the model and move it to the device
model = OffensiveLanguageClassifier(vocab_size, hidden_size, output_size, num_layers, dropout = 0.3)
model.to(device)

# Define the loss function and the optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Create the DataLoader
# NOTE(review): train[0] selects the column labeled 0 and train["class"]
# the label column — confirm these are the intended features/labels.
train_dataset = OffensiveLanguageDataset(train[0], train["class"])
#print(train_dataset.shape)
#print(train_dataset.head(10))

dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
print(type(dataloader))
# Train the model
for epoch in range(num_epochs):
    #print(dataloader)
    #train_features, train_labels = next(iter(dataloader)
    for data , labels in dataloader:
        #print(data)
        #print(labels)
        #data, labels = data.to(device), labels.to(device)

        # Forward pass
        #print(type(data[0]))
        # NOTE(review): assumes each batch item is a tensor of equal shape;
        # verify what the default collate produces for this dataset.
        data = torch.stack(data)
        logits = model(data)
        loss = loss_fn(logits, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print the loss and accuracy at the end of each epoch
    # NOTE(review): only the last batch's loss is printed; no accuracy yet.
    print(f'Epoch {epoch+1}: loss = {loss:.4f}')
58
+
59
+
60
+
61
+
process_data.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ import nltk as nltk
4
+ import numpy as np
5
+ import pandas as pd
6
+ from gensim.models import Word2Vec
7
+ from sklearn.feature_extraction.text import CountVectorizer
8
+
9
# Load the raw labeled tweets; column 6 is assumed to hold the tweet
# text — TODO confirm against labeled_data.csv's header.
df = pd.read_csv("./labeled_data.csv")
print("Finished loading data from labeled_data.csv")

# Data cleansing: strip @mentions, URLs, non-letters, and retweet markers.
tweets = df.iloc[:,6]
texts = []
for iterrow in tweets.items():
    text = iterrow[1]  # Series.items() yields (index, value) pairs
    text = re.sub(r'\@.*\:', "",text)  # drop "@user ... :" prefixes
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', "", text, flags=re.MULTILINE)  # drop URLs
    text = re.sub(r'[^A-Za-z ]+', "",text)  # keep letters and spaces only
    text = re.sub(r'RT', "",text)  # drop retweet markers
    texts.append(text)

# Vectorize the cleaned text with 1..5-gram counts, English stop words removed.
df_1 = df.iloc[:,:6]  # the original metadata/label columns
df_2 = pd.DataFrame(texts)
print(df_2)
count = CountVectorizer()  # NOTE(review): dead assignment, overwritten on the next line
count = CountVectorizer(stop_words='english', ngram_range=(1,5))
count.fit(df_2[0])
X_train_vectorizer=count.transform(df_2[0])
# NOTE(review): toarray() densifies the sparse matrix — may be very large
# for a 1..5-gram vocabulary.
df_2 = pd.DataFrame(X_train_vectorizer.toarray())
df_cleaned = pd.concat([df_1,df_2],axis=1)
32
+
33
+ # Data splitting
34
def train_validate_test_split(df_local, train_percent=.6, validate_percent=.2, seed=None):
    """Randomly partition *df_local* into train/validate/test frames.

    The split is 60/20/20 by default; the test slice receives whatever
    remains after the first two cuts. Pass *seed* for reproducibility.
    Returns the three DataFrames as ``(train, validate, test)``.
    """
    np.random.seed(seed)
    shuffled = np.random.permutation(df_local.index)
    n_rows = len(df_local.index)
    cut_train = int(train_percent * n_rows)
    cut_validate = int(validate_percent * n_rows) + cut_train
    train_part = df_local.iloc[shuffled[:cut_train]]
    validate_part = df_local.iloc[shuffled[cut_train:cut_validate]]
    test_part = df_local.iloc[shuffled[cut_validate:]]
    return train_part, validate_part, test_part
+
45
# Split 60/20/20, then drop NaN rows and renumber each frame from 0.
train, validate, test = train_validate_test_split(df_cleaned)
train = train.dropna(axis=0).reset_index(drop=True)
validate = validate.dropna(axis=0).reset_index(drop=True)
test = test.dropna(axis=0).reset_index(drop=True)

# Construct a dictionary
# 1. Traverse each word in the dataset, store them in a dictionary
#    the dictionary will be used for one-hot encoding
# 2. Calculate the maximum number of words that a sentence contains
train_tweets = train.iloc[:,6]
word_set = set()

max_len = 0
curr_len = 0
for line in train_tweets.items():
    # NOTE(review): the PREVIOUS tweet's word count is compared here, so
    # the last tweet's length is never checked against max_len (off-by-one).
    if curr_len > max_len:
        max_len = curr_len
    curr_len = 0
    for word in line[1].split():
        word_set.add(word)
        curr_len += 1

dictionary = list(word_set)
# max_len: 33
# len(dictionary):


# # Load the word2vec model
# model = Word2Vec.load("word2vec.model")
#
# # Convert the text to a list of words
# words = nltk.word_tokenize(text)
#
# # Convert the words to word vectors using the word2vec model
# vectors = [model.wv[word] for word in words]
read_graph.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Prepare the graph data.
import osmnx as ox


def get_graph():
    """Load and return the road-network graph stored in dataset/G_new.graphml."""
    G = ox.load_graphml('dataset/G_new.graphml')
    # Add lon/lat features to the nodes and edges
    # nodes, edges = assign_edge_attr(G)
    # Read the current road network
    # import networkx as nx
    # G_new = nx.Graph()
    # import tqdm
    # print("start reading nodes")
    # pos_location = {}
    # for node_id, row in nodes.iterrows():
    #     G_new.add_node(node_id, y=row['y'], x=row['x'])  # node id, node lat/lon
    #     pos_location[node_id] = (row['y'], row['x'])
    # e_cnt = 0
    # print("start reading edges")
    # for node_id_1, node_id_2, _ in G.edges:
    #     G_new.add_edge(node_id_1, node_id_2)  # edge: node ids
    #
    #
    # # Remove self-loops
    # G_new.remove_edges_from(nx.selfloop_edges(G_new))
    return G
word2tensor.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+
3
+ import torch
4
+ import numpy as np
5
+ from torch.utils.data import Dataset, DataLoader
6
+ from torch.nn.functional import one_hot,softmax
7
+ import matplotlib.pyplot as plt
8
+ import random
9
+ import torch.utils.data as data
10
+ teacher_forcing_ratio = 0.5
11
+
12
+
13
def get_data_set(mode="train"):
    """Assemble trajectory tensors for anomaly classification.

    Loads normal trajectories (NT.pkl) and anomalous ones (AT.pkl), pads
    every trajectory to the longest observed length, one-hot encodes each
    point's node id against the road-network node vocabulary, and returns
    ``(one-hot tensor, node-index tensor, label tensor, max length)``.

    mode: "train" uses only the normal trajectories (all labeled 1); any
    other value mixes normal (1) and anomalous (0) trajectories.
    """
    NT,AT = None,None
    # NOTE(review): pickle.load is unsafe on untrusted files — these are
    # assumed to be locally produced artifacts.
    with open(r'dataset/'+"NT.pkl","rb") as f1:
        NT = pickle.load(f1)

    with open(r'dataset/'+"AT.pkl","rb") as f2:
        AT = pickle.load(f2)


    # NOTE(review): this definition is shadowed by the second onehot_encode
    # defined further below and is never used.
    def onehot_encode(char, vocab):
        # one hot encode a given text
        encoded = [0 for _ in range(len(vocab))] #[0,0,1,0,000]
        encoded[vocab.index(char)] = 1
        return encoded

    # Build the node vocabulary from the road network and cache it on disk.
    from read_graph import get_graph
    import networkx as nx
    G_new = get_graph()
    voc = list(G_new)
    with open('nodes.pkl', 'wb') as f:
        pickle.dump(voc, f, pickle.HIGHEST_PROTOCOL)

    voc = None
    with open('nodes.pkl', 'rb') as f:
        voc = pickle.load(f)

    voc.append(0)    # padding symbol
    voc.append('s')  # START
    voc.append('e')  # EOF

    total_word_count = len(voc)


    # Trajectory labels
    samples = []
    labels = []
    if mode=="train":
        for tr in NT:
            samples.append(tr)
            labels.append(1)  # normal
    else:
        for tr in NT:
            samples.append(tr)
            labels.append(1)  # normal
        for tr in AT:
            samples.append(tr)
            labels.append(0)  # anomalous

    def padding(x,max_length):
        # Truncate, or right-pad with [0,0] points, to exactly max_length.
        if len(x) > max_length:
            text = x[:max_length]
        else:
            text = x + [[0,0]] * (max_length - len(x))
        return text


    # Find the longest trajectory (floor of 10)
    max_len = 10
    for tr in samples:
        max_len = max(max_len,len(tr))
    samples_padded = []

    # Pad every trajectory to max_len
    for tr in samples:
        tr = padding(tr,max_len)
        samples_padded.append(tr)

    # One hot
    def onehot_encode(char, vocab):
        # one hot encode a given text; the pad symbol 0 encodes as all zeros
        encoded = [0 for _ in range(len(vocab))]
        if char != 0:
            encoded[vocab.index(char)] = 1
        return encoded

    samples_one_hot = []
    samples_index = []
    for tr in samples_padded:
        tr_rep = []
        tr_rep_index = []
        for pt in tr:
            # pt appears to be a [node_id, timestamp] pair — TODO confirm.
            spatial = onehot_encode(pt[0], voc)
            temporal = int(pt[1])  # NOTE(review): computed but never used
            tr_rep.append(spatial)
            tr_rep_index.append(voc.index(pt[0]))
        samples_one_hot.append(tr_rep)
        samples_index.append(tr_rep_index)

    sampletensor = torch.Tensor(samples_one_hot)
    sampletensor_index = torch.Tensor(samples_index)
    labeltensor = torch.Tensor(labels)
    # print("sampletensor.shape",sampletensor.shape)
    # print("labeltensor.shape",labeltensor.shape)
    return sampletensor,sampletensor_index,labeltensor,max_len
107
+
108
+ global device
109
+
110
+ if torch.cuda.is_available():
111
+ torch.backends.cudnn.enabled = False
112
+ device = torch.device("cuda:0")
113
+ torch.cuda.set_device(0)
114
+ import os
115
+ os.environ['CUDA_VISIBLE_DEVICES']='0'
116
+ print("Working on GPU")
117
+ torch.cuda.empty_cache()
118
+ else:
119
+ device = torch.device("cpu")
120
+
121
+ import torch.nn as nn
122
+ # from VAE import AE,RNN
123
+
124
+ if __name__ == '__main__':
125
+ sampletensor,sampletensor_index,labeltensor,max_len = get_data_set("train")
126
+
127
+ batch_size = 2
128
+ train_set = data.TensorDataset(sampletensor, sampletensor_index,labeltensor)
129
+ train_iter = data.DataLoader(train_set, batch_size, shuffle=False, drop_last=False)
130
+
131
+ # rnn = RNN(input_size=2694,hidden_size=64,batch_size=2,maxlen=max_len)
132
+ # loss = nn.CrossEntropyLoss()
133
+ # optimizer = torch.optim.Adamax(rnn.parameters(),lr=1e-2)
134
+ #
135
+ # net = rnn.to(device)
136
+ # num_epochs = 120
137
+ #
138
+ # h_hat_avg = None
139
+ #
140
+ # from tqdm import tqdm
141
+ # for epoch in tqdm(range(num_epochs)):
142
+ # epoch_total_loss = 0
143
+ # for x, x_label,y in train_iter:
144
+ # # RNN
145
+ # xhat,kld,h_hat = net(x,x,"train",None)
146
+ # # print(xhat.shape)
147
+ # # print(x_label.shape)
148
+ # len_all = (x_label.shape[0])*(x_label.shape[1])
149
+ # xhat = xhat.reshape(len_all,-1)
150
+ # x_label = x_label.reshape(len_all).long().to(device)
151
+ # # print(x_label)
152
+ # # print("xhat",xhat.shape)
153
+ # # print("x_label",x_label.shape)
154
+ # l = loss(xhat,x_label)
155
+ # # print("reconstruction loss:",l,"kld loss:",kld)
156
+ # total_loss = l + kld
157
+ # epoch_total_loss += total_loss
158
+ # optimizer.zero_grad()
159
+ # total_loss.backward()
160
+ # optimizer.step()
161
+ # if epoch == num_epochs - 1:
162
+ # if h_hat_avg is None:
163
+ # h_hat_avg = h_hat/ torch.full(h_hat.shape,len(sampletensor)).to(device)
164
+ # else:
165
+ # h_hat_avg += h_hat / torch.full(h_hat.shape, len(sampletensor)).to(device)
166
+ # print(">>> h_hat_avg",h_hat_avg.shape)
167
+ # print(" epoch_total_loss = ",epoch_total_loss)
168
+ #
169
+ # print("training ends")
170
+ # torch.save(net,"LSTM-VAE.pth")
171
+ # torch.save(h_hat_avg, 'h_hat_avg.pt')
172
+ #
173
+ #
174
+ #
175
+ #
176
+ #
177
+ #
178
+ #
179
+ #
180
+ #
181
+ #
182
+ #
183
+ #