Di12 committed on
Commit
5930708
·
1 Parent(s): 2a1a3f2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +107 -53
app.py CHANGED
@@ -36,84 +36,135 @@ abbreviations = {
36
  url_pattern = r"http\S+|www\S+" # URLs
37
  user_pattern = r"@\w+" # usernames
38
  emoji_pattern = re.compile(
39
- "["
40
- "\U0001F600-\U0001F64F"
41
- "\U0001F300-\U0001F5FF"
42
- "\U0001F680-\U0001F6FF"
43
- "\U0001F1E0-\U0001F1FF"
44
- "]", flags=re.UNICODE)
45
- emoticon_pattern = r"[:;=8][\-o\*']?[\)\]\(\[dDpP/:}\{@\|\\]"
46
- repeat_pattern = re.compile(r"(.)\1{2,}")
47
-
48
 
49
  def clean_text(text: str) -> str:
 
50
  text = str(text)
51
- text = unicodedata.normalize('NFC', text)
 
 
52
  text = text.lower()
 
 
53
  text = re.sub(url_pattern, '', text)
54
  text = re.sub(user_pattern, '', text)
 
 
55
  text = emoji_pattern.sub(' ', text)
56
  text = re.sub(emoticon_pattern, ' ', text)
57
 
 
 
 
 
 
58
  if abbreviations:
59
- def expand(match):
60
- word = match.group(0)
61
- return abbreviations.get(word, word)
62
  pattern = re.compile(r"\b(" + "|".join(map(re.escape, abbreviations.keys())) + r")\b")
63
  text = pattern.sub(expand, text)
64
 
 
65
  text = repeat_pattern.sub(r"\1", text)
 
66
  text = re.sub(r"[^\w\s\u00C0-\u024F]", ' ', text)
 
67
  text = re.sub(r"\s+", ' ', text).strip()
 
68
  return text
69
 
70
  # Vocabulary class unchanged...
71
  class Vocabulary:
72
  def __init__(self):
73
- self.word2id = {'<pad>': 0, '<unk>': 1}
74
- self.unk_id = 1
75
- self.id2word = {0: '<pad>', 1: '<unk>'}
76
- def __getitem__(self, word): return self.word2id.get(word, self.unk_id)
77
- def __contains__(self, word): return word in self.word2id
78
- def __len__(self): return len(self.word2id)
 
 
 
 
 
 
 
 
 
 
 
 
79
  def add(self, word):
80
- if word not in self.word2id:
81
- idx = len(self.word2id)
82
- self.word2id[word] = idx
83
- self.id2word[idx] = word
84
- return idx
85
- return self[word]
 
86
  @staticmethod
87
  def tokenize_corpus(corpus):
88
- tokenized = []
89
- for doc in tqdm(corpus):
90
- tokenized.append([w.replace(' ', '_') for w in word_tokenize(doc)])
91
- return tokenized
 
 
 
 
92
  def corpus_to_tensor(self, corpus, is_tokenized=False):
93
- tok = corpus if is_tokenized else self.tokenize_corpus(corpus)
94
- tensors = []
95
- for doc in tqdm(tok):
96
- idxs = list(map(lambda w: self[w], doc))
97
- tensors.append(torch.tensor(idxs, dtype=torch.int64).to(device))
98
- return tensors
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
  class RNN(nn.Module):
101
- def __init__(self, vocab_size, emb_dim, hid_dim, n_layers, bidir, dropout, pad_idx, n_classes):
 
102
  super().__init__()
103
- self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)
104
- self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers,
105
- bidirectional=bidir, dropout=dropout if n_layers>1 else 0)
 
 
 
 
 
106
  self.dropout = nn.Dropout(dropout)
107
- self.fc = nn.Linear(hid_dim * (2 if bidir else 1), n_classes)
108
- def forward(self, text, lengths):
 
109
  embedded = self.dropout(self.embedding(text))
110
- packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.to('cpu'), enforce_sorted=False)
111
- packed_out, (h, c) = self.rnn(packed)
 
 
112
  if self.rnn.bidirectional:
113
- h = torch.cat((h[-2], h[-1]), dim=1)
114
  else:
115
- h = h[-1]
116
- return self.fc(self.dropout(h))
117
 
118
  model_path = hf_hub_download(repo_id="Di12/sentiment_analysis", filename="model.pt", repo_type="space")
119
  embedding_path = hf_hub_download(repo_id="Di12/sentiment_analysis", filename="vi_word2vec_reduced.txt", repo_type="space")
@@ -127,19 +178,22 @@ vocab = Vocabulary()
127
  for w in word_embedding.stoi.keys(): vocab.add(w)
128
 
129
  # Model hyperparams
130
- input_dim = word_embedding.vectors.shape[0]
131
- emb_dim = 100
132
- hid_dim = 256
 
133
  n_layers = 2
134
- bidir = True
135
- dropout = 0.5
136
- pad_idx = vocab['<pad>']
 
 
137
 
138
  label_map = {0: 'tiêu cực', 1: 'bình thường', 2: 'tích cực'}
139
 
140
  # Ensure model and its weights moved to correct device
141
  def load_model(path: str):
142
- model = RNN(input_dim, emb_dim, hid_dim, n_layers, bidir, dropout, pad_idx, len(label_map))
143
  model.load_state_dict(torch.load(path, map_location=device))
144
  model.to(device)
145
  model.eval()
 
36
# Precompiled patterns shared by clean_text().
url_pattern = r"http\S+|www\S+"  # URLs
user_pattern = r"@\w+"  # usernames
# One or more emoji drawn from the common pictographic code-point ranges.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "]+",
    flags=re.UNICODE,
)
# Text emoticons such as :-) ;P =D
emoticon_pattern = r"[:;=8][\-o\*']?[\)\]\(\[dDpP/:}\{@\|\\]"  # emoticons
# Any character repeated three or more times in a row.
repeat_pattern = re.compile(r"(.)\1{2,}")  # 3 or more repeats
 
47
 
48
def clean_text(text: str) -> str:
    """Normalize a raw Vietnamese text snippet for the sentiment model.

    Pipeline: NFC Unicode normalization, lowercasing, removal of URLs /
    @usernames / emoji / text emoticons, abbreviation expansion, collapsing
    of characters repeated 3+ times, punctuation removal, and whitespace
    squeezing.
    """
    # Explicit Unicode normalization, then lowercase.
    text = unicodedata.normalize('NFC', str(text)).lower()

    # Strip URLs and @usernames entirely.
    for pat in (url_pattern, user_pattern):
        text = re.sub(pat, '', text)

    # Replace emoji and text emoticons with spaces.
    text = emoji_pattern.sub(' ', text)
    text = re.sub(emoticon_pattern, ' ', text)

    # Expand common abbreviations (whole-word matches only).
    if abbreviations:
        alternation = "|".join(map(re.escape, abbreviations.keys()))
        abbr_re = re.compile(r"\b(" + alternation + r")\b")
        text = abbr_re.sub(
            lambda m: abbreviations.get(m.group(0), m.group(0)), text
        )

    # Collapse repeated characters (e.g., "quaaa" -> "qua").
    text = repeat_pattern.sub(r"\1", text)
    # Remove punctuation (keep Vietnamese letters & numbers).
    text = re.sub(r"[^\w\s\u00C0-\u024F]", ' ', text)
    # Squeeze runs of whitespace.
    return re.sub(r"\s+", ' ', text).strip()
81
 
82
  # Vocabulary class unchanged...
83
class Vocabulary:
    """Bidirectional word <-> integer-id mapping with <pad>/<unk> tokens."""

    def __init__(self):
        # Reserved ids: 0 = padding token, 1 = unknown token.
        self.word2id = dict()
        self.word2id['<pad>'] = 0  # Pad Token
        self.word2id['<unk>'] = 1  # Unknown Token
        self.unk_id = self.word2id['<unk>']
        # Inverse mapping, kept in sync by add().
        self.id2word = {v: k for k, v in self.word2id.items()}

    def __getitem__(self, word):
        """Return the id for *word*, falling back to the <unk> id."""
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        return word in self.word2id

    def __len__(self):
        return len(self.word2id)

    # NOTE(review): the original also defined a method named `id2word` here.
    # It was unreachable: __init__ assigns the dict attribute `self.id2word`,
    # which shadows the method on every instance. The dead method is removed;
    # look words up via the `id2word` dict directly.

    def add(self, word):
        """Register *word* if new and return its id (existing id otherwise)."""
        if word not in self:
            word_index = self.word2id[word] = len(self.word2id)
            self.id2word[word_index] = word
            return word_index
        return self[word]

    @staticmethod
    def tokenize_corpus(corpus):
        """Word-tokenize each document; multi-word tokens joined with '_'."""
        print("Tokenize the corpus...")
        tokenized_corpus = list()
        for document in tqdm(corpus):
            tokenized_corpus.append(
                [word.replace(" ", "_") for word in word_tokenize(document)]
            )
        return tokenized_corpus

    def corpus_to_tensor(self, corpus, is_tokenized=False):
        """Map each document to a 1-D int64 tensor of word ids.

        If *is_tokenized* is False the corpus is tokenized first.
        """
        tokenized_corpus = corpus if is_tokenized else self.tokenize_corpus(corpus)
        indices_corpus = list()
        for document in tqdm(tokenized_corpus):
            ids = [self[word] for word in document]
            indices_corpus.append(torch.tensor(ids, dtype=torch.int64))
        return indices_corpus

    def tensor_to_corpus(self, tensor):
        """Invert corpus_to_tensor: id tensors back to lists of words."""
        corpus = list()
        for indices in tqdm(tensor):
            corpus.append([self.id2word[index.item()] for index in indices])
        return corpus
141
 
142
class RNN(nn.Module):
    """LSTM text classifier: embedding -> (bi)LSTM -> dropout -> linear."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers,
                 bidirectional, dropout, pad_idx, n_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # Inter-layer dropout is only meaningful for stacked LSTMs.
        rnn_dropout = dropout if n_layers > 1 else 0
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=bidirectional, dropout=rnn_dropout)
        self.dropout = nn.Dropout(dropout)
        out_features = hidden_dim * 2 if bidirectional else hidden_dim
        self.fc = nn.Linear(out_features, n_classes)

    def forward(self, text, text_lengths):
        # text: token-id tensor; text_lengths must live on CPU for packing.
        embedded = self.dropout(self.embedding(text))
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.to('cpu'), enforce_sorted=False
        )
        _, (hidden, _) = self.rnn(packed)
        if self.rnn.bidirectional:
            # Concatenate the last forward and backward hidden states.
            final = self.dropout(torch.cat((hidden[-2], hidden[-1]), dim=1))
        else:
            final = self.dropout(hidden[-1])
        return self.fc(final)
168
 
169
# Fetch the trained checkpoint and reduced word2vec vectors from the Hub.
model_path = hf_hub_download(repo_id="Di12/sentiment_analysis", filename="model.pt", repo_type="space")
embedding_path = hf_hub_download(repo_id="Di12/sentiment_analysis", filename="vi_word2vec_reduced.txt", repo_type="space")
 
178
# Populate the vocabulary with every word from the pretrained embeddings.
for w in word_embedding.stoi.keys():
    vocab.add(w)

# Model hyperparameters — must match the checkpoint's training config.
input_dim = word_embedding.vectors.shape[0]
embedding_dim = 100
batch_size = 100
hidden_dim = 8
n_layers = 2
bidirectional = False
dropout = 0.3
pad_idx = vocab["<pad>"]
unk_idx = vocab["<unk>"]
n_classes = 3  # positive, neutral, negative

# Class index -> Vietnamese label.
label_map = {0: 'tiêu cực', 1: 'bình thường', 2: 'tích cực'}
193
 
194
  # Ensure model and its weights moved to correct device
195
  def load_model(path: str):
196
+ model = RNN(input_dim, embedding_dim, hidden_dim, n_layers, bidirectional, dropout, pad_idx, n_classes)
197
  model.load_state_dict(torch.load(path, map_location=device))
198
  model.to(device)
199
  model.eval()