Update app.py

app.py CHANGED
@@ -36,84 +36,135 @@ abbreviations = {
 url_pattern = r"http\S+|www\S+"  # URLs
 user_pattern = r"@\w+"  # usernames
 emoji_pattern = re.compile(
-    "["
-    "\U0001F600-\U0001F64F"
-    "\U0001F300-\U0001F5FF"
-    "\U0001F680-\U0001F6FF"
-    "\U0001F1E0-\U0001F1FF"
-    "]", flags=re.UNICODE)
-emoticon_pattern = r"[:;=8][\-o\*']?[\)\]\(\[dDpP/:}\{@\|\\]"
-repeat_pattern = re.compile(r"(.)\1{2,}")
-
+    "["  # start
+    "\U0001F600-\U0001F64F"  # emoticons
+    "\U0001F300-\U0001F5FF"  # symbols & pictographs
+    "\U0001F680-\U0001F6FF"  # transport & map symbols
+    "\U0001F1E0-\U0001F1FF"  # flags
+    "]+", flags=re.UNICODE)
+emoticon_pattern = r"[:;=8][\-o\*']?[\)\]\(\[dDpP/:}\{@\|\\]"  # emoticons
+repeat_pattern = re.compile(r"(.)\1{2,}")  # 3 or more repeats
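One detail worth calling out in this hunk: changing the character-class close from "]" to "]+" makes a whole run of consecutive emoji collapse into a single match, so only one replacement space is inserted. A minimal self-contained check, with an arbitrary sample string:

import re

emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "]+", flags=re.UNICODE)

# "😀😀🚀" is one match under "]+", so the whole run becomes one space.
print(emoji_pattern.sub(" ", "tuyệt vời 😀😀🚀").strip())  # tuyệt vời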
 
 def clean_text(text: str) -> str:
+    # Unicode normalization
     text = str(text)
-    text = unicodedata.normalize('NFC', text)
+    text = unicodedata.normalize('NFC', text)  # explicit Unicode normalization (basic)
+
+    # Lowercase
     text = text.lower()
+
+    # Remove URLs and usernames
     text = re.sub(url_pattern, '', text)
     text = re.sub(user_pattern, '', text)
+
+    # Remove emojis and emoticons
     text = emoji_pattern.sub(' ', text)
     text = re.sub(emoticon_pattern, ' ', text)
 
+    # Expand common abbreviations
+    def expand(match):
+        word = match.group(0)
+        return abbreviations.get(word, word)
+
     if abbreviations:
-        def expand(match):
-            word = match.group(0)
-            return abbreviations.get(word, word)
         pattern = re.compile(r"\b(" + "|".join(map(re.escape, abbreviations.keys())) + r")\b")
         text = pattern.sub(expand, text)
 
+    # Remove repeated characters (e.g., "quaaa" -> "qua")
     text = repeat_pattern.sub(r"\1", text)
+    # Remove punctuation (keep Vietnamese letters & numbers)
     text = re.sub(r"[^\w\s\u00C0-\u024F]", ' ', text)
+    # Remove extra whitespace
     text = re.sub(r"\s+", ' ', text).strip()
+
     return text
 
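For a sense of the pipeline's net effect, a hedged illustration: it assumes the definitions above are in scope and that the abbreviations dict defined earlier in app.py contains an entry along the lines of "ko" -> "không" (the actual entries are not shown in this hunk).

sample = "Phim này hayyyy quá 😀😀 :D ko chê @ban123 http://t.co/xyz"
print(clean_text(sample))
# roughly "phim này hay quá không chê": the URL, @user, emoji and ":D" are
# stripped, "hayyyy" collapses to "hay", and "ko" expands via abbreviations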
 # Vocabulary class unchanged...
 class Vocabulary:
     def __init__(self):
-        self.word2id =
+        self.word2id = dict()
+        self.word2id['<pad>'] = 0   # Pad Token
+        self.word2id['<unk>'] = 1   # Unknown Token
+        self.unk_id = self.word2id['<unk>']
+        self.id2word = {v: k for k, v in self.word2id.items()}
+
+    def __getitem__(self, word):
+        return self.word2id.get(word, self.unk_id)
+
+    def __contains__(self, word):
+        return word in self.word2id
+
+    def __len__(self):
+        return len(self.word2id)
+
+    def id2word(self, word_index):
+        return self.id2word[word_index]
+
     def add(self, word):
-        if word not in self
+        if word not in self:
+            word_index = self.word2id[word] = len(self.word2id)
+            self.id2word[word_index] = word
+            return word_index
+        else:
+            return self[word]
+
     @staticmethod
     def tokenize_corpus(corpus):
+        print("Tokenize the corpus...")
+        tokenized_corpus = list()
+        for document in tqdm(corpus):
+            tokenized_document = [word.replace(" ", "_") for word in word_tokenize(document)]
+            tokenized_corpus.append(tokenized_document)
+
+        return tokenized_corpus
+
     def corpus_to_tensor(self, corpus, is_tokenized=False):
+        if is_tokenized:
+            tokenized_corpus = corpus
+        else:
+            tokenized_corpus = self.tokenize_corpus(corpus)
+        indicies_corpus = list()
+        for document in tqdm(tokenized_corpus):
+            indicies_document = torch.tensor(list(map(lambda word: self[word], document)),
+                                             dtype=torch.int64)
+            indicies_corpus.append(indicies_document)
+
+        return indicies_corpus
+
+    def tensor_to_corpus(self, tensor):
+        corpus = list()
+        for indicies in tqdm(tensor):
+            document = list(map(lambda index: self.id2word[index.item()], indicies))
+            corpus.append(document)
+
+        return corpus
 
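A quick usage sketch of the class above; the sample token is arbitrary. One caveat worth knowing: the instance dict self.id2word assigned in __init__ shadows the id2word method, so index-to-word lookups go through the dict (vocab.id2word[i]) and the method itself is effectively unreachable.

vocab = Vocabulary()
idx = vocab.add("xin_chào")
print(idx, len(vocab))     # 2 3  (<pad>=0 and <unk>=1 are preset)
print(vocab["xin_chào"])   # 2
print(vocab["từ_lạ"])      # 1 -> unseen words fall back to <unk>
print(vocab.id2word[idx])  # xin_chào (dict lookup, not the shadowed method)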
 class RNN(nn.Module):
-    def __init__(self, vocab_size,
+    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers,
+                 bidirectional, dropout, pad_idx, n_classes):
         super().__init__()
-        self.embedding = nn.Embedding(vocab_size,
-        self.rnn = nn.LSTM(
+        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
+        self.rnn = nn.LSTM(
+            embedding_dim,
+            hidden_dim,
+            num_layers=n_layers,
+            bidirectional=bidirectional,
+            dropout=dropout if n_layers > 1 else 0
+        )
         self.dropout = nn.Dropout(dropout)
-        self.fc = nn.Linear(
+        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), n_classes)
+
+    def forward(self, text, text_lengths):
         embedded = self.dropout(self.embedding(text))
+        packed_embedded = nn.utils.rnn.pack_padded_sequence(
+            embedded, text_lengths.to('cpu'), enforce_sorted=False
+        )
+        packed_output, (hidden, cell) = self.rnn(packed_embedded)
         if self.rnn.bidirectional:
+            hidden = self.dropout(torch.cat((hidden[-2], hidden[-1]), dim=1))
         else:
-        return self.fc(
+            hidden = self.dropout(hidden[-1])
+        return self.fc(hidden)
 
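To make the expected tensor shapes concrete, a smoke test of the class above with made-up sizes (assuming the RNN class is in scope). Since the LSTM is built without batch_first=True, inputs are [seq_len, batch]:

import torch

# Toy sizes, chosen only for illustration.
model = RNN(vocab_size=10, embedding_dim=4, hidden_dim=8, n_layers=2,
            bidirectional=False, dropout=0.3, pad_idx=0, n_classes=3)
model.eval()

# Two padded sequences, shape [seq_len=5, batch=2]; pad index is 0.
text = torch.tensor([[2, 3], [4, 5], [6, 0], [7, 0], [8, 0]])
lengths = torch.tensor([5, 2])  # true lengths before padding
with torch.no_grad():
    logits = model(text, lengths)
print(logits.shape)  # torch.Size([2, 3]) -> one score per class per sentence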
 model_path = hf_hub_download(repo_id="Di12/sentiment_analysis", filename="model.pt", repo_type="space")
 embedding_path = hf_hub_download(repo_id="Di12/sentiment_analysis", filename="vi_word2vec_reduced.txt", repo_type="space")
@@ -127,19 +178,22 @@ vocab = Vocabulary()
 for w in word_embedding.stoi.keys(): vocab.add(w)
 
 # Model hyperparams
-input_dim = word_embedding.vectors.shape[0]
+input_dim = word_embedding.vectors.shape[0]
+embedding_dim = 100
+batch_size = 100
+hidden_dim = 8
 n_layers = 2
-dropout = 0.
-pad_idx = vocab[
+bidirectional = False
+dropout = 0.3
+pad_idx = vocab["<pad>"]
+unk_idx = vocab["<unk>"]
+n_classes = 3  # positive, neutral, negative
 
 label_map = {0: 'tiêu cực', 1: 'bình thường', 2: 'tích cực'}
 
 # Ensure model and its weights moved to correct device
 def load_model(path: str):
-    model = RNN(input_dim,
+    model = RNN(input_dim, embedding_dim, hidden_dim, n_layers, bidirectional, dropout, pad_idx, n_classes)
     model.load_state_dict(torch.load(path, map_location=device))
     model.to(device)
     model.eval()
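Putting the pieces together, a sketch of single-sentence inference under two assumptions: load_model returns the model (the hunk ends at model.eval(), so the return statement is not visible here), and device, vocab, clean_text, and label_map are in scope as defined above. The predict helper and the sample sentence are illustrative, not part of the commit.

import torch

def predict(sentence: str) -> str:
    model = load_model(model_path)  # assumes load_model returns the model
    cleaned = clean_text(sentence)
    # corpus_to_tensor yields one [seq_len] tensor; add a batch dim of 1
    indices = vocab.corpus_to_tensor([cleaned])[0].unsqueeze(1).to(device)
    lengths = torch.tensor([indices.shape[0]])
    with torch.no_grad():
        logits = model(indices, lengths)
    return label_map[logits.argmax(dim=1).item()]

print(predict("Sản phẩm này rất tốt!"))  # prints one of the label_map values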