import re
import string

import numpy as np

import nltk
from nltk.corpus import stopwords

# Make sure the stopword list is available; fetch it on first use.
try:
    stopwords.words("english")
except LookupError:
    nltk.download("stopwords", quiet=True)

# Make sure the twitter_samples corpus is available; fetch it if needed.
try:
    from nltk.corpus import twitter_samples

    twitter_samples.fileids()
except LookupError:
    nltk.download("twitter_samples", quiet=True)

from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

# --- constants & tools ---

# Personal pronouns counted by extract_features_6 (first, second and third
# person forms are all present in this set).
pronouns = {
    "i", "me", "my", "mine", "myself",
    "we", "us", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself",
    "she", "her", "hers", "herself",
    "it", "its", "itself",
    "they", "them", "their", "theirs", "themselves",
}

# Shared tokenizer: lower-cases, strips @handles, shortens repeated characters.
_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
_stemmer = PorterStemmer()
_stopwords_en = set(stopwords.words("english"))


def process_tweet(tweet):
    """Clean, tokenize, filter and stem a tweet.

    Steps: remove ``$TICKER`` symbols, a leading ``RT`` marker and URLs,
    drop the ``#`` character (the hashtag word itself is kept), tokenize with
    the module-level ``TweetTokenizer``, discard English stopwords and
    punctuation, then stem the remaining tokens with the Porter stemmer.

    Args:
        tweet: The raw tweet text.

    Returns:
        A list of stemmed tokens.
    """
    tweet = re.sub(r"\$\w*", "", tweet)  # drop stock tickers such as $GE
    tweet = re.sub(r"^RT[\s]+", "", tweet)  # drop a leading retweet marker
    tweet = re.sub(r"https?://[^\s\n\r]+", "", tweet)  # drop URLs
    tweet = re.sub(r"#", "", tweet)  # drop the '#' sign, keep the word

    # NOTE: ``string.punctuation`` is a str, so the membership test below is a
    # substring check (a multi-character token such as "()" is also dropped).
    return [
        _stemmer.stem(token)
        for token in _tokenizer.tokenize(tweet)
        if token not in _stopwords_en and token not in string.punctuation
    ]


def extract_features_2(tweet, freqs):
    """Build a 1x2 feature row from summed word frequencies.

    Columns:
        x[0, 0]: sum of ``freqs[(word, 1.0)]`` over the processed tokens.
        x[0, 1]: sum of ``freqs[(word, 0.0)]`` over the processed tokens.

    Args:
        tweet: The raw tweet text (it is passed through ``process_tweet``).
        freqs: Mapping ``{(word, label): count}``; missing pairs count as 0.

    Returns:
        A ``numpy`` array of shape ``(1, 2)``.
    """
    x = np.zeros((1, 2))
    for word in process_tweet(tweet):
        x[0, 0] += freqs.get((word, 1.0), 0)
        x[0, 1] += freqs.get((word, 0.0), 0)
    return x


def extract_features_6(tweet, freqs):
    """Build a 1x6 feature row from the raw (lower-cased) tokens of a tweet.

    Columns:
        x[0, 0]: sum of ``freqs[(token, 1.0)]`` over the tokens.
        x[0, 1]: sum of ``freqs[(token, 0.0)]`` over the tokens.
        x[0, 2]: 1 if the token "no" is present, else 0.
        x[0, 3]: number of tokens found in the module-level ``pronouns`` set.
        x[0, 4]: 1 if the raw tweet text contains '!', else 0.
        x[0, 5]: natural log of the token count (0 when there are no tokens).

    NOTE(review): an earlier description of x[0, 3] said "first and second
    person pronouns", but ``pronouns`` also contains third-person forms and
    all of them are counted here -- confirm which is intended.

    NOTE(review): tokens are only tokenized here (no stopword removal or
    stemming), whereas ``build_freqs`` keys are stemmed via ``process_tweet``;
    tokens whose stem differs from the raw form will not match such keys.

    Args:
        tweet: The raw tweet text.
        freqs: Mapping ``{(word, label): count}``; missing pairs count as 0.

    Returns:
        A ``numpy`` array of shape ``(1, 6)``.
    """
    words = _tokenizer.tokenize(tweet)
    x = np.zeros((1, 6))
    for word in words:
        x[0, 0] += freqs.get((word, 1.0), 0)
        x[0, 1] += freqs.get((word, 0.0), 0)
    x[0, 2] = 1 if "no" in words else 0
    x[0, 3] = sum(1 for word in words if word in pronouns)
    x[0, 4] = 1 if "!" in tweet else 0
    x[0, 5] = np.log(len(words)) if len(words) > 0 else 0
    return x


def build_freqs(tweets, ys):
    """Count how often each processed word occurs under each sentiment label.

    Args:
        tweets: List of raw tweet strings.
        ys: Array-like of per-tweet sentiment labels (0 or 1), e.g. an
            m x 1 ``numpy`` array.

    Returns:
        A dict ``{(word, label): count}``.
    """
    # np.squeeze collapses a single label to a 0-d array, whose tolist() is a
    # bare scalar that zip() cannot iterate; atleast_1d keeps it a sequence.
    labels = np.atleast_1d(np.squeeze(ys)).tolist()

    freqs = {}
    for label, tweet in zip(labels, tweets):
        for word in process_tweet(tweet):
            pair = (word, label)
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs


if __name__ == "__main__":
    # Quick smoke test of the module:
    #   - load the twitter_samples data
    #   - build the frequency table
    #   - extract both feature variants for one sample tweet

    # Retry the corpus downloads (this time not in quiet mode) if still missing.
    try:
        twitter_samples.fileids()
    except LookupError:
        nltk.download("twitter_samples")
    try:
        stopwords.words("english")
    except LookupError:
        nltk.download("stopwords")

    # Load the positive / negative tweets and build the label column.
    pos = twitter_samples.strings("positive_tweets.json")
    neg = twitter_samples.strings("negative_tweets.json")
    tweets = pos + neg
    y = np.array([1] * len(pos) + [0] * len(neg)).reshape(-1, 1)
    print(f"Tổng số tweet: {len(tweets)}")

    # Build the (word, sentiment) frequency table.
    freqs = build_freqs(tweets, y)
    print(f"Số cặp (word, sentiment): {len(freqs)}")

    # Inspect one sample tweet.
    sample_tweet = tweets[0]
    print("\nTweet mẫu:", sample_tweet)
    print("Tokens (process_tweet):", process_tweet(sample_tweet))

    x2 = extract_features_2(sample_tweet, freqs)
    x6 = extract_features_6(sample_tweet, freqs)
    print("\nFeatures 2 chiều:", x2)
    print("Features 6 chiều:", x6)