# NOTE: "Spaces: Sleeping" page-header lines removed — residue from a
# Hugging Face Spaces scrape, not part of the module.
import re
import string

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


def _ensure_nltk_resource(name, probe):
    """Quietly download the NLTK resource *name* if *probe*() raises LookupError.

    Args:
        name: resource identifier passed to ``nltk.download``.
        probe: zero-argument callable that touches the resource and raises
            ``LookupError`` when it is missing.
    """
    try:
        probe()
    except LookupError:
        nltk.download(name, quiet=True)


def _probe_twitter_samples():
    """Touch the twitter_samples corpus so a missing download raises LookupError."""
    from nltk.corpus import twitter_samples
    twitter_samples.fileids()


# Fetch the corpora this module needs at import time (no-op if present).
_ensure_nltk_resource("stopwords", lambda: stopwords.words("english"))
_ensure_nltk_resource("twitter_samples", _probe_twitter_samples)
# --- constants & tools ---

# English personal pronouns (all persons); used as a count feature in
# extract_features_6.
pronouns = set(
    "i me my mine myself "
    "we us our ours ourselves "
    "you your yours yourself yourselves "
    "he him his himself "
    "she her hers herself "
    "it its itself "
    "they them their theirs themselves".split()
)

# Shared NLP tools: a lower-casing tweet tokenizer that strips @handles and
# squashes repeated characters, a Porter stemmer, and the English stopword set.
_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
_stemmer = PorterStemmer()
_stopwords_en = set(stopwords.words("english"))
def process_tweet(tweet):
    """Clean, tokenize, drop stopwords/punctuation, and stem a tweet.

    Args:
        tweet: raw tweet string.

    Returns:
        List of stemmed, lower-cased tokens.
    """
    # Strip stock tickers ($GE), a leading retweet marker, URLs, and the
    # '#' character (keeping the hashtag word itself).
    cleaned = re.sub(r"\$\w*", "", tweet)
    cleaned = re.sub(r"^RT[\s]+", "", cleaned)
    cleaned = re.sub(r"https?://[^\s\n\r]+", "", cleaned)
    cleaned = re.sub(r"#", "", cleaned)
    return [
        _stemmer.stem(token)
        for token in _tokenizer.tokenize(cleaned)
        if token not in _stopwords_en and token not in string.punctuation
    ]
def extract_features_2(tweet, freqs):
    """Build a (1, 2) feature vector for a tweet.

    x[0, 0]: summed frequency of the processed tokens under label 1.0
    x[0, 1]: summed frequency of the processed tokens under label 0.0
    """
    tokens = process_tweet(tweet)
    pos_total = sum(freqs.get((t, 1.0), 0) for t in tokens)
    neg_total = sum(freqs.get((t, 0.0), 0) for t in tokens)
    features = np.zeros((1, 2))
    features[0, 0] = pos_total
    features[0, 1] = neg_total
    return features
def extract_features_6(tweet, freqs):
    """Build a (1, 6) feature vector for a tweet.

    x1: summed token frequency under label 1.0 (raw lower-cased tokens)
    x2: summed token frequency under label 0.0
    x3: 1 if "no" appears among the tokens, else 0
    x4: count of personal pronouns among the tokens
    x5: 1 if '!' appears in the raw tweet, else 0
    x6: log(token count), or 0 for an empty token list

    Args:
        tweet: raw tweet string.
        freqs: dict mapping (word, label) -> count.

    Returns:
        numpy array of shape (1, 6).
    """
    words = _tokenizer.tokenize(tweet)
    x = np.zeros((1, 6))
    for w in words:
        x[0, 0] += freqs.get((w, 1.0), 0)
        x[0, 1] += freqs.get((w, 0.0), 0)
    # x3-x6 are loop-invariant: compute them once after the frequency sums
    # instead of re-assigning them on every token.  This also guarantees
    # x5 reflects the raw tweet even when the token list is empty.
    x[0, 2] = 1 if "no" in words else 0
    x[0, 3] = sum(1 for w in words if w in pronouns)
    x[0, 4] = 1 if "!" in tweet else 0
    x[0, 5] = np.log(len(words)) if len(words) > 0 else 0
    return x
def build_freqs(tweets, ys):
    """Build (word, sentiment) frequency counts.

    Args:
        tweets: list of tweet strings.
        ys: array-like of sentiment labels (0 or 1), shape (m,) or (m, 1).

    Returns:
        freqs: dict mapping (word, label) -> count.
    """
    # reshape(-1) instead of squeeze(): for a (1, 1) input, squeeze yields a
    # 0-d array whose .tolist() is a bare scalar, which would break the zip.
    yslist = np.asarray(ys).reshape(-1).tolist()
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs
if __name__ == "__main__":
    # Quick module smoke test:
    #   - fetch the twitter_samples corpus
    #   - build the frequency dict
    #   - extract both feature variants for one sample tweet
    import nltk
    from nltk.corpus import twitter_samples

    # Download corpora only when missing.
    try:
        twitter_samples.fileids()
    except LookupError:
        nltk.download("twitter_samples")
    try:
        stopwords.words("english")
    except LookupError:
        nltk.download("stopwords")

    # Positive / negative tweet strings and their 0/1 labels as an m x 1 array.
    pos = twitter_samples.strings("positive_tweets.json")
    neg = twitter_samples.strings("negative_tweets.json")
    tweets = pos + neg
    labels = [1] * len(pos) + [0] * len(neg)
    y = np.array(labels).reshape(-1, 1)
    print(f"Tổng số tweet: {len(tweets)}")

    freqs = build_freqs(tweets, y)
    print(f"Số cặp (word, sentiment): {len(freqs)}")

    # Run both feature extractors on the first tweet.
    sample_tweet = tweets[0]
    print("\nTweet mẫu:", sample_tweet)
    print("Tokens (process_tweet):", process_tweet(sample_tweet))
    x2 = extract_features_2(sample_tweet, freqs)
    x6 = extract_features_6(sample_tweet, freqs)
    print("\nFeatures 2 chiều:", x2)
    print("Features 6 chiều:", x6)