import nltk
from nltk.corpus import stopwords
def _ensure_nltk_resource(resource, probe):
    """Call `probe`; if it raises LookupError, download `resource` quietly."""
    try:
        probe()
    except LookupError:
        nltk.download(resource, quiet=True)


# Fetch the stopword list if it is not available locally.
_ensure_nltk_resource("stopwords", lambda: stopwords.words("english"))

# Fetch the twitter_samples corpus if needed.
from nltk.corpus import twitter_samples

_ensure_nltk_resource("twitter_samples", lambda: twitter_samples.fileids())
import re
import string
import numpy as np
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
# --- constants & tools ---

# Lower-case personal, possessive and reflexive pronouns, grouped by person.
pronouns = {
    # first person
    "i", "me", "my", "mine", "myself",
    "we", "us", "our", "ours", "ourselves",
    # second person
    "you", "your", "yours", "yourself", "yourselves",
    # third person
    "he", "him", "his", "himself",
    "she", "her", "hers", "herself",
    "it", "its", "itself",
    "they", "them", "their", "theirs", "themselves",
}

# English stopwords as a set for fast membership tests.
_stopwords_en = set(stopwords.words("english"))

# Shared stemmer instance.
_stemmer = PorterStemmer()

# Shared tweet tokenizer, configured with preserve_case=False,
# strip_handles=True and reduce_len=True.
_tokenizer = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True,
)
def process_tweet(tweet):
    """Clean a tweet, tokenize it, drop stopwords/punctuation and stem the rest.

    Args:
        tweet: raw tweet text.

    Returns:
        List of stemmed tokens, in their original order.
    """
    # Substitutions are applied in this exact order.
    cleanup_rules = (
        (r"\$\w*", ""),                # stock tickers such as $GE
        (r"^RT[\s]+", ""),             # leading retweet marker 'RT'
        (r"https?://[^\s\n\r]+", ""),  # URLs
        (r"#", ""),                    # hash sign only; the tag word is kept
    )
    for pattern, replacement in cleanup_rules:
        tweet = re.sub(pattern, replacement, tweet)

    # NOTE: string.punctuation is a str, so `in` is a substring test here;
    # a multi-character token is dropped only if it appears contiguously
    # inside string.punctuation.
    return [
        _stemmer.stem(token)
        for token in _tokenizer.tokenize(tweet)
        if not (token in _stopwords_en or token in string.punctuation)
    ]
def extract_features_2(tweet, freqs):
    """Build a 2-feature row vector for a tweet from class frequency counts.

    Args:
        tweet: raw tweet text; it is passed through process_tweet first.
        freqs: mapping {(token, label): count}.

    Returns:
        Array of shape (1, 2):
            [0, 0] sum of freqs[(token, 1.0)] over the processed tokens
            [0, 1] sum of freqs[(token, 0.0)] over the processed tokens
        Tokens missing from `freqs` contribute 0.
    """
    tokens = process_tweet(tweet)
    features = np.zeros((1, 2))
    # Column 0 accumulates label 1.0, column 1 accumulates label 0.0.
    for column, label in enumerate((1.0, 0.0)):
        for token in tokens:
            features[0, column] += freqs.get((token, label), 0)
    return features
def extract_features_6(tweet, freqs):
    """Build a 6-feature row vector for a tweet.

    The tweet is tokenized with the module-level `_tokenizer` only; unlike
    process_tweet, no stopword removal or stemming is applied.

    Args:
        tweet: raw tweet text.
        freqs: mapping {(token, label): count}.

    Returns:
        Array of shape (1, 6):
            [0, 0] sum of freqs[(token, 1.0)] over the tokens
            [0, 1] sum of freqs[(token, 0.0)] over the tokens
            [0, 2] 1 if the token "no" is present, else 0
            [0, 3] number of tokens found in the module-level `pronouns` set
            [0, 4] 1 if the raw tweet contains '!', else 0
            [0, 5] natural log of the token count (0 when there are no tokens)
    """
    tokens = _tokenizer.tokenize(tweet)

    positive_total = 0.0
    negative_total = 0.0
    for token in tokens:
        positive_total += freqs.get((token, 1.0), 0)
        negative_total += freqs.get((token, 0.0), 0)

    has_no = int("no" in tokens)
    pronoun_count = len([token for token in tokens if token in pronouns])
    has_exclamation = int("!" in tweet)
    # Guard against log(0) for tweets that produce no tokens.
    log_length = np.log(len(tokens)) if tokens else 0

    row = [
        positive_total,
        negative_total,
        has_no,
        pronoun_count,
        has_exclamation,
        log_length,
    ]
    return np.array([row], dtype=float)
def build_freqs(tweets, ys):
    """Count how often each processed token occurs under each sentiment label.

    Args:
        tweets: list of raw tweet strings.
        ys: labels aligned with `tweets`, typically an m x 1 numpy array
            holding 0 or 1 for each tweet.

    Returns:
        dict mapping (token, label) -> number of occurrences, where tokens
        come from process_tweet and labels are the values found in `ys`.
    """
    # np.squeeze collapses a single label (shape (1, 1) or (1,)) to a 0-d
    # array, whose .tolist() is a bare scalar that zip() cannot iterate.
    # np.atleast_1d keeps that case as a one-element list and leaves every
    # other input unchanged.
    labels = np.atleast_1d(np.squeeze(ys)).tolist()

    freqs = {}
    for label, tweet in zip(labels, tweets):
        for word in process_tweet(tweet):
            key = (word, label)
            freqs[key] = freqs.get(key, 0) + 1
    return freqs
if __name__ == "__main__":
    # Quick smoke test for this module:
    #   - load the twitter_samples data
    #   - build the frequency table
    #   - extract both feature variants for one sample tweet
    import nltk
    from nltk.corpus import twitter_samples

    # Download whichever corpora are still missing, in this order.
    corpus_checks = (
        ("twitter_samples", lambda: twitter_samples.fileids()),
        ("stopwords", lambda: stopwords.words("english")),
    )
    for corpus_name, probe in corpus_checks:
        try:
            probe()
        except LookupError:
            nltk.download(corpus_name)

    # Positive tweets first, then negative ones, labelled 1 and 0.
    positive_tweets = twitter_samples.strings("positive_tweets.json")
    negative_tweets = twitter_samples.strings("negative_tweets.json")
    tweets = positive_tweets + negative_tweets
    label_list = [1] * len(positive_tweets) + [0] * len(negative_tweets)
    y = np.array(label_list).reshape(-1, 1)
    print(f"Tổng số tweet: {len(tweets)}")

    # Frequency table over the whole data set.
    freqs = build_freqs(tweets, y)
    print(f"Số cặp (word, sentiment): {len(freqs)}")

    # Inspect a single sample tweet.
    sample_tweet = tweets[0]
    print("\nTweet mẫu:", sample_tweet)
    print("Tokens (process_tweet):", process_tweet(sample_tweet))

    features_2 = extract_features_2(sample_tweet, freqs)
    features_6 = extract_features_6(sample_tweet, freqs)
    print("\nFeatures 2 chiều:", features_2)
    print("Features 6 chiều:", features_6)