# ml_exercise / feature_extract.py
import nltk
from nltk.corpus import stopwords

# Download the stopwords corpus on first use (no-op if already installed).
try:
    stopwords.words("english")
except LookupError:
    nltk.download("stopwords", quiet=True)

# Download twitter_samples if needed (used by the __main__ smoke test).
try:
    from nltk.corpus import twitter_samples
    twitter_samples.fileids()
except LookupError:
    nltk.download("twitter_samples", quiet=True)

import re
import string
import numpy as np
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
# --- constants & tools ---

# English personal pronouns (all grammatical persons, including 3rd person);
# extract_features_6 counts how many tokens of a tweet fall in this set.
pronouns = {
    "i","me","my","mine","myself",
    "we","us","our","ours","ourselves",
    "you","your","yours","yourself","yourselves",
    "he","him","his","himself",
    "she","her","hers","herself",
    "it","its","itself",
    "they","them","their","theirs","themselves",
}

# Shared tools: lowercasing tweet tokenizer (handles stripped, elongations
# reduced), Porter stemmer, and the English stopword set built once at import.
_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
_stemmer = PorterStemmer()
_stopwords_en = set(stopwords.words("english"))
def process_tweet(tweet):
    """Clean, tokenize, drop stopwords/punctuation, and stem a tweet.

    Returns the list of stemmed tokens.
    """
    # Strip, in order: stock tickers ($GE), a leading 'RT', URLs,
    # and '#' characters (the hashtag word itself is kept).
    for pattern in (r"\$\w*", r"^RT[\s]+", r"https?://[^\s\n\r]+", r"#"):
        tweet = re.sub(pattern, "", tweet)
    return [
        _stemmer.stem(token)
        for token in _tokenizer.tokenize(tweet)
        if token not in _stopwords_en and token not in string.punctuation
    ]
def extract_features_2(tweet, freqs):
    """Two-dimensional frequency features for one tweet.

    x[0, 0]: summed frequency of the processed tokens under label 1.0
    x[0, 1]: summed frequency of the processed tokens under label 0.0
    """
    tokens = process_tweet(tweet)
    features = np.zeros((1, 2))
    # Missing (word, label) pairs contribute 0 via dict.get.
    features[0, 0] = sum(freqs.get((tok, 1.0), 0) for tok in tokens)
    features[0, 1] = sum(freqs.get((tok, 0.0), 0) for tok in tokens)
    return features
def extract_features_6(tweet, freqs):
    """Six-dimensional features for one tweet (raw lower-cased tokens).

    x1: summed token frequency under label 1.0
    x2: summed token frequency under label 0.0
    x3: 1 if the token "no" is present, else 0
    x4: number of tokens that are personal pronouns (see `pronouns` set)
    x5: 1 if '!' occurs anywhere in the raw tweet, else 0
    x6: log of the token count (0 when there are no tokens)
    """
    tokens = _tokenizer.tokenize(tweet)
    x = np.zeros((1, 6))
    for tok in tokens:
        x[0, 0] += freqs.get((tok, 1.0), 0)
        x[0, 1] += freqs.get((tok, 0.0), 0)
    # The remaining features do not depend on the loop variable.
    x[0, 2] = 1 if "no" in tokens else 0
    x[0, 3] = sum(1 for tok in tokens if tok in pronouns)
    x[0, 4] = 1 if "!" in tweet else 0
    x[0, 5] = np.log(len(tokens)) if tokens else 0
    return x
def build_freqs(tweets, ys):
    """Build a (word, sentiment) frequency table.

    Args:
        tweets: list of tweet strings
        ys: m x 1 numpy array (or array-like) of per-tweet sentiment
            labels (0 or 1)
    Returns:
        freqs: dict mapping (word, label) -> occurrence count, where
        words come from process_tweet().
    """
    # Use ravel(), not squeeze(): for a single tweet, ys has shape (1, 1)
    # and np.squeeze collapses it to a 0-d scalar whose .tolist() returns
    # a bare number, which then breaks zip(). ravel always yields a 1-d list.
    yslist = np.asarray(ys).ravel().tolist()
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs
if __name__ == "__main__":
    # Quick smoke test for this module:
    #   - fetch the twitter_samples data
    #   - build the frequency table
    #   - extract both feature variants for one sample tweet
    import nltk
    from nltk.corpus import twitter_samples

    # Download the corpora only if they are missing.
    try:
        twitter_samples.fileids()
    except LookupError:
        nltk.download("twitter_samples")
    try:
        stopwords.words("english")
    except LookupError:
        nltk.download("stopwords")

    # Positive/negative tweets with labels 1/0, stacked into one list.
    positives = twitter_samples.strings("positive_tweets.json")
    negatives = twitter_samples.strings("negative_tweets.json")
    tweets = positives + negatives
    y = np.array([1] * len(positives) + [0] * len(negatives)).reshape(-1, 1)
    print(f"Tổng số tweet: {len(tweets)}")

    # Frequency table over the whole corpus.
    freqs = build_freqs(tweets, y)
    print(f"Số cặp (word, sentiment): {len(freqs)}")

    # Feature extraction on a single sample tweet.
    sample_tweet = tweets[0]
    print("\nTweet mẫu:", sample_tweet)
    print("Tokens (process_tweet):", process_tweet(sample_tweet))
    x2 = extract_features_2(sample_tweet, freqs)
    x6 = extract_features_6(sample_tweet, freqs)
    print("\nFeatures 2 chiều:", x2)
    print("Features 6 chiều:", x6)