Spaces:
Sleeping
Sleeping
Upload DistilBert_YouTube_Sentiment.ipynb
Browse files- DistilBert_YouTube_Sentiment.ipynb +1927 -0
DistilBert_YouTube_Sentiment.ipynb
ADDED
|
@@ -0,0 +1,1927 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"source": [
|
| 6 |
+
"# YouTube Comments Sentiment Analysis with DistilBERT\n",
|
| 7 |
+
"\n",
|
| 8 |
+
"This project performs **sentiment analysis** on YouTube comments using a fine-tuned **DistilBERT transformer model**. \n",
|
| 9 |
+
"The main goals are to: \n",
|
| 10 |
+
"- Load and clean real-world YouTube comments data. \n",
|
| 11 |
+
"- Preprocess text with tokenization, stopword removal, and lemmatization. \n",
|
| 12 |
+
"- Handle class imbalance using **WeightedRandomSampler**. \n",
|
| 13 |
+
"- Fine-tune a transformer model for **multi-class sentiment classification**. \n",
|
| 14 |
+
"\n",
|
| 15 |
+
"We evaluate the model on accuracy and loss, and save the best-performing model for future use.\n"
|
| 16 |
+
],
|
| 17 |
+
"metadata": {
|
| 18 |
+
"id": "GoJEsM2mm5N2"
|
| 19 |
+
}
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"source": [
|
| 23 |
+
"# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,\n",
|
| 24 |
+
"# THEN FEEL FREE TO DELETE THIS CELL.\n",
|
| 25 |
+
"# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON\n",
|
| 26 |
+
"# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR\n",
|
| 27 |
+
"# NOTEBOOK.\n",
|
| 28 |
+
"import kagglehub\n",
|
| 29 |
+
"atifaliak_youtube_comments_dataset_path = kagglehub.dataset_download('atifaliak/youtube-comments-dataset')\n",
|
| 30 |
+
"\n",
|
| 31 |
+
"print('Data source import complete.')\n"
|
| 32 |
+
],
|
| 33 |
+
"metadata": {
|
| 34 |
+
"id": "XuzrjY5i6Ppc",
|
| 35 |
+
"outputId": "6794b245-d535-416e-ba03-9737f7bb4316",
|
| 36 |
+
"colab": {
|
| 37 |
+
"base_uri": "https://localhost:8080/"
|
| 38 |
+
}
|
| 39 |
+
},
|
| 40 |
+
"cell_type": "code",
|
| 41 |
+
"outputs": [
|
| 42 |
+
{
|
| 43 |
+
"output_type": "stream",
|
| 44 |
+
"name": "stdout",
|
| 45 |
+
"text": [
|
| 46 |
+
"Data source import complete.\n"
|
| 47 |
+
]
|
| 48 |
+
}
|
| 49 |
+
],
|
| 50 |
+
"execution_count": null
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"cell_type": "code",
|
| 54 |
+
"source": [
|
| 55 |
+
"atifaliak_youtube_comments_dataset_path"
|
| 56 |
+
],
|
| 57 |
+
"metadata": {
|
| 58 |
+
"id": "W-7b2YCw6d6D",
|
| 59 |
+
"outputId": "ad93ac9c-1b93-446f-aa81-9e3cc3f89bc9",
|
| 60 |
+
"colab": {
|
| 61 |
+
"base_uri": "https://localhost:8080/",
|
| 62 |
+
"height": 35
|
| 63 |
+
}
|
| 64 |
+
},
|
| 65 |
+
"execution_count": null,
|
| 66 |
+
"outputs": [
|
| 67 |
+
{
|
| 68 |
+
"output_type": "execute_result",
|
| 69 |
+
"data": {
|
| 70 |
+
"text/plain": [
|
| 71 |
+
"'/kaggle/input/youtube-comments-dataset'"
|
| 72 |
+
],
|
| 73 |
+
"application/vnd.google.colaboratory.intrinsic+json": {
|
| 74 |
+
"type": "string"
|
| 75 |
+
}
|
| 76 |
+
},
|
| 77 |
+
"metadata": {},
|
| 78 |
+
"execution_count": 2
|
| 79 |
+
}
|
| 80 |
+
]
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"cell_type": "markdown",
|
| 84 |
+
"source": [
|
| 85 |
+
"## 📂 Dataset Loading\n",
|
| 86 |
+
"\n",
|
| 87 |
+
"We use the **YouTube Comments Dataset** from Kaggle: \n",
|
| 88 |
+
"- It contains user comments with associated sentiment labels (`positive`, `negative`, `neutral`). \n",
|
| 89 |
+
"- We load the dataset, check for duplicates and missing values, and clean it for training. \n",
|
| 90 |
+
"\n",
|
| 91 |
+
"This step ensures we start with a high-quality dataset.\n"
|
| 92 |
+
],
|
| 93 |
+
"metadata": {
|
| 94 |
+
"id": "CZH9Jx35q7e8"
|
| 95 |
+
}
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"metadata": {
|
| 99 |
+
"_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
|
| 100 |
+
"_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
|
| 101 |
+
"trusted": true,
|
| 102 |
+
"id": "Y5PBXtai6Ppk",
|
| 103 |
+
"outputId": "30f460a9-72c7-4847-d887-80b165e98bfb",
|
| 104 |
+
"colab": {
|
| 105 |
+
"base_uri": "https://localhost:8080/"
|
| 106 |
+
}
|
| 107 |
+
},
|
| 108 |
+
"cell_type": "code",
|
| 109 |
+
"source": [
|
| 110 |
+
"# This Python 3 environment comes with many helpful analytics libraries installed\n",
|
| 111 |
+
"# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n",
|
| 112 |
+
"# For example, here's several helpful packages to load\n",
|
| 113 |
+
"\n",
|
| 114 |
+
"import numpy as np # linear algebra\n",
|
| 115 |
+
"import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
|
| 116 |
+
"\n",
|
| 117 |
+
"# Input data files are available in the read-only \"../input/\" directory\n",
|
| 118 |
+
"# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n",
|
| 119 |
+
"\n",
|
| 120 |
+
"import os\n",
|
| 121 |
+
"for dirname, _, filenames in os.walk('/root/.cache/kagglehub/datasets/atifaliak/youtube-comments-dataset/versions/1'):\n",
|
| 122 |
+
" for filename in filenames:\n",
|
| 123 |
+
" print(os.path.join(dirname, filename))\n",
|
| 124 |
+
"\n",
|
| 125 |
+
"# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\"\n",
|
| 126 |
+
"# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session"
|
| 127 |
+
],
|
| 128 |
+
"execution_count": null,
|
| 129 |
+
"outputs": [
|
| 130 |
+
{
|
| 131 |
+
"output_type": "stream",
|
| 132 |
+
"name": "stdout",
|
| 133 |
+
"text": [
|
| 134 |
+
"/root/.cache/kagglehub/datasets/atifaliak/youtube-comments-dataset/versions/1/YoutubeCommentsDataSet.csv\n"
|
| 135 |
+
]
|
| 136 |
+
}
|
| 137 |
+
]
|
| 138 |
+
},
|
| 139 |
+
{
|
| 140 |
+
"cell_type": "code",
|
| 141 |
+
"source": [
|
| 142 |
+
"you = pd.read_csv('/root/.cache/kagglehub/datasets/atifaliak/youtube-comments-dataset/versions/1/YoutubeCommentsDataSet.csv')\n",
|
| 143 |
+
"you.head(5)"
|
| 144 |
+
],
|
| 145 |
+
"metadata": {
|
| 146 |
+
"id": "yMlHGUD36gkB",
|
| 147 |
+
"outputId": "3058d45f-5b8d-4142-eec1-cbd88c4e617d",
|
| 148 |
+
"colab": {
|
| 149 |
+
"base_uri": "https://localhost:8080/",
|
| 150 |
+
"height": 206
|
| 151 |
+
}
|
| 152 |
+
},
|
| 153 |
+
"execution_count": null,
|
| 154 |
+
"outputs": [
|
| 155 |
+
{
|
| 156 |
+
"output_type": "execute_result",
|
| 157 |
+
"data": {
|
| 158 |
+
"text/plain": [
|
| 159 |
+
" Comment Sentiment\n",
|
| 160 |
+
"0 lets not forget that apple pay in 2014 require... neutral\n",
|
| 161 |
+
"1 here in nz 50 of retailers don’t even have con... negative\n",
|
| 162 |
+
"2 i will forever acknowledge this channel with t... positive\n",
|
| 163 |
+
"3 whenever i go to a place that doesn’t take app... negative\n",
|
| 164 |
+
"4 apple pay is so convenient secure and easy to ... positive"
|
| 165 |
+
],
|
| 166 |
+
"text/html": [
|
| 167 |
+
"\n",
|
| 168 |
+
" <div id=\"df-cdbfc87d-770f-401e-92ef-6c4c1d221700\" class=\"colab-df-container\">\n",
|
| 169 |
+
" <div>\n",
|
| 170 |
+
"<style scoped>\n",
|
| 171 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 172 |
+
" vertical-align: middle;\n",
|
| 173 |
+
" }\n",
|
| 174 |
+
"\n",
|
| 175 |
+
" .dataframe tbody tr th {\n",
|
| 176 |
+
" vertical-align: top;\n",
|
| 177 |
+
" }\n",
|
| 178 |
+
"\n",
|
| 179 |
+
" .dataframe thead th {\n",
|
| 180 |
+
" text-align: right;\n",
|
| 181 |
+
" }\n",
|
| 182 |
+
"</style>\n",
|
| 183 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 184 |
+
" <thead>\n",
|
| 185 |
+
" <tr style=\"text-align: right;\">\n",
|
| 186 |
+
" <th></th>\n",
|
| 187 |
+
" <th>Comment</th>\n",
|
| 188 |
+
" <th>Sentiment</th>\n",
|
| 189 |
+
" </tr>\n",
|
| 190 |
+
" </thead>\n",
|
| 191 |
+
" <tbody>\n",
|
| 192 |
+
" <tr>\n",
|
| 193 |
+
" <th>0</th>\n",
|
| 194 |
+
" <td>lets not forget that apple pay in 2014 require...</td>\n",
|
| 195 |
+
" <td>neutral</td>\n",
|
| 196 |
+
" </tr>\n",
|
| 197 |
+
" <tr>\n",
|
| 198 |
+
" <th>1</th>\n",
|
| 199 |
+
" <td>here in nz 50 of retailers don’t even have con...</td>\n",
|
| 200 |
+
" <td>negative</td>\n",
|
| 201 |
+
" </tr>\n",
|
| 202 |
+
" <tr>\n",
|
| 203 |
+
" <th>2</th>\n",
|
| 204 |
+
" <td>i will forever acknowledge this channel with t...</td>\n",
|
| 205 |
+
" <td>positive</td>\n",
|
| 206 |
+
" </tr>\n",
|
| 207 |
+
" <tr>\n",
|
| 208 |
+
" <th>3</th>\n",
|
| 209 |
+
" <td>whenever i go to a place that doesn’t take app...</td>\n",
|
| 210 |
+
" <td>negative</td>\n",
|
| 211 |
+
" </tr>\n",
|
| 212 |
+
" <tr>\n",
|
| 213 |
+
" <th>4</th>\n",
|
| 214 |
+
" <td>apple pay is so convenient secure and easy to ...</td>\n",
|
| 215 |
+
" <td>positive</td>\n",
|
| 216 |
+
" </tr>\n",
|
| 217 |
+
" </tbody>\n",
|
| 218 |
+
"</table>\n",
|
| 219 |
+
"</div>\n",
|
| 220 |
+
" <div class=\"colab-df-buttons\">\n",
|
| 221 |
+
"\n",
|
| 222 |
+
" <div class=\"colab-df-container\">\n",
|
| 223 |
+
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-cdbfc87d-770f-401e-92ef-6c4c1d221700')\"\n",
|
| 224 |
+
" title=\"Convert this dataframe to an interactive table.\"\n",
|
| 225 |
+
" style=\"display:none;\">\n",
|
| 226 |
+
"\n",
|
| 227 |
+
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
|
| 228 |
+
" <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
|
| 229 |
+
" </svg>\n",
|
| 230 |
+
" </button>\n",
|
| 231 |
+
"\n",
|
| 232 |
+
" <style>\n",
|
| 233 |
+
" .colab-df-container {\n",
|
| 234 |
+
" display:flex;\n",
|
| 235 |
+
" gap: 12px;\n",
|
| 236 |
+
" }\n",
|
| 237 |
+
"\n",
|
| 238 |
+
" .colab-df-convert {\n",
|
| 239 |
+
" background-color: #E8F0FE;\n",
|
| 240 |
+
" border: none;\n",
|
| 241 |
+
" border-radius: 50%;\n",
|
| 242 |
+
" cursor: pointer;\n",
|
| 243 |
+
" display: none;\n",
|
| 244 |
+
" fill: #1967D2;\n",
|
| 245 |
+
" height: 32px;\n",
|
| 246 |
+
" padding: 0 0 0 0;\n",
|
| 247 |
+
" width: 32px;\n",
|
| 248 |
+
" }\n",
|
| 249 |
+
"\n",
|
| 250 |
+
" .colab-df-convert:hover {\n",
|
| 251 |
+
" background-color: #E2EBFA;\n",
|
| 252 |
+
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
|
| 253 |
+
" fill: #174EA6;\n",
|
| 254 |
+
" }\n",
|
| 255 |
+
"\n",
|
| 256 |
+
" .colab-df-buttons div {\n",
|
| 257 |
+
" margin-bottom: 4px;\n",
|
| 258 |
+
" }\n",
|
| 259 |
+
"\n",
|
| 260 |
+
" [theme=dark] .colab-df-convert {\n",
|
| 261 |
+
" background-color: #3B4455;\n",
|
| 262 |
+
" fill: #D2E3FC;\n",
|
| 263 |
+
" }\n",
|
| 264 |
+
"\n",
|
| 265 |
+
" [theme=dark] .colab-df-convert:hover {\n",
|
| 266 |
+
" background-color: #434B5C;\n",
|
| 267 |
+
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
|
| 268 |
+
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
|
| 269 |
+
" fill: #FFFFFF;\n",
|
| 270 |
+
" }\n",
|
| 271 |
+
" </style>\n",
|
| 272 |
+
"\n",
|
| 273 |
+
" <script>\n",
|
| 274 |
+
" const buttonEl =\n",
|
| 275 |
+
" document.querySelector('#df-cdbfc87d-770f-401e-92ef-6c4c1d221700 button.colab-df-convert');\n",
|
| 276 |
+
" buttonEl.style.display =\n",
|
| 277 |
+
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
|
| 278 |
+
"\n",
|
| 279 |
+
" async function convertToInteractive(key) {\n",
|
| 280 |
+
" const element = document.querySelector('#df-cdbfc87d-770f-401e-92ef-6c4c1d221700');\n",
|
| 281 |
+
" const dataTable =\n",
|
| 282 |
+
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
|
| 283 |
+
" [key], {});\n",
|
| 284 |
+
" if (!dataTable) return;\n",
|
| 285 |
+
"\n",
|
| 286 |
+
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
|
| 287 |
+
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
|
| 288 |
+
" + ' to learn more about interactive tables.';\n",
|
| 289 |
+
" element.innerHTML = '';\n",
|
| 290 |
+
" dataTable['output_type'] = 'display_data';\n",
|
| 291 |
+
" await google.colab.output.renderOutput(dataTable, element);\n",
|
| 292 |
+
" const docLink = document.createElement('div');\n",
|
| 293 |
+
" docLink.innerHTML = docLinkHtml;\n",
|
| 294 |
+
" element.appendChild(docLink);\n",
|
| 295 |
+
" }\n",
|
| 296 |
+
" </script>\n",
|
| 297 |
+
" </div>\n",
|
| 298 |
+
"\n",
|
| 299 |
+
"\n",
|
| 300 |
+
" <div id=\"df-28a4e589-75c9-410c-a555-9fb0ffff72ec\">\n",
|
| 301 |
+
" <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-28a4e589-75c9-410c-a555-9fb0ffff72ec')\"\n",
|
| 302 |
+
" title=\"Suggest charts\"\n",
|
| 303 |
+
" style=\"display:none;\">\n",
|
| 304 |
+
"\n",
|
| 305 |
+
"<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
|
| 306 |
+
" width=\"24px\">\n",
|
| 307 |
+
" <g>\n",
|
| 308 |
+
" <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
|
| 309 |
+
" </g>\n",
|
| 310 |
+
"</svg>\n",
|
| 311 |
+
" </button>\n",
|
| 312 |
+
"\n",
|
| 313 |
+
"<style>\n",
|
| 314 |
+
" .colab-df-quickchart {\n",
|
| 315 |
+
" --bg-color: #E8F0FE;\n",
|
| 316 |
+
" --fill-color: #1967D2;\n",
|
| 317 |
+
" --hover-bg-color: #E2EBFA;\n",
|
| 318 |
+
" --hover-fill-color: #174EA6;\n",
|
| 319 |
+
" --disabled-fill-color: #AAA;\n",
|
| 320 |
+
" --disabled-bg-color: #DDD;\n",
|
| 321 |
+
" }\n",
|
| 322 |
+
"\n",
|
| 323 |
+
" [theme=dark] .colab-df-quickchart {\n",
|
| 324 |
+
" --bg-color: #3B4455;\n",
|
| 325 |
+
" --fill-color: #D2E3FC;\n",
|
| 326 |
+
" --hover-bg-color: #434B5C;\n",
|
| 327 |
+
" --hover-fill-color: #FFFFFF;\n",
|
| 328 |
+
" --disabled-bg-color: #3B4455;\n",
|
| 329 |
+
" --disabled-fill-color: #666;\n",
|
| 330 |
+
" }\n",
|
| 331 |
+
"\n",
|
| 332 |
+
" .colab-df-quickchart {\n",
|
| 333 |
+
" background-color: var(--bg-color);\n",
|
| 334 |
+
" border: none;\n",
|
| 335 |
+
" border-radius: 50%;\n",
|
| 336 |
+
" cursor: pointer;\n",
|
| 337 |
+
" display: none;\n",
|
| 338 |
+
" fill: var(--fill-color);\n",
|
| 339 |
+
" height: 32px;\n",
|
| 340 |
+
" padding: 0;\n",
|
| 341 |
+
" width: 32px;\n",
|
| 342 |
+
" }\n",
|
| 343 |
+
"\n",
|
| 344 |
+
" .colab-df-quickchart:hover {\n",
|
| 345 |
+
" background-color: var(--hover-bg-color);\n",
|
| 346 |
+
" box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
|
| 347 |
+
" fill: var(--button-hover-fill-color);\n",
|
| 348 |
+
" }\n",
|
| 349 |
+
"\n",
|
| 350 |
+
" .colab-df-quickchart-complete:disabled,\n",
|
| 351 |
+
" .colab-df-quickchart-complete:disabled:hover {\n",
|
| 352 |
+
" background-color: var(--disabled-bg-color);\n",
|
| 353 |
+
" fill: var(--disabled-fill-color);\n",
|
| 354 |
+
" box-shadow: none;\n",
|
| 355 |
+
" }\n",
|
| 356 |
+
"\n",
|
| 357 |
+
" .colab-df-spinner {\n",
|
| 358 |
+
" border: 2px solid var(--fill-color);\n",
|
| 359 |
+
" border-color: transparent;\n",
|
| 360 |
+
" border-bottom-color: var(--fill-color);\n",
|
| 361 |
+
" animation:\n",
|
| 362 |
+
" spin 1s steps(1) infinite;\n",
|
| 363 |
+
" }\n",
|
| 364 |
+
"\n",
|
| 365 |
+
" @keyframes spin {\n",
|
| 366 |
+
" 0% {\n",
|
| 367 |
+
" border-color: transparent;\n",
|
| 368 |
+
" border-bottom-color: var(--fill-color);\n",
|
| 369 |
+
" border-left-color: var(--fill-color);\n",
|
| 370 |
+
" }\n",
|
| 371 |
+
" 20% {\n",
|
| 372 |
+
" border-color: transparent;\n",
|
| 373 |
+
" border-left-color: var(--fill-color);\n",
|
| 374 |
+
" border-top-color: var(--fill-color);\n",
|
| 375 |
+
" }\n",
|
| 376 |
+
" 30% {\n",
|
| 377 |
+
" border-color: transparent;\n",
|
| 378 |
+
" border-left-color: var(--fill-color);\n",
|
| 379 |
+
" border-top-color: var(--fill-color);\n",
|
| 380 |
+
" border-right-color: var(--fill-color);\n",
|
| 381 |
+
" }\n",
|
| 382 |
+
" 40% {\n",
|
| 383 |
+
" border-color: transparent;\n",
|
| 384 |
+
" border-right-color: var(--fill-color);\n",
|
| 385 |
+
" border-top-color: var(--fill-color);\n",
|
| 386 |
+
" }\n",
|
| 387 |
+
" 60% {\n",
|
| 388 |
+
" border-color: transparent;\n",
|
| 389 |
+
" border-right-color: var(--fill-color);\n",
|
| 390 |
+
" }\n",
|
| 391 |
+
" 80% {\n",
|
| 392 |
+
" border-color: transparent;\n",
|
| 393 |
+
" border-right-color: var(--fill-color);\n",
|
| 394 |
+
" border-bottom-color: var(--fill-color);\n",
|
| 395 |
+
" }\n",
|
| 396 |
+
" 90% {\n",
|
| 397 |
+
" border-color: transparent;\n",
|
| 398 |
+
" border-bottom-color: var(--fill-color);\n",
|
| 399 |
+
" }\n",
|
| 400 |
+
" }\n",
|
| 401 |
+
"</style>\n",
|
| 402 |
+
"\n",
|
| 403 |
+
" <script>\n",
|
| 404 |
+
" async function quickchart(key) {\n",
|
| 405 |
+
" const quickchartButtonEl =\n",
|
| 406 |
+
" document.querySelector('#' + key + ' button');\n",
|
| 407 |
+
" quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n",
|
| 408 |
+
" quickchartButtonEl.classList.add('colab-df-spinner');\n",
|
| 409 |
+
" try {\n",
|
| 410 |
+
" const charts = await google.colab.kernel.invokeFunction(\n",
|
| 411 |
+
" 'suggestCharts', [key], {});\n",
|
| 412 |
+
" } catch (error) {\n",
|
| 413 |
+
" console.error('Error during call to suggestCharts:', error);\n",
|
| 414 |
+
" }\n",
|
| 415 |
+
" quickchartButtonEl.classList.remove('colab-df-spinner');\n",
|
| 416 |
+
" quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
|
| 417 |
+
" }\n",
|
| 418 |
+
" (() => {\n",
|
| 419 |
+
" let quickchartButtonEl =\n",
|
| 420 |
+
" document.querySelector('#df-28a4e589-75c9-410c-a555-9fb0ffff72ec button');\n",
|
| 421 |
+
" quickchartButtonEl.style.display =\n",
|
| 422 |
+
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
|
| 423 |
+
" })();\n",
|
| 424 |
+
" </script>\n",
|
| 425 |
+
" </div>\n",
|
| 426 |
+
"\n",
|
| 427 |
+
" </div>\n",
|
| 428 |
+
" </div>\n"
|
| 429 |
+
],
|
| 430 |
+
"application/vnd.google.colaboratory.intrinsic+json": {
|
| 431 |
+
"type": "dataframe",
|
| 432 |
+
"variable_name": "you",
|
| 433 |
+
"summary": "{\n \"name\": \"you\",\n \"rows\": 18408,\n \"fields\": [\n {\n \"column\": \"Comment\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 17871,\n \"samples\": [\n \"which apple product are you most excited about in september this is the best iphone accessory in 2022 links to the best iphone accessory deals on amazon airpods pro on sale airpods 3 sale \",\n \"1948 i understand that this is for beginners just starting with sql but in my opinion you should always delete with unique value like id in this case because later youll probably have more than one jeff keep up the good work\",\n \"i want to go to pakistan just to eat food with ali i really enjoyed his personality hes got a great sense of humor and hes obviously super passionate about educating people on the beauty of his country\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Sentiment\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
|
| 434 |
+
}
|
| 435 |
+
},
|
| 436 |
+
"metadata": {},
|
| 437 |
+
"execution_count": 4
|
| 438 |
+
}
|
| 439 |
+
]
|
| 440 |
+
},
|
| 441 |
+
{
|
| 442 |
+
"cell_type": "markdown",
|
| 443 |
+
"source": [
|
| 444 |
+
"## 🧹 Data Cleaning & Preprocessing\n",
|
| 445 |
+
"\n",
|
| 446 |
+
"Steps:\n",
|
| 447 |
+
"1. **Remove Duplicates** — avoid training on repeated comments.\n",
|
| 448 |
+
"2. **Handle Missing Values** — drop empty or NaN comments.\n",
|
| 449 |
+
"3. **Text Normalization** — lowercase, remove punctuation, links, and extra spaces.\n",
|
| 450 |
+
"4. **Tokenization** — split text into individual words.\n",
|
| 451 |
+
"5. **Stopword Removal** — remove common words (e.g., \"the\", \"and\") that don't add meaning.\n",
|
| 452 |
+
"6. **Lemmatization** — reduce words to their base form (e.g., \"running\" → \"run\").\n",
|
| 453 |
+
"\n",
|
| 454 |
+
"This ensures clean, standardized input for the model.\n"
|
| 455 |
+
],
|
| 456 |
+
"metadata": {
|
| 457 |
+
"id": "z5hwWiDunAjM"
|
| 458 |
+
}
|
| 459 |
+
},
|
| 460 |
+
{
|
| 461 |
+
"cell_type": "code",
|
| 462 |
+
"source": [
|
| 463 |
+
"you.duplicated().sum()"
|
| 464 |
+
],
|
| 465 |
+
"metadata": {
|
| 466 |
+
"colab": {
|
| 467 |
+
"base_uri": "https://localhost:8080/"
|
| 468 |
+
},
|
| 469 |
+
"id": "8bUEqWr56kLh",
|
| 470 |
+
"outputId": "d807a75e-7d1b-42f8-c2c9-f12731cea9c8"
|
| 471 |
+
},
|
| 472 |
+
"execution_count": null,
|
| 473 |
+
"outputs": [
|
| 474 |
+
{
|
| 475 |
+
"output_type": "execute_result",
|
| 476 |
+
"data": {
|
| 477 |
+
"text/plain": [
|
| 478 |
+
"np.int64(531)"
|
| 479 |
+
]
|
| 480 |
+
},
|
| 481 |
+
"metadata": {},
|
| 482 |
+
"execution_count": 5
|
| 483 |
+
}
|
| 484 |
+
]
|
| 485 |
+
},
|
| 486 |
+
{
|
| 487 |
+
"cell_type": "code",
|
| 488 |
+
"source": [
|
| 489 |
+
"you.info()"
|
| 490 |
+
],
|
| 491 |
+
"metadata": {
|
| 492 |
+
"colab": {
|
| 493 |
+
"base_uri": "https://localhost:8080/"
|
| 494 |
+
},
|
| 495 |
+
"id": "DvMfIZ7_6nJJ",
|
| 496 |
+
"outputId": "dfbeacc0-0b32-4815-fd7f-60bc47c3c1d2"
|
| 497 |
+
},
|
| 498 |
+
"execution_count": null,
|
| 499 |
+
"outputs": [
|
| 500 |
+
{
|
| 501 |
+
"output_type": "stream",
|
| 502 |
+
"name": "stdout",
|
| 503 |
+
"text": [
|
| 504 |
+
"<class 'pandas.core.frame.DataFrame'>\n",
|
| 505 |
+
"RangeIndex: 18408 entries, 0 to 18407\n",
|
| 506 |
+
"Data columns (total 2 columns):\n",
|
| 507 |
+
" # Column Non-Null Count Dtype \n",
|
| 508 |
+
"--- ------ -------------- ----- \n",
|
| 509 |
+
" 0 Comment 18364 non-null object\n",
|
| 510 |
+
" 1 Sentiment 18408 non-null object\n",
|
| 511 |
+
"dtypes: object(2)\n",
|
| 512 |
+
"memory usage: 287.8+ KB\n"
|
| 513 |
+
]
|
| 514 |
+
}
|
| 515 |
+
]
|
| 516 |
+
},
|
| 517 |
+
{
|
| 518 |
+
"cell_type": "code",
|
| 519 |
+
"source": [
|
| 520 |
+
"you.drop_duplicates(inplace=True)"
|
| 521 |
+
],
|
| 522 |
+
"metadata": {
|
| 523 |
+
"id": "CqT3Ohf36oKp"
|
| 524 |
+
},
|
| 525 |
+
"execution_count": null,
|
| 526 |
+
"outputs": []
|
| 527 |
+
},
|
| 528 |
+
{
|
| 529 |
+
"cell_type": "code",
|
| 530 |
+
"source": [
|
| 531 |
+
"you.isna().sum()"
|
| 532 |
+
],
|
| 533 |
+
"metadata": {
|
| 534 |
+
"colab": {
|
| 535 |
+
"base_uri": "https://localhost:8080/",
|
| 536 |
+
"height": 147
|
| 537 |
+
},
|
| 538 |
+
"id": "G78MAakZ6o75",
|
| 539 |
+
"outputId": "c6ad259c-bab4-4797-9484-2b76c14ec7ef"
|
| 540 |
+
},
|
| 541 |
+
"execution_count": null,
|
| 542 |
+
"outputs": [
|
| 543 |
+
{
|
| 544 |
+
"output_type": "execute_result",
|
| 545 |
+
"data": {
|
| 546 |
+
"text/plain": [
|
| 547 |
+
"Comment 3\n",
|
| 548 |
+
"Sentiment 0\n",
|
| 549 |
+
"dtype: int64"
|
| 550 |
+
],
|
| 551 |
+
"text/html": [
|
| 552 |
+
"<div>\n",
|
| 553 |
+
"<style scoped>\n",
|
| 554 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 555 |
+
" vertical-align: middle;\n",
|
| 556 |
+
" }\n",
|
| 557 |
+
"\n",
|
| 558 |
+
" .dataframe tbody tr th {\n",
|
| 559 |
+
" vertical-align: top;\n",
|
| 560 |
+
" }\n",
|
| 561 |
+
"\n",
|
| 562 |
+
" .dataframe thead th {\n",
|
| 563 |
+
" text-align: right;\n",
|
| 564 |
+
" }\n",
|
| 565 |
+
"</style>\n",
|
| 566 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 567 |
+
" <thead>\n",
|
| 568 |
+
" <tr style=\"text-align: right;\">\n",
|
| 569 |
+
" <th></th>\n",
|
| 570 |
+
" <th>0</th>\n",
|
| 571 |
+
" </tr>\n",
|
| 572 |
+
" </thead>\n",
|
| 573 |
+
" <tbody>\n",
|
| 574 |
+
" <tr>\n",
|
| 575 |
+
" <th>Comment</th>\n",
|
| 576 |
+
" <td>3</td>\n",
|
| 577 |
+
" </tr>\n",
|
| 578 |
+
" <tr>\n",
|
| 579 |
+
" <th>Sentiment</th>\n",
|
| 580 |
+
" <td>0</td>\n",
|
| 581 |
+
" </tr>\n",
|
| 582 |
+
" </tbody>\n",
|
| 583 |
+
"</table>\n",
|
| 584 |
+
"</div><br><label><b>dtype:</b> int64</label>"
|
| 585 |
+
]
|
| 586 |
+
},
|
| 587 |
+
"metadata": {},
|
| 588 |
+
"execution_count": 8
|
| 589 |
+
}
|
| 590 |
+
]
|
| 591 |
+
},
|
| 592 |
+
{
|
| 593 |
+
"cell_type": "code",
|
| 594 |
+
"source": [
|
| 595 |
+
"you.dropna(inplace=True)"
|
| 596 |
+
],
|
| 597 |
+
"metadata": {
|
| 598 |
+
"id": "1N79NbG16p3R"
|
| 599 |
+
},
|
| 600 |
+
"execution_count": null,
|
| 601 |
+
"outputs": []
|
| 602 |
+
},
|
| 603 |
+
{
|
| 604 |
+
"cell_type": "code",
|
| 605 |
+
"source": [
|
| 606 |
+
"import nltk\n",
|
| 607 |
+
"nltk.download('punkt_tab')\n",
|
| 608 |
+
"nltk.download('stopwords')\n",
|
| 609 |
+
"nltk.download('wordnet')\n",
|
| 610 |
+
"nltk.download('omw-1.4')\n",
|
| 611 |
+
"from nltk.corpus import stopwords\n",
|
| 612 |
+
"from nltk.tokenize import word_tokenize\n",
|
| 613 |
+
"from nltk.stem import WordNetLemmatizer\n",
|
| 614 |
+
"import re"
|
| 615 |
+
],
|
| 616 |
+
"metadata": {
|
| 617 |
+
"colab": {
|
| 618 |
+
"base_uri": "https://localhost:8080/"
|
| 619 |
+
},
|
| 620 |
+
"id": "yg6LTfql6rqx",
|
| 621 |
+
"outputId": "cfc5e34b-27db-4dfd-b80b-f0fb88b45e54"
|
| 622 |
+
},
|
| 623 |
+
"execution_count": null,
|
| 624 |
+
"outputs": [
|
| 625 |
+
{
|
| 626 |
+
"output_type": "stream",
|
| 627 |
+
"name": "stderr",
|
| 628 |
+
"text": [
|
| 629 |
+
"[nltk_data] Downloading package punkt_tab to /root/nltk_data...\n",
|
| 630 |
+
"[nltk_data] Package punkt_tab is already up-to-date!\n",
|
| 631 |
+
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
|
| 632 |
+
"[nltk_data] Package stopwords is already up-to-date!\n",
|
| 633 |
+
"[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
|
| 634 |
+
"[nltk_data] Package wordnet is already up-to-date!\n",
|
| 635 |
+
"[nltk_data] Downloading package omw-1.4 to /root/nltk_data...\n",
|
| 636 |
+
"[nltk_data] Package omw-1.4 is already up-to-date!\n"
|
| 637 |
+
]
|
| 638 |
+
}
|
| 639 |
+
]
|
| 640 |
+
},
|
| 641 |
+
{
|
| 642 |
+
"cell_type": "code",
|
| 643 |
+
"source": [
|
| 644 |
+
"stop_words = set(stopwords.words('english'))\n",
|
| 645 |
+
"lemmatizer = WordNetLemmatizer()\n",
|
| 646 |
+
"\n",
|
| 647 |
+
"def preprocess_text(text):\n",
|
| 648 |
+
" # Remove non-alphabet characters\n",
|
| 649 |
+
" text = re.sub(r'[^A-Za-z\\s]', '', text)\n",
|
| 650 |
+
" # Strip URL-like tokens (http.../www...), then normalize whitespace\n",
|
| 651 |
+
" text = re.sub(r'http\\S+|www\\S+|https\\S+', '', text)\n",
|
| 652 |
+
" text = re.sub(r'\\s+', ' ', text).strip()\n",
|
| 653 |
+
" # Lowercase\n",
|
| 654 |
+
" text = text.lower()\n",
|
| 655 |
+
" # Tokenize\n",
|
| 656 |
+
" tokens = word_tokenize(text)\n",
|
| 657 |
+
" # Remove stopwords\n",
|
| 658 |
+
" tokens = [word for word in tokens if word not in stop_words]\n",
|
| 659 |
+
" # Lemmatize\n",
|
| 660 |
+
" tokens = [lemmatizer.lemmatize(word) for word in tokens]\n",
|
| 661 |
+
" # Re-join\n",
|
| 662 |
+
" return ' '.join(tokens)"
|
| 663 |
+
],
|
| 664 |
+
"metadata": {
|
| 665 |
+
"id": "H3jWMBg160U7"
|
| 666 |
+
},
|
| 667 |
+
"execution_count": null,
|
| 668 |
+
"outputs": []
|
| 669 |
+
},
|
| 670 |
+
{
|
| 671 |
+
"cell_type": "code",
|
| 672 |
+
"source": [
|
| 673 |
+
"you['Comment'] = you['Comment'].apply(preprocess_text)"
|
| 674 |
+
],
|
| 675 |
+
"metadata": {
|
| 676 |
+
"id": "LnMIBLws7FCv"
|
| 677 |
+
},
|
| 678 |
+
"execution_count": null,
|
| 679 |
+
"outputs": []
|
| 680 |
+
},
|
| 681 |
+
{
|
| 682 |
+
"cell_type": "code",
|
| 683 |
+
"source": [
|
| 684 |
+
"you.head(5)"
|
| 685 |
+
],
|
| 686 |
+
"metadata": {
|
| 687 |
+
"colab": {
|
| 688 |
+
"base_uri": "https://localhost:8080/",
|
| 689 |
+
"height": 206
|
| 690 |
+
},
|
| 691 |
+
"id": "XAoeps_E7HNp",
|
| 692 |
+
"outputId": "afb1559a-47bb-46b7-f48e-740da0cf8e28"
|
| 693 |
+
},
|
| 694 |
+
"execution_count": null,
|
| 695 |
+
"outputs": [
|
| 696 |
+
{
|
| 697 |
+
"output_type": "execute_result",
|
| 698 |
+
"data": {
|
| 699 |
+
"text/plain": [
|
| 700 |
+
" Comment Sentiment\n",
|
| 701 |
+
"0 let forget apple pay required brand new iphone... neutral\n",
|
| 702 |
+
"1 nz retailer dont even contactless credit card ... negative\n",
|
| 703 |
+
"2 forever acknowledge channel help lesson idea e... positive\n",
|
| 704 |
+
"3 whenever go place doesnt take apple pay doesnt... negative\n",
|
| 705 |
+
"4 apple pay convenient secure easy use used kore... positive"
|
| 706 |
+
],
|
| 707 |
+
"text/html": [
|
| 708 |
+
"\n",
|
| 709 |
+
" <div id=\"df-be296ed2-fbb5-4cad-9c4b-f18e16fd242a\" class=\"colab-df-container\">\n",
|
| 710 |
+
" <div>\n",
|
| 711 |
+
"<style scoped>\n",
|
| 712 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 713 |
+
" vertical-align: middle;\n",
|
| 714 |
+
" }\n",
|
| 715 |
+
"\n",
|
| 716 |
+
" .dataframe tbody tr th {\n",
|
| 717 |
+
" vertical-align: top;\n",
|
| 718 |
+
" }\n",
|
| 719 |
+
"\n",
|
| 720 |
+
" .dataframe thead th {\n",
|
| 721 |
+
" text-align: right;\n",
|
| 722 |
+
" }\n",
|
| 723 |
+
"</style>\n",
|
| 724 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 725 |
+
" <thead>\n",
|
| 726 |
+
" <tr style=\"text-align: right;\">\n",
|
| 727 |
+
" <th></th>\n",
|
| 728 |
+
" <th>Comment</th>\n",
|
| 729 |
+
" <th>Sentiment</th>\n",
|
| 730 |
+
" </tr>\n",
|
| 731 |
+
" </thead>\n",
|
| 732 |
+
" <tbody>\n",
|
| 733 |
+
" <tr>\n",
|
| 734 |
+
" <th>0</th>\n",
|
| 735 |
+
" <td>let forget apple pay required brand new iphone...</td>\n",
|
| 736 |
+
" <td>neutral</td>\n",
|
| 737 |
+
" </tr>\n",
|
| 738 |
+
" <tr>\n",
|
| 739 |
+
" <th>1</th>\n",
|
| 740 |
+
" <td>nz retailer dont even contactless credit card ...</td>\n",
|
| 741 |
+
" <td>negative</td>\n",
|
| 742 |
+
" </tr>\n",
|
| 743 |
+
" <tr>\n",
|
| 744 |
+
" <th>2</th>\n",
|
| 745 |
+
" <td>forever acknowledge channel help lesson idea e...</td>\n",
|
| 746 |
+
" <td>positive</td>\n",
|
| 747 |
+
" </tr>\n",
|
| 748 |
+
" <tr>\n",
|
| 749 |
+
" <th>3</th>\n",
|
| 750 |
+
" <td>whenever go place doesnt take apple pay doesnt...</td>\n",
|
| 751 |
+
" <td>negative</td>\n",
|
| 752 |
+
" </tr>\n",
|
| 753 |
+
" <tr>\n",
|
| 754 |
+
" <th>4</th>\n",
|
| 755 |
+
" <td>apple pay convenient secure easy use used kore...</td>\n",
|
| 756 |
+
" <td>positive</td>\n",
|
| 757 |
+
" </tr>\n",
|
| 758 |
+
" </tbody>\n",
|
| 759 |
+
"</table>\n",
|
| 760 |
+
"</div>\n",
|
| 761 |
+
" <div class=\"colab-df-buttons\">\n",
|
| 762 |
+
"\n",
|
| 763 |
+
" <div class=\"colab-df-container\">\n",
|
| 764 |
+
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-be296ed2-fbb5-4cad-9c4b-f18e16fd242a')\"\n",
|
| 765 |
+
" title=\"Convert this dataframe to an interactive table.\"\n",
|
| 766 |
+
" style=\"display:none;\">\n",
|
| 767 |
+
"\n",
|
| 768 |
+
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
|
| 769 |
+
" <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
|
| 770 |
+
" </svg>\n",
|
| 771 |
+
" </button>\n",
|
| 772 |
+
"\n",
|
| 773 |
+
" <style>\n",
|
| 774 |
+
" .colab-df-container {\n",
|
| 775 |
+
" display:flex;\n",
|
| 776 |
+
" gap: 12px;\n",
|
| 777 |
+
" }\n",
|
| 778 |
+
"\n",
|
| 779 |
+
" .colab-df-convert {\n",
|
| 780 |
+
" background-color: #E8F0FE;\n",
|
| 781 |
+
" border: none;\n",
|
| 782 |
+
" border-radius: 50%;\n",
|
| 783 |
+
" cursor: pointer;\n",
|
| 784 |
+
" display: none;\n",
|
| 785 |
+
" fill: #1967D2;\n",
|
| 786 |
+
" height: 32px;\n",
|
| 787 |
+
" padding: 0 0 0 0;\n",
|
| 788 |
+
" width: 32px;\n",
|
| 789 |
+
" }\n",
|
| 790 |
+
"\n",
|
| 791 |
+
" .colab-df-convert:hover {\n",
|
| 792 |
+
" background-color: #E2EBFA;\n",
|
| 793 |
+
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
|
| 794 |
+
" fill: #174EA6;\n",
|
| 795 |
+
" }\n",
|
| 796 |
+
"\n",
|
| 797 |
+
" .colab-df-buttons div {\n",
|
| 798 |
+
" margin-bottom: 4px;\n",
|
| 799 |
+
" }\n",
|
| 800 |
+
"\n",
|
| 801 |
+
" [theme=dark] .colab-df-convert {\n",
|
| 802 |
+
" background-color: #3B4455;\n",
|
| 803 |
+
" fill: #D2E3FC;\n",
|
| 804 |
+
" }\n",
|
| 805 |
+
"\n",
|
| 806 |
+
" [theme=dark] .colab-df-convert:hover {\n",
|
| 807 |
+
" background-color: #434B5C;\n",
|
| 808 |
+
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
|
| 809 |
+
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
|
| 810 |
+
" fill: #FFFFFF;\n",
|
| 811 |
+
" }\n",
|
| 812 |
+
" </style>\n",
|
| 813 |
+
"\n",
|
| 814 |
+
" <script>\n",
|
| 815 |
+
" const buttonEl =\n",
|
| 816 |
+
" document.querySelector('#df-be296ed2-fbb5-4cad-9c4b-f18e16fd242a button.colab-df-convert');\n",
|
| 817 |
+
" buttonEl.style.display =\n",
|
| 818 |
+
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
|
| 819 |
+
"\n",
|
| 820 |
+
" async function convertToInteractive(key) {\n",
|
| 821 |
+
" const element = document.querySelector('#df-be296ed2-fbb5-4cad-9c4b-f18e16fd242a');\n",
|
| 822 |
+
" const dataTable =\n",
|
| 823 |
+
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
|
| 824 |
+
" [key], {});\n",
|
| 825 |
+
" if (!dataTable) return;\n",
|
| 826 |
+
"\n",
|
| 827 |
+
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
|
| 828 |
+
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
|
| 829 |
+
" + ' to learn more about interactive tables.';\n",
|
| 830 |
+
" element.innerHTML = '';\n",
|
| 831 |
+
" dataTable['output_type'] = 'display_data';\n",
|
| 832 |
+
" await google.colab.output.renderOutput(dataTable, element);\n",
|
| 833 |
+
" const docLink = document.createElement('div');\n",
|
| 834 |
+
" docLink.innerHTML = docLinkHtml;\n",
|
| 835 |
+
" element.appendChild(docLink);\n",
|
| 836 |
+
" }\n",
|
| 837 |
+
" </script>\n",
|
| 838 |
+
" </div>\n",
|
| 839 |
+
"\n",
|
| 840 |
+
"\n",
|
| 841 |
+
" <div id=\"df-a67741c4-5be4-4c74-a0e4-1d9184e969ee\">\n",
|
| 842 |
+
" <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-a67741c4-5be4-4c74-a0e4-1d9184e969ee')\"\n",
|
| 843 |
+
" title=\"Suggest charts\"\n",
|
| 844 |
+
" style=\"display:none;\">\n",
|
| 845 |
+
"\n",
|
| 846 |
+
"<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
|
| 847 |
+
" width=\"24px\">\n",
|
| 848 |
+
" <g>\n",
|
| 849 |
+
" <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
|
| 850 |
+
" </g>\n",
|
| 851 |
+
"</svg>\n",
|
| 852 |
+
" </button>\n",
|
| 853 |
+
"\n",
|
| 854 |
+
"<style>\n",
|
| 855 |
+
" .colab-df-quickchart {\n",
|
| 856 |
+
" --bg-color: #E8F0FE;\n",
|
| 857 |
+
" --fill-color: #1967D2;\n",
|
| 858 |
+
" --hover-bg-color: #E2EBFA;\n",
|
| 859 |
+
" --hover-fill-color: #174EA6;\n",
|
| 860 |
+
" --disabled-fill-color: #AAA;\n",
|
| 861 |
+
" --disabled-bg-color: #DDD;\n",
|
| 862 |
+
" }\n",
|
| 863 |
+
"\n",
|
| 864 |
+
" [theme=dark] .colab-df-quickchart {\n",
|
| 865 |
+
" --bg-color: #3B4455;\n",
|
| 866 |
+
" --fill-color: #D2E3FC;\n",
|
| 867 |
+
" --hover-bg-color: #434B5C;\n",
|
| 868 |
+
" --hover-fill-color: #FFFFFF;\n",
|
| 869 |
+
" --disabled-bg-color: #3B4455;\n",
|
| 870 |
+
" --disabled-fill-color: #666;\n",
|
| 871 |
+
" }\n",
|
| 872 |
+
"\n",
|
| 873 |
+
" .colab-df-quickchart {\n",
|
| 874 |
+
" background-color: var(--bg-color);\n",
|
| 875 |
+
" border: none;\n",
|
| 876 |
+
" border-radius: 50%;\n",
|
| 877 |
+
" cursor: pointer;\n",
|
| 878 |
+
" display: none;\n",
|
| 879 |
+
" fill: var(--fill-color);\n",
|
| 880 |
+
" height: 32px;\n",
|
| 881 |
+
" padding: 0;\n",
|
| 882 |
+
" width: 32px;\n",
|
| 883 |
+
" }\n",
|
| 884 |
+
"\n",
|
| 885 |
+
" .colab-df-quickchart:hover {\n",
|
| 886 |
+
" background-color: var(--hover-bg-color);\n",
|
| 887 |
+
" box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
|
| 888 |
+
" fill: var(--button-hover-fill-color);\n",
|
| 889 |
+
" }\n",
|
| 890 |
+
"\n",
|
| 891 |
+
" .colab-df-quickchart-complete:disabled,\n",
|
| 892 |
+
" .colab-df-quickchart-complete:disabled:hover {\n",
|
| 893 |
+
" background-color: var(--disabled-bg-color);\n",
|
| 894 |
+
" fill: var(--disabled-fill-color);\n",
|
| 895 |
+
" box-shadow: none;\n",
|
| 896 |
+
" }\n",
|
| 897 |
+
"\n",
|
| 898 |
+
" .colab-df-spinner {\n",
|
| 899 |
+
" border: 2px solid var(--fill-color);\n",
|
| 900 |
+
" border-color: transparent;\n",
|
| 901 |
+
" border-bottom-color: var(--fill-color);\n",
|
| 902 |
+
" animation:\n",
|
| 903 |
+
" spin 1s steps(1) infinite;\n",
|
| 904 |
+
" }\n",
|
| 905 |
+
"\n",
|
| 906 |
+
" @keyframes spin {\n",
|
| 907 |
+
" 0% {\n",
|
| 908 |
+
" border-color: transparent;\n",
|
| 909 |
+
" border-bottom-color: var(--fill-color);\n",
|
| 910 |
+
" border-left-color: var(--fill-color);\n",
|
| 911 |
+
" }\n",
|
| 912 |
+
" 20% {\n",
|
| 913 |
+
" border-color: transparent;\n",
|
| 914 |
+
" border-left-color: var(--fill-color);\n",
|
| 915 |
+
" border-top-color: var(--fill-color);\n",
|
| 916 |
+
" }\n",
|
| 917 |
+
" 30% {\n",
|
| 918 |
+
" border-color: transparent;\n",
|
| 919 |
+
" border-left-color: var(--fill-color);\n",
|
| 920 |
+
" border-top-color: var(--fill-color);\n",
|
| 921 |
+
" border-right-color: var(--fill-color);\n",
|
| 922 |
+
" }\n",
|
| 923 |
+
" 40% {\n",
|
| 924 |
+
" border-color: transparent;\n",
|
| 925 |
+
" border-right-color: var(--fill-color);\n",
|
| 926 |
+
" border-top-color: var(--fill-color);\n",
|
| 927 |
+
" }\n",
|
| 928 |
+
" 60% {\n",
|
| 929 |
+
" border-color: transparent;\n",
|
| 930 |
+
" border-right-color: var(--fill-color);\n",
|
| 931 |
+
" }\n",
|
| 932 |
+
" 80% {\n",
|
| 933 |
+
" border-color: transparent;\n",
|
| 934 |
+
" border-right-color: var(--fill-color);\n",
|
| 935 |
+
" border-bottom-color: var(--fill-color);\n",
|
| 936 |
+
" }\n",
|
| 937 |
+
" 90% {\n",
|
| 938 |
+
" border-color: transparent;\n",
|
| 939 |
+
" border-bottom-color: var(--fill-color);\n",
|
| 940 |
+
" }\n",
|
| 941 |
+
" }\n",
|
| 942 |
+
"</style>\n",
|
| 943 |
+
"\n",
|
| 944 |
+
" <script>\n",
|
| 945 |
+
" async function quickchart(key) {\n",
|
| 946 |
+
" const quickchartButtonEl =\n",
|
| 947 |
+
" document.querySelector('#' + key + ' button');\n",
|
| 948 |
+
" quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n",
|
| 949 |
+
" quickchartButtonEl.classList.add('colab-df-spinner');\n",
|
| 950 |
+
" try {\n",
|
| 951 |
+
" const charts = await google.colab.kernel.invokeFunction(\n",
|
| 952 |
+
" 'suggestCharts', [key], {});\n",
|
| 953 |
+
" } catch (error) {\n",
|
| 954 |
+
" console.error('Error during call to suggestCharts:', error);\n",
|
| 955 |
+
" }\n",
|
| 956 |
+
" quickchartButtonEl.classList.remove('colab-df-spinner');\n",
|
| 957 |
+
" quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
|
| 958 |
+
" }\n",
|
| 959 |
+
" (() => {\n",
|
| 960 |
+
" let quickchartButtonEl =\n",
|
| 961 |
+
" document.querySelector('#df-a67741c4-5be4-4c74-a0e4-1d9184e969ee button');\n",
|
| 962 |
+
" quickchartButtonEl.style.display =\n",
|
| 963 |
+
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
|
| 964 |
+
" })();\n",
|
| 965 |
+
" </script>\n",
|
| 966 |
+
" </div>\n",
|
| 967 |
+
"\n",
|
| 968 |
+
" </div>\n",
|
| 969 |
+
" </div>\n"
|
| 970 |
+
],
|
| 971 |
+
"application/vnd.google.colaboratory.intrinsic+json": {
|
| 972 |
+
"type": "dataframe",
|
| 973 |
+
"variable_name": "you",
|
| 974 |
+
"summary": "{\n \"name\": \"you\",\n \"rows\": 17874,\n \"fields\": [\n {\n \"column\": \"Comment\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 17509,\n \"samples\": [\n \"reading bill brysona short history nearly everything pull microbiology taxonomy textbook thank detailed presentation helping immensely understanding tip iceberg\",\n \"took ng class coursera year ago switched career trajectory academic industry spurred interest making thing awesome see interview\",\n \"remember getting line subway delhi int airport line say people nonindians getting late line seemed huge also wondering many foreigner eating vegetarian one got front cashier told nonveg line side looked spotted standing vegetarian line entire queue moved got sub time flight fun time\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Sentiment\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
|
| 975 |
+
}
|
| 976 |
+
},
|
| 977 |
+
"metadata": {},
|
| 978 |
+
"execution_count": 13
|
| 979 |
+
}
|
| 980 |
+
]
|
| 981 |
+
},
|
| 982 |
+
{
|
| 983 |
+
"cell_type": "markdown",
|
| 984 |
+
"source": [
|
| 985 |
+
"## Train-Test Split\n",
|
| 986 |
+
"\n",
|
| 987 |
+
"We split the dataset into:\n",
|
| 988 |
+
"- **Training Set (80%)** — for learning.\n",
|
| 989 |
+
"- **Test Set (20%)** — for final evaluation.\n",
|
| 990 |
+
"\n",
|
| 991 |
+
"We also encode sentiment labels into numeric form (`0`, `1`, `2`) for model compatibility.\n"
|
| 992 |
+
],
|
| 993 |
+
"metadata": {
|
| 994 |
+
"id": "si80lTtLnGfw"
|
| 995 |
+
}
|
| 996 |
+
},
|
| 997 |
+
{
|
| 998 |
+
"cell_type": "code",
|
| 999 |
+
"source": [
|
| 1000 |
+
"x = you['Comment']\n",
|
| 1001 |
+
"y = you['Sentiment']"
|
| 1002 |
+
],
|
| 1003 |
+
"metadata": {
|
| 1004 |
+
"id": "_UkM8Xxw7ONZ"
|
| 1005 |
+
},
|
| 1006 |
+
"execution_count": null,
|
| 1007 |
+
"outputs": []
|
| 1008 |
+
},
|
| 1009 |
+
{
|
| 1010 |
+
"cell_type": "code",
|
| 1011 |
+
"source": [
|
| 1012 |
+
"from sklearn.preprocessing import LabelEncoder\n",
|
| 1013 |
+
"le = LabelEncoder()\n",
|
| 1014 |
+
"y = le.fit_transform(y)"
|
| 1015 |
+
],
|
| 1016 |
+
"metadata": {
|
| 1017 |
+
"id": "kAJjQ5BV7Q35"
|
| 1018 |
+
},
|
| 1019 |
+
"execution_count": null,
|
| 1020 |
+
"outputs": []
|
| 1021 |
+
},
|
| 1022 |
+
{
|
| 1023 |
+
"cell_type": "code",
|
| 1024 |
+
"source": [
|
| 1025 |
+
"from sklearn.model_selection import train_test_split\n",
|
| 1026 |
+
"x_train, x_test, y_train, y_test = train_test_split(x.to_list(), y.tolist(), test_size=0.2, random_state=42)"
|
| 1027 |
+
],
|
| 1028 |
+
"metadata": {
|
| 1029 |
+
"id": "iTVKauGi7SzJ"
|
| 1030 |
+
},
|
| 1031 |
+
"execution_count": null,
|
| 1032 |
+
"outputs": []
|
| 1033 |
+
},
|
| 1034 |
+
{
|
| 1035 |
+
"cell_type": "code",
|
| 1036 |
+
"source": [
|
| 1037 |
+
"from torch.utils.data import WeightedRandomSampler"
|
| 1038 |
+
],
|
| 1039 |
+
"metadata": {
|
| 1040 |
+
"id": "xgdTb3GW7VUJ"
|
| 1041 |
+
},
|
| 1042 |
+
"execution_count": null,
|
| 1043 |
+
"outputs": []
|
| 1044 |
+
},
|
| 1045 |
+
{
|
| 1046 |
+
"cell_type": "markdown",
|
| 1047 |
+
"source": [
|
| 1048 |
+
"## ⚖️ Handling Class Imbalance\n",
|
| 1049 |
+
"\n",
|
| 1050 |
+
"Sentiment datasets are often **imbalanced** — some classes have more samples than others. \n",
|
| 1051 |
+
"We use **WeightedRandomSampler** to:\n",
|
| 1052 |
+
"- Assign higher sampling weight to underrepresented classes.\n",
|
| 1053 |
+
"- Ensure balanced batches during training.\n",
|
| 1054 |
+
"\n",
|
| 1055 |
+
"This improves model fairness and prevents bias toward the majority class.\n"
|
| 1056 |
+
],
|
| 1057 |
+
"metadata": {
|
| 1058 |
+
"id": "T6hvbOZ1nIMc"
|
| 1059 |
+
}
|
| 1060 |
+
},
|
| 1061 |
+
{
|
| 1062 |
+
"cell_type": "code",
|
| 1063 |
+
"source": [
|
| 1064 |
+
"class_count = np.bincount(y_train)\n",
|
| 1065 |
+
"\n",
|
| 1066 |
+
"# Inverse-frequency class weights; assumes every class occurs in y_train (a zero count would divide by zero)\n",
|
| 1067 |
+
"class_weights = 1. / class_count\n",
|
| 1068 |
+
"\n",
|
| 1069 |
+
"\n",
|
| 1070 |
+
"# Assign weight to each sample\n",
|
| 1071 |
+
"sample_weights = [class_weights[label] for label in y_train]\n",
|
| 1072 |
+
"\n",
|
| 1073 |
+
"# Create sampler\n",
|
| 1074 |
+
"sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)"
|
| 1075 |
+
],
|
| 1076 |
+
"metadata": {
|
| 1077 |
+
"id": "3HMbR_XP7XQh"
|
| 1078 |
+
},
|
| 1079 |
+
"execution_count": null,
|
| 1080 |
+
"outputs": []
|
| 1081 |
+
},
|
| 1082 |
+
{
|
| 1083 |
+
"cell_type": "markdown",
|
| 1084 |
+
"source": [
|
| 1085 |
+
"## Import of Needed Libraries"
|
| 1086 |
+
],
|
| 1087 |
+
"metadata": {
|
| 1088 |
+
"id": "BxlWC_V7rNv2"
|
| 1089 |
+
}
|
| 1090 |
+
},
|
| 1091 |
+
{
|
| 1092 |
+
"cell_type": "code",
|
| 1093 |
+
"source": [
|
| 1094 |
+
"from transformers import DistilBertTokenizer, DistilBertForSequenceClassification\n",
|
| 1095 |
+
"from torch.utils.data import DataLoader , Dataset\n",
|
| 1096 |
+
"from torch.optim import AdamW\n",
|
| 1097 |
+
"from tqdm import tqdm\n",
|
| 1098 |
+
"import torch\n",
|
| 1099 |
+
"import torch.nn as nn"
|
| 1100 |
+
],
|
| 1101 |
+
"metadata": {
|
| 1102 |
+
"id": "EVxbo4hz7heZ"
|
| 1103 |
+
},
|
| 1104 |
+
"execution_count": null,
|
| 1105 |
+
"outputs": []
|
| 1106 |
+
},
|
| 1107 |
+
{
|
| 1108 |
+
"cell_type": "markdown",
|
| 1109 |
+
"source": [
|
| 1110 |
+
"## ✂️ Tokenization with DistilBERT\n",
|
| 1111 |
+
"\n",
|
| 1112 |
+
"We use the `DistilBertTokenizer` to:\n",
|
| 1113 |
+
"- Convert text into tokens understood by BERT.\n",
|
| 1114 |
+
"- Truncate sequences to at most 128 tokens (`max_length=128`) and pad shorter ones to the longest sequence in the call.\n",
|
| 1115 |
+
"- Output tensors for PyTorch training.\n",
|
| 1116 |
+
"\n",
|
| 1117 |
+
"Tokenizer output includes:\n",
|
| 1118 |
+
"- `input_ids` and `attention_mask` (the encoded batch)\n",
|
| 1119 |
+
"- `labels` (attached to each item by the dataset wrapper, not by the tokenizer)\n"
|
| 1120 |
+
],
|
| 1121 |
+
"metadata": {
|
| 1122 |
+
"id": "-KFa2NxOrSY-"
|
| 1123 |
+
}
|
| 1124 |
+
},
|
| 1125 |
+
{
|
| 1126 |
+
"cell_type": "code",
|
| 1127 |
+
"source": [
|
| 1128 |
+
"tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")"
|
| 1129 |
+
],
|
| 1130 |
+
"metadata": {
|
| 1131 |
+
"colab": {
|
| 1132 |
+
"base_uri": "https://localhost:8080/"
|
| 1133 |
+
},
|
| 1134 |
+
"id": "5bKf2dq77kLh",
|
| 1135 |
+
"outputId": "00d3b4e6-d552-4cab-bd51-ba1581da362a"
|
| 1136 |
+
},
|
| 1137 |
+
"execution_count": null,
|
| 1138 |
+
"outputs": [
|
| 1139 |
+
{
|
| 1140 |
+
"output_type": "stream",
|
| 1141 |
+
"name": "stderr",
|
| 1142 |
+
"text": [
|
| 1143 |
+
"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n",
|
| 1144 |
+
"The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
|
| 1145 |
+
"To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
|
| 1146 |
+
"You will be able to reuse this secret in all of your notebooks.\n",
|
| 1147 |
+
"Please note that authentication is recommended but still optional to access public models or datasets.\n",
|
| 1148 |
+
" warnings.warn(\n"
|
| 1149 |
+
]
|
| 1150 |
+
}
|
| 1151 |
+
]
|
| 1152 |
+
},
|
| 1153 |
+
{
|
| 1154 |
+
"cell_type": "code",
|
| 1155 |
+
"source": [
|
| 1156 |
+
"train_encodings = tokenizer(x_train, truncation=True, padding=True, max_length=128, return_tensors='pt')\n",
|
| 1157 |
+
"test_encodings = tokenizer(x_test, truncation=True, padding=True, max_length=128, return_tensors='pt')"
|
| 1158 |
+
],
|
| 1159 |
+
"metadata": {
|
| 1160 |
+
"id": "VFrqAA9V7oo1"
|
| 1161 |
+
},
|
| 1162 |
+
"execution_count": null,
|
| 1163 |
+
"outputs": []
|
| 1164 |
+
},
|
| 1165 |
+
{
|
| 1166 |
+
"cell_type": "code",
|
| 1167 |
+
"source": [
|
| 1168 |
+
"class transform_text(Dataset):\n",
|
| 1169 |
+
" def __init__(self, encodings, labels):\n",
|
| 1170 |
+
" self.encodings = encodings\n",
|
| 1171 |
+
" self.labels = labels\n",
|
| 1172 |
+
" def __len__(self):\n",
|
| 1173 |
+
" return len(self.labels)\n",
|
| 1174 |
+
" def __getitem__(self, idx):\n",
|
| 1175 |
+
" item = {key: val[idx] for key, val in self.encodings.items()}\n",
|
| 1176 |
+
" item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)\n",
|
| 1177 |
+
" return item"
|
| 1178 |
+
],
|
| 1179 |
+
"metadata": {
|
| 1180 |
+
"id": "9u-S_ipV7pM_"
|
| 1181 |
+
},
|
| 1182 |
+
"execution_count": null,
|
| 1183 |
+
"outputs": []
|
| 1184 |
+
},
|
| 1185 |
+
{
|
| 1186 |
+
"cell_type": "code",
|
| 1187 |
+
"source": [
|
| 1188 |
+
"train_set = transform_text(train_encodings, y_train)\n",
|
| 1189 |
+
"test_set = transform_text(test_encodings, y_test)"
|
| 1190 |
+
],
|
| 1191 |
+
"metadata": {
|
| 1192 |
+
"id": "8-UUdkT8726R"
|
| 1193 |
+
},
|
| 1194 |
+
"execution_count": null,
|
| 1195 |
+
"outputs": []
|
| 1196 |
+
},
|
| 1197 |
+
{
|
| 1198 |
+
"cell_type": "code",
|
| 1199 |
+
"source": [
|
| 1200 |
+
"you['Sentiment'].value_counts(normalize=True)"
|
| 1201 |
+
],
|
| 1202 |
+
"metadata": {
|
| 1203 |
+
"colab": {
|
| 1204 |
+
"base_uri": "https://localhost:8080/",
|
| 1205 |
+
"height": 210
|
| 1206 |
+
},
|
| 1207 |
+
"id": "Htkw_1-877Kv",
|
| 1208 |
+
"outputId": "b62b5679-f1e2-4c28-8686-fd1fef07fd84"
|
| 1209 |
+
},
|
| 1210 |
+
"execution_count": null,
|
| 1211 |
+
"outputs": [
|
| 1212 |
+
{
|
| 1213 |
+
"output_type": "execute_result",
|
| 1214 |
+
"data": {
|
| 1215 |
+
"text/plain": [
|
| 1216 |
+
"Sentiment\n",
|
| 1217 |
+
"positive 0.61844\n",
|
| 1218 |
+
"neutral 0.25193\n",
|
| 1219 |
+
"negative 0.12963\n",
|
| 1220 |
+
"Name: proportion, dtype: float64"
|
| 1221 |
+
],
|
| 1222 |
+
"text/html": [
|
| 1223 |
+
"<div>\n",
|
| 1224 |
+
"<style scoped>\n",
|
| 1225 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 1226 |
+
" vertical-align: middle;\n",
|
| 1227 |
+
" }\n",
|
| 1228 |
+
"\n",
|
| 1229 |
+
" .dataframe tbody tr th {\n",
|
| 1230 |
+
" vertical-align: top;\n",
|
| 1231 |
+
" }\n",
|
| 1232 |
+
"\n",
|
| 1233 |
+
" .dataframe thead th {\n",
|
| 1234 |
+
" text-align: right;\n",
|
| 1235 |
+
" }\n",
|
| 1236 |
+
"</style>\n",
|
| 1237 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 1238 |
+
" <thead>\n",
|
| 1239 |
+
" <tr style=\"text-align: right;\">\n",
|
| 1240 |
+
" <th></th>\n",
|
| 1241 |
+
" <th>proportion</th>\n",
|
| 1242 |
+
" </tr>\n",
|
| 1243 |
+
" <tr>\n",
|
| 1244 |
+
" <th>Sentiment</th>\n",
|
| 1245 |
+
" <th></th>\n",
|
| 1246 |
+
" </tr>\n",
|
| 1247 |
+
" </thead>\n",
|
| 1248 |
+
" <tbody>\n",
|
| 1249 |
+
" <tr>\n",
|
| 1250 |
+
" <th>positive</th>\n",
|
| 1251 |
+
" <td>0.61844</td>\n",
|
| 1252 |
+
" </tr>\n",
|
| 1253 |
+
" <tr>\n",
|
| 1254 |
+
" <th>neutral</th>\n",
|
| 1255 |
+
" <td>0.25193</td>\n",
|
| 1256 |
+
" </tr>\n",
|
| 1257 |
+
" <tr>\n",
|
| 1258 |
+
" <th>negative</th>\n",
|
| 1259 |
+
" <td>0.12963</td>\n",
|
| 1260 |
+
" </tr>\n",
|
| 1261 |
+
" </tbody>\n",
|
| 1262 |
+
"</table>\n",
|
| 1263 |
+
"</div><br><label><b>dtype:</b> float64</label>"
|
| 1264 |
+
]
|
| 1265 |
+
},
|
| 1266 |
+
"metadata": {},
|
| 1267 |
+
"execution_count": 24
|
| 1268 |
+
}
|
| 1269 |
+
]
|
| 1270 |
+
},
|
| 1271 |
+
{
|
| 1272 |
+
"cell_type": "markdown",
|
| 1273 |
+
"source": [
|
| 1274 |
+
"## 🤖 Model Definition: DistilBERT\n",
|
| 1275 |
+
"\n",
|
| 1276 |
+
"We load **DistilBERT** with a classification head:\n",
|
| 1277 |
+
"- Base: `distilbert-base-uncased`\n",
|
| 1278 |
+
"- Output layer: `num_labels=3` (Positive, Negative, Neutral)\n",
|
| 1279 |
+
"\n",
|
| 1280 |
+
"Advantages:\n",
|
| 1281 |
+
"- Smaller & faster than full BERT.\n",
|
| 1282 |
+
"- Maintains high accuracy.\n",
|
| 1283 |
+
"- Well-suited for fine-tuning on sentiment tasks.\n"
|
| 1284 |
+
],
|
| 1285 |
+
"metadata": {
|
| 1286 |
+
"id": "rMBY6W5Ln2ht"
|
| 1287 |
+
}
|
| 1288 |
+
},
|
| 1289 |
+
{
|
| 1290 |
+
"cell_type": "code",
|
| 1291 |
+
"source": [
|
| 1292 |
+
"model = DistilBertForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", num_labels=3)"
|
| 1293 |
+
],
|
| 1294 |
+
"metadata": {
|
| 1295 |
+
"colab": {
|
| 1296 |
+
"base_uri": "https://localhost:8080/"
|
| 1297 |
+
},
|
| 1298 |
+
"id": "JMhe5rDm75FZ",
|
| 1299 |
+
"outputId": "dccc9bcf-2c3d-4769-902d-834970741a96"
|
| 1300 |
+
},
|
| 1301 |
+
"execution_count": null,
|
| 1302 |
+
"outputs": [
|
| 1303 |
+
{
|
| 1304 |
+
"output_type": "stream",
|
| 1305 |
+
"name": "stderr",
|
| 1306 |
+
"text": [
|
| 1307 |
+
"Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\n",
|
| 1308 |
+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
|
| 1309 |
+
]
|
| 1310 |
+
}
|
| 1311 |
+
]
|
| 1312 |
+
},
|
| 1313 |
+
{
|
| 1314 |
+
"cell_type": "code",
|
| 1315 |
+
"source": [
|
| 1316 |
+
"device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')"
|
| 1317 |
+
],
|
| 1318 |
+
"metadata": {
|
| 1319 |
+
"id": "nsvuDkon8A-q"
|
| 1320 |
+
},
|
| 1321 |
+
"execution_count": null,
|
| 1322 |
+
"outputs": []
|
| 1323 |
+
},
|
| 1324 |
+
{
|
| 1325 |
+
"cell_type": "code",
|
| 1326 |
+
"source": [
|
| 1327 |
+
"model.to(device)"
|
| 1328 |
+
],
|
| 1329 |
+
"metadata": {
|
| 1330 |
+
"colab": {
|
| 1331 |
+
"base_uri": "https://localhost:8080/"
|
| 1332 |
+
},
|
| 1333 |
+
"id": "vdMVJppK8CYJ",
|
| 1334 |
+
"outputId": "72521ac8-611b-43a4-8fbb-4e5e4af4c69d"
|
| 1335 |
+
},
|
| 1336 |
+
"execution_count": null,
|
| 1337 |
+
"outputs": [
|
| 1338 |
+
{
|
| 1339 |
+
"output_type": "execute_result",
|
| 1340 |
+
"data": {
|
| 1341 |
+
"text/plain": [
|
| 1342 |
+
"DistilBertForSequenceClassification(\n",
|
| 1343 |
+
" (distilbert): DistilBertModel(\n",
|
| 1344 |
+
" (embeddings): Embeddings(\n",
|
| 1345 |
+
" (word_embeddings): Embedding(30522, 768, padding_idx=0)\n",
|
| 1346 |
+
" (position_embeddings): Embedding(512, 768)\n",
|
| 1347 |
+
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
|
| 1348 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
| 1349 |
+
" )\n",
|
| 1350 |
+
" (transformer): Transformer(\n",
|
| 1351 |
+
" (layer): ModuleList(\n",
|
| 1352 |
+
" (0-5): 6 x TransformerBlock(\n",
|
| 1353 |
+
" (attention): DistilBertSdpaAttention(\n",
|
| 1354 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
| 1355 |
+
" (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
|
| 1356 |
+
" (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
|
| 1357 |
+
" (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
|
| 1358 |
+
" (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
|
| 1359 |
+
" )\n",
|
| 1360 |
+
" (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
|
| 1361 |
+
" (ffn): FFN(\n",
|
| 1362 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
| 1363 |
+
" (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
|
| 1364 |
+
" (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
|
| 1365 |
+
" (activation): GELUActivation()\n",
|
| 1366 |
+
" )\n",
|
| 1367 |
+
" (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
|
| 1368 |
+
" )\n",
|
| 1369 |
+
" )\n",
|
| 1370 |
+
" )\n",
|
| 1371 |
+
" )\n",
|
| 1372 |
+
" (pre_classifier): Linear(in_features=768, out_features=768, bias=True)\n",
|
| 1373 |
+
" (classifier): Linear(in_features=768, out_features=3, bias=True)\n",
|
| 1374 |
+
" (dropout): Dropout(p=0.2, inplace=False)\n",
|
| 1375 |
+
")"
|
| 1376 |
+
]
|
| 1377 |
+
},
|
| 1378 |
+
"metadata": {},
|
| 1379 |
+
"execution_count": 27
|
| 1380 |
+
}
|
| 1381 |
+
]
|
| 1382 |
+
},
|
| 1383 |
+
{
|
| 1384 |
+
"cell_type": "markdown",
|
| 1385 |
+
"source": [
|
| 1386 |
+
"## Training Setup\n",
|
| 1387 |
+
"\n",
|
| 1388 |
+
"We define:\n",
|
| 1389 |
+
"- **Loss Function**: CrossEntropyLoss (multi-class classification).\n",
|
| 1390 |
+
"- **Optimizer**: AdamW with learning rate `1e-5` and weight decay.\n",
|
| 1391 |
+
"- **Learning Rate Scheduler**: ReduceLROnPlateau for dynamic LR adjustment.\n",
|
| 1392 |
+
"\n",
|
| 1393 |
+
"Two helper functions:\n",
|
| 1394 |
+
"- `train_first()` — for training each epoch.\n",
|
| 1395 |
+
"- `evaluate_first()` — for validation.\n"
|
| 1396 |
+
],
|
| 1397 |
+
"metadata": {
|
| 1398 |
+
"id": "WqI0ZvhxndNw"
|
| 1399 |
+
}
|
| 1400 |
+
},
|
| 1401 |
+
{
|
| 1402 |
+
"cell_type": "code",
|
| 1403 |
+
"source": [
|
| 1404 |
+
"criterion = nn.CrossEntropyLoss() # Optional: remove label_smoothing\n",
|
| 1405 |
+
"optimizer = AdamW(model.parameters(), lr=1e-5,weight_decay=1e-2)"
|
| 1406 |
+
],
|
| 1407 |
+
"metadata": {
|
| 1408 |
+
"id": "ngcSTsxW8Dix"
|
| 1409 |
+
},
|
| 1410 |
+
"execution_count": null,
|
| 1411 |
+
"outputs": []
|
| 1412 |
+
},
|
| 1413 |
+
{
|
| 1414 |
+
"cell_type": "markdown",
|
| 1415 |
+
"source": [
|
| 1416 |
+
"## Model Training\n",
|
| 1417 |
+
"\n",
|
| 1418 |
+
"We train for `15` epochs:\n",
|
| 1419 |
+
"- Save best model when validation accuracy improves.\n",
|
| 1420 |
+
"- Adjust learning rate based on validation loss.\n",
|
| 1421 |
+
"\n",
|
| 1422 |
+
"Training outputs:\n",
|
| 1423 |
+
"- Epoch-by-epoch loss & accuracy for training and validation.\n",
|
| 1424 |
+
"- Best model saved as `best_model.pth`.\n"
|
| 1425 |
+
],
|
| 1426 |
+
"metadata": {
|
| 1427 |
+
"id": "ojPlLPY1rghi"
|
| 1428 |
+
}
|
| 1429 |
+
},
|
| 1430 |
+
{
|
| 1431 |
+
"cell_type": "code",
|
| 1432 |
+
"source": [
|
| 1433 |
+
"def train_first(model, train_loader, criterion, optimizer, device):\n",
|
| 1434 |
+
" model.train()\n",
|
| 1435 |
+
" running_loss = 0.0\n",
|
| 1436 |
+
" correct = 0\n",
|
| 1437 |
+
" total = 0\n",
|
| 1438 |
+
"\n",
|
| 1439 |
+
" for batch in tqdm(train_loader, desc=\"Training\"):\n",
|
| 1440 |
+
" batch = {k: v.to(device) for k, v in batch.items()}\n",
|
| 1441 |
+
" labels = batch['labels']\n",
|
| 1442 |
+
"\n",
|
| 1443 |
+
" optimizer.zero_grad()\n",
|
| 1444 |
+
" outputs = model(**batch)\n",
|
| 1445 |
+
" loss = criterion(outputs.logits, labels)\n",
|
| 1446 |
+
" loss.backward()\n",
|
| 1447 |
+
" optimizer.step()\n",
|
| 1448 |
+
"\n",
|
| 1449 |
+
" running_loss += loss.item()\n",
|
| 1450 |
+
" _, predicted = torch.max(outputs.logits, dim=1)\n",
|
| 1451 |
+
" correct += (predicted == labels).sum().item()\n",
|
| 1452 |
+
" total += labels.size(0)\n",
|
| 1453 |
+
"\n",
|
| 1454 |
+
" avg_loss = running_loss / len(train_loader)\n",
|
| 1455 |
+
" accuracy = correct / total\n",
|
| 1456 |
+
" return avg_loss, accuracy\n"
|
| 1457 |
+
],
|
| 1458 |
+
"metadata": {
|
| 1459 |
+
"id": "UV05pMqP8GbB"
|
| 1460 |
+
},
|
| 1461 |
+
"execution_count": null,
|
| 1462 |
+
"outputs": []
|
| 1463 |
+
},
|
| 1464 |
+
{
|
| 1465 |
+
"cell_type": "code",
|
| 1466 |
+
"source": [
|
| 1467 |
+
"def evaluate_first(model, val_loader, criterion, device):\n",
|
| 1468 |
+
" model.eval()\n",
|
| 1469 |
+
" running_loss = 0.0\n",
|
| 1470 |
+
" correct = 0\n",
|
| 1471 |
+
" total = 0\n",
|
| 1472 |
+
"\n",
|
| 1473 |
+
" with torch.no_grad():\n",
|
| 1474 |
+
" for batch in tqdm(val_loader, desc=\"Evaluating\"):\n",
|
| 1475 |
+
" batch = {k: v.to(device) for k, v in batch.items()}\n",
|
| 1476 |
+
" labels = batch['labels']\n",
|
| 1477 |
+
"\n",
|
| 1478 |
+
" outputs = model(**batch)\n",
|
| 1479 |
+
" loss = criterion(outputs.logits, labels)\n",
|
| 1480 |
+
"\n",
|
| 1481 |
+
" running_loss += loss.item()\n",
|
| 1482 |
+
" _, predicted = torch.max(outputs.logits, dim=1)\n",
|
| 1483 |
+
" correct += (predicted == labels).sum().item()\n",
|
| 1484 |
+
" total += labels.size(0)\n",
|
| 1485 |
+
"\n",
|
| 1486 |
+
" avg_loss = running_loss / len(val_loader)\n",
|
| 1487 |
+
" accuracy = correct / total\n",
|
| 1488 |
+
" return avg_loss, accuracy\n"
|
| 1489 |
+
],
|
| 1490 |
+
"metadata": {
|
| 1491 |
+
"id": "tF3OyJlf8HkT"
|
| 1492 |
+
},
|
| 1493 |
+
"execution_count": null,
|
| 1494 |
+
"outputs": []
|
| 1495 |
+
},
|
| 1496 |
+
{
|
| 1497 |
+
"cell_type": "markdown",
|
| 1498 |
+
"source": [
|
| 1499 |
+
"## Results & Evaluation\n",
|
| 1500 |
+
"\n",
|
| 1501 |
+
"Metrics:\n",
|
| 1502 |
+
"- **Final Validation Accuracy**: ~81% (best 0.8159 at epoch 14; 0.8084 after the final epoch).\n",
|
| 1503 |
+
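"- **Loss Trend**: Training loss decreases steadily (from 0.58 to 0.05), but validation loss rises after the first epoch (from 0.51 to 0.83), which indicates overfitting.\n",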
|
| 1504 |
+
"- **Accuracy Trend**: Training accuracy climbs to ~0.99, while validation accuracy plateaus around 0.80-0.82 from epoch 2 onward.\n",
|
| 1505 |
+
"\n",
|
| 1506 |
+
"Observations:\n",
|
| 1507 |
+
"- Weighted sampling helped with minority class performance.\n",
|
| 1508 |
+
"- DistilBERT achieved high accuracy with limited fine-tuning.\n"
|
| 1509 |
+
],
|
| 1510 |
+
"metadata": {
|
| 1511 |
+
"id": "asrLMEmlrmuN"
|
| 1512 |
+
}
|
| 1513 |
+
},
|
| 1514 |
+
{
|
| 1515 |
+
"cell_type": "code",
|
| 1516 |
+
"source": [
|
| 1517 |
+
"from torch.optim.lr_scheduler import StepLR , ReduceLROnPlateau\n",
|
| 1518 |
+
"\n",
|
| 1519 |
+
"\n",
|
| 1520 |
+
"schedulerr = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, verbose=True)\n",
|
| 1521 |
+
"\n",
|
| 1522 |
+
"scheduler = StepLR(optimizer, step_size=5, gamma=0.5) # Halve LR every 5 epochs\n"
|
| 1523 |
+
],
|
| 1524 |
+
"metadata": {
|
| 1525 |
+
"id": "mkAQUcY28IjB"
|
| 1526 |
+
},
|
| 1527 |
+
"execution_count": null,
|
| 1528 |
+
"outputs": []
|
| 1529 |
+
},
|
| 1530 |
+
{
|
| 1531 |
+
"cell_type": "code",
|
| 1532 |
+
"source": [
|
| 1533 |
+
"train_loader = DataLoader(train_set, batch_size=16, sampler=sampler,num_workers=2,pin_memory=True)\n",
|
| 1534 |
+
"val_loader = DataLoader(test_set, batch_size=16,num_workers=2,pin_memory=True)"
|
| 1535 |
+
],
|
| 1536 |
+
"metadata": {
|
| 1537 |
+
"id": "qllaeHeq8J-E"
|
| 1538 |
+
},
|
| 1539 |
+
"execution_count": null,
|
| 1540 |
+
"outputs": []
|
| 1541 |
+
},
|
| 1542 |
+
{
|
| 1543 |
+
"cell_type": "code",
|
| 1544 |
+
"source": [
|
| 1545 |
+
"epochs = 15\n",
|
| 1546 |
+
"use_plateau = True\n",
|
| 1547 |
+
"\n",
|
| 1548 |
+
"if use_plateau:\n",
|
| 1549 |
+
" scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)\n",
|
| 1550 |
+
"else:\n",
|
| 1551 |
+
" scheduler = StepLR(optimizer, step_size=5, gamma=0.1)\n",
|
| 1552 |
+
"\n",
|
| 1553 |
+
"# Training loop\n",
|
| 1554 |
+
"best_val_acc = 0.0\n",
|
| 1555 |
+
"for epoch in range(1, epochs + 1):\n",
|
| 1556 |
+
" print(f\"\\nEpoch {epoch}/{epochs}\")\n",
|
| 1557 |
+
"\n",
|
| 1558 |
+
" # Train\n",
|
| 1559 |
+
" train_loss, train_acc = train_first(model, train_loader, criterion, optimizer, device)\n",
|
| 1560 |
+
"\n",
|
| 1561 |
+
" # Validation\n",
|
| 1562 |
+
" val_loss, val_acc = evaluate_first(model, val_loader, criterion, device)\n",
|
| 1563 |
+
"\n",
|
| 1564 |
+
" print(f\"Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f}\")\n",
|
| 1565 |
+
" print(f\"Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f}\")\n",
|
| 1566 |
+
"\n",
|
| 1567 |
+
" if val_acc > best_val_acc:\n",
|
| 1568 |
+
" best_val_acc = val_acc\n",
|
| 1569 |
+
" torch.save(model.state_dict(), \"best_model.pth\")\n",
|
| 1570 |
+
" print(\"✅ Saved Best Model\")\n",
|
| 1571 |
+
"\n",
|
| 1572 |
+
" if use_plateau:\n",
|
| 1573 |
+
" scheduler.step(val_loss)\n",
|
| 1574 |
+
" else:\n",
|
| 1575 |
+
" scheduler.step()"
|
| 1576 |
+
],
|
| 1577 |
+
"metadata": {
|
| 1578 |
+
"colab": {
|
| 1579 |
+
"base_uri": "https://localhost:8080/"
|
| 1580 |
+
},
|
| 1581 |
+
"id": "Qi59PjNX8LD1",
|
| 1582 |
+
"outputId": "749c9f73-80d8-46da-f43f-a8b2b6bbd96c"
|
| 1583 |
+
},
|
| 1584 |
+
"execution_count": null,
|
| 1585 |
+
"outputs": [
|
| 1586 |
+
{
|
| 1587 |
+
"output_type": "stream",
|
| 1588 |
+
"name": "stdout",
|
| 1589 |
+
"text": [
|
| 1590 |
+
"\n",
|
| 1591 |
+
"Epoch 1/15\n"
|
| 1592 |
+
]
|
| 1593 |
+
},
|
| 1594 |
+
{
|
| 1595 |
+
"output_type": "stream",
|
| 1596 |
+
"name": "stderr",
|
| 1597 |
+
"text": [
|
| 1598 |
+
"Training: 100%|ββββββββββ| 894/894 [02:39<00:00, 5.60it/s]\n",
|
| 1599 |
+
"Evaluating: 100%|ββββββββββ| 224/224 [00:13<00:00, 16.31it/s]\n"
|
| 1600 |
+
]
|
| 1601 |
+
},
|
| 1602 |
+
{
|
| 1603 |
+
"output_type": "stream",
|
| 1604 |
+
"name": "stdout",
|
| 1605 |
+
"text": [
|
| 1606 |
+
"Train Loss: 0.5785, Acc: 0.7690\n",
|
| 1607 |
+
"Val Loss: 0.5143, Acc: 0.7961\n",
|
| 1608 |
+
"✅ Saved Best Model\n",
|
| 1609 |
+
"\n",
|
| 1610 |
+
"Epoch 2/15\n"
|
| 1611 |
+
]
|
| 1612 |
+
},
|
| 1613 |
+
{
|
| 1614 |
+
"output_type": "stream",
|
| 1615 |
+
"name": "stderr",
|
| 1616 |
+
"text": [
|
| 1617 |
+
"Training: 100%|ββββββββββ| 894/894 [02:38<00:00, 5.64it/s]\n",
|
| 1618 |
+
"Evaluating: 100%|ββββββββββ| 224/224 [00:13<00:00, 16.31it/s]\n"
|
| 1619 |
+
]
|
| 1620 |
+
},
|
| 1621 |
+
{
|
| 1622 |
+
"output_type": "stream",
|
| 1623 |
+
"name": "stdout",
|
| 1624 |
+
"text": [
|
| 1625 |
+
"Train Loss: 0.3665, Acc: 0.8678\n",
|
| 1626 |
+
"Val Loss: 0.5318, Acc: 0.8031\n",
|
| 1627 |
+
"✅ Saved Best Model\n",
|
| 1628 |
+
"\n",
|
| 1629 |
+
"Epoch 3/15\n"
|
| 1630 |
+
]
|
| 1631 |
+
},
|
| 1632 |
+
{
|
| 1633 |
+
"output_type": "stream",
|
| 1634 |
+
"name": "stderr",
|
| 1635 |
+
"text": [
|
| 1636 |
+
"Training: 100%|ββββββββββ| 894/894 [02:38<00:00, 5.64it/s]\n",
|
| 1637 |
+
"Evaluating: 100%|ββββββββββ| 224/224 [00:13<00:00, 16.31it/s]\n"
|
| 1638 |
+
]
|
| 1639 |
+
},
|
| 1640 |
+
{
|
| 1641 |
+
"output_type": "stream",
|
| 1642 |
+
"name": "stdout",
|
| 1643 |
+
"text": [
|
| 1644 |
+
"Train Loss: 0.2711, Acc: 0.9075\n",
|
| 1645 |
+
"Val Loss: 0.5607, Acc: 0.8115\n",
|
| 1646 |
+
"✅ Saved Best Model\n",
|
| 1647 |
+
"\n",
|
| 1648 |
+
"Epoch 4/15\n"
|
| 1649 |
+
]
|
| 1650 |
+
},
|
| 1651 |
+
{
|
| 1652 |
+
"output_type": "stream",
|
| 1653 |
+
"name": "stderr",
|
| 1654 |
+
"text": [
|
| 1655 |
+
"Training: 100%|ββββββββββ| 894/894 [02:38<00:00, 5.62it/s]\n",
|
| 1656 |
+
"Evaluating: 100%|ββββββββββ| 224/224 [00:13<00:00, 16.39it/s]\n"
|
| 1657 |
+
]
|
| 1658 |
+
},
|
| 1659 |
+
{
|
| 1660 |
+
"output_type": "stream",
|
| 1661 |
+
"name": "stdout",
|
| 1662 |
+
"text": [
|
| 1663 |
+
"Train Loss: 0.2106, Acc: 0.9288\n",
|
| 1664 |
+
"Val Loss: 0.5954, Acc: 0.7997\n",
|
| 1665 |
+
"\n",
|
| 1666 |
+
"Epoch 5/15\n"
|
| 1667 |
+
]
|
| 1668 |
+
},
|
| 1669 |
+
{
|
| 1670 |
+
"output_type": "stream",
|
| 1671 |
+
"name": "stderr",
|
| 1672 |
+
"text": [
|
| 1673 |
+
"Training: 100%|ββββββββββ| 894/894 [02:38<00:00, 5.63it/s]\n",
|
| 1674 |
+
"Evaluating: 100%|ββββββββββ| 224/224 [00:13<00:00, 16.32it/s]\n"
|
| 1675 |
+
]
|
| 1676 |
+
},
|
| 1677 |
+
{
|
| 1678 |
+
"output_type": "stream",
|
| 1679 |
+
"name": "stdout",
|
| 1680 |
+
"text": [
|
| 1681 |
+
"Train Loss: 0.1497, Acc: 0.9494\n",
|
| 1682 |
+
"Val Loss: 0.6418, Acc: 0.8048\n",
|
| 1683 |
+
"\n",
|
| 1684 |
+
"Epoch 6/15\n"
|
| 1685 |
+
]
|
| 1686 |
+
},
|
| 1687 |
+
{
|
| 1688 |
+
"output_type": "stream",
|
| 1689 |
+
"name": "stderr",
|
| 1690 |
+
"text": [
|
| 1691 |
+
"Training: 100%|ββββββββββ| 894/894 [02:38<00:00, 5.64it/s]\n",
|
| 1692 |
+
"Evaluating: 100%|ββββββββββ| 224/224 [00:13<00:00, 16.34it/s]\n"
|
| 1693 |
+
]
|
| 1694 |
+
},
|
| 1695 |
+
{
|
| 1696 |
+
"output_type": "stream",
|
| 1697 |
+
"name": "stdout",
|
| 1698 |
+
"text": [
|
| 1699 |
+
"Train Loss: 0.1295, Acc: 0.9592\n",
|
| 1700 |
+
"Val Loss: 0.6598, Acc: 0.8120\n",
|
| 1701 |
+
"✅ Saved Best Model\n",
|
| 1702 |
+
"\n",
|
| 1703 |
+
"Epoch 7/15\n"
|
| 1704 |
+
]
|
| 1705 |
+
},
|
| 1706 |
+
{
|
| 1707 |
+
"output_type": "stream",
|
| 1708 |
+
"name": "stderr",
|
| 1709 |
+
"text": [
|
| 1710 |
+
"Training: 100%|ββββββββββ| 894/894 [02:38<00:00, 5.64it/s]\n",
|
| 1711 |
+
"Evaluating: 100%|ββββββββββ| 224/224 [00:13<00:00, 16.32it/s]\n"
|
| 1712 |
+
]
|
| 1713 |
+
},
|
| 1714 |
+
{
|
| 1715 |
+
"output_type": "stream",
|
| 1716 |
+
"name": "stdout",
|
| 1717 |
+
"text": [
|
| 1718 |
+
"Train Loss: 0.1038, Acc: 0.9684\n",
|
| 1719 |
+
"Val Loss: 0.6980, Acc: 0.8157\n",
|
| 1720 |
+
"✅ Saved Best Model\n",
|
| 1721 |
+
"\n",
|
| 1722 |
+
"Epoch 8/15\n"
|
| 1723 |
+
]
|
| 1724 |
+
},
|
| 1725 |
+
{
|
| 1726 |
+
"output_type": "stream",
|
| 1727 |
+
"name": "stderr",
|
| 1728 |
+
"text": [
|
| 1729 |
+
"Training: 100%|ββββββββββ| 894/894 [02:38<00:00, 5.64it/s]\n",
|
| 1730 |
+
"Evaluating: 100%|ββββββββββ| 224/224 [00:13<00:00, 16.34it/s]\n"
|
| 1731 |
+
]
|
| 1732 |
+
},
|
| 1733 |
+
{
|
| 1734 |
+
"output_type": "stream",
|
| 1735 |
+
"name": "stdout",
|
| 1736 |
+
"text": [
|
| 1737 |
+
"Train Loss: 0.0792, Acc: 0.9773\n",
|
| 1738 |
+
"Val Loss: 0.7259, Acc: 0.8154\n",
|
| 1739 |
+
"\n",
|
| 1740 |
+
"Epoch 9/15\n"
|
| 1741 |
+
]
|
| 1742 |
+
},
|
| 1743 |
+
{
|
| 1744 |
+
"output_type": "stream",
|
| 1745 |
+
"name": "stderr",
|
| 1746 |
+
"text": [
|
| 1747 |
+
"Training: 100%|ββββββββββ| 894/894 [02:38<00:00, 5.65it/s]\n",
|
| 1748 |
+
"Evaluating: 100%|ββββββββββ| 224/224 [00:13<00:00, 16.33it/s]\n"
|
| 1749 |
+
]
|
| 1750 |
+
},
|
| 1751 |
+
{
|
| 1752 |
+
"output_type": "stream",
|
| 1753 |
+
"name": "stdout",
|
| 1754 |
+
"text": [
|
| 1755 |
+
"Train Loss: 0.0666, Acc: 0.9806\n",
|
| 1756 |
+
"Val Loss: 0.7561, Acc: 0.8148\n",
|
| 1757 |
+
"\n",
|
| 1758 |
+
"Epoch 10/15\n"
|
| 1759 |
+
]
|
| 1760 |
+
},
|
| 1761 |
+
{
|
| 1762 |
+
"output_type": "stream",
|
| 1763 |
+
"name": "stderr",
|
| 1764 |
+
"text": [
|
| 1765 |
+
"Training: 100%|ββββββββββ| 894/894 [02:38<00:00, 5.64it/s]\n",
|
| 1766 |
+
"Evaluating: 100%|ββββββββββ| 224/224 [00:13<00:00, 16.37it/s]\n"
|
| 1767 |
+
]
|
| 1768 |
+
},
|
| 1769 |
+
{
|
| 1770 |
+
"output_type": "stream",
|
| 1771 |
+
"name": "stdout",
|
| 1772 |
+
"text": [
|
| 1773 |
+
"Train Loss: 0.0633, Acc: 0.9821\n",
|
| 1774 |
+
"Val Loss: 0.7873, Acc: 0.8126\n",
|
| 1775 |
+
"\n",
|
| 1776 |
+
"Epoch 11/15\n"
|
| 1777 |
+
]
|
| 1778 |
+
},
|
| 1779 |
+
{
|
| 1780 |
+
"output_type": "stream",
|
| 1781 |
+
"name": "stderr",
|
| 1782 |
+
"text": [
|
| 1783 |
+
"Training: 100%|ββββββββββ| 894/894 [02:38<00:00, 5.64it/s]\n",
|
| 1784 |
+
"Evaluating: 100%|ββββββββββ| 224/224 [00:13<00:00, 16.36it/s]\n"
|
| 1785 |
+
]
|
| 1786 |
+
},
|
| 1787 |
+
{
|
| 1788 |
+
"output_type": "stream",
|
| 1789 |
+
"name": "stdout",
|
| 1790 |
+
"text": [
|
| 1791 |
+
"Train Loss: 0.0479, Acc: 0.9862\n",
|
| 1792 |
+
"Val Loss: 0.8020, Acc: 0.8148\n",
|
| 1793 |
+
"\n",
|
| 1794 |
+
"Epoch 12/15\n"
|
| 1795 |
+
]
|
| 1796 |
+
},
|
| 1797 |
+
{
|
| 1798 |
+
"output_type": "stream",
|
| 1799 |
+
"name": "stderr",
|
| 1800 |
+
"text": [
|
| 1801 |
+
"Training: 100%|ββββββββββ| 894/894 [02:38<00:00, 5.64it/s]\n",
|
| 1802 |
+
"Evaluating: 100%|ββββββββββ| 224/224 [00:13<00:00, 16.35it/s]\n"
|
| 1803 |
+
]
|
| 1804 |
+
},
|
| 1805 |
+
{
|
| 1806 |
+
"output_type": "stream",
|
| 1807 |
+
"name": "stdout",
|
| 1808 |
+
"text": [
|
| 1809 |
+
"Train Loss: 0.0508, Acc: 0.9862\n",
|
| 1810 |
+
"Val Loss: 0.8170, Acc: 0.8095\n",
|
| 1811 |
+
"\n",
|
| 1812 |
+
"Epoch 13/15\n"
|
| 1813 |
+
]
|
| 1814 |
+
},
|
| 1815 |
+
{
|
| 1816 |
+
"output_type": "stream",
|
| 1817 |
+
"name": "stderr",
|
| 1818 |
+
"text": [
|
| 1819 |
+
"Training: 100%|ββββββββββ| 894/894 [02:38<00:00, 5.64it/s]\n",
|
| 1820 |
+
"Evaluating: 100%|ββββββββββ| 224/224 [00:13<00:00, 16.29it/s]\n"
|
| 1821 |
+
]
|
| 1822 |
+
},
|
| 1823 |
+
{
|
| 1824 |
+
"output_type": "stream",
|
| 1825 |
+
"name": "stdout",
|
| 1826 |
+
"text": [
|
| 1827 |
+
"Train Loss: 0.0478, Acc: 0.9875\n",
|
| 1828 |
+
"Val Loss: 0.8214, Acc: 0.8087\n",
|
| 1829 |
+
"\n",
|
| 1830 |
+
"Epoch 14/15\n"
|
| 1831 |
+
]
|
| 1832 |
+
},
|
| 1833 |
+
{
|
| 1834 |
+
"output_type": "stream",
|
| 1835 |
+
"name": "stderr",
|
| 1836 |
+
"text": [
|
| 1837 |
+
"Training: 100%|ββββββββββ| 894/894 [02:38<00:00, 5.63it/s]\n",
|
| 1838 |
+
"Evaluating: 100%|ββββββββββ| 224/224 [00:13<00:00, 16.36it/s]\n"
|
| 1839 |
+
]
|
| 1840 |
+
},
|
| 1841 |
+
{
|
| 1842 |
+
"output_type": "stream",
|
| 1843 |
+
"name": "stdout",
|
| 1844 |
+
"text": [
|
| 1845 |
+
"Train Loss: 0.0440, Acc: 0.9877\n",
|
| 1846 |
+
"Val Loss: 0.8259, Acc: 0.8159\n",
|
| 1847 |
+
"✅ Saved Best Model\n",
|
| 1848 |
+
"\n",
|
| 1849 |
+
"Epoch 15/15\n"
|
| 1850 |
+
]
|
| 1851 |
+
},
|
| 1852 |
+
{
|
| 1853 |
+
"output_type": "stream",
|
| 1854 |
+
"name": "stderr",
|
| 1855 |
+
"text": [
|
| 1856 |
+
"Training: 100%|ββββββββββ| 894/894 [02:38<00:00, 5.63it/s]\n",
|
| 1857 |
+
"Evaluating: 100%|ββββββββββ| 224/224 [00:13<00:00, 16.34it/s]"
|
| 1858 |
+
]
|
| 1859 |
+
},
|
| 1860 |
+
{
|
| 1861 |
+
"output_type": "stream",
|
| 1862 |
+
"name": "stdout",
|
| 1863 |
+
"text": [
|
| 1864 |
+
"Train Loss: 0.0455, Acc: 0.9869\n",
|
| 1865 |
+
"Val Loss: 0.8297, Acc: 0.8084\n"
|
| 1866 |
+
]
|
| 1867 |
+
},
|
| 1868 |
+
{
|
| 1869 |
+
"output_type": "stream",
|
| 1870 |
+
"name": "stderr",
|
| 1871 |
+
"text": [
|
| 1872 |
+
"\n"
|
| 1873 |
+
]
|
| 1874 |
+
}
|
| 1875 |
+
]
|
| 1876 |
+
},
|
| 1877 |
+
{
|
| 1878 |
+
"cell_type": "markdown",
|
| 1879 |
+
"source": [
|
| 1880 |
+
"## Next Steps\n",
|
| 1881 |
+
"\n",
|
| 1882 |
+
"To further improve the model:\n",
|
| 1883 |
+
"1. Add **early stopping** to prevent overfitting.\n",
|
| 1884 |
+
"2. Use **hyperparameter tuning** for optimal learning rate & batch size.\n",
|
| 1885 |
+
"3. Try **data augmentation** (synonym replacement, back translation).\n",
|
| 1886 |
+
"4. Deploy the model as an API for real-time sentiment analysis.\n"
|
| 1887 |
+
],
|
| 1888 |
+
"metadata": {
|
| 1889 |
+
"id": "vk6UrOI4rqdu"
|
| 1890 |
+
}
|
| 1891 |
+
},
|
| 1892 |
+
{
|
| 1893 |
+
"cell_type": "code",
|
| 1894 |
+
"source": [],
|
| 1895 |
+
"metadata": {
|
| 1896 |
+
"id": "-hhveI5c8NQR"
|
| 1897 |
+
},
|
| 1898 |
+
"execution_count": null,
|
| 1899 |
+
"outputs": []
|
| 1900 |
+
}
|
| 1901 |
+
],
|
| 1902 |
+
"metadata": {
|
| 1903 |
+
"kernelspec": {
|
| 1904 |
+
"display_name": "Python 3",
|
| 1905 |
+
"name": "python3"
|
| 1906 |
+
},
|
| 1907 |
+
"language_info": {
|
| 1908 |
+
"pygments_lexer": "ipython3",
|
| 1909 |
+
"nbconvert_exporter": "python",
|
| 1910 |
+
"version": "3.6.4",
|
| 1911 |
+
"file_extension": ".py",
|
| 1912 |
+
"codemirror_mode": {
|
| 1913 |
+
"name": "ipython",
|
| 1914 |
+
"version": 3
|
| 1915 |
+
},
|
| 1916 |
+
"name": "python",
|
| 1917 |
+
"mimetype": "text/x-python"
|
| 1918 |
+
},
|
| 1919 |
+
"colab": {
|
| 1920 |
+
"provenance": [],
|
| 1921 |
+
"gpuType": "T4"
|
| 1922 |
+
},
|
| 1923 |
+
"accelerator": "GPU"
|
| 1924 |
+
},
|
| 1925 |
+
"nbformat": 4,
|
| 1926 |
+
"nbformat_minor": 0
|
| 1927 |
+
}
|