{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "id": "dP6l8vOUtV0J" }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "EDetmMNGuXY9" }, "outputs": [], "source": [ "df = pd.read_csv(\"/content/training.1600000.processed.noemoticon.csv\", encoding='latin-1')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "yDJXNYjqwYo2", "outputId": "b56cb388-71f6-4d0e-c651-0201854cb952" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY \\\n", "0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY \n", "1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY \n", "2 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n", "3 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n", "4 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY \n", "... .. ... ... ... \n", "1599994 4 2193601966 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY \n", "1599995 4 2193601969 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY \n", "1599996 4 2193601991 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY \n", "1599997 4 2193602064 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY \n", "1599998 4 2193602129 Tue Jun 16 08:40:50 PDT 2009 NO_QUERY \n", "\n", " _TheSpecialOne_ \\\n", "0 scotthamilton \n", "1 mattycus \n", "2 ElleCTF \n", "3 Karoli \n", "4 joy_wolf \n", "... ... \n", "1599994 AmandaMarie1028 \n", "1599995 TheWDBoards \n", "1599996 bpbabe \n", "1599997 tinydiamondz \n", "1599998 RyanTrevMorris \n", "\n", " @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D \n", "0 is upset that he can't update his Facebook by ... \n", "1 @Kenichan I dived many times for the ball. Man... \n", "2 my whole body feels itchy and like its on fire \n", "3 @nationwideclass no, it's not behaving at all.... \n", "4 @Kwesidei not the whole crew \n", "... ... \n", "1599994 Just woke up. Having no school is the best fee... \n", "1599995 TheWDB.com - Very cool to hear old Walt interv... \n", "1599996 Are you ready for your MoJo Makeover? Ask me f... \n", "1599997 Happy 38th Birthday to my boo of alll time!!! ... \n", "1599998 happy #charitytuesday @theNSPCC @SparksCharity... \n", "\n", "[1599999 rows x 6 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01467810369Mon Apr 06 22:19:45 PDT 2009NO_QUERY_TheSpecialOne_@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D
001467810672Mon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...
101467810917Mon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...
201467811184Mon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFmy whole body feels itchy and like its on fire
301467811193Mon Apr 06 22:19:57 PDT 2009NO_QUERYKaroli@nationwideclass no, it's not behaving at all....
401467811372Mon Apr 06 22:20:00 PDT 2009NO_QUERYjoy_wolf@Kwesidei not the whole crew
.....................
159999442193601966Tue Jun 16 08:40:49 PDT 2009NO_QUERYAmandaMarie1028Just woke up. Having no school is the best fee...
159999542193601969Tue Jun 16 08:40:49 PDT 2009NO_QUERYTheWDBoardsTheWDB.com - Very cool to hear old Walt interv...
159999642193601991Tue Jun 16 08:40:49 PDT 2009NO_QUERYbpbabeAre you ready for your MoJo Makeover? Ask me f...
159999742193602064Tue Jun 16 08:40:49 PDT 2009NO_QUERYtinydiamondzHappy 38th Birthday to my boo of alll time!!! ...
159999842193602129Tue Jun 16 08:40:50 PDT 2009NO_QUERYRyanTrevMorrishappy #charitytuesday @theNSPCC @SparksCharity...
\n", "

1599999 rows × 6 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df" } }, "metadata": {}, "execution_count": 3 } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 290 }, "id": "kZqf-vIww7VB", "outputId": "e1337690-1fc2-4e84-c800-e7e588a925fd" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0 0\n", "1467810369 0\n", "Mon Apr 06 22:19:45 PDT 2009 0\n", "NO_QUERY 0\n", "_TheSpecialOne_ 0\n", "@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D 0\n", "dtype: int64" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
00
14678103690
Mon Apr 06 22:19:45 PDT 20090
NO_QUERY0
_TheSpecialOne_0
@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D0
\n", "

" ] }, "metadata": {}, "execution_count": 4 } ], "source": [ "df.isna().sum()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 490 }, "id": "6yNPDs81xDEt", "outputId": "26edc6a0-3338-4835-d87a-2f94db46b618" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "1467810369\n", "1563681287 2\n", "2062516845 2\n", "1551586713 2\n", "1676311044 2\n", "1791602739 2\n", " ..\n", "2197310899 1\n", "2197310477 1\n", "2197310452 1\n", "2197310381 1\n", "2197311865 1\n", "Name: count, Length: 1598314, dtype: int64" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count
1467810369
15636812872
20625168452
15515867132
16763110442
17916027392
......
21973108991
21973104771
21973104521
21973103811
21973118651
\n", "

1598314 rows × 1 columns

\n", "

" ] }, "metadata": {}, "execution_count": 5 } ], "source": [ "df['1467810369'].value_counts()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 147 }, "id": "qSDrw9sHxS2n", "outputId": "81bad2a3-5bab-440f-9fb8-7a87d99c5e92" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "NO_QUERY\n", "NO_QUERY 1599999\n", "Name: count, dtype: int64" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count
NO_QUERY
NO_QUERY1599999
\n", "

" ] }, "metadata": {}, "execution_count": 6 } ], "source": [ "df['NO_QUERY'].value_counts()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "3Ix-o-IJxW-B" }, "outputs": [], "source": [ "df.drop('NO_QUERY',axis=1,inplace=True)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IjITKGzlxhEv", "outputId": "96f77f50-72aa-470b-9675-3b0cf66a8352" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "0 int64\n", "1467810369 int64\n", "Mon Apr 06 22:19:45 PDT 2009 object\n", "_TheSpecialOne_ object\n", "@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D object\n", "dtype: object\n" ] } ], "source": [ "df = df.sample(frac=0.1, random_state=42) # %10 veriyi al\n", "print(df.dtypes)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Nx3sG-x313SH", "outputId": "c2577bc1-6b84-4e13-9fc1-8f5f31578160" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "Index: 16000 entries, 1497139 to 1284035\n", "Data columns (total 5 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 0 16000 non-null int64 \n", " 1 1467810369 16000 non-null int64 \n", " 2 Mon Apr 06 22:19:45 PDT 2009 16000 non-null object\n", " 3 _TheSpecialOne_ 16000 non-null object\n", " 4 @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D 16000 non-null object\n", "dtypes: int64(2), object(3)\n", "memory usage: 750.0+ KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 300 }, "id": "6dtZVvA515sK", "outputId": "9a7af184-7a43-48ee-f453-d4eb224c6dde" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " 0 1467810369\n", "count 16000.000000 1.600000e+04\n", "mean 2.023250 1.997504e+09\n", "std 1.999927 1.928363e+08\n", "min 0.000000 1.467834e+09\n", "25% 0.000000 1.956811e+09\n", "50% 4.000000 2.001829e+09\n", "75% 4.000000 2.176166e+09\n", "max 4.000000 2.329169e+09" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01467810369
count16000.0000001.600000e+04
mean2.0232501.997504e+09
std1.9999271.928363e+08
min0.0000001.467834e+09
25%0.0000001.956811e+09
50%4.0000002.001829e+09
75%4.0000002.176166e+09
max4.0000002.329169e+09
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"df\",\n \"rows\": 8,\n \"fields\": [\n {\n \"column\": \"0\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5656.045202020064,\n \"min\": 0.0,\n \"max\": 16000.0,\n \"num_unique_values\": 5,\n \"samples\": [\n 2.02325,\n 4.0,\n 1.9999273535150421\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"1467810369\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 911085603.3007622,\n \"min\": 16000.0,\n \"max\": 2329168959.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 1997503968.4805624,\n 2001828620.0,\n 16000.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 11 } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SkWc-Myp2am4", "outputId": "bf370764-f04f-4501-e048-a7b5fc59977b" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Count of IDs1467810369\n", "1979685249 1\n", "2047063573 1\n", "2208198047 1\n", "1996357873 1\n", "1676456732 1\n", " ..\n", "2265368249 1\n", "1679452821 1\n", "2217064177 1\n", "2045691294 1\n", "2070290547 1\n", "Name: count, Length: 16000, dtype: int64, count of usernames_TheSpecialOne_\n", "Jayme1988 7\n", "wowlew 7\n", "ShesElectric_ 5\n", "jbfanforever94 4\n", "maynaseric 4\n", " ..\n", "Stephi90 1\n", "ericzueff 1\n", "munoza13 1\n", "MissSTARcey 1\n", "Tmama21 1\n", "Name: count, Length: 15295, dtype: int64\n" ] } ], "source": [ "print(f'Count of IDs{df[\"1467810369\"].value_counts()}, count of usernames{df[\"_TheSpecialOne_\"].value_counts()}')" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "id": "0PAvL2M_3vzw" }, "outputs": [], "source": [ "from nltk.corpus import stopwords" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "id": "MpHUYWR-CKrs" }, "outputs": [], "source": [ "import nltk" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0kcFyu7hCUdy", "outputId": "5e73979f-761d-4c9d-a5ff-4a48f501aa4e" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, "metadata": {}, "execution_count": 15 } ], "source": [ "nltk.download(\"stopwords\")" ] }, { "cell_type": "code", "source": [ "df = df.drop(columns=['1467810369'])" ], "metadata": { "id": "uql-EelvCFrN" }, "execution_count": 16, "outputs": [] }, { "cell_type": "code", "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 632 }, "id": "Ur2vB4KWCrZL", "outputId": "7ff638d7-3cba-410c-b5ac-d479e70d362f" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " 0 Mon Apr 06 22:19:45 PDT 2009 _TheSpecialOne_ \\\n", "1497139 4 Sun Jun 07 17:26:02 PDT 2009 Jill_Osi \n", "1350782 4 Fri Jun 05 11:05:43 PDT 2009 vmramos \n", "589668 0 Wed Jun 17 20:20:31 PDT 2009 icorganics \n", "52428 0 Sat May 02 09:23:27 PDT 2009 mundah \n", "738013 0 Sun Jun 21 06:25:38 PDT 2009 skdev \n", "... .. ... ... \n", "1438815 4 Sat Jun 06 20:47:03 PDT 2009 HalfassBackward \n", "298459 0 Mon Jun 01 17:16:49 PDT 2009 half_Milkman \n", "961996 4 Sun May 17 10:12:06 PDT 2009 katizzle \n", "1519397 4 Mon Jun 15 02:49:50 PDT 2009 michieong \n", "1284035 4 Tue Jun 02 02:49:00 PDT 2009 itsJohno \n", "\n", " tweet_text \n", "1497139 Can't wait to be at glacier national park \n", "1350782 @virtualhispanic Falling apart, ha? Maybe you ... \n", "589668 Oh my gosh, there is a Mom Entrepreneur of the... \n", "52428 what shall i drink during the game?: Stella, M... \n", "738013 @awaisnaseer @blessedAyesha ki LCD kharab hay \n", "... ... \n", "1438815 @bjolena Glad to hear your day went well Mine... \n", "298459 @continuity_plus Lol...or responded to the req... \n", "961996 manicure done: today -> black nails \n", "1519397 okay, decided to listen to David Archuleta, no... \n", "1284035 woah, everything looks amazing. i cant bloody ... \n", "\n", "[16000 rows x 4 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0Mon Apr 06 22:19:45 PDT 2009_TheSpecialOne_tweet_text
14971394Sun Jun 07 17:26:02 PDT 2009Jill_OsiCan't wait to be at glacier national park
13507824Fri Jun 05 11:05:43 PDT 2009vmramos@virtualhispanic Falling apart, ha? Maybe you ...
5896680Wed Jun 17 20:20:31 PDT 2009icorganicsOh my gosh, there is a Mom Entrepreneur of the...
524280Sat May 02 09:23:27 PDT 2009mundahwhat shall i drink during the game?: Stella, M...
7380130Sun Jun 21 06:25:38 PDT 2009skdev@awaisnaseer @blessedAyesha ki LCD kharab hay
...............
14388154Sat Jun 06 20:47:03 PDT 2009HalfassBackward@bjolena Glad to hear your day went well Mine...
2984590Mon Jun 01 17:16:49 PDT 2009half_Milkman@continuity_plus Lol...or responded to the req...
9619964Sun May 17 10:12:06 PDT 2009katizzlemanicure done: today -> black nails
15193974Mon Jun 15 02:49:50 PDT 2009michieongokay, decided to listen to David Archuleta, no...
12840354Tue Jun 02 02:49:00 PDT 2009itsJohnowoah, everything looks amazing. i cant bloody ...
\n", "

16000 rows × 4 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "repr_error": "0" } }, "metadata": {}, "execution_count": 17 } ], "source": [ "new_column_name = \"tweet_text\"\n", "df.rename(columns={df.columns[3]: new_column_name}, inplace=True)\n", "df" ] }, { "cell_type": "code", "source": [ "def optimize_memory(df):\n", " for col in df.select_dtypes(include=['int64', 'float64']).columns:\n", " df[col] = pd.to_numeric(df[col], downcast='integer' if df[col].dtype == 'int64' else 'float')\n", " return df\n", "\n", "df = optimize_memory(df)\n" ], "metadata": { "id": "Ogt2OoXxYqvR" }, "execution_count": 18, "outputs": [] }, { "cell_type": "code", "source": [ "for col in df.select_dtypes(include=['object']).columns:\n", " df[col] = df[col].astype('category')\n" ], "metadata": { "id": "RwQUgoLNYuVr" }, "execution_count": 19, "outputs": [] }, { "cell_type": "code", "source": [ "from sklearn.utils import shuffle\n", "\n", "batch_size = 100000 # Her seferde 100 bin satır işle\n", "df = shuffle(df, random_state=42) # Verileri karıştır\n", "\n", "for i in range(0, len(df), batch_size):\n", " batch = df.iloc[i:i + batch_size]\n", " # Burada modeli eğit ve belleği temizle\n" ], "metadata": { "id": "5otlQQxlVuLU" }, "execution_count": 20, "outputs": [] }, { "cell_type": "code", "source": [ "df" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 632 }, "id": "7xTOy6i1b9oO", "outputId": "dd96255e-9e20-4d6f-8afa-239f2b8b7d29" }, "execution_count": 24, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " 0 Mon Apr 06 22:19:45 PDT 2009 _TheSpecialOne_ \\\n", "1405892 4 Sat Jun 06 08:59:56 PDT 2009 judomary \n", "841227 4 Sun Apr 19 12:36:21 PDT 2009 KrystleMiller \n", "1542496 4 Mon Jun 15 11:20:14 PDT 2009 tweeteradder7 \n", "705200 0 Sat Jun 20 12:32:46 PDT 2009 ClauBand \n", "1240299 4 Mon Jun 01 10:52:34 PDT 2009 beccuhboo \n", "... .. ... ... \n", "34505 0 Mon Apr 20 05:07:13 PDT 2009 marawigirl \n", "1569381 4 Mon Jun 15 21:49:18 PDT 2009 xkayjay \n", "1082714 4 Fri May 29 22:07:41 PDT 2009 justelle \n", "1203085 4 Sun May 31 18:08:19 PDT 2009 ninja24 \n", "20120 0 Sun Apr 19 01:28:19 PDT 2009 metafiktion \n", "\n", " tweet_text \n", "1405892 @johnnystimson Still? Wow. Your cheese must ha... \n", "841227 @ciara_danella you're supposed to write it on ... \n", "1542496 @vanphotolens Get 100 followers a day using ww... \n", "705200 @ThiOliveiras To no ar Thi!!! \n", "1240299 its cold out today . i love it \n", "... ... \n", "34505 hates being a girl one day every month. red fl... \n", "1569381 10 minutes to midnight! going to get my jonas... \n", "1082714 @krystyl deal girlie! It has been forever! Tom... \n", "1203085 So can't be all sad about stuff time to move o... \n", "20120 You don't quite get the same sense of satisfac... \n", "\n", "[16000 rows x 4 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0Mon Apr 06 22:19:45 PDT 2009_TheSpecialOne_tweet_text
14058924Sat Jun 06 08:59:56 PDT 2009judomary@johnnystimson Still? Wow. Your cheese must ha...
8412274Sun Apr 19 12:36:21 PDT 2009KrystleMiller@ciara_danella you're supposed to write it on ...
15424964Mon Jun 15 11:20:14 PDT 2009tweeteradder7@vanphotolens Get 100 followers a day using ww...
7052000Sat Jun 20 12:32:46 PDT 2009ClauBand@ThiOliveiras To no ar Thi!!!
12402994Mon Jun 01 10:52:34 PDT 2009beccuhbooits cold out today . i love it
...............
345050Mon Apr 20 05:07:13 PDT 2009marawigirlhates being a girl one day every month. red fl...
15693814Mon Jun 15 21:49:18 PDT 2009xkayjay10 minutes to midnight! going to get my jonas...
10827144Fri May 29 22:07:41 PDT 2009justelle@krystyl deal girlie! It has been forever! Tom...
12030854Sun May 31 18:08:19 PDT 2009ninja24So can't be all sad about stuff time to move o...
201200Sun Apr 19 01:28:19 PDT 2009metafiktionYou don't quite get the same sense of satisfac...
\n", "

16000 rows × 4 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 16000,\n \"fields\": [\n {\n \"column\": \"0\",\n \"properties\": {\n \"dtype\": \"int8\",\n \"num_unique_values\": 2,\n \"samples\": [\n 0,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Mon Apr 06 22:19:45 PDT 2009\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 15821,\n \"samples\": [\n \"Fri May 29 17:37:13 PDT 2009\",\n \"Tue Jun 02 07:27:45 PDT 2009\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"_TheSpecialOne_\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 15295,\n \"samples\": [\n \"brittuhhnay\",\n \"niamhscullionb\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tweet_text\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 15977,\n \"samples\": [\n \"I iz lonely and wanting someone to cuddle with. \",\n \"@lasercosmetica What a Blissful time at Wet today! \"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 24 } ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "id": "_JuAy-coCXKH" }, "outputs": [], "source": [ "sample_text = df['tweet_text'].iloc[2] # Use iloc to access by position" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 53 }, "id": "A_2fzixNC8Z9", "outputId": "556b7aab-2e8a-4c00-c24a-d6ac46c5872f" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'@vanphotolens Get 100 followers a day using www.tweeterfollow.com Once you add everyone you are on the train or pay vip '" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 27 } ], "source": [ "sample_text" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "id": "Hy3uO6kHDAEQ" }, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "sample_text = BeautifulSoup(sample_text).get_text()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 53 }, "id": "jcJIBe-QDUba", "outputId": "ce0da960-b572-459a-8513-acd25e1aaaba" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'@vanphotolens Get 100 followers a day using www.tweeterfollow.com Once you add everyone you are on the train or pay vip '" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 29 } ], "source": [ "sample_text" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "id": "fnm7NTcKDYxX" }, "outputs": [], "source": [ "import re" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 53 }, "id": "COXSgv-hDcNH", "outputId": "1f98a7fb-57a3-4e9e-e6b8-49eeba96f704" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "' vanphotolens Get followers a day using www tweeterfollow com Once you add everyone you are on the train or pay vip '" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 31 } ], "source": [ "sample_text = re.sub(\"[^a-zA-Z]\",' ',sample_text)\n", "sample_text" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 53 }, "id": "8KGLrhAbENEm", "outputId": "42f72040-f208-485c-a2e1-69fb1f14a749" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "' vanphotolens get followers a day using www tweeterfollow com once you add everyone you are on the train or pay vip '" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 32 } ], "source": [ "sample_text = sample_text.lower()\n", "sample_text" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "id": "nvEA-zoMDzy3" }, "outputs": [], "source": [ "sample_text = sample_text.split()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "hB0AcgPXD-pY", "outputId": "34949289-b050-40c5-f45d-84a4fdd10035" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['vanphotolens',\n", " 'get',\n", " 'followers',\n", " 'a',\n", " 'day',\n", " 'using',\n", " 'www',\n", " 'tweeterfollow',\n", " 'com',\n", " 'once',\n", " 'you',\n", " 'add',\n", " 'everyone',\n", " 'you',\n", " 'are',\n", " 'on',\n", " 'the',\n", " 'train',\n", " 'or',\n", " 'pay',\n", " 'vip']" ] }, "metadata": {}, "execution_count": 34 } ], "source": [ "sample_text" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "C3YVPNJuEGdD", "outputId": "7a2c2100-c7bd-49f1-c53c-a29cd330566f" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "21" ] }, "metadata": {}, "execution_count": 35 } ], "source": [ "len(sample_text)" ] }, { "cell_type": "code", "source": [ "stop_words = set(stopwords.words(\"english\"))\n", "stop_words" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "EuuiDs2ewWXu", "outputId": "b4e7185f-9a08-411e-acff-452751f4d8e8" }, "execution_count": 36, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'a',\n", " 'about',\n", " 'above',\n", " 'after',\n", " 'again',\n", " 'against',\n", " 'ain',\n", " 'all',\n", " 'am',\n", " 'an',\n", " 'and',\n", " 'any',\n", " 'are',\n", " 'aren',\n", " \"aren't\",\n", " 'as',\n", " 'at',\n", " 'be',\n", " 'because',\n", " 'been',\n", " 'before',\n", " 'being',\n", " 'below',\n", " 'between',\n", " 'both',\n", " 'but',\n", " 'by',\n", " 'can',\n", " 'couldn',\n", " \"couldn't\",\n", " 'd',\n", " 'did',\n", " 'didn',\n", " \"didn't\",\n", " 'do',\n", " 'does',\n", " 'doesn',\n", " \"doesn't\",\n", " 'doing',\n", " 'don',\n", " \"don't\",\n", " 'down',\n", " 'during',\n", " 'each',\n", " 'few',\n", " 'for',\n", " 'from',\n", " 'further',\n", " 'had',\n", " 'hadn',\n", " \"hadn't\",\n", " 'has',\n", " 'hasn',\n", " \"hasn't\",\n", " 'have',\n", " 'haven',\n", " \"haven't\",\n", " 'having',\n", " 'he',\n", " \"he'd\",\n", " \"he'll\",\n", " \"he's\",\n", " 'her',\n", " 'here',\n", " 'hers',\n", " 'herself',\n", " 'him',\n", " 'himself',\n", " 'his',\n", " 'how',\n", " 'i',\n", " \"i'd\",\n", " \"i'll\",\n", " \"i'm\",\n", " \"i've\",\n", " 'if',\n", " 'in',\n", " 'into',\n", " 'is',\n", " 'isn',\n", " \"isn't\",\n", " 'it',\n", " \"it'd\",\n", " \"it'll\",\n", " \"it's\",\n", " 'its',\n", " 'itself',\n", " 'just',\n", " 'll',\n", " 'm',\n", " 'ma',\n", " 'me',\n", " 'mightn',\n", " \"mightn't\",\n", " 'more',\n", " 'most',\n", " 'mustn',\n", " \"mustn't\",\n", " 'my',\n", " 'myself',\n", " 'needn',\n", " \"needn't\",\n", " 'no',\n", " 'nor',\n", " 'not',\n", " 'now',\n", " 'o',\n", " 'of',\n", " 'off',\n", " 'on',\n", " 'once',\n", " 'only',\n", " 'or',\n", " 'other',\n", " 'our',\n", " 'ours',\n", " 'ourselves',\n", " 'out',\n", " 'over',\n", " 'own',\n", " 're',\n", " 's',\n", " 'same',\n", " 'shan',\n", " \"shan't\",\n", " 'she',\n", " \"she'd\",\n", " \"she'll\",\n", " \"she's\",\n", " 'should',\n", " \"should've\",\n", " 'shouldn',\n", " \"shouldn't\",\n", " 'so',\n", " 'some',\n", " 'such',\n", " 't',\n", " 'than',\n", " 'that',\n", " \"that'll\",\n", " 'the',\n", " 'their',\n", " 'theirs',\n", " 'them',\n", " 'themselves',\n", " 'then',\n", " 'there',\n", " 'these',\n", " 'they',\n", " \"they'd\",\n", " \"they'll\",\n", " \"they're\",\n", " \"they've\",\n", " 'this',\n", " 'those',\n", " 'through',\n", " 'to',\n", " 'too',\n", " 'under',\n", " 'until',\n", " 'up',\n", " 've',\n", " 'very',\n", " 'was',\n", " 'wasn',\n", " \"wasn't\",\n", " 'we',\n", " \"we'd\",\n", " \"we'll\",\n", " \"we're\",\n", " \"we've\",\n", " 'were',\n", " 'weren',\n", " \"weren't\",\n", " 'what',\n", " 'when',\n", " 'where',\n", " 'which',\n", " 'while',\n", " 'who',\n", " 'whom',\n", " 'why',\n", " 'will',\n", " 'with',\n", " 'won',\n", " \"won't\",\n", " 'wouldn',\n", " \"wouldn't\",\n", " 'y',\n", " 'you',\n", " \"you'd\",\n", " \"you'll\",\n", " \"you're\",\n", " \"you've\",\n", " 'your',\n", " 'yours',\n", " 'yourself',\n", " 'yourselves'}" ] }, "metadata": {}, "execution_count": 36 } ] }, { "cell_type": "code", "source": [ "sample_text = [w for w in sample_text if w not in stop_words]\n", "sample_text" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YhEMcfQ2wuC4", "outputId": "6d8c67ad-f1a7-4a8b-9b6e-adb41a05f0fe" }, "execution_count": 37, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['vanphotolens',\n", " 'get',\n", " 'followers',\n", " 'day',\n", " 'using',\n", " 'www',\n", " 'tweeterfollow',\n", " 'com',\n", " 'add',\n", " 'everyone',\n", " 'train',\n", " 'pay',\n", " 'vip']" ] }, "metadata": {}, "execution_count": 37 } ] }, { "cell_type": "code", "source": [ "len(sample_text)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "luUkKN5NxDVw", "outputId": "e1d12840-4715-42e9-d0e7-4a0e0fbfb40e" }, "execution_count": 38, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "13" ] }, "metadata": {}, "execution_count": 38 } ] }, { "cell_type": "code", "source": [ "def process(review):\n", " review = BeautifulSoup(review).get_text()\n", " review = re.sub(\"[^a-zA-Z]\", ' ',review)\n", " review.lower()\n", " review = review.split()\n", " stop_words = set(stopwords.words(\"english\"))\n", " review = [w for w in review if w not in stop_words]\n", " return (' '.join(review))" ], "metadata": { "id": "xKiD-a49xKyT" }, "execution_count": 39, "outputs": [] }, { "cell_type": "code", "source": [ "train_x_all = []\n", "for index in df.index: # Iterate through DataFrame index values\n", " if (index + 1) % 1000 == 0:\n", " print(\"step of process\", index + 1)\n", " train_x_all.append(process(df.loc[index, 'tweet_text'])) # Use loc for index-based access" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Q3kqZGLy3HmH", "outputId": "d9ad5685-c613-494a-898d-30d17955a183" }, "execution_count": 41, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "step of process 772000\n", "step of process 323000\n", "step of process 305000\n", "step of process 164000\n", "step of process 1049000\n", "step of process 698000\n", "step of process 1586000\n", "step of process 1016000\n", "step of process 523000\n", "step of process 410000\n", "step of process 200000\n", "step of process 1317000\n", "step of process 1323000\n", "step of process 1151000\n", "step of process 1274000\n", "step of process 543000\n", "step of process 1482000\n", "step of process 1232000\n" ] } ] }, { "cell_type": "code", "source": [ "from sklearn.model_selection import train_test_split" ], "metadata": { "id": "If9iJarT7jpe" }, "execution_count": 42, "outputs": [] }, { "cell_type": "code", "source": [ "X = train_x_all\n", "y = np.array(df['0'])" ], "metadata": { "id": "meU1L-yP-NAm" }, "execution_count": 43, "outputs": [] }, { "cell_type": "code", "source": [ "X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)" ], "metadata": { "id": "BcasAZ66-WQ2" }, "execution_count": 44, "outputs": [] }, { "cell_type": "code", "source": [ "from sklearn.feature_extraction.text import CountVectorizer" ], "metadata": { "id": "EYVnw4u7-ncX" }, "execution_count": 45, "outputs": [] }, { "cell_type": "code", "source": [ "vectorizer = CountVectorizer(max_features=5000)" ], "metadata": { "id": "wyfKn32LAM9w" }, "execution_count": 46, "outputs": [] }, { "cell_type": "code", "source": [ "train_x1 = vectorizer.fit_transform(X_train)\n", "train_x1" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "RzeZaQSGATEY", "outputId": "433f5985-9115-4d33-9ca7-85572e2ebdd7" }, "execution_count": 47, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 47 } ] }, { "cell_type": "code", "source": [ "train_x1 = train_x1.toarray()" ], "metadata": { "id": "95Ma6XsKD966" }, "execution_count": 48, "outputs": [] }, { "cell_type": "code", "source": [ "train_y1 = y_train" ], "metadata": { "id": "FfBYLPu9D-_2" }, "execution_count": 49, "outputs": [] }, { "cell_type": "code", "source": [ "train_x1.shape, train_y1.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "z1OKEqxCECFs", "outputId": "3352f2eb-e491-4171-ba78-9e256b230f20" }, "execution_count": 50, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "((12800, 5000), (12800,))" ] }, "metadata": {}, "execution_count": 50 } ] }, { "cell_type": "code", "source": [ "import lightgbm as lgb\n", "model = lgb.LGBMClassifier()" ], "metadata": { "id": "djQ9FWOyEElZ" }, "execution_count": 51, "outputs": [] }, { "cell_type": "code", "source": [ "model.fit(train_x1,train_y1)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 274 }, "id": "Mm-aVRfiEJ0t", "outputId": "d90a0ab3-e636-4bbd-c072-611495d7d0ce" }, "execution_count": 52, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.11/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", " warnings.warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[LightGBM] [Info] Number of positive: 6468, number of negative: 6332\n", "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051104 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 2051\n", "[LightGBM] [Info] Number of data points in the train set: 12800, number of used features: 796\n", "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505313 -> initscore=0.021251\n", "[LightGBM] [Info] Start training from score 0.021251\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "LGBMClassifier()" ], "text/html": [ "
LGBMClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 52 } ] }, { "cell_type": "code", "source": [ "test_x1 = vectorizer.transform(X_test)" ], "metadata": { "id": "aPP2bBrUISfM" }, "execution_count": 53, "outputs": [] }, { "cell_type": "code", "source": [ "test_x1" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Zl_juoJqIW72", "outputId": "b6dc4c6a-8ec3-4a24-aae7-563ac0ca675d" }, "execution_count": 54, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 54 } ] }, { "cell_type": "code", "source": [ "test_x1 = test_x1.toarray()" ], "metadata": { "id": "THVcAM_zIbT5" }, "execution_count": 55, "outputs": [] }, { "cell_type": "code", "source": [ "test_x1.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "H_EZiKwoIbMY", "outputId": "a587ec03-5c1f-45f4-863b-f65ef56da33b" }, "execution_count": 56, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(3200, 5000)" ] }, "metadata": {}, "execution_count": 56 } ] }, { "cell_type": "code", "source": [ "from sklearn.metrics import roc_auc_score" ], "metadata": { "id": "zA1LCCL6ImY9" }, "execution_count": 57, "outputs": [] }, { "cell_type": "code", "source": [ "test_predict_proba = model.predict_proba(test_x1)[:, 1] # Assuming binary classification, get probabilities for class 1\n", "roc_auc = roc_auc_score(y_test, test_predict_proba) # Calculate ROC AUC using probabilities\n", "roc_auc" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "JBaTvQjnImSm", "outputId": "ebca2dc8-ede9-4f34-8ee3-ea357499ad04" }, "execution_count": 58, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.11/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", " warnings.warn(\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "np.float64(0.7849473015873015)" ] }, "metadata": {}, "execution_count": 58 } ] } ], "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "nbformat": 4, "nbformat_minor": 0 }