{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "dP6l8vOUtV0J"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "EDetmMNGuXY9"
},
"outputs": [],
"source": [
"df = pd.read_csv(\"/content/training.1600000.processed.noemoticon.csv\", encoding='latin-1')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "yDJXNYjqwYo2",
"outputId": "b56cb388-71f6-4d0e-c651-0201854cb952"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY \\\n",
"0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY \n",
"1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY \n",
"2 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n",
"3 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n",
"4 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY \n",
"... .. ... ... ... \n",
"1599994 4 2193601966 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY \n",
"1599995 4 2193601969 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY \n",
"1599996 4 2193601991 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY \n",
"1599997 4 2193602064 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY \n",
"1599998 4 2193602129 Tue Jun 16 08:40:50 PDT 2009 NO_QUERY \n",
"\n",
" _TheSpecialOne_ \\\n",
"0 scotthamilton \n",
"1 mattycus \n",
"2 ElleCTF \n",
"3 Karoli \n",
"4 joy_wolf \n",
"... ... \n",
"1599994 AmandaMarie1028 \n",
"1599995 TheWDBoards \n",
"1599996 bpbabe \n",
"1599997 tinydiamondz \n",
"1599998 RyanTrevMorris \n",
"\n",
" @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D \n",
"0 is upset that he can't update his Facebook by ... \n",
"1 @Kenichan I dived many times for the ball. Man... \n",
"2 my whole body feels itchy and like its on fire \n",
"3 @nationwideclass no, it's not behaving at all.... \n",
"4 @Kwesidei not the whole crew \n",
"... ... \n",
"1599994 Just woke up. Having no school is the best fee... \n",
"1599995 TheWDB.com - Very cool to hear old Walt interv... \n",
"1599996 Are you ready for your MoJo Makeover? Ask me f... \n",
"1599997 Happy 38th Birthday to my boo of alll time!!! ... \n",
"1599998 happy #charitytuesday @theNSPCC @SparksCharity... \n",
"\n",
"[1599999 rows x 6 columns]"
],
"text/html": [
"\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" 0 \n",
" 1467810369 \n",
" Mon Apr 06 22:19:45 PDT 2009 \n",
" NO_QUERY \n",
" _TheSpecialOne_ \n",
" @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 0 \n",
" 1467810672 \n",
" Mon Apr 06 22:19:49 PDT 2009 \n",
" NO_QUERY \n",
" scotthamilton \n",
" is upset that he can't update his Facebook by ... \n",
" \n",
" \n",
" 1 \n",
" 0 \n",
" 1467810917 \n",
" Mon Apr 06 22:19:53 PDT 2009 \n",
" NO_QUERY \n",
" mattycus \n",
" @Kenichan I dived many times for the ball. Man... \n",
" \n",
" \n",
" 2 \n",
" 0 \n",
" 1467811184 \n",
" Mon Apr 06 22:19:57 PDT 2009 \n",
" NO_QUERY \n",
" ElleCTF \n",
" my whole body feels itchy and like its on fire \n",
" \n",
" \n",
" 3 \n",
" 0 \n",
" 1467811193 \n",
" Mon Apr 06 22:19:57 PDT 2009 \n",
" NO_QUERY \n",
" Karoli \n",
" @nationwideclass no, it's not behaving at all.... \n",
" \n",
" \n",
" 4 \n",
" 0 \n",
" 1467811372 \n",
" Mon Apr 06 22:20:00 PDT 2009 \n",
" NO_QUERY \n",
" joy_wolf \n",
" @Kwesidei not the whole crew \n",
" \n",
" \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" \n",
" \n",
" 1599994 \n",
" 4 \n",
" 2193601966 \n",
" Tue Jun 16 08:40:49 PDT 2009 \n",
" NO_QUERY \n",
" AmandaMarie1028 \n",
" Just woke up. Having no school is the best fee... \n",
" \n",
" \n",
" 1599995 \n",
" 4 \n",
" 2193601969 \n",
" Tue Jun 16 08:40:49 PDT 2009 \n",
" NO_QUERY \n",
" TheWDBoards \n",
" TheWDB.com - Very cool to hear old Walt interv... \n",
" \n",
" \n",
" 1599996 \n",
" 4 \n",
" 2193601991 \n",
" Tue Jun 16 08:40:49 PDT 2009 \n",
" NO_QUERY \n",
" bpbabe \n",
" Are you ready for your MoJo Makeover? Ask me f... \n",
" \n",
" \n",
" 1599997 \n",
" 4 \n",
" 2193602064 \n",
" Tue Jun 16 08:40:49 PDT 2009 \n",
" NO_QUERY \n",
" tinydiamondz \n",
" Happy 38th Birthday to my boo of alll time!!! ... \n",
" \n",
" \n",
" 1599998 \n",
" 4 \n",
" 2193602129 \n",
" Tue Jun 16 08:40:50 PDT 2009 \n",
" NO_QUERY \n",
" RyanTrevMorris \n",
" happy #charitytuesday @theNSPCC @SparksCharity... \n",
" \n",
" \n",
"
\n",
"
1599999 rows × 6 columns
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df"
}
},
"metadata": {},
"execution_count": 3
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 290
},
"id": "kZqf-vIww7VB",
"outputId": "e1337690-1fc2-4e84-c800-e7e588a925fd"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0 0\n",
"1467810369 0\n",
"Mon Apr 06 22:19:45 PDT 2009 0\n",
"NO_QUERY 0\n",
"_TheSpecialOne_ 0\n",
"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D 0\n",
"dtype: int64"
],
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" 0 \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" 1467810369 \n",
" 0 \n",
" \n",
" \n",
" Mon Apr 06 22:19:45 PDT 2009 \n",
" 0 \n",
" \n",
" \n",
" NO_QUERY \n",
" 0 \n",
" \n",
" \n",
" _TheSpecialOne_ \n",
" 0 \n",
" \n",
" \n",
" @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D \n",
" 0 \n",
" \n",
" \n",
"
\n",
"
dtype: int64 "
]
},
"metadata": {},
"execution_count": 4
}
],
"source": [
"df.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 490
},
"id": "6yNPDs81xDEt",
"outputId": "26edc6a0-3338-4835-d87a-2f94db46b618"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"1467810369\n",
"1563681287 2\n",
"2062516845 2\n",
"1551586713 2\n",
"1676311044 2\n",
"1791602739 2\n",
" ..\n",
"2197310899 1\n",
"2197310477 1\n",
"2197310452 1\n",
"2197310381 1\n",
"2197311865 1\n",
"Name: count, Length: 1598314, dtype: int64"
],
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" count \n",
" \n",
" \n",
" 1467810369 \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" 1563681287 \n",
" 2 \n",
" \n",
" \n",
" 2062516845 \n",
" 2 \n",
" \n",
" \n",
" 1551586713 \n",
" 2 \n",
" \n",
" \n",
" 1676311044 \n",
" 2 \n",
" \n",
" \n",
" 1791602739 \n",
" 2 \n",
" \n",
" \n",
" ... \n",
" ... \n",
" \n",
" \n",
" 2197310899 \n",
" 1 \n",
" \n",
" \n",
" 2197310477 \n",
" 1 \n",
" \n",
" \n",
" 2197310452 \n",
" 1 \n",
" \n",
" \n",
" 2197310381 \n",
" 1 \n",
" \n",
" \n",
" 2197311865 \n",
" 1 \n",
" \n",
" \n",
"
\n",
"
1598314 rows × 1 columns
\n",
"
dtype: int64 "
]
},
"metadata": {},
"execution_count": 5
}
],
"source": [
"df['1467810369'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 147
},
"id": "qSDrw9sHxS2n",
"outputId": "81bad2a3-5bab-440f-9fb8-7a87d99c5e92"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"NO_QUERY\n",
"NO_QUERY 1599999\n",
"Name: count, dtype: int64"
],
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" count \n",
" \n",
" \n",
" NO_QUERY \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" NO_QUERY \n",
" 1599999 \n",
" \n",
" \n",
"
\n",
"
dtype: int64 "
]
},
"metadata": {},
"execution_count": 6
}
],
"source": [
"df['NO_QUERY'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"id": "3Ix-o-IJxW-B"
},
"outputs": [],
"source": [
"df.drop('NO_QUERY',axis=1,inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "IjITKGzlxhEv",
"outputId": "96f77f50-72aa-470b-9675-3b0cf66a8352"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"0 int64\n",
"1467810369 int64\n",
"Mon Apr 06 22:19:45 PDT 2009 object\n",
"_TheSpecialOne_ object\n",
"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D object\n",
"dtype: object\n"
]
}
],
"source": [
"df = df.sample(frac=0.1, random_state=42) # %10 veriyi al\n",
"print(df.dtypes)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Nx3sG-x313SH",
"outputId": "c2577bc1-6b84-4e13-9fc1-8f5f31578160"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"Index: 16000 entries, 1497139 to 1284035\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 0 16000 non-null int64 \n",
" 1 1467810369 16000 non-null int64 \n",
" 2 Mon Apr 06 22:19:45 PDT 2009 16000 non-null object\n",
" 3 _TheSpecialOne_ 16000 non-null object\n",
" 4 @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D 16000 non-null object\n",
"dtypes: int64(2), object(3)\n",
"memory usage: 750.0+ KB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 300
},
"id": "6dtZVvA515sK",
"outputId": "9a7af184-7a43-48ee-f453-d4eb224c6dde"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" 0 1467810369\n",
"count 16000.000000 1.600000e+04\n",
"mean 2.023250 1.997504e+09\n",
"std 1.999927 1.928363e+08\n",
"min 0.000000 1.467834e+09\n",
"25% 0.000000 1.956811e+09\n",
"50% 4.000000 2.001829e+09\n",
"75% 4.000000 2.176166e+09\n",
"max 4.000000 2.329169e+09"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" 0 \n",
" 1467810369 \n",
" \n",
" \n",
" \n",
" \n",
" count \n",
" 16000.000000 \n",
" 1.600000e+04 \n",
" \n",
" \n",
" mean \n",
" 2.023250 \n",
" 1.997504e+09 \n",
" \n",
" \n",
" std \n",
" 1.999927 \n",
" 1.928363e+08 \n",
" \n",
" \n",
" min \n",
" 0.000000 \n",
" 1.467834e+09 \n",
" \n",
" \n",
" 25% \n",
" 0.000000 \n",
" 1.956811e+09 \n",
" \n",
" \n",
" 50% \n",
" 4.000000 \n",
" 2.001829e+09 \n",
" \n",
" \n",
" 75% \n",
" 4.000000 \n",
" 2.176166e+09 \n",
" \n",
" \n",
" max \n",
" 4.000000 \n",
" 2.329169e+09 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \"df\",\n \"rows\": 8,\n \"fields\": [\n {\n \"column\": \"0\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5656.045202020064,\n \"min\": 0.0,\n \"max\": 16000.0,\n \"num_unique_values\": 5,\n \"samples\": [\n 2.02325,\n 4.0,\n 1.9999273535150421\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"1467810369\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 911085603.3007622,\n \"min\": 16000.0,\n \"max\": 2329168959.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 1997503968.4805624,\n 2001828620.0,\n 16000.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 11
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "SkWc-Myp2am4",
"outputId": "bf370764-f04f-4501-e048-a7b5fc59977b"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Count of IDs1467810369\n",
"1979685249 1\n",
"2047063573 1\n",
"2208198047 1\n",
"1996357873 1\n",
"1676456732 1\n",
" ..\n",
"2265368249 1\n",
"1679452821 1\n",
"2217064177 1\n",
"2045691294 1\n",
"2070290547 1\n",
"Name: count, Length: 16000, dtype: int64, count of usernames_TheSpecialOne_\n",
"Jayme1988 7\n",
"wowlew 7\n",
"ShesElectric_ 5\n",
"jbfanforever94 4\n",
"maynaseric 4\n",
" ..\n",
"Stephi90 1\n",
"ericzueff 1\n",
"munoza13 1\n",
"MissSTARcey 1\n",
"Tmama21 1\n",
"Name: count, Length: 15295, dtype: int64\n"
]
}
],
"source": [
"print(f'Count of IDs{df[\"1467810369\"].value_counts()}, count of usernames{df[\"_TheSpecialOne_\"].value_counts()}')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"id": "0PAvL2M_3vzw"
},
"outputs": [],
"source": [
"from nltk.corpus import stopwords"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"id": "MpHUYWR-CKrs"
},
"outputs": [],
"source": [
"import nltk"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "0kcFyu7hCUdy",
"outputId": "5e73979f-761d-4c9d-a5ff-4a48f501aa4e"
},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {},
"execution_count": 15
}
],
"source": [
"nltk.download(\"stopwords\")"
]
},
{
"cell_type": "code",
"source": [
"df = df.drop(columns=['1467810369'])"
],
"metadata": {
"id": "uql-EelvCFrN"
},
"execution_count": 16,
"outputs": []
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 632
},
"id": "Ur2vB4KWCrZL",
"outputId": "7ff638d7-3cba-410c-b5ac-d479e70d362f"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" 0 Mon Apr 06 22:19:45 PDT 2009 _TheSpecialOne_ \\\n",
"1497139 4 Sun Jun 07 17:26:02 PDT 2009 Jill_Osi \n",
"1350782 4 Fri Jun 05 11:05:43 PDT 2009 vmramos \n",
"589668 0 Wed Jun 17 20:20:31 PDT 2009 icorganics \n",
"52428 0 Sat May 02 09:23:27 PDT 2009 mundah \n",
"738013 0 Sun Jun 21 06:25:38 PDT 2009 skdev \n",
"... .. ... ... \n",
"1438815 4 Sat Jun 06 20:47:03 PDT 2009 HalfassBackward \n",
"298459 0 Mon Jun 01 17:16:49 PDT 2009 half_Milkman \n",
"961996 4 Sun May 17 10:12:06 PDT 2009 katizzle \n",
"1519397 4 Mon Jun 15 02:49:50 PDT 2009 michieong \n",
"1284035 4 Tue Jun 02 02:49:00 PDT 2009 itsJohno \n",
"\n",
" tweet_text \n",
"1497139 Can't wait to be at glacier national park \n",
"1350782 @virtualhispanic Falling apart, ha? Maybe you ... \n",
"589668 Oh my gosh, there is a Mom Entrepreneur of the... \n",
"52428 what shall i drink during the game?: Stella, M... \n",
"738013 @awaisnaseer @blessedAyesha ki LCD kharab hay \n",
"... ... \n",
"1438815 @bjolena Glad to hear your day went well Mine... \n",
"298459 @continuity_plus Lol...or responded to the req... \n",
"961996 manicure done: today -> black nails \n",
"1519397 okay, decided to listen to David Archuleta, no... \n",
"1284035 woah, everything looks amazing. i cant bloody ... \n",
"\n",
"[16000 rows x 4 columns]"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" 0 \n",
" Mon Apr 06 22:19:45 PDT 2009 \n",
" _TheSpecialOne_ \n",
" tweet_text \n",
" \n",
" \n",
" \n",
" \n",
" 1497139 \n",
" 4 \n",
" Sun Jun 07 17:26:02 PDT 2009 \n",
" Jill_Osi \n",
" Can't wait to be at glacier national park \n",
" \n",
" \n",
" 1350782 \n",
" 4 \n",
" Fri Jun 05 11:05:43 PDT 2009 \n",
" vmramos \n",
" @virtualhispanic Falling apart, ha? Maybe you ... \n",
" \n",
" \n",
" 589668 \n",
" 0 \n",
" Wed Jun 17 20:20:31 PDT 2009 \n",
" icorganics \n",
" Oh my gosh, there is a Mom Entrepreneur of the... \n",
" \n",
" \n",
" 52428 \n",
" 0 \n",
" Sat May 02 09:23:27 PDT 2009 \n",
" mundah \n",
" what shall i drink during the game?: Stella, M... \n",
" \n",
" \n",
" 738013 \n",
" 0 \n",
" Sun Jun 21 06:25:38 PDT 2009 \n",
" skdev \n",
" @awaisnaseer @blessedAyesha ki LCD kharab hay \n",
" \n",
" \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" \n",
" \n",
" 1438815 \n",
" 4 \n",
" Sat Jun 06 20:47:03 PDT 2009 \n",
" HalfassBackward \n",
" @bjolena Glad to hear your day went well Mine... \n",
" \n",
" \n",
" 298459 \n",
" 0 \n",
" Mon Jun 01 17:16:49 PDT 2009 \n",
" half_Milkman \n",
" @continuity_plus Lol...or responded to the req... \n",
" \n",
" \n",
" 961996 \n",
" 4 \n",
" Sun May 17 10:12:06 PDT 2009 \n",
" katizzle \n",
" manicure done: today -> black nails \n",
" \n",
" \n",
" 1519397 \n",
" 4 \n",
" Mon Jun 15 02:49:50 PDT 2009 \n",
" michieong \n",
" okay, decided to listen to David Archuleta, no... \n",
" \n",
" \n",
" 1284035 \n",
" 4 \n",
" Tue Jun 02 02:49:00 PDT 2009 \n",
" itsJohno \n",
" woah, everything looks amazing. i cant bloody ... \n",
" \n",
" \n",
"
\n",
"
16000 rows × 4 columns
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df",
"repr_error": "0"
}
},
"metadata": {},
"execution_count": 17
}
],
"source": [
"new_column_name = \"tweet_text\"\n",
"df.rename(columns={df.columns[3]: new_column_name}, inplace=True)\n",
"df"
]
},
{
"cell_type": "code",
"source": [
"def optimize_memory(df):\n",
" for col in df.select_dtypes(include=['int64', 'float64']).columns:\n",
" df[col] = pd.to_numeric(df[col], downcast='integer' if df[col].dtype == 'int64' else 'float')\n",
" return df\n",
"\n",
"df = optimize_memory(df)\n"
],
"metadata": {
"id": "Ogt2OoXxYqvR"
},
"execution_count": 18,
"outputs": []
},
{
"cell_type": "code",
"source": [
"for col in df.select_dtypes(include=['object']).columns:\n",
" df[col] = df[col].astype('category')\n"
],
"metadata": {
"id": "RwQUgoLNYuVr"
},
"execution_count": 19,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from sklearn.utils import shuffle\n",
"\n",
"batch_size = 100000 # Her seferde 100 bin satır işle\n",
"df = shuffle(df, random_state=42) # Verileri karıştır\n",
"\n",
"for i in range(0, len(df), batch_size):\n",
" batch = df.iloc[i:i + batch_size]\n",
" # Burada modeli eğit ve belleği temizle\n"
],
"metadata": {
"id": "5otlQQxlVuLU"
},
"execution_count": 20,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 632
},
"id": "7xTOy6i1b9oO",
"outputId": "dd96255e-9e20-4d6f-8afa-239f2b8b7d29"
},
"execution_count": 24,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" 0 Mon Apr 06 22:19:45 PDT 2009 _TheSpecialOne_ \\\n",
"1405892 4 Sat Jun 06 08:59:56 PDT 2009 judomary \n",
"841227 4 Sun Apr 19 12:36:21 PDT 2009 KrystleMiller \n",
"1542496 4 Mon Jun 15 11:20:14 PDT 2009 tweeteradder7 \n",
"705200 0 Sat Jun 20 12:32:46 PDT 2009 ClauBand \n",
"1240299 4 Mon Jun 01 10:52:34 PDT 2009 beccuhboo \n",
"... .. ... ... \n",
"34505 0 Mon Apr 20 05:07:13 PDT 2009 marawigirl \n",
"1569381 4 Mon Jun 15 21:49:18 PDT 2009 xkayjay \n",
"1082714 4 Fri May 29 22:07:41 PDT 2009 justelle \n",
"1203085 4 Sun May 31 18:08:19 PDT 2009 ninja24 \n",
"20120 0 Sun Apr 19 01:28:19 PDT 2009 metafiktion \n",
"\n",
" tweet_text \n",
"1405892 @johnnystimson Still? Wow. Your cheese must ha... \n",
"841227 @ciara_danella you're supposed to write it on ... \n",
"1542496 @vanphotolens Get 100 followers a day using ww... \n",
"705200 @ThiOliveiras To no ar Thi!!! \n",
"1240299 its cold out today . i love it \n",
"... ... \n",
"34505 hates being a girl one day every month. red fl... \n",
"1569381 10 minutes to midnight! going to get my jonas... \n",
"1082714 @krystyl deal girlie! It has been forever! Tom... \n",
"1203085 So can't be all sad about stuff time to move o... \n",
"20120 You don't quite get the same sense of satisfac... \n",
"\n",
"[16000 rows x 4 columns]"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" 0 \n",
" Mon Apr 06 22:19:45 PDT 2009 \n",
" _TheSpecialOne_ \n",
" tweet_text \n",
" \n",
" \n",
" \n",
" \n",
" 1405892 \n",
" 4 \n",
" Sat Jun 06 08:59:56 PDT 2009 \n",
" judomary \n",
" @johnnystimson Still? Wow. Your cheese must ha... \n",
" \n",
" \n",
" 841227 \n",
" 4 \n",
" Sun Apr 19 12:36:21 PDT 2009 \n",
" KrystleMiller \n",
" @ciara_danella you're supposed to write it on ... \n",
" \n",
" \n",
" 1542496 \n",
" 4 \n",
" Mon Jun 15 11:20:14 PDT 2009 \n",
" tweeteradder7 \n",
" @vanphotolens Get 100 followers a day using ww... \n",
" \n",
" \n",
" 705200 \n",
" 0 \n",
" Sat Jun 20 12:32:46 PDT 2009 \n",
" ClauBand \n",
" @ThiOliveiras To no ar Thi!!! \n",
" \n",
" \n",
" 1240299 \n",
" 4 \n",
" Mon Jun 01 10:52:34 PDT 2009 \n",
" beccuhboo \n",
" its cold out today . i love it \n",
" \n",
" \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" \n",
" \n",
" 34505 \n",
" 0 \n",
" Mon Apr 20 05:07:13 PDT 2009 \n",
" marawigirl \n",
" hates being a girl one day every month. red fl... \n",
" \n",
" \n",
" 1569381 \n",
" 4 \n",
" Mon Jun 15 21:49:18 PDT 2009 \n",
" xkayjay \n",
" 10 minutes to midnight! going to get my jonas... \n",
" \n",
" \n",
" 1082714 \n",
" 4 \n",
" Fri May 29 22:07:41 PDT 2009 \n",
" justelle \n",
" @krystyl deal girlie! It has been forever! Tom... \n",
" \n",
" \n",
" 1203085 \n",
" 4 \n",
" Sun May 31 18:08:19 PDT 2009 \n",
" ninja24 \n",
" So can't be all sad about stuff time to move o... \n",
" \n",
" \n",
" 20120 \n",
" 0 \n",
" Sun Apr 19 01:28:19 PDT 2009 \n",
" metafiktion \n",
" You don't quite get the same sense of satisfac... \n",
" \n",
" \n",
"
\n",
"
16000 rows × 4 columns
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df",
"summary": "{\n \"name\": \"df\",\n \"rows\": 16000,\n \"fields\": [\n {\n \"column\": \"0\",\n \"properties\": {\n \"dtype\": \"int8\",\n \"num_unique_values\": 2,\n \"samples\": [\n 0,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Mon Apr 06 22:19:45 PDT 2009\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 15821,\n \"samples\": [\n \"Fri May 29 17:37:13 PDT 2009\",\n \"Tue Jun 02 07:27:45 PDT 2009\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"_TheSpecialOne_\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 15295,\n \"samples\": [\n \"brittuhhnay\",\n \"niamhscullionb\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tweet_text\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 15977,\n \"samples\": [\n \"I iz lonely and wanting someone to cuddle with. \",\n \"@lasercosmetica What a Blissful time at Wet today! \"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 24
}
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"id": "_JuAy-coCXKH"
},
"outputs": [],
"source": [
"sample_text = df['tweet_text'].iloc[2] # Use iloc to access by position"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 53
},
"id": "A_2fzixNC8Z9",
"outputId": "556b7aab-2e8a-4c00-c24a-d6ac46c5872f"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'@vanphotolens Get 100 followers a day using www.tweeterfollow.com Once you add everyone you are on the train or pay vip '"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
}
},
"metadata": {},
"execution_count": 27
}
],
"source": [
"sample_text"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"id": "Hy3uO6kHDAEQ"
},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"sample_text = BeautifulSoup(sample_text).get_text()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 53
},
"id": "jcJIBe-QDUba",
"outputId": "ce0da960-b572-459a-8513-acd25e1aaaba"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'@vanphotolens Get 100 followers a day using www.tweeterfollow.com Once you add everyone you are on the train or pay vip '"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
}
},
"metadata": {},
"execution_count": 29
}
],
"source": [
"sample_text"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"id": "fnm7NTcKDYxX"
},
"outputs": [],
"source": [
"import re"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 53
},
"id": "COXSgv-hDcNH",
"outputId": "1f98a7fb-57a3-4e9e-e6b8-49eeba96f704"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"' vanphotolens Get followers a day using www tweeterfollow com Once you add everyone you are on the train or pay vip '"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
}
},
"metadata": {},
"execution_count": 31
}
],
"source": [
"sample_text = re.sub(\"[^a-zA-Z]\",' ',sample_text)\n",
"sample_text"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 53
},
"id": "8KGLrhAbENEm",
"outputId": "42f72040-f208-485c-a2e1-69fb1f14a749"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"' vanphotolens get followers a day using www tweeterfollow com once you add everyone you are on the train or pay vip '"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
}
},
"metadata": {},
"execution_count": 32
}
],
"source": [
"sample_text = sample_text.lower()\n",
"sample_text"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"id": "nvEA-zoMDzy3"
},
"outputs": [],
"source": [
"sample_text = sample_text.split()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "hB0AcgPXD-pY",
"outputId": "34949289-b050-40c5-f45d-84a4fdd10035"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['vanphotolens',\n",
" 'get',\n",
" 'followers',\n",
" 'a',\n",
" 'day',\n",
" 'using',\n",
" 'www',\n",
" 'tweeterfollow',\n",
" 'com',\n",
" 'once',\n",
" 'you',\n",
" 'add',\n",
" 'everyone',\n",
" 'you',\n",
" 'are',\n",
" 'on',\n",
" 'the',\n",
" 'train',\n",
" 'or',\n",
" 'pay',\n",
" 'vip']"
]
},
"metadata": {},
"execution_count": 34
}
],
"source": [
"sample_text"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "C3YVPNJuEGdD",
"outputId": "7a2c2100-c7bd-49f1-c53c-a29cd330566f"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"21"
]
},
"metadata": {},
"execution_count": 35
}
],
"source": [
"len(sample_text)"
]
},
{
"cell_type": "code",
"source": [
"stop_words = set(stopwords.words(\"english\"))\n",
"stop_words"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "EuuiDs2ewWXu",
"outputId": "b4e7185f-9a08-411e-acff-452751f4d8e8"
},
"execution_count": 36,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'a',\n",
" 'about',\n",
" 'above',\n",
" 'after',\n",
" 'again',\n",
" 'against',\n",
" 'ain',\n",
" 'all',\n",
" 'am',\n",
" 'an',\n",
" 'and',\n",
" 'any',\n",
" 'are',\n",
" 'aren',\n",
" \"aren't\",\n",
" 'as',\n",
" 'at',\n",
" 'be',\n",
" 'because',\n",
" 'been',\n",
" 'before',\n",
" 'being',\n",
" 'below',\n",
" 'between',\n",
" 'both',\n",
" 'but',\n",
" 'by',\n",
" 'can',\n",
" 'couldn',\n",
" \"couldn't\",\n",
" 'd',\n",
" 'did',\n",
" 'didn',\n",
" \"didn't\",\n",
" 'do',\n",
" 'does',\n",
" 'doesn',\n",
" \"doesn't\",\n",
" 'doing',\n",
" 'don',\n",
" \"don't\",\n",
" 'down',\n",
" 'during',\n",
" 'each',\n",
" 'few',\n",
" 'for',\n",
" 'from',\n",
" 'further',\n",
" 'had',\n",
" 'hadn',\n",
" \"hadn't\",\n",
" 'has',\n",
" 'hasn',\n",
" \"hasn't\",\n",
" 'have',\n",
" 'haven',\n",
" \"haven't\",\n",
" 'having',\n",
" 'he',\n",
" \"he'd\",\n",
" \"he'll\",\n",
" \"he's\",\n",
" 'her',\n",
" 'here',\n",
" 'hers',\n",
" 'herself',\n",
" 'him',\n",
" 'himself',\n",
" 'his',\n",
" 'how',\n",
" 'i',\n",
" \"i'd\",\n",
" \"i'll\",\n",
" \"i'm\",\n",
" \"i've\",\n",
" 'if',\n",
" 'in',\n",
" 'into',\n",
" 'is',\n",
" 'isn',\n",
" \"isn't\",\n",
" 'it',\n",
" \"it'd\",\n",
" \"it'll\",\n",
" \"it's\",\n",
" 'its',\n",
" 'itself',\n",
" 'just',\n",
" 'll',\n",
" 'm',\n",
" 'ma',\n",
" 'me',\n",
" 'mightn',\n",
" \"mightn't\",\n",
" 'more',\n",
" 'most',\n",
" 'mustn',\n",
" \"mustn't\",\n",
" 'my',\n",
" 'myself',\n",
" 'needn',\n",
" \"needn't\",\n",
" 'no',\n",
" 'nor',\n",
" 'not',\n",
" 'now',\n",
" 'o',\n",
" 'of',\n",
" 'off',\n",
" 'on',\n",
" 'once',\n",
" 'only',\n",
" 'or',\n",
" 'other',\n",
" 'our',\n",
" 'ours',\n",
" 'ourselves',\n",
" 'out',\n",
" 'over',\n",
" 'own',\n",
" 're',\n",
" 's',\n",
" 'same',\n",
" 'shan',\n",
" \"shan't\",\n",
" 'she',\n",
" \"she'd\",\n",
" \"she'll\",\n",
" \"she's\",\n",
" 'should',\n",
" \"should've\",\n",
" 'shouldn',\n",
" \"shouldn't\",\n",
" 'so',\n",
" 'some',\n",
" 'such',\n",
" 't',\n",
" 'than',\n",
" 'that',\n",
" \"that'll\",\n",
" 'the',\n",
" 'their',\n",
" 'theirs',\n",
" 'them',\n",
" 'themselves',\n",
" 'then',\n",
" 'there',\n",
" 'these',\n",
" 'they',\n",
" \"they'd\",\n",
" \"they'll\",\n",
" \"they're\",\n",
" \"they've\",\n",
" 'this',\n",
" 'those',\n",
" 'through',\n",
" 'to',\n",
" 'too',\n",
" 'under',\n",
" 'until',\n",
" 'up',\n",
" 've',\n",
" 'very',\n",
" 'was',\n",
" 'wasn',\n",
" \"wasn't\",\n",
" 'we',\n",
" \"we'd\",\n",
" \"we'll\",\n",
" \"we're\",\n",
" \"we've\",\n",
" 'were',\n",
" 'weren',\n",
" \"weren't\",\n",
" 'what',\n",
" 'when',\n",
" 'where',\n",
" 'which',\n",
" 'while',\n",
" 'who',\n",
" 'whom',\n",
" 'why',\n",
" 'will',\n",
" 'with',\n",
" 'won',\n",
" \"won't\",\n",
" 'wouldn',\n",
" \"wouldn't\",\n",
" 'y',\n",
" 'you',\n",
" \"you'd\",\n",
" \"you'll\",\n",
" \"you're\",\n",
" \"you've\",\n",
" 'your',\n",
" 'yours',\n",
" 'yourself',\n",
" 'yourselves'}"
]
},
"metadata": {},
"execution_count": 36
}
]
},
{
"cell_type": "code",
"source": [
"sample_text = [w for w in sample_text if w not in stop_words]\n",
"sample_text"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YhEMcfQ2wuC4",
"outputId": "6d8c67ad-f1a7-4a8b-9b6e-adb41a05f0fe"
},
"execution_count": 37,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['vanphotolens',\n",
" 'get',\n",
" 'followers',\n",
" 'day',\n",
" 'using',\n",
" 'www',\n",
" 'tweeterfollow',\n",
" 'com',\n",
" 'add',\n",
" 'everyone',\n",
" 'train',\n",
" 'pay',\n",
" 'vip']"
]
},
"metadata": {},
"execution_count": 37
}
]
},
{
"cell_type": "code",
"source": [
"len(sample_text)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "luUkKN5NxDVw",
"outputId": "e1d12840-4715-42e9-d0e7-4a0e0fbfb40e"
},
"execution_count": 38,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"13"
]
},
"metadata": {},
"execution_count": 38
}
]
},
{
"cell_type": "code",
"source": [
"def process(review):\n",
" review = BeautifulSoup(review).get_text()\n",
" review = re.sub(\"[^a-zA-Z]\", ' ',review)\n",
" review.lower()\n",
" review = review.split()\n",
" stop_words = set(stopwords.words(\"english\"))\n",
" review = [w for w in review if w not in stop_words]\n",
" return (' '.join(review))"
],
"metadata": {
"id": "xKiD-a49xKyT"
},
"execution_count": 39,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Clean every tweet in row order. Progress is counted by position in the\n",
"# frame, not by index label, so the messages stay monotonic even when the\n",
"# index is shuffled or non-contiguous.\n",
"train_x_all = []\n",
"for position, tweet in enumerate(df['tweet_text'], start=1):\n",
"    if position % 1000 == 0:\n",
"        print(\"step of process\", position)\n",
"    train_x_all.append(process(tweet))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Q3kqZGLy3HmH",
"outputId": "d9ad5685-c613-494a-898d-30d17955a183"
},
"execution_count": 41,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"step of process 772000\n",
"step of process 323000\n",
"step of process 305000\n",
"step of process 164000\n",
"step of process 1049000\n",
"step of process 698000\n",
"step of process 1586000\n",
"step of process 1016000\n",
"step of process 523000\n",
"step of process 410000\n",
"step of process 200000\n",
"step of process 1317000\n",
"step of process 1323000\n",
"step of process 1151000\n",
"step of process 1274000\n",
"step of process 543000\n",
"step of process 1482000\n",
"step of process 1232000\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from sklearn.model_selection import train_test_split"
],
"metadata": {
"id": "If9iJarT7jpe"
},
"execution_count": 42,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Labels are read from the frame's '0' column and copied into a NumPy array;\n",
"# features are the cleaned tweet strings built by the preprocessing loop.\n",
"y = np.array(df['0'])\n",
"X = train_x_all"
],
"metadata": {
"id": "meU1L-yP-NAm"
},
"execution_count": 43,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Hold out 20% of the rows for evaluation; the fixed seed keeps the split\n",
"# identical across re-runs.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X,\n",
"    y,\n",
"    test_size=0.2,\n",
"    random_state=42,\n",
")"
],
"metadata": {
"id": "BcasAZ66-WQ2"
},
"execution_count": 44,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from sklearn.feature_extraction.text import CountVectorizer"
],
"metadata": {
"id": "EYVnw4u7-ncX"
},
"execution_count": 45,
"outputs": []
},
{
"cell_type": "code",
"source": [
"vectorizer = CountVectorizer(max_features=5000)"
],
"metadata": {
"id": "wyfKn32LAM9w"
},
"execution_count": 46,
"outputs": []
},
{
"cell_type": "code",
"source": [
"train_x1 = vectorizer.fit_transform(X_train)\n",
"train_x1"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "RzeZaQSGATEY",
"outputId": "433f5985-9115-4d33-9ca7-85572e2ebdd7"
},
"execution_count": 47,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
""
]
},
"metadata": {},
"execution_count": 47
}
]
},
{
"cell_type": "code",
"source": [
"train_x1 = train_x1.toarray()"
],
"metadata": {
"id": "95Ma6XsKD966"
},
"execution_count": 48,
"outputs": []
},
{
"cell_type": "code",
"source": [
"train_y1 = y_train"
],
"metadata": {
"id": "FfBYLPu9D-_2"
},
"execution_count": 49,
"outputs": []
},
{
"cell_type": "code",
"source": [
"train_x1.shape, train_y1.shape"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "z1OKEqxCECFs",
"outputId": "3352f2eb-e491-4171-ba78-9e256b230f20"
},
"execution_count": 50,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"((12800, 5000), (12800,))"
]
},
"metadata": {},
"execution_count": 50
}
]
},
{
"cell_type": "code",
"source": [
"import lightgbm as lgb\n",
"model = lgb.LGBMClassifier()"
],
"metadata": {
"id": "djQ9FWOyEElZ"
},
"execution_count": 51,
"outputs": []
},
{
"cell_type": "code",
"source": [
"model.fit(train_x1,train_y1)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 274
},
"id": "Mm-aVRfiEJ0t",
"outputId": "d90a0ab3-e636-4bbd-c072-611495d7d0ce"
},
"execution_count": 52,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.11/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"[LightGBM] [Info] Number of positive: 6468, number of negative: 6332\n",
"[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051104 seconds.\n",
"You can set `force_row_wise=true` to remove the overhead.\n",
"And if memory is not enough, you can set `force_col_wise=true`.\n",
"[LightGBM] [Info] Total Bins 2051\n",
"[LightGBM] [Info] Number of data points in the train set: 12800, number of used features: 796\n",
"[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505313 -> initscore=0.021251\n",
"[LightGBM] [Info] Start training from score 0.021251\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"LGBMClassifier()"
],
"text/html": [
"LGBMClassifier() In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
]
},
"metadata": {},
"execution_count": 52
}
]
},
{
"cell_type": "code",
"source": [
"test_x1 = vectorizer.transform(X_test)"
],
"metadata": {
"id": "aPP2bBrUISfM"
},
"execution_count": 53,
"outputs": []
},
{
"cell_type": "code",
"source": [
"test_x1"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Zl_juoJqIW72",
"outputId": "b6dc4c6a-8ec3-4a24-aae7-563ac0ca675d"
},
"execution_count": 54,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
""
]
},
"metadata": {},
"execution_count": 54
}
]
},
{
"cell_type": "code",
"source": [
"test_x1 = test_x1.toarray()"
],
"metadata": {
"id": "THVcAM_zIbT5"
},
"execution_count": 55,
"outputs": []
},
{
"cell_type": "code",
"source": [
"test_x1.shape"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "H_EZiKwoIbMY",
"outputId": "a587ec03-5c1f-45f4-863b-f65ef56da33b"
},
"execution_count": 56,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(3200, 5000)"
]
},
"metadata": {},
"execution_count": 56
}
]
},
{
"cell_type": "code",
"source": [
"from sklearn.metrics import roc_auc_score"
],
"metadata": {
"id": "zA1LCCL6ImY9"
},
"execution_count": 57,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Score the held-out rows. predict_proba yields one column per class; column 1\n",
"# is used as the positive-class score (binary problem assumed).\n",
"class_probabilities = model.predict_proba(test_x1)\n",
"test_predict_proba = class_probabilities[:, 1]\n",
"roc_auc = roc_auc_score(y_true=y_test, y_score=test_predict_proba)\n",
"roc_auc"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "JBaTvQjnImSm",
"outputId": "ebca2dc8-ede9-4f34-8ee3-ea357499ad04"
},
"execution_count": 58,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.11/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"np.float64(0.7849473015873015)"
]
},
"metadata": {},
"execution_count": 58
}
]
}
],
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"nbformat": 4,
"nbformat_minor": 0
}