diff --git "a/source_code/assets/notebooks/data_gathering_twitter_API.ipynb" "b/source_code/assets/notebooks/data_gathering_twitter_API.ipynb"
new file mode 100644--- /dev/null
+++ "b/source_code/assets/notebooks/data_gathering_twitter_API.ipynb"
@@ -0,0 +1,3078 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "Twitter_API.ipynb",
+ "provenance": []
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "n9mFOtjUGKmk"
+ },
+ "source": [
+ "# Tweet mining using Twitter API via Tweepy:"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-3bUQ54_84g8"
+ },
+ "source": [
+ "In this notebook I use the Tweepy Python library to mine tweets that contain relevant depression-related hashtags. I was able to retrieve around 19,000 unique tweets via the Twitter API. At the end, the datasets collected for the different depressive hashtags are combined, cleaned and saved as depressive_tweets.csv."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "1Bojm_bffNAV",
+ "outputId": "92f04f31-eb1b-4c13-f811-1cad9d759a34"
+ },
+ "source": [
+ "from google.colab import drive\n",
+ "drive.mount('/content/drive')"
+ ],
+ "execution_count": 1,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Mounted at /content/drive\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "7iWDBsjTwEyZ"
+ },
+ "source": [
+ "## Tweets mining"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "TtZk0vyLwWwW"
+ },
+ "source": [
+ "# Pin Tweepy 3.x: this notebook relies on StreamListener, API.search and wait_on_rate_limit_notify,\n",
+ "# all of which were removed or renamed in Tweepy 4.0.\n",
+ "!pip install -qqq tweepy==3.10.0"
+ ],
+ "execution_count": 2,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "jobjTBDIwhUl"
+ },
+ "source": [
+ "## Import required libraries\n",
+ "import tweepy\n",
+ "from tweepy.streaming import StreamListener\n",
+ "from tweepy import OAuthHandler\n",
+ "from tweepy import Stream\n",
+ "import csv\n",
+ "import pandas as pd\n",
+ "\n",
+ "## Access to twitter API consumer_key and access_secret\n",
+ "#import config.ipynb"
+ ],
+ "execution_count": 3,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "Dv5AsxY6iL2s"
+ },
+ "source": [
+ "## Twitter API related information\n",
+ "# `config` must be imported before its attributes are read; without this import the cell\n",
+ "# raises NameError on a fresh kernel.\n",
+ "# NOTE(review): assumes the credentials live in an importable config.py that defines API_KEY,\n",
+ "# API_KEY_SECRET, ACCESS_TOKEN and ACCESS_TOKEN_SECRET - confirm the module location.\n",
+ "import config\n",
+ "\n",
+ "consumer_key = config.API_KEY\n",
+ "consumer_secret = config.API_KEY_SECRET\n",
+ "access_key = config.ACCESS_TOKEN\n",
+ "access_secret = config.ACCESS_TOKEN_SECRET"
+ ],
+ "execution_count": 4,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "M6mSp-B_vzn-"
+ },
+ "source": [
+ "# Authenticate with the consumer key/secret, then attach the user's access token/secret\n",
+ "auth = OAuthHandler(consumer_key, consumer_secret)\n",
+ "auth.set_access_token(access_key, access_secret)\n",
+ "\n",
+ "# Build the API client; it sleeps when the API rate limit is reached\n",
+ "api = tweepy.API(\n",
+ "    auth,\n",
+ "    wait_on_rate_limit=True,\n",
+ "    wait_on_rate_limit_notify=True,\n",
+ ")"
+ ],
+ "execution_count": 5,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "FHqBQHYpDcz_"
+ },
+ "source": [
+ "## depress_tags = [\"#depressed\", \"#anxiety\", \"#depression\", \"#suicide\", \"#mentalhealth\",\n",
+ "## \"#loneliness\", \"#hopelessness\", \"#itsokaynottobeokay\", \"#sad\"]"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "0-BvNrToRims"
+ },
+ "source": [
+ "## \"#depressed\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "BERTal4NwVNx"
+ },
+ "source": [
+ "## Create a function for tweets mining\n",
+ "def tweets_mining1(search_query1, num_tweets1, since_id_num1,\n",
+ "                   csv_path1='/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv'):\n",
+ "    \"\"\"Search tweets with the module-level `api` client and append them to a CSV file.\n",
+ "\n",
+ "    Args:\n",
+ "        search_query1: query string handed to the search endpoint.\n",
+ "        num_tweets1: upper bound on the number of tweets pulled from the cursor.\n",
+ "        since_id_num1: forwarded as `since_id` to the search call.\n",
+ "        csv_path1: CSV file the rows are appended to (defaults to the #depressed file).\n",
+ "\n",
+ "    Each appended row is\n",
+ "    [tweet id, created_at, full text, user location, retweet count, favorite count].\n",
+ "    \"\"\"\n",
+ "    # Collect tweets using the Cursor object\n",
+ "    # Each item in the iterator has various attributes that you can access to get information about each tweet\n",
+ "    tweet_list1 = list(tweepy.Cursor(api.search, q=search_query1, lang=\"en\", since_id=since_id_num1,\n",
+ "                                     tweet_mode='extended').items(num_tweets1))\n",
+ "    if not tweet_list1:\n",
+ "        return  # nothing to save: do not create or touch the CSV\n",
+ "\n",
+ "    # Open the file once for the whole batch instead of re-opening it for every tweet\n",
+ "    with open(csv_path1, 'a', newline='', encoding='utf-8') as csvFile1:\n",
+ "        csv_writer1 = csv.writer(csvFile1, delimiter=',')  # create an instance of csv object\n",
+ "        # Rows are written in reverse of the order the cursor returned them; the caller\n",
+ "        # reads the last row back as the resume point (presumably the newest tweet).\n",
+ "        for tweet in reversed(tweet_list1):\n",
+ "            csv_writer1.writerow([\n",
+ "                tweet.id,                # Tweet ID\n",
+ "                tweet.created_at,        # time the tweet was created\n",
+ "                tweet.full_text,         # full tweet text\n",
+ "                tweet.user.location,     # user location\n",
+ "                tweet.retweet_count,     # number of retweets\n",
+ "                tweet.favorite_count,    # number of likes\n",
+ "            ])"
+ ],
+ "execution_count": 6,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "8LOXgG5xygnj"
+ },
+ "source": [
+ "search_words1 = \"#depressed\" # Specifying exact phrase to search\n",
+ "# Exclude Links, retweets, replies\n",
+ "search_query1 = search_words1 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n",
+ "\n",
+ "# Resume from the tweet ID stored in the last row of the CSV.\n",
+ "# On a first run the CSV is missing (or empty): fall back to since_id=None instead of crashing.\n",
+ "last_row = None\n",
+ "try:\n",
+ "    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv', newline='', encoding='utf-8') as data:\n",
+ "        for row in csv.reader(data):  # stream the file instead of loading every row into a list\n",
+ "            if row:  # ignore blank lines\n",
+ "                last_row = row\n",
+ "except FileNotFoundError:\n",
+ "    pass  # nothing has been collected yet\n",
+ "latest_tweet = int(last_row[0]) if last_row else None\n",
+ "tweets_mining1(search_query1, 1000, latest_tweet)"
+ ],
+ "execution_count": 7,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "JSDTPj7Nz5Rh"
+ },
+ "source": [
+ "# The CSV is stored without a header row, so the column names are supplied here\n",
+ "df_depressed_1 = pd.read_csv(\n",
+ "    \"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv\",\n",
+ "    header=None,\n",
+ "    names=[\"tweet.id\", \"created_at\", \"text\", \"location\", \"retweet\", \"favorite\"],\n",
+ ")"
+ ],
+ "execution_count": 8,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 581
+ },
+ "id": "aQe7bso7VBZA",
+ "outputId": "bed5b299-8399-4b86-f6d6-630085f308a8"
+ },
+ "source": [
+ "df_depressed_1"
+ ],
+ "execution_count": 9,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " tweet.id | \n",
+ " created_at | \n",
+ " text | \n",
+ " location | \n",
+ " retweet | \n",
+ " favorite | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1446882366945837057 | \n",
+ " 2021-10-09 16:56:52 | \n",
+ " I totally need someone to hug me TIGHT and say... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1446896799860539394 | \n",
+ " 2021-10-09 17:54:13 | \n",
+ " i plan on committing suicide today or tommorro... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1446912210672959491 | \n",
+ " 2021-10-09 18:55:28 | \n",
+ " Exhausted! Absolutely exhausted and my day isn... | \n",
+ " Lost 🤕 | \n",
+ " 0 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1446931930537209856 | \n",
+ " 2021-10-09 20:13:49 | \n",
+ " Im going to get Far Cry 6 and playing video ga... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1446934914453082113 | \n",
+ " 2021-10-09 20:25:41 | \n",
+ " Just #depressed haven’t made money in 4 days o... | \n",
+ " Daddy’s lap. | \n",
+ " 0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 1440 | \n",
+ " 1459292661848883203 | \n",
+ " 2021-11-12 22:50:57 | \n",
+ " it gets dark at 5 now. #depressed | \n",
+ " Toronto, Ontario | \n",
+ " 0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 1441 | \n",
+ " 1459295472993153030 | \n",
+ " 2021-11-12 23:02:07 | \n",
+ " Ignore my tweets, if I tweet, for the next cou... | \n",
+ " Paisley, Scotland | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1442 | \n",
+ " 1459323510803759108 | \n",
+ " 2021-11-13 00:53:32 | \n",
+ " how tf you a psychology major and depressed? l... | \n",
+ " San Diego, CA | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1443 | \n",
+ " 1459376207527440385 | \n",
+ " 2021-11-13 04:22:56 | \n",
+ " Liquors my bestie till my flight tomorrow fml ... | \n",
+ " Dreamville, LBC♥ | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1444 | \n",
+ " 1459497253698035714 | \n",
+ " 2021-11-13 12:23:56 | \n",
+ " i signed up for @netflix just so i can watch b... | \n",
+ " Washington, USA | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1445 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " tweet.id created_at ... retweet favorite\n",
+ "0 1446882366945837057 2021-10-09 16:56:52 ... 0 1\n",
+ "1 1446896799860539394 2021-10-09 17:54:13 ... 0 1\n",
+ "2 1446912210672959491 2021-10-09 18:55:28 ... 0 8\n",
+ "3 1446931930537209856 2021-10-09 20:13:49 ... 0 1\n",
+ "4 1446934914453082113 2021-10-09 20:25:41 ... 0 2\n",
+ "... ... ... ... ... ...\n",
+ "1440 1459292661848883203 2021-11-12 22:50:57 ... 0 2\n",
+ "1441 1459295472993153030 2021-11-12 23:02:07 ... 0 1\n",
+ "1442 1459323510803759108 2021-11-13 00:53:32 ... 0 0\n",
+ "1443 1459376207527440385 2021-11-13 04:22:56 ... 0 0\n",
+ "1444 1459497253698035714 2021-11-13 12:23:56 ... 0 0\n",
+ "\n",
+ "[1445 rows x 6 columns]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 9
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "gnZnQBdQ8VZL",
+ "outputId": "2dc93be1-17f9-4b9d-d1d5-cab5eafdb544"
+ },
+ "source": [
+ "## Count the distinct values (missing values included) held by each column\n",
+ "for column_name, column_values in df_depressed_1.items():\n",
+ "    distinct_count = column_values.nunique(dropna=False)\n",
+ "    print(\"There are \", distinct_count, \"unique values in \", column_name)"
+ ],
+ "execution_count": 10,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "There are 849 unique values in tweet.id\n",
+ "There are 849 unique values in created_at\n",
+ "There are 843 unique values in text\n",
+ "There are 383 unique values in location\n",
+ "There are 7 unique values in retweet\n",
+ "There are 25 unique values in favorite\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "jVSywSxSvYbS"
+ },
+ "source": [
+ "## \"#anxiety\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "1UWM-o41vd6Z"
+ },
+ "source": [
+ "## Create a function for tweets mining\n",
+ "def tweets_mining2(search_query2, num_tweets2, since_id_num2,\n",
+ "                   csv_path2='/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv'):\n",
+ "    \"\"\"Search tweets with the module-level `api` client and append them to a CSV file.\n",
+ "\n",
+ "    Args:\n",
+ "        search_query2: query string handed to the search endpoint.\n",
+ "        num_tweets2: upper bound on the number of tweets pulled from the cursor.\n",
+ "        since_id_num2: forwarded as `since_id` to the search call.\n",
+ "        csv_path2: CSV file the rows are appended to (defaults to the #anxiety file).\n",
+ "\n",
+ "    Each appended row is\n",
+ "    [tweet id, created_at, full text, user location, retweet count, favorite count].\n",
+ "    \"\"\"\n",
+ "    # Collect tweets using the Cursor object\n",
+ "    # Each item in the iterator has various attributes that you can access to get information about each tweet\n",
+ "    tweet_list2 = list(tweepy.Cursor(api.search, q=search_query2, lang=\"en\", since_id=since_id_num2,\n",
+ "                                     tweet_mode='extended').items(num_tweets2))\n",
+ "    if not tweet_list2:\n",
+ "        return  # nothing to save: do not create or touch the CSV\n",
+ "\n",
+ "    # Open the file once for the whole batch instead of re-opening it for every tweet\n",
+ "    with open(csv_path2, 'a', newline='', encoding='utf-8') as csvFile2:\n",
+ "        csv_writer2 = csv.writer(csvFile2, delimiter=',')  # create an instance of csv object\n",
+ "        # Rows are written in reverse of the order the cursor returned them; the caller\n",
+ "        # reads the last row back as the resume point (presumably the newest tweet).\n",
+ "        for tweet in reversed(tweet_list2):\n",
+ "            csv_writer2.writerow([\n",
+ "                tweet.id,                # Tweet ID\n",
+ "                tweet.created_at,        # time the tweet was created\n",
+ "                tweet.full_text,         # full tweet text\n",
+ "                tweet.user.location,     # user location\n",
+ "                tweet.retweet_count,     # number of retweets\n",
+ "                tweet.favorite_count,    # number of likes\n",
+ "            ])"
+ ],
+ "execution_count": 11,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "4WS3HYJ_yUPe"
+ },
+ "source": [
+ "search_words2 = \"#anxiety\" # Specifying exact phrase to search\n",
+ "# Exclude Links, retweets, replies\n",
+ "search_query2 = search_words2 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n",
+ "\n",
+ "# Resume from the tweet ID stored in the last row of the CSV.\n",
+ "# On a first run the CSV is missing (or empty): fall back to since_id=None instead of crashing.\n",
+ "last_row = None\n",
+ "try:\n",
+ "    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv', newline='', encoding='utf-8') as data:\n",
+ "        for row in csv.reader(data):  # stream the file instead of loading every row into a list\n",
+ "            if row:  # ignore blank lines\n",
+ "                last_row = row\n",
+ "except FileNotFoundError:\n",
+ "    pass  # nothing has been collected yet\n",
+ "latest_tweet = int(last_row[0]) if last_row else None\n",
+ "tweets_mining2(search_query2, 2000, latest_tweet)"
+ ],
+ "execution_count": 12,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "mMnPf-UoD1gA"
+ },
+ "source": [
+ "# The CSV is stored without a header row, so the column names are supplied here\n",
+ "df_anxiety_1 = pd.read_csv(\n",
+ "    \"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv\",\n",
+ "    header=None,\n",
+ "    names=[\"tweet.id\", \"created_at\", \"text\", \"location\", \"retweet\", \"favorite\"],\n",
+ ")"
+ ],
+ "execution_count": 13,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 581
+ },
+ "id": "SyvsN8-3D73N",
+ "outputId": "d139df05-638a-4a91-e94c-e7560db53069"
+ },
+ "source": [
+ "df_anxiety_1"
+ ],
+ "execution_count": 14,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " tweet.id | \n",
+ " created_at | \n",
+ " text | \n",
+ " location | \n",
+ " retweet | \n",
+ " favorite | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1447067749654614019 | \n",
+ " 2021-10-10 05:13:31 | \n",
+ " I can't wait to get the hell out. so I'll jus... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1447069714379857927 | \n",
+ " 2021-10-10 05:21:19 | \n",
+ " Morning. All people except me sleeping. @Billy... | \n",
+ " Queenie's Castle,Yate, S Glos | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1447072203388985346 | \n",
+ " 2021-10-10 05:31:13 | \n",
+ " On #WorldMentalHealthDay, a big shoutout to my... | \n",
+ " Bengaluru/Muscat/Palakad/Kochi | \n",
+ " 0 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1447072334825754626 | \n",
+ " 2021-10-10 05:31:44 | \n",
+ " I hate having anxiety about doing stuff that I... | \n",
+ " Utah, USA | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1447074986531848192 | \n",
+ " 2021-10-10 05:42:16 | \n",
+ " I am not scared of my ADHD, depression and anx... | \n",
+ " Wollongong, New South Wales | \n",
+ " 2 | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 6867 | \n",
+ " 1459224031777939460 | \n",
+ " 2021-11-12 18:18:14 | \n",
+ " It’s amazing how everyone runs to me as the su... | \n",
+ " Pennsylvania, USA | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 6868 | \n",
+ " 1459224808512704516 | \n",
+ " 2021-11-12 18:21:20 | \n",
+ " Any suggestions on settling the stomach after ... | \n",
+ " Everywhere, Anywhere | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 6869 | \n",
+ " 1459228047278751747 | \n",
+ " 2021-11-12 18:34:12 | \n",
+ " Gotta love that superpowered #anxiety taking h... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 6870 | \n",
+ " 1459229518128893952 | \n",
+ " 2021-11-12 18:40:02 | \n",
+ " Growth nor healing is linear. Sometimes you ma... | \n",
+ " London | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 6871 | \n",
+ " 1459230527358222337 | \n",
+ " 2021-11-12 18:44:03 | \n",
+ " Just read on a YouTube comment how mentally il... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
6872 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " tweet.id created_at ... retweet favorite\n",
+ "0 1447067749654614019 2021-10-10 05:13:31 ... 0 0\n",
+ "1 1447069714379857927 2021-10-10 05:21:19 ... 0 1\n",
+ "2 1447072203388985346 2021-10-10 05:31:13 ... 0 9\n",
+ "3 1447072334825754626 2021-10-10 05:31:44 ... 0 0\n",
+ "4 1447074986531848192 2021-10-10 05:42:16 ... 2 11\n",
+ "... ... ... ... ... ...\n",
+ "6867 1459224031777939460 2021-11-12 18:18:14 ... 0 0\n",
+ "6868 1459224808512704516 2021-11-12 18:21:20 ... 0 0\n",
+ "6869 1459228047278751747 2021-11-12 18:34:12 ... 0 0\n",
+ "6870 1459229518128893952 2021-11-12 18:40:02 ... 0 0\n",
+ "6871 1459230527358222337 2021-11-12 18:44:03 ... 0 0\n",
+ "\n",
+ "[6872 rows x 6 columns]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 14
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ygvC0l-C9NXp",
+ "outputId": "cef49691-326a-43d4-a7d5-28725aafc5b5"
+ },
+ "source": [
+ "## Count the distinct values (missing values included) held by each column\n",
+ "for column_name, column_values in df_anxiety_1.items():\n",
+ "    distinct_count = column_values.nunique(dropna=False)\n",
+ "    print(\"There are \", distinct_count, \"unique values in \", column_name)"
+ ],
+ "execution_count": 15,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "There are 4738 unique values in tweet.id\n",
+ "There are 4733 unique values in created_at\n",
+ "There are 4342 unique values in text\n",
+ "There are 1381 unique values in location\n",
+ "There are 33 unique values in retweet\n",
+ "There are 80 unique values in favorite\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "iSbEvJo0CVBh"
+ },
+ "source": [
+ "## \"#Suicide\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "ofqzhBcR1bj-"
+ },
+ "source": [
+ "## Create a function for tweets mining\n",
+ "def tweets_mining3(search_query3, num_tweets3, since_id_num3,\n",
+ "                   csv_path3='/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv'):\n",
+ "    \"\"\"Search tweets with the module-level `api` client and append them to a CSV file.\n",
+ "\n",
+ "    Args:\n",
+ "        search_query3: query string handed to the search endpoint.\n",
+ "        num_tweets3: upper bound on the number of tweets pulled from the cursor.\n",
+ "        since_id_num3: forwarded as `since_id` to the search call.\n",
+ "        csv_path3: CSV file the rows are appended to (defaults to the #suicide file).\n",
+ "\n",
+ "    Each appended row is\n",
+ "    [tweet id, created_at, full text, user location, retweet count, favorite count].\n",
+ "    \"\"\"\n",
+ "    # Collect tweets using the Cursor object\n",
+ "    # Each item in the iterator has various attributes that you can access to get information about each tweet\n",
+ "    tweet_list3 = list(tweepy.Cursor(api.search, q=search_query3, lang=\"en\", since_id=since_id_num3,\n",
+ "                                     tweet_mode='extended').items(num_tweets3))\n",
+ "    if not tweet_list3:\n",
+ "        return  # nothing to save: do not create or touch the CSV\n",
+ "\n",
+ "    # Open the file once for the whole batch instead of re-opening it for every tweet\n",
+ "    with open(csv_path3, 'a', newline='', encoding='utf-8') as csvFile3:\n",
+ "        csv_writer3 = csv.writer(csvFile3, delimiter=',')  # create an instance of csv object\n",
+ "        # Rows are written in reverse of the order the cursor returned them; the caller\n",
+ "        # reads the last row back as the resume point (presumably the newest tweet).\n",
+ "        for tweet in reversed(tweet_list3):\n",
+ "            csv_writer3.writerow([\n",
+ "                tweet.id,                # Tweet ID\n",
+ "                tweet.created_at,        # time the tweet was created\n",
+ "                tweet.full_text,         # full tweet text\n",
+ "                tweet.user.location,     # user location\n",
+ "                tweet.retweet_count,     # number of retweets\n",
+ "                tweet.favorite_count,    # number of likes\n",
+ "            ])"
+ ],
+ "execution_count": 10,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "_wIXzt57Cn3e"
+ },
+ "source": [
+ "search_words3 = \"#suicide\" # Specifying exact phrase to search\n",
+ "# Exclude Links, retweets, replies\n",
+ "search_query3 = search_words3 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n",
+ "\n",
+ "# Resume from the tweet ID stored in the last row of the CSV.\n",
+ "# On a first run the CSV is missing (or empty): fall back to since_id=None instead of crashing.\n",
+ "last_row = None\n",
+ "try:\n",
+ "    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv', newline='', encoding='utf-8') as data:\n",
+ "        for row in csv.reader(data):  # stream the file instead of loading every row into a list\n",
+ "            if row:  # ignore blank lines\n",
+ "                last_row = row\n",
+ "except FileNotFoundError:\n",
+ "    pass  # nothing has been collected yet\n",
+ "latest_tweet = int(last_row[0]) if last_row else None\n",
+ "tweets_mining3(search_query3, 10000, latest_tweet)"
+ ],
+ "execution_count": 11,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "XkfhTVodENiy"
+ },
+ "source": [
+ "# The CSV is stored without a header row, so the column names are supplied here\n",
+ "df_suicide_1 = pd.read_csv(\n",
+ "    \"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv\",\n",
+ "    header=None,\n",
+ "    names=[\"tweet.id\", \"created_at\", \"text\", \"location\", \"retweet\", \"favorite\"],\n",
+ ")"
+ ],
+ "execution_count": 12,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 581
+ },
+ "id": "8HAqIISVEXFy",
+ "outputId": "0667d586-2e25-4690-95b4-a86f748e9eae"
+ },
+ "source": [
+ "df_suicide_1"
+ ],
+ "execution_count": 13,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " tweet.id | \n",
+ " created_at | \n",
+ " text | \n",
+ " location | \n",
+ " retweet | \n",
+ " favorite | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1447381474034999296 | \n",
+ " 2021-10-11 02:00:09 | \n",
+ " #suicide is the strong belief that no matter h... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1447439429409415172 | \n",
+ " 2021-10-11 05:50:26 | \n",
+ " \"suicide\"\\nHollowness enough\\nSilence enough\\n... | \n",
+ " NaN | \n",
+ " 2 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1447444376464998400 | \n",
+ " 2021-10-11 06:10:06 | \n",
+ " Every year passes but the pain remains the sam... | \n",
+ " India | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1447445469467131906 | \n",
+ " 2021-10-11 06:14:26 | \n",
+ " Have I told you how much I hate my life😂😂😁 #su... | \n",
+ " Ohio, USA | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1447461306295013377 | \n",
+ " 2021-10-11 07:17:22 | \n",
+ " The man responsible for the #CDC policies that... | \n",
+ " United States | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 713 | \n",
+ " 1459446304577363971 | \n",
+ " 2021-11-13 09:01:28 | \n",
+ " Someone wanted me to tell you. You're beautifu... | \n",
+ " D(1) Florida | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 714 | \n",
+ " 1459454059975352320 | \n",
+ " 2021-11-13 09:32:17 | \n",
+ " It's a regular thing🙂💔\\n#Coimbatore #suicide #... | \n",
+ " Tiruppur, India | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 715 | \n",
+ " 1459454073644765185 | \n",
+ " 2021-11-13 09:32:21 | \n",
+ " #Suicide is not as bad as people make it \\n\\nB... | \n",
+ " The Chisolm Trail | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 716 | \n",
+ " 1459495548373934081 | \n",
+ " 2021-11-13 12:17:09 | \n",
+ " Just Uploaded My Review Of Dear Evan Hansen To... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 717 | \n",
+ " 1459525084180267027 | \n",
+ " 2021-11-13 14:14:31 | \n",
+ " On #WorldKindnessDay we would just like to say... | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
718 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " tweet.id created_at ... retweet favorite\n",
+ "0 1447381474034999296 2021-10-11 02:00:09 ... 0 0\n",
+ "1 1447439429409415172 2021-10-11 05:50:26 ... 2 2\n",
+ "2 1447444376464998400 2021-10-11 06:10:06 ... 0 0\n",
+ "3 1447445469467131906 2021-10-11 06:14:26 ... 0 1\n",
+ "4 1447461306295013377 2021-10-11 07:17:22 ... 1 2\n",
+ ".. ... ... ... ... ...\n",
+ "713 1459446304577363971 2021-11-13 09:01:28 ... 0 0\n",
+ "714 1459454059975352320 2021-11-13 09:32:17 ... 0 3\n",
+ "715 1459454073644765185 2021-11-13 09:32:21 ... 0 0\n",
+ "716 1459495548373934081 2021-11-13 12:17:09 ... 0 0\n",
+ "717 1459525084180267027 2021-11-13 14:14:31 ... 1 5\n",
+ "\n",
+ "[718 rows x 6 columns]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 13
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ZilsrGx9Ex2i"
+ },
+ "source": [
+ "## \"#hopelessness\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "mqFLOv-AE5Lw"
+ },
+ "source": [
+ "## Create a function for tweets mining\n",
+ "def tweets_mining4(search_query4, num_tweets4, since_id_num4,\n",
+ "                   csv_path4='/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv'):\n",
+ "    \"\"\"Search tweets with the module-level `api` client and append them to a CSV file.\n",
+ "\n",
+ "    Args:\n",
+ "        search_query4: query string handed to the search endpoint.\n",
+ "        num_tweets4: upper bound on the number of tweets pulled from the cursor.\n",
+ "        since_id_num4: forwarded as `since_id` to the search call.\n",
+ "        csv_path4: CSV file the rows are appended to (defaults to the #hopelessness file).\n",
+ "\n",
+ "    Each appended row is\n",
+ "    [tweet id, created_at, full text, user location, retweet count, favorite count].\n",
+ "    \"\"\"\n",
+ "    # Collect tweets using the Cursor object\n",
+ "    # Each item in the iterator has various attributes that you can access to get information about each tweet\n",
+ "    tweet_list4 = list(tweepy.Cursor(api.search, q=search_query4, lang=\"en\", since_id=since_id_num4,\n",
+ "                                     tweet_mode='extended').items(num_tweets4))\n",
+ "    if not tweet_list4:\n",
+ "        return  # nothing to save: do not create or touch the CSV\n",
+ "\n",
+ "    # Open the file once for the whole batch instead of re-opening it for every tweet\n",
+ "    with open(csv_path4, 'a', newline='', encoding='utf-8') as csvFile4:\n",
+ "        csv_writer4 = csv.writer(csvFile4, delimiter=',')  # create an instance of csv object\n",
+ "        # Rows are written in reverse of the order the cursor returned them; the caller\n",
+ "        # reads the last row back as the resume point (presumably the newest tweet).\n",
+ "        for tweet in reversed(tweet_list4):\n",
+ "            csv_writer4.writerow([\n",
+ "                tweet.id,                # Tweet ID\n",
+ "                tweet.created_at,        # time the tweet was created\n",
+ "                tweet.full_text,         # full tweet text\n",
+ "                tweet.user.location,     # user location\n",
+ "                tweet.retweet_count,     # number of retweets\n",
+ "                tweet.favorite_count,    # number of likes\n",
+ "            ])"
+ ],
+ "execution_count": 14,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "7Pf9avomE-G6"
+ },
+ "source": [
+ "search_words4 = \"#hopelessness\" # Specifying exact phrase to search\n",
+ "# Exclude Links, retweets, replies\n",
+ "search_query4 = search_words4 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n",
+ "\n",
+ "# Resume from the tweet ID stored in the last row of the CSV.\n",
+ "# On a first run the CSV is missing (or empty): fall back to since_id=None instead of crashing.\n",
+ "last_row = None\n",
+ "try:\n",
+ "    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv', newline='', encoding='utf-8') as data:\n",
+ "        for row in csv.reader(data):  # stream the file instead of loading every row into a list\n",
+ "            if row:  # ignore blank lines\n",
+ "                last_row = row\n",
+ "except FileNotFoundError:\n",
+ "    pass  # nothing has been collected yet\n",
+ "latest_tweet = int(last_row[0]) if last_row else None\n",
+ "tweets_mining4(search_query4, 10000, latest_tweet)"
+ ],
+ "execution_count": 15,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "pSauv_5jFAzX"
+ },
+ "source": [
+ "# The CSV is stored without a header row, so the column names are supplied here\n",
+ "df_hopeless_1 = pd.read_csv(\n",
+ "    \"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv\",\n",
+ "    header=None,\n",
+ "    names=[\"tweet.id\", \"created_at\", \"text\", \"location\", \"retweet\", \"favorite\"],\n",
+ ")"
+ ],
+ "execution_count": 16,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "id": "jFjXgpXDFwn1",
+ "outputId": "a063c672-3333-4e3b-c71e-c3329270854e"
+ },
+ "source": [
+ "df_hopeless_1"
+ ],
+ "execution_count": 17,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " tweet.id | \n",
+ " created_at | \n",
+ " text | \n",
+ " location | \n",
+ " retweet | \n",
+ " favorite | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1447537898572574730 | \n",
+ " 2021-10-11 12:21:43 | \n",
+ " Open discussion. Between the Transfer Portal a... | \n",
+ " Cheyenne Wyoming | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1447540582490988553 | \n",
+ " 2021-10-11 12:32:23 | \n",
+ " Plenty of things are changing in my life and t... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1447807717859491842 | \n",
+ " 2021-10-12 06:13:53 | \n",
+ " I feel a little hopeless. Anyone else? #hopele... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1448076026219692033 | \n",
+ " 2021-10-13 00:00:03 | \n",
+ " Which is more healthy? Hope, or hopelessness? ... | \n",
+ " Denver, CO | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1448382047375040513 | \n",
+ " 2021-10-13 20:16:04 | \n",
+ " So someone tell me how do I get over #HOPELESS... | \n",
+ " Portland Or . | \n",
+ " 0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 1448595145138622464 | \n",
+ " 2021-10-14 10:22:50 | \n",
+ " No parent deserves to experience the Indian le... | \n",
+ " Bombay, Dubai | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 1448843909841313793 | \n",
+ " 2021-10-15 02:51:20 | \n",
+ " Being in a #union also looks a lot like being ... | \n",
+ " Alberta, Canada | \n",
+ " 7 | \n",
+ " 17 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 1449848070783524864 | \n",
+ " 2021-10-17 21:21:31 | \n",
+ " I am so glad that @GreysABC is tackling the hu... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 1447537898572574730 | \n",
+ " 2021-10-11 12:21:43 | \n",
+ " Open discussion. Between the Transfer Portal a... | \n",
+ " Cheyenne Wyoming | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 1447540582490988553 | \n",
+ " 2021-10-11 12:32:23 | \n",
+ " Plenty of things are changing in my life and t... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 1447807717859491842 | \n",
+ " 2021-10-12 06:13:53 | \n",
+ " I feel a little hopeless. Anyone else? #hopele... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " 1448076026219692033 | \n",
+ " 2021-10-13 00:00:03 | \n",
+ " Which is more healthy? Hope, or hopelessness? ... | \n",
+ " Denver, CO | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " 1448382047375040513 | \n",
+ " 2021-10-13 20:16:04 | \n",
+ " So someone tell me how do I get over #HOPELESS... | \n",
+ " Portland Or . | \n",
+ " 0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " 1448595145138622464 | \n",
+ " 2021-10-14 10:22:50 | \n",
+ " No parent deserves to experience the Indian le... | \n",
+ " Bombay, Dubai | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " 1448843909841313793 | \n",
+ " 2021-10-15 02:51:20 | \n",
+ " Being in a #union also looks a lot like being ... | \n",
+ " Alberta, Canada | \n",
+ " 7 | \n",
+ " 17 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " 1449848070783524864 | \n",
+ " 2021-10-17 21:21:31 | \n",
+ " I am so glad that @GreysABC is tackling the hu... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " 1447537898572574730 | \n",
+ " 2021-10-11 12:21:43 | \n",
+ " Open discussion. Between the Transfer Portal a... | \n",
+ " Cheyenne Wyoming | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " 1447540582490988553 | \n",
+ " 2021-10-11 12:32:23 | \n",
+ " Plenty of things are changing in my life and t... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " 1447807717859491842 | \n",
+ " 2021-10-12 06:13:53 | \n",
+ " I feel a little hopeless. Anyone else? #hopele... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " 1448076026219692033 | \n",
+ " 2021-10-13 00:00:03 | \n",
+ " Which is more healthy? Hope, or hopelessness? ... | \n",
+ " Denver, CO | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " 1448382047375040513 | \n",
+ " 2021-10-13 20:16:04 | \n",
+ " So someone tell me how do I get over #HOPELESS... | \n",
+ " Portland Or . | \n",
+ " 0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " 1448595145138622464 | \n",
+ " 2021-10-14 10:22:50 | \n",
+ " No parent deserves to experience the Indian le... | \n",
+ " Bombay, Dubai | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " 1448843909841313793 | \n",
+ " 2021-10-15 02:51:20 | \n",
+ " Being in a #union also looks a lot like being ... | \n",
+ " Alberta, Canada | \n",
+ " 7 | \n",
+ " 17 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " 1449848070783524864 | \n",
+ " 2021-10-17 21:21:31 | \n",
+ " I am so glad that @GreysABC is tackling the hu... | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " 1451858330591318022 | \n",
+ " 2021-10-23 10:29:34 | \n",
+ " If you know someone who’s depressed please res... | \n",
+ " Rwanda | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 25 | \n",
+ " 1453499016394723330 | \n",
+ " 2021-10-27 23:09:04 | \n",
+ " A #grateful #heart will #SeeGod. You will find... | \n",
+ " Berlin, NJ | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 26 | \n",
+ " 1453738324598865920 | \n",
+ " 2021-10-28 15:00:00 | \n",
+ " “Our world today so desperately hungers for ho... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 27 | \n",
+ " 1453745900996726785 | \n",
+ " 2021-10-28 15:30:06 | \n",
+ " Depression is a bitch that is difficult for me... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 28 | \n",
+ " 1454441137951821824 | \n",
+ " 2021-10-30 13:32:44 | \n",
+ " Add to this list #whatsincreased \\n#petrol\\n#d... | \n",
+ " New Delhi, India | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 29 | \n",
+ " 1456980506160025604 | \n",
+ " 2021-11-06 13:43:16 | \n",
+ " \"Hopelessness has surprised me with patience.\"... | \n",
+ " Planet Earth | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 30 | \n",
+ " 1457005145510797315 | \n",
+ " 2021-11-06 15:21:11 | \n",
+ " “Go if you have to, but remember, don’t come b... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 31 | \n",
+ " 1457192619184902147 | \n",
+ " 2021-11-07 03:46:08 | \n",
+ " Hey @Headspace, I need to believe in something... | \n",
+ " Santo Mondongo | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 32 | \n",
+ " 1458953923151212548 | \n",
+ " 2021-11-12 00:24:55 | \n",
+ " 2 years ago I attempted #suicide to escape #do... | \n",
+ " Carpentersville, IL | \n",
+ " 0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 33 | \n",
+ " 1459449269140787202 | \n",
+ " 2021-11-13 09:13:15 | \n",
+ " WARNING: Being deprived of God’s joy will lead... | \n",
+ " United States | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " tweet.id created_at ... retweet favorite\n",
+ "0 1447537898572574730 2021-10-11 12:21:43 ... 0 0\n",
+ "1 1447540582490988553 2021-10-11 12:32:23 ... 0 0\n",
+ "2 1447807717859491842 2021-10-12 06:13:53 ... 0 0\n",
+ "3 1448076026219692033 2021-10-13 00:00:03 ... 0 0\n",
+ "4 1448382047375040513 2021-10-13 20:16:04 ... 0 2\n",
+ "5 1448595145138622464 2021-10-14 10:22:50 ... 1 4\n",
+ "6 1448843909841313793 2021-10-15 02:51:20 ... 7 17\n",
+ "7 1449848070783524864 2021-10-17 21:21:31 ... 0 1\n",
+ "8 1447537898572574730 2021-10-11 12:21:43 ... 0 0\n",
+ "9 1447540582490988553 2021-10-11 12:32:23 ... 0 0\n",
+ "10 1447807717859491842 2021-10-12 06:13:53 ... 0 0\n",
+ "11 1448076026219692033 2021-10-13 00:00:03 ... 0 0\n",
+ "12 1448382047375040513 2021-10-13 20:16:04 ... 0 2\n",
+ "13 1448595145138622464 2021-10-14 10:22:50 ... 1 4\n",
+ "14 1448843909841313793 2021-10-15 02:51:20 ... 7 17\n",
+ "15 1449848070783524864 2021-10-17 21:21:31 ... 0 1\n",
+ "16 1447537898572574730 2021-10-11 12:21:43 ... 0 0\n",
+ "17 1447540582490988553 2021-10-11 12:32:23 ... 0 0\n",
+ "18 1447807717859491842 2021-10-12 06:13:53 ... 0 0\n",
+ "19 1448076026219692033 2021-10-13 00:00:03 ... 0 0\n",
+ "20 1448382047375040513 2021-10-13 20:16:04 ... 0 2\n",
+ "21 1448595145138622464 2021-10-14 10:22:50 ... 1 4\n",
+ "22 1448843909841313793 2021-10-15 02:51:20 ... 7 17\n",
+ "23 1449848070783524864 2021-10-17 21:21:31 ... 1 2\n",
+ "24 1451858330591318022 2021-10-23 10:29:34 ... 0 1\n",
+ "25 1453499016394723330 2021-10-27 23:09:04 ... 0 1\n",
+ "26 1453738324598865920 2021-10-28 15:00:00 ... 0 0\n",
+ "27 1453745900996726785 2021-10-28 15:30:06 ... 0 3\n",
+ "28 1454441137951821824 2021-10-30 13:32:44 ... 0 0\n",
+ "29 1456980506160025604 2021-11-06 13:43:16 ... 0 0\n",
+ "30 1457005145510797315 2021-11-06 15:21:11 ... 0 0\n",
+ "31 1457192619184902147 2021-11-07 03:46:08 ... 0 0\n",
+ "32 1458953923151212548 2021-11-12 00:24:55 ... 0 2\n",
+ "33 1459449269140787202 2021-11-13 09:13:15 ... 0 0\n",
+ "\n",
+ "[34 rows x 6 columns]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 17
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "zsX2-S8vGGh8"
+ },
+ "source": [
+ "## \"#mentalhealth\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "gdvSCV-oGOP8"
+ },
+ "source": [
+ "## Create a function for tweets mining\n",
+ "def tweets_mining5(search_query5, num_tweets5, since_id_num5,\n",
+ "                   csv_path='/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv'):\n",
+ "  \"\"\"Search Twitter for tweets matching a query and append them to a CSV file.\n",
+ "\n",
+ "  Args:\n",
+ "    search_query5: Query string sent to the search endpoint as `q`.\n",
+ "    num_tweets5: Maximum number of tweets to take from the cursor.\n",
+ "    since_id_num5: Value forwarded to the search endpoint as `since_id`.\n",
+ "    csv_path: File the rows are appended to. Defaults to the path this\n",
+ "      function originally hard-coded, so existing calls behave as before.\n",
+ "\n",
+ "  Returns:\n",
+ "    None. Each tweet becomes one row: id, created_at, full_text,\n",
+ "    user.location, retweet_count, favorite_count.\n",
+ "  \"\"\"\n",
+ "  # Drain the cursor into a list so the results can be written in reverse order\n",
+ "  tweet_list5 = list(tweepy.Cursor(api.search, q=search_query5, lang=\"en\", since_id=since_id_num5,\n",
+ "                                   tweet_mode='extended').items(num_tweets5))\n",
+ "  if not tweet_list5:\n",
+ "    return  # no results: do not open (or create) the file, same as before\n",
+ "\n",
+ "  # Open the file once for the whole batch instead of re-opening it for every tweet\n",
+ "  with open(csv_path, 'a', newline='', encoding='utf-8') as csvFile5:\n",
+ "    csv_writer5 = csv.writer(csvFile5, delimiter=',')\n",
+ "    # Rows are written in reverse of the order the API returned them\n",
+ "    # (presumably so the newest tweet ends up on the last line - TODO confirm)\n",
+ "    csv_writer5.writerows(\n",
+ "        [tweet.id, tweet.created_at, tweet.full_text, tweet.user.location,\n",
+ "         tweet.retweet_count, tweet.favorite_count]\n",
+ "        for tweet in tweet_list5[::-1])"
+ ],
+ "execution_count": 18,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "Euoe88tsGdkc"
+ },
+ "source": [
+ "search_words5 = \"#mentalhealth\" # Hashtag to search for\n",
+ "# Exclude links, retweets, replies\n",
+ "search_query5 = search_words5 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n",
+ "# Take since_id from the first field of the last non-empty row already in the CSV.\n",
+ "# The file is streamed row by row rather than loaded into a list, and opened with\n",
+ "# newline='' as the csv module asks for when fields may contain line breaks.\n",
+ "latest_tweet = None\n",
+ "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv', newline='', encoding='utf-8') as data:\n",
+ "  for row in csv.reader(data):\n",
+ "    if row:  # csv.reader yields [] for blank lines\n",
+ "      latest_tweet = int(row[0])\n",
+ "# If the file holds no rows latest_tweet stays None (previously an IndexError);\n",
+ "# presumably Tweepy then leaves since_id out of the request - TODO confirm\n",
+ "tweets_mining5(search_query5, 1000, latest_tweet)"
+ ],
+ "execution_count": 19,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "s8rbK0pOGu80"
+ },
+ "source": [
+ "# names= supplies the six column labels, so pandas treats the first line of the file as data\n",
+ "df_mentalhealth_1 = pd.read_csv(\n",
+ "    \"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv\",\n",
+ "    names=[\n",
+ "        'tweet.id',\n",
+ "        \"created_at\",\n",
+ "        \"text\",\n",
+ "        \"location\",\n",
+ "        \"retweet\",\n",
+ "        \"favorite\",\n",
+ "    ],\n",
+ ")"
+ ],
+ "execution_count": 20,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 581
+ },
+ "id": "CpmrexYEH9ii",
+ "outputId": "0b26846b-b32d-44ea-8612-4cfb551bb444"
+ },
+ "source": [
+ "df_mentalhealth_1"
+ ],
+ "execution_count": 21,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " tweet.id | \n",
+ " created_at | \n",
+ " text | \n",
+ " location | \n",
+ " retweet | \n",
+ " favorite | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1449685870945185792 | \n",
+ " 2021-10-17 10:37:00 | \n",
+ " Sunday's goals. \\n1. Take meds\\n2. Drink 3 lit... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1449686119658840065 | \n",
+ " 2021-10-17 10:37:59 | \n",
+ " \"????\" #Mentalhealth\\n\\ni'm tired of fighting... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1449686255185321986 | \n",
+ " 2021-10-17 10:38:31 | \n",
+ " Surrounded by people but feeling so alone 😔 \\n... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1449686716168671232 | \n",
+ " 2021-10-17 10:40:21 | \n",
+ " I understand my dv worker has emergencies but ... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1449687397776592898 | \n",
+ " 2021-10-17 10:43:04 | \n",
+ " Struggling to get out of bed and do things tha... | \n",
+ " England, United Kingdom | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 6592 | \n",
+ " 1459531596009283600 | \n",
+ " 2021-11-13 14:40:23 | \n",
+ " Let’s make good choices today friends!!! ❤️ #R... | \n",
+ " Florida, USA | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 6593 | \n",
+ " 1459532754387976200 | \n",
+ " 2021-11-13 14:45:00 | \n",
+ " Oh it’s a dark joke when I say I wanna bedazzl... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 6594 | \n",
+ " 1459532763942604800 | \n",
+ " 2021-11-13 14:45:02 | \n",
+ " I discovered today that clothes shopping is a ... | \n",
+ " England, United Kingdom | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 6595 | \n",
+ " 1459532906074935304 | \n",
+ " 2021-11-13 14:45:36 | \n",
+ " We composed a tweet thread about our college's... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 6596 | \n",
+ " 1459533316428754950 | \n",
+ " 2021-11-13 14:47:14 | \n",
+ " feels awkward at 1st but don’t know how i feel... | \n",
+ " Anaheim, CA | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
6597 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " tweet.id created_at ... retweet favorite\n",
+ "0 1449685870945185792 2021-10-17 10:37:00 ... 0 1\n",
+ "1 1449686119658840065 2021-10-17 10:37:59 ... 0 0\n",
+ "2 1449686255185321986 2021-10-17 10:38:31 ... 0 1\n",
+ "3 1449686716168671232 2021-10-17 10:40:21 ... 0 0\n",
+ "4 1449687397776592898 2021-10-17 10:43:04 ... 0 0\n",
+ "... ... ... ... ... ...\n",
+ "6592 1459531596009283600 2021-11-13 14:40:23 ... 0 1\n",
+ "6593 1459532754387976200 2021-11-13 14:45:00 ... 0 1\n",
+ "6594 1459532763942604800 2021-11-13 14:45:02 ... 0 1\n",
+ "6595 1459532906074935304 2021-11-13 14:45:36 ... 0 1\n",
+ "6596 1459533316428754950 2021-11-13 14:47:14 ... 0 1\n",
+ "\n",
+ "[6597 rows x 6 columns]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 21
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Jwcc9Bwdx0ie"
+ },
+ "source": [
+ "## \"#loneliness\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "tfu8ca0Wx1m9"
+ },
+ "source": [
+ "## Create a function for tweets mining\n",
+ "def tweets_mining6(search_query6, num_tweets6, since_id_num6,\n",
+ "                   csv_path='/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv'):\n",
+ "  \"\"\"Search Twitter for tweets matching a query and append them to a CSV file.\n",
+ "\n",
+ "  Args:\n",
+ "    search_query6: Query string sent to the search endpoint as `q`.\n",
+ "    num_tweets6: Maximum number of tweets to take from the cursor.\n",
+ "    since_id_num6: Value forwarded to the search endpoint as `since_id`.\n",
+ "    csv_path: File the rows are appended to. Defaults to the path this\n",
+ "      function originally hard-coded, so existing calls behave as before.\n",
+ "\n",
+ "  Returns:\n",
+ "    None. Each tweet becomes one row: id, created_at, full_text,\n",
+ "    user.location, retweet_count, favorite_count.\n",
+ "  \"\"\"\n",
+ "  # Drain the cursor into a list so the results can be written in reverse order\n",
+ "  tweet_list6 = list(tweepy.Cursor(api.search, q=search_query6, lang=\"en\", since_id=since_id_num6,\n",
+ "                                   tweet_mode='extended').items(num_tweets6))\n",
+ "  if not tweet_list6:\n",
+ "    return  # no results: do not open (or create) the file, same as before\n",
+ "\n",
+ "  # Open the file once for the whole batch instead of re-opening it for every tweet\n",
+ "  with open(csv_path, 'a', newline='', encoding='utf-8') as csvFile6:\n",
+ "    csv_writer6 = csv.writer(csvFile6, delimiter=',')\n",
+ "    # Rows are written in reverse of the order the API returned them\n",
+ "    # (presumably so the newest tweet ends up on the last line - TODO confirm)\n",
+ "    csv_writer6.writerows(\n",
+ "        [tweet.id, tweet.created_at, tweet.full_text, tweet.user.location,\n",
+ "         tweet.retweet_count, tweet.favorite_count]\n",
+ "        for tweet in tweet_list6[::-1])"
+ ],
+ "execution_count": 22,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "veyW6kE7z5A0"
+ },
+ "source": [
+ "search_words6 = \"#loneliness\" # Hashtag to search for\n",
+ "# Exclude links, retweets, replies\n",
+ "search_query6 = search_words6 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n",
+ "# Take since_id from the first field of the last non-empty row already in the CSV.\n",
+ "# The file is streamed row by row rather than loaded into a list, and opened with\n",
+ "# newline='' as the csv module asks for when fields may contain line breaks.\n",
+ "latest_tweet = None\n",
+ "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv', newline='', encoding='utf-8') as data:\n",
+ "  for row in csv.reader(data):\n",
+ "    if row:  # csv.reader yields [] for blank lines\n",
+ "      latest_tweet = int(row[0])\n",
+ "# If the file holds no rows latest_tweet stays None (previously an IndexError);\n",
+ "# presumably Tweepy then leaves since_id out of the request - TODO confirm\n",
+ "tweets_mining6(search_query6, 10000, latest_tweet)"
+ ],
+ "execution_count": 23,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "bggxtMrn0EGM"
+ },
+ "source": [
+ "# names= supplies the six column labels, so pandas treats the first line of the file as data\n",
+ "df_loneliness_1 = pd.read_csv(\n",
+ "    \"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv\",\n",
+ "    names=[\n",
+ "        'tweet.id',\n",
+ "        \"created_at\",\n",
+ "        \"text\",\n",
+ "        \"location\",\n",
+ "        \"retweet\",\n",
+ "        \"favorite\",\n",
+ "    ],\n",
+ ")"
+ ],
+ "execution_count": 24,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 581
+ },
+ "id": "SlXTyO6d0KrH",
+ "outputId": "a8a7127b-34e5-437e-effd-a1364ff5bad5"
+ },
+ "source": [
+ "df_loneliness_1"
+ ],
+ "execution_count": 25,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " tweet.id | \n",
+ " created_at | \n",
+ " text | \n",
+ " location | \n",
+ " retweet | \n",
+ " favorite | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1447444376464998400 | \n",
+ " 2021-10-11 06:10:06 | \n",
+ " Every year passes but the pain remains the sam... | \n",
+ " India | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1447517473679441921 | \n",
+ " 2021-10-11 11:00:33 | \n",
+ " In this life, I can't expect things to be in m... | \n",
+ " Davao Region | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1447540227422162949 | \n",
+ " 2021-10-11 12:30:58 | \n",
+ " holidays can bring on a sense of loss - of fam... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1447564113928863744 | \n",
+ " 2021-10-11 14:05:53 | \n",
+ " Must be good to have someone by your side. #Lo... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1447599325304000515 | \n",
+ " 2021-10-11 16:25:48 | \n",
+ " #Artists without an air of #loneliness , are #... | \n",
+ " Sulaimanyah, Kurdistan | \n",
+ " 0 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 306 | \n",
+ " 1459371193283362820 | \n",
+ " 2021-11-13 04:03:00 | \n",
+ " I want someone who loves to take nighttime dri... | \n",
+ " North Carolina, USA | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 307 | \n",
+ " 1459473286836989959 | \n",
+ " 2021-11-13 10:48:41 | \n",
+ " I have apparently reached the point of #autist... | \n",
+ " South West, England | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 308 | \n",
+ " 1459491234473553921 | \n",
+ " 2021-11-13 12:00:00 | \n",
+ " Give us a call. Need any advice with #covid19 ... | \n",
+ " Dublin City, Ireland | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 309 | \n",
+ " 1459495762908401664 | \n",
+ " 2021-11-13 12:18:00 | \n",
+ " fob lyrics trying so hard to be someone you’re... | \n",
+ " she/they • 18 • scorpio | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 310 | \n",
+ " 1459513880527441920 | \n",
+ " 2021-11-13 13:30:00 | \n",
+ " Give us a call. Need any advice with #covid19 ... | \n",
+ " Dublin City, Ireland | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
311 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " tweet.id created_at ... retweet favorite\n",
+ "0 1447444376464998400 2021-10-11 06:10:06 ... 0 0\n",
+ "1 1447517473679441921 2021-10-11 11:00:33 ... 0 0\n",
+ "2 1447540227422162949 2021-10-11 12:30:58 ... 0 0\n",
+ "3 1447564113928863744 2021-10-11 14:05:53 ... 0 0\n",
+ "4 1447599325304000515 2021-10-11 16:25:48 ... 0 5\n",
+ ".. ... ... ... ... ...\n",
+ "306 1459371193283362820 2021-11-13 04:03:00 ... 0 0\n",
+ "307 1459473286836989959 2021-11-13 10:48:41 ... 0 1\n",
+ "308 1459491234473553921 2021-11-13 12:00:00 ... 1 1\n",
+ "309 1459495762908401664 2021-11-13 12:18:00 ... 0 1\n",
+ "310 1459513880527441920 2021-11-13 13:30:00 ... 0 0\n",
+ "\n",
+ "[311 rows x 6 columns]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 25
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "QnHoDxZ70SnD"
+ },
+ "source": [
+ "## \"#itsokaynottobeokay\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "WtQHpt-c0Te1"
+ },
+ "source": [
+ "## Create a function for tweets mining\n",
+ "def tweets_mining7(search_query7, num_tweets7, since_id_num7,\n",
+ "                   csv_path='/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv'):\n",
+ "  \"\"\"Search Twitter for tweets matching a query and append them to a CSV file.\n",
+ "\n",
+ "  Args:\n",
+ "    search_query7: Query string sent to the search endpoint as `q`.\n",
+ "    num_tweets7: Maximum number of tweets to take from the cursor.\n",
+ "    since_id_num7: Value forwarded to the search endpoint as `since_id`.\n",
+ "    csv_path: File the rows are appended to. Defaults to the path this\n",
+ "      function originally hard-coded, so existing calls behave as before.\n",
+ "\n",
+ "  Returns:\n",
+ "    None. Each tweet becomes one row: id, created_at, full_text,\n",
+ "    user.location, retweet_count, favorite_count.\n",
+ "  \"\"\"\n",
+ "  # Drain the cursor into a list so the results can be written in reverse order\n",
+ "  tweet_list7 = list(tweepy.Cursor(api.search, q=search_query7, lang=\"en\", since_id=since_id_num7,\n",
+ "                                   tweet_mode='extended').items(num_tweets7))\n",
+ "  if not tweet_list7:\n",
+ "    return  # no results: do not open (or create) the file, same as before\n",
+ "\n",
+ "  # Open the file once for the whole batch instead of re-opening it for every tweet\n",
+ "  with open(csv_path, 'a', newline='', encoding='utf-8') as csvFile7:\n",
+ "    csv_writer7 = csv.writer(csvFile7, delimiter=',')\n",
+ "    # Rows are written in reverse of the order the API returned them\n",
+ "    # (presumably so the newest tweet ends up on the last line - TODO confirm)\n",
+ "    csv_writer7.writerows(\n",
+ "        [tweet.id, tweet.created_at, tweet.full_text, tweet.user.location,\n",
+ "         tweet.retweet_count, tweet.favorite_count]\n",
+ "        for tweet in tweet_list7[::-1])"
+ ],
+ "execution_count": 26,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "TP-dBQTL1vkD"
+ },
+ "source": [
+ "search_words7 = \"#itsokaynottobeokay\" # Hashtag to search for\n",
+ "# Exclude links, retweets, replies\n",
+ "search_query7 = search_words7 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n",
+ "# Take since_id from the first field of the last non-empty row already in the CSV.\n",
+ "# The file is streamed row by row rather than loaded into a list, and opened with\n",
+ "# newline='' as the csv module asks for when fields may contain line breaks.\n",
+ "latest_tweet = None\n",
+ "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv', newline='', encoding='utf-8') as data:\n",
+ "  for row in csv.reader(data):\n",
+ "    if row:  # csv.reader yields [] for blank lines\n",
+ "      latest_tweet = int(row[0])\n",
+ "# If the file holds no rows latest_tweet stays None (previously an IndexError);\n",
+ "# presumably Tweepy then leaves since_id out of the request - TODO confirm\n",
+ "tweets_mining7(search_query7, 2000, latest_tweet)"
+ ],
+ "execution_count": 27,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "IEyjMy_B2hc7"
+ },
+ "source": [
+ "# names= supplies the six column labels, so pandas treats the first line of the file as data\n",
+ "df_itsok_1 = pd.read_csv(\n",
+ "    \"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv\",\n",
+ "    names=[\n",
+ "        'tweet.id',\n",
+ "        \"created_at\",\n",
+ "        \"text\",\n",
+ "        \"location\",\n",
+ "        \"retweet\",\n",
+ "        \"favorite\",\n",
+ "    ],\n",
+ ")"
+ ],
+ "execution_count": 28,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 581
+ },
+ "id": "GD5zNft02yGK",
+ "outputId": "22900167-41bb-4c8a-ca74-80db5d1a70e5"
+ },
+ "source": [
+ "df_itsok_1"
+ ],
+ "execution_count": 29,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " tweet.id | \n",
+ " created_at | \n",
+ " text | \n",
+ " location | \n",
+ " retweet | \n",
+ " favorite | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1447444376464998400 | \n",
+ " 2021-10-11 06:10:06 | \n",
+ " Every year passes but the pain remains the sam... | \n",
+ " India | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1447517473679441921 | \n",
+ " 2021-10-11 11:00:33 | \n",
+ " In this life, I can't expect things to be in m... | \n",
+ " Davao Region | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1447540227422162949 | \n",
+ " 2021-10-11 12:30:58 | \n",
+ " holidays can bring on a sense of loss - of fam... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1447564113928863744 | \n",
+ " 2021-10-11 14:05:53 | \n",
+ " Must be good to have someone by your side. #Lo... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1447599325304000515 | \n",
+ " 2021-10-11 16:25:48 | \n",
+ " #Artists without an air of #loneliness , are #... | \n",
+ " Sulaimanyah, Kurdistan | \n",
+ " 0 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 160 | \n",
+ " 1459084076250546178 | \n",
+ " 2021-11-12 09:02:06 | \n",
+ " Every problem has a solution if you don’t know... | \n",
+ " South East, England | \n",
+ " 0 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " | 161 | \n",
+ " 1459236894219325441 | \n",
+ " 2021-11-12 19:09:21 | \n",
+ " I'm loving @calumscott new song, definitely me... | \n",
+ " Wrexham, Wales | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 162 | \n",
+ " 1459270946485719041 | \n",
+ " 2021-11-12 21:24:40 | \n",
+ " You ever stop to acknowledge : would you look... | \n",
+ " United States | \n",
+ " 0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 163 | \n",
+ " 1459429100180111361 | \n",
+ " 2021-11-13 07:53:07 | \n",
+ " i became teume bcoz of “ #itsokaynottobeokay ”... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 164 | \n",
+ " 1459458776092786694 | \n",
+ " 2021-11-13 09:51:02 | \n",
+ " I don't usually do this but I just want to tha... | \n",
+ " Leicester, England | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
165 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " tweet.id created_at ... retweet favorite\n",
+ "0 1447444376464998400 2021-10-11 06:10:06 ... 0 0\n",
+ "1 1447517473679441921 2021-10-11 11:00:33 ... 0 0\n",
+ "2 1447540227422162949 2021-10-11 12:30:58 ... 0 0\n",
+ "3 1447564113928863744 2021-10-11 14:05:53 ... 0 0\n",
+ "4 1447599325304000515 2021-10-11 16:25:48 ... 0 5\n",
+ ".. ... ... ... ... ...\n",
+ "160 1459084076250546178 2021-11-12 09:02:06 ... 0 10\n",
+ "161 1459236894219325441 2021-11-12 19:09:21 ... 0 3\n",
+ "162 1459270946485719041 2021-11-12 21:24:40 ... 0 2\n",
+ "163 1459429100180111361 2021-11-13 07:53:07 ... 0 0\n",
+ "164 1459458776092786694 2021-11-13 09:51:02 ... 0 1\n",
+ "\n",
+ "[165 rows x 6 columns]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 29
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-RXWp6HY44nN"
+ },
+ "source": [
+ "## \"#depression\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "pbZltJ-k45d5"
+ },
+ "source": [
+ "## Create a function for tweets mining\n",
+ "def tweets_mining8(search_query8, num_tweets8, since_id_num8,\n",
+ "                   csv_path='/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv'):\n",
+ "  \"\"\"Search Twitter for tweets matching a query and append them to a CSV file.\n",
+ "\n",
+ "  Args:\n",
+ "    search_query8: Query string sent to the search endpoint as `q`.\n",
+ "    num_tweets8: Maximum number of tweets to take from the cursor.\n",
+ "    since_id_num8: Value forwarded to the search endpoint as `since_id`.\n",
+ "    csv_path: File the rows are appended to. Defaults to the path this\n",
+ "      function originally hard-coded, so existing calls behave as before.\n",
+ "\n",
+ "  Returns:\n",
+ "    None. Each tweet becomes one row: id, created_at, full_text,\n",
+ "    user.location, retweet_count, favorite_count.\n",
+ "  \"\"\"\n",
+ "  # Drain the cursor into a list so the results can be written in reverse order\n",
+ "  tweet_list8 = list(tweepy.Cursor(api.search, q=search_query8, lang=\"en\", since_id=since_id_num8,\n",
+ "                                   tweet_mode='extended').items(num_tweets8))\n",
+ "  if not tweet_list8:\n",
+ "    return  # no results: do not open (or create) the file, same as before\n",
+ "\n",
+ "  # Open the file once for the whole batch instead of re-opening it for every tweet\n",
+ "  with open(csv_path, 'a', newline='', encoding='utf-8') as csvFile8:\n",
+ "    csv_writer8 = csv.writer(csvFile8, delimiter=',')\n",
+ "    # Rows are written in reverse of the order the API returned them\n",
+ "    # (presumably so the newest tweet ends up on the last line - TODO confirm)\n",
+ "    csv_writer8.writerows(\n",
+ "        [tweet.id, tweet.created_at, tweet.full_text, tweet.user.location,\n",
+ "         tweet.retweet_count, tweet.favorite_count]\n",
+ "        for tweet in tweet_list8[::-1])"
+ ],
+ "execution_count": 30,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "ghHhnfIO5xMg"
+ },
+ "source": [
+ "search_words8 = \"#depression\" # Hashtag to search for\n",
+ "# Exclude links, retweets, replies\n",
+ "search_query8 = search_words8 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n",
+ "# Take since_id from the first field of the last non-empty row already in the CSV.\n",
+ "# The file is streamed row by row rather than loaded into a list, and opened with\n",
+ "# newline='' as the csv module asks for when fields may contain line breaks.\n",
+ "latest_tweet = None\n",
+ "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv', newline='', encoding='utf-8') as data:\n",
+ "  for row in csv.reader(data):\n",
+ "    if row:  # csv.reader yields [] for blank lines\n",
+ "      latest_tweet = int(row[0])\n",
+ "# If the file holds no rows latest_tweet stays None (previously an IndexError);\n",
+ "# presumably Tweepy then leaves since_id out of the request - TODO confirm\n",
+ "tweets_mining8(search_query8, 1000, latest_tweet)"
+ ],
+ "execution_count": 31,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "2tZbCrCQ6BKL"
+ },
+ "source": [
+ "# names= supplies the six column labels, so pandas treats the first line of the file as data\n",
+ "df_depression_1 = pd.read_csv(\n",
+ "    \"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv\",\n",
+ "    names=[\n",
+ "        'tweet.id',\n",
+ "        \"created_at\",\n",
+ "        \"text\",\n",
+ "        \"location\",\n",
+ "        \"retweet\",\n",
+ "        \"favorite\",\n",
+ "    ],\n",
+ ")"
+ ],
+ "execution_count": 32,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 581
+ },
+ "id": "9vYE-YWt6hsd",
+ "outputId": "172ccd9a-eb04-4617-eb09-b7bb421126c9"
+ },
+ "source": [
+ "df_depression_1"
+ ],
+ "execution_count": 33,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " tweet.id | \n",
+ " created_at | \n",
+ " text | \n",
+ " location | \n",
+ " retweet | \n",
+ " favorite | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1447381882828623879 | \n",
+ " 2021-10-11 02:01:46 | \n",
+ " #letstalk many suffering from #depression and ... | \n",
+ " Chicago, IL | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1447387707362131970 | \n",
+ " 2021-10-11 02:24:55 | \n",
+ " #Harassmentatwork can lead to debilitating men... | \n",
+ " Lahore | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1447396592877805570 | \n",
+ " 2021-10-11 03:00:13 | \n",
+ " So . . . my #therapist called my wife and told... | \n",
+ " If it makes a difference, ask. | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1447398472735342600 | \n",
+ " 2021-10-11 03:07:41 | \n",
+ " #psychology #love #mentalhealth #therapy #heal... | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1447400177510146062 | \n",
+ " 2021-10-11 03:14:28 | \n",
+ " #psychology #love #mentalhealth #therapy #heal... | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 4478 | \n",
+ " 1459517445736124420 | \n",
+ " 2021-11-13 13:44:10 | \n",
+ " I've literally cried atleast once a day for th... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4479 | \n",
+ " 1459521433193877511 | \n",
+ " 2021-11-13 14:00:00 | \n",
+ " Black cohosh (Cimicifuga racemosa) is a partic... | \n",
+ " Global | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 4480 | \n",
+ " 1459527712775847936 | \n",
+ " 2021-11-13 14:24:58 | \n",
+ " I mention therapy to him today, his response \"... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 4481 | \n",
+ " 1459531002276192263 | \n",
+ " 2021-11-13 14:38:02 | \n",
+ " Finna go to dollar tree and get some organizin... | \n",
+ " Dallas Texas, USA | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4482 | \n",
+ " 1459532763942604800 | \n",
+ " 2021-11-13 14:45:02 | \n",
+ " I discovered today that clothes shopping is a ... | \n",
+ " England, United Kingdom | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
4483 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " tweet.id created_at ... retweet favorite\n",
+ "0 1447381882828623879 2021-10-11 02:01:46 ... 0 0\n",
+ "1 1447387707362131970 2021-10-11 02:24:55 ... 1 1\n",
+ "2 1447396592877805570 2021-10-11 03:00:13 ... 0 0\n",
+ "3 1447398472735342600 2021-10-11 03:07:41 ... 1 0\n",
+ "4 1447400177510146062 2021-10-11 03:14:28 ... 1 4\n",
+ "... ... ... ... ... ...\n",
+ "4478 1459517445736124420 2021-11-13 13:44:10 ... 0 0\n",
+ "4479 1459521433193877511 2021-11-13 14:00:00 ... 1 1\n",
+ "4480 1459527712775847936 2021-11-13 14:24:58 ... 0 1\n",
+ "4481 1459531002276192263 2021-11-13 14:38:02 ... 0 0\n",
+ "4482 1459532763942604800 2021-11-13 14:45:02 ... 0 1\n",
+ "\n",
+ "[4483 rows x 6 columns]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 33
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "iaBSFYwsUPaI",
+ "outputId": "7b2a0935-671f-4d94-d364-5dc7a7134e12"
+ },
+ "source": [
+ "## Count the distinct values held in every column of df_depression_1\n",
+ "for col in df_depression_1.columns:\n",
+ "  # dropna=False keeps NaN in the count, the same as len(Series.unique())\n",
+ "  n_unique = df_depression_1[col].nunique(dropna=False)\n",
+ "  print(\"There are \", n_unique, \"unique values in \", col)"
+ ],
+ "execution_count": 14,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "There are 3185 unique values in tweet.id\n",
+ "There are 3182 unique values in created_at\n",
+ "There are 2818 unique values in text\n",
+ "There are 939 unique values in location\n",
+ "There are 23 unique values in retweet\n",
+ "There are 59 unique values in favorite\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "N2ZER9SmTPzF"
+ },
+ "source": [
+ "## \"#sad\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "EWSDmH8s6iuZ"
+ },
+ "source": [
+ "## Create a function for tweets mining\n",
+ "def tweets_mining9(search_query9, num_tweets9, since_id_num9,\n",
+ "                   csv_path='/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv'):\n",
+ "  \"\"\"Search Twitter for tweets matching a query and append them to a CSV file.\n",
+ "\n",
+ "  Args:\n",
+ "    search_query9: Query string sent to the search endpoint as `q`.\n",
+ "    num_tweets9: Maximum number of tweets to take from the cursor.\n",
+ "    since_id_num9: Value forwarded to the search endpoint as `since_id`.\n",
+ "    csv_path: File the rows are appended to. Defaults to the path this\n",
+ "      function originally hard-coded, so existing calls behave as before.\n",
+ "\n",
+ "  Returns:\n",
+ "    None. Each tweet becomes one row: id, created_at, full_text,\n",
+ "    user.location, retweet_count, favorite_count.\n",
+ "  \"\"\"\n",
+ "  # Drain the cursor into a list so the results can be written in reverse order\n",
+ "  tweet_list9 = list(tweepy.Cursor(api.search, q=search_query9, lang=\"en\", since_id=since_id_num9,\n",
+ "                                   tweet_mode='extended').items(num_tweets9))\n",
+ "  if not tweet_list9:\n",
+ "    return  # no results: do not open (or create) the file, same as before\n",
+ "\n",
+ "  # Open the file once for the whole batch instead of re-opening it for every tweet\n",
+ "  with open(csv_path, 'a', newline='', encoding='utf-8') as csvFile9:\n",
+ "    csv_writer9 = csv.writer(csvFile9, delimiter=',')\n",
+ "    # Rows are written in reverse of the order the API returned them\n",
+ "    # (presumably so the newest tweet ends up on the last line - TODO confirm)\n",
+ "    csv_writer9.writerows(\n",
+ "        [tweet.id, tweet.created_at, tweet.full_text, tweet.user.location,\n",
+ "         tweet.retweet_count, tweet.favorite_count]\n",
+ "        for tweet in tweet_list9[::-1])"
+ ],
+ "execution_count": 34,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "5G-4-YnoUAVZ"
+ },
+ "source": [
+ "search_words9 = \"#sad\" # Hashtag to search for\n",
+ "# Exclude links, retweets, replies\n",
+ "search_query9 = search_words9 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n",
+ "# Take since_id from the first field of the last non-empty row already in the CSV.\n",
+ "# The file is streamed row by row rather than loaded into a list, and opened with\n",
+ "# newline='' as the csv module asks for when fields may contain line breaks.\n",
+ "latest_tweet = None\n",
+ "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv', newline='', encoding='utf-8') as data:\n",
+ "  for row in csv.reader(data):\n",
+ "    if row:  # csv.reader yields [] for blank lines\n",
+ "      latest_tweet = int(row[0])\n",
+ "# If the file holds no rows latest_tweet stays None (previously an IndexError);\n",
+ "# presumably Tweepy then leaves since_id out of the request - TODO confirm\n",
+ "tweets_mining9(search_query9, 2000, latest_tweet)"
+ ],
+ "execution_count": 35,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "6ivTsYufUKw2"
+ },
+ "source": [
+ "# names= supplies the six column labels, so pandas treats the first line of the file as data\n",
+ "df_sad_1 = pd.read_csv(\n",
+ "    \"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv\",\n",
+ "    names=[\n",
+ "        'tweet.id',\n",
+ "        \"created_at\",\n",
+ "        \"text\",\n",
+ "        \"location\",\n",
+ "        \"retweet\",\n",
+ "        \"favorite\",\n",
+ "    ],\n",
+ ")"
+ ],
+ "execution_count": 36,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 581
+ },
+ "id": "4TjbnQlJUUbA",
+ "outputId": "4ab3eb84-3d0c-444b-fa61-6d0b9969d3d2"
+ },
+ "source": [
+ "df_sad_1"
+ ],
+ "execution_count": 37,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " tweet.id | \n",
+ " created_at | \n",
+ " text | \n",
+ " location | \n",
+ " retweet | \n",
+ " favorite | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1447386915502792706 | \n",
+ " 2021-10-11 02:21:46 | \n",
+ " Tried to propose to Todd with an air ring duri... | \n",
+ " MD/DC | \n",
+ " 0 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1447389433553096704 | \n",
+ " 2021-10-11 02:31:46 | \n",
+ " Forgetting to bring a post game pint to pickup... | \n",
+ " Canada | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1447390726132625416 | \n",
+ " 2021-10-11 02:36:54 | \n",
+ " bro wtf i came to school because of him and he... | \n",
+ " she / her | cbyf !! | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1447390741706149895 | \n",
+ " 2021-10-11 02:36:58 | \n",
+ " I agree with @clint_dempsey on the Yanks not w... | \n",
+ " Los Angeles, CA | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1447391562380554244 | \n",
+ " 2021-10-11 02:40:14 | \n",
+ " The amount of people who do not tip for grocer... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 3517 | \n",
+ " 1459521498842992642 | \n",
+ " 2021-11-13 14:00:16 | \n",
+ " Just got banned from a server F #sad | \n",
+ " Jakarta Capital Region | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 3518 | \n",
+ " 1459521611997003777 | \n",
+ " 2021-11-13 14:00:43 | \n",
+ " I literally cried during my exam and the cam i... | \n",
+ " بيت أمك | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3519 | \n",
+ " 1459524263946326017 | \n",
+ " 2021-11-13 14:11:15 | \n",
+ " No one can be happy with a guy like me. That's... | \n",
+ " Varanasi, Uttar Pradesh, India | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3520 | \n",
+ " 1459530315437785095 | \n",
+ " 2021-11-13 14:35:18 | \n",
+ " arrived at my house but Am I Home? #deep #sad ... | \n",
+ " they19sea | \n",
+ " 1 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 3521 | \n",
+ " 1459530643591905284 | \n",
+ " 2021-11-13 14:36:36 | \n",
+ " Being spoken down to rn at @starbucks and reme... | \n",
+ " Night Vale, USA | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
3522 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " tweet.id created_at ... retweet favorite\n",
+ "0 1447386915502792706 2021-10-11 02:21:46 ... 0 4\n",
+ "1 1447389433553096704 2021-10-11 02:31:46 ... 0 1\n",
+ "2 1447390726132625416 2021-10-11 02:36:54 ... 0 0\n",
+ "3 1447390741706149895 2021-10-11 02:36:58 ... 0 0\n",
+ "4 1447391562380554244 2021-10-11 02:40:14 ... 0 1\n",
+ "... ... ... ... ... ...\n",
+ "3517 1459521498842992642 2021-11-13 14:00:16 ... 0 1\n",
+ "3518 1459521611997003777 2021-11-13 14:00:43 ... 0 0\n",
+ "3519 1459524263946326017 2021-11-13 14:11:15 ... 0 0\n",
+ "3520 1459530315437785095 2021-11-13 14:35:18 ... 1 3\n",
+ "3521 1459530643591905284 2021-11-13 14:36:36 ... 0 0\n",
+ "\n",
+ "[3522 rows x 6 columns]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 37
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "WMQTcPwD38hP"
+ },
+ "source": [
+ "# Combining all the tweets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "aGjcg4Et6ZR9"
+ },
+ "source": [
+ "import glob"
+ ],
+ "execution_count": 38,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 285
+ },
+ "id": "FVBUCENZ4BIQ",
+ "outputId": "e06fbce1-e125-4ff4-c763-b128e9acf2ea"
+ },
+ "source": [
+ "path = r'/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API' # use your path\n",
+ "column_names = ['tweet.id', 'created_at', 'text', 'location', 'retweet', 'favorite']\n",
+ "\n",
+ "# glob.glob returns matches in arbitrary, filesystem-dependent order; sort them so the\n",
+ "# combined row order is the same on every run.\n",
+ "all_files = sorted(glob.glob(path + \"/*.csv\"))\n",
+ "\n",
+ "# Convert each csv to a dataframe; names= supplies the column labels, so the first\n",
+ "# line of every file is read as data rather than as a header.\n",
+ "tweets = [pd.read_csv(filename, names=column_names) for filename in all_files]\n",
+ "\n",
+ "tweets_df = pd.concat(tweets, ignore_index=True) # Merge all dataframes\n",
+ "tweets_df.head()"
+ ],
+ "execution_count": 39,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " tweet.id | \n",
+ " created_at | \n",
+ " text | \n",
+ " location | \n",
+ " retweet | \n",
+ " favorite | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1447537898572574730 | \n",
+ " 2021-10-11 12:21:43 | \n",
+ " Open discussion. Between the Transfer Portal a... | \n",
+ " Cheyenne Wyoming | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1447540582490988553 | \n",
+ " 2021-10-11 12:32:23 | \n",
+ " Plenty of things are changing in my life and t... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1447807717859491842 | \n",
+ " 2021-10-12 06:13:53 | \n",
+ " I feel a little hopeless. Anyone else? #hopele... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1448076026219692033 | \n",
+ " 2021-10-13 00:00:03 | \n",
+ " Which is more healthy? Hope, or hopelessness? ... | \n",
+ " Denver, CO | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1448382047375040513 | \n",
+ " 2021-10-13 20:16:04 | \n",
+ " So someone tell me how do I get over #HOPELESS... | \n",
+ " Portland Or . | \n",
+ " 0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " tweet.id created_at ... retweet favorite\n",
+ "0 1447537898572574730 2021-10-11 12:21:43 ... 0 0\n",
+ "1 1447540582490988553 2021-10-11 12:32:23 ... 0 0\n",
+ "2 1447807717859491842 2021-10-12 06:13:53 ... 0 0\n",
+ "3 1448076026219692033 2021-10-13 00:00:03 ... 0 0\n",
+ "4 1448382047375040513 2021-10-13 20:16:04 ... 0 2\n",
+ "\n",
+ "[5 rows x 6 columns]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 39
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 581
+ },
+ "id": "NIh6Pc_C5BmN",
+ "outputId": "6ceba47d-7e76-49e4-f459-8b78860e6aae"
+ },
+ "source": [
+ "tweets_df"
+ ],
+ "execution_count": 40,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " tweet.id | \n",
+ " created_at | \n",
+ " text | \n",
+ " location | \n",
+ " retweet | \n",
+ " favorite | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1447537898572574730 | \n",
+ " 2021-10-11 12:21:43 | \n",
+ " Open discussion. Between the Transfer Portal a... | \n",
+ " Cheyenne Wyoming | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1447540582490988553 | \n",
+ " 2021-10-11 12:32:23 | \n",
+ " Plenty of things are changing in my life and t... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1447807717859491842 | \n",
+ " 2021-10-12 06:13:53 | \n",
+ " I feel a little hopeless. Anyone else? #hopele... | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1448076026219692033 | \n",
+ " 2021-10-13 00:00:03 | \n",
+ " Which is more healthy? Hope, or hopelessness? ... | \n",
+ " Denver, CO | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1448382047375040513 | \n",
+ " 2021-10-13 20:16:04 | \n",
+ " So someone tell me how do I get over #HOPELESS... | \n",
+ " Portland Or . | \n",
+ " 0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 24142 | \n",
+ " 1459521498842992642 | \n",
+ " 2021-11-13 14:00:16 | \n",
+ " Just got banned from a server F #sad | \n",
+ " Jakarta Capital Region | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 24143 | \n",
+ " 1459521611997003777 | \n",
+ " 2021-11-13 14:00:43 | \n",
+ " I literally cried during my exam and the cam i... | \n",
+ " بيت أمك | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 24144 | \n",
+ " 1459524263946326017 | \n",
+ " 2021-11-13 14:11:15 | \n",
+ " No one can be happy with a guy like me. That's... | \n",
+ " Varanasi, Uttar Pradesh, India | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 24145 | \n",
+ " 1459530315437785095 | \n",
+ " 2021-11-13 14:35:18 | \n",
+ " arrived at my house but Am I Home? #deep #sad ... | \n",
+ " they19sea | \n",
+ " 1 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 24146 | \n",
+ " 1459530643591905284 | \n",
+ " 2021-11-13 14:36:36 | \n",
+ " Being spoken down to rn at @starbucks and reme... | \n",
+ " Night Vale, USA | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
24147 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " tweet.id created_at ... retweet favorite\n",
+ "0 1447537898572574730 2021-10-11 12:21:43 ... 0 0\n",
+ "1 1447540582490988553 2021-10-11 12:32:23 ... 0 0\n",
+ "2 1447807717859491842 2021-10-12 06:13:53 ... 0 0\n",
+ "3 1448076026219692033 2021-10-13 00:00:03 ... 0 0\n",
+ "4 1448382047375040513 2021-10-13 20:16:04 ... 0 2\n",
+ "... ... ... ... ... ...\n",
+ "24142 1459521498842992642 2021-11-13 14:00:16 ... 0 1\n",
+ "24143 1459521611997003777 2021-11-13 14:00:43 ... 0 0\n",
+ "24144 1459524263946326017 2021-11-13 14:11:15 ... 0 0\n",
+ "24145 1459530315437785095 2021-11-13 14:35:18 ... 1 3\n",
+ "24146 1459530643591905284 2021-11-13 14:36:36 ... 0 0\n",
+ "\n",
+ "[24147 rows x 6 columns]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 40
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "Yia0nXGnQsiV"
+ },
+ "source": [
+ "# Save the combined tweets as a single CSV in the output/ subfolder.\n",
+ "# NOTE(review): to_csv defaults to index=True, so the row index is written as an\n",
+ "# unnamed first column - confirm that downstream readers expect it.\n",
+ "tweets_df.to_csv('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/output/depressive_tweets.csv')"
+ ],
+ "execution_count": 41,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Zvj3hdFwO2IO"
+ },
+ "source": [
+ "## Data cleaning"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "GEBn1OyhPDp1"
+ },
+ "source": [
+ "Data cleaning is an essential step: without a proper cleaning procedure, errors carry over into the analysis and, eventually, into the data-driven results. Here I eliminate duplicate tweets using the primary key ('tweet.id'), check for empty rows, and replace “NaN” values if there are any."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "zgrxs9HGOhnN",
+ "outputId": "f8886c9b-28b7-4429-ebe0-b91ad894f32b"
+ },
+ "source": [
+ "tweets_df.shape #Get number of rows and columns"
+ ],
+ "execution_count": 42,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(24147, 6)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 42
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 232
+ },
+ "id": "s6rb-N77QIA-",
+ "outputId": "ae758d07-1cbc-4bc8-988f-8f38777ac201"
+ },
+ "source": [
+ "## Check the data type of each column\n",
+ "column_dtypes = tweets_df.dtypes\n",
+ "column_dtypes.to_frame(name='data_type')"
+ ],
+ "execution_count": 43,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " data_type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | tweet.id | \n",
+ " int64 | \n",
+ "
\n",
+ " \n",
+ " | created_at | \n",
+ " object | \n",
+ "
\n",
+ " \n",
+ " | text | \n",
+ " object | \n",
+ "
\n",
+ " \n",
+ " | location | \n",
+ " object | \n",
+ "
\n",
+ " \n",
+ " | retweet | \n",
+ " int64 | \n",
+ "
\n",
+ " \n",
+ " | favorite | \n",
+ " int64 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " data_type\n",
+ "tweet.id int64\n",
+ "created_at object\n",
+ "text object\n",
+ "location object\n",
+ "retweet int64\n",
+ "favorite int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 43
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "mYuqjbWiPJVK",
+ "outputId": "997390f8-38b1-41d6-a94d-d3b25ba402c4"
+ },
+ "source": [
+ "## Finding unique values in each column\n",
+ "# dropna=False counts a missing value as its own entry, matching len(Series.unique())\n",
+ "for column_name, n_unique in tweets_df.nunique(dropna=False).items():\n",
+ "    print(\"There are \", n_unique, \"unique values in \", column_name)"
+ ],
+ "execution_count": 45,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "There are 18190 unique values in tweet.id\n",
+ "There are 18071 unique values in created_at\n",
+ "There are 17107 unique values in text\n",
+ "There are 4648 unique values in location\n",
+ "There are 74 unique values in retweet\n",
+ "There are 159 unique values in favorite\n"
+ ]
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file