ameythakur commited on
Commit
4d1cb0c
·
verified ·
1 Parent(s): 2fc0f9d

DEPRESSION-DETECTION

Browse files
Files changed (46) hide show
  1. .gitattributes +2 -0
  2. Dockerfile +2 -2
  3. README.md +311 -311
  4. source_code/app.py +59 -0
  5. source_code/app_utilities.py +65 -0
  6. source_code/assets/data/external/README.md +3 -0
  7. source_code/assets/data/processed_data.csv +0 -0
  8. source_code/assets/data/scrapped/depressive_tweets.csv +0 -0
  9. source_code/assets/img/ROC_Precision_LR.png +3 -0
  10. source_code/assets/img/ROC_Precision_SVM.png +3 -0
  11. source_code/assets/img/app.png +3 -0
  12. source_code/assets/img/depression.png +3 -0
  13. source_code/assets/img/logo.jpeg +3 -0
  14. source_code/assets/img/loss_accuracy_LSTM.png +3 -0
  15. source_code/assets/img/models_comparison.png +3 -0
  16. source_code/assets/img/wordcloud_depressive.png +3 -0
  17. source_code/assets/img/wordcloud_random.png +3 -0
  18. source_code/assets/models/model_LSTM.pkl +3 -0
  19. source_code/assets/models/model_LogReg.pkl +3 -0
  20. source_code/assets/models/model_svm.pkl +3 -0
  21. source_code/assets/models/model_svm1.pkl +3 -0
  22. source_code/assets/notebooks/data_cleaning_exploration.ipynb +0 -0
  23. source_code/assets/notebooks/data_gathering_twint.ipynb +483 -0
  24. source_code/assets/notebooks/data_gathering_twitter_API.ipynb +0 -0
  25. source_code/assets/notebooks/modeling.ipynb +0 -0
  26. source_code/core/clean.py +72 -0
  27. source_code/core/clean_utilities.py +226 -0
  28. source_code/core/predict.py +110 -0
  29. source_code/core/train.py +80 -0
  30. source_code/core/train_utilities.py +226 -0
  31. source_code/notebooks/data_cleaning_exploration.py +505 -0
  32. source_code/notebooks/data_gathering_twint.py +80 -0
  33. source_code/notebooks/data_gathering_twitter_API.py +388 -0
  34. source_code/notebooks/modeling.py +378 -0
  35. source_code/notebooks/old_models.py +637 -0
  36. source_code/notebooks/testing.py +283 -0
  37. source_code/requirements.txt +42 -0
  38. source_code/static/brain.svg +1 -0
  39. source_code/static/overlay.css +99 -0
  40. source_code/static/security.js +26 -0
  41. source_code/static/styles.css +237 -0
  42. source_code/static/tweet-sound.mp3 +0 -0
  43. source_code/templates/404.html +137 -0
  44. source_code/templates/index.html +136 -0
  45. source_code/templates/result.html +90 -0
  46. source_code/test_app.py +20 -0
.gitattributes CHANGED
@@ -1,2 +1,4 @@
1
  Source[[:space:]]Code/assets/models/model_svm.pkl filter=lfs diff=lfs merge=lfs -text
2
  Source[[:space:]]Code/assets/models/model_svm1.pkl filter=lfs diff=lfs merge=lfs -text
 
 
 
1
  Source[[:space:]]Code/assets/models/model_svm.pkl filter=lfs diff=lfs merge=lfs -text
2
  Source[[:space:]]Code/assets/models/model_svm1.pkl filter=lfs diff=lfs merge=lfs -text
3
+ source_code/assets/models/model_svm.pkl filter=lfs diff=lfs merge=lfs -text
4
+ source_code/assets/models/model_svm1.pkl filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -19,7 +19,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
19
 
20
  # Install dependencies
21
  # Note: Path is relative to the repository root where Dockerfile resides
22
- COPY "Source Code/requirements.txt" ./
23
  RUN pip install --upgrade pip
24
  RUN pip install --no-cache-dir -r requirements.txt
25
 
@@ -27,7 +27,7 @@ RUN pip install --no-cache-dir -r requirements.txt
27
  RUN python -m spacy download en_core_web_lg
28
 
29
  # Copy project source code
30
- COPY "Source Code/" ./
31
 
32
  # Hugging Face Spaces requires port 7860
33
  EXPOSE 7860
 
19
 
20
  # Install dependencies
21
  # Note: Path is relative to the repository root where Dockerfile resides
22
+ COPY source_code/requirements.txt ./
23
  RUN pip install --upgrade pip
24
  RUN pip install --no-cache-dir -r requirements.txt
25
 
 
27
  RUN python -m spacy download en_core_web_lg
28
 
29
  # Copy project source code
30
+ COPY source_code/ ./
31
 
32
  # Hugging Face Spaces requires port 7860
33
  EXPOSE 7860
README.md CHANGED
@@ -1,312 +1,312 @@
1
- ---
2
- title: Depression Detection Using Tweets
3
- emoji: 🧠
4
- colorFrom: blue
5
- colorTo: indigo
6
- sdk: docker
7
- app_file: app.py
8
- pinned: false
9
- license: mit
10
- short_description: Depression Detection in Tweets ML Web App
11
- ---
12
-
13
- <div align="center">
14
-
15
- <a name="readme-top"></a>
16
- # Depression Detection Using Tweets
17
-
18
- [![License: MIT](https://img.shields.io/badge/License-MIT-lightgrey)](LICENSE)
19
- ![Status](https://img.shields.io/badge/Status-Completed-success)
20
- [![Technology](https://img.shields.io/badge/Technology-Python%20%7C%20Machine%20Learning-blueviolet)](https://github.com/Amey-Thakur/DEPRESSION_DETECTION_USING_TWEETS)
21
- [![Developed by Amey Thakur and Mega Satish](https://img.shields.io/badge/Developed%20by-Amey%20Thakur%20%26%20Mega%20Satish-blue.svg)](https://github.com/Amey-Thakur/DEPRESSION_DETECTION_USING_TWEETS)
22
-
23
- A modern **Python** + **Flask** application designed to analyze tweet sentiment and predict depressive characteristics using a finalized **SVM** model and **spaCy** NLP pipeline.
24
-
25
- **[Source Code](Source%20Code/)** &nbsp;·&nbsp; **[Technical Specification](docs/SPECIFICATION.md)** &nbsp;·&nbsp; **[Live Demo](https://huggingface.co/spaces/ameythakur/Depression-Detection-Using-Tweets)**
26
-
27
- </div>
28
-
29
- ---
30
-
31
- <div align="center">
32
-
33
- [Authors](#authors) &nbsp;·&nbsp; [Overview](#overview) &nbsp;·&nbsp; [Features](#features) &nbsp;·&nbsp; [Structure](#project-structure) &nbsp;·&nbsp; [Results](#results) &nbsp;·&nbsp; [Quick Start](#quick-start) &nbsp;·&nbsp; [Usage Guidelines](#usage-guidelines) &nbsp;·&nbsp; [License](#license) &nbsp;·&nbsp; [About](#about-this-repository) &nbsp;·&nbsp; [Acknowledgments](#acknowledgments)
34
-
35
- </div>
36
-
37
- ---
38
-
39
- <!-- AUTHORS -->
40
- <div align="center">
41
-
42
- <a name="authors"></a>
43
- ## Authors
44
-
45
- | <a href="https://github.com/Amey-Thakur"><img src="https://github.com/Amey-Thakur.png" width="150" height="150" alt="Amey Thakur"></a><br>[**Amey Thakur**](https://github.com/Amey-Thakur)<br><br>[![ORCID](https://img.shields.io/badge/ORCID-0000--0001--5644--1575-green.svg)](https://orcid.org/0000-0001-5644-1575) | <a href="https://github.com/msatmod"><img src="Mega/Mega.png" width="150" height="150" alt="Mega Satish"></a><br>[**Mega Satish**](https://github.com/msatmod)<br><br>[![ORCID](https://img.shields.io/badge/ORCID-0000--0002--1844--9557-green.svg)](https://orcid.org/0000-0002-1844-9557) |
46
- | :---: | :---: |
47
-
48
- </div>
49
-
50
- > [!IMPORTANT]
51
- > ### 🤝🏻 Special Acknowledgement
52
- > *Special thanks to **[Mega Satish](https://github.com/msatmod)** for her meaningful contributions, guidance, and support that helped shape this work.*
53
-
54
- ---
55
-
56
- <!-- OVERVIEW -->
57
- <a name="overview"></a>
58
- ## Overview
59
-
60
- **Depression Detection Using Tweets** is a specialized Machine Learning framework designed to translate complex linguistic patterns into empirical psychological insights. This repository prioritizes **high-dimensional feature extraction** and **probabilistic classification** to provide a robust baseline for sentiment analysis within the context of mental health monitoring.
61
-
62
- * **Linguistic Determinism**: The system utilizes deep NLP preprocessing, including lemmatization and entity normalization, to ensure that the semantic core of a tweet is preserved regardless of slang or stylistic variation.
63
- * **Vector-Space Inference**: By leveraging **Support Vector Machines (SVM)** and **TF-IDF vectorization**, the model maps textual input into a multi-dimensional hyperplane, enabling precise binary classification of depressive sentiment.
64
- * **Architectural Efficiency**: The backend is architected for low-latency serving via Flask, ensuring that model inference and result rendering occur in sub-second cycles, critical for interactive user feedback.
65
-
66
- > [!TIP]
67
- > **NLP Pipeline Optimization**
68
- >
69
- > To maximize classification reliability, the engine employs a **multi-stage linguistic filter**. **Stop-word suppression** and **morphological analysis** strip away structural noise, while the **en_core_web_lg** transformer model contextualizes surviving tokens. This ensures the classifier’s weights are strictly coupled with affective indicators, minimizing the false-positive skew common in generalized sentiment analysis models.
70
-
71
- ---
72
-
73
- <!-- FEATURES -->
74
- <a name="features"></a>
75
- ## Features
76
-
77
- | Feature | Description |
78
- |---------|-------------|
79
- | **Core SVM Model** | **High-Dimensional Classification** engine optimized for binary depressive sentiment prediction. |
80
- | **NLP Pipeline** | Deep linguistic feature extraction powered by the **spaCy transformer model** (`en_core_web_lg`). |
81
- | **Prediction Hub** | **Real-Time Inference Interface** built with Flask for sub-second classification feedback. |
82
- | **Security Suite** | Integrated **Browser-Side Integrity** protocols including anti-right-click and anti-select systems. |
83
- | **Cinematic Surprise** | **Immersive Branding Overlay** featuring animated Twitter iconography and synchronized audio. |
84
-
85
- > [!NOTE]
86
- > ### Technical Polish: The Linguistic Singularity
87
- > We have engineered a **Probabilistic Sentiment Manager** that calibrates model weights across thousands of TF-IDF vectors to simulate human-like linguistic intuition. The visual language focuses on a "Neural Slate" aesthetic, ensuring maximum cognitive focus on the diagnostic outputs without procedural distraction.
88
-
89
- ### Tech Stack
90
- - **Languages**: Python 3.9+
91
- - **Logic**: **SVM Classifier** (Scikit-Learn Inference Engine)
92
- - **Linguistic Data**: **spaCy NLP** (Transformer-based word embeddings)
93
- - **Web App**: **Flask Framework** (Micro-service architecture for model serving)
94
- - **UI System**: Premium Modern Aesthetics (Custom CSS / Play Typography)
95
- - **Deployment**: Standard Python Environment (PIP-managed dependencies)
96
-
97
- ---
98
-
99
- <!-- PROJECT STRUCTURE -->
100
- <a name="project-structure"></a>
101
- ## Project Structure
102
-
103
- ```python
104
- DEPRESSION-DETECTION-USING-TWEETS/
105
-
106
- ├── docs/ # Technical Documentation
107
- │ └── SPECIFICATION.md # Architecture & Design Specification
108
-
109
- ├── Mega/ # Archival Attribution Assets
110
- │ ├── Filly.jpg # Companion (Filly)
111
- │ └── Mega.png # Author Profile Image (Mega Satish)
112
-
113
- ├── screenshots/ # Project Visualization Gallery
114
- │ ├── 01_landing_page.png # System Hub Initial State
115
- │ ├── 02_footer_details.png # Brand and Metadata Footer
116
- │ ├── 03_surprise_cinematic.png # Interactive Animated Sequence
117
- │ ├── 04_predict_interface.png # Sentiment Analysis Entry Point
118
- │ ├── 05_analysis_output.png # Model Inference result
119
- │ └── 06_result_prediction.png # Final Sentiment Output
120
-
121
- ├── Source Code/ # Primary Application Layer
122
- │ ├── assets/ # Serialized Models & Linguistic Data
123
- │ ├── core/ # ML Pipeline (Clean, Train, Predict)
124
- │ ├── static/ # Styling, Audio, & Security Scripts
125
- │ ├── templates/ # HTML Templates (Index, Result, 404)
126
- │ └── app.py # Flask Application (Entry Point)
127
-
128
- ├── .gitattributes # Git configuration
129
- ├── .gitignore # Repository Filters
130
- ├── CITATION.cff # Scholarly Citation Metadata
131
- ├── codemeta.json # Machine-Readable Project Metadata
132
- ├── LICENSE # MIT License Terms
133
- ├── README.md # Comprehensive Scholarly Entrance
134
- └── SECURITY.md # Security Policy & Protocol
135
- ```
136
-
137
- ---
138
-
139
- <!-- RESULTS -->
140
- <a name="results"></a>
141
- ## Results
142
-
143
- <div align="center">
144
- <b>Main Landing: System Hub Initialization</b>
145
- <br>
146
- <i>Minimalist interface for rapid tweet sentiment analysis.</i>
147
- <br><br>
148
- <img src="screenshots/01_landing_page.png" alt="Landing Page" width="90%">
149
- <br><br><br>
150
-
151
- <b>Metadata Synthesis: Branding and Footer Detail</b>
152
- <br>
153
- <i>Scholarly attribution and project status integration.</i>
154
- <br><br>
155
- <img src="screenshots/02_footer_details.png" alt="Footer Details" width="90%">
156
- <br><br><br>
157
-
158
- <b>Interactivity: Animated Twitter Sequence</b>
159
- <br>
160
- <i>Immersive audiovisual overlay triggered by core branding elements.</i>
161
- <br><br>
162
- <img src="screenshots/03_surprise_cinematic.png" alt="Cinematic Surprise" width="90%">
163
- <br><br><br>
164
-
165
- <b>Sentiment Entry: Real-time Analysis Interface</b>
166
- <br>
167
- <i>Direct manipulation environment for high-latency textual input.</i>
168
- <br><br>
169
- <img src="screenshots/04_predict_interface.png" alt="Predict Interface" width="90%">
170
- <br><br><br>
171
-
172
- <b>Model Inference: Feature Extraction Output</b>
173
- <br>
174
- <i>Deep linguistic analysis and probabilistic score generation.</i>
175
- <br><br>
176
- <img src="screenshots/05_analysis_output.png" alt="Analysis Output" width="90%">
177
- <br><br><br>
178
-
179
- <b>Statistical Output: Final Sentiment Classification</b>
180
- <br>
181
- <i>Categorized classification results with immediate visual feedback.</i>
182
- <br><br>
183
- <img src="screenshots/06_result_prediction.png" alt="Result Prediction" width="90%">
184
- </div>
185
-
186
- ---
187
-
188
- <!-- QUICK START -->
189
- <a name="quick-start"></a>
190
- ## Quick Start
191
-
192
- ### 1. Prerequisites
193
- - **Python 3.11+**: Required for runtime execution. [Download Python](https://www.python.org/downloads/)
194
- - **Git**: For version control and cloning. [Download Git](https://git-scm.com/downloads)
195
-
196
- > [!WARNING]
197
- > **Data Acquisition & Memory Constraints**
198
- >
199
- > The linguistic pipeline relies on the **en_core_web_lg** transformer model, which requires an initial download of approximately **800MB**. Ensure a stable network connection during setup. Additionally, loading this model into memory requires at least **2GB of available RAM** to prevent swapping and ensure low-latency inference.
200
-
201
- ### 2. Installation & Setup
202
-
203
- #### Step 1: Clone the Repository
204
- Open your terminal and clone the repository:
205
- ```bash
206
- git clone https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS.git
207
- cd DEPRESSION-DETECTION-USING-TWEETS
208
- ```
209
-
210
- #### Step 2: Configure Virtual Environment
211
- Prepare an isolated environment to manage dependencies:
212
-
213
- **Windows (Command Prompt / PowerShell):**
214
- ```bash
215
- python -m venv venv
216
- venv\Scripts\activate
217
- ```
218
-
219
- **macOS / Linux (Terminal):**
220
- ```bash
221
- python3 -m venv venv
222
- source venv/bin/activate
223
- ```
224
-
225
- #### Step 3: Install Core Dependencies
226
- Ensure your environment is active, then install the required libraries:
227
- ```bash
228
- pip install -r "Source Code/requirements.txt"
229
- ```
230
-
231
- #### Step 4: Linguistic Model Acquisition
232
- Download the large-scale linguistic model required for analysis (approx. 800MB):
233
- ```bash
234
- python -m spacy download en_core_web_lg
235
- ```
236
-
237
- ### 3. Execution
238
- Launch the sentiment analysis dashboard:
239
-
240
- ```bash
241
- python "Source Code/app.py"
242
- ```
243
-
244
- ---
245
-
246
- <!-- USAGE GUIDELINES -->
247
- <a name="usage-guidelines"></a>
248
- ## Usage Guidelines
249
-
250
- This repository is openly shared to support learning and knowledge exchange across the academic community.
251
-
252
- **For Students**
253
- Use this project as reference material for understanding **Support Vector Machines (SVM)**, **spaCy NLP pipelines**, and **sentiment analysis within the context of mental health monitoring**. The source code is available for study to facilitate self-paced learning and exploration of **high-dimensional feature extraction and model serving via Flask**.
254
-
255
- **For Educators**
256
- This project may serve as a practical lab example or supplementary teaching resource for **Data Science**, **Natural Language Processing**, and **Machine Learning** courses. Attribution is appreciated when utilizing content.
257
-
258
- **For Researchers**
259
- The documentation and architectural approach may provide insights into **academic project structuring**, **psychological linguistic modeling**, and **algorithmic deployment**.
260
-
261
- ---
262
-
263
- <!-- LICENSE -->
264
- <a name="license"></a>
265
- ## License
266
-
267
- This repository and all its creative and technical assets are made available under the **MIT License**. See the [LICENSE](LICENSE) file for complete terms.
268
-
269
- > [!NOTE]
270
- > **Summary**: You are free to share and adapt this content for any purpose, even commercially, as long as you provide appropriate attribution to the original authors.
271
-
272
- Copyright © 2022 Amey Thakur & Mega Satish
273
-
274
- ---
275
-
276
- <!-- ABOUT -->
277
- <a name="about-this-repository"></a>
278
- ## About This Repository
279
-
280
- **Created & Maintained by**: [Amey Thakur](https://github.com/Amey-Thakur) & [Mega Satish](https://github.com/msatmod)
281
-
282
- This project features **Depression Detection**, a high-performance sentiment analysis system. It represents a personal exploration into **Python**-based machine learning and interactive web-service architecture.
283
-
284
- **Connect:** [GitHub](https://github.com/Amey-Thakur) &nbsp;·&nbsp; [LinkedIn](https://www.linkedin.com/in/amey-thakur) &nbsp;·&nbsp; [ORCID](https://orcid.org/0000-0001-5644-1575)
285
-
286
- ### Acknowledgments
287
-
288
- Grateful acknowledgment to [**Mega Satish**](https://github.com/msatmod) for her exceptional collaboration and scholarly partnership during the development of this machine learning project. Her constant support, technical clarity, and dedication to software quality were instrumental in achieving the system's functional objectives. Learning alongside her was a transformative experience; her thoughtful approach to problem-solving and steady encouragement turned complex requirements into meaningful learning moments. This work reflects the growth and insights gained from our side-by-side academic journey. Thank you, Mega, for everything you shared and taught along the way.
289
-
290
- Special thanks to the **mentors and peers** whose encouragement, discussions, and support contributed meaningfully to this learning experience.
291
-
292
- ---
293
-
294
- <div align="center">
295
-
296
- [↑ Back to Top](#readme-top)
297
-
298
- [Authors](#authors) &nbsp;·&nbsp; [Overview](#overview) &nbsp;·&nbsp; [Features](#features) &nbsp;·&nbsp; [Structure](#project-structure) &nbsp;·&nbsp; [Results](#results) &nbsp;·&nbsp; [Quick Start](#quick-start) &nbsp;·&nbsp; [Usage Guidelines](#usage-guidelines) &nbsp;·&nbsp; [License](#license) &nbsp;·&nbsp; [About](#about-this-repository) &nbsp;·&nbsp; [Acknowledgments](#acknowledgments)
299
-
300
- <br>
301
-
302
- 🧠 **[DEPRESSION-DETECTION](https://huggingface.co/spaces/ameythakur/Depression-Detection-Using-Tweets)**
303
-
304
- ---
305
-
306
- ### 🎓 [Computer Engineering Repository](https://github.com/Amey-Thakur/COMPUTER-ENGINEERING)
307
-
308
- **Computer Engineering (B.E.)**
309
-
310
- *Semester-wise curriculum, laboratories, projects, and academic notes.*
311
-
312
  </div>
 
1
+ ---
2
+ title: Depression Detection Using Tweets
3
+ emoji: 🧠
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: docker
7
+ app_file: app.py
8
+ pinned: false
9
+ license: mit
10
+ short_description: Depression Detection in Tweets ML Web App
11
+ ---
12
+
13
+ <div align="center">
14
+
15
+ <a name="readme-top"></a>
16
+ # Depression Detection Using Tweets
17
+
18
+ [![License: MIT](https://img.shields.io/badge/License-MIT-lightgrey)](LICENSE)
19
+ ![Status](https://img.shields.io/badge/Status-Completed-success)
20
+ [![Technology](https://img.shields.io/badge/Technology-Python%20%7C%20Machine%20Learning-blueviolet)](https://github.com/Amey-Thakur/DEPRESSION_DETECTION_USING_TWEETS)
21
+ [![Developed by Amey Thakur and Mega Satish](https://img.shields.io/badge/Developed%20by-Amey%20Thakur%20%26%20Mega%20Satish-blue.svg)](https://github.com/Amey-Thakur/DEPRESSION_DETECTION_USING_TWEETS)
22
+
23
+ A modern **Python** + **Flask** application designed to analyze tweet sentiment and predict depressive characteristics using a finalized **SVM** model and **spaCy** NLP pipeline.
24
+
25
+ **[Source Code](source_code/)** &nbsp;·&nbsp; **[Technical Specification](docs/SPECIFICATION.md)** &nbsp;·&nbsp; **[Live Demo](https://huggingface.co/spaces/ameythakur/Depression-Detection-Using-Tweets)**
26
+
27
+ </div>
28
+
29
+ ---
30
+
31
+ <div align="center">
32
+
33
+ [Authors](#authors) &nbsp;·&nbsp; [Overview](#overview) &nbsp;·&nbsp; [Features](#features) &nbsp;·&nbsp; [Structure](#project-structure) &nbsp;·&nbsp; [Results](#results) &nbsp;·&nbsp; [Quick Start](#quick-start) &nbsp;·&nbsp; [Usage Guidelines](#usage-guidelines) &nbsp;·&nbsp; [License](#license) &nbsp;·&nbsp; [About](#about-this-repository) &nbsp;·&nbsp; [Acknowledgments](#acknowledgments)
34
+
35
+ </div>
36
+
37
+ ---
38
+
39
+ <!-- AUTHORS -->
40
+ <div align="center">
41
+
42
+ <a name="authors"></a>
43
+ ## Authors
44
+
45
+ | <a href="https://github.com/Amey-Thakur"><img src="https://github.com/Amey-Thakur.png" width="150" height="150" alt="Amey Thakur"></a><br>[**Amey Thakur**](https://github.com/Amey-Thakur)<br><br>[![ORCID](https://img.shields.io/badge/ORCID-0000--0001--5644--1575-green.svg)](https://orcid.org/0000-0001-5644-1575) | <a href="https://github.com/msatmod"><img src="Mega/Mega.png" width="150" height="150" alt="Mega Satish"></a><br>[**Mega Satish**](https://github.com/msatmod)<br><br>[![ORCID](https://img.shields.io/badge/ORCID-0000--0002--1844--9557-green.svg)](https://orcid.org/0000-0002-1844-9557) |
46
+ | :---: | :---: |
47
+
48
+ </div>
49
+
50
+ > [!IMPORTANT]
51
+ > ### 🤝🏻 Special Acknowledgement
52
+ > *Special thanks to **[Mega Satish](https://github.com/msatmod)** for her meaningful contributions, guidance, and support that helped shape this work.*
53
+
54
+ ---
55
+
56
+ <!-- OVERVIEW -->
57
+ <a name="overview"></a>
58
+ ## Overview
59
+
60
+ **Depression Detection Using Tweets** is a specialized Machine Learning framework designed to translate complex linguistic patterns into empirical psychological insights. This repository prioritizes **high-dimensional feature extraction** and **probabilistic classification** to provide a robust baseline for sentiment analysis within the context of mental health monitoring.
61
+
62
+ * **Linguistic Determinism**: The system utilizes deep NLP preprocessing, including lemmatization and entity normalization, to ensure that the semantic core of a tweet is preserved regardless of slang or stylistic variation.
63
+ * **Vector-Space Inference**: By leveraging **Support Vector Machines (SVM)** and **TF-IDF vectorization**, the model maps textual input into a multi-dimensional hyperplane, enabling precise binary classification of depressive sentiment.
64
+ * **Architectural Efficiency**: The backend is architected for low-latency serving via Flask, ensuring that model inference and result rendering occur in sub-second cycles, critical for interactive user feedback.
65
+
66
+ > [!TIP]
67
+ > **NLP Pipeline Optimization**
68
+ >
69
+ > To maximize classification reliability, the engine employs a **multi-stage linguistic filter**. **Stop-word suppression** and **morphological analysis** strip away structural noise, while the **en_core_web_lg** transformer model contextualizes surviving tokens. This ensures the classifier’s weights are strictly coupled with affective indicators, minimizing the false-positive skew common in generalized sentiment analysis models.
70
+
71
+ ---
72
+
73
+ <!-- FEATURES -->
74
+ <a name="features"></a>
75
+ ## Features
76
+
77
+ | Feature | Description |
78
+ |---------|-------------|
79
+ | **Core SVM Model** | **High-Dimensional Classification** engine optimized for binary depressive sentiment prediction. |
80
+ | **NLP Pipeline** | Deep linguistic feature extraction powered by the **spaCy transformer model** (`en_core_web_lg`). |
81
+ | **Prediction Hub** | **Real-Time Inference Interface** built with Flask for sub-second classification feedback. |
82
+ | **Security Suite** | Integrated **Browser-Side Integrity** protocols including anti-right-click and anti-select systems. |
83
+ | **Cinematic Surprise** | **Immersive Branding Overlay** featuring animated Twitter iconography and synchronized audio. |
84
+
85
+ > [!NOTE]
86
+ > ### Technical Polish: The Linguistic Singularity
87
+ > We have engineered a **Probabilistic Sentiment Manager** that calibrates model weights across thousands of TF-IDF vectors to simulate human-like linguistic intuition. The visual language focuses on a "Neural Slate" aesthetic, ensuring maximum cognitive focus on the diagnostic outputs without procedural distraction.
88
+
89
+ ### Tech Stack
90
+ - **Languages**: Python 3.11+
91
+ - **Logic**: **SVM Classifier** (Scikit-Learn Inference Engine)
92
+ - **Linguistic Data**: **spaCy NLP** (Transformer-based word embeddings)
93
+ - **Web App**: **Flask Framework** (Micro-service architecture for model serving)
94
+ - **UI System**: Premium Modern Aesthetics (Custom CSS / Play Typography)
95
+ - **Deployment**: Standard Python Environment (PIP-managed dependencies)
96
+
97
+ ---
98
+
99
+ <!-- PROJECT STRUCTURE -->
100
+ <a name="project-structure"></a>
101
+ ## Project Structure
102
+
103
+ ```python
104
+ DEPRESSION-DETECTION-USING-TWEETS/
105
+
106
+ ├── docs/ # Technical Documentation
107
+ │ └── SPECIFICATION.md # Architecture & Design Specification
108
+
109
+ ├── Mega/ # Archival Attribution Assets
110
+ │ ├── Filly.jpg # Companion (Filly)
111
+ │ └── Mega.png # Author Profile Image (Mega Satish)
112
+
113
+ ├── screenshots/ # Project Visualization Gallery
114
+ │ ├── 01_landing_page.png # System Hub Initial State
115
+ │ ├── 02_footer_details.png # Brand and Metadata Footer
116
+ │ ├── 03_surprise_cinematic.png # Interactive Animated Sequence
117
+ │ ├── 04_predict_interface.png # Sentiment Analysis Entry Point
118
+ │ ├── 05_analysis_output.png # Model Inference result
119
+ │ └── 06_result_prediction.png # Final Sentiment Output
120
+
121
+ ├── source_code/ # Primary Application Layer
122
+ │ ├── assets/ # Serialized Models & Linguistic Data
123
+ │ ├── core/ # ML Pipeline (Clean, Train, Predict)
124
+ │ ├── static/ # Styling, Audio, & Security Scripts
125
+ │ ├── templates/ # HTML Templates (Index, Result, 404)
126
+ │ └── app.py # Flask Application (Entry Point)
127
+
128
+ ├── .gitattributes # Git configuration
129
+ ├── .gitignore # Repository Filters
130
+ ├── CITATION.cff # Scholarly Citation Metadata
131
+ ├── codemeta.json # Machine-Readable Project Metadata
132
+ ├── LICENSE # MIT License Terms
133
+ ├── README.md # Comprehensive Scholarly Entrance
134
+ └── SECURITY.md # Security Policy & Protocol
135
+ ```
136
+
137
+ ---
138
+
139
+ <!-- RESULTS -->
140
+ <a name="results"></a>
141
+ ## Results
142
+
143
+ <div align="center">
144
+ <b>Main Landing: System Hub Initialization</b>
145
+ <br>
146
+ <i>Minimalist interface for rapid tweet sentiment analysis.</i>
147
+ <br><br>
148
+ <img src="screenshots/01_landing_page.png" alt="Landing Page" width="90%">
149
+ <br><br><br>
150
+
151
+ <b>Metadata Synthesis: Branding and Footer Detail</b>
152
+ <br>
153
+ <i>Scholarly attribution and project status integration.</i>
154
+ <br><br>
155
+ <img src="screenshots/02_footer_details.png" alt="Footer Details" width="90%">
156
+ <br><br><br>
157
+
158
+ <b>Interactivity: Animated Twitter Sequence</b>
159
+ <br>
160
+ <i>Immersive audiovisual overlay triggered by core branding elements.</i>
161
+ <br><br>
162
+ <img src="screenshots/03_surprise_cinematic.png" alt="Cinematic Surprise" width="90%">
163
+ <br><br><br>
164
+
165
+ <b>Sentiment Entry: Real-time Analysis Interface</b>
166
+ <br>
167
+ <i>Direct manipulation environment for low-latency textual input.</i>
168
+ <br><br>
169
+ <img src="screenshots/04_predict_interface.png" alt="Predict Interface" width="90%">
170
+ <br><br><br>
171
+
172
+ <b>Model Inference: Feature Extraction Output</b>
173
+ <br>
174
+ <i>Deep linguistic analysis and probabilistic score generation.</i>
175
+ <br><br>
176
+ <img src="screenshots/05_analysis_output.png" alt="Analysis Output" width="90%">
177
+ <br><br><br>
178
+
179
+ <b>Statistical Output: Final Sentiment Classification</b>
180
+ <br>
181
+ <i>Categorized classification results with immediate visual feedback.</i>
182
+ <br><br>
183
+ <img src="screenshots/06_result_prediction.png" alt="Result Prediction" width="90%">
184
+ </div>
185
+
186
+ ---
187
+
188
+ <!-- QUICK START -->
189
+ <a name="quick-start"></a>
190
+ ## Quick Start
191
+
192
+ ### 1. Prerequisites
193
+ - **Python 3.11+**: Required for runtime execution. [Download Python](https://www.python.org/downloads/)
194
+ - **Git**: For version control and cloning. [Download Git](https://git-scm.com/downloads)
195
+
196
+ > [!WARNING]
197
+ > **Data Acquisition & Memory Constraints**
198
+ >
199
+ > The linguistic pipeline relies on the **en_core_web_lg** transformer model, which requires an initial download of approximately **800MB**. Ensure a stable network connection during setup. Additionally, loading this model into memory requires at least **2GB of available RAM** to prevent swapping and ensure low-latency inference.
200
+
201
+ ### 2. Installation & Setup
202
+
203
+ #### Step 1: Clone the Repository
204
+ Open your terminal and clone the repository:
205
+ ```bash
206
+ git clone https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS.git
207
+ cd DEPRESSION-DETECTION-USING-TWEETS
208
+ ```
209
+
210
+ #### Step 2: Configure Virtual Environment
211
+ Prepare an isolated environment to manage dependencies:
212
+
213
+ **Windows (Command Prompt / PowerShell):**
214
+ ```bash
215
+ python -m venv venv
216
+ venv\Scripts\activate
217
+ ```
218
+
219
+ **macOS / Linux (Terminal):**
220
+ ```bash
221
+ python3 -m venv venv
222
+ source venv/bin/activate
223
+ ```
224
+
225
+ #### Step 3: Install Core Dependencies
226
+ Ensure your environment is active, then install the required libraries:
227
+ ```bash
228
+ pip install -r source_code/requirements.txt
229
+ ```
230
+
231
+ #### Step 4: Linguistic Model Acquisition
232
+ Download the large-scale linguistic model required for analysis (approx. 800MB):
233
+ ```bash
234
+ python -m spacy download en_core_web_lg
235
+ ```
236
+
237
+ ### 3. Execution
238
+ Launch the sentiment analysis dashboard:
239
+
240
+ ```bash
241
+ python source_code/app.py
242
+ ```
243
+
244
+ ---
245
+
246
+ <!-- USAGE GUIDELINES -->
247
+ <a name="usage-guidelines"></a>
248
+ ## Usage Guidelines
249
+
250
+ This repository is openly shared to support learning and knowledge exchange across the academic community.
251
+
252
+ **For Students**
253
+ Use this project as reference material for understanding **Support Vector Machines (SVM)**, **spaCy NLP pipelines**, and **sentiment analysis within the context of mental health monitoring**. The source code is available for study to facilitate self-paced learning and exploration of **high-dimensional feature extraction and model serving via Flask**.
254
+
255
+ **For Educators**
256
+ This project may serve as a practical lab example or supplementary teaching resource for **Data Science**, **Natural Language Processing**, and **Machine Learning** courses. Attribution is appreciated when utilizing content.
257
+
258
+ **For Researchers**
259
+ The documentation and architectural approach may provide insights into **academic project structuring**, **psychological linguistic modeling**, and **algorithmic deployment**.
260
+
261
+ ---
262
+
263
+ <!-- LICENSE -->
264
+ <a name="license"></a>
265
+ ## License
266
+
267
+ This repository and all its creative and technical assets are made available under the **MIT License**. See the [LICENSE](LICENSE) file for complete terms.
268
+
269
+ > [!NOTE]
270
+ > **Summary**: You are free to share and adapt this content for any purpose, even commercially, as long as you provide appropriate attribution to the original authors.
271
+
272
+ Copyright © 2022 Amey Thakur & Mega Satish
273
+
274
+ ---
275
+
276
+ <!-- ABOUT -->
277
+ <a name="about-this-repository"></a>
278
+ ## About This Repository
279
+
280
+ **Created & Maintained by**: [Amey Thakur](https://github.com/Amey-Thakur) & [Mega Satish](https://github.com/msatmod)
281
+
282
+ This project features **Depression Detection**, a high-performance sentiment analysis system. It represents a personal exploration into **Python**-based machine learning and interactive web-service architecture.
283
+
284
+ **Connect:** [GitHub](https://github.com/Amey-Thakur) &nbsp;·&nbsp; [LinkedIn](https://www.linkedin.com/in/amey-thakur) &nbsp;·&nbsp; [ORCID](https://orcid.org/0000-0001-5644-1575)
285
+
286
+ ### Acknowledgments
287
+
288
+ Grateful acknowledgment to [**Mega Satish**](https://github.com/msatmod) for her exceptional collaboration and scholarly partnership during the development of this machine learning project. Her constant support, technical clarity, and dedication to software quality were instrumental in achieving the system's functional objectives. Learning alongside her was a transformative experience; her thoughtful approach to problem-solving and steady encouragement turned complex requirements into meaningful learning moments. This work reflects the growth and insights gained from our side-by-side academic journey. Thank you, Mega, for everything you shared and taught along the way.
289
+
290
+ Special thanks to the **mentors and peers** whose encouragement, discussions, and support contributed meaningfully to this learning experience.
291
+
292
+ ---
293
+
294
+ <div align="center">
295
+
296
+ [↑ Back to Top](#readme-top)
297
+
298
+ [Authors](#authors) &nbsp;·&nbsp; [Overview](#overview) &nbsp;·&nbsp; [Features](#features) &nbsp;·&nbsp; [Structure](#project-structure) &nbsp;·&nbsp; [Results](#results) &nbsp;·&nbsp; [Quick Start](#quick-start) &nbsp;·&nbsp; [Usage Guidelines](#usage-guidelines) &nbsp;·&nbsp; [License](#license) &nbsp;·&nbsp; [About](#about-this-repository) &nbsp;·&nbsp; [Acknowledgments](#acknowledgments)
299
+
300
+ <br>
301
+
302
+ 🧠 **[DEPRESSION-DETECTION](https://huggingface.co/spaces/ameythakur/Depression-Detection-Using-Tweets)**
303
+
304
+ ---
305
+
306
+ ### 🎓 [Computer Engineering Repository](https://github.com/Amey-Thakur/COMPUTER-ENGINEERING)
307
+
308
+ **Computer Engineering (B.E.)**
309
+
310
+ *Semester-wise curriculum, laboratories, projects, and academic notes.*
311
+
312
  </div>
source_code/app.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# ==============================================================================
# PROJECT: DEPRESSION-DETECTION-USING-TWEETS
# AUTHORS: AMEY THAKUR & MEGA SATISH
# GITHUB (AMEY): https://github.com/Amey-Thakur
# GITHUB (MEGA): https://github.com/msatmod
# REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
# RELEASE DATE: June 5, 2022
# LICENSE: MIT License
# DESCRIPTION: Flask application entry point for the tweet analysis project.
# ==============================================================================
# NOTE(review): the shebang was moved to line 1 -- the OS only honours it when
# it is the very first line of the file. The unused `pickle` import was removed.

from flask import Flask, request, render_template
from flask_bootstrap import Bootstrap

import app_utilities

# Initialize the Flask application.
# Flask-Bootstrap is utilized for enhanced UI styling consistency.
app = Flask(__name__)
Bootstrap(app)


@app.route('/')
def index():
    """Render the landing page where the user submits a tweet."""
    return render_template('index.html')


@app.route('/predict', methods=['POST'])
def predict():
    """
    Handle the form submission and display the prediction result.

    The route only accepts POST (declared in the decorator), so no explicit
    method check is needed inside the view.

    Returns:
        Rendered result HTML with the model's prediction outcome
        (1 = depressive, 0 = non-depressive).
    """
    # Retrieve the tweet content submitted via the web interface.
    tweet = request.form["tweet"]

    # FIX: the original code wrapped the tweet in a list and then called
    # str() on it, so the classifier received "['<tweet>']" -- including
    # the literal brackets and quotes -- instead of the tweet text itself.
    # Pass the raw tweet string straight to the prediction utility
    # (an SVM classifier over spaCy word embeddings).
    my_prediction = app_utilities.tweet_prediction(tweet)

    return render_template("result.html", prediction=my_prediction, name=tweet)


@app.errorhandler(404)
def page_not_found(e):
    """
    Custom 404 error handler.

    Renders the personalized 404 page when a resource is not found.
    """
    return render_template('404.html'), 404


# Entry point for the Flask development server.
if __name__ == '__main__':
    # Execution on port 7860 as required for Hugging Face Spaces.
    app.run(host='0.0.0.0', port=7860)
source_code/app_utilities.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# ==============================================================================
# PROJECT: DEPRESSION-DETECTION-USING-TWEETS
# AUTHORS: AMEY THAKUR & MEGA SATISH
# GITHUB (AMEY): https://github.com/Amey-Thakur
# GITHUB (MEGA): https://github.com/msatmod
# REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
# RELEASE DATE: June 5, 2022
# LICENSE: MIT License
# DESCRIPTION: Utility module for tweet analysis predictions.
# ==============================================================================

import sys
import pickle
import warnings

import numpy as np
import en_core_web_lg

# Configure sys.path to permit localized module discovery within the core directory.
sys.path.append('./core')

import clean_utilities as CU

# Suppression of non-critical runtime warnings to maintain a clean console log.
warnings.filterwarnings("ignore")

# Lazily-initialized singletons. The original implementation reloaded the large
# spaCy pipeline AND re-unpickled the SVM on every single prediction request,
# which dominated the request latency. Both artifacts are immutable, so they
# are loaded once on first use and then reused.
_NLP_ENGINE = None
_CLASSIFIER = None
_MODEL_PATH = "./assets/models/model_svm1.pkl"


def _get_nlp_engine():
    """Load the spaCy large English pipeline once and cache it."""
    global _NLP_ENGINE
    if _NLP_ENGINE is None:
        _NLP_ENGINE = en_core_web_lg.load()
    return _NLP_ENGINE


def _get_classifier():
    """Unpickle the pre-trained SVM classifier once and cache it."""
    global _CLASSIFIER
    if _CLASSIFIER is None:
        # NOTE(review): pickle.load is only safe because the artifact ships
        # with the repository; never unpickle untrusted uploads.
        with open(_MODEL_PATH, 'rb') as model_file:
            _CLASSIFIER = pickle.load(model_file)
    return _CLASSIFIER


def tweet_prediction(tweet: str) -> int:
    """
    Classify a tweet as depressive (1) or non-depressive (0).

    The process:
        1. Clean the text using the shared cleaning utilities.
        2. Embed the cleaned text as the 300-d centroid of its spaCy
           token word vectors.
        3. Classify the embedding with the pre-trained SVM.

    Args:
        tweet (str): The tweet text from the user.

    Returns:
        int: 1 for depressive, 0 for non-depressive.
    """
    # Step 1: clean the text.
    cleaned = CU.tweets_cleaner(tweet)

    # Step 2: compute the centroid (mean) of the token word vectors.
    # (The original multiplied by np.ones((300)) -- a numeric no-op -- and
    # would average an EMPTY array when cleaning stripped every token,
    # feeding NaNs to the classifier. Guard with a zero vector instead.)
    doc = _get_nlp_engine()(cleaned)
    if len(doc) > 0:
        centroid = np.array([token.vector for token in doc]).mean(axis=0)
    else:
        centroid = np.zeros(300)

    # scikit-learn estimators expect a 2-D (n_samples, n_features) array.
    features = centroid.reshape(1, -1)

    # Step 3: perform binary classification with the cached SVM.
    prediction_result = _get_classifier().predict(features)
    return int(prediction_result[0])
source_code/assets/data/external/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ## External dataset:
2
+
3
+ We need two types of datasets: one with tweets exhibiting depressive characteristics, obtained from the Twitter API, and another with random tweets, available in one of the [Kaggle datasets](https://www.kaggle.com/ywang311/twitter-sentiment/data).
source_code/assets/data/processed_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
source_code/assets/data/scrapped/depressive_tweets.csv ADDED
The diff for this file is too large to render. See raw diff
 
source_code/assets/img/ROC_Precision_LR.png ADDED

Git LFS Details

  • SHA256: a68b8402011ad790ad1067ab5ce45ca04c6dd9ff616195f47752ea0293a018b5
  • Pointer size: 130 Bytes
  • Size of remote file: 27.2 kB
source_code/assets/img/ROC_Precision_SVM.png ADDED

Git LFS Details

  • SHA256: 8bc69c917379a995c30217d6161b4c4271b889159926ce25ab2af4af84725ca4
  • Pointer size: 131 Bytes
  • Size of remote file: 174 kB
source_code/assets/img/app.png ADDED

Git LFS Details

  • SHA256: dfc9dba44a38d3c3abb8ce7b29760768c057c5de6e05daa1864044dd9ddd18d0
  • Pointer size: 130 Bytes
  • Size of remote file: 89.7 kB
source_code/assets/img/depression.png ADDED

Git LFS Details

  • SHA256: 2ac8ddc806ef0247726a5505c758243e475278d2093c5f56cbf70cd03d746076
  • Pointer size: 130 Bytes
  • Size of remote file: 73.7 kB
source_code/assets/img/logo.jpeg ADDED

Git LFS Details

  • SHA256: c7336d7d92c75bf17bd1e699480b0eadb8d543744ab800342cd6ec5912dc86a0
  • Pointer size: 130 Bytes
  • Size of remote file: 64.9 kB
source_code/assets/img/loss_accuracy_LSTM.png ADDED

Git LFS Details

  • SHA256: 6dfb07f282e671377e8de29ff013522816de8459fb6658b04e31bdfbdca8f22c
  • Pointer size: 130 Bytes
  • Size of remote file: 31.6 kB
source_code/assets/img/models_comparison.png ADDED

Git LFS Details

  • SHA256: 2887ff4a8edcd961ad901180341d486f70d17d171e401526d9fcc55b189d232a
  • Pointer size: 130 Bytes
  • Size of remote file: 97.6 kB
source_code/assets/img/wordcloud_depressive.png ADDED

Git LFS Details

  • SHA256: 0eda6550e1a7ff294dd4c58a6d0fb298a34cdaaf4780e83bd0e9bca7d5718acc
  • Pointer size: 131 Bytes
  • Size of remote file: 546 kB
source_code/assets/img/wordcloud_random.png ADDED

Git LFS Details

  • SHA256: 6c89c3635c9d96fb67b893873d60e24c562ca59eecaa783bb1986bb47197a52f
  • Pointer size: 131 Bytes
  • Size of remote file: 375 kB
source_code/assets/models/model_LSTM.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b53fb9941244f6f5ad410246b5606c222763d55462b66b0be08d1845c3dd8574
3
+ size 81
source_code/assets/models/model_LogReg.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc7de925c22a478f8168b698570b9775b28613847b0dfa998fe972a7c9273a0e
3
+ size 197693
source_code/assets/models/model_svm.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db18cfbfd954728d2091154eda313a9c1ffb0f4a1778e878985c055181b17170
3
+ size 24450690
source_code/assets/models/model_svm1.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84c846534b54571c35b0d34bc73d948a0c230e9047ce94df3796750c14e18351
3
+ size 24450707
source_code/assets/notebooks/data_cleaning_exploration.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
source_code/assets/notebooks/data_gathering_twint.ipynb ADDED
@@ -0,0 +1,483 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "name": "Data_Gathering_Twint.ipynb",
7
+ "provenance": [],
8
+ "collapsed_sections": []
9
+ },
10
+ "kernelspec": {
11
+ "name": "python3",
12
+ "display_name": "Python 3"
13
+ },
14
+ "language_info": {
15
+ "name": "python"
16
+ }
17
+ },
18
+ "cells": [
19
+ {
20
+ "cell_type": "code",
21
+ "metadata": {
22
+ "colab": {
23
+ "base_uri": "https://localhost:8080/"
24
+ },
25
+ "id": "QgxmLN_lyiCS",
26
+ "outputId": "58d94201-559a-45fc-8e85-3dc7b73968fd"
27
+ },
28
+ "source": [
29
+ "from google.colab import drive\n",
30
+ "drive.mount('/content/drive')"
31
+ ],
32
+ "execution_count": null,
33
+ "outputs": [
34
+ {
35
+ "output_type": "stream",
36
+ "name": "stdout",
37
+ "text": [
38
+ "Mounted at /content/drive\n"
39
+ ]
40
+ }
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "metadata": {
46
+ "colab": {
47
+ "base_uri": "https://localhost:8080/"
48
+ },
49
+ "id": "-nI0XtuNwnmQ",
50
+ "outputId": "66c8ac16-109f-4ba6-e556-574091755cfc"
51
+ },
52
+ "source": [
53
+ "!git clone https://github.com/twintproject/twint.git"
54
+ ],
55
+ "execution_count": null,
56
+ "outputs": [
57
+ {
58
+ "output_type": "stream",
59
+ "name": "stdout",
60
+ "text": [
61
+ "Cloning into 'twint'...\n",
62
+ "remote: Enumerating objects: 4457, done.\u001b[K\n",
63
+ "remote: Counting objects: 100% (4/4), done.\u001b[K\n",
64
+ "remote: Compressing objects: 100% (4/4), done.\u001b[K\n",
65
+ "remote: Total 4457 (delta 0), reused 2 (delta 0), pack-reused 4453\u001b[K\n",
66
+ "Receiving objects: 100% (4457/4457), 4.47 MiB | 13.40 MiB/s, done.\n",
67
+ "Resolving deltas: 100% (2634/2634), done.\n"
68
+ ]
69
+ }
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "metadata": {
75
+ "id": "DcLRNsvGya2i"
76
+ },
77
+ "source": [
78
+ "import os\n",
79
+ "os.chdir(\"/content/twint\")"
80
+ ],
81
+ "execution_count": null,
82
+ "outputs": []
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "metadata": {
87
+ "id": "XV0Tp_SQydvh"
88
+ },
89
+ "source": [
90
+ "!pip freeze > requirements.txt"
91
+ ],
92
+ "execution_count": null,
93
+ "outputs": []
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "metadata": {
98
+ "colab": {
99
+ "base_uri": "https://localhost:8080/"
100
+ },
101
+ "id": "84dyXWmLyrsn",
102
+ "outputId": "57188228-60e3-4a80-b3b9-737362a81227"
103
+ },
104
+ "source": [
105
+ "!pip install ."
106
+ ],
107
+ "execution_count": null,
108
+ "outputs": [
109
+ {
110
+ "output_type": "stream",
111
+ "name": "stdout",
112
+ "text": [
113
+ "Processing /content/twint\n",
114
+ "\u001b[33m DEPRECATION: A future pip version will change local packages to be built in-place without first copying to a temporary directory. We recommend you use --use-feature=in-tree-build to test your packages with this new behavior before it becomes the default.\n",
115
+ " pip 21.3 will remove support for this functionality. You can find discussion regarding this at https://github.com/pypa/pip/issues/7555.\u001b[0m\n",
116
+ "Collecting aiohttp\n",
117
+ " Downloading aiohttp-3.7.4.post0-cp37-cp37m-manylinux2014_x86_64.whl (1.3 MB)\n",
118
+ "\u001b[K |████████████████████████████████| 1.3 MB 7.3 MB/s \n",
119
+ "\u001b[?25hCollecting aiodns\n",
120
+ " Downloading aiodns-3.0.0-py3-none-any.whl (5.0 kB)\n",
121
+ "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (4.6.3)\n",
122
+ "Collecting cchardet\n",
123
+ " Downloading cchardet-2.1.7-cp37-cp37m-manylinux2010_x86_64.whl (263 kB)\n",
124
+ "\u001b[K |████████████████████████████████| 263 kB 47.9 MB/s \n",
125
+ "\u001b[?25hCollecting dataclasses\n",
126
+ " Downloading dataclasses-0.6-py3-none-any.whl (14 kB)\n",
127
+ "Collecting elasticsearch\n",
128
+ " Downloading elasticsearch-7.15.1-py2.py3-none-any.whl (378 kB)\n",
129
+ "\u001b[K |████████████████████████████████| 378 kB 70.9 MB/s \n",
130
+ "\u001b[?25hRequirement already satisfied: pysocks in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (1.7.1)\n",
131
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (1.1.5)\n",
132
+ "Collecting aiohttp_socks\n",
133
+ " Downloading aiohttp_socks-0.6.0-py3-none-any.whl (9.2 kB)\n",
134
+ "Collecting schedule\n",
135
+ " Downloading schedule-1.1.0-py2.py3-none-any.whl (10 kB)\n",
136
+ "Requirement already satisfied: geopy in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (1.17.0)\n",
137
+ "Collecting fake-useragent\n",
138
+ " Downloading fake-useragent-0.1.11.tar.gz (13 kB)\n",
139
+ "Collecting googletransx\n",
140
+ " Downloading googletransx-2.4.2.tar.gz (13 kB)\n",
141
+ "Collecting pycares>=4.0.0\n",
142
+ " Downloading pycares-4.0.0-cp37-cp37m-manylinux2010_x86_64.whl (291 kB)\n",
143
+ "\u001b[K |████████████████████████████████| 291 kB 59.7 MB/s \n",
144
+ "\u001b[?25hRequirement already satisfied: cffi>=1.5.0 in /usr/local/lib/python3.7/dist-packages (from pycares>=4.0.0->aiodns->twint==2.1.21) (1.14.6)\n",
145
+ "Requirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi>=1.5.0->pycares>=4.0.0->aiodns->twint==2.1.21) (2.20)\n",
146
+ "Collecting multidict<7.0,>=4.5\n",
147
+ " Downloading multidict-5.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (160 kB)\n",
148
+ "\u001b[K |████████████████████████████████| 160 kB 67.8 MB/s \n",
149
+ "\u001b[?25hRequirement already satisfied: typing-extensions>=3.6.5 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint==2.1.21) (3.7.4.3)\n",
150
+ "Requirement already satisfied: chardet<5.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint==2.1.21) (3.0.4)\n",
151
+ "Collecting yarl<2.0,>=1.0\n",
152
+ " Downloading yarl-1.7.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)\n",
153
+ "\u001b[K |████████████████████████████████| 271 kB 65.8 MB/s \n",
154
+ "\u001b[?25hCollecting async-timeout<4.0,>=3.0\n",
155
+ " Downloading async_timeout-3.0.1-py3-none-any.whl (8.2 kB)\n",
156
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint==2.1.21) (21.2.0)\n",
157
+ "Requirement already satisfied: idna>=2.0 in /usr/local/lib/python3.7/dist-packages (from yarl<2.0,>=1.0->aiohttp->twint==2.1.21) (2.10)\n",
158
+ "Collecting python-socks[asyncio]>=1.2.2\n",
159
+ " Downloading python_socks-1.2.4-py3-none-any.whl (35 kB)\n",
160
+ "Requirement already satisfied: urllib3<2,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from elasticsearch->twint==2.1.21) (1.24.3)\n",
161
+ "Requirement already satisfied: certifi in /usr/local/lib/python3.7/dist-packages (from elasticsearch->twint==2.1.21) (2021.5.30)\n",
162
+ "Requirement already satisfied: geographiclib<2,>=1.49 in /usr/local/lib/python3.7/dist-packages (from geopy->twint==2.1.21) (1.52)\n",
163
+ "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from googletransx->twint==2.1.21) (2.23.0)\n",
164
+ "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->twint==2.1.21) (2018.9)\n",
165
+ "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->twint==2.1.21) (2.8.2)\n",
166
+ "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas->twint==2.1.21) (1.19.5)\n",
167
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->twint==2.1.21) (1.15.0)\n",
168
+ "Building wheels for collected packages: twint, fake-useragent, googletransx\n",
169
+ " Building wheel for twint (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
170
+ " Created wheel for twint: filename=twint-2.1.21-py3-none-any.whl size=38870 sha256=a648841e8abdbeafa3718d69334377eef20411c64b11553f70be9937a84be56a\n",
171
+ " Stored in directory: /tmp/pip-ephem-wheel-cache-oe_ws1ie/wheels/f7/3e/11/2803f3c6890e87a9bec35bb8e37ef1ad0777a00f43e2441fb1\n",
172
+ " Building wheel for fake-useragent (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
173
+ " Created wheel for fake-useragent: filename=fake_useragent-0.1.11-py3-none-any.whl size=13502 sha256=db8ea0f861a9b913fb4822f3cbfb76d4ff27a371f7c125657f7cbb17766fc316\n",
174
+ " Stored in directory: /root/.cache/pip/wheels/ed/f7/62/50ab6c9a0b5567267ab76a9daa9d06315704209b2c5d032031\n",
175
+ " Building wheel for googletransx (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
176
+ " Created wheel for googletransx: filename=googletransx-2.4.2-py3-none-any.whl size=15968 sha256=513c5ee44dad1d794a351939b43e11cd2a5d2fbe541ad2a9271d661c96e29221\n",
177
+ " Stored in directory: /root/.cache/pip/wheels/66/d5/b1/31104b338f7fd45aa8f7d22587765db06773b13df48a89735f\n",
178
+ "Successfully built twint fake-useragent googletransx\n",
179
+ "Installing collected packages: multidict, yarl, python-socks, async-timeout, pycares, aiohttp, schedule, googletransx, fake-useragent, elasticsearch, dataclasses, cchardet, aiohttp-socks, aiodns, twint\n",
180
+ "Successfully installed aiodns-3.0.0 aiohttp-3.7.4.post0 aiohttp-socks-0.6.0 async-timeout-3.0.1 cchardet-2.1.7 dataclasses-0.6 elasticsearch-7.15.1 fake-useragent-0.1.11 googletransx-2.4.2 multidict-5.2.0 pycares-4.0.0 python-socks-1.2.4 schedule-1.1.0 twint-2.1.21 yarl-1.7.0\n"
181
+ ]
182
+ }
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "metadata": {
188
+ "colab": {
189
+ "base_uri": "https://localhost:8080/"
190
+ },
191
+ "id": "b8rOKGVQ8OBq",
192
+ "outputId": "499b3126-f2b4-4799-f265-5065322a0146"
193
+ },
194
+ "source": [
195
+ "!pip install -U git+https://github.com/cyxv/twint.git@master"
196
+ ],
197
+ "execution_count": null,
198
+ "outputs": [
199
+ {
200
+ "output_type": "stream",
201
+ "name": "stdout",
202
+ "text": [
203
+ "Collecting git+https://github.com/cyxv/twint.git@master\n",
204
+ " Cloning https://github.com/cyxv/twint.git (to revision master) to /tmp/pip-req-build-bjyd0ng2\n",
205
+ " Running command git clone -q https://github.com/cyxv/twint.git /tmp/pip-req-build-bjyd0ng2\n",
206
+ "Requirement already satisfied: aiohttp in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (3.7.4.post0)\n",
207
+ "Requirement already satisfied: aiodns in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (3.0.0)\n",
208
+ "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (4.6.3)\n",
209
+ "Requirement already satisfied: cchardet in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (2.1.7)\n",
210
+ "Requirement already satisfied: dataclasses in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (0.6)\n",
211
+ "Requirement already satisfied: elasticsearch in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (7.15.1)\n",
212
+ "Requirement already satisfied: pysocks in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (1.7.1)\n",
213
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (1.1.5)\n",
214
+ "Requirement already satisfied: aiohttp_socks in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (0.6.0)\n",
215
+ "Requirement already satisfied: schedule in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (1.1.0)\n",
216
+ "Requirement already satisfied: geopy in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (1.17.0)\n",
217
+ "Requirement already satisfied: fake-useragent in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (0.1.11)\n",
218
+ "Requirement already satisfied: googletransx in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (2.4.2)\n",
219
+ "Requirement already satisfied: pycares>=4.0.0 in /usr/local/lib/python3.7/dist-packages (from aiodns->twint==2.1.21) (4.0.0)\n",
220
+ "Requirement already satisfied: cffi>=1.5.0 in /usr/local/lib/python3.7/dist-packages (from pycares>=4.0.0->aiodns->twint==2.1.21) (1.14.6)\n",
221
+ "Requirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi>=1.5.0->pycares>=4.0.0->aiodns->twint==2.1.21) (2.20)\n",
222
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint==2.1.21) (1.7.0)\n",
223
+ "Requirement already satisfied: chardet<5.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint==2.1.21) (3.0.4)\n",
224
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint==2.1.21) (5.2.0)\n",
225
+ "Requirement already satisfied: async-timeout<4.0,>=3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint==2.1.21) (3.0.1)\n",
226
+ "Requirement already satisfied: typing-extensions>=3.6.5 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint==2.1.21) (3.7.4.3)\n",
227
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint==2.1.21) (21.2.0)\n",
228
+ "Requirement already satisfied: idna>=2.0 in /usr/local/lib/python3.7/dist-packages (from yarl<2.0,>=1.0->aiohttp->twint==2.1.21) (2.10)\n",
229
+ "Requirement already satisfied: python-socks[asyncio]>=1.2.2 in /usr/local/lib/python3.7/dist-packages (from aiohttp_socks->twint==2.1.21) (1.2.4)\n",
230
+ "Requirement already satisfied: urllib3<2,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from elasticsearch->twint==2.1.21) (1.24.3)\n",
231
+ "Requirement already satisfied: certifi in /usr/local/lib/python3.7/dist-packages (from elasticsearch->twint==2.1.21) (2021.5.30)\n",
232
+ "Requirement already satisfied: geographiclib<2,>=1.49 in /usr/local/lib/python3.7/dist-packages (from geopy->twint==2.1.21) (1.52)\n",
233
+ "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from googletransx->twint==2.1.21) (2.23.0)\n",
234
+ "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->twint==2.1.21) (2018.9)\n",
235
+ "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas->twint==2.1.21) (1.19.5)\n",
236
+ "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->twint==2.1.21) (2.8.2)\n",
237
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->twint==2.1.21) (1.15.0)\n"
238
+ ]
239
+ }
240
+ ]
241
+ },
242
+ {
243
+ "cell_type": "code",
244
+ "metadata": {
245
+ "colab": {
246
+ "base_uri": "https://localhost:8080/"
247
+ },
248
+ "id": "PKbYSKBJy2Ow",
249
+ "outputId": "8f24189f-56ec-41d5-f8c0-475ffb3b1bc2"
250
+ },
251
+ "source": [
252
+ "!pip install nest_asyncio"
253
+ ],
254
+ "execution_count": null,
255
+ "outputs": [
256
+ {
257
+ "output_type": "stream",
258
+ "name": "stdout",
259
+ "text": [
260
+ "Requirement already satisfied: nest_asyncio in /usr/local/lib/python3.7/dist-packages (1.5.1)\n"
261
+ ]
262
+ }
263
+ ]
264
+ },
265
+ {
266
+ "cell_type": "code",
267
+ "metadata": {
268
+ "colab": {
269
+ "base_uri": "https://localhost:8080/"
270
+ },
271
+ "id": "Fks7CZYMy5cR",
272
+ "outputId": "180f0f46-4fca-4e7f-a134-2cf99283a6a4"
273
+ },
274
+ "source": [
275
+ "!pip3 install twint"
276
+ ],
277
+ "execution_count": null,
278
+ "outputs": [
279
+ {
280
+ "output_type": "stream",
281
+ "name": "stdout",
282
+ "text": [
283
+ "Requirement already satisfied: twint in /usr/local/lib/python3.7/dist-packages (2.1.21)\n",
284
+ "Requirement already satisfied: schedule in /usr/local/lib/python3.7/dist-packages (from twint) (1.1.0)\n",
285
+ "Requirement already satisfied: pysocks in /usr/local/lib/python3.7/dist-packages (from twint) (1.7.1)\n",
286
+ "Requirement already satisfied: cchardet in /usr/local/lib/python3.7/dist-packages (from twint) (2.1.7)\n",
287
+ "Requirement already satisfied: fake-useragent in /usr/local/lib/python3.7/dist-packages (from twint) (0.1.11)\n",
288
+ "Requirement already satisfied: elasticsearch in /usr/local/lib/python3.7/dist-packages (from twint) (7.15.1)\n",
289
+ "Requirement already satisfied: aiohttp in /usr/local/lib/python3.7/dist-packages (from twint) (3.7.4.post0)\n",
290
+ "Requirement already satisfied: googletransx in /usr/local/lib/python3.7/dist-packages (from twint) (2.4.2)\n",
291
+ "Requirement already satisfied: geopy in /usr/local/lib/python3.7/dist-packages (from twint) (1.17.0)\n",
292
+ "Requirement already satisfied: aiohttp-socks in /usr/local/lib/python3.7/dist-packages (from twint) (0.6.0)\n",
293
+ "Requirement already satisfied: aiodns in /usr/local/lib/python3.7/dist-packages (from twint) (3.0.0)\n",
294
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from twint) (1.1.5)\n",
295
+ "Requirement already satisfied: dataclasses in /usr/local/lib/python3.7/dist-packages (from twint) (0.6)\n",
296
+ "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.7/dist-packages (from twint) (4.6.3)\n",
297
+ "Requirement already satisfied: pycares>=4.0.0 in /usr/local/lib/python3.7/dist-packages (from aiodns->twint) (4.0.0)\n",
298
+ "Requirement already satisfied: cffi>=1.5.0 in /usr/local/lib/python3.7/dist-packages (from pycares>=4.0.0->aiodns->twint) (1.14.6)\n",
299
+ "Requirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi>=1.5.0->pycares>=4.0.0->aiodns->twint) (2.20)\n",
300
+ "Requirement already satisfied: chardet<5.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint) (3.0.4)\n",
301
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint) (1.7.0)\n",
302
+ "Requirement already satisfied: async-timeout<4.0,>=3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint) (3.0.1)\n",
303
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint) (21.2.0)\n",
304
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint) (5.2.0)\n",
305
+ "Requirement already satisfied: typing-extensions>=3.6.5 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint) (3.7.4.3)\n",
306
+ "Requirement already satisfied: idna>=2.0 in /usr/local/lib/python3.7/dist-packages (from yarl<2.0,>=1.0->aiohttp->twint) (2.10)\n",
307
+ "Requirement already satisfied: python-socks[asyncio]>=1.2.2 in /usr/local/lib/python3.7/dist-packages (from aiohttp-socks->twint) (1.2.4)\n",
308
+ "Requirement already satisfied: urllib3<2,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from elasticsearch->twint) (1.24.3)\n",
309
+ "Requirement already satisfied: certifi in /usr/local/lib/python3.7/dist-packages (from elasticsearch->twint) (2021.5.30)\n",
310
+ "Requirement already satisfied: geographiclib<2,>=1.49 in /usr/local/lib/python3.7/dist-packages (from geopy->twint) (1.52)\n",
311
+ "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from googletransx->twint) (2.23.0)\n",
312
+ "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->twint) (2.8.2)\n",
313
+ "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->twint) (2018.9)\n",
314
+ "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas->twint) (1.19.5)\n",
315
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->twint) (1.15.0)\n"
316
+ ]
317
+ }
318
+ ]
319
+ },
320
+ {
321
+ "cell_type": "code",
322
+ "metadata": {
323
+ "id": "qZyxEstcy8R9"
324
+ },
325
+ "source": [
326
+ "# Import required libraries\n",
327
+ "import nest_asyncio\n",
328
+ "nest_asyncio.apply()\n",
329
+ "import pandas as pd\n",
330
+ "import twint\n",
331
+ "import pandas as pd\n",
332
+ "import re"
333
+ ],
334
+ "execution_count": null,
335
+ "outputs": []
336
+ },
337
+ {
338
+ "cell_type": "code",
339
+ "metadata": {
340
+ "colab": {
341
+ "base_uri": "https://localhost:8080/",
342
+ "height": 371
343
+ },
344
+ "id": "oAkguWB20dB4",
345
+ "outputId": "e595e29d-f470-4f3c-9846-32a60f7ed2ef"
346
+ },
347
+ "source": [
348
+ "# add some tweets with depressed and depression tags, for a particular year\n",
349
+ "\n",
350
+ "depress_tags = [\"#depressed\", \"#anxiety\", \"#depression\", \"#suicide\", \"#mentalhealth\",\n",
351
+ " \"#loneliness\", \"#hopelessness\", \"#itsokaynottobeokay\"]\n",
352
+ "\n",
353
+ "content = {}\n",
354
+ "for i in range(len(depress_tags)):\n",
355
+ " print(depress_tags[i])\n",
356
+ " c = twint.Config()\n",
357
+ " \n",
358
+ " c.Format = \"Tweet id: {id} | Tweet: {tweet}\"\n",
359
+ " c.Search = depress_tags[i]\n",
360
+ " c.Limit = 1000\n",
361
+ " c.Year = 2019\n",
362
+ " c.Lang = \"en\"\n",
363
+ " c.Store_csv = True\n",
364
+ " c.Store_Object = True\n",
365
+ " c.Output = \"/content/drive/MyDrive/NLP/Depression_Detection/depressive_en_2019.csv\"\n",
366
+ " c.Hide_output = True\n",
367
+ " c.Stats = True\n",
368
+ " c.Lowercase = True\n",
369
+ " c.Filter_retweets = True\n",
370
+ " twint.run.Search(c)"
371
+ ],
372
+ "execution_count": null,
373
+ "outputs": [
374
+ {
375
+ "output_type": "stream",
376
+ "name": "stdout",
377
+ "text": [
378
+ "#depressed\n"
379
+ ]
380
+ },
381
+ {
382
+ "output_type": "error",
383
+ "ename": "TypeError",
384
+ "evalue": "ignored",
385
+ "traceback": [
386
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
387
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
388
+ "\u001b[0;32m<ipython-input-3-092f46e39459>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLowercase\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFilter_retweets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 23\u001b[0;31m \u001b[0mtwint\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
389
+ "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/run.py\u001b[0m in \u001b[0;36mSearch\u001b[0;34m(config, callback)\u001b[0m\n\u001b[1;32m 408\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFollowers\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 409\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mProfile\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 410\u001b[0;31m \u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 411\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPandas_au\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 412\u001b[0m \u001b[0mstorage\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpanda\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_autoget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"tweet\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
390
+ "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/run.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(config, callback)\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[0;32mraise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 328\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 329\u001b[0;31m \u001b[0mget_event_loop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_until_complete\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mTwint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcallback\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 330\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 331\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
391
+ "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/nest_asyncio.py\u001b[0m in \u001b[0;36mrun_until_complete\u001b[0;34m(self, future)\u001b[0m\n\u001b[1;32m 68\u001b[0m raise RuntimeError(\n\u001b[1;32m 69\u001b[0m 'Event loop stopped before Future completed.')\n\u001b[0;32m---> 70\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 71\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_run_once\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
392
+ "\u001b[0;32m/usr/lib/python3.7/asyncio/futures.py\u001b[0m in \u001b[0;36mresult\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__log_traceback\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 180\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_exception\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 181\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_exception\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 182\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_result\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
393
+ "\u001b[0;32m/usr/lib/python3.7/asyncio/tasks.py\u001b[0m in \u001b[0;36m__step\u001b[0;34m(***failed resolving arguments***)\u001b[0m\n\u001b[1;32m 249\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcoro\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 250\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 251\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcoro\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mthrow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 252\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_must_cancel\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
394
+ "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/run.py\u001b[0m in \u001b[0;36mmain\u001b[0;34m(self, callback)\u001b[0m\n\u001b[1;32m 233\u001b[0m \u001b[0mtask\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_done_callback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcallback\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 234\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 235\u001b[0;31m \u001b[0;32mawait\u001b[0m \u001b[0mtask\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 236\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 237\u001b[0m \u001b[0;32masync\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
395
+ "\u001b[0;32m/usr/lib/python3.7/asyncio/futures.py\u001b[0m in \u001b[0;36m__await__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 261\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdone\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_asyncio_future_blocking\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 263\u001b[0;31m \u001b[0;32myield\u001b[0m \u001b[0mself\u001b[0m \u001b[0;31m# This tells Task to wait for completion.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 264\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdone\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 265\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"await wasn't used with future\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
396
+ "\u001b[0;32m/usr/lib/python3.7/asyncio/tasks.py\u001b[0m in \u001b[0;36m__wakeup\u001b[0;34m(self, future)\u001b[0m\n\u001b[1;32m 316\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__wakeup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfuture\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 318\u001b[0;31m \u001b[0mfuture\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 319\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 320\u001b[0m \u001b[0;31m# This may also be a cancellation.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
397
+ "\u001b[0;32m/usr/lib/python3.7/asyncio/futures.py\u001b[0m in \u001b[0;36mresult\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__log_traceback\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 180\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_exception\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 181\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_exception\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 182\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_result\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
398
+ "\u001b[0;32m/usr/lib/python3.7/asyncio/tasks.py\u001b[0m in \u001b[0;36m__step\u001b[0;34m(***failed resolving arguments***)\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;31m# We use the `send` method directly, because coroutines\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 248\u001b[0m \u001b[0;31m# don't have `__iter__` and `__next__` methods.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 249\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcoro\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 250\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 251\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcoro\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mthrow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
399
+ "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/run.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 284\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTwitterSearch\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 285\u001b[0m \u001b[0mlogme\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m':Twint:main:twitter-search'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 286\u001b[0;31m \u001b[0;32mawait\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtweets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 287\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 288\u001b[0m \u001b[0mlogme\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m':Twint:main:no-more-tweets'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
400
+ "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/run.py\u001b[0m in \u001b[0;36mtweets\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 224\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtweet\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeed\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 225\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 226\u001b[0;31m \u001b[0;32mawait\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTweets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtweet\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 227\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 228\u001b[0m \u001b[0;32masync\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
401
+ "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/output.py\u001b[0m in \u001b[0;36mTweets\u001b[0;34m(tweets, config, conn)\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTwitterSearch\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mProfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 165\u001b[0m \u001b[0mlogme\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m':Tweets:TwitterSearch'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 166\u001b[0;31m \u001b[0;32mawait\u001b[0m \u001b[0mcheckData\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtweets\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 167\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[0mlogme\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m':Tweets:else'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
402
+ "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/output.py\u001b[0m in \u001b[0;36mcheckData\u001b[0;34m(tweet, config, conn)\u001b[0m\n\u001b[1;32m 135\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 136\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdatecheck\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtweet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatestamp\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\" \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtweet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtimestamp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 137\u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTweet\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtweet\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 138\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDatabase\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 139\u001b[0m \u001b[0mlogme\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m':checkData:Database'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
403
+ "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/format.py\u001b[0m in \u001b[0;36mTweet\u001b[0;34m(config, t)\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{hashtags}\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\",\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhashtags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{cashtags}\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\",\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcashtags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 23\u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{replies}\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplies_count\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 24\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{retweets}\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretweets_count\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{likes}\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlikes_count\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
404
+ "\u001b[0;31mTypeError\u001b[0m: replace() argument 2 must be str, not int"
405
+ ]
406
+ }
407
+ ]
408
+ },
409
+ {
410
+ "cell_type": "code",
411
+ "metadata": {
412
+ "colab": {
413
+ "base_uri": "https://localhost:8080/",
414
+ "height": 424
415
+ },
416
+ "id": "g4zJVi7sy_b2",
417
+ "outputId": "71712dc1-cfeb-4294-f883-86fc6ce82984"
418
+ },
419
+ "source": [
420
+ "# add some tweets with depressed and depression tags, for a particular year\n",
421
+ "\n",
422
+ "depress_tags = [\"#depressed\", \"#depression\", \"#loneliness\", \"#hopelessness\"]\n",
423
+ "\n",
424
+ "content = {}\n",
425
+ "for i in range(len(depress_tags)):\n",
426
+ " print(depress_tags[i])\n",
427
+ " c = twint.Config()\n",
428
+ " \n",
429
+ " c.Format = \"Tweet id: {id} | Tweet: {tweet}\"\n",
430
+ " c.Search = depress_tags[i]\n",
431
+ " c.Limit = 1000\n",
432
+ " c.Year = 2020\n",
433
+ " c.Store_csv = True\n",
434
+ " c.Store_json = True\n",
435
+ " c.Output = \"/content/drive/MyDrive/NLP/Depression_Detection/dataset_depression.json\"\n",
436
+ " c.Hide_output = True\n",
437
+ " c.Stats = True\n",
438
+ " c.Lowercase = True\n",
439
+ " c.Filter_retweets = True\n",
440
+ " twint.run.Search(c)"
441
+ ],
442
+ "execution_count": null,
443
+ "outputs": [
444
+ {
445
+ "output_type": "stream",
446
+ "name": "stdout",
447
+ "text": [
448
+ "#depressed\n",
449
+ "[!] No more data! Scraping will stop now.\n",
450
+ "found 0 deleted tweets in this search.\n",
451
+ "#depression\n"
452
+ ]
453
+ },
454
+ {
455
+ "output_type": "error",
456
+ "ename": "TypeError",
457
+ "evalue": "ignored",
458
+ "traceback": [
459
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
460
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
461
+ "\u001b[0;32m<ipython-input-6-d584c0441bfc>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLowercase\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFilter_retweets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 21\u001b[0;31m \u001b[0mtwint\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
462
+ "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/run.py\u001b[0m in \u001b[0;36mSearch\u001b[0;34m(config, callback)\u001b[0m\n\u001b[1;32m 408\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFollowers\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 409\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mProfile\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 410\u001b[0;31m \u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 411\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPandas_au\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 412\u001b[0m \u001b[0mstorage\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpanda\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_autoget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"tweet\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
463
+ "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/run.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(config, callback)\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[0;32mraise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 328\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 329\u001b[0;31m \u001b[0mget_event_loop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_until_complete\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mTwint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcallback\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 330\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 331\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
464
+ "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/nest_asyncio.py\u001b[0m in \u001b[0;36mrun_until_complete\u001b[0;34m(self, future)\u001b[0m\n\u001b[1;32m 68\u001b[0m raise RuntimeError(\n\u001b[1;32m 69\u001b[0m 'Event loop stopped before Future completed.')\n\u001b[0;32m---> 70\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 71\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_run_once\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
465
+ "\u001b[0;32m/usr/lib/python3.7/asyncio/futures.py\u001b[0m in \u001b[0;36mresult\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__log_traceback\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 180\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_exception\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 181\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_exception\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 182\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_result\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
466
+ "\u001b[0;32m/usr/lib/python3.7/asyncio/tasks.py\u001b[0m in \u001b[0;36m__step\u001b[0;34m(***failed resolving arguments***)\u001b[0m\n\u001b[1;32m 249\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcoro\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 250\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 251\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcoro\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mthrow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 252\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_must_cancel\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
467
+ "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/run.py\u001b[0m in \u001b[0;36mmain\u001b[0;34m(self, callback)\u001b[0m\n\u001b[1;32m 233\u001b[0m \u001b[0mtask\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_done_callback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcallback\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 234\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 235\u001b[0;31m \u001b[0;32mawait\u001b[0m \u001b[0mtask\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 236\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 237\u001b[0m \u001b[0;32masync\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
468
+ "\u001b[0;32m/usr/lib/python3.7/asyncio/futures.py\u001b[0m in \u001b[0;36m__await__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 261\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdone\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_asyncio_future_blocking\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 263\u001b[0;31m \u001b[0;32myield\u001b[0m \u001b[0mself\u001b[0m \u001b[0;31m# This tells Task to wait for completion.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 264\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdone\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 265\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"await wasn't used with future\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
469
+ "\u001b[0;32m/usr/lib/python3.7/asyncio/tasks.py\u001b[0m in \u001b[0;36m__wakeup\u001b[0;34m(self, future)\u001b[0m\n\u001b[1;32m 316\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__wakeup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfuture\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 318\u001b[0;31m \u001b[0mfuture\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 319\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 320\u001b[0m \u001b[0;31m# This may also be a cancellation.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
470
+ "\u001b[0;32m/usr/lib/python3.7/asyncio/futures.py\u001b[0m in \u001b[0;36mresult\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__log_traceback\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 180\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_exception\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 181\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_exception\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 182\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_result\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
471
+ "\u001b[0;32m/usr/lib/python3.7/asyncio/tasks.py\u001b[0m in \u001b[0;36m__step\u001b[0;34m(***failed resolving arguments***)\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;31m# We use the `send` method directly, because coroutines\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 248\u001b[0m \u001b[0;31m# don't have `__iter__` and `__next__` methods.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 249\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcoro\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 250\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 251\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcoro\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mthrow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
472
+ "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/run.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 284\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTwitterSearch\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 285\u001b[0m \u001b[0mlogme\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m':Twint:main:twitter-search'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 286\u001b[0;31m \u001b[0;32mawait\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtweets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 287\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 288\u001b[0m \u001b[0mlogme\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m':Twint:main:no-more-tweets'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
473
+ "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/run.py\u001b[0m in \u001b[0;36mtweets\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 224\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtweet\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeed\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 225\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 226\u001b[0;31m \u001b[0;32mawait\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTweets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtweet\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 227\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 228\u001b[0m \u001b[0;32masync\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
474
+ "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/output.py\u001b[0m in \u001b[0;36mTweets\u001b[0;34m(tweets, config, conn)\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTwitterSearch\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mProfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 165\u001b[0m \u001b[0mlogme\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m':Tweets:TwitterSearch'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 166\u001b[0;31m \u001b[0;32mawait\u001b[0m \u001b[0mcheckData\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtweets\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 167\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[0mlogme\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m':Tweets:else'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
475
+ "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/output.py\u001b[0m in \u001b[0;36mcheckData\u001b[0;34m(tweet, config, conn)\u001b[0m\n\u001b[1;32m 135\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 136\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdatecheck\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtweet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatestamp\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\" \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtweet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtimestamp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 137\u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTweet\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtweet\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 138\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDatabase\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 139\u001b[0m \u001b[0mlogme\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m':checkData:Database'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
476
+ "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/format.py\u001b[0m in \u001b[0;36mTweet\u001b[0;34m(config, t)\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{hashtags}\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\",\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhashtags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{cashtags}\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\",\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcashtags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 23\u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{replies}\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplies_count\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 24\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{retweets}\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretweets_count\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{likes}\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlikes_count\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
477
+ "\u001b[0;31mTypeError\u001b[0m: replace() argument 2 must be str, not int"
478
+ ]
479
+ }
480
+ ]
481
+ }
482
+ ]
483
+ }
source_code/assets/notebooks/data_gathering_twitter_API.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
source_code/assets/notebooks/modeling.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
source_code/core/clean.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==============================================================================
2
+ # PROJECT: DEPRESSION-DETECTION-USING-TWEETS
3
+ # AUTHORS: AMEY THAKUR & MEGA SATISH
4
+ # GITHUB (AMEY): https://github.com/Amey-Thakur
5
+ # GITHUB (MEGA): https://github.com/msatmod
6
+ # REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
7
+ # RELEASE DATE: June 5, 2022
8
+ # LICENSE: MIT License
9
+ # DESCRIPTION: Utility for cleaning raw tweet data for analysis.
10
+ # ==============================================================================
11
+
12
+ import argparse
13
+ import warnings
14
+ import clean_utilities as CU
15
+
16
+ # Suppression of non-critical runtime warnings to ensure output clarity
17
+ warnings.filterwarnings("ignore")
18
+
19
def main():
    """
    Entry point for the tweet cleaning utility.

    Reads a raw tweet from the file given on the command line, runs it
    through the cleaning pipeline (contraction expansion, tokenization,
    lemmatization) and writes the result to 'clean_tweet.txt'.
    """
    # Configuration of the command-line argument parser
    parser = argparse.ArgumentParser(
        description="Twitter Depression Detection: Text Cleaning Utility"
    )

    # Mandatory positional argument for the input file path
    parser.add_argument(
        'filename',
        help="Path to the raw text file containing the tweet to be sanitized"
    )

    args = parser.parse_args()

    # NOTE: argparse guarantees a required positional argument is present,
    # so the previous `if args.filename is not None` guard was dead code.
    print(f"Targeting file for preprocessing: {args.filename}")

    try:
        # Read the whole target text file in one operation
        with open(args.filename, 'r', encoding='utf-8') as file:
            raw_tweet = file.read()

        # Run the cleaning pipeline
        # (contraction expansion, tokenization, lemmatization)
        print("Linguistic cleaning in progress...")
        sanitized_tweet = CU.tweets_cleaner(raw_tweet)

        # Persist the sanitized result to local storage
        with open('clean_tweet.txt', 'w', encoding='utf-8') as output_file:
            print("Sanitization complete. Persistence target: clean_tweet.txt")
            output_file.write(sanitized_tweet)

    except FileNotFoundError:
        print(f"Error: The specified file '{args.filename}' was not discovered.")
    except Exception as e:
        print(f"An unexpected analytical error occurred: {e}")


if __name__ == '__main__':
    main()
70
+
71
+
72
+
source_code/core/clean_utilities.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==============================================================================
2
+ # PROJECT: DEPRESSION-DETECTION-USING-TWEETS
3
+ # AUTHORS: AMEY THAKUR & MEGA SATISH
4
+ # GITHUB (AMEY): https://github.com/Amey-Thakur
5
+ # GITHUB (MEGA): https://github.com/msatmod
6
+ # REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
7
+ # RELEASE DATE: June 5, 2022
8
+ # LICENSE: MIT License
9
+ # DESCRIPTION: Core NLP logic for cleaning and normalizing tweet text.
10
+ # ==============================================================================
11
+
12
+ import re
13
+ import warnings
14
+ import nltk
15
+ import ftfy
16
+ from nltk.stem import WordNetLemmatizer
17
+ from nltk.corpus import stopwords
18
+
19
+ # Suppression of non-critical warnings to ensure a streamlined algorithmic log
20
+ warnings.filterwarnings("ignore")
21
+
22
+ # Dictionary of standard English contractions for lexical expansion
23
+ # This facilitates uniform tokenization by resolving ambiguous shorthand
24
# Dictionary of standard English contractions for lexical expansion.
# This facilitates uniform tokenization by resolving ambiguous shorthand.
# Lower-case "i'..." variants are included as well: the cleaning pipeline
# lower-cases tweets before expansion, so the capitalized "I'..." keys
# would otherwise never match.
CONTRACTIONS_LIST = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    # Lower-case variants so expansion still works after .lower()
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it had",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'alls": "you alls",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had",
    "you'd've": "you would have",
    # BUGFIX: previously "you you will" / "you you will have"
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

# Pre-compiled regular expression for efficient contraction matching.
# Keys are sorted longest-first so that e.g. "couldn't've" is matched
# before its prefix "couldn't" (regex alternation tries alternatives
# left to right), and escaped so apostrophes/punctuation stay literal.
CONTRACTIONS_RE = re.compile(
    '(%s)' % '|'.join(
        re.escape(key)
        for key in sorted(CONTRACTIONS_LIST, key=len, reverse=True)
    )
)
147
+
148
def expand_contractions(text: str, contractions_re=CONTRACTIONS_RE) -> str:
    """
    Replace every English contraction found in *text* with its expanded
    form, as defined by the module-level CONTRACTIONS_LIST mapping.

    Args:
        text (str): Raw text that may contain contractions.
        contractions_re: Compiled regex matching the contraction keys.

    Returns:
        str: The text with all matched contractions expanded.
    """
    # Each regex hit is looked up verbatim in the contractions mapping.
    return contractions_re.sub(
        lambda match: CONTRACTIONS_LIST[match.group(0)], text
    )
163
+
164
def tweets_cleaner(tweet: str) -> str:
    """
    Normalize a raw tweet for downstream vectorization.

    Pipeline:
      1. Lower-case the text.
      2. Tweets that are purely a URL skip every further step and pass
         through with only case normalization.
      3. Strip @mentions, #hashtags, emoji markers and picture links.
      4. Repair malformed Unicode sequences via ftfy.
      5. Expand English contractions for token consistency.
      6. Drop punctuation / non-alphanumeric characters.
      7. Remove English stop words and lemmatize remaining tokens
         (WordNet) to their base semantic roots.

    Args:
        tweet (str): Raw input tweet captured from the platform.

    Returns:
        str: Sanitized and normalized string ready for vectorization.
    """
    tweet = tweet.lower()

    # Only process tweets that are not purely a URL.
    if re.match("(\w+:\/\/\S+)", tweet) is None:

        # Twitter-specific artifact removal: mentions, hashtags,
        # emoji placeholders and pic.twitter.com links.
        entity_pattern = "(@[A-Za-z0-9]+)|(\#[A-Za-z0-9]+)|(<Emoji:.*>)|(pic\.twitter\.com\/.*)"
        tweet = ' '.join(re.sub(entity_pattern, " ", tweet).split())

        # Repair broken character encodings (mojibake).
        tweet = ftfy.fix_text(tweet)

        # Uniform tokens: "can't" -> "cannot", etc.
        tweet = expand_contractions(tweet)

        # Keep only alphanumerics, collapsing runs of whitespace.
        tweet = ' '.join(re.sub("([^0-9A-Za-z \t])", " ", tweet).split())

        # Stop-word filtration followed by WordNet lemmatization.
        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        kept_tokens = [
            lemmatizer.lemmatize(token)
            for token in nltk.word_tokenize(tweet)
            if token not in stop_words
        ]

        # Reassemble the normalized semantic string.
        tweet = ' '.join(kept_tokens)

    return tweet
226
+
source_code/core/predict.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==============================================================================
2
+ # PROJECT: DEPRESSION-DETECTION-USING-TWEETS
3
+ # AUTHORS: AMEY THAKUR & MEGA SATISH
4
+ # GITHUB (AMEY): https://github.com/Amey-Thakur
5
+ # GITHUB (MEGA): https://github.com/msatmod
6
+ # REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
7
+ # RELEASE DATE: June 5, 2022
8
+ # LICENSE: MIT License
9
+ # DESCRIPTION: Utility for predicting depression levels in tweets using SVM.
10
+ # ==============================================================================
11
+
12
+ import argparse
13
+ import pickle
14
+ import warnings
15
+ import numpy as np
16
+ import pandas as pd
17
+ import spacy
18
+ import en_core_web_lg
19
+ import clean_utilities as CU
20
+
21
+ # Suppression of non-critical runtime warnings to maintain output integrity
22
+ warnings.filterwarnings("ignore")
23
+
24
def main():
    """
    Command-line entry point for the SVM inference pipeline.

    Steps: parse CLI arguments, clean the input tweet, embed it with
    spaCy's large English model, load the serialized SVM classifier and
    print the binary classification verdict.
    """
    # CLI argument parser with a descriptive header
    parser = argparse.ArgumentParser(
        description="Twitter Depression Detection: Machine Learning Inference Utility"
    )
    parser.add_argument(
        'filename',
        help="Path to the text file containing the tweet for classification"
    )
    parser.add_argument(
        'model',
        help="Target model architecture (currently optimized for 'SVM')"
    )
    args = parser.parse_args()

    # Guard clause: only the SVM pipeline is supported.
    if args.filename is None or args.model != "SVM":
        print("Usage Error: Please provide an input file and specify 'SVM' as the target model.")
        return

    print(f"Loading input source: {args.filename}")

    try:
        # Step 1: Data acquisition
        with open(args.filename, 'r', encoding='utf-8') as file:
            raw_test_tweet = file.read()
        print(f"Captured Content: \"{raw_test_tweet}\"")

        # Step 2: Linguistic preprocessing
        print("Executing linguistic cleaning pipeline...")
        cleaned_input = [CU.tweets_cleaner(raw_test_tweet)]
        print(f"Normalized Form: {cleaned_input}")

        # Step 3: Feature extraction — centroid of spaCy token vectors
        print("Transforming text to 300-dimensional semantic vectors...")
        nlp_engine = en_core_web_lg.load()
        semantic_features = np.array([
            np.array([token.vector for token in nlp_engine(s)]).mean(axis=0) * np.ones((300))
            for s in cleaned_input
        ])

        # Step 4: Deserialize the trained SVM artifact
        with open("../assets/models/model_svm1.pkl", 'rb') as model_file:
            classifier = pickle.load(model_file)

        # Step 5 & 6: Classify and report the verdict
        print("Performing binary classification...")
        if classifier.predict(semantic_features)[0] == 1:
            print("\n>>> CLASSIFICATION RESULT: The analyzed content exhibits depressive characteristics.")
        else:
            print("\n>>> CLASSIFICATION RESULT: The analyzed content is classified as non-depressive.")

    except FileNotFoundError:
        print(f"Error: The input file {args.filename} could not be located.")
    except Exception as e:
        print(f"An error occurred during the inference process: {e}")


if __name__ == '__main__':
    main()
109
+
110
+
source_code/core/train.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==============================================================================
2
+ # PROJECT: DEPRESSION-DETECTION-USING-TWEETS
3
+ # AUTHORS: AMEY THAKUR & MEGA SATISH
4
+ # GITHUB (AMEY): https://github.com/Amey-Thakur
5
+ # GITHUB (MEGA): https://github.com/msatmod
6
+ # REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
7
+ # RELEASE DATE: June 5, 2022
8
+ # LICENSE: MIT License
9
+ # DESCRIPTION: Script for training machine learning models for tweet analysis.
10
+ # ==============================================================================
11
+
12
+ import argparse
13
+ import warnings
14
+ import train_utilities as TU
15
+
16
+ # Suppression of non-critical runtime warnings to ensure output clarity during training
17
+ warnings.filterwarnings("ignore")
18
+
19
def main():
    """
    Command-line entry point for model training.

    Parses the dataset path and architecture identifier, then delegates
    to the appropriate training routine in train_utilities:
      - DT/LR/kNN/SVM/RF/NN -> classical scikit-learn pipeline
      - LSTM               -> recurrent deep-learning pipeline
    """
    parser = argparse.ArgumentParser(
        description="Twitter Depression Detection: Model Training Utility"
    )
    parser.add_argument(
        'filename',
        help="Path to the training dataset (TSV/CSV format with 'label' and 'clean_text')"
    )
    parser.add_argument(
        'model',
        help="Target model architecture for training"
    )
    args = parser.parse_args()

    dataset_path, model_type = args.filename, args.model
    sklearn_architectures = ("DT", "LR", "kNN", "SVM", "RF", "NN")

    if model_type in sklearn_architectures:
        # Classical pipeline: load + embed + split, then fit the classifier.
        print(f"Initializing {model_type} training pipeline...")
        X_train, X_test, Y_train, Y_test = TU.load_prepare_split_df(dataset_path)
        trained_model = TU.classification(X_train=X_train, Y_train=Y_train, model=model_type)
        print(f"Training for {model_type} successful.")
    elif model_type == "LSTM":
        # Deep-learning pipeline for long-range dependencies in text.
        print("Initializing LSTM deep learning pipeline...")
        TU.LSTM(dataset_path)
    else:
        print(f"Error: Model architecture '{model_type}' is not currently recognized.")
        print("Supported architectures: DT, LR, kNN, SVM, RF, NN, LSTM")


if __name__ == '__main__':
    main()
source_code/core/train_utilities.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==============================================================================
2
+ # PROJECT: DEPRESSION-DETECTION-USING-TWEETS
3
+ # AUTHORS: AMEY THAKUR & MEGA SATISH
4
+ # GITHUB (AMEY): https://github.com/Amey-Thakur
5
+ # GITHUB (MEGA): https://github.com/msatmod
6
+ # REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
7
+ # RELEASE DATE: June 5, 2022
8
+ # LICENSE: MIT License
9
+ # DESCRIPTION: Utility module for the model training pipeline.
10
+ # ==============================================================================
11
+
12
+ import pickle
13
+ import warnings
14
+ import numpy as np
15
+ import pandas as pd
16
+ import spacy
17
+ import en_core_web_lg
18
+ from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
19
+ from sklearn.metrics import confusion_matrix, accuracy_score
20
+ from sklearn.linear_model import LogisticRegression
21
+ from sklearn.neighbors import KNeighborsClassifier
22
+ from sklearn.svm import SVC
23
+ from sklearn.tree import DecisionTreeClassifier
24
+ from sklearn.neural_network import MLPClassifier
25
+ from sklearn.ensemble import RandomForestClassifier
26
+
27
+ # Suppression of non-critical runtime warnings to maintain algorithmic output integrity
28
+ warnings.filterwarnings("ignore")
29
+
30
def load_prepare_split_df(filename: str, targets=None, validation_size=0.3, seed=7):
    """
    Ingest raw data, extract features via spaCy word embeddings, and
    partition the dataset for model validation.

    Methodology:
        - TSV ingestion: data is loaded from the specified file.
        - Semantic vectorization: centroid of spaCy's 300-dimensional
          token vectors per tweet.
        - Validation partitioning: random train/test split.

    Args:
        filename (str): Path to the TSV/CSV dataset (tab-separated).
        targets (list | None): Label column name(s); defaults to
            ['label']. Defaulting through None avoids the
            shared-mutable-default-argument pitfall.
        validation_size (float): Proportion of data reserved for testing.
        seed (int): Random seed for reproducibility.

    Returns:
        tuple: (X_train, X_test, Y_train, Y_test) feature and label sets.
    """
    # BUGFIX: mutable default argument replaced with a per-call list.
    if targets is None:
        targets = ['label']

    # BUGFIX: report the actual file instead of the literal "(unknown)".
    print(f"Acquiring dataset from: {filename}")
    df_all = pd.read_csv(filename, sep='\t', encoding='utf-8')

    # Step 1: Initialize the linguistic engine (spaCy large English model)
    nlp_engine = en_core_web_lg.load()

    # Step 2: Each tweet is represented by the mean of its token vectors
    print("Extracting semantic features via spaCy embeddings...")
    feature_vectors = np.array([
        np.array([token.vector for token in nlp_engine(s)]).mean(axis=0) * np.ones((300))
        for s in df_all['clean_text']
    ])

    # Step 3: Dataset splitting
    y_labels = df_all.loc[:, targets]

    return train_test_split(
        feature_vectors, y_labels, test_size=validation_size, random_state=seed
    )
73
+
74
def classification(X_train, Y_train, model=""):
    """
    Train and serialize one of the supported classification architectures.

    Architectures supported (identifier is case-insensitive, so both
    'kNN' and 'KNN' work):
        - SVM: Support Vector Machine (production primary).
        - LR:  Logistic Regression.
        - DT:  Decision Tree Classifier.
        - kNN: k-Nearest Neighbors (with automated k-optimization).
        - RF:  Random Forest Classifier.
        - NN:  Multi-layer Perceptron (MLP) Neural Network.

    Args:
        X_train: Training feature set.
        Y_train: Training label set.
        model (str): Target architecture identifier.

    Returns:
        object: The trained scikit-learn model instance, or None for an
            unrecognized identifier.
    """
    def _persist(clf, path):
        # Shared pickle serialization used by every branch.
        with open(path, 'wb') as file:
            pickle.dump(clf, file)

    # BUGFIX: train.py passes 'kNN' but this function previously matched
    # only 'KNN', so kNN requests silently fell through and returned None.
    # Normalizing the identifier makes both spellings work.
    key = model.upper()

    if key == "SVM":
        # SVMs are effective in high-dimensional semantic spaces
        print("Initializing SVM (Support Vector Machine) training...")
        clf = SVC(probability=True)
        clf.fit(X_train, Y_train)

        # Performance evaluation (accuracy on the training set)
        train_accuracy = accuracy_score(clf.predict(X_train), Y_train)
        print(f"Training Convergence Accuracy: {train_accuracy:.4f}")

        _persist(clf, "../assets/models/model_svm_pc.pkl")
        return clf

    if key == "LR":
        # Logistic Regression: robust linear baseline
        print("Initializing Logistic Regression training...")
        lr_model = LogisticRegression()
        lr_model.fit(X_train, Y_train)
        _persist(lr_model, "../assets/models/model_LogReg.pkl")
        return lr_model

    if key == "DT":
        # Decision Trees: hierarchical decision boundaries
        print("Initializing Decision Tree training...")
        dt_model = DecisionTreeClassifier()
        dt_model.fit(X_train, Y_train)
        _persist(dt_model, "../assets/models/model_DTC.pkl")
        return dt_model

    if key == "KNN":
        # kNN: tune the k hyperparameter via 10-fold cross-validation
        print("Initializing kNN training with automated k-optimization...")
        k_values = range(1, 32, 1)
        k_scores = [
            np.mean(cross_val_score(KNeighborsClassifier(n_neighbors=k),
                                    X_train, Y_train, cv=10))
            for k in k_values
        ]

        optimal_k = k_values[np.argmax(k_scores)]
        print(f"Optimized Hyperparameter discovered: k = {optimal_k}")

        best_knn = KNeighborsClassifier(n_neighbors=optimal_k)
        best_knn.fit(X_train, Y_train)
        _persist(best_knn, "../assets/models/model_KNN.pkl")
        return best_knn

    if key == "RF":
        # Random Forest: bagged ensemble for variance reduction
        print("Initializing Random Forest training...")
        rf_model = RandomForestClassifier()
        rf_model.fit(X_train, Y_train)
        _persist(rf_model, "../assets/models/model_RF.pkl")
        return rf_model

    if key == "NN":
        # MLP: basic feed-forward artificial neural network
        print("Initializing Neural Network (MLP) training...")
        nn_model = MLPClassifier()
        nn_model.fit(X_train, Y_train)
        _persist(nn_model, "../assets/models/model_NN.pkl")
        return nn_model
177
+
178
def LSTM(filename: str):
    """
    Train and persist an LSTM classifier on the cleaned tweet dataset.

    Methodology:
        - Tokenization: integer encoding of sequences (top 20,000 words).
        - Padding: uniform sequence length of 50 tokens.
        - Architecture: trainable 300-d Embedding layer followed by an
          LSTM with dropouts and a sigmoid output unit.
        - Persistence: JSON topology plus HDF5 weight file.

    Args:
        filename (str): Path to a tab-separated dataset with
            'clean_text' and 'label' columns.
    """
    # Keras is imported lazily so the module stays importable without it.
    # (The previously imported KerasClassifier wrapper was unused and has
    # been removed.)
    from keras.models import Sequential
    from keras.layers import Dense, Embedding, LSTM
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences

    # BUGFIX: report the actual dataset path instead of "(unknown)".
    print(f"Acquiring data for Deep Learning (LSTM): {filename}")
    df_dl = pd.read_csv(filename, sep='\t', encoding='utf-8')

    # Step 1: Sequence tokenization and padding
    vocab_size = 20000
    max_len = 50
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(df_dl['clean_text'])
    seqs = tokenizer.texts_to_sequences(df_dl['clean_text'])
    x_lstm = pad_sequences(seqs, maxlen=max_len)
    y_lstm = df_dl["label"]

    # Step 2: Architecture definition
    print("Constructing LSTM topology...")
    model = Sequential()
    model.add(Embedding(vocab_size, 300, input_length=max_len))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Step 3: Model training and persistence
    print("Commencing Deep Learning Convergence (LSTM)...")
    model.fit(x_lstm, y_lstm, epochs=3, verbose=1, validation_split=0.3)

    # Persist the topology (JSON) and learned weights (HDF5) separately
    with open("model_LSTM.json", "w") as json_file:
        json_file.write(model.to_json())
    model.save_weights("model_LSTM.h5")
    print("Deep Learning model (LSTM) artifacts successfully persisted.")
226
+
source_code/notebooks/data_cleaning_exploration.py ADDED
@@ -0,0 +1,505 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """data_cleaning_exploration.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1jU6I53BYSZ2kX-qcxcWP_1bPNYWvB24f
8
+
9
+ # Data Cleaning
10
+
11
+ Data cleaning is the process of detecting and removing errors and inconsistencies from the data to improve its quality. Improper data cleaning process can lead to errors, faulty analysis, distortion in dataset and eventually incompatible datasets for machine learning purposes. There is no absolute way to prescribe the exact steps in the data cleaning process because the processes will vary from dataset to dataset. My data cleaning process includes:
12
+
13
+ * Check the data types
14
+ * Check for duplicates - Primary key ('tweets.id')
15
+ * Check missing values
16
+ * Make text all lower case
17
+ * Remove links and images
18
+ * Remove hashtags
19
+ * Remove @ mentions
20
+ * Remove emojis
21
+ * Remove stop words
22
+ * Remove punctuation
23
+ * Get rid of stuff like "what's" and making it "what is"
24
+ * Stemming / lemmatization
25
+ """
26
+
27
+ from google.colab import drive
28
+ drive.mount('/content/drive')
29
+
30
+ !pip install -qqq ftfy
31
+
32
+ ## Import required libraries
33
+
34
+ ## warnings
35
+ import warnings
36
+ warnings.filterwarnings("ignore")
37
+
38
+ ## for data
39
+ import numpy as np
40
+ import pandas as pd
41
+
42
+ ## for plotting
43
+ import matplotlib.pyplot as plt
44
+ import seaborn as sns
45
+
46
+ ## for processing
47
+ import nltk
48
+ import re
49
+ import ftfy
50
+ from nltk.stem import WordNetLemmatizer
51
+ from nltk.corpus import stopwords
52
+ nltk.download('stopwords')
53
+ nltk.download('punkt')
54
+ nltk.download('wordnet')
55
+ nltk.download('averaged_perceptron_tagger')
56
+
57
+ ## for opening, manipulating, and saving many different image file f
58
+ from PIL import Image
59
+
60
+ ## WordCloud - Python library for creating image wordclouds
61
+ from wordcloud import WordCloud
62
+ from nltk import pos_tag ## For Parts of Speech tagging
63
+ import random ## generating random numbers
64
+
65
+ """## Load the datasets"""
66
+
67
+ depressive_tweets_df = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/output/depressive_tweets.csv")
68
+ random_tweets_df = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Depression_tweets_Data/Data1/Sentiment Analysis Dataset 2.csv",
69
+ encoding = "ISO-8859-1", usecols = range(0,4), nrows = 40000)
70
+
71
+ depressive_tweets_df
72
+
73
+ random_tweets_df
74
+
75
+ ## Slicing the random tweets to have sentiment == 1
76
+ new_rand_df = random_tweets_df[random_tweets_df.Sentiment == 1]
77
+ new_rand_df.reset_index(inplace=True)
78
+
79
+ new_rand_df.shape
80
+
81
+ new_rand_df.head()
82
+
83
+ """20952 random tweets with sentiment == 1
84
+
85
+ ## Data Cleaning-Processing:
86
+ """
87
+
88
+ print(depressive_tweets_df.shape)
89
+ print(new_rand_df.shape)
90
+
91
+ ## Check the data type of each column
92
+ depressive_tweets_df.dtypes.to_frame().rename(columns={0:'data_type'})
93
+
94
+ ## Check the data type of each column
95
+ new_rand_df.dtypes.to_frame().rename(columns={0:'data_type'})
96
+
97
+ ## Drop unnecessary columns
98
+ depressive_tweets_df.drop(columns=['Unnamed: 0'], inplace=True)
99
+ new_rand_df.drop(columns=['ItemID', 'index','Sentiment', 'SentimentSource'], inplace=True)
100
+
101
+ """Since we are mostly dealing with the tweets in our datasets, it is not necessary to change the data types at this stage."""
102
+
103
+ ## Finding unique values in each column
104
+ for col in depressive_tweets_df:
105
+ print("There are ", len(depressive_tweets_df[col].unique()), "unique values in ", col)
106
+
107
+ """By considering **tweet.id** as our primary key, we have **18190** unique tweets, so we need to get rid of the duplicates."""
108
+
109
+ ## Finding unique values in each column
110
+ for col in new_rand_df:
111
+ print("There are ", len(new_rand_df[col].unique()), "unique values in ", col)
112
+
113
+ """No duplicates in random tweets dataset"""
114
+
115
+ ## drop duplicate values in tweet.id
116
+ depressive_tweets_df.drop_duplicates(subset=['tweet.id'], inplace=True)
117
+
118
+ depressive_tweets_df.reset_index(inplace=True)
119
+
120
+ depressive_tweets_df.shape
121
+
122
+ ## Find the number of Null values in each columns
123
+ depressive_tweets_df.isnull().sum().to_frame().rename(columns={0:'Null values'})
124
+
125
+ """There are **6384** Null values in the **location** columns but since location will not be used in our analysis or as a feature in our model, we don't need to replace them."""
126
+
127
+ ## Find the number of Null values in each columns
128
+ new_rand_df.isnull().sum().to_frame().rename(columns={0:'Null values'})
129
+
130
+ """No Null values in random tweets dataset."""
131
+
132
+ ## Drop all the columns except index, tweet.id and text
133
+ new_dep_df = depressive_tweets_df[['text']]
134
+
135
+ ## Add label to both datasets (0 is non-depressive and 1 is depressive)
136
+ new_dep_df['label'] = pd.Series([1 for x in range(len(new_dep_df.index))])
137
+ new_rand_df['label'] = pd.Series([0 for x in range(len(new_rand_df.index))])
138
+
139
+ new_dep_df
140
+
141
+ ## Change the column name to be aligned with depressive dataset
142
+ new_rand_df.rename(columns={'SentimentText': 'text'}, inplace=True)
143
+
144
+ new_rand_df
145
+
146
+ ## Combine two dataframes together
147
+ df_all = pd.concat([new_dep_df, new_rand_df], ignore_index=True)
148
+
149
+ df_all
150
+
151
+ # Expand Contraction
152
# Expand Contraction
# Mapping of English contractions to their expanded forms; consumed by
# expandContractions() below to normalize tweet text before tokenization.
cList = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it had",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'alls": "you alls",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }

# Case-insensitive lookup table: tweets are lowercased before expansion,
# while some cList keys (e.g. "I'd", "I'm") are capitalized and would
# otherwise never be found.
cList_lower = {key.lower(): value for key, value in cList.items()}

# Build a single alternation over all keys. Keys are sorted longest-first so
# that e.g. "can't've" is tried before its prefix "can't" (regex alternation
# picks the first branch that matches); re.escape guards against any regex
# metacharacters in the keys; IGNORECASE matches the lowercased tweet text.
c_re = re.compile(
    '(%s)' % '|'.join(re.escape(key) for key in sorted(cList, key=len, reverse=True)),
    re.IGNORECASE)

def expandContractions(text, c_re=c_re):
    """Replace every known contraction in *text* with its expanded form.

    Parameters:
        text: input string (typically an already-lowercased tweet).
        c_re: precompiled alternation over the cList keys; defaults to the
            module-level pattern built above.

    Returns:
        The text with contractions such as "can't" expanded to "cannot".
        Matching is case-insensitive; an unknown match is left unchanged.
    """
    def replace(match):
        token = match.group(0)
        return cList.get(token) or cList_lower.get(token.lower(), token)
    return c_re.sub(replace, text)
279
+
280
+ ## Function to perform stepwise cleaning process
281
def tweets_cleaner(tweets):
    """Clean raw tweet strings into lemmatized, stop-word-free text.

    Steps per tweet: lowercase; drop tweets that start with a URL (likely
    news articles) or are 5 characters or shorter; strip @mentions, #hashtags,
    <Emoji:...> markers and pic.twitter.com image links; fix mojibake with
    ftfy; expand contractions; strip punctuation; remove English stop words
    and lemmatize the remaining tokens.

    Parameters:
        tweets: iterable of raw tweet strings.

    Returns:
        List of cleaned strings. Note it may be SHORTER than the input,
        because filtered-out tweets are skipped entirely.
    """
    cleaned_tweets = []

    # Loop invariants hoisted out of the per-tweet loop: the stop-word set,
    # the lemmatizer and the compiled regexes are identical for every tweet,
    # so building them once avoids redundant work on large datasets.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    url_re = re.compile(r"(\w+:\/\/\S+)")
    noise_re = re.compile(r"(@[A-Za-z0-9]+)|(\#[A-Za-z0-9]+)|(<Emoji:.*>)|(pic\.twitter\.com\/.*)")
    punct_re = re.compile(r"([^0-9A-Za-z \t])")

    for tweet in tweets:
        tweet = tweet.lower()  # lowercase

        # If url links then don't append (to avoid news articles);
        # also check tweet length, keep only those > 5 characters.
        if url_re.match(tweet) is None and len(tweet) > 5:
            # remove hashtag, @mention, emoji markers and image URLs
            tweet = ' '.join(noise_re.sub(" ", tweet).split())

            # fix weirdly encoded texts
            tweet = ftfy.fix_text(tweet)

            # expand contractions (e.g. "can't" -> "cannot")
            tweet = expandContractions(tweet)

            # remove punctuation
            tweet = ' '.join(punct_re.sub(" ", tweet).split())

            # stop-word removal and lemmatization
            word_tokens = nltk.word_tokenize(tweet)
            filtered_sentence = [lemmatizer.lemmatize(word)
                                 for word in word_tokens
                                 if word not in stop_words]

            # back to a single space-joined string
            cleaned_tweets.append(' '.join(filtered_sentence))

    return cleaned_tweets
315
+
316
+ """## Word Cloud:
317
+
318
+ To get the most common words used in depressive and random datasets, the POS-tag (Parts of Speech tagging) module in the NLTK library was used. Using the WordCloud library, one can generate a Word Cloud based on word frequency and superimpose these words on any image. In this case, I used the Twitter logo and Matplotlib to display the image. The Word Cloud shows the words with higher frequency in bigger text size while the "not-so" common words are in smaller text sizes.
319
+ """
320
+
321
+ depressive_tweets_arr = [x for x in new_dep_df['text']]
322
+ random_tweets_arr = [x for x in new_rand_df['text']]
323
+ X_d = tweets_cleaner(depressive_tweets_arr)
324
+ X_r = tweets_cleaner(random_tweets_arr)
325
+
326
+ ## function to obtain adjectives from tweets
327
def getadjectives(tweet):
    """Return only the adjectives from *tweet* as one space-joined string.

    Tokenizes the text, POS-tags it with NLTK, and keeps tokens whose tag
    is "JJ" (adjective).
    """
    tokens = nltk.word_tokenize(tweet)  # string -> token list
    adjectives = [token for token, tag in pos_tag(tokens) if tag == "JJ"]
    return " ".join(adjectives)
332
+
333
+ """### Depressive Tweets Exploration"""
334
+
335
+ ## Apply getadjectives function to the processed tweets
336
+ ## Extract all tweets into one long string with each word separate with a "space"
337
+ tweets_long_string = [getadjectives(x) for x in X_d]
338
+ tweets_long_string = " ".join(tweets_long_string)
339
+
340
+ # Import Twitter Logo
341
+ image = np.array(Image.open('/content/drive/MyDrive/NLP/Depression_Detection/data_cleaning/logo.jpeg'))
342
+
343
+ fig = plt.figure() # Instantiate the figure object
344
+ fig.set_figwidth(14) # set width
345
+ fig.set_figheight(18) # set height
346
+
347
+ plt.imshow(image, cmap=plt.cm.gray, interpolation='bilinear') # Display data as an image
348
+ plt.axis('off') # Remove axis
349
+ plt.show() # Display image
350
+
351
+ ## Create function to generate the blue colour for the Word Cloud
352
+
353
+ def blue_color_func(word, font_size, position, orientation, random_state=None,**kwargs):
354
+ return "hsl(210, 100%%, %d%%)" % random.randint(50, 70)
355
+
356
+ ## Instantiate the Twitter word cloud object
357
+ twitter_wc = WordCloud(background_color='white', max_words=1500, mask=image)
358
+
359
+ ## generate the word cloud
360
+ twitter_wc.generate(tweets_long_string)
361
+
362
+ ## display the word cloud
363
+ fig = plt.figure()
364
+ fig.set_figwidth(14) # set width
365
+ fig.set_figheight(18) # set height
366
+
367
+ plt.imshow(twitter_wc.recolor(color_func=blue_color_func, random_state=3),
368
+ interpolation="bilinear")
369
+ plt.axis('off')
370
+ plt.show()
371
+
372
+ twitter_wc.to_file("/content/drive/MyDrive/NLP/Depression_Detection/data_cleaning/wordcloud.png") #save to a png file
373
+
374
+ """**Analyzing Top Words in the Word Cloud for depressive dataset**"""
375
+
376
+ ## Combine all words in depressive into a list
377
+ tweets_long_string = [getadjectives(x) for x in X_d]
378
+ tweets_list=[]
379
+ for item in tweets_long_string:
380
+ item = item.split()
381
+ for i in item:
382
+ tweets_list.append(i)
383
+
384
+ # Use the Built-in Python Collections module to determine Word frequency
385
+ from collections import Counter
386
+ counts = Counter(tweets_list)
387
+ df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
388
+ df.columns = ['Words', 'Count']
389
+ df.sort_values(by='Count', ascending=False, inplace=True)
390
+
391
+ df.head(10) # Check dataframe first 10 rows
392
+
393
+ """### Random Tweets Exploration
394
+
395
+ """
396
+
397
+ ## Apply getadjectives function to the processed tweets
398
+ ## Extract all tweets into one long string with each word separate with a "space"
399
+ tweets_long_string_rand = [getadjectives(x) for x in X_r]
400
+ tweets_long_string_rand = " ".join(tweets_long_string_rand)
401
+
402
+ # Import Twitter Logo
403
+ image = np.array(Image.open('/content/drive/MyDrive/NLP/Depression_Detection/data_cleaning/logo.jpeg'))
404
+
405
+ fig = plt.figure() # Instantiate the figure object
406
+ fig.set_figwidth(14) # set width
407
+ fig.set_figheight(18) # set height
408
+
409
+ plt.imshow(image, cmap=plt.cm.gray, interpolation='bilinear') # Display data as an image
410
+ plt.axis('off') # Remove axis
411
+ plt.show() # Display image
412
+
413
+ ## Create function to generate the blue colour for the Word Cloud
414
+
415
+ def blue_color_func(word, font_size, position, orientation, random_state=None,**kwargs):
416
+ return "hsl(210, 100%%, %d%%)" % random.randint(50, 70)
417
+
418
+ ## Instantiate the Twitter word cloud object
419
+ twitter_wc = WordCloud(background_color='white', max_words=1500, mask=image)
420
+
421
+ ## generate the word cloud
422
+ twitter_wc.generate(tweets_long_string_rand)
423
+
424
+ ## display the word cloud
425
+ fig = plt.figure()
426
+ fig.set_figwidth(14) # set width
427
+ fig.set_figheight(18) # set height
428
+
429
+ plt.imshow(twitter_wc.recolor(color_func=blue_color_func, random_state=3),
430
+ interpolation="bilinear")
431
+ plt.axis('off')
432
+ plt.show()
433
+
434
+ twitter_wc.to_file("/content/drive/MyDrive/NLP/Depression_Detection/data_cleaning/wordcloud_rand.png") #save to a png file
435
+
436
+ """**Analyzing Top Words in the Word Cloud for random dataset**"""
437
+
438
+ ## Combine all words in depressive into a list
439
+ tweets_long_string_rand = [getadjectives(x) for x in X_r]
440
+ tweets_list_rand=[]
441
+ for item in tweets_long_string_rand:
442
+ item = item.split()
443
+ for i in item:
444
+ tweets_list_rand.append(i)
445
+
446
+ ## Use the Built-in Python Collections module to determine Word frequency
447
+ from collections import Counter
448
+ counts = Counter(tweets_list_rand)
449
+ df_rand = pd.DataFrame.from_dict(counts, orient='index').reset_index()
450
+ df_rand.columns = ['Words', 'Count']
451
+ df_rand.sort_values(by='Count', ascending=False, inplace=True)
452
+
453
+ df_rand.head(10) # Check dataframe first 10 rows
454
+
455
+ """## Data Analysis:"""
456
+
457
+ ## distribution of classes for prediction
458
def create_distribution(dataFile):
    """Plot how many rows fall into each 'label' class as a seaborn countplot."""
    ax = sns.countplot(x='label', data=dataFile, palette='hls')
    return ax
460
+
461
+ create_distribution(df_all)
462
+
463
+ """Depressive and random (non-depressive) tweets are almost evenly distributed.
464
+
465
+ **Finding distribution of tweet lengths**
466
+ """
467
+
468
+ dep_line_lengths = [len(statement) for statement in new_dep_df['text']]
469
+ plt.plot(dep_line_lengths)
470
+ plt.show()
471
+
472
rand_line_lengths = [len(statement) for statement in new_rand_df['text']]
# Bug fix: the original plotted dep_line_lengths here again (copy-paste),
# so the random-tweet length distribution was never actually shown.
plt.plot(rand_line_lengths)
plt.show()
475
+
476
+ """From the distributions above, it is clear that there are no outliers in our depressive and random datasets.
477
+
478
+ ## Cleaning combined dataset and save it
479
+ """
480
+
481
+ tweets_arr = [x for x in df_all['text']]
482
+
483
+ corpus = tweets_cleaner(tweets_arr)
484
+
485
+ corpus[:10]
486
+
487
+ ## Adding clean tweets as a new column
488
+ df_all['clean_text'] = corpus
489
+
490
+ """We have to remove those rows with tweets that has been completely deleted in the cleaning process."""
491
+
492
+ # replace field that's entirely space (or empty) with NaN
493
+ df_all.replace(r'^\s*$', np.nan, regex=True, inplace=True)
494
+
495
+ df_all[df_all['clean_text'].isnull()]
496
+
497
+ ## Deleting the rows with nan
498
+ df_all.dropna(subset=['clean_text'], inplace=True)
499
+
500
+ ## Double_check for nan
501
+ df_all[df_all['clean_text'].isnull()]
502
+
503
+ ## Save cleaned_dataset
504
+ df_all.to_csv('/content/drive/MyDrive/NLP/Depression_Detection/data_cleaning/processed_data/processed_data.csv',
505
+ sep='\t', encoding='utf-8',index=False)
source_code/notebooks/data_gathering_twint.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Data_Gathering_Twint.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1zV6s2FhvQCmyNh0uyknfm47WATAOihbU
8
+ """
9
+
10
+ from google.colab import drive
11
+ drive.mount('/content/drive')
12
+
13
+ !git clone https://github.com/twintproject/twint.git
14
+
15
+ import os
16
+ os.chdir("/content/twint")
17
+
18
+ !pip freeze > requirements.txt
19
+
20
+ !pip install .
21
+
22
+ !pip install -U git+https://github.com/cyxv/twint.git@master
23
+
24
+ !pip install nest_asyncio
25
+
26
+ !pip3 install twint
27
+
28
+ # Import required libraries
29
+ import nest_asyncio
30
+ nest_asyncio.apply()
31
+ import pandas as pd
32
+ import twint
33
+ import pandas as pd
34
+ import re
35
+
36
+ # add some tweets with depressed and depression tags, for a particular year
37
+
38
# Hashtags used to search for depression-related tweets (2019 run).
# Bug fix: a missing comma after "#mentalhealth" caused implicit string
# concatenation with "#loneliness", producing the bogus single tag
# "#mentalhealth#loneliness" and silently dropping two searches.
depress_tags = ["#depressed", "#anxiety", "#depression", "#suicide", "#mentalhealth",
                "#loneliness", "#hopelessness", "#itsokaynottobeokay"]
40
+
41
+ content = {}
42
+ for i in range(len(depress_tags)):
43
+ print(depress_tags[i])
44
+ c = twint.Config()
45
+
46
+ c.Format = "Tweet id: {id} | Tweet: {tweet}"
47
+ c.Search = depress_tags[i]
48
+ c.Limit = 1000
49
+ c.Year = 2019
50
+ c.Lang = "en"
51
+ c.Store_csv = True
52
+ c.Store_Object = True
53
+ c.Output = "/content/drive/MyDrive/NLP/Depression_Detection/depressive_en_2019.csv"
54
+ c.Hide_output = True
55
+ c.Stats = True
56
+ c.Lowercase = True
57
+ c.Filter_retweets = True
58
+ twint.run.Search(c)
59
+
60
+ # add some tweets with depressed and depression tags, for a particular year
61
+
62
+ depress_tags = ["#depressed", "#depression", "#loneliness", "#hopelessness"]
63
+
64
+ content = {}
65
+ for i in range(len(depress_tags)):
66
+ print(depress_tags[i])
67
+ c = twint.Config()
68
+
69
+ c.Format = "Tweet id: {id} | Tweet: {tweet}"
70
+ c.Search = depress_tags[i]
71
+ c.Limit = 1000
72
+ c.Year = 2020
73
+ c.Store_csv = True
74
+ c.Store_json = True
75
+ c.Output = "/content/drive/MyDrive/NLP/Depression_Detection/dataset_depression.json"
76
+ c.Hide_output = True
77
+ c.Stats = True
78
+ c.Lowercase = True
79
+ c.Filter_retweets = True
80
+ twint.run.Search(c)
source_code/notebooks/data_gathering_twitter_API.py ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Twitter_API.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1UAilj_PmxYbwHsc_s79d9UyBvawBVZAS
8
+
9
+ # Tweet mining using Twitter API via Tweepy:
10
+
11
+ In this notebook I am using the Tweepy Python library to mine tweets using relevant hashtags. I was able to retrieve around 19000 unique tweets via the Twitter API. At the end, all the datasets with different depressive hashtags will be combined, cleaned and saved as depressive_tweets.csv.
12
+ """
13
+
14
+ from google.colab import drive
15
+ drive.mount('/content/drive')
16
+
17
+ """## Tweets mining"""
18
+
19
+ !pip install -qqq tweepy
20
+
21
+ ## Import required libraries
22
+ import tweepy
23
+ from tweepy.streaming import StreamListener
24
+ from tweepy import OAuthHandler
25
+ from tweepy import Stream
26
+ import csv
27
+ import pandas as pd
28
+
29
+ ## Access to twitter API consumer_key and access_secret
30
+ #import config.ipynb
31
+
32
+ ## Twitter API related information
33
+ consumer_key = config.API_KEY
34
+ consumer_secret = config.API_KEY_SECRET
35
+ access_key= config.ACCESS_TOKEN
36
+ access_secret = config.ACCESS_TOKEN_SECRET
37
+
38
+ auth = tweepy.OAuthHandler(consumer_key, consumer_secret) # Pass in Consumer key and secret for authentication by API
39
+ auth.set_access_token(access_key, access_secret) # Pass in Access key and secret for authentication by API
40
+ api = tweepy.API(auth,wait_on_rate_limit=True,wait_on_rate_limit_notify=True) # Sleeps when API limit is reached
41
+
42
+ ## depress_tags = ["#depressed", "#anxiety", "#depression", "#suicide", "#mentalhealth"
43
+ ## "#loneliness", "#hopelessness", "#itsokaynottobeokay", "#sad"]
44
+
45
+ """## "#depressed""""
46
+
47
+ ## Create a function for tweets mining
48
+ def tweets_mining1(search_query1, num_tweets1, since_id_num1):
49
+ # Collect tweets using the Cursor object
50
+ # Each item in the iterator has various attributes that you can access to get information about each tweet
51
+ tweet_list1 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query1, lang="en", since_id=since_id_num1,
52
+ tweet_mode='extended').items(num_tweets1)]
53
+
54
+ # Begin scraping the tweets individually:
55
+ for tweet in tweet_list1[::-1]:
56
+ tweet_id = tweet.id # get Tweet ID result
57
+ created_at = tweet.created_at # get time tweet was created
58
+ text = tweet.full_text # retrieve full tweet text
59
+ location = tweet.user.location # retrieve user location
60
+ retweet = tweet.retweet_count # retrieve number of retweets
61
+ favorite = tweet.favorite_count # retrieve number of likes
62
+ with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv','a', newline='', encoding='utf-8') as csvFile1:
63
+ csv_writer1 = csv.writer(csvFile1, delimiter=',') # create an instance of csv object
64
+ csv_writer1.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row
65
+
66
+ search_words1 = "#depressed" # Specifying exact phrase to search
67
+ # Exclude Links, retweets, replies
68
+ search_query1 = search_words1 + " -filter:links AND -filter:retweets AND -filter:replies"
69
+ with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv', encoding='utf-8') as data:
70
+ latest_tweet = int(list(csv.reader(data))[-1][0])
71
+ tweets_mining1(search_query1, 1000, latest_tweet)
72
+
73
+ df_depressed_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv",
74
+ names=['tweet.id', "created_at","text", "location", "retweet", "favorite"])
75
+
76
+ df_depressed_1
77
+
78
+ ## Finding unique values in each column
79
+ for col in df_depressed_1:
80
+ print("There are ", len(df_depressed_1[col].unique()), "unique values in ", col)
81
+
82
+ """### Anxiety and suicide """
83
+
84
+ ## Create a function for tweets mining
85
def tweets_mining2(search_query2, num_tweets2, since_id_num2):
    """Fetch tweets matching *search_query2* and append them to the #anxiety CSV.

    Parameters:
        search_query2: Twitter search query string (including filters).
        num_tweets2: maximum number of tweets to collect.
        since_id_num2: only tweets with an id greater than this are returned.

    Writes one CSV row per tweet: id, created_at, full text, user location,
    retweet count, favorite count.
    """
    # Cursor paginates the search endpoint; tweet_mode='extended' keeps the
    # untruncated text available as tweet.full_text.
    tweet_list2 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query2, lang="en",
                                                      since_id=since_id_num2,
                                                      tweet_mode='extended').items(num_tweets2)]

    # Open the output file once in append mode (the original re-opened it for
    # every single tweet) and write rows in reversed order — presumably so the
    # oldest tweets land first, since the API yields newest first (TODO confirm).
    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv',
              'a', newline='', encoding='utf-8') as csvFile2:
        csv_writer2 = csv.writer(csvFile2, delimiter=',')
        for tweet in tweet_list2[::-1]:
            csv_writer2.writerow([tweet.id, tweet.created_at, tweet.full_text,
                                  tweet.user.location, tweet.retweet_count,
                                  tweet.favorite_count])
102
+
103
+ search_words2 = "#anxiety" # Specifying exact phrase to search
104
+ # Exclude Links, retweets, replies
105
+ search_query2 = search_words2 + " -filter:links AND -filter:retweets AND -filter:replies"
106
+ with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv', encoding='utf-8') as data:
107
+ latest_tweet = int(list(csv.reader(data))[-1][0])
108
+ tweets_mining2(search_query2, 2000, latest_tweet)
109
+
110
+ df_anxiety_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv",
111
+ names=['tweet.id', "created_at","text", "location", "retweet", "favorite"])
112
+
113
+ df_anxiety_1
114
+
115
+ ## Finding unique values in each column
116
+ for col in df_anxiety_1:
117
+ print("There are ", len(df_anxiety_1[col].unique()), "unique values in ", col)
118
+
119
+ """## "#Suicide""""
120
+
121
+ ## Create a function for tweets mining
122
def tweets_mining3(search_query3, num_tweets3, since_id_num3):
    """Fetch tweets matching *search_query3* and append them to the #suicide CSV.

    Parameters:
        search_query3: Twitter search query string (including filters).
        num_tweets3: maximum number of tweets to collect.
        since_id_num3: only tweets with an id greater than this are returned.

    Writes one CSV row per tweet: id, created_at, full text, user location,
    retweet count, favorite count.
    """
    # Cursor paginates the search endpoint; tweet_mode='extended' keeps the
    # untruncated text available as tweet.full_text.
    tweet_list3 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query3, lang="en",
                                                      since_id=since_id_num3,
                                                      tweet_mode='extended').items(num_tweets3)]

    # Open the output file once in append mode (the original re-opened it for
    # every single tweet) and write rows in reversed order — presumably so the
    # oldest tweets land first, since the API yields newest first (TODO confirm).
    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv',
              'a', newline='', encoding='utf-8') as csvFile3:
        csv_writer3 = csv.writer(csvFile3, delimiter=',')
        for tweet in tweet_list3[::-1]:
            csv_writer3.writerow([tweet.id, tweet.created_at, tweet.full_text,
                                  tweet.user.location, tweet.retweet_count,
                                  tweet.favorite_count])
139
+
140
+ search_words3 = "#suicide" # Specifying exact phrase to search
141
+ # Exclude Links, retweets, replies
142
+ search_query3 = search_words3 + " -filter:links AND -filter:retweets AND -filter:replies"
143
+ with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv', encoding='utf-8') as data:
144
+ latest_tweet = int(list(csv.reader(data))[-1][0])
145
+ tweets_mining3(search_query3, 10000, latest_tweet)
146
+
147
+ df_suicide_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv",
148
+ names=['tweet.id', "created_at","text", "location", "retweet", "favorite"])
149
+
150
+ df_suicide_1
151
+
152
+ """## "#hopelessness""""
153
+
154
+ ## Create a function for tweets mining
155
def tweets_mining4(search_query4, num_tweets4, since_id_num4):
    """Fetch tweets matching *search_query4* and append them to the #hopelessness CSV.

    Parameters:
        search_query4: Twitter search query string (including filters).
        num_tweets4: maximum number of tweets to collect.
        since_id_num4: only tweets with an id greater than this are returned.

    Writes one CSV row per tweet: id, created_at, full text, user location,
    retweet count, favorite count.
    """
    # Cursor paginates the search endpoint; tweet_mode='extended' keeps the
    # untruncated text available as tweet.full_text.
    tweet_list4 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query4, lang="en",
                                                      since_id=since_id_num4,
                                                      tweet_mode='extended').items(num_tweets4)]

    # Open the output file once in append mode (the original re-opened it for
    # every single tweet) and write rows in reversed order — presumably so the
    # oldest tweets land first, since the API yields newest first (TODO confirm).
    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv',
              'a', newline='', encoding='utf-8') as csvFile4:
        csv_writer4 = csv.writer(csvFile4, delimiter=',')
        for tweet in tweet_list4[::-1]:
            csv_writer4.writerow([tweet.id, tweet.created_at, tweet.full_text,
                                  tweet.user.location, tweet.retweet_count,
                                  tweet.favorite_count])
172
+
173
+ search_words4 = "#hopelessness" # Specifying exact phrase to search
174
+ # Exclude Links, retweets, replies
175
+ search_query4 = search_words4 + " -filter:links AND -filter:retweets AND -filter:replies"
176
+ with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv', encoding='utf-8') as data:
177
+ latest_tweet = int(list(csv.reader(data))[-1][0])
178
+ tweets_mining4(search_query4, 10000, latest_tweet)
179
+
180
+ df_hopeless_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv",
181
+ names=['tweet.id', "created_at","text", "location", "retweet", "favorite"])
182
+
183
+ df_hopeless_1
184
+
185
+ """## "#mentalhealth" """
186
+
187
+ ## Create a function for tweets mining
188
+ def tweets_mining5(search_query5, num_tweets5, since_id_num5):
189
+ # Collect tweets using the Cursor object
190
+ # Each item in the iterator has various attributes that you can access to get information about each tweet
191
+ tweet_list5 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query5, lang="en", since_id=since_id_num5,
192
+ tweet_mode='extended').items(num_tweets5)]
193
+
194
+ # Begin scraping the tweets individually:
195
+ for tweet in tweet_list5[::-1]:
196
+ tweet_id = tweet.id # get Tweet ID result
197
+ created_at = tweet.created_at # get time tweet was created
198
+ text = tweet.full_text # retrieve full tweet text
199
+ location = tweet.user.location # retrieve user location
200
+ retweet = tweet.retweet_count # retrieve number of retweets
201
+ favorite = tweet.favorite_count # retrieve number of likes
202
+ with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv','a', newline='', encoding='utf-8') as csvFile5:
203
+ csv_writer5 = csv.writer(csvFile5, delimiter=',') # create an instance of csv object
204
+ csv_writer5.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row
205
+
206
+ search_words5 = "#mentalhealth" # Specifying exact phrase to search
207
+ # Exclude Links, retweets, replies
208
+ search_query5 = search_words5 + " -filter:links AND -filter:retweets AND -filter:replies"
209
+ with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv', encoding='utf-8') as data:
210
+ latest_tweet = int(list(csv.reader(data))[-1][0])
211
+ tweets_mining5(search_query5, 1000, latest_tweet)
212
+
213
+ df_mentalhealth_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv",
214
+ names=['tweet.id', "created_at","text", "location", "retweet", "favorite"])
215
+
216
+ df_mentalhealth_1
217
+
218
+ """## "#loneliness" """
219
+
220
+ ## Create a function for tweets mining
221
+ def tweets_mining6(search_query6, num_tweets6, since_id_num6):
222
+ # Collect tweets using the Cursor object
223
+ # Each item in the iterator has various attributes that you can access to get information about each tweet
224
+ tweet_list6 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query6, lang="en", since_id=since_id_num6,
225
+ tweet_mode='extended').items(num_tweets6)]
226
+
227
+ # Begin scraping the tweets individually:
228
+ for tweet in tweet_list6[::-1]:
229
+ tweet_id = tweet.id # get Tweet ID result
230
+ created_at = tweet.created_at # get time tweet was created
231
+ text = tweet.full_text # retrieve full tweet text
232
+ location = tweet.user.location # retrieve user location
233
+ retweet = tweet.retweet_count # retrieve number of retweets
234
+ favorite = tweet.favorite_count # retrieve number of likes
235
+ with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv','a', newline='', encoding='utf-8') as csvFile6:
236
+ csv_writer6 = csv.writer(csvFile6, delimiter=',') # create an instance of csv object
237
+ csv_writer6.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row
238
+
239
+ search_words6 = "#loneliness" # Specifying exact phrase to search
240
+ # Exclude Links, retweets, replies
241
+ search_query6 = search_words6 + " -filter:links AND -filter:retweets AND -filter:replies"
242
+ with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv', encoding='utf-8') as data:
243
+ latest_tweet = int(list(csv.reader(data))[-1][0])
244
+ tweets_mining6(search_query6, 10000, latest_tweet)
245
+
246
+ df_loneliness_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv",
247
+ names=['tweet.id', "created_at","text", "location", "retweet", "favorite"])
248
+
249
+ df_loneliness_1
250
+
251
+ """## "#itsokaynottobeokay" """
252
+
253
+ ## Create a function for tweets mining
254
+ def tweets_mining7(search_query7, num_tweets7, since_id_num7):
255
+ # Collect tweets using the Cursor object
256
+ # Each item in the iterator has various attributes that you can access to get information about each tweet
257
+ tweet_list7 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query7, lang="en", since_id=since_id_num7,
258
+ tweet_mode='extended').items(num_tweets7)]
259
+
260
+ # Begin scraping the tweets individually:
261
+ for tweet in tweet_list7[::-1]:
262
+ tweet_id = tweet.id # get Tweet ID result
263
+ created_at = tweet.created_at # get time tweet was created
264
+ text = tweet.full_text # retrieve full tweet text
265
+ location = tweet.user.location # retrieve user location
266
+ retweet = tweet.retweet_count # retrieve number of retweets
267
+ favorite = tweet.favorite_count # retrieve number of likes
268
+ with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv','a', newline='', encoding='utf-8') as csvFile7:
269
+ csv_writer7 = csv.writer(csvFile7, delimiter=',') # create an instance of csv object
270
+ csv_writer7.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row
271
+
272
+ search_words7 = "#itsokaynottobeokay" # Specifying exact phrase to search
273
+ # Exclude Links, retweets, replies
274
+ search_query7 = search_words7 + " -filter:links AND -filter:retweets AND -filter:replies"
275
+ with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv', encoding='utf-8') as data:
276
+ latest_tweet = int(list(csv.reader(data))[-1][0])
277
+ tweets_mining7(search_query7, 2000, latest_tweet)
278
+
279
+ df_itsok_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv",
280
+ names=['tweet.id', "created_at","text", "location", "retweet", "favorite"])
281
+
282
+ df_itsok_1
283
+
284
+ """## "#depression" """
285
+
286
+ ## Create a function for tweets mining
287
+ def tweets_mining8(search_query8, num_tweets8, since_id_num8):
288
+ # Collect tweets using the Cursor object
289
+ # Each item in the iterator has various attributes that you can access to get information about each tweet
290
+ tweet_list8 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query8, lang="en", since_id=since_id_num8,
291
+ tweet_mode='extended').items(num_tweets8)]
292
+
293
+ # Begin scraping the tweets individually:
294
+ for tweet in tweet_list8[::-1]:
295
+ tweet_id = tweet.id # get Tweet ID result
296
+ created_at = tweet.created_at # get time tweet was created
297
+ text = tweet.full_text # retrieve full tweet text
298
+ location = tweet.user.location # retrieve user location
299
+ retweet = tweet.retweet_count # retrieve number of retweets
300
+ favorite = tweet.favorite_count # retrieve number of likes
301
+ with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv','a', newline='', encoding='utf-8') as csvFile8:
302
+ csv_writer8 = csv.writer(csvFile8, delimiter=',') # create an instance of csv object
303
+ csv_writer8.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row
304
+
305
+ search_words8 = "#depression" # Specifying exact phrase to search
306
+ # Exclude Links, retweets, replies
307
+ search_query8 = search_words8 + " -filter:links AND -filter:retweets AND -filter:replies"
308
+ with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv', encoding='utf-8') as data:
309
+ latest_tweet = int(list(csv.reader(data))[-1][0])
310
+ tweets_mining8(search_query8, 1000, latest_tweet)
311
+
312
+ df_depression_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv",
313
+ names=['tweet.id', "created_at","text", "location", "retweet", "favorite"])
314
+
315
+ df_depression_1
316
+
317
+ ## Finding unique values in each column
318
+ for col in df_depression_1:
319
+ print("There are ", len(df_depression_1[col].unique()), "unique values in ", col)
320
+
321
+ """## "#sad" """
322
+
323
+ ## Create a function for tweets mining
324
+ def tweets_mining9(search_query9, num_tweets9, since_id_num9):
325
+ # Collect tweets using the Cursor object
326
+ # Each item in the iterator has various attributes that you can access to get information about each tweet
327
+ tweet_list9 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query9, lang="en", since_id=since_id_num9,
328
+ tweet_mode='extended').items(num_tweets9)]
329
+
330
+ # Begin scraping the tweets individually:
331
+ for tweet in tweet_list9[::-1]:
332
+ tweet_id = tweet.id # get Tweet ID result
333
+ created_at = tweet.created_at # get time tweet was created
334
+ text = tweet.full_text # retrieve full tweet text
335
+ location = tweet.user.location # retrieve user location
336
+ retweet = tweet.retweet_count # retrieve number of retweets
337
+ favorite = tweet.favorite_count # retrieve number of likes
338
+ with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv','a', newline='', encoding='utf-8') as csvFile9:
339
+ csv_writer9 = csv.writer(csvFile9, delimiter=',') # create an instance of csv object
340
+ csv_writer9.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row
341
+
342
+ search_words9 = "#sad" # Specifying exact phrase to search
343
+ # Exclude Links, retweets, replies
344
+ search_query9 = search_words9 + " -filter:links AND -filter:retweets AND -filter:replies"
345
+ with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv', encoding='utf-8') as data:
346
+ latest_tweet = int(list(csv.reader(data))[-1][0])
347
+ tweets_mining9(search_query9, 2000, latest_tweet)
348
+
349
+ df_sad_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv",
350
+ names=['tweet.id', "created_at","text", "location", "retweet", "favorite"])
351
+
352
+ df_sad_1
353
+
354
+ """# Combining all the tweets"""
355
+
356
+ import glob
357
+
358
+ path = r'/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API' # use your path
359
+ all_files = glob.glob(path + "/*.csv")
360
+
361
+ tweets = []
362
+
363
+ for filename in all_files:
364
+ df = pd.read_csv(filename,
365
+ names=['tweet.id', "created_at","text", "location", "retweet", "favorite"]) # Convert each csv to a dataframe
366
+ tweets.append(df)
367
+
368
+ tweets_df = pd.concat(tweets, ignore_index=True) # Merge all dataframes
369
+ #tweets_df.columns=['tweet.id', "created_at","text", "location", "retweet", "favorite"]
370
+ tweets_df.head()
371
+
372
+ tweets_df
373
+
374
+ tweets_df.to_csv('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/output/depressive_tweets.csv')
375
+
376
+ """## Data cleaning
377
+
378
+ Data cleaning is one of the essential steps because without a proper cleaning procedure you will have errors in your analysis and eventually in your data-driven results. Here I try to eliminate duplicate tweets by using the primary key ('tweet.id'), check for empty rows, and replace "NaN" values if there are any.
379
+ """
380
+
381
+ tweets_df.shape #Get number of rows and columns
382
+
383
+ ## Check the data type of each column
384
+ tweets_df.dtypes.to_frame().rename(columns={0:'data_type'})
385
+
386
+ ## Finding unique values in each column
387
+ for col in tweets_df:
388
+ print("There are ", len(tweets_df[col].unique()), "unique values in ", col)
source_code/notebooks/modeling.py ADDED
@@ -0,0 +1,378 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """modeling.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1x78fRDZAuK5FaSTKHPGy8eSbZ_gYAFr6
8
+ """
9
+
10
+ from google.colab import drive
11
+ drive.mount('/content/drive')
12
+
13
+ #!pip install -qqq h5py
14
+
15
+ #!pip install --upgrade -qqq gensim
16
+
17
+ !python -m spacy download en_core_web_lg
18
+
19
+ !pip install -U SpaCy==2.2.0
20
+
21
+ ## Import required libraries
22
+
23
+ ## warnings
24
+ import warnings
25
+ warnings.filterwarnings("ignore")
26
+
27
+ ## for data
28
+ import numpy as np
29
+ import pandas as pd
30
+
31
+ ## for plotting
32
+ import matplotlib.pyplot as plt
33
+ import seaborn as sns
34
+
35
+ ## TF-IDF
36
+ from sklearn.feature_extraction.text import TfidfVectorizer
37
+
38
+ ## T-Sne
39
+ from yellowbrick.text import TSNEVisualizer
40
+ from sklearn import manifold
41
+
42
+ ## Train-Test Split
43
+ from sklearn.model_selection import train_test_split
44
+
45
+ ## Feature selection
46
+ from sklearn import feature_selection
47
+
48
+ ## libraries for classification
49
+ from sklearn.pipeline import Pipeline
50
+ import sklearn.metrics as skm
51
+ from sklearn.metrics import confusion_matrix, accuracy_score
52
+ from sklearn.linear_model import LogisticRegression
53
+ from sklearn.neighbors import KNeighborsClassifier
54
+ from sklearn.svm import SVC
55
+ from sklearn.tree import DecisionTreeClassifier
56
+ from sklearn.neural_network import MLPClassifier
57
+ from sklearn.ensemble import RandomForestClassifier
58
+
59
+ ## for saving model
60
+ import pickle
61
+
62
+ ## for explainer
63
+ #from lime import lime_text
64
+
65
+ ## detokenization
66
+ from nltk.tokenize.treebank import TreebankWordDetokenizer
67
+
68
+ ## for word embedding with gensim
69
+ import gensim
70
+ import gensim.downloader as gensim_api
71
+ from gensim.models import Word2Vec
72
+ from gensim.models import KeyedVectors
73
+ from keras.preprocessing.text import Tokenizer
74
+ from keras.preprocessing.sequence import pad_sequences
75
+
76
+ ## for word embedding with Spacy
77
+ import spacy
78
+ import en_core_web_lg
79
+
80
+ ## for deep learning
81
+ from keras.models import load_model
82
+ from keras.models import Model, Sequential
83
+ from keras.callbacks import EarlyStopping, ModelCheckpoint
84
+ from keras.layers import Conv1D, Dense, Input, LSTM, Embedding, Dropout, Activation, MaxPooling1D
85
+ from tensorflow.keras import models, layers, preprocessing as kprocessing
86
+ from tensorflow.keras import backend as K
87
+ import tensorflow as tf
88
+ import keras
89
+ from keras.layers import Lambda
90
+ import tensorflow as tf
91
+ from keras.models import model_from_json
92
+
93
+ ## for bert language model
94
+ #import transformers
95
+
96
+ """## Loading the dataset:"""
97
+
98
+ df_all = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/data_cleaning/processed_data/processed_data.csv",
99
+ sep='\t', encoding='utf-8')
100
+
101
+ df_all
102
+
103
+ """## Classification models as well as LSTM with pretrained model(Spacy):
104
+
105
+ In order to run a supervised learning model, we first need to convert the clean_text into feature representation.
106
+ """
107
+
108
+ nlp = en_core_web_lg.load()
109
+
110
+ ## word-embedding
111
+ all_vectors = pd.np.array([pd.np.array([token.vector for token in nlp(s)]).mean(axis=0) * pd.np.ones((300)) \
112
+ for s in df_all['clean_text']])
113
+
114
+ # split out validation dataset for the end
115
+ Y= df_all["label"]
116
+ X = all_vectors
117
+
118
+ from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
119
+ validation_size = 0.3
120
+ seed = 7
121
+ X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=validation_size, random_state=seed)
122
+
123
+ # test options for classification
124
+ num_folds = 10
125
+ seed = 7
126
+ scoring = 'accuracy'
127
+
128
+ ## spot check the algorithms
129
+ models = []
130
+ models.append(('LR', LogisticRegression()))
131
+ models.append(('KNN', KNeighborsClassifier()))
132
+ models.append(('CART', DecisionTreeClassifier()))
133
+ models.append(('SVM', SVC()))
134
+ ## Neural Network
135
+ models.append(('NN', MLPClassifier()))
136
+ ## Ensemble Models
137
+ models.append(('RF', RandomForestClassifier()))
138
+
139
+ ## Running the classification models
140
+ results = []
141
+ names = []
142
+ kfold_results = []
143
+ test_results = []
144
+ train_results = []
145
+ for name, model in models:
146
+ kfold = KFold(n_splits=num_folds, random_state=seed)
147
+ cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
148
+ results.append(cv_results)
149
+ names.append(name)
150
+ #msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
151
+ #print(msg)
152
+
153
+ # Full Training period
154
+ res = model.fit(X_train, Y_train)
155
+ train_result = accuracy_score(res.predict(X_train), Y_train)
156
+ train_results.append(train_result)
157
+
158
+ # Test results
159
+ test_result = accuracy_score(res.predict(X_test), Y_test)
160
+ test_results.append(test_result)
161
+
162
+ msg = "%s: %f (%f) %f %f" % (name, cv_results.mean(), cv_results.std(), train_result, test_result)
163
+ print(msg)
164
+ print(confusion_matrix(res.predict(X_test), Y_test))
165
+ #print(classification_report(res.predict(X_test), Y_test))
166
+
167
+ # compare algorithms
168
+ from matplotlib import pyplot
169
+ fig = pyplot.figure()
170
+ ind = np.arange(len(names)) # the x locations for the groups
171
+ width = 0.35 # the width of the bars
172
+ fig.suptitle('Algorithm Comparison')
173
+ ax = fig.add_subplot(111)
174
+ pyplot.bar(ind - width/2, train_results, width=width, label='Train Error')
175
+ pyplot.bar(ind + width/2, test_results, width=width, label='Test Error')
176
+ fig.set_size_inches(15,8)
177
+ pyplot.legend()
178
+ ax.set_xticks(ind)
179
+ ax.set_xticklabels(names)
180
+ pyplot.show()
181
+
182
+ """The best model with the highest accuracy is **Support Vector Machine (SVM)** with **85.79%** accuracy on the test dataset. Logistic Regression performed well too, but we see an overfitting problem with CART, NN and RF.
183
+
184
+ ### LSTM model:
185
+ """
186
+
187
+ ### Create sequence
188
+ vocabulary_size = 20000
189
+ tokenizer = Tokenizer(num_words= vocabulary_size)
190
+ tokenizer.fit_on_texts(df_all['clean_text'])
191
+ sequences = tokenizer.texts_to_sequences(df_all['clean_text'])
192
+ X_LSTM = pad_sequences(sequences, maxlen=50)
193
+
194
+ ## Split the data into train and test
195
+ Y_LSTM = df_all["label"]
196
+ X_train_LSTM, X_test_LSTM, Y_train_LSTM, Y_test_LSTM = train_test_split(X_LSTM, \
197
+ Y_LSTM, test_size=validation_size, random_state=seed)
198
+
199
+ from keras.wrappers.scikit_learn import KerasClassifier
200
+ def create_model(input_length=50):
201
+ model = Sequential()
202
+ model.add(Embedding(20000, 300, input_length=50))
203
+ model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
204
+ model.add(Dense(1, activation='sigmoid'))
205
+ model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
206
+ return model
207
+ model_LSTM = KerasClassifier(build_fn=create_model, epochs=3, verbose=1, validation_split=0.4)
208
+ model_LSTM.fit(X_train_LSTM, Y_train_LSTM)
209
+
210
+ train_result_LSTM = accuracy_score(model_LSTM.predict(X_train_LSTM), Y_train_LSTM)
211
+ # Test results
212
+ test_result_LSTM = accuracy_score(model_LSTM.predict(X_test_LSTM), Y_test_LSTM)
213
+
214
+ print("train result:", train_result_LSTM)
215
+ print("test result:", test_result_LSTM)
216
+
217
+ confusion_matrix(model_LSTM.predict(X_test_LSTM), Y_test_LSTM)
218
+
219
+ """### Compare all the models:"""
220
+
221
+ train_results.append(train_result_LSTM);test_results.append(test_result_LSTM)
222
+ names.append("LSTM")
223
+
224
+ # compare algorithms
225
+ from matplotlib import pyplot
226
+ fig = pyplot.figure()
227
+ ind = np.arange(len(names)) # the x locations for the groups
228
+ width = 0.35 # the width of the bars
229
+ fig.suptitle('Algorithm Comparison')
230
+ ax = fig.add_subplot(111)
231
+ pyplot.bar(ind - width/2, train_results, width=width, label='Train Error')
232
+ pyplot.bar(ind + width/2, test_results, width=width, label='Test Error')
233
+ fig.set_size_inches(15,8)
234
+ pyplot.legend()
235
+ ax.set_xticks(ind)
236
+ ax.set_xticklabels(names)
237
+ pyplot.show()
238
+ plt.savefig('/content/drive/MyDrive/NLP/Depression_Detection/modeling/classification_comparision.png')
239
+
240
+ """## Evaluate the performance:
241
+
242
+ * **Accuracy:** the fraction of predictions the model got right.
243
+ * **Confusion Matrix:** a summary table that breaks down the number of correct and incorrect predictions by each class.
244
+ * **ROC:** a plot that illustrates the true positive rate against the false positive rate at various threshold settings. The area under the curve (AUC) indicates the probability that the classifier will rank a randomly chosen positive observation higher than a randomly chosen negative one.
245
+ * **Precision:** the fraction of relevant instances among the retrieved instances.
246
+ * **Recall:** the fraction of the total amount of relevant instances that were actually retrieved.
247
+ """
248
+
249
+ def conf_matrix_acc(y_true, y_pred):
250
+ ## Plot confusion matrix
251
+ cm = confusion_matrix(y_true, y_pred)
252
+ fig, ax = plt.subplots()
253
+ sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues,
254
+ cbar=False)
255
+ ax.set(xlabel="Pred", ylabel="True", xticklabels=classes,
256
+ yticklabels=classes, title="Confusion matrix")
257
+ plt.yticks(rotation=0)
258
+ print("=========================================")
259
+ print(f'Accuracy score is : {accuracy_score(y_true, y_pred)}')
260
+ print("=========================================")
261
+ print("Detail:")
262
+ print(skm.classification_report(y_true, y_pred))
263
+
264
+ ## Plot ROC and precision-recall curve
265
+ def roc_precision_auc():
266
+ fig, ax = plt.subplots(nrows=1, ncols=2)
267
+ ## Plot roc
268
+ for i in range(len(classes)):
269
+ fpr, tpr, thresholds = skm.roc_curve(y_test_array[:,i],
270
+ probs[:,i])
271
+ ax[0].plot(fpr, tpr, lw=3,
272
+ label='{0} (area={1:0.2f})'.format(classes[i],
273
+ skm.auc(fpr, tpr))
274
+ )
275
+ ax[0].plot([0,1], [0,1], color='navy', lw=3, linestyle='--')
276
+ ax[0].set(xlim=[-0.05,1.0], ylim=[0.0,1.05],
277
+ xlabel='False Positive Rate',
278
+ ylabel="True Positive Rate (Recall)",
279
+ title="Receiver operating characteristic")
280
+ ax[0].legend(loc="lower right")
281
+ ax[0].grid(True)
282
+
283
+ ## Plot precision-recall curve
284
+ for i in range(len(classes)):
285
+ precision, recall, thresholds = skm.precision_recall_curve(
286
+ y_test_array[:,i], probs[:,i])
287
+ ax[1].plot(recall, precision, lw=3,
288
+ label='{0} (area={1:0.2f})'.format(classes[i],
289
+ skm.auc(recall, precision))
290
+ )
291
+ ax[1].set(xlim=[0.0,1.05], ylim=[0.0,1.05], xlabel='Recall',
292
+ ylabel="Precision", title="Precision-Recall curve")
293
+ ax[1].legend(loc="best")
294
+ ax[1].grid(True)
295
+ plt.show()
296
+ #plt.savefig('/content/drive/MyDrive/NLP/Depression_Detection/modeling/ROC_Precision_LR.png')
297
+ #plt.savefig('/content/drive/MyDrive/NLP/Depression_Detection/modeling/ROC_Precision_SVM.png')
298
+ ## AUC score
299
+ print(f'AUC score is : {skm.roc_auc_score(Y_test, probs[:,1])}')
300
+
301
+ """## Support Vector Machine(SVM) with word embedding:"""
302
+
303
+ nlp = en_core_web_lg.load()
304
+
305
+ ## word-embedding
306
+ all_vectors = pd.np.array([pd.np.array([token.vector for token in nlp(s)]).mean(axis=0) * pd.np.ones((300)) \
307
+ for s in df_all['clean_text']])
308
+
309
+ # split out validation dataset for the end
310
+ Y= df_all["label"]
311
+ X = all_vectors
312
+
313
+ from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
314
+ validation_size = 0.3
315
+ seed = 7
316
+ X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=validation_size, random_state=seed)
317
+
318
+ # test options for classification
319
+ num_folds = 10
320
+ seed = 7
321
+ scoring = 'accuracy'
322
+
323
+ #Create a svm Classifier
324
+ clf = SVC(probability=True)
325
+
326
+ ## Running the svm Classifier
327
+
328
+ # Full Training period
329
+ res = clf.fit(X_train, Y_train)
330
+ train_result = accuracy_score(res.predict(X_train), Y_train)
331
+ test_result = accuracy_score(res.predict(X_test), Y_test)
332
+
333
+ print("train_result:", "test_result:", train_result, test_result, sep=" ")
334
+
335
+ ## Save the Model to file in the current working directory
336
+ SVM = "/content/drive/MyDrive/NLP/Depression_Detection/modeling/model_svm1.pkl"
337
+
338
+ with open(SVM, 'wb') as file:
339
+ pickle.dump(clf, file)
340
+
341
+ ## Load the Model back from file
342
+ with open(SVM, 'rb') as file:
343
+ clf = pickle.load(file)
344
+
345
+ clf
346
+
347
+ ## Test results
348
+ ##
349
+ y_pred_svm = res.predict(X_test)
350
+ classes = np.unique(Y_test.to_list())
351
+ y_test_array = pd.get_dummies(Y_test, drop_first=False).values
352
+ probs = res.predict_proba(X_test)
353
+ conf_matrix_acc(Y_test.to_list(),y_pred_svm)
354
+ roc_precision_auc()
355
+
356
+ """## Exploring False positive and False negative:"""
357
+
358
+ ## creating lists of true values and predictions
359
+ y_test_1 = [x for x in Y_test]
360
+ y_pred_lr_1 = [x for x in y_pred_svm]
361
+
362
+ ## Find the indices of wrong predictions
363
+ idx = []
364
+ for i in range(len(y_test_1)):
365
+ if y_test_1[i] != y_pred_lr_1[i]:
366
+ idx.append(i)
367
+ i+=1
368
+
369
+ print(f"There are {len(idx)} wrong predictions")
370
+
371
+ wrong_arr = cv.inverse_transform(X_test_tfidf[idx])
372
+
373
+ ## detokenize the wrong array
374
+ detokenized = [TreebankWordDetokenizer().detokenize(x) for x in wrong_arr]
375
+
376
+ detokenized[:50]
377
+
378
+ """There is no specific patterns between false positive and false negative predictions."""
source_code/notebooks/old_models.py ADDED
@@ -0,0 +1,637 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """old_models.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1Oc7A5TaGLg1qkYXzf0qLGIe0_ZxyAnXE
8
+
9
+ This notebook contains feature selection with a Chi-Square test, Logistic Regression with TF-IDF, as well as a Bidirectional LSTM with gensim, to classify a given tweet as depressive or non-depressive.
10
+ """
11
+
12
+ from google.colab import drive
13
+ drive.mount('/content/drive')
14
+
15
+ ## Import required libraries
16
+
17
+ ## warnings
18
+ import warnings
19
+ warnings.filterwarnings("ignore")
20
+
21
+ ## for data
22
+ import numpy as np
23
+ import pandas as pd
24
+
25
+ ## for plotting
26
+ import matplotlib.pyplot as plt
27
+ import seaborn as sns
28
+
29
+ ## TF-IDF
30
+ from sklearn.feature_extraction.text import TfidfVectorizer
31
+
32
+ ## T-Sne
33
+ from yellowbrick.text import TSNEVisualizer
34
+ from sklearn import manifold
35
+
36
+ ## Train-Test Split
37
+ from sklearn.model_selection import train_test_split
38
+
39
+ ## Feature selection
40
+ from sklearn import feature_selection
41
+
42
+ ## libraries for classification
43
+ from sklearn.pipeline import Pipeline
44
+ import sklearn.metrics as skm
45
+ from sklearn.metrics import confusion_matrix, accuracy_score
46
+ from sklearn.linear_model import LogisticRegression
47
+ from sklearn.neighbors import KNeighborsClassifier
48
+ from sklearn.svm import SVC
49
+ from sklearn.tree import DecisionTreeClassifier
50
+ from sklearn.neural_network import MLPClassifier
51
+ from sklearn.ensemble import RandomForestClassifier
52
+
53
+ ## for saving model
54
+ import pickle
55
+
56
+ ## detokenization
57
+ from nltk.tokenize.treebank import TreebankWordDetokenizer
58
+
59
+ ## for word embedding with gensim
60
+ import gensim
61
+ import gensim.downloader as gensim_api
62
+ from gensim.models import Word2Vec
63
+ from gensim.models import KeyedVectors
64
+ from keras.preprocessing.text import Tokenizer
65
+ from keras.preprocessing.sequence import pad_sequences
66
+
67
+ ## for word embedding with Spacy
68
+ # import spacy
69
+ # import en_core_web_lg
70
+
71
+ ## for deep learning
72
+ from keras.models import load_model
73
+ from keras.models import Model, Sequential
74
+ from keras.callbacks import EarlyStopping, ModelCheckpoint
75
+ from keras.layers import Conv1D, Dense, Input, LSTM, Embedding, Dropout, Activation, MaxPooling1D
76
+ from tensorflow.keras import models, layers, preprocessing as kprocessing
77
+ from tensorflow.keras import backend as K
78
+ import tensorflow as tf
79
+ import keras
80
+ from keras.layers import Lambda
81
+ import tensorflow as tf
82
+ from keras.models import model_from_json
83
+
84
+ ## for bert language model
85
+ #import transformers
86
+
87
+ """## Loading the dataset:"""
88
+
89
+ df_all = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/data_cleaning/processed_data/processed_data.csv",
90
+ sep='\t', encoding='utf-8')
91
+
92
+ df_all
93
+
94
+ """## Feature selection
95
+
96
+ In order to drop some columns and reduce the matrix dimensionality, we can carry out some Feature Selection, the process of selecting a subset of relevant variables. I will proceed as follows:
97
+
98
+
99
+
100
+ 1. treat each category as binary (for example, the “depressive” category is 1 for the depressive tweets and 0 for non_depressive);
101
+ 2. perform a Chi-Square test to determine whether a feature and the (binary) target are independent;
102
+ 3. keep only the features with a certain p-value from the Chi-Square test.
103
+
104
+ This snippet of code is derived from https://towardsdatascience.com/text-classification-with-nlp-tf-idf-vs-word2vec-vs-bert-41ff868d1794
105
+ """
106
+
107
## Chi-square feature selection: for each class, test each TF-IDF feature
## against the binarised target and keep only features whose association is
## significant (score = 1 - p > 0.95, i.e. p < 0.05).
## Relies on module-level globals: y_train, cv, X_train_tfidf.
y = y_train
X_names = cv.get_feature_names()  # NOTE(review): deprecated in sklearn>=1.0, use get_feature_names_out() once upgraded
p_value_limit = 0.95
frames = []
for cat in np.unique(y):
    chi2_stat, p = feature_selection.chi2(X_train_tfidf, y == cat)
    frames.append(pd.DataFrame(
        {"feature": X_names, "score": 1 - p, "y": cat}))
## BUGFIX: pd.concat replaces DataFrame.append, which was deprecated and
## removed in pandas 2.x.
df_features = pd.concat(frames, ignore_index=True)
df_features = df_features.sort_values(["y", "score"],
                                      ascending=[True, False])
df_features = df_features[df_features["score"] > p_value_limit]
X_names = df_features["feature"].unique().tolist()

print(len(X_names))
121
+
122
+ """I reduced the number of features from 20018 to 688 by keeping the most statistically relevant ones. Let’s print some:"""
123
+
124
+ for cat in np.unique(y):
125
+ print("# {}:".format(cat))
126
+ print(" . selected features:",
127
+ len(df_features[df_features["y"]==cat]))
128
+ print(" . top features:", ",".join(df_features[df_features["y"]==cat]["feature"].values[:10]))
129
+ print(" ")
130
+
131
+ """## Logistic Regression with TFIDF:
132
+
133
+ ### Splitting data to train and test datasets:
134
+ """
135
+
136
+ ## split dataset to train and test
137
+ X_train, X_test, y_train, y_test = train_test_split(df_all['clean_text'], df_all['label'], test_size=0.3, random_state= 42)
138
+
139
+ X_train.shape, X_test.shape, y_train.shape, y_test.shape
140
+
141
+ """### TF-IDF
142
+
143
+ TF-IDF (term frequency and inverse document frequency):
144
+ """
145
+
146
+ ## Creating the TF-IDF model
147
+ cv = TfidfVectorizer()
148
+ cv.fit(X_train.to_list())
149
+ dic_vocabulary = cv.vocabulary_
150
+
151
+ X_train_tfidf = cv.transform(X_train.to_list())
152
+
153
+ X_test_tfidf = cv.transform(X_test.to_list())
154
+
155
+ cv.inverse_transform(X_test_tfidf[0])
156
+
157
+ X_train_tfidf.shape
158
+
159
+ # ## Adding clean tweets to a list called corpus
160
+ # corpus = []
161
+ # corpus = [x for x in df_train['clean_text']]
162
+ # # corpus = df_train["clean_text"]
163
+
164
+ """The feature matrix X_train_tfidf has a shape of 16,464 (Number of documents in training) x 20018 (Length of vocabulary) and it’s pretty sparse:"""
165
+
166
+ sns.heatmap(X_train_tfidf.todense()[:,np.random.randint(0,X_train_tfidf.shape[1],100)]==0, vmin=0, vmax=1, cbar=False).set_title('Sparse Matrix Sample')
167
+
168
+ """In order to know the position of a certain word, we can look it up in the vocabulary:"""
169
+
170
+ word = "mental"
171
+ dic_vocabulary[word]
172
+
173
+ """Build a scikit-learn pipeline: a sequential application of a list of transformations and a final estimator. Putting the Tf-Idf vectorizer and Logistic Regression classifier in a pipeline allows us to transform and predict test data in just one step."""
174
+
175
+ # classifier = LogisticRegression(solver='liblinear', penalty='l1')
176
+
177
+ # ## pipeline
178
+ # model = Pipeline([("vectorizer", cv),
179
+ # ("classifier", classifier)])
180
+ # ## train classifier
181
+ # model["classifier"].fit(X_train, y_train)
182
+ # ## test
183
+ # predicted = model.predict(X_test)
184
+ # predicted_prob = model.predict_proba(X_test)
185
+
186
+ # ## creating the instance of the models
187
+ lr = LogisticRegression(solver='liblinear', penalty='l1')
188
+ ## fitting the model
189
+ print(lr.fit(X_train_tfidf, y_train.to_list()))
190
+
191
+ ## Save the model to a file in the current working directory
192
+ LogisticReg = "/content/drive/MyDrive/NLP/Depression_Detection/modeling/model_LogReg.pkl"
193
+
194
+ with open(LogisticReg, 'wb') as file:
195
+ pickle.dump(lr, file)
196
+
197
+ ## Load the Model back from file
198
+ with open(LogisticReg, 'rb') as file:
199
+ lr = pickle.load(file)
200
+
201
+ lr
202
+
203
+ ## Test
204
+ y_pred_lr = lr.predict(X_test_tfidf)
205
+ probs = lr.predict_proba(X_test_tfidf)
206
+ classes = np.unique(y_test.to_list())
207
+ y_test_array = pd.get_dummies(y_test, drop_first=False).values
208
+
209
+ """## Evaluate the performance:
210
+
211
+ * **Accuracy:** the fraction of predictions the model got right.
212
+ * **Confusion Matrix:** a summary table that breaks down the number of correct and incorrect predictions by each class.
213
+ * **ROC:** a plot that illustrates the true positive rate against the false positive rate at various threshold settings. The area under the curve (AUC) indicates the probability that the classifier will rank a randomly chosen positive observation higher than a randomly chosen negative one.
214
+ * **Precision:** the fraction of relevant instances among the retrieved instances.
215
+ * **Recall:** the fraction of the total amount of relevant instances that were actually retrieved.
216
+ """
217
+
218
def conf_matrix_acc(y_true, y_pred):
    """Draw a confusion-matrix heatmap and print accuracy + classification report.

    Depends on the module-level ``classes`` array for the axis tick labels.
    """
    ## Confusion-matrix heatmap
    matrix = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots()
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues,
                cbar=False)
    ax.set(xlabel="Pred", ylabel="True", xticklabels=classes,
           yticklabels=classes, title="Confusion matrix")
    plt.yticks(rotation=0)

    ## Text summary
    separator = "========================================="
    print(separator)
    print(f'Accuracy score is : {accuracy_score(y_true, y_pred)}')
    print(separator)
    print("Detail:")
    print(skm.classification_report(y_true, y_pred))
232
+
233
## Plot ROC and precision-recall curve
def roc_precision_auc():
    """Plot per-class ROC and precision-recall curves and print the AUC score.

    Uses the module-level ``classes``, ``y_test_array``, ``probs`` and
    ``y_test`` computed above. The figure is saved *before* ``plt.show()``
    because ``show()`` flushes the current figure.
    """
    fig, ax = plt.subplots(nrows=1, ncols=2)

    ## ROC curve (one line per class)
    for i in range(len(classes)):
        fpr, tpr, thresholds = skm.roc_curve(y_test_array[:, i],
                                             probs[:, i])
        ax[0].plot(fpr, tpr, lw=3,
                   label='{0} (area={1:0.2f})'.format(classes[i],
                                                      skm.auc(fpr, tpr))
                   )
    ax[0].plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
    ax[0].set(xlim=[-0.05, 1.0], ylim=[0.0, 1.05],
              xlabel='False Positive Rate',
              ylabel="True Positive Rate (Recall)",
              title="Receiver operating characteristic")
    ax[0].legend(loc="lower right")
    ax[0].grid(True)

    ## Precision-recall curve (one line per class)
    for i in range(len(classes)):
        precision, recall, thresholds = skm.precision_recall_curve(
            y_test_array[:, i], probs[:, i])
        ax[1].plot(recall, precision, lw=3,
                   label='{0} (area={1:0.2f})'.format(classes[i],
                                                      skm.auc(recall, precision))
                   )
    ax[1].set(xlim=[0.0, 1.05], ylim=[0.0, 1.05], xlabel='Recall',
              ylabel="Precision", title="Precision-Recall curve")
    ax[1].legend(loc="best")
    ax[1].grid(True)

    ## BUGFIX: savefig must come before show() — show() clears the current
    ## figure, so the original saved an empty image.
    #plt.savefig('/content/drive/MyDrive/NLP/Depression_Detection/modeling/ROC_Precision_LR.png')
    plt.savefig('/content/drive/MyDrive/NLP/Depression_Detection/modeling/ROC_Precision_SVM.png')
    plt.show()

    ## BUGFIX: the original referenced the undefined name `Y_test` (NameError).
    print(f'AUC score is : {skm.roc_auc_score(y_test, probs[:,1])}')
269
+
270
+ conf_matrix_acc(y_test.to_list(),y_pred_lr)
271
+
272
+ roc_precision_auc()
273
+
274
+
275
+
276
+ """## Bidirectional LSTM:
277
+
278
+ In Python, you can load a pre-trained Word Embedding model from genism-data like this:
279
+ """
280
+
281
+ nlp_pre = gensim_api.load("word2vec-google-news-300")
282
+
283
+ word = "anxiety"
284
+ fig = plt.figure()
285
+ ## word embedding
286
+ tot_words = [word] + [tupla[0] for tupla in
287
+ nlp_pre.most_similar(word, topn=20)]
288
+ X = nlp_pre[tot_words]
289
+ ## pca to reduce dimensionality from 300 to 3
290
+ pca = manifold.TSNE(perplexity=40, n_components=3, init='pca')
291
+ X = pca.fit_transform(X)
292
+ ## create dtf
293
+ dtf_ = pd.DataFrame(X, index=tot_words, columns=["x","y","z"])
294
+ dtf_["input"] = 0
295
+ dtf_["input"].iloc[0:1] = 1
296
+ ## plot 3d
297
+ from mpl_toolkits.mplot3d import Axes3D
298
+ ax = fig.add_subplot(111, projection='3d')
299
+ ax.scatter(dtf_[dtf_["input"]==0]['x'],
300
+ dtf_[dtf_["input"]==0]['y'],
301
+ dtf_[dtf_["input"]==0]['z'], c="black")
302
+ ax.scatter(dtf_[dtf_["input"]==1]['x'],
303
+ dtf_[dtf_["input"]==1]['y'],
304
+ dtf_[dtf_["input"]==1]['z'], c="red")
305
+ ax.set(xlabel=None, ylabel=None, zlabel=None, xticklabels=[],
306
+ yticklabels=[], zticklabels=[])
307
+ for label, row in dtf_[["x","y","z"]].iterrows():
308
+ x, y, z = row
309
+ ax.text(x, y, z, s=label)
310
+
311
+ """Instead of using a pre-trained model, I am going to fit my own Word2Vec on the training data corpus with gensim. Before fitting the model, the corpus needs to be transformed into a list of lists of n-grams. In this particular case, I’ll try to capture unigrams (“york”), bigrams (“new york”), and trigrams (“new york city”)."""
312
+
313
+ ## split dataset
314
+ dtf_train, dtf_test = train_test_split(df_all, test_size=0.3)
315
+ ## get target
316
+ y_train = dtf_train["label"].values
317
+ y_test = dtf_test["label"].values
318
+
319
+ corpus = []
320
+ corpus = [x for x in dtf_train['clean_text']]
321
+
322
+ ## create list of lists of unigrams
323
+ lst_corpus = []
324
+ for string in corpus:
325
+ lst_words = str(string).split()
326
+ lst_grams = [" ".join(lst_words[i:i+1])
327
+ for i in range(0, len(lst_words), 1)]
328
+ lst_corpus.append(lst_grams)
329
+
330
+ ## detect bigrams and trigrams
331
+ bigrams_detector = gensim.models.phrases.Phrases(lst_corpus,
332
+ delimiter=" ".encode(), min_count=5, threshold=10)
333
+ bigrams_detector = gensim.models.phrases.Phraser(bigrams_detector)
334
+ trigrams_detector = gensim.models.phrases.Phrases(bigrams_detector[lst_corpus],
335
+ delimiter=" ".encode(), min_count=5, threshold=10)
336
+ trigrams_detector = gensim.models.phrases.Phraser(trigrams_detector)
337
+
338
+ """When fitting the Word2Vec, you need to specify:
339
+
340
+ * the target size of the word vectors, I’ll use 300;
341
+ * the window, or the maximum distance between the current and predicted word within a sentence, I’ll use the mean length of text in the corpus;
342
+ * the training algorithm, I’ll use skip-grams (sg=1) as in general it has better results.
343
+ """
344
+
345
+ ## fit w2v
346
+ nlp = gensim.models.word2vec.Word2Vec(lst_corpus, size=300,
347
+ window=8, min_count=1, sg=1, iter=30)
348
+
349
+ """We have our embedding model, so we can select any word from the corpus and transform it into a vector."""
350
+
351
+ word = "anxiety"
352
+ nlp[word].shape
353
+
354
+ """We can even use it to visualize a word and its context into a smaller dimensional space (2D or 3D) by applying any dimensionality reduction algorithm (i.e. TSNE)."""
355
+
356
+ word = "anxiety"
357
+ fig = plt.figure()
358
+ ## word embedding
359
+ tot_words = [word] + [tupla[0] for tupla in
360
+ nlp.most_similar(word, topn=20)]
361
+ X = nlp[tot_words]
362
+ ## pca to reduce dimensionality from 300 to 3
363
+ pca = manifold.TSNE(perplexity=40, n_components=3, init='pca')
364
+ X = pca.fit_transform(X)
365
+ ## create dtf
366
+ dtf_ = pd.DataFrame(X, index=tot_words, columns=["x","y","z"])
367
+ dtf_["input"] = 0
368
+ dtf_["input"].iloc[0:1] = 1
369
+ ## plot 3d
370
+ from mpl_toolkits.mplot3d import Axes3D
371
+ ax = fig.add_subplot(111, projection='3d')
372
+ ax.scatter(dtf_[dtf_["input"]==0]['x'],
373
+ dtf_[dtf_["input"]==0]['y'],
374
+ dtf_[dtf_["input"]==0]['z'], c="black")
375
+ ax.scatter(dtf_[dtf_["input"]==1]['x'],
376
+ dtf_[dtf_["input"]==1]['y'],
377
+ dtf_[dtf_["input"]==1]['z'], c="red")
378
+ ax.set(xlabel=None, ylabel=None, zlabel=None, xticklabels=[],
379
+ yticklabels=[], zticklabels=[])
380
+ for label, row in dtf_[["x","y","z"]].iterrows():
381
+ x, y, z = row
382
+ ax.text(x, y, z, s=label)
383
+
384
+ """The word vectors can be used in a neural network as weights in the follwing procedure:
385
+ 1. Transform the corpus into padded sequences of word ids to get a feature matrix.
386
+ 2. Create an embedding matrix so that the vector of the word with id N is located at the Nth row.
387
+ 3. Build a neural network with an embedding layer that weighs every word in the sequences with the corresponding vector.
388
+
389
+ **Feature Engineering:** by transforming the same preprocessed corpus (list of lists of n-grams) given to the Word2Vec into a list of sequences using tensorflow/keras:
390
+ """
391
+
392
+ ## tokenize text
393
+ tokenizer = kprocessing.text.Tokenizer(lower=True, split=' ',
394
+ oov_token="NaN",
395
+ filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
396
+ tokenizer.fit_on_texts(lst_corpus)
397
+ dic_vocabulary = tokenizer.word_index
398
+
399
+ ## create sequence
400
+ lst_text2seq= tokenizer.texts_to_sequences(lst_corpus)
401
+
402
+ ## padding sequence
403
+ X_train = kprocessing.sequence.pad_sequences(lst_text2seq,
404
+ maxlen=35, padding="post", truncating="post")
405
+
406
+ X_train.shape
407
+
408
+ """The feature matrix X_train has a shape of 16559 x 35 (Number of sequences x Sequences max length). Let’s visualize it:"""
409
+
410
+ sns.heatmap(X_train==0, vmin=0, vmax=1, cbar=False)
411
+ plt.show()
412
+
413
+ """Every text in the corpus is now an id sequence with length 35. For instance, if a text had 20 tokens in it, then the sequence is composed of 20 ids + 15 0s, which is the padding element (while the id for word not in the vocabulary is 1)
414
+
415
+ Let’s print how a text from the train set has been transformed into a sequence with the padding and the vocabulary.
416
+ """
417
+
418
+ i = 8
419
+
420
+ ## list of text: ["I like this", ...]
421
+ len_txt = len(dtf_train["clean_text"].iloc[i].split())
422
+ print("from: ", dtf_train["clean_text"].iloc[i], "| len:", len_txt)
423
+
424
+ ## sequence of token ids: [[1, 2, 3], ...]
425
+ len_tokens = len(X_train[i])
426
+ print("to: ", X_train[i], "| len:", len(X_train[i]))
427
+
428
+ ## vocabulary: {"I":1, "like":2, "this":3, ...}
429
+ print("check: ", dtf_train["clean_text"].iloc[i].split()[0],
430
+ " -- idx in vocabulary -->",
431
+ dic_vocabulary[dtf_train["clean_text"].iloc[i].split()[0]])
432
+
433
+ print("vocabulary: ", dict(list(dic_vocabulary.items())[0:5]), "... (padding element, 0)")
434
+
435
+ corpus = dtf_test["clean_text"]
436
+
437
+ ## create list of n-grams
438
+ lst_corpus = []
439
+ for string in corpus:
440
+ lst_words = str(string).split()
441
+ lst_grams = [" ".join(lst_words[i:i+1]) for i in range(0,
442
+ len(lst_words), 1)]
443
+ lst_corpus.append(lst_grams)
444
+
445
+ ## detect common bigrams and trigrams using the fitted detectors
446
+ lst_corpus = list(bigrams_detector[lst_corpus])
447
+ lst_corpus = list(trigrams_detector[lst_corpus])
448
+ ## text to sequence with the fitted tokenizer
449
+ lst_text2seq = tokenizer.texts_to_sequences(lst_corpus)
450
+
451
+ ## padding sequence
452
+ X_test = kprocessing.sequence.pad_sequences(lst_text2seq, maxlen=35,
453
+ padding="post", truncating="post")
454
+
455
+ X_test.shape
456
+
457
+ sns.heatmap(X_test==0, vmin=0, vmax=1, cbar=False)
458
+ plt.show()
459
+
460
+ """We’ve got our X_train and X_test, now we need to create the embedding matrix that will be used as a weight matrix in the neural network."""
461
+
462
+ ## start the matrix (length of vocabulary x vector size) with all 0s
463
+ embeddings = np.zeros((len(dic_vocabulary)+1, 300))
464
+ for word,idx in dic_vocabulary.items():
465
+ ## update the row with vector
466
+ try:
467
+ embeddings[idx] = nlp[word]
468
+ ## if word not in model then skip and the row stays all 0s
469
+ except:
470
+ pass
471
+
472
+ embeddings.shape
473
+
474
+ """That code generates a matrix of shape 20,050 x 300 (Length of vocabulary extracted from the corpus x Vector size). It can be navigated by word id, which can be obtained from the vocabulary."""
475
+
476
+ word = "anxiety"
477
+ print("dic[word]:", dic_vocabulary[word], "|idx")
478
+ print("embeddings[idx]:", embeddings[dic_vocabulary[word]].shape,
479
+ "|vector")
480
+
481
+ """### Deep Learning:
482
+
483
+ It’s finally time to build a deep learning model. I’m going to use the embedding matrix in the first Embedding layer of the neural network that I will build and train to classify the news. Each id in the input sequence will be used as the index to access the embedding matrix. The output of this Embedding layer will be a 2D matrix with a word vector for each word id in the input sequence (Sequence length x Vector size). Let’s use the sentence “I like this article” as an example:
484
+
485
+ My neural network shall be structured as follows:
486
+
487
+ * An Embedding layer that takes the sequences as input and the word vectors as weights, just as described before.
488
+
489
+ * A simple Attention layer that won’t affect the predictions but it’s going to capture the weights of each instance and allow us to build a nice explainer (it isn't necessary for the predictions, just for the explainability, so you can skip it).
490
+
491
+ * Two layers of Bidirectional LSTM to model the order of words in a sequence in both directions.
492
+
493
+ * Two final dense layers that will predict the probability of each category.
494
+ """
495
+
496
## Simple attention block used before the bidirectional LSTM stack.
def attention_layer(inputs, neurons):
    """Apply softmax attention weights over the time axis of `inputs`.

    Transposes (time, features) -> (features, time), learns a softmax
    distribution of size `neurons`, transposes back, and multiplies the
    input element-wise by the resulting weights. The "attention" layer name
    lets the weights be retrieved later for explainability.
    """
    transposed = layers.Permute((2, 1))(inputs)
    scores = layers.Dense(neurons, activation="softmax")(transposed)
    weights = layers.Permute((2, 1), name="attention")(scores)
    return layers.multiply([inputs, weights])
503
+
504
+ ## input
505
+ x_in = layers.Input(shape=(35,))
506
+ ## embedding
507
+ x = layers.Embedding(input_dim=embeddings.shape[0],
508
+ output_dim=embeddings.shape[1],
509
+ weights=[embeddings],
510
+ input_length=35, trainable=False)(x_in)
511
+ ## apply attention
512
+ x = attention_layer(x, neurons=35)
513
+ ## 2 layers of bidirectional lstm
514
+ x = layers.Bidirectional(layers.LSTM(units=35, dropout=0.2,
515
+ return_sequences=True))(x)
516
+ x = layers.Bidirectional(layers.LSTM(units=35, dropout=0.2))(x)
517
+ ## final dense layers
518
+ x = layers.Dense(64, activation='relu')(x)
519
+ y_out = layers.Dense(1, activation='sigmoid')(x)
520
+ ## compile
521
+ model = models.Model(x_in, y_out)
522
+ model.compile(loss='binary_crossentropy',
523
+ optimizer='adam', metrics=['accuracy'])
524
+
525
+ model.summary()
526
+
527
+ ## encode y
528
+ dic_y_mapping = {n:label for n,label in
529
+ enumerate(np.unique(y_train))}
530
+ inverse_dic = {v:k for k,v in dic_y_mapping.items()}
531
+ y_train = np.array([inverse_dic[y] for y in y_train])
532
+ ## train
533
+ training = model.fit(x=X_train, y=y_train, batch_size=256,
534
+ epochs=30, shuffle=True, verbose=0,
535
+ validation_split=0.3)
536
+
537
+ ## plot loss and accuracy
538
+ metrics = [k for k in training.history.keys() if ("loss" not in k) and ("val" not in k)]
539
+ fig, ax = plt.subplots(nrows=1, ncols=2, sharey=True)
540
+ ax[0].set(title="Training")
541
+ ax11 = ax[0].twinx()
542
+ ax[0].plot(training.history['loss'], color='black')
543
+ ax[0].set_xlabel('Epochs')
544
+ ax[0].set_ylabel('Loss', color='black')
545
+ for metric in metrics:
546
+ ax11.plot(training.history[metric], label=metric)
547
+ ax11.set_ylabel("Score", color='steelblue')
548
+ ax11.legend()
549
+ ax[1].set(title="Validation")
550
+ ax22 = ax[1].twinx()
551
+ ax[1].plot(training.history['val_loss'], color='black')
552
+ ax[1].set_xlabel('Epochs')
553
+ ax[1].set_ylabel('Loss', color='black')
554
+ for metric in metrics:
555
+ ax22.plot(training.history['val_'+metric], label=metric)
556
+ ax22.set_ylabel("Score", color="steelblue")
557
+ plt.savefig('/content/drive/MyDrive/NLP/Depression_Detection/modeling/loss_accuracy_LSTM_3.png')
558
+ plt.show()
559
+
560
+ # serialize model to JSON
561
+ model_json = model.to_json()
562
+ with open("/content/drive/MyDrive/NLP/Depression_Detection/modeling/model.json", "w") as json_file:
563
+ json_file.write(model_json)
564
+ # serialize weights to HDF5
565
+ model.save_weights("/content/drive/MyDrive/NLP/Depression_Detection/modeling/model.h5")
566
+ print("Saved model to disk")
567
+
568
+ loaded_model = model_from_json(open("/content/drive/MyDrive/NLP/Depression_Detection/modeling/model.json", "r").read(),
569
+ custom_objects={'tf': tf})
570
+ json_file.close()
571
+ # load weights into new model
572
+ loaded_model.load_weights("/content/drive/MyDrive/NLP/Depression_Detection/modeling/model.h5")
573
+ print("Loaded model from disk")
574
+
575
+ labels_pred = model.predict(X_test)
576
+ labels_pred = np.round(labels_pred.flatten())
577
+ accuracy = accuracy_score(y_test, labels_pred)
578
+ classes = np.unique(y_test)
579
+ print("Accuracy: %.2f%%" % (accuracy*100))
580
+
581
def conf_matrix_acc2(y_true, y_pred):
    """Draw a confusion-matrix heatmap and print accuracy + classification report.

    Depends on the module-level ``classes`` array for the axis tick labels.
    """
    ## BUGFIX: the original computed the matrix and report from the global
    ## `y_test` instead of the `y_true` parameter, silently ignoring the
    ## argument the caller passed in.
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues,
                cbar=False)
    ax.set(xlabel="Pred", ylabel="True", xticklabels=classes,
           yticklabels=classes, title="Confusion matrix")
    plt.yticks(rotation=0)
    print("=========================================")
    print(f'Accuracy score is : {accuracy_score(y_true, y_pred)}')
    print("=========================================")
    print("Detail:")
    print(skm.classification_report(y_true, y_pred))
595
+
596
+ conf_matrix_acc2(y_test, labels_pred)
597
+
598
+ # classes = np.unique(y_test)
599
+ # y_test_array = pd.get_dummies(y_test, drop_first=False).values
600
+ # predicted_prob = model.predict_on_batch(X_test)
601
+
602
+ # ## Plot ROC and precision-recall curve
603
+ # def roc_precision_auc2():
604
+ # fig, ax = plt.subplots(nrows=1, ncols=2)
605
+ # ## Plot roc
606
+ # for i in range(len(classes)):
607
+ # fpr, tpr, thresholds = skm.roc_curve(y_test_array[:,i],
608
+ # predicted_prob[:,i])
609
+ # ax[0].plot(fpr, tpr, lw=3,
610
+ # label='{0} (area={1:0.2f})'.format(classes[i],
611
+ # skm.auc(fpr, tpr))
612
+ # )
613
+ # ax[0].plot([0,1], [0,1], color='navy', lw=3, linestyle='--')
614
+ # ax[0].set(xlim=[-0.05,1.0], ylim=[0.0,1.05],
615
+ # xlabel='False Positive Rate',
616
+ # ylabel="True Positive Rate (Recall)",
617
+ # title="Receiver operating characteristic")
618
+ # ax[0].legend(loc="lower right")
619
+ # ax[0].grid(True)
620
+
621
+ # ## Plot precision-recall curve
622
+ # for i in range(len(classes)):
623
+ # precision, recall, thresholds = skm.precision_recall_curve(
624
+ # y_test_array[:,i], probs[:,i])
625
+ # ax[1].plot(recall, precision, lw=3,
626
+ # label='{0} (area={1:0.2f})'.format(classes[i],
627
+ # skm.auc(recall, precision))
628
+ # )
629
+ # ax[1].set(xlim=[0.0,1.05], ylim=[0.0,1.05], xlabel='Recall',
630
+ # ylabel="Precision", title="Precision-Recall curve")
631
+ # ax[1].legend(loc="best")
632
+ # ax[1].grid(True)
633
+ # plt.show()
634
+ # #plt.savefig('/content/drive/MyDrive/NLP/Depression_Detection/modeling/ROC_Precision_LR.png')
635
+ # #plt.savefig('/content/drive/MyDrive/NLP/Depression_Detection/modeling/ROC_Precision_LSTM.png')
636
+ # ## AUC score
637
+ # print(f'AUC score is : {skm.roc_auc_score(y_test, probs[:,1])}')
source_code/notebooks/testing.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """testing.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1MCstbEJ_U20yRJDGRmZTjIpGTCzTFL_o
8
+ """
9
+
10
+ from google.colab import drive
11
+ drive.mount('/content/drive')
12
+
13
+ !pip install -qqq ftfy
14
+
15
+ !pip install -qqq json_file
16
+
17
+ !python -m spacy download en_core_web_lg
18
+
19
+ !pip install -U SpaCy==2.2.0
20
+
21
+ ## Import required libraries
22
+
23
+ ## warnings
24
+ import warnings
25
+ warnings.filterwarnings("ignore")
26
+
27
+ ## for data
28
+ import numpy as np
29
+ import pandas as pd
30
+
31
+ ## for plotting
32
+ import matplotlib.pyplot as plt
33
+ import seaborn as sns
34
+
35
+ ## Bag of Words
36
+ from sklearn.feature_extraction.text import CountVectorizer
37
+
38
+ ## TF-IDF
39
+ from sklearn.feature_extraction.text import TfidfVectorizer
40
+
41
+ ## Train-Test Split
42
+ from sklearn.model_selection import train_test_split
43
+
44
+ ## for processing
45
+ import nltk
46
+ import re
47
+ import ftfy
48
+ from nltk.stem import WordNetLemmatizer
49
+ from nltk.corpus import stopwords
50
+ nltk.download('stopwords')
51
+ nltk.download('punkt')
52
+ nltk.download('wordnet')
53
+ nltk.download('averaged_perceptron_tagger')
54
+
55
+ ## Feature selection
56
+ from sklearn import feature_selection
57
+
58
+ ## Support vector machine
59
+ from sklearn.pipeline import Pipeline
60
+ import sklearn.metrics as skm
61
+ from sklearn.metrics import confusion_matrix, accuracy_score
62
+ from sklearn.svm import SVC
63
+
64
+ ## for saving and loading model
65
+ import pickle
66
+
67
+ ## for word embedding with Spacy
68
+ import spacy
69
+ import en_core_web_lg
70
+
71
+ # ## for word embedding
72
+ # import gensim
73
+ # import gensim.downloader as gensim_api
74
+ # from gensim.models import Word2Vec
75
+ # from gensim.models import KeyedVectors
76
+ # from keras.preprocessing.text import Tokenizer
77
+ # from keras.preprocessing.sequence import pad_sequences
78
+
79
+ # ## for deep learning
80
+ # from keras.models import load_model
81
+ # from keras.models import Model, Sequential
82
+ # from keras.callbacks import EarlyStopping, ModelCheckpoint
83
+ # from keras.layers import Conv1D, Dense, Input, LSTM, Embedding, Dropout, Activation, MaxPooling1D
84
+ # from tensorflow.keras import models, layers, preprocessing as kprocessing
85
+ # from tensorflow.keras import backend as K
86
+ # from keras.models import model_from_json
87
+ # from keras.layers import Lambda
88
+ # import tensorflow as tf
89
+ # import json
90
+ # import json_file
91
+
92
# Expand Contraction
## Map of English contractions to their expanded forms, used by
## expandContractions() to normalise tweets before tokenisation.
cList = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it had",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'alls": "you alls",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had",
    "you'd've": "you would have",
    ## BUGFIX: were "you you will" / "you you will have"
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }

## BUGFIX: the cleaning pipeline lower-cases tweets before expanding, so
## keys with a capital "I" ("I'm", "I'd", ...) never matched. This
## case-insensitive lookup (lower-cased values) covers those forms.
_contraction_lookup = {k.lower(): v.lower() for k, v in cList.items()}

## BUGFIX: alternatives are sorted longest-first so that e.g. "can't've"
## matches as a whole instead of the shorter "can't" prefix winning.
## IGNORECASE makes the pattern match the lower-cased tweets as well.
c_re = re.compile(
    '(%s)' % '|'.join(re.escape(k) for k in sorted(cList, key=len, reverse=True)),
    re.IGNORECASE)

def expandContractions(text, c_re=c_re):
    """Replace every contraction in *text* with its expanded form.

    Exact-case keys keep their original replacement (e.g. "I'm" -> "I am");
    other casings fall back to the lower-cased expansion.
    """
    def replace(match):
        token = match.group(0)
        return cList.get(token, _contraction_lookup.get(token.lower(), token))
    return c_re.sub(replace, text)
220
+
221
## Function to perform stepwise cleaning process
def tweets_cleaner(tweet):
    """Clean a single raw tweet for vectorisation / embedding.

    Returns a list containing the cleaned tweet, or an empty list when the
    tweet starts with a URL (likely a news link) or is 5 characters or
    shorter — such tweets are filtered out entirely.
    """
    cleaned_tweets = []
    tweet = tweet.lower()

    # Keep only tweets that do not start with a URL and are long enough.
    if re.match("(\w+:\/\/\S+)", tweet) is None and len(tweet) > 5:

        # strip @mentions, #hashtags, emoji markers and twitter image links
        tweet = ' '.join(re.sub("(@[A-Za-z0-9]+)|(\#[A-Za-z0-9]+)|(<Emoji:.*>)|(pic\.twitter\.com\/.*)", " ", tweet).split())

        # repair mojibake / weirdly encoded characters
        tweet = ftfy.fix_text(tweet)

        # expand contractions ("can't" -> "cannot")
        tweet = expandContractions(tweet)

        # drop punctuation, keeping only alphanumerics and whitespace
        tweet = ' '.join(re.sub("([^0-9A-Za-z \t])", " ", tweet).split())

        # remove stopwords and lemmatize the remaining tokens
        stop_words = set(stopwords.words('english'))
        tokens = nltk.word_tokenize(tweet)
        lemmatizer = WordNetLemmatizer()
        kept_words = [lemmatizer.lemmatize(word) for word in tokens
                      if word not in stop_words]

        # rebuild the cleaned sentence from the surviving tokens
        cleaned_tweets.append(' '.join(kept_words))

    return cleaned_tweets
255
+
256
+ nlp = en_core_web_lg.load()
257
+
258
+ ## Load the model
259
+ SVM = "/content/drive/MyDrive/NLP/Depression_Detection/modeling/model_svm.pkl"
260
+ with open(SVM, 'rb') as file:
261
+ clf = pickle.load(file)
262
+
263
+ clf
264
+
265
+ test_tweet = "I hate my life"
266
+
267
+ corpus = tweets_cleaner(test_tweet)
268
+
269
+ corpus
270
+
271
+ ## word-embedding
272
+ test = pd.np.array([pd.np.array([token.vector for token in nlp(s)]).mean(axis=0) * pd.np.ones((300)) \
273
+ for s in corpus])
274
+
275
+ labels_pred = clf.predict(test)
276
+
277
+ labels_pred[0]
278
+
279
+ # loaded_model = model_from_json(open("/content/drive/MyDrive/NLP/Depression_Detection/modeling/model.json", "r").read(),
280
+ # custom_objects={'tf': tf})
281
+ # # load weights into new model
282
+ # loaded_model.load_weights("/content/drive/MyDrive/NLP/Depression_Detection/modeling/model.h5")
283
+ # print("Loaded model from disk")
source_code/requirements.txt ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ blis==0.4.1
2
+ certifi==2021.10.8
3
+ charset-normalizer==2.0.7
4
+ click==8.0.3
5
+ cycler==0.11.0
6
+ cymem==2.0.6
7
+ fonttools==4.28.1
8
+ ftfy==6.0.3
9
+ idna==3.3
10
+ joblib==1.1.0
11
+ kiwisolver==1.3.2
12
+ matplotlib==3.5.0
13
+ murmurhash==1.0.6
14
+ nltk==3.6.5
15
+ numpy==1.21.4
16
+ packaging==21.2
17
+ pandas==1.3.4
18
+ Pillow==8.4.0
19
+ plac==0.9.6
20
+ preshed==3.0.6
21
+ pyparsing==2.4.7
22
+ python-dateutil==2.8.2
23
+ pytz==2021.3
24
+ regex==2021.11.10
25
+ requests==2.26.0
26
+ scikit-learn==1.0.1
27
+ scipy==1.7.2
28
+ seaborn==0.11.2
29
+ setuptools-scm==6.3.2
30
+ six==1.16.0
31
+ spacy==2.2.0
32
+ srsly==1.0.5
33
+ thinc==7.1.1
34
+ threadpoolctl==3.0.0
35
+ tomli==1.2.2
36
+ tqdm==4.62.3
37
+ urllib3==1.26.7
38
+ wasabi==0.8.2
39
+ wcwidth==0.2.5
40
+ wordcloud==1.8.1
41
+ Flask==1.1.2
42
+ Flask-Bootstrap==3.3.7.1
source_code/static/brain.svg ADDED
source_code/static/overlay.css ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Cinematic Overlay Styles */
2
+ .cinematic-overlay {
3
+ position: fixed;
4
+ top: 0;
5
+ left: 0;
6
+ width: 100vw;
7
+ height: 100vh;
8
+ background: rgba(29, 161, 242, 0.98);
9
+ /* Classic Twitter Blue #1DA1F2 */
10
+ z-index: 9999;
11
+ display: none;
12
+ align-items: center;
13
+ justify-content: center;
14
+ flex-direction: column;
15
+ opacity: 0;
16
+ transition: opacity 0.8s ease-in-out;
17
+ backdrop-filter: blur(10px);
18
+ }
19
+
20
+ .cinematic-overlay.active {
21
+ opacity: 1;
22
+ }
23
+
24
+ .overlay-content {
25
+ text-align: center;
26
+ color: var(--clr-white);
27
+ }
28
+
29
+ .overlay-logo {
30
+ font-size: 7rem;
31
+ color: #ffffff;
32
+ margin-bottom: 30px;
33
+ opacity: 0;
34
+ transform: scale(0.5);
35
+ transition: all 1s cubic-bezier(0.175, 0.885, 0.32, 1.275);
36
+ }
37
+
38
+ .cinematic-overlay.active .overlay-logo {
39
+ opacity: 1;
40
+ transform: scale(1);
41
+ filter: drop-shadow(0 0 20px rgba(29, 174, 255, 0.6));
42
+ }
43
+
44
+ .overlay-title {
45
+ font-family: 'Play', sans-serif;
46
+ font-size: 3rem;
47
+ font-weight: 700;
48
+ letter-spacing: 2px;
49
+ margin-bottom: 10px;
50
+ opacity: 0;
51
+ transform: translateY(20px);
52
+ transition: all 0.8s ease 0.4s;
53
+ color: #ffffff;
54
+ /* Solid white for max contrast */
55
+ text-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
56
+ }
57
+
58
+ .cinematic-overlay.active .overlay-title {
59
+ opacity: 1;
60
+ transform: translateY(0);
61
+ }
62
+
63
+ .overlay-author {
64
+ font-family: 'Play', sans-serif;
65
+ font-size: 1rem;
66
+ font-size: 1rem;
67
+ color: #ffffff;
68
+ /* Solid white */
69
+ text-shadow: 0 1px 2px rgba(0, 0, 0, 0.2);
70
+ text-transform: uppercase;
71
+ letter-spacing: 3px;
72
+ margin-top: 40px;
73
+ margin-bottom: 15px;
74
+ opacity: 0;
75
+ transition: opacity 1s ease 0.8s;
76
+ }
77
+
78
+ .cinematic-overlay.active .overlay-author {
79
+ opacity: 1;
80
+ }
81
+
82
+ .overlay-names {
83
+ font-size: 1.5rem;
84
+ font-weight: 300;
85
+ opacity: 0;
86
+ transform: scale(0.9);
87
+ transition: all 1s ease 1s;
88
+ }
89
+
90
+ .cinematic-overlay.active .overlay-names {
91
+ opacity: 1;
92
+ transform: scale(1);
93
+ }
94
+
95
+ .overlay-names .separator {
96
+ color: #ffffff;
97
+ margin: 0 15px;
98
+ font-weight: 700;
99
+ }
source_code/static/security.js ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * security.js
 * Implements low-level security features and Easter eggs.
 */

// Disable Right Click
document.addEventListener('contextmenu', function (e) {
    e.preventDefault();
});

// Disable F12, Ctrl+Shift+I, Ctrl+Shift+J, Ctrl+U.
// Uses KeyboardEvent.key instead of the deprecated keyCode; both letter
// cases are checked because e.key reflects the shift state.
document.onkeydown = function (e) {
    if (e.key === 'F12') {
        return false;
    }
    if (e.ctrlKey && e.shiftKey && (e.key === 'I' || e.key === 'i' || e.key === 'J' || e.key === 'j')) {
        return false;
    }
    if (e.ctrlKey && (e.key === 'U' || e.key === 'u')) {
        return false;
    }
};

// Easter Egg - Console Warning
console.log("%cStop snooping around!", "color: red; font-family: sans-serif; font-size: 4.5em; font-weight: bolder; text-shadow: #000 1px 1px;");
console.log("%cThis is a project by Amey Thakur & Mega Satish.", "color: #1daeff; font-family: sans-serif; font-size: 1.5em;");
source_code/static/styles.css ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* ==============================================================================
2
+ * PROJECT: DEPRESSION-DETECTION-USING-TWEETS
3
+ * AUTHORS: AMEY THAKUR & MEGA SATISH
4
+ * GITHUB (AMEY): https://github.com/Amey-Thakur
5
+ * GITHUB (MEGA): https://github.com/msatmod
6
+ * REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
7
+ * RELEASE DATE: June 5, 2022
8
+ * LICENSE: MIT License
9
+ * DESCRIPTION: Global style sheet with a professional and personal design.
10
+ * ============================================================================== */
11
+
12
+ @import url('https://fonts.googleapis.com/css2?family=Play:wght@400;700&display=swap');
13
+
14
+ :root {
15
+ /* Color Palette */
16
+ --clr-navy: #0A192F;
17
+ --clr-slate: #8892B0;
18
+ --clr-light-slate: #A8B2D1;
19
+ --clr-white: #E6F1FF;
20
+ --clr-ivory: #F8F9FA;
21
+ --clr-accent: #64FFDA;
22
+ --clr-accent-dark: #1daeff;
23
+ --clr-bg: #F4F7FB;
24
+
25
+ /* Typography */
26
+ --font-main: 'Play', sans-serif;
27
+
28
+ /* Layout */
29
+ --header-height: 80px;
30
+ --max-width: 900px;
31
+ --transition: all 0.3s cubic-bezier(0.645, 0.045, 0.355, 1);
32
+ }
33
+
34
+ * {
35
+ box-sizing: border-box;
36
+ margin: 0;
37
+ padding: 0;
38
+ }
39
+
40
+ body {
41
+ background-color: var(--clr-bg);
42
+ color: var(--clr-navy);
43
+ font-family: var(--font-main);
44
+ line-height: 1.6;
45
+ overflow-x: hidden;
46
+ user-select: none;
47
+ }
48
+
49
+ .container {
50
+ max-width: var(--max-width);
51
+ margin: 0 auto;
52
+ padding: 0 40px;
53
+ }
54
+
55
+ /* Header & Typography */
56
+ h1,
57
+ h2,
58
+ h3 {
59
+ font-family: var(--font-main);
60
+ color: var(--clr-navy);
61
+ }
62
+
63
+ .hero-section {
64
+ padding: 80px 0 40px;
65
+ text-align: center;
66
+ }
67
+
68
+ .hero-section h1 {
69
+ font-size: 3rem;
70
+ margin-bottom: 20px;
71
+ font-weight: 700;
72
+ }
73
+
74
+ .hero-section p {
75
+ font-size: 1.2rem;
76
+ color: var(--clr-slate);
77
+ max-width: 600px;
78
+ margin: 0 auto;
79
+ }
80
+
81
+ /* Analysis Card */
82
+ .analysis-card {
83
+ background: white;
84
+ border-radius: 12px;
85
+ box-shadow: 0 10px 30px -15px rgba(2, 12, 27, 0.1);
86
+ padding: 40px;
87
+ margin-bottom: 30px;
88
+ }
89
+
90
+ .input-group {
91
+ margin-top: 30px;
92
+ }
93
+
94
+ textarea.analysis-input {
95
+ width: 100%;
96
+ min-height: 150px;
97
+ padding: 20px;
98
+ border: 2px solid #E2E8F0;
99
+ border-radius: 8px;
100
+ font-family: var(--font-main);
101
+ font-size: 1.1rem;
102
+ transition: var(--transition);
103
+ resize: vertical;
104
+ outline: none;
105
+ }
106
+
107
+ textarea.analysis-input:focus {
108
+ border-color: var(--clr-accent-dark);
109
+ box-shadow: 0 0 0 4px rgba(29, 174, 255, 0.1);
110
+ }
111
+
112
+ .btn-primary {
113
+ display: inline-block;
114
+ background-color: var(--clr-accent-dark);
115
+ color: white;
116
+ padding: 15px 35px;
117
+ border-radius: 6px;
118
+ font-weight: 600;
119
+ text-decoration: none;
120
+ border: 2px solid var(--clr-accent-dark);
121
+ cursor: pointer;
122
+ transition: var(--transition);
123
+ margin-top: 20px;
124
+ font-size: 1rem;
125
+ }
126
+
127
+ .btn-primary:hover {
128
+ background-color: white;
129
+ color: var(--clr-accent-dark);
130
+ transform: translateY(-3px);
131
+ box-shadow: 0 5px 15px rgba(29, 174, 255, 0.3);
132
+ }
133
+
134
+ /* Result Section */
135
+ .result-display {
136
+ text-align: center;
137
+ padding: 40px 0;
138
+ }
139
+
140
+ .result-badge {
141
+ display: inline-block;
142
+ padding: 10px 25px;
143
+ border-radius: 50px;
144
+ font-weight: 700;
145
+ text-transform: uppercase;
146
+ letter-spacing: 1px;
147
+ margin-top: 20px;
148
+ }
149
+
150
+ .badge-depressive {
151
+ background-color: #FFF5F5;
152
+ color: #C53030;
153
+ border: 1px solid #FEB2B2;
154
+ }
155
+
156
+ .badge-non-depressive {
157
+ background-color: #F0FFF4;
158
+ color: #276749;
159
+ border: 1px solid #9AE6B4;
160
+ }
161
+
162
+ /* Footer & Authorship */
163
+ footer {
164
+ padding: 60px 0;
165
+ text-align: center;
166
+ border-top: 1px solid #E2E8F0;
167
+ margin-top: 60px;
168
+ }
169
+
170
+ .authorship {
171
+ color: var(--clr-slate);
172
+ font-size: 0.9rem;
173
+ }
174
+
175
+ .authorship a {
176
+ color: var(--clr-accent-dark);
177
+ text-decoration: none;
178
+ font-weight: 600;
179
+ }
180
+
181
+ .authorship a:hover {
182
+ text-decoration: underline;
183
+ }
184
+
185
+ .metadata {
186
+ margin-top: 10px;
187
+ font-size: 0.8rem;
188
+ color: var(--clr-light-slate);
189
+ }
190
+
191
+ /* Animations */
192
+ @keyframes fadeIn {
193
+ from {
194
+ opacity: 0;
195
+ transform: translateY(20px);
196
+ }
197
+
198
+ to {
199
+ opacity: 1;
200
+ transform: translateY(0);
201
+ }
202
+ }
203
+
204
+ .animate-fade {
205
+ animation: fadeIn 0.8s ease forwards;
206
+ }
207
+
208
+ /* Brain Icon Animation */
209
+ .brain-trigger {
210
+ color: var(--clr-accent-dark);
211
+ margin-bottom: 20px;
212
+ cursor: pointer;
213
+ transition: all 0.5s ease;
214
+ filter: drop-shadow(0 0 5px rgba(29, 174, 255, 0.3));
215
+ }
216
+
217
+ .brain-trigger:hover {
218
+ animation: brainPulse 1.5s infinite ease-in-out;
219
+ color: #4dc4ff;
220
+ /* Slightly brighter on hover */
221
+ filter: drop-shadow(0 0 15px rgba(29, 174, 255, 0.8));
222
+ }
223
+
224
+ @keyframes brainPulse {
225
+ 0% {
226
+ transform: scale(1);
227
+ }
228
+
229
+ 50% {
230
+ transform: scale(1.15);
231
+ filter: drop-shadow(0 0 25px rgba(29, 174, 255, 0.9));
232
+ }
233
+
234
+ 100% {
235
+ transform: scale(1);
236
+ }
237
+ }
source_code/static/tweet-sound.mp3 ADDED
Binary file (10.4 kB). View file
 
source_code/templates/404.html ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <title>404 - Page Not Found | Tweet Depression Detection</title>
8
+ <!-- Simple Brain Icon as Favicon -->
9
+ <link rel="icon" href="{{url_for('.static', filename='brain.svg')}}" type="image/svg+xml">
10
+
11
+ <!-- Modern Typography & Iconography -->
12
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
13
+ <link rel="stylesheet" href="{{url_for('.static', filename='styles.css')}}">
14
+ <script src="{{url_for('.static', filename='security.js')}}"></script>
15
+
16
+ <style>
17
+ body {
18
+ overflow-x: hidden;
19
+ display: flex;
20
+ flex-direction: column;
21
+ min-height: 100vh;
22
+ }
23
+
24
+ .error-container {
25
+ flex: 1;
26
+ display: flex;
27
+ flex-direction: column;
28
+ justify-content: center;
29
+ align-items: center;
30
+ text-align: center;
31
+ padding: 20px;
32
+ }
33
+
34
+ .error-code {
35
+ font-size: 10rem;
36
+ color: var(--clr-accent-dark);
37
+ font-weight: 700;
38
+ margin: 0;
39
+ line-height: 1;
40
+ position: relative;
41
+ animation: glitch 1s infinite alternate-reverse;
42
+ text-shadow: 2px 2px 0 var(--clr-accent), -2px -2px 0 #ff0055;
43
+ }
44
+
45
+ .error-message {
46
+ font-size: 1.5rem;
47
+ color: var(--clr-slate);
48
+ margin-top: 10px;
49
+ margin-bottom: 40px;
50
+ letter-spacing: 1px;
51
+ }
52
+
53
+ @keyframes glitch {
54
+ 0% {
55
+ transform: skew(0deg);
56
+ }
57
+
58
+ 20% {
59
+ transform: skew(-2deg);
60
+ }
61
+
62
+ 40% {
63
+ transform: skew(2deg);
64
+ }
65
+
66
+ 60% {
67
+ transform: skew(-1deg);
68
+ }
69
+
70
+ 80% {
71
+ transform: skew(3deg);
72
+ }
73
+
74
+ 100% {
75
+ transform: skew(0deg);
76
+ }
77
+ }
78
+
79
+ .broken-icon {
80
+ font-size: 5rem;
81
+ color: var(--clr-light-slate);
82
+ margin-bottom: 20px;
83
+ animation: float 3s ease-in-out infinite;
84
+ }
85
+
86
+ @keyframes float {
87
+ 0% {
88
+ transform: translateY(0px) rotate(0deg);
89
+ }
90
+
91
+ 50% {
92
+ transform: translateY(-15px) rotate(5deg);
93
+ }
94
+
95
+ 100% {
96
+ transform: translateY(0px) rotate(0deg);
97
+ }
98
+ }
99
+ </style>
100
+ </head>
101
+
102
+ <body>
103
+
104
+ <header class="hero-section container animate-fade">
105
+ <i class="fas fa-brain fa-3x" style="color: #1daeff; margin-bottom: 20px;"></i>
106
+ <h1>Tweet Depression Detection</h1>
107
+ </header>
108
+
109
+ <main class="container animate-fade" style="animation-delay: 0.2s;">
110
+ <div class="error-container">
111
+ <i class="fas fa-unlink broken-icon"></i>
112
+ <h1 class="error-code">404</h1>
113
+ <p class="error-message">Oops! This tweet seems to have disappeared.</p>
114
+
115
+ <a href="{{ url_for('index') }}" class="btn-primary">
116
+ <i class="fas fa-home" style="margin-right: 8px;"></i> Return to Home
117
+ </a>
118
+ </div>
119
+ </main>
120
+
121
+ <footer class="container"
122
+ style="padding: 60px 0 40px; text-align: center; border-top: 1px solid rgba(136, 146, 176, 0.1);">
123
+ <p style="color: var(--clr-slate); font-size: 0.95rem; margin-bottom: 12px;">
124
+ Developed by <a href="https://github.com/Amey-Thakur"
125
+ style="color: var(--clr-accent-dark); font-weight: 700;">Amey Thakur</a> & <a
126
+ href="https://github.com/msatmod" style="color: var(--clr-accent-dark); font-weight: 700;">Mega
127
+ Satish</a>
128
+ </p>
129
+ <p style="color: var(--clr-light-slate); font-size: 0.85rem; letter-spacing: 0.5px;">
130
+ © 2022 • MIT License • <a href="https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS"
131
+ target="_blank" style="margin-left: 5px;"><i class="fab fa-github"></i> GitHub</a>
132
+ </p>
133
+ </footer>
134
+
135
+ </body>
136
+
137
+ </html>
source_code/templates/index.html ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <title>Tweet Depression Detection | AMEY & MEGA</title>
8
+ <!-- Simple Brain Icon as Favicon -->
9
+ <link rel="icon" href="{{url_for('.static', filename='brain.svg')}}" type="image/svg+xml">
10
+
11
+ <!-- Modern Typography & Iconography -->
12
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
13
+ <link rel="stylesheet" href="{{url_for('.static', filename='styles.css')}}">
14
+ <link rel="stylesheet" href="{{url_for('.static', filename='overlay.css')}}">
15
+ <script src="{{url_for('.static', filename='security.js')}}"></script>
16
+ </head>
17
+
18
+ <!--
19
+ ==============================================================================
20
+ PROJECT: DEPRESSION-DETECTION-USING-TWEETS
21
+ AUTHORS: AMEY THAKUR & MEGA SATISH
22
+ GITHUB (AMEY): https://github.com/Amey-Thakur
23
+ GITHUB (MEGA): https://github.com/msatmod
24
+ REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
25
+ RELEASE DATE: June 5, 2022
26
+ LICENSE: MIT License
27
+ DESCRIPTION: Main interface for tweet analysis.
28
+ ==============================================================================
29
+ -->
30
+
31
+ <body>
32
+
33
+ <header class="hero-section container animate-fade">
34
+ <i class="fas fa-brain fa-3x brain-trigger" onclick="triggerCinematic()"></i>
35
+ <h1>Tweet Depression Detection</h1>
36
+ <p>Using Machine Learning to predict sentiment in tweets.</p>
37
+ </header>
38
+
39
+ <main class="container animate-fade" style="animation-delay: 0.2s;">
40
+ <section class="analysis-card">
41
+ <h3><i class="fab fa-twitter" style="margin-right: 10px; color: #1daeff;"></i> Try it Out</h3>
42
+ <p style="font-size: 0.9rem; color: #8892B0; margin-bottom: 20px;">Paste a tweet below to see how our model
43
+ classifies its sentiment.</p>
44
+
45
+ <form id="analysisForm" action="{{ url_for('predict')}}" method="POST">
46
+ <div class="input-group">
47
+ <textarea name="tweet" class="analysis-input"
48
+ placeholder="Paste tweet content here for sentiment analysis..." required></textarea>
49
+ </div>
50
+ <div style="text-align: right;">
51
+ <button type="submit" class="btn-primary">
52
+ <i class="fab fa-twitter" style="margin-right: 8px;"></i> Analyze Tweet
53
+ </button>
54
+ </div>
55
+ </form>
56
+
57
+ <audio id="tweetSound" src="{{url_for('.static', filename='tweet-sound.mp3')}}" preload="auto"></audio>
58
+
59
+ <script>
60
+ // Cinematic Interaction Logic
61
+ function triggerCinematic() {
62
+ const overlay = document.getElementById('cinematicOverlay');
63
+ const sound = document.getElementById('tweetSound');
64
+
65
+ overlay.style.display = 'flex';
66
+ // Force reflow
67
+ void overlay.offsetWidth;
68
+ overlay.classList.add('active');
69
+
70
+ setTimeout(() => {
71
+ sound.play().catch(err => console.log("Audio playback failed:", err));
72
+ }, 200);
73
+ }
74
+
75
+ function closeOverlay() {
76
+ const overlay = document.getElementById('cinematicOverlay');
77
+ overlay.classList.remove('active');
78
+ setTimeout(() => {
79
+ overlay.style.display = 'none';
80
+ }, 800);
81
+ }
82
+
83
+ document.getElementById('analysisForm').addEventListener('submit', function (e) {
84
+ const form = this;
85
+ const sound = document.getElementById('tweetSound');
86
+
87
+ e.preventDefault();
88
+ sound.play().catch(err => console.log("Audio playback failed:", err));
89
+
90
+ setTimeout(() => {
91
+ form.submit();
92
+ }, 400); // 400ms delay to let the chirp start
93
+ });
94
+ </script>
95
+ </section>
96
+
97
+ <section class="analysis-card" style="background: rgba(29, 174, 255, 0.05);">
98
+ <h4>How it Works</h4>
99
+ <p style="font-size: 0.95rem;">This project uses an <strong>SVM (Support Vector Machine)</strong> model
100
+ combined with <strong>spaCy</strong> word embeddings to process and classify text. It was developed to
101
+ experiment with modern Machine Learning workflows.</p>
102
+ </section>
103
+ </main>
104
+
105
+ <footer class="container"
106
+ style="padding: 60px 0 40px; text-align: center; border-top: 1px solid rgba(136, 146, 176, 0.1);">
107
+ <p style="color: var(--clr-slate); font-size: 0.95rem; margin-bottom: 12px;">
108
+ Developed by <a href="https://github.com/Amey-Thakur"
109
+ style="color: var(--clr-accent-dark); font-weight: 700;">Amey Thakur</a> & <a
110
+ href="https://github.com/msatmod" style="color: var(--clr-accent-dark); font-weight: 700;">Mega
111
+ Satish</a>
112
+ </p>
113
+ <p style="color: var(--clr-light-slate); font-size: 0.85rem; letter-spacing: 0.5px;">
114
+ © 2022 • MIT License • <a href="https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS"
115
+ target="_blank" style="margin-left: 5px;"><i class="fab fa-github"></i> GitHub</a>
116
+ </p>
117
+ </footer>
118
+
119
+ <!-- Cinematic Overlay (Moved to root to avoid transform conflicts) -->
120
+ <div id="cinematicOverlay" class="cinematic-overlay" onclick="closeOverlay()">
121
+ <div class="overlay-content">
122
+ <!-- Cinematic Icon -->
123
+ <i class="fab fa-twitter overlay-logo"></i>
124
+ <h1 class="overlay-title">Tweet Depression Detection</h1>
125
+ <p class="overlay-author">Developed by</p>
126
+ <div class="overlay-names">
127
+ <span>Amey Thakur</span>
128
+ <span class="separator">&</span>
129
+ <span>Mega Satish</span>
130
+ </div>
131
+ </div>
132
+ </div>
133
+
134
+ </body>
135
+
136
+ </html>
source_code/templates/result.html ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!--
2
+ ==============================================================================
3
+ PROJECT: DEPRESSION-DETECTION-USING-TWEETS
4
+ AUTHORS: AMEY THAKUR & MEGA SATISH
5
+ GITHUB (AMEY): https://github.com/Amey-Thakur
6
+ GITHUB (MEGA): https://github.com/msatmod
7
+ REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
8
+ RELEASE DATE: June 5, 2022
9
+ LICENSE: MIT License
10
+ DESCRIPTION: Result page for the tweet analysis.
11
+ ==============================================================================
12
+ -->
13
+
14
+ <!DOCTYPE html>
15
+ <html lang="en">
16
+
17
+ <head>
18
+ <meta charset="UTF-8">
19
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
20
+ <title>Analysis Result | AMEY & MEGA </title>
21
+ <link rel="icon" href="{{url_for('.static', filename='brain.svg')}}" type="image/svg+xml">
22
+
23
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
24
+ <link rel="stylesheet" href="{{url_for('.static', filename='styles.css')}}">
25
+ <script src="{{url_for('.static', filename='security.js')}}"></script>
26
+ </head>
27
+
28
+ <body>
29
+
30
+ <header class="hero-section container animate-fade">
31
+ <i class="fas fa-chart-line fa-3x" style="color: #1daeff; margin-bottom: 20px;"></i>
32
+ <h1>Analysis Result</h1>
33
+ <p>Here is what our model predicted for the tweet you provided.</p>
34
+ </header>
35
+
36
+ <main class="container animate-fade" style="animation-delay: 0.2s;">
37
+ <section class="analysis-card">
38
+ <h3><i class="fas fa-quote-left" style="color: var(--clr-slate); margin-right: 10px;"></i> The Tweet
39
+ </h3>
40
+ <div
41
+ style="background: #F8F9FA; padding: 25px; border-radius: 8px; margin: 20px 0; border-left: 4px solid var(--clr-slate);">
42
+ <p
43
+ style="font-family: var(--font-main); font-style: italic; font-size: 1.1rem; color: var(--clr-navy);">
44
+ {{ name }}
45
+ </p>
46
+ </div>
47
+ </section>
48
+
49
+ <section class="analysis-card result-display">
50
+ <h3>Our Prediction</h3>
51
+
52
+ {% if prediction == 0 %}
53
+ <div class="result-badge badge-non-depressive">
54
+ <i class="fas fa-check-circle"></i> Non-Depressive
55
+ </div>
56
+ <p style="margin-top: 25px; color: var(--clr-slate);">Our model didn't find any significant signs of
57
+ depression in this text.</p>
58
+ {% elif prediction == 1 %}
59
+ <div class="result-badge badge-depressive">
60
+ <i class="fas fa-exclamation-triangle"></i> Depressive
61
+ </div>
62
+ <p style="margin-top: 25px; color: var(--clr-slate);">Our model identified patterns that are often
63
+ associated with depression in this text.</p>
64
+ {% endif %}
65
+
66
+ <div style="margin-top: 40px; border-top: 1px solid #E2E8F0; padding-top: 30px;">
67
+ <a href="{{ url_for('index') }}" class="btn-primary">
68
+ <i class="fab fa-twitter" style="margin-right: 8px;"></i> Analyze Another Tweet
69
+ </a>
70
+ </div>
71
+ </section>
72
+ </main>
73
+
74
+ <footer class="container"
75
+ style="padding: 60px 0 40px; text-align: center; border-top: 1px solid rgba(136, 146, 176, 0.1);">
76
+ <p style="color: var(--clr-slate); font-size: 0.95rem; margin-bottom: 12px;">
77
+ Developed by <a href="https://github.com/Amey-Thakur"
78
+ style="color: var(--clr-accent-dark); font-weight: 700;">Amey Thakur</a> & <a
79
+ href="https://github.com/msatmod" style="color: var(--clr-accent-dark); font-weight: 700;">Mega
80
+ Satish</a>
81
+ </p>
82
+ <p style="color: var(--clr-light-slate); font-size: 0.85rem; letter-spacing: 0.5px;">
83
+ © 2022 • MIT License • <a href="https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS"
84
+ target="_blank" style="margin-left: 5px;"><i class="fab fa-github"></i> GitHub</a>
85
+ </p>
86
+ </footer>
87
+
88
+ </body>
89
+
90
+ </html>
source_code/test_app.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import requests

def test_app():
    """Smoke-test the /predict endpoint of a locally running Flask app.

    Requires the server to already be serving on 127.0.0.1:5000.
    Prints PASS/FAIL rather than raising, so it can be run standalone.
    """
    url = "http://127.0.0.1:5000/predict"
    data = {"tweet": "I feel very sad and hopeless today"}
    try:
        # Timeout added so the script fails fast instead of hanging forever
        # when the server is unreachable or wedged.
        response = requests.post(url, data=data, timeout=10)
    except requests.RequestException as e:
        print(f"Error connecting to server: {e}")
        return
    if response.status_code == 200:
        print("Successfully connected to the server.")
        # result.html renders the depressive verdict as a badge with the
        # "badge-depressive" CSS class ("Outcome: ..." never appears in the
        # template, so the old check could never pass). "badge-depressive"
        # is not a substring of "badge-non-depressive", so this is unambiguous.
        if "badge-depressive" in response.text:
            print("Prediction test PASSED: Correctly identified depressive sentiment.")
        else:
            print("Prediction test FAILED: Outcome not found in response.")
    else:
        print(f"Server returned status code: {response.status_code}")

if __name__ == "__main__":
    test_app()