DEPRESSION-DETECTION
Browse files- .gitattributes +2 -0
- Dockerfile +2 -2
- README.md +311 -311
- source_code/app.py +59 -0
- source_code/app_utilities.py +65 -0
- source_code/assets/data/external/README.md +3 -0
- source_code/assets/data/processed_data.csv +0 -0
- source_code/assets/data/scrapped/depressive_tweets.csv +0 -0
- source_code/assets/img/ROC_Precision_LR.png +3 -0
- source_code/assets/img/ROC_Precision_SVM.png +3 -0
- source_code/assets/img/app.png +3 -0
- source_code/assets/img/depression.png +3 -0
- source_code/assets/img/logo.jpeg +3 -0
- source_code/assets/img/loss_accuracy_LSTM.png +3 -0
- source_code/assets/img/models_comparison.png +3 -0
- source_code/assets/img/wordcloud_depressive.png +3 -0
- source_code/assets/img/wordcloud_random.png +3 -0
- source_code/assets/models/model_LSTM.pkl +3 -0
- source_code/assets/models/model_LogReg.pkl +3 -0
- source_code/assets/models/model_svm.pkl +3 -0
- source_code/assets/models/model_svm1.pkl +3 -0
- source_code/assets/notebooks/data_cleaning_exploration.ipynb +0 -0
- source_code/assets/notebooks/data_gathering_twint.ipynb +483 -0
- source_code/assets/notebooks/data_gathering_twitter_API.ipynb +0 -0
- source_code/assets/notebooks/modeling.ipynb +0 -0
- source_code/core/clean.py +72 -0
- source_code/core/clean_utilities.py +226 -0
- source_code/core/predict.py +110 -0
- source_code/core/train.py +80 -0
- source_code/core/train_utilities.py +226 -0
- source_code/notebooks/data_cleaning_exploration.py +505 -0
- source_code/notebooks/data_gathering_twint.py +80 -0
- source_code/notebooks/data_gathering_twitter_API.py +388 -0
- source_code/notebooks/modeling.py +378 -0
- source_code/notebooks/old_models.py +637 -0
- source_code/notebooks/testing.py +283 -0
- source_code/requirements.txt +42 -0
- source_code/static/brain.svg +1 -0
- source_code/static/overlay.css +99 -0
- source_code/static/security.js +26 -0
- source_code/static/styles.css +237 -0
- source_code/static/tweet-sound.mp3 +0 -0
- source_code/templates/404.html +137 -0
- source_code/templates/index.html +136 -0
- source_code/templates/result.html +90 -0
- source_code/test_app.py +20 -0
.gitattributes
CHANGED
|
@@ -1,2 +1,4 @@
|
|
| 1 |
Source[[:space:]]Code/assets/models/model_svm.pkl filter=lfs diff=lfs merge=lfs -text
|
| 2 |
Source[[:space:]]Code/assets/models/model_svm1.pkl filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 1 |
Source[[:space:]]Code/assets/models/model_svm.pkl filter=lfs diff=lfs merge=lfs -text
|
| 2 |
Source[[:space:]]Code/assets/models/model_svm1.pkl filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
source_code/assets/models/model_svm.pkl filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
source_code/assets/models/model_svm1.pkl filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
CHANGED
|
@@ -19,7 +19,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
| 19 |
|
| 20 |
# Install dependencies
|
| 21 |
# Note: Path is relative to the repository root where Dockerfile resides
|
| 22 |
-
COPY
|
| 23 |
RUN pip install --upgrade pip
|
| 24 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 25 |
|
|
@@ -27,7 +27,7 @@ RUN pip install --no-cache-dir -r requirements.txt
|
|
| 27 |
RUN python -m spacy download en_core_web_lg
|
| 28 |
|
| 29 |
# Copy project source code
|
| 30 |
-
COPY
|
| 31 |
|
| 32 |
# Hugging Face Spaces requires port 7860
|
| 33 |
EXPOSE 7860
|
|
|
|
| 19 |
|
| 20 |
# Install dependencies
|
| 21 |
# Note: Path is relative to the repository root where Dockerfile resides
|
| 22 |
+
COPY source_code/requirements.txt ./
|
| 23 |
RUN pip install --upgrade pip
|
| 24 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 25 |
|
|
|
|
| 27 |
RUN python -m spacy download en_core_web_lg
|
| 28 |
|
| 29 |
# Copy project source code
|
| 30 |
+
COPY source_code/ ./
|
| 31 |
|
| 32 |
# Hugging Face Spaces requires port 7860
|
| 33 |
EXPOSE 7860
|
README.md
CHANGED
|
@@ -1,312 +1,312 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: Depression Detection Using Tweets
|
| 3 |
-
emoji: 🧠
|
| 4 |
-
colorFrom: blue
|
| 5 |
-
colorTo: indigo
|
| 6 |
-
sdk: docker
|
| 7 |
-
app_file: app.py
|
| 8 |
-
pinned: false
|
| 9 |
-
license: mit
|
| 10 |
-
short_description: Depression Detection in Tweets ML Web App
|
| 11 |
-
---
|
| 12 |
-
|
| 13 |
-
<div align="center">
|
| 14 |
-
|
| 15 |
-
<a name="readme-top"></a>
|
| 16 |
-
# Depression Detection Using Tweets
|
| 17 |
-
|
| 18 |
-
[](LICENSE)
|
| 19 |
-

|
| 20 |
-
[](https://github.com/Amey-Thakur/DEPRESSION_DETECTION_USING_TWEETS)
|
| 21 |
-
[](https://github.com/Amey-Thakur/DEPRESSION_DETECTION_USING_TWEETS)
|
| 22 |
-
|
| 23 |
-
A modern **Python** + **Flask** application designed to analyze tweet sentiment and predict depressive characteristics using a finalized **SVM** model and **spaCy** NLP pipeline.
|
| 24 |
-
|
| 25 |
-
**[Source Code](
|
| 26 |
-
|
| 27 |
-
</div>
|
| 28 |
-
|
| 29 |
-
---
|
| 30 |
-
|
| 31 |
-
<div align="center">
|
| 32 |
-
|
| 33 |
-
[Authors](#authors) · [Overview](#overview) · [Features](#features) · [Structure](#project-structure) · [Results](#results) · [Quick Start](#quick-start) · [Usage Guidelines](#usage-guidelines) · [License](#license) · [About](#about-this-repository) · [Acknowledgments](#acknowledgments)
|
| 34 |
-
|
| 35 |
-
</div>
|
| 36 |
-
|
| 37 |
-
---
|
| 38 |
-
|
| 39 |
-
<!-- AUTHORS -->
|
| 40 |
-
<div align="center">
|
| 41 |
-
|
| 42 |
-
<a name="authors"></a>
|
| 43 |
-
## Authors
|
| 44 |
-
|
| 45 |
-
| <a href="https://github.com/Amey-Thakur"><img src="https://github.com/Amey-Thakur.png" width="150" height="150" alt="Amey Thakur"></a><br>[**Amey Thakur**](https://github.com/Amey-Thakur)<br><br>[](https://orcid.org/0000-0001-5644-1575) | <a href="https://github.com/msatmod"><img src="Mega/Mega.png" width="150" height="150" alt="Mega Satish"></a><br>[**Mega Satish**](https://github.com/msatmod)<br><br>[](https://orcid.org/0000-0002-1844-9557) |
|
| 46 |
-
| :---: | :---: |
|
| 47 |
-
|
| 48 |
-
</div>
|
| 49 |
-
|
| 50 |
-
> [!IMPORTANT]
|
| 51 |
-
> ### 🤝🏻 Special Acknowledgement
|
| 52 |
-
> *Special thanks to **[Mega Satish](https://github.com/msatmod)** for her meaningful contributions, guidance, and support that helped shape this work.*
|
| 53 |
-
|
| 54 |
-
---
|
| 55 |
-
|
| 56 |
-
<!-- OVERVIEW -->
|
| 57 |
-
<a name="overview"></a>
|
| 58 |
-
## Overview
|
| 59 |
-
|
| 60 |
-
**Depression Detection Using Tweets** is a specialized Machine Learning framework designed to translate complex linguistic patterns into empirical psychological insights. This repository prioritizes **high-dimensional feature extraction** and **probabilistic classification** to provide a robust baseline for sentiment analysis within the context of mental health monitoring.
|
| 61 |
-
|
| 62 |
-
* **Linguistic Determinism**: The system utilizes deep NLP preprocessing, including lemmatization and entity normalization, to ensure that the semantic core of a tweet is preserved regardless of slang or stylistic variation.
|
| 63 |
-
* **Vector-Space Inference**: By leveraging **Support Vector Machines (SVM)** and **TF-IDF vectorization**, the model maps textual input into a multi-dimensional hyperplane, enabling precise binary classification of depressive sentiment.
|
| 64 |
-
* **Architectural Efficiency**: The backend is architected for low-latency serving via Flask, ensuring that model inference and result rendering occur in sub-second cycles, critical for interactive user feedback.
|
| 65 |
-
|
| 66 |
-
> [!TIP]
|
| 67 |
-
> **NLP Pipeline Optimization**
|
| 68 |
-
>
|
| 69 |
-
> To maximize classification reliability, the engine employs a **multi-stage linguistic filter**. **Stop-word suppression** and **morphological analysis** strip away structural noise, while the **en_core_web_lg** transformer model contextualizes surviving tokens. This ensures the classifier’s weights are strictly coupled with affective indicators, minimizing the false-positive skew common in generalized sentiment analysis models.
|
| 70 |
-
|
| 71 |
-
---
|
| 72 |
-
|
| 73 |
-
<!-- FEATURES -->
|
| 74 |
-
<a name="features"></a>
|
| 75 |
-
## Features
|
| 76 |
-
|
| 77 |
-
| Feature | Description |
|
| 78 |
-
|---------|-------------|
|
| 79 |
-
| **Core SVM Model** | **High-Dimensional Classification** engine optimized for binary depressive sentiment prediction. |
|
| 80 |
-
| **NLP Pipeline** | Deep linguistic feature extraction powered by the **spaCy transformer model** (`en_core_web_lg`). |
|
| 81 |
-
| **Prediction Hub** | **Real-Time Inference Interface** built with Flask for sub-second classification feedback. |
|
| 82 |
-
| **Security Suite** | Integrated **Browser-Side Integrity** protocols including anti-right-click and anti-select systems. |
|
| 83 |
-
| **Cinematic Surprise** | **Immersive Branding Overlay** featuring animated Twitter iconography and synchronized audio. |
|
| 84 |
-
|
| 85 |
-
> [!NOTE]
|
| 86 |
-
> ### Technical Polish: The Linguistic Singularity
|
| 87 |
-
> We have engineered a **Probabilistic Sentiment Manager** that calibrates model weights across thousands of TF-IDF vectors to simulate human-like linguistic intuition. The visual language focuses on a "Neural Slate" aesthetic, ensuring maximum cognitive focus on the diagnostic outputs without procedural distraction.
|
| 88 |
-
|
| 89 |
-
### Tech Stack
|
| 90 |
-
- **Languages**: Python 3.9+
|
| 91 |
-
- **Logic**: **SVM Classifier** (Scikit-Learn Inference Engine)
|
| 92 |
-
- **Linguistic Data**: **spaCy NLP** (Transformer-based word embeddings)
|
| 93 |
-
- **Web App**: **Flask Framework** (Micro-service architecture for model serving)
|
| 94 |
-
- **UI System**: Premium Modern Aesthetics (Custom CSS / Play Typography)
|
| 95 |
-
- **Deployment**: Standard Python Environment (PIP-managed dependencies)
|
| 96 |
-
|
| 97 |
-
---
|
| 98 |
-
|
| 99 |
-
<!-- PROJECT STRUCTURE -->
|
| 100 |
-
<a name="project-structure"></a>
|
| 101 |
-
## Project Structure
|
| 102 |
-
|
| 103 |
-
```python
|
| 104 |
-
DEPRESSION-DETECTION-USING-TWEETS/
|
| 105 |
-
│
|
| 106 |
-
├── docs/ # Technical Documentation
|
| 107 |
-
│ └── SPECIFICATION.md # Architecture & Design Specification
|
| 108 |
-
│
|
| 109 |
-
├── Mega/ # Archival Attribution Assets
|
| 110 |
-
│ ├── Filly.jpg # Companion (Filly)
|
| 111 |
-
│ └── Mega.png # Author Profile Image (Mega Satish)
|
| 112 |
-
│
|
| 113 |
-
├── screenshots/ # Project Visualization Gallery
|
| 114 |
-
│ ├── 01_landing_page.png # System Hub Initial State
|
| 115 |
-
│ ├── 02_footer_details.png # Brand and Metadata Footer
|
| 116 |
-
│ ├── 03_surprise_cinematic.png # Interactive Animated Sequence
|
| 117 |
-
│ ├── 04_predict_interface.png # Sentiment Analysis Entry Point
|
| 118 |
-
│ ├── 05_analysis_output.png # Model Inference result
|
| 119 |
-
│ └── 06_result_prediction.png # Final Sentiment Output
|
| 120 |
-
│
|
| 121 |
-
├──
|
| 122 |
-
│ ├── assets/ # Serialized Models & Linguistic Data
|
| 123 |
-
│ ├── core/ # ML Pipeline (Clean, Train, Predict)
|
| 124 |
-
│ ├── static/ # Styling, Audio, & Security Scripts
|
| 125 |
-
│ ├── templates/ # HTML Templates (Index, Result, 404)
|
| 126 |
-
│ └── app.py # Flask Application (Entry Point)
|
| 127 |
-
│
|
| 128 |
-
├── .gitattributes # Git configuration
|
| 129 |
-
├── .gitignore # Repository Filters
|
| 130 |
-
├── CITATION.cff # Scholarly Citation Metadata
|
| 131 |
-
├── codemeta.json # Machine-Readable Project Metadata
|
| 132 |
-
├── LICENSE # MIT License Terms
|
| 133 |
-
├── README.md # Comprehensive Scholarly Entrance
|
| 134 |
-
└── SECURITY.md # Security Policy & Protocol
|
| 135 |
-
```
|
| 136 |
-
|
| 137 |
-
---
|
| 138 |
-
|
| 139 |
-
<!-- RESULTS -->
|
| 140 |
-
<a name="results"></a>
|
| 141 |
-
## Results
|
| 142 |
-
|
| 143 |
-
<div align="center">
|
| 144 |
-
<b>Main Landing: System Hub Initialization</b>
|
| 145 |
-
<br>
|
| 146 |
-
<i>Minimalist interface for rapid tweet sentiment analysis.</i>
|
| 147 |
-
<br><br>
|
| 148 |
-
<img src="screenshots/01_landing_page.png" alt="Landing Page" width="90%">
|
| 149 |
-
<br><br><br>
|
| 150 |
-
|
| 151 |
-
<b>Metadata Synthesis: Branding and Footer Detail</b>
|
| 152 |
-
<br>
|
| 153 |
-
<i>Scholarly attribution and project status integration.</i>
|
| 154 |
-
<br><br>
|
| 155 |
-
<img src="screenshots/02_footer_details.png" alt="Footer Details" width="90%">
|
| 156 |
-
<br><br><br>
|
| 157 |
-
|
| 158 |
-
<b>Interactivity: Animated Twitter Sequence</b>
|
| 159 |
-
<br>
|
| 160 |
-
<i>Immersive audiovisual overlay triggered by core branding elements.</i>
|
| 161 |
-
<br><br>
|
| 162 |
-
<img src="screenshots/03_surprise_cinematic.png" alt="Cinematic Surprise" width="90%">
|
| 163 |
-
<br><br><br>
|
| 164 |
-
|
| 165 |
-
<b>Sentiment Entry: Real-time Analysis Interface</b>
|
| 166 |
-
<br>
|
| 167 |
-
<i>Direct manipulation environment for high-latency textual input.</i>
|
| 168 |
-
<br><br>
|
| 169 |
-
<img src="screenshots/04_predict_interface.png" alt="Predict Interface" width="90%">
|
| 170 |
-
<br><br><br>
|
| 171 |
-
|
| 172 |
-
<b>Model Inference: Feature Extraction Output</b>
|
| 173 |
-
<br>
|
| 174 |
-
<i>Deep linguistic analysis and probabilistic score generation.</i>
|
| 175 |
-
<br><br>
|
| 176 |
-
<img src="screenshots/05_analysis_output.png" alt="Analysis Output" width="90%">
|
| 177 |
-
<br><br><br>
|
| 178 |
-
|
| 179 |
-
<b>Statistical Output: Final Sentiment Classification</b>
|
| 180 |
-
<br>
|
| 181 |
-
<i>Categorized classification results with immediate visual feedback.</i>
|
| 182 |
-
<br><br>
|
| 183 |
-
<img src="screenshots/06_result_prediction.png" alt="Result Prediction" width="90%">
|
| 184 |
-
</div>
|
| 185 |
-
|
| 186 |
-
---
|
| 187 |
-
|
| 188 |
-
<!-- QUICK START -->
|
| 189 |
-
<a name="quick-start"></a>
|
| 190 |
-
## Quick Start
|
| 191 |
-
|
| 192 |
-
### 1. Prerequisites
|
| 193 |
-
- **Python 3.11+**: Required for runtime execution. [Download Python](https://www.python.org/downloads/)
|
| 194 |
-
- **Git**: For version control and cloning. [Download Git](https://git-scm.com/downloads)
|
| 195 |
-
|
| 196 |
-
> [!WARNING]
|
| 197 |
-
> **Data Acquisition & Memory Constraints**
|
| 198 |
-
>
|
| 199 |
-
> The linguistic pipeline relies on the **en_core_web_lg** transformer model, which requires an initial download of approximately **800MB**. Ensure a stable network connection during setup. Additionally, loading this model into memory requires at least **2GB of available RAM** to prevent swapping and ensure low-latency inference.
|
| 200 |
-
|
| 201 |
-
### 2. Installation & Setup
|
| 202 |
-
|
| 203 |
-
#### Step 1: Clone the Repository
|
| 204 |
-
Open your terminal and clone the repository:
|
| 205 |
-
```bash
|
| 206 |
-
git clone https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS.git
|
| 207 |
-
cd DEPRESSION-DETECTION-USING-TWEETS
|
| 208 |
-
```
|
| 209 |
-
|
| 210 |
-
#### Step 2: Configure Virtual Environment
|
| 211 |
-
Prepare an isolated environment to manage dependencies:
|
| 212 |
-
|
| 213 |
-
**Windows (Command Prompt / PowerShell):**
|
| 214 |
-
```bash
|
| 215 |
-
python -m venv venv
|
| 216 |
-
venv\Scripts\activate
|
| 217 |
-
```
|
| 218 |
-
|
| 219 |
-
**macOS / Linux (Terminal):**
|
| 220 |
-
```bash
|
| 221 |
-
python3 -m venv venv
|
| 222 |
-
source venv/bin/activate
|
| 223 |
-
```
|
| 224 |
-
|
| 225 |
-
#### Step 3: Install Core Dependencies
|
| 226 |
-
Ensure your environment is active, then install the required libraries:
|
| 227 |
-
```bash
|
| 228 |
-
pip install -r "Source Code/requirements.txt"
|
| 229 |
-
```
|
| 230 |
-
|
| 231 |
-
#### Step 4: Linguistic Model Acquisition
|
| 232 |
-
Download the large-scale linguistic model required for analysis (approx. 800MB):
|
| 233 |
-
```bash
|
| 234 |
-
python -m spacy download en_core_web_lg
|
| 235 |
-
```
|
| 236 |
-
|
| 237 |
-
### 3. Execution
|
| 238 |
-
Launch the sentiment analysis dashboard:
|
| 239 |
-
|
| 240 |
-
```bash
|
| 241 |
-
python "Source Code/app.py"
|
| 242 |
-
```
|
| 243 |
-
|
| 244 |
-
---
|
| 245 |
-
|
| 246 |
-
<!-- USAGE GUIDELINES -->
|
| 247 |
-
<a name="usage-guidelines"></a>
|
| 248 |
-
## Usage Guidelines
|
| 249 |
-
|
| 250 |
-
This repository is openly shared to support learning and knowledge exchange across the academic community.
|
| 251 |
-
|
| 252 |
-
**For Students**
|
| 253 |
-
Use this project as reference material for understanding **Support Vector Machines (SVM)**, **spaCy NLP pipelines**, and **sentiment analysis within the context of mental health monitoring**. The source code is available for study to facilitate self-paced learning and exploration of **high-dimensional feature extraction and model serving via Flask**.
|
| 254 |
-
|
| 255 |
-
**For Educators**
|
| 256 |
-
This project may serve as a practical lab example or supplementary teaching resource for **Data Science**, **Natural Language Processing**, and **Machine Learning** courses. Attribution is appreciated when utilizing content.
|
| 257 |
-
|
| 258 |
-
**For Researchers**
|
| 259 |
-
The documentation and architectural approach may provide insights into **academic project structuring**, **psychological linguistic modeling**, and **algorithmic deployment**.
|
| 260 |
-
|
| 261 |
-
---
|
| 262 |
-
|
| 263 |
-
<!-- LICENSE -->
|
| 264 |
-
<a name="license"></a>
|
| 265 |
-
## License
|
| 266 |
-
|
| 267 |
-
This repository and all its creative and technical assets are made available under the **MIT License**. See the [LICENSE](LICENSE) file for complete terms.
|
| 268 |
-
|
| 269 |
-
> [!NOTE]
|
| 270 |
-
> **Summary**: You are free to share and adapt this content for any purpose, even commercially, as long as you provide appropriate attribution to the original authors.
|
| 271 |
-
|
| 272 |
-
Copyright © 2022 Amey Thakur & Mega Satish
|
| 273 |
-
|
| 274 |
-
---
|
| 275 |
-
|
| 276 |
-
<!-- ABOUT -->
|
| 277 |
-
<a name="about-this-repository"></a>
|
| 278 |
-
## About This Repository
|
| 279 |
-
|
| 280 |
-
**Created & Maintained by**: [Amey Thakur](https://github.com/Amey-Thakur) & [Mega Satish](https://github.com/msatmod)
|
| 281 |
-
|
| 282 |
-
This project features **Depression Detection**, a high-performance sentiment analysis system. It represents a personal exploration into **Python**-based machine learning and interactive web-service architecture.
|
| 283 |
-
|
| 284 |
-
**Connect:** [GitHub](https://github.com/Amey-Thakur) · [LinkedIn](https://www.linkedin.com/in/amey-thakur) · [ORCID](https://orcid.org/0000-0001-5644-1575)
|
| 285 |
-
|
| 286 |
-
### Acknowledgments
|
| 287 |
-
|
| 288 |
-
Grateful acknowledgment to [**Mega Satish**](https://github.com/msatmod) for her exceptional collaboration and scholarly partnership during the development of this machine learning project. Her constant support, technical clarity, and dedication to software quality were instrumental in achieving the system's functional objectives. Learning alongside her was a transformative experience; her thoughtful approach to problem-solving and steady encouragement turned complex requirements into meaningful learning moments. This work reflects the growth and insights gained from our side-by-side academic journey. Thank you, Mega, for everything you shared and taught along the way.
|
| 289 |
-
|
| 290 |
-
Special thanks to the **mentors and peers** whose encouragement, discussions, and support contributed meaningfully to this learning experience.
|
| 291 |
-
|
| 292 |
-
---
|
| 293 |
-
|
| 294 |
-
<div align="center">
|
| 295 |
-
|
| 296 |
-
[↑ Back to Top](#readme-top)
|
| 297 |
-
|
| 298 |
-
[Authors](#authors) · [Overview](#overview) · [Features](#features) · [Structure](#project-structure) · [Results](#results) · [Quick Start](#quick-start) · [Usage Guidelines](#usage-guidelines) · [License](#license) · [About](#about-this-repository) · [Acknowledgments](#acknowledgments)
|
| 299 |
-
|
| 300 |
-
<br>
|
| 301 |
-
|
| 302 |
-
🧠 **[DEPRESSION-DETECTION](https://huggingface.co/spaces/ameythakur/Depression-Detection-Using-Tweets)**
|
| 303 |
-
|
| 304 |
-
---
|
| 305 |
-
|
| 306 |
-
### 🎓 [Computer Engineering Repository](https://github.com/Amey-Thakur/COMPUTER-ENGINEERING)
|
| 307 |
-
|
| 308 |
-
**Computer Engineering (B.E.)**
|
| 309 |
-
|
| 310 |
-
*Semester-wise curriculum, laboratories, projects, and academic notes.*
|
| 311 |
-
|
| 312 |
</div>
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Depression Detection Using Tweets
|
| 3 |
+
emoji: 🧠
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_file: app.py
|
| 8 |
+
pinned: false
|
| 9 |
+
license: mit
|
| 10 |
+
short_description: Depression Detection in Tweets ML Web App
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
<div align="center">
|
| 14 |
+
|
| 15 |
+
<a name="readme-top"></a>
|
| 16 |
+
# Depression Detection Using Tweets
|
| 17 |
+
|
| 18 |
+
[](LICENSE)
|
| 19 |
+

|
| 20 |
+
[](https://github.com/Amey-Thakur/DEPRESSION_DETECTION_USING_TWEETS)
|
| 21 |
+
[](https://github.com/Amey-Thakur/DEPRESSION_DETECTION_USING_TWEETS)
|
| 22 |
+
|
| 23 |
+
A modern **Python** + **Flask** application designed to analyze tweet sentiment and predict depressive characteristics using a finalized **SVM** model and **spaCy** NLP pipeline.
|
| 24 |
+
|
| 25 |
+
**[Source Code](source_code/)** · **[Technical Specification](docs/SPECIFICATION.md)** · **[Live Demo](https://huggingface.co/spaces/ameythakur/Depression-Detection-Using-Tweets)**
|
| 26 |
+
|
| 27 |
+
</div>
|
| 28 |
+
|
| 29 |
+
---
|
| 30 |
+
|
| 31 |
+
<div align="center">
|
| 32 |
+
|
| 33 |
+
[Authors](#authors) · [Overview](#overview) · [Features](#features) · [Structure](#project-structure) · [Results](#results) · [Quick Start](#quick-start) · [Usage Guidelines](#usage-guidelines) · [License](#license) · [About](#about-this-repository) · [Acknowledgments](#acknowledgments)
|
| 34 |
+
|
| 35 |
+
</div>
|
| 36 |
+
|
| 37 |
+
---
|
| 38 |
+
|
| 39 |
+
<!-- AUTHORS -->
|
| 40 |
+
<div align="center">
|
| 41 |
+
|
| 42 |
+
<a name="authors"></a>
|
| 43 |
+
## Authors
|
| 44 |
+
|
| 45 |
+
| <a href="https://github.com/Amey-Thakur"><img src="https://github.com/Amey-Thakur.png" width="150" height="150" alt="Amey Thakur"></a><br>[**Amey Thakur**](https://github.com/Amey-Thakur)<br><br>[](https://orcid.org/0000-0001-5644-1575) | <a href="https://github.com/msatmod"><img src="Mega/Mega.png" width="150" height="150" alt="Mega Satish"></a><br>[**Mega Satish**](https://github.com/msatmod)<br><br>[](https://orcid.org/0000-0002-1844-9557) |
|
| 46 |
+
| :---: | :---: |
|
| 47 |
+
|
| 48 |
+
</div>
|
| 49 |
+
|
| 50 |
+
> [!IMPORTANT]
|
| 51 |
+
> ### 🤝🏻 Special Acknowledgement
|
| 52 |
+
> *Special thanks to **[Mega Satish](https://github.com/msatmod)** for her meaningful contributions, guidance, and support that helped shape this work.*
|
| 53 |
+
|
| 54 |
+
---
|
| 55 |
+
|
| 56 |
+
<!-- OVERVIEW -->
|
| 57 |
+
<a name="overview"></a>
|
| 58 |
+
## Overview
|
| 59 |
+
|
| 60 |
+
**Depression Detection Using Tweets** is a specialized Machine Learning framework designed to translate complex linguistic patterns into empirical psychological insights. This repository prioritizes **high-dimensional feature extraction** and **probabilistic classification** to provide a robust baseline for sentiment analysis within the context of mental health monitoring.
|
| 61 |
+
|
| 62 |
+
* **Linguistic Determinism**: The system utilizes deep NLP preprocessing, including lemmatization and entity normalization, to ensure that the semantic core of a tweet is preserved regardless of slang or stylistic variation.
|
| 63 |
+
* **Vector-Space Inference**: By leveraging **Support Vector Machines (SVM)** and **TF-IDF vectorization**, the model maps textual input into a multi-dimensional hyperplane, enabling precise binary classification of depressive sentiment.
|
| 64 |
+
* **Architectural Efficiency**: The backend is architected for low-latency serving via Flask, ensuring that model inference and result rendering occur in sub-second cycles, critical for interactive user feedback.
|
| 65 |
+
|
| 66 |
+
> [!TIP]
|
| 67 |
+
> **NLP Pipeline Optimization**
|
| 68 |
+
>
|
| 69 |
+
> To maximize classification reliability, the engine employs a **multi-stage linguistic filter**. **Stop-word suppression** and **morphological analysis** strip away structural noise, while the **en_core_web_lg** transformer model contextualizes surviving tokens. This ensures the classifier’s weights are strictly coupled with affective indicators, minimizing the false-positive skew common in generalized sentiment analysis models.
|
| 70 |
+
|
| 71 |
+
---
|
| 72 |
+
|
| 73 |
+
<!-- FEATURES -->
|
| 74 |
+
<a name="features"></a>
|
| 75 |
+
## Features
|
| 76 |
+
|
| 77 |
+
| Feature | Description |
|
| 78 |
+
|---------|-------------|
|
| 79 |
+
| **Core SVM Model** | **High-Dimensional Classification** engine optimized for binary depressive sentiment prediction. |
|
| 80 |
+
| **NLP Pipeline** | Deep linguistic feature extraction powered by the **spaCy transformer model** (`en_core_web_lg`). |
|
| 81 |
+
| **Prediction Hub** | **Real-Time Inference Interface** built with Flask for sub-second classification feedback. |
|
| 82 |
+
| **Security Suite** | Integrated **Browser-Side Integrity** protocols including anti-right-click and anti-select systems. |
|
| 83 |
+
| **Cinematic Surprise** | **Immersive Branding Overlay** featuring animated Twitter iconography and synchronized audio. |
|
| 84 |
+
|
| 85 |
+
> [!NOTE]
|
| 86 |
+
> ### Technical Polish: The Linguistic Singularity
|
| 87 |
+
> We have engineered a **Probabilistic Sentiment Manager** that calibrates model weights across thousands of TF-IDF vectors to simulate human-like linguistic intuition. The visual language focuses on a "Neural Slate" aesthetic, ensuring maximum cognitive focus on the diagnostic outputs without procedural distraction.
|
| 88 |
+
|
| 89 |
+
### Tech Stack
|
| 90 |
+
- **Languages**: Python 3.9+
|
| 91 |
+
- **Logic**: **SVM Classifier** (Scikit-Learn Inference Engine)
|
| 92 |
+
- **Linguistic Data**: **spaCy NLP** (Transformer-based word embeddings)
|
| 93 |
+
- **Web App**: **Flask Framework** (Micro-service architecture for model serving)
|
| 94 |
+
- **UI System**: Premium Modern Aesthetics (Custom CSS / Play Typography)
|
| 95 |
+
- **Deployment**: Standard Python Environment (PIP-managed dependencies)
|
| 96 |
+
|
| 97 |
+
---
|
| 98 |
+
|
| 99 |
+
<!-- PROJECT STRUCTURE -->
|
| 100 |
+
<a name="project-structure"></a>
|
| 101 |
+
## Project Structure
|
| 102 |
+
|
| 103 |
+
```python
|
| 104 |
+
DEPRESSION-DETECTION-USING-TWEETS/
|
| 105 |
+
│
|
| 106 |
+
├── docs/ # Technical Documentation
|
| 107 |
+
│ └── SPECIFICATION.md # Architecture & Design Specification
|
| 108 |
+
│
|
| 109 |
+
├── Mega/ # Archival Attribution Assets
|
| 110 |
+
│ ├── Filly.jpg # Companion (Filly)
|
| 111 |
+
│ └── Mega.png # Author Profile Image (Mega Satish)
|
| 112 |
+
│
|
| 113 |
+
├── screenshots/ # Project Visualization Gallery
|
| 114 |
+
│ ├── 01_landing_page.png # System Hub Initial State
|
| 115 |
+
│ ├── 02_footer_details.png # Brand and Metadata Footer
|
| 116 |
+
│ ├── 03_surprise_cinematic.png # Interactive Animated Sequence
|
| 117 |
+
│ ├── 04_predict_interface.png # Sentiment Analysis Entry Point
|
| 118 |
+
│   ├── 05_analysis_output.png      # Model Inference Result
|
| 119 |
+
│ └── 06_result_prediction.png # Final Sentiment Output
|
| 120 |
+
│
|
| 121 |
+
├── source_code/ # Primary Application Layer
|
| 122 |
+
│ ├── assets/ # Serialized Models & Linguistic Data
|
| 123 |
+
│ ├── core/ # ML Pipeline (Clean, Train, Predict)
|
| 124 |
+
│ ├── static/ # Styling, Audio, & Security Scripts
|
| 125 |
+
│ ├── templates/ # HTML Templates (Index, Result, 404)
|
| 126 |
+
│ └── app.py # Flask Application (Entry Point)
|
| 127 |
+
│
|
| 128 |
+
├── .gitattributes # Git configuration
|
| 129 |
+
├── .gitignore # Repository Filters
|
| 130 |
+
├── CITATION.cff # Scholarly Citation Metadata
|
| 131 |
+
├── codemeta.json # Machine-Readable Project Metadata
|
| 132 |
+
├── LICENSE # MIT License Terms
|
| 133 |
+
├── README.md # Comprehensive Scholarly Entrance
|
| 134 |
+
└── SECURITY.md # Security Policy & Protocol
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
---
|
| 138 |
+
|
| 139 |
+
<!-- RESULTS -->
|
| 140 |
+
<a name="results"></a>
|
| 141 |
+
## Results
|
| 142 |
+
|
| 143 |
+
<div align="center">
|
| 144 |
+
<b>Main Landing: System Hub Initialization</b>
|
| 145 |
+
<br>
|
| 146 |
+
<i>Minimalist interface for rapid tweet sentiment analysis.</i>
|
| 147 |
+
<br><br>
|
| 148 |
+
<img src="screenshots/01_landing_page.png" alt="Landing Page" width="90%">
|
| 149 |
+
<br><br><br>
|
| 150 |
+
|
| 151 |
+
<b>Metadata Synthesis: Branding and Footer Detail</b>
|
| 152 |
+
<br>
|
| 153 |
+
<i>Scholarly attribution and project status integration.</i>
|
| 154 |
+
<br><br>
|
| 155 |
+
<img src="screenshots/02_footer_details.png" alt="Footer Details" width="90%">
|
| 156 |
+
<br><br><br>
|
| 157 |
+
|
| 158 |
+
<b>Interactivity: Animated Twitter Sequence</b>
|
| 159 |
+
<br>
|
| 160 |
+
<i>Immersive audiovisual overlay triggered by core branding elements.</i>
|
| 161 |
+
<br><br>
|
| 162 |
+
<img src="screenshots/03_surprise_cinematic.png" alt="Cinematic Surprise" width="90%">
|
| 163 |
+
<br><br><br>
|
| 164 |
+
|
| 165 |
+
<b>Sentiment Entry: Real-time Analysis Interface</b>
|
| 166 |
+
<br>
|
| 167 |
+
<i>Direct manipulation environment for low-latency textual input.</i>
|
| 168 |
+
<br><br>
|
| 169 |
+
<img src="screenshots/04_predict_interface.png" alt="Predict Interface" width="90%">
|
| 170 |
+
<br><br><br>
|
| 171 |
+
|
| 172 |
+
<b>Model Inference: Feature Extraction Output</b>
|
| 173 |
+
<br>
|
| 174 |
+
<i>Deep linguistic analysis and probabilistic score generation.</i>
|
| 175 |
+
<br><br>
|
| 176 |
+
<img src="screenshots/05_analysis_output.png" alt="Analysis Output" width="90%">
|
| 177 |
+
<br><br><br>
|
| 178 |
+
|
| 179 |
+
<b>Statistical Output: Final Sentiment Classification</b>
|
| 180 |
+
<br>
|
| 181 |
+
<i>Categorized classification results with immediate visual feedback.</i>
|
| 182 |
+
<br><br>
|
| 183 |
+
<img src="screenshots/06_result_prediction.png" alt="Result Prediction" width="90%">
|
| 184 |
+
</div>
|
| 185 |
+
|
| 186 |
+
---
|
| 187 |
+
|
| 188 |
+
<!-- QUICK START -->
|
| 189 |
+
<a name="quick-start"></a>
|
| 190 |
+
## Quick Start
|
| 191 |
+
|
| 192 |
+
### 1. Prerequisites
|
| 193 |
+
- **Python 3.11+**: Required for runtime execution. [Download Python](https://www.python.org/downloads/)
|
| 194 |
+
- **Git**: For version control and cloning. [Download Git](https://git-scm.com/downloads)
|
| 195 |
+
|
| 196 |
+
> [!WARNING]
|
| 197 |
+
> **Data Acquisition & Memory Constraints**
|
| 198 |
+
>
|
| 199 |
+
> The linguistic pipeline relies on the **en_core_web_lg** transformer model, which requires an initial download of approximately **800MB**. Ensure a stable network connection during setup. Additionally, loading this model into memory requires at least **2GB of available RAM** to prevent swapping and ensure low-latency inference.
|
| 200 |
+
|
| 201 |
+
### 2. Installation & Setup
|
| 202 |
+
|
| 203 |
+
#### Step 1: Clone the Repository
|
| 204 |
+
Open your terminal and clone the repository:
|
| 205 |
+
```bash
|
| 206 |
+
git clone https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS.git
|
| 207 |
+
cd DEPRESSION-DETECTION-USING-TWEETS
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
#### Step 2: Configure Virtual Environment
|
| 211 |
+
Prepare an isolated environment to manage dependencies:
|
| 212 |
+
|
| 213 |
+
**Windows (Command Prompt / PowerShell):**
|
| 214 |
+
```bash
|
| 215 |
+
python -m venv venv
|
| 216 |
+
venv\Scripts\activate
|
| 217 |
+
```
|
| 218 |
+
|
| 219 |
+
**macOS / Linux (Terminal):**
|
| 220 |
+
```bash
|
| 221 |
+
python3 -m venv venv
|
| 222 |
+
source venv/bin/activate
|
| 223 |
+
```
|
| 224 |
+
|
| 225 |
+
#### Step 3: Install Core Dependencies
|
| 226 |
+
Ensure your environment is active, then install the required libraries:
|
| 227 |
+
```bash
|
| 228 |
+
pip install -r "Source Code/requirements.txt"
|
| 229 |
+
```
|
| 230 |
+
|
| 231 |
+
#### Step 4: Linguistic Model Acquisition
|
| 232 |
+
Download the large-scale linguistic model required for analysis (approx. 800MB):
|
| 233 |
+
```bash
|
| 234 |
+
python -m spacy download en_core_web_lg
|
| 235 |
+
```
|
| 236 |
+
|
| 237 |
+
### 3. Execution
|
| 238 |
+
Launch the sentiment analysis dashboard:
|
| 239 |
+
|
| 240 |
+
```bash
|
| 241 |
+
python "Source Code/app.py"
|
| 242 |
+
```
|
| 243 |
+
|
| 244 |
+
---
|
| 245 |
+
|
| 246 |
+
<!-- USAGE GUIDELINES -->
|
| 247 |
+
<a name="usage-guidelines"></a>
|
| 248 |
+
## Usage Guidelines
|
| 249 |
+
|
| 250 |
+
This repository is openly shared to support learning and knowledge exchange across the academic community.
|
| 251 |
+
|
| 252 |
+
**For Students**
|
| 253 |
+
Use this project as reference material for understanding **Support Vector Machines (SVM)**, **spaCy NLP pipelines**, and **sentiment analysis within the context of mental health monitoring**. The source code is available for study to facilitate self-paced learning and exploration of **high-dimensional feature extraction and model serving via Flask**.
|
| 254 |
+
|
| 255 |
+
**For Educators**
|
| 256 |
+
This project may serve as a practical lab example or supplementary teaching resource for **Data Science**, **Natural Language Processing**, and **Machine Learning** courses. Attribution is appreciated when utilizing content.
|
| 257 |
+
|
| 258 |
+
**For Researchers**
|
| 259 |
+
The documentation and architectural approach may provide insights into **academic project structuring**, **psychological linguistic modeling**, and **algorithmic deployment**.
|
| 260 |
+
|
| 261 |
+
---
|
| 262 |
+
|
| 263 |
+
<!-- LICENSE -->
|
| 264 |
+
<a name="license"></a>
|
| 265 |
+
## License
|
| 266 |
+
|
| 267 |
+
This repository and all its creative and technical assets are made available under the **MIT License**. See the [LICENSE](LICENSE) file for complete terms.
|
| 268 |
+
|
| 269 |
+
> [!NOTE]
|
| 270 |
+
> **Summary**: You are free to share and adapt this content for any purpose, even commercially, as long as you provide appropriate attribution to the original authors.
|
| 271 |
+
|
| 272 |
+
Copyright © 2022 Amey Thakur & Mega Satish
|
| 273 |
+
|
| 274 |
+
---
|
| 275 |
+
|
| 276 |
+
<!-- ABOUT -->
|
| 277 |
+
<a name="about-this-repository"></a>
|
| 278 |
+
## About This Repository
|
| 279 |
+
|
| 280 |
+
**Created & Maintained by**: [Amey Thakur](https://github.com/Amey-Thakur) & [Mega Satish](https://github.com/msatmod)
|
| 281 |
+
|
| 282 |
+
This project features **Depression Detection**, a high-performance sentiment analysis system. It represents a personal exploration into **Python**-based machine learning and interactive web-service architecture.
|
| 283 |
+
|
| 284 |
+
**Connect:** [GitHub](https://github.com/Amey-Thakur) · [LinkedIn](https://www.linkedin.com/in/amey-thakur) · [ORCID](https://orcid.org/0000-0001-5644-1575)
|
| 285 |
+
|
| 286 |
+
### Acknowledgments
|
| 287 |
+
|
| 288 |
+
Grateful acknowledgment to [**Mega Satish**](https://github.com/msatmod) for her exceptional collaboration and scholarly partnership during the development of this machine learning project. Her constant support, technical clarity, and dedication to software quality were instrumental in achieving the system's functional objectives. Learning alongside her was a transformative experience; her thoughtful approach to problem-solving and steady encouragement turned complex requirements into meaningful learning moments. This work reflects the growth and insights gained from our side-by-side academic journey. Thank you, Mega, for everything you shared and taught along the way.
|
| 289 |
+
|
| 290 |
+
Special thanks to the **mentors and peers** whose encouragement, discussions, and support contributed meaningfully to this learning experience.
|
| 291 |
+
|
| 292 |
+
---
|
| 293 |
+
|
| 294 |
+
<div align="center">
|
| 295 |
+
|
| 296 |
+
[↑ Back to Top](#readme-top)
|
| 297 |
+
|
| 298 |
+
[Authors](#authors) · [Overview](#overview) · [Features](#features) · [Structure](#project-structure) · [Results](#results) · [Quick Start](#quick-start) · [Usage Guidelines](#usage-guidelines) · [License](#license) · [About](#about-this-repository) · [Acknowledgments](#acknowledgments)
|
| 299 |
+
|
| 300 |
+
<br>
|
| 301 |
+
|
| 302 |
+
🧠 **[DEPRESSION-DETECTION](https://huggingface.co/spaces/ameythakur/Depression-Detection-Using-Tweets)**
|
| 303 |
+
|
| 304 |
+
---
|
| 305 |
+
|
| 306 |
+
### 🎓 [Computer Engineering Repository](https://github.com/Amey-Thakur/COMPUTER-ENGINEERING)
|
| 307 |
+
|
| 308 |
+
**Computer Engineering (B.E.)**
|
| 309 |
+
|
| 310 |
+
*Semester-wise curriculum, laboratories, projects, and academic notes.*
|
| 311 |
+
|
| 312 |
</div>
|
source_code/app.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==============================================================================
|
| 2 |
+
# PROJECT: DEPRESSION-DETECTION-USING-TWEETS
|
| 3 |
+
# AUTHORS: AMEY THAKUR & MEGA SATISH
|
| 4 |
+
# GITHUB (AMEY): https://github.com/Amey-Thakur
|
| 5 |
+
# GITHUB (MEGA): https://github.com/msatmod
|
| 6 |
+
# REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
|
| 7 |
+
# RELEASE DATE: June 5, 2022
|
| 8 |
+
# LICENSE: MIT License
|
| 9 |
+
# DESCRIPTION: Flask application entry point for the tweet analysis project.
|
| 10 |
+
# ==============================================================================
|
| 11 |
+
|
| 12 |
+
#!/usr/bin/env python3
|
| 13 |
+
|
| 14 |
+
import pickle
|
| 15 |
+
from flask import Flask, request, render_template
|
| 16 |
+
from flask_bootstrap import Bootstrap
|
| 17 |
+
import app_utilities
|
| 18 |
+
|
| 19 |
+
# Initialize the Flask application
|
| 20 |
+
# Flask-Bootstrap is utilized for enhanced UI styling consistency
|
| 21 |
+
app = Flask(__name__)
|
| 22 |
+
Bootstrap(app)
|
| 23 |
+
|
| 24 |
+
@app.route('/')
|
| 25 |
+
def index():
|
| 26 |
+
"""Renders the landing page for tweet input."""
|
| 27 |
+
return render_template('index.html')
|
| 28 |
+
|
| 29 |
+
@app.route('/predict', methods=['POST'])
def predict():
    """
    Handle the tweet-submission form and render the classification result.

    The route is registered for POST only, so the former
    ``if request.method == 'POST'`` guard was dead code — and its implicit
    ``return None`` on the (unreachable) else-path would have surfaced as a
    500 error. The handler now unconditionally returns a response.

    Returns:
        Rendered ``result.html`` with the model's prediction and the
        original tweet text.
    """
    # Raw tweet text submitted via the web form (raises 400 if missing,
    # which is Flask's default for an absent required form field).
    tweet = request.form["tweet"]

    # NOTE(review): the pipeline stringifies the one-element list, so the
    # classifier actually sees "['<tweet>']" including brackets and quotes.
    # This matches how the deployed model has been exercised, so it is
    # preserved here; confirm against the training-time preprocessing
    # before changing it.
    input_data = [tweet]

    # Backend SVM classifier over spaCy word embeddings (see app_utilities).
    my_prediction = app_utilities.tweet_prediction(str(input_data))

    return render_template("result.html", prediction=my_prediction, name=tweet)
|
| 47 |
+
|
| 48 |
+
@app.errorhandler(404)
def page_not_found(e):
    """Render the custom 404 page for unknown routes.

    Args:
        e: The original ``NotFound`` exception raised by Flask (unused).

    Returns:
        The rendered 404 template together with an explicit 404 status code.
    """
    response_body = render_template('404.html')
    return response_body, 404
|
| 55 |
+
|
| 56 |
+
# Entry point for the Flask development server.
if __name__ == '__main__':
    # Hugging Face Spaces requires the app to listen on port 7860, bound to
    # all interfaces so the container's reverse proxy can reach it.
    listen_host = '0.0.0.0'
    listen_port = 7860
    app.run(host=listen_host, port=listen_port)
|
source_code/app_utilities.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==============================================================================
|
| 2 |
+
# PROJECT: DEPRESSION-DETECTION-USING-TWEETS
|
| 3 |
+
# AUTHORS: AMEY THAKUR & MEGA SATISH
|
| 4 |
+
# GITHUB (AMEY): https://github.com/Amey-Thakur
|
| 5 |
+
# GITHUB (MEGA): https://github.com/msatmod
|
| 6 |
+
# REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
|
| 7 |
+
# RELEASE DATE: June 5, 2022
|
| 8 |
+
# LICENSE: MIT License
|
| 9 |
+
# DESCRIPTION: Utility module for tweet analysis predictions.
|
| 10 |
+
# ==============================================================================
|
| 11 |
+
|
| 12 |
+
import sys
|
| 13 |
+
import pickle
|
| 14 |
+
import warnings
|
| 15 |
+
import numpy as np
|
| 16 |
+
import pandas as pd
|
| 17 |
+
import spacy
|
| 18 |
+
import en_core_web_lg
|
| 19 |
+
# Configure sys.path to permit localized module discovery within the core directory
|
| 20 |
+
sys.path.append('./core')
|
| 21 |
+
|
| 22 |
+
import clean_utilities as CU
|
| 23 |
+
|
| 24 |
+
# Suppression of non-critical runtime warnings to maintain a clean console log
|
| 25 |
+
warnings.filterwarnings("ignore")
|
| 26 |
+
|
| 27 |
+
def tweet_prediction(tweet: str) -> int:
    """
    Classify a tweet as depressive (1) or non-depressive (0).

    Pipeline:
        1. Clean the raw text with the shared cleaning utilities.
        2. Embed the cleaned text as the mean of its spaCy token vectors
           (a 300-dimensional centroid embedding).
        3. Score the embedding with the pre-trained SVM classifier.

    Both the spaCy pipeline (~800MB) and the pickled SVM (~24MB) are loaded
    lazily on first use and cached on the function object, instead of being
    reloaded on every call — the previous per-call reload dominated request
    latency and memory churn.

    Args:
        tweet (str): The tweet text from the user.

    Returns:
        int: 1 for Depressive, 0 for Non-depressive.
    """
    # Step 1: clean the text.
    cleaned_input = [CU.tweets_cleaner(tweet)]

    # Step 2: load the spaCy language model once and reuse it across calls.
    if not hasattr(tweet_prediction, "_nlp"):
        tweet_prediction._nlp = en_core_web_lg.load()
    nlp_engine = tweet_prediction._nlp

    # Step 3: centroid word embedding — mean of all token vectors.
    # The `* np.ones((300))` broadcast is intentional: when the cleaned text
    # has no tokens, `.mean(axis=0)` of an empty array is a scalar NaN, and
    # the broadcast still yields a (300,) row so `classifier.predict` gets a
    # consistently-shaped input.
    semantic_vectors = np.array([
        np.array([token.vector for token in nlp_engine(s)]).mean(axis=0) * np.ones((300))
        for s in cleaned_input
    ])

    # Step 4: load the serialized SVM artifact once and cache it as well.
    # The SVM was selected for its robust performance in high-dimensional
    # text classification.
    if not hasattr(tweet_prediction, "_classifier"):
        model_path = "./assets/models/model_svm1.pkl"
        with open(model_path, 'rb') as model_file:
            tweet_prediction._classifier = pickle.load(model_file)

    # Step 5: binary classification of the single input row.
    prediction_result = tweet_prediction._classifier.predict(semantic_vectors)
    return int(prediction_result[0])
|
source_code/assets/data/external/README.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## External dataset:
|
| 2 |
+
|
| 3 |
+
We need two types of datasets: one containing tweets with depressive characteristics, obtained via the Twitter API, and another containing random tweets, available in one of the [Kaggle datasets](https://www.kaggle.com/ywang311/twitter-sentiment/data).
|
source_code/assets/data/processed_data.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
source_code/assets/data/scrapped/depressive_tweets.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
source_code/assets/img/ROC_Precision_LR.png
ADDED
|
Git LFS Details
|
source_code/assets/img/ROC_Precision_SVM.png
ADDED
|
Git LFS Details
|
source_code/assets/img/app.png
ADDED
|
Git LFS Details
|
source_code/assets/img/depression.png
ADDED
|
Git LFS Details
|
source_code/assets/img/logo.jpeg
ADDED
|
Git LFS Details
|
source_code/assets/img/loss_accuracy_LSTM.png
ADDED
|
Git LFS Details
|
source_code/assets/img/models_comparison.png
ADDED
|
Git LFS Details
|
source_code/assets/img/wordcloud_depressive.png
ADDED
|
Git LFS Details
|
source_code/assets/img/wordcloud_random.png
ADDED
|
Git LFS Details
|
source_code/assets/models/model_LSTM.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b53fb9941244f6f5ad410246b5606c222763d55462b66b0be08d1845c3dd8574
|
| 3 |
+
size 81
|
source_code/assets/models/model_LogReg.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bc7de925c22a478f8168b698570b9775b28613847b0dfa998fe972a7c9273a0e
|
| 3 |
+
size 197693
|
source_code/assets/models/model_svm.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:db18cfbfd954728d2091154eda313a9c1ffb0f4a1778e878985c055181b17170
|
| 3 |
+
size 24450690
|
source_code/assets/models/model_svm1.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:84c846534b54571c35b0d34bc73d948a0c230e9047ce94df3796750c14e18351
|
| 3 |
+
size 24450707
|
source_code/assets/notebooks/data_cleaning_exploration.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
source_code/assets/notebooks/data_gathering_twint.ipynb
ADDED
|
@@ -0,0 +1,483 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"nbformat": 4,
|
| 3 |
+
"nbformat_minor": 0,
|
| 4 |
+
"metadata": {
|
| 5 |
+
"colab": {
|
| 6 |
+
"name": "Data_Gathering_Twint.ipynb",
|
| 7 |
+
"provenance": [],
|
| 8 |
+
"collapsed_sections": []
|
| 9 |
+
},
|
| 10 |
+
"kernelspec": {
|
| 11 |
+
"name": "python3",
|
| 12 |
+
"display_name": "Python 3"
|
| 13 |
+
},
|
| 14 |
+
"language_info": {
|
| 15 |
+
"name": "python"
|
| 16 |
+
}
|
| 17 |
+
},
|
| 18 |
+
"cells": [
|
| 19 |
+
{
|
| 20 |
+
"cell_type": "code",
|
| 21 |
+
"metadata": {
|
| 22 |
+
"colab": {
|
| 23 |
+
"base_uri": "https://localhost:8080/"
|
| 24 |
+
},
|
| 25 |
+
"id": "QgxmLN_lyiCS",
|
| 26 |
+
"outputId": "58d94201-559a-45fc-8e85-3dc7b73968fd"
|
| 27 |
+
},
|
| 28 |
+
"source": [
|
| 29 |
+
"from google.colab import drive\n",
|
| 30 |
+
"drive.mount('/content/drive')"
|
| 31 |
+
],
|
| 32 |
+
"execution_count": null,
|
| 33 |
+
"outputs": [
|
| 34 |
+
{
|
| 35 |
+
"output_type": "stream",
|
| 36 |
+
"name": "stdout",
|
| 37 |
+
"text": [
|
| 38 |
+
"Mounted at /content/drive\n"
|
| 39 |
+
]
|
| 40 |
+
}
|
| 41 |
+
]
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"cell_type": "code",
|
| 45 |
+
"metadata": {
|
| 46 |
+
"colab": {
|
| 47 |
+
"base_uri": "https://localhost:8080/"
|
| 48 |
+
},
|
| 49 |
+
"id": "-nI0XtuNwnmQ",
|
| 50 |
+
"outputId": "66c8ac16-109f-4ba6-e556-574091755cfc"
|
| 51 |
+
},
|
| 52 |
+
"source": [
|
| 53 |
+
"!git clone https://github.com/twintproject/twint.git"
|
| 54 |
+
],
|
| 55 |
+
"execution_count": null,
|
| 56 |
+
"outputs": [
|
| 57 |
+
{
|
| 58 |
+
"output_type": "stream",
|
| 59 |
+
"name": "stdout",
|
| 60 |
+
"text": [
|
| 61 |
+
"Cloning into 'twint'...\n",
|
| 62 |
+
"remote: Enumerating objects: 4457, done.\u001b[K\n",
|
| 63 |
+
"remote: Counting objects: 100% (4/4), done.\u001b[K\n",
|
| 64 |
+
"remote: Compressing objects: 100% (4/4), done.\u001b[K\n",
|
| 65 |
+
"remote: Total 4457 (delta 0), reused 2 (delta 0), pack-reused 4453\u001b[K\n",
|
| 66 |
+
"Receiving objects: 100% (4457/4457), 4.47 MiB | 13.40 MiB/s, done.\n",
|
| 67 |
+
"Resolving deltas: 100% (2634/2634), done.\n"
|
| 68 |
+
]
|
| 69 |
+
}
|
| 70 |
+
]
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"cell_type": "code",
|
| 74 |
+
"metadata": {
|
| 75 |
+
"id": "DcLRNsvGya2i"
|
| 76 |
+
},
|
| 77 |
+
"source": [
|
| 78 |
+
"import os\n",
|
| 79 |
+
"os.chdir(\"/content/twint\")"
|
| 80 |
+
],
|
| 81 |
+
"execution_count": null,
|
| 82 |
+
"outputs": []
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"cell_type": "code",
|
| 86 |
+
"metadata": {
|
| 87 |
+
"id": "XV0Tp_SQydvh"
|
| 88 |
+
},
|
| 89 |
+
"source": [
|
| 90 |
+
"!pip freeze > requirements.txt"
|
| 91 |
+
],
|
| 92 |
+
"execution_count": null,
|
| 93 |
+
"outputs": []
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"cell_type": "code",
|
| 97 |
+
"metadata": {
|
| 98 |
+
"colab": {
|
| 99 |
+
"base_uri": "https://localhost:8080/"
|
| 100 |
+
},
|
| 101 |
+
"id": "84dyXWmLyrsn",
|
| 102 |
+
"outputId": "57188228-60e3-4a80-b3b9-737362a81227"
|
| 103 |
+
},
|
| 104 |
+
"source": [
|
| 105 |
+
"!pip install ."
|
| 106 |
+
],
|
| 107 |
+
"execution_count": null,
|
| 108 |
+
"outputs": [
|
| 109 |
+
{
|
| 110 |
+
"output_type": "stream",
|
| 111 |
+
"name": "stdout",
|
| 112 |
+
"text": [
|
| 113 |
+
"Processing /content/twint\n",
|
| 114 |
+
"\u001b[33m DEPRECATION: A future pip version will change local packages to be built in-place without first copying to a temporary directory. We recommend you use --use-feature=in-tree-build to test your packages with this new behavior before it becomes the default.\n",
|
| 115 |
+
" pip 21.3 will remove support for this functionality. You can find discussion regarding this at https://github.com/pypa/pip/issues/7555.\u001b[0m\n",
|
| 116 |
+
"Collecting aiohttp\n",
|
| 117 |
+
" Downloading aiohttp-3.7.4.post0-cp37-cp37m-manylinux2014_x86_64.whl (1.3 MB)\n",
|
| 118 |
+
"\u001b[K |████████████████████████████████| 1.3 MB 7.3 MB/s \n",
|
| 119 |
+
"\u001b[?25hCollecting aiodns\n",
|
| 120 |
+
" Downloading aiodns-3.0.0-py3-none-any.whl (5.0 kB)\n",
|
| 121 |
+
"Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (4.6.3)\n",
|
| 122 |
+
"Collecting cchardet\n",
|
| 123 |
+
" Downloading cchardet-2.1.7-cp37-cp37m-manylinux2010_x86_64.whl (263 kB)\n",
|
| 124 |
+
"\u001b[K |████████████████████████████████| 263 kB 47.9 MB/s \n",
|
| 125 |
+
"\u001b[?25hCollecting dataclasses\n",
|
| 126 |
+
" Downloading dataclasses-0.6-py3-none-any.whl (14 kB)\n",
|
| 127 |
+
"Collecting elasticsearch\n",
|
| 128 |
+
" Downloading elasticsearch-7.15.1-py2.py3-none-any.whl (378 kB)\n",
|
| 129 |
+
"\u001b[K |████████████████████████████████| 378 kB 70.9 MB/s \n",
|
| 130 |
+
"\u001b[?25hRequirement already satisfied: pysocks in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (1.7.1)\n",
|
| 131 |
+
"Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (1.1.5)\n",
|
| 132 |
+
"Collecting aiohttp_socks\n",
|
| 133 |
+
" Downloading aiohttp_socks-0.6.0-py3-none-any.whl (9.2 kB)\n",
|
| 134 |
+
"Collecting schedule\n",
|
| 135 |
+
" Downloading schedule-1.1.0-py2.py3-none-any.whl (10 kB)\n",
|
| 136 |
+
"Requirement already satisfied: geopy in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (1.17.0)\n",
|
| 137 |
+
"Collecting fake-useragent\n",
|
| 138 |
+
" Downloading fake-useragent-0.1.11.tar.gz (13 kB)\n",
|
| 139 |
+
"Collecting googletransx\n",
|
| 140 |
+
" Downloading googletransx-2.4.2.tar.gz (13 kB)\n",
|
| 141 |
+
"Collecting pycares>=4.0.0\n",
|
| 142 |
+
" Downloading pycares-4.0.0-cp37-cp37m-manylinux2010_x86_64.whl (291 kB)\n",
|
| 143 |
+
"\u001b[K |████████████████████████████████| 291 kB 59.7 MB/s \n",
|
| 144 |
+
"\u001b[?25hRequirement already satisfied: cffi>=1.5.0 in /usr/local/lib/python3.7/dist-packages (from pycares>=4.0.0->aiodns->twint==2.1.21) (1.14.6)\n",
|
| 145 |
+
"Requirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi>=1.5.0->pycares>=4.0.0->aiodns->twint==2.1.21) (2.20)\n",
|
| 146 |
+
"Collecting multidict<7.0,>=4.5\n",
|
| 147 |
+
" Downloading multidict-5.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (160 kB)\n",
|
| 148 |
+
"\u001b[K |████████████████████████████████| 160 kB 67.8 MB/s \n",
|
| 149 |
+
"\u001b[?25hRequirement already satisfied: typing-extensions>=3.6.5 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint==2.1.21) (3.7.4.3)\n",
|
| 150 |
+
"Requirement already satisfied: chardet<5.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint==2.1.21) (3.0.4)\n",
|
| 151 |
+
"Collecting yarl<2.0,>=1.0\n",
|
| 152 |
+
" Downloading yarl-1.7.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)\n",
|
| 153 |
+
"\u001b[K |████████████████████████████████| 271 kB 65.8 MB/s \n",
|
| 154 |
+
"\u001b[?25hCollecting async-timeout<4.0,>=3.0\n",
|
| 155 |
+
" Downloading async_timeout-3.0.1-py3-none-any.whl (8.2 kB)\n",
|
| 156 |
+
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint==2.1.21) (21.2.0)\n",
|
| 157 |
+
"Requirement already satisfied: idna>=2.0 in /usr/local/lib/python3.7/dist-packages (from yarl<2.0,>=1.0->aiohttp->twint==2.1.21) (2.10)\n",
|
| 158 |
+
"Collecting python-socks[asyncio]>=1.2.2\n",
|
| 159 |
+
" Downloading python_socks-1.2.4-py3-none-any.whl (35 kB)\n",
|
| 160 |
+
"Requirement already satisfied: urllib3<2,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from elasticsearch->twint==2.1.21) (1.24.3)\n",
|
| 161 |
+
"Requirement already satisfied: certifi in /usr/local/lib/python3.7/dist-packages (from elasticsearch->twint==2.1.21) (2021.5.30)\n",
|
| 162 |
+
"Requirement already satisfied: geographiclib<2,>=1.49 in /usr/local/lib/python3.7/dist-packages (from geopy->twint==2.1.21) (1.52)\n",
|
| 163 |
+
"Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from googletransx->twint==2.1.21) (2.23.0)\n",
|
| 164 |
+
"Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->twint==2.1.21) (2018.9)\n",
|
| 165 |
+
"Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->twint==2.1.21) (2.8.2)\n",
|
| 166 |
+
"Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas->twint==2.1.21) (1.19.5)\n",
|
| 167 |
+
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->twint==2.1.21) (1.15.0)\n",
|
| 168 |
+
"Building wheels for collected packages: twint, fake-useragent, googletransx\n",
|
| 169 |
+
" Building wheel for twint (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
| 170 |
+
" Created wheel for twint: filename=twint-2.1.21-py3-none-any.whl size=38870 sha256=a648841e8abdbeafa3718d69334377eef20411c64b11553f70be9937a84be56a\n",
|
| 171 |
+
" Stored in directory: /tmp/pip-ephem-wheel-cache-oe_ws1ie/wheels/f7/3e/11/2803f3c6890e87a9bec35bb8e37ef1ad0777a00f43e2441fb1\n",
|
| 172 |
+
" Building wheel for fake-useragent (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
| 173 |
+
" Created wheel for fake-useragent: filename=fake_useragent-0.1.11-py3-none-any.whl size=13502 sha256=db8ea0f861a9b913fb4822f3cbfb76d4ff27a371f7c125657f7cbb17766fc316\n",
|
| 174 |
+
" Stored in directory: /root/.cache/pip/wheels/ed/f7/62/50ab6c9a0b5567267ab76a9daa9d06315704209b2c5d032031\n",
|
| 175 |
+
" Building wheel for googletransx (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
| 176 |
+
" Created wheel for googletransx: filename=googletransx-2.4.2-py3-none-any.whl size=15968 sha256=513c5ee44dad1d794a351939b43e11cd2a5d2fbe541ad2a9271d661c96e29221\n",
|
| 177 |
+
" Stored in directory: /root/.cache/pip/wheels/66/d5/b1/31104b338f7fd45aa8f7d22587765db06773b13df48a89735f\n",
|
| 178 |
+
"Successfully built twint fake-useragent googletransx\n",
|
| 179 |
+
"Installing collected packages: multidict, yarl, python-socks, async-timeout, pycares, aiohttp, schedule, googletransx, fake-useragent, elasticsearch, dataclasses, cchardet, aiohttp-socks, aiodns, twint\n",
|
| 180 |
+
"Successfully installed aiodns-3.0.0 aiohttp-3.7.4.post0 aiohttp-socks-0.6.0 async-timeout-3.0.1 cchardet-2.1.7 dataclasses-0.6 elasticsearch-7.15.1 fake-useragent-0.1.11 googletransx-2.4.2 multidict-5.2.0 pycares-4.0.0 python-socks-1.2.4 schedule-1.1.0 twint-2.1.21 yarl-1.7.0\n"
|
| 181 |
+
]
|
| 182 |
+
}
|
| 183 |
+
]
|
| 184 |
+
},
|
| 185 |
+
{
|
| 186 |
+
"cell_type": "code",
|
| 187 |
+
"metadata": {
|
| 188 |
+
"colab": {
|
| 189 |
+
"base_uri": "https://localhost:8080/"
|
| 190 |
+
},
|
| 191 |
+
"id": "b8rOKGVQ8OBq",
|
| 192 |
+
"outputId": "499b3126-f2b4-4799-f265-5065322a0146"
|
| 193 |
+
},
|
| 194 |
+
"source": [
|
| 195 |
+
"!pip install -U git+https://github.com/cyxv/twint.git@master"
|
| 196 |
+
],
|
| 197 |
+
"execution_count": null,
|
| 198 |
+
"outputs": [
|
| 199 |
+
{
|
| 200 |
+
"output_type": "stream",
|
| 201 |
+
"name": "stdout",
|
| 202 |
+
"text": [
|
| 203 |
+
"Collecting git+https://github.com/cyxv/twint.git@master\n",
|
| 204 |
+
" Cloning https://github.com/cyxv/twint.git (to revision master) to /tmp/pip-req-build-bjyd0ng2\n",
|
| 205 |
+
" Running command git clone -q https://github.com/cyxv/twint.git /tmp/pip-req-build-bjyd0ng2\n",
|
| 206 |
+
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (3.7.4.post0)\n",
|
| 207 |
+
"Requirement already satisfied: aiodns in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (3.0.0)\n",
|
| 208 |
+
"Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (4.6.3)\n",
|
| 209 |
+
"Requirement already satisfied: cchardet in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (2.1.7)\n",
|
| 210 |
+
"Requirement already satisfied: dataclasses in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (0.6)\n",
|
| 211 |
+
"Requirement already satisfied: elasticsearch in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (7.15.1)\n",
|
| 212 |
+
"Requirement already satisfied: pysocks in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (1.7.1)\n",
|
| 213 |
+
"Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (1.1.5)\n",
|
| 214 |
+
"Requirement already satisfied: aiohttp_socks in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (0.6.0)\n",
|
| 215 |
+
"Requirement already satisfied: schedule in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (1.1.0)\n",
|
| 216 |
+
"Requirement already satisfied: geopy in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (1.17.0)\n",
|
| 217 |
+
"Requirement already satisfied: fake-useragent in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (0.1.11)\n",
|
| 218 |
+
"Requirement already satisfied: googletransx in /usr/local/lib/python3.7/dist-packages (from twint==2.1.21) (2.4.2)\n",
|
| 219 |
+
"Requirement already satisfied: pycares>=4.0.0 in /usr/local/lib/python3.7/dist-packages (from aiodns->twint==2.1.21) (4.0.0)\n",
|
| 220 |
+
"Requirement already satisfied: cffi>=1.5.0 in /usr/local/lib/python3.7/dist-packages (from pycares>=4.0.0->aiodns->twint==2.1.21) (1.14.6)\n",
|
| 221 |
+
"Requirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi>=1.5.0->pycares>=4.0.0->aiodns->twint==2.1.21) (2.20)\n",
|
| 222 |
+
"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint==2.1.21) (1.7.0)\n",
|
| 223 |
+
"Requirement already satisfied: chardet<5.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint==2.1.21) (3.0.4)\n",
|
| 224 |
+
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint==2.1.21) (5.2.0)\n",
|
| 225 |
+
"Requirement already satisfied: async-timeout<4.0,>=3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint==2.1.21) (3.0.1)\n",
|
| 226 |
+
"Requirement already satisfied: typing-extensions>=3.6.5 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint==2.1.21) (3.7.4.3)\n",
|
| 227 |
+
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint==2.1.21) (21.2.0)\n",
|
| 228 |
+
"Requirement already satisfied: idna>=2.0 in /usr/local/lib/python3.7/dist-packages (from yarl<2.0,>=1.0->aiohttp->twint==2.1.21) (2.10)\n",
|
| 229 |
+
"Requirement already satisfied: python-socks[asyncio]>=1.2.2 in /usr/local/lib/python3.7/dist-packages (from aiohttp_socks->twint==2.1.21) (1.2.4)\n",
|
| 230 |
+
"Requirement already satisfied: urllib3<2,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from elasticsearch->twint==2.1.21) (1.24.3)\n",
|
| 231 |
+
"Requirement already satisfied: certifi in /usr/local/lib/python3.7/dist-packages (from elasticsearch->twint==2.1.21) (2021.5.30)\n",
|
| 232 |
+
"Requirement already satisfied: geographiclib<2,>=1.49 in /usr/local/lib/python3.7/dist-packages (from geopy->twint==2.1.21) (1.52)\n",
|
| 233 |
+
"Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from googletransx->twint==2.1.21) (2.23.0)\n",
|
| 234 |
+
"Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->twint==2.1.21) (2018.9)\n",
|
| 235 |
+
"Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas->twint==2.1.21) (1.19.5)\n",
|
| 236 |
+
"Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->twint==2.1.21) (2.8.2)\n",
|
| 237 |
+
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->twint==2.1.21) (1.15.0)\n"
|
| 238 |
+
]
|
| 239 |
+
}
|
| 240 |
+
]
|
| 241 |
+
},
|
| 242 |
+
{
|
| 243 |
+
"cell_type": "code",
|
| 244 |
+
"metadata": {
|
| 245 |
+
"colab": {
|
| 246 |
+
"base_uri": "https://localhost:8080/"
|
| 247 |
+
},
|
| 248 |
+
"id": "PKbYSKBJy2Ow",
|
| 249 |
+
"outputId": "8f24189f-56ec-41d5-f8c0-475ffb3b1bc2"
|
| 250 |
+
},
|
| 251 |
+
"source": [
|
| 252 |
+
"!pip install nest_asyncio"
|
| 253 |
+
],
|
| 254 |
+
"execution_count": null,
|
| 255 |
+
"outputs": [
|
| 256 |
+
{
|
| 257 |
+
"output_type": "stream",
|
| 258 |
+
"name": "stdout",
|
| 259 |
+
"text": [
|
| 260 |
+
"Requirement already satisfied: nest_asyncio in /usr/local/lib/python3.7/dist-packages (1.5.1)\n"
|
| 261 |
+
]
|
| 262 |
+
}
|
| 263 |
+
]
|
| 264 |
+
},
|
| 265 |
+
{
|
| 266 |
+
"cell_type": "code",
|
| 267 |
+
"metadata": {
|
| 268 |
+
"colab": {
|
| 269 |
+
"base_uri": "https://localhost:8080/"
|
| 270 |
+
},
|
| 271 |
+
"id": "Fks7CZYMy5cR",
|
| 272 |
+
"outputId": "180f0f46-4fca-4e7f-a134-2cf99283a6a4"
|
| 273 |
+
},
|
| 274 |
+
"source": [
|
| 275 |
+
"!pip3 install twint"
|
| 276 |
+
],
|
| 277 |
+
"execution_count": null,
|
| 278 |
+
"outputs": [
|
| 279 |
+
{
|
| 280 |
+
"output_type": "stream",
|
| 281 |
+
"name": "stdout",
|
| 282 |
+
"text": [
|
| 283 |
+
"Requirement already satisfied: twint in /usr/local/lib/python3.7/dist-packages (2.1.21)\n",
|
| 284 |
+
"Requirement already satisfied: schedule in /usr/local/lib/python3.7/dist-packages (from twint) (1.1.0)\n",
|
| 285 |
+
"Requirement already satisfied: pysocks in /usr/local/lib/python3.7/dist-packages (from twint) (1.7.1)\n",
|
| 286 |
+
"Requirement already satisfied: cchardet in /usr/local/lib/python3.7/dist-packages (from twint) (2.1.7)\n",
|
| 287 |
+
"Requirement already satisfied: fake-useragent in /usr/local/lib/python3.7/dist-packages (from twint) (0.1.11)\n",
|
| 288 |
+
"Requirement already satisfied: elasticsearch in /usr/local/lib/python3.7/dist-packages (from twint) (7.15.1)\n",
|
| 289 |
+
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.7/dist-packages (from twint) (3.7.4.post0)\n",
|
| 290 |
+
"Requirement already satisfied: googletransx in /usr/local/lib/python3.7/dist-packages (from twint) (2.4.2)\n",
|
| 291 |
+
"Requirement already satisfied: geopy in /usr/local/lib/python3.7/dist-packages (from twint) (1.17.0)\n",
|
| 292 |
+
"Requirement already satisfied: aiohttp-socks in /usr/local/lib/python3.7/dist-packages (from twint) (0.6.0)\n",
|
| 293 |
+
"Requirement already satisfied: aiodns in /usr/local/lib/python3.7/dist-packages (from twint) (3.0.0)\n",
|
| 294 |
+
"Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from twint) (1.1.5)\n",
|
| 295 |
+
"Requirement already satisfied: dataclasses in /usr/local/lib/python3.7/dist-packages (from twint) (0.6)\n",
|
| 296 |
+
"Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.7/dist-packages (from twint) (4.6.3)\n",
|
| 297 |
+
"Requirement already satisfied: pycares>=4.0.0 in /usr/local/lib/python3.7/dist-packages (from aiodns->twint) (4.0.0)\n",
|
| 298 |
+
"Requirement already satisfied: cffi>=1.5.0 in /usr/local/lib/python3.7/dist-packages (from pycares>=4.0.0->aiodns->twint) (1.14.6)\n",
|
| 299 |
+
"Requirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi>=1.5.0->pycares>=4.0.0->aiodns->twint) (2.20)\n",
|
| 300 |
+
"Requirement already satisfied: chardet<5.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint) (3.0.4)\n",
|
| 301 |
+
"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint) (1.7.0)\n",
|
| 302 |
+
"Requirement already satisfied: async-timeout<4.0,>=3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint) (3.0.1)\n",
|
| 303 |
+
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint) (21.2.0)\n",
|
| 304 |
+
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint) (5.2.0)\n",
|
| 305 |
+
"Requirement already satisfied: typing-extensions>=3.6.5 in /usr/local/lib/python3.7/dist-packages (from aiohttp->twint) (3.7.4.3)\n",
|
| 306 |
+
"Requirement already satisfied: idna>=2.0 in /usr/local/lib/python3.7/dist-packages (from yarl<2.0,>=1.0->aiohttp->twint) (2.10)\n",
|
| 307 |
+
"Requirement already satisfied: python-socks[asyncio]>=1.2.2 in /usr/local/lib/python3.7/dist-packages (from aiohttp-socks->twint) (1.2.4)\n",
|
| 308 |
+
"Requirement already satisfied: urllib3<2,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from elasticsearch->twint) (1.24.3)\n",
|
| 309 |
+
"Requirement already satisfied: certifi in /usr/local/lib/python3.7/dist-packages (from elasticsearch->twint) (2021.5.30)\n",
|
| 310 |
+
"Requirement already satisfied: geographiclib<2,>=1.49 in /usr/local/lib/python3.7/dist-packages (from geopy->twint) (1.52)\n",
|
| 311 |
+
"Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from googletransx->twint) (2.23.0)\n",
|
| 312 |
+
"Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->twint) (2.8.2)\n",
|
| 313 |
+
"Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->twint) (2018.9)\n",
|
| 314 |
+
"Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas->twint) (1.19.5)\n",
|
| 315 |
+
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->twint) (1.15.0)\n"
|
| 316 |
+
]
|
| 317 |
+
}
|
| 318 |
+
]
|
| 319 |
+
},
|
| 320 |
+
{
|
| 321 |
+
"cell_type": "code",
|
| 322 |
+
"metadata": {
|
| 323 |
+
"id": "qZyxEstcy8R9"
|
| 324 |
+
},
|
| 325 |
+
"source": [
|
| 326 |
+
"# Import required libraries\n",
|
| 327 |
+
"import nest_asyncio\n",
|
| 328 |
+
"nest_asyncio.apply()\n",
|
| 329 |
+
"import pandas as pd\n",
|
| 330 |
+
"import twint\n",
|
| 331 |
+
"import pandas as pd\n",
|
| 332 |
+
"import re"
|
| 333 |
+
],
|
| 334 |
+
"execution_count": null,
|
| 335 |
+
"outputs": []
|
| 336 |
+
},
|
| 337 |
+
{
|
| 338 |
+
"cell_type": "code",
|
| 339 |
+
"metadata": {
|
| 340 |
+
"colab": {
|
| 341 |
+
"base_uri": "https://localhost:8080/",
|
| 342 |
+
"height": 371
|
| 343 |
+
},
|
| 344 |
+
"id": "oAkguWB20dB4",
|
| 345 |
+
"outputId": "e595e29d-f470-4f3c-9846-32a60f7ed2ef"
|
| 346 |
+
},
|
| 347 |
+
"source": [
|
| 348 |
+
"# add some tweets with depressed and depression tags, for a particular year\n",
|
| 349 |
+
"\n",
|
| 350 |
+
"depress_tags = [\"#depressed\", \"#anxiety\", \"#depression\", \"#suicide\", \"#mentalhealth\"\n",
|
| 351 |
+
" \"#loneliness\", \"#hopelessness\", \"#itsokaynottobeokay\"]\n",
|
| 352 |
+
"\n",
|
| 353 |
+
"content = {}\n",
|
| 354 |
+
"for i in range(len(depress_tags)):\n",
|
| 355 |
+
" print(depress_tags[i])\n",
|
| 356 |
+
" c = twint.Config()\n",
|
| 357 |
+
" \n",
|
| 358 |
+
" c.Format = \"Tweet id: {id} | Tweet: {tweet}\"\n",
|
| 359 |
+
" c.Search = depress_tags[i]\n",
|
| 360 |
+
" c.Limit = 1000\n",
|
| 361 |
+
" c.Year = 2019\n",
|
| 362 |
+
" c.Lang = \"en\"\n",
|
| 363 |
+
" c.Store_csv = True\n",
|
| 364 |
+
" c.Store_Object = True\n",
|
| 365 |
+
" c.Output = \"/content/drive/MyDrive/NLP/Depression_Detection/depressive_en_2019.csv\"\n",
|
| 366 |
+
" c.Hide_output = True\n",
|
| 367 |
+
" c.Stats = True\n",
|
| 368 |
+
" c.Lowercase = True\n",
|
| 369 |
+
" c.Filter_retweets = True\n",
|
| 370 |
+
" twint.run.Search(c)"
|
| 371 |
+
],
|
| 372 |
+
"execution_count": null,
|
| 373 |
+
"outputs": [
|
| 374 |
+
{
|
| 375 |
+
"output_type": "stream",
|
| 376 |
+
"name": "stdout",
|
| 377 |
+
"text": [
|
| 378 |
+
"#depressed\n"
|
| 379 |
+
]
|
| 380 |
+
},
|
| 381 |
+
{
|
| 382 |
+
"output_type": "error",
|
| 383 |
+
"ename": "TypeError",
|
| 384 |
+
"evalue": "ignored",
|
| 385 |
+
"traceback": [
|
| 386 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
| 387 |
+
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
|
| 388 |
+
"\u001b[0;32m<ipython-input-3-092f46e39459>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLowercase\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFilter_retweets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 23\u001b[0;31m \u001b[0mtwint\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
| 389 |
+
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/run.py\u001b[0m in \u001b[0;36mSearch\u001b[0;34m(config, callback)\u001b[0m\n\u001b[1;32m 408\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFollowers\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 409\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mProfile\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 410\u001b[0;31m \u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 411\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPandas_au\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 412\u001b[0m \u001b[0mstorage\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpanda\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_autoget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"tweet\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 390 |
+
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/run.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(config, callback)\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[0;32mraise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 328\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 329\u001b[0;31m \u001b[0mget_event_loop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_until_complete\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mTwint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcallback\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 330\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 331\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 391 |
+
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/nest_asyncio.py\u001b[0m in \u001b[0;36mrun_until_complete\u001b[0;34m(self, future)\u001b[0m\n\u001b[1;32m 68\u001b[0m raise RuntimeError(\n\u001b[1;32m 69\u001b[0m 'Event loop stopped before Future completed.')\n\u001b[0;32m---> 70\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 71\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_run_once\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 392 |
+
"\u001b[0;32m/usr/lib/python3.7/asyncio/futures.py\u001b[0m in \u001b[0;36mresult\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__log_traceback\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 180\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_exception\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 181\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_exception\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 182\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_result\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 393 |
+
"\u001b[0;32m/usr/lib/python3.7/asyncio/tasks.py\u001b[0m in \u001b[0;36m__step\u001b[0;34m(***failed resolving arguments***)\u001b[0m\n\u001b[1;32m 249\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcoro\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 250\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 251\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcoro\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mthrow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 252\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_must_cancel\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 394 |
+
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/run.py\u001b[0m in \u001b[0;36mmain\u001b[0;34m(self, callback)\u001b[0m\n\u001b[1;32m 233\u001b[0m \u001b[0mtask\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_done_callback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcallback\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 234\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 235\u001b[0;31m \u001b[0;32mawait\u001b[0m \u001b[0mtask\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 236\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 237\u001b[0m \u001b[0;32masync\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 395 |
+
"\u001b[0;32m/usr/lib/python3.7/asyncio/futures.py\u001b[0m in \u001b[0;36m__await__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 261\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdone\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_asyncio_future_blocking\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 263\u001b[0;31m \u001b[0;32myield\u001b[0m \u001b[0mself\u001b[0m \u001b[0;31m# This tells Task to wait for completion.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 264\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdone\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 265\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"await wasn't used with future\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 396 |
+
"\u001b[0;32m/usr/lib/python3.7/asyncio/tasks.py\u001b[0m in \u001b[0;36m__wakeup\u001b[0;34m(self, future)\u001b[0m\n\u001b[1;32m 316\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__wakeup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfuture\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 318\u001b[0;31m \u001b[0mfuture\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 319\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 320\u001b[0m \u001b[0;31m# This may also be a cancellation.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 397 |
+
"\u001b[0;32m/usr/lib/python3.7/asyncio/futures.py\u001b[0m in \u001b[0;36mresult\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__log_traceback\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 180\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_exception\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 181\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_exception\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 182\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_result\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 398 |
+
"\u001b[0;32m/usr/lib/python3.7/asyncio/tasks.py\u001b[0m in \u001b[0;36m__step\u001b[0;34m(***failed resolving arguments***)\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;31m# We use the `send` method directly, because coroutines\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 248\u001b[0m \u001b[0;31m# don't have `__iter__` and `__next__` methods.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 249\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcoro\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 250\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 251\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcoro\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mthrow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 399 |
+
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/run.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 284\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTwitterSearch\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 285\u001b[0m \u001b[0mlogme\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m':Twint:main:twitter-search'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 286\u001b[0;31m \u001b[0;32mawait\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtweets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 287\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 288\u001b[0m \u001b[0mlogme\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m':Twint:main:no-more-tweets'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 400 |
+
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/run.py\u001b[0m in \u001b[0;36mtweets\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 224\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtweet\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeed\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 225\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 226\u001b[0;31m \u001b[0;32mawait\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTweets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtweet\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 227\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 228\u001b[0m \u001b[0;32masync\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 401 |
+
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/output.py\u001b[0m in \u001b[0;36mTweets\u001b[0;34m(tweets, config, conn)\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTwitterSearch\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mProfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 165\u001b[0m \u001b[0mlogme\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m':Tweets:TwitterSearch'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 166\u001b[0;31m \u001b[0;32mawait\u001b[0m \u001b[0mcheckData\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtweets\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 167\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[0mlogme\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m':Tweets:else'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 402 |
+
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/output.py\u001b[0m in \u001b[0;36mcheckData\u001b[0;34m(tweet, config, conn)\u001b[0m\n\u001b[1;32m 135\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 136\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdatecheck\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtweet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatestamp\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\" \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtweet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtimestamp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 137\u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTweet\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtweet\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 138\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDatabase\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 139\u001b[0m \u001b[0mlogme\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m':checkData:Database'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 403 |
+
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/format.py\u001b[0m in \u001b[0;36mTweet\u001b[0;34m(config, t)\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{hashtags}\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\",\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhashtags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{cashtags}\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\",\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcashtags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 23\u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{replies}\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplies_count\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 24\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{retweets}\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretweets_count\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{likes}\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlikes_count\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 404 |
+
"\u001b[0;31mTypeError\u001b[0m: replace() argument 2 must be str, not int"
|
| 405 |
+
]
|
| 406 |
+
}
|
| 407 |
+
]
|
| 408 |
+
},
|
| 409 |
+
{
|
| 410 |
+
"cell_type": "code",
|
| 411 |
+
"metadata": {
|
| 412 |
+
"colab": {
|
| 413 |
+
"base_uri": "https://localhost:8080/",
|
| 414 |
+
"height": 424
|
| 415 |
+
},
|
| 416 |
+
"id": "g4zJVi7sy_b2",
|
| 417 |
+
"outputId": "71712dc1-cfeb-4294-f883-86fc6ce82984"
|
| 418 |
+
},
|
| 419 |
+
"source": [
|
| 420 |
+
"# add some tweets with depressed and depression tags, for a particular year\n",
|
| 421 |
+
"\n",
|
| 422 |
+
"depress_tags = [\"#depressed\", \"#depression\", \"#loneliness\", \"#hopelessness\"]\n",
|
| 423 |
+
"\n",
|
| 424 |
+
"content = {}\n",
|
| 425 |
+
"for i in range(len(depress_tags)):\n",
|
| 426 |
+
" print(depress_tags[i])\n",
|
| 427 |
+
" c = twint.Config()\n",
|
| 428 |
+
" \n",
|
| 429 |
+
" c.Format = \"Tweet id: {id} | Tweet: {tweet}\"\n",
|
| 430 |
+
" c.Search = depress_tags[i]\n",
|
| 431 |
+
" c.Limit = 1000\n",
|
| 432 |
+
" c.Year = 2020\n",
|
| 433 |
+
" c.Store_csv = True\n",
|
| 434 |
+
" c.Store_json = True\n",
|
| 435 |
+
" c.Output = \"/content/drive/MyDrive/NLP/Depression_Detection/dataset_depression.json\"\n",
|
| 436 |
+
" c.Hide_output = True\n",
|
| 437 |
+
" c.Stats = True\n",
|
| 438 |
+
" c.Lowercase = True\n",
|
| 439 |
+
" c.Filter_retweets = True\n",
|
| 440 |
+
" twint.run.Search(c)"
|
| 441 |
+
],
|
| 442 |
+
"execution_count": null,
|
| 443 |
+
"outputs": [
|
| 444 |
+
{
|
| 445 |
+
"output_type": "stream",
|
| 446 |
+
"name": "stdout",
|
| 447 |
+
"text": [
|
| 448 |
+
"#depressed\n",
|
| 449 |
+
"[!] No more data! Scraping will stop now.\n",
|
| 450 |
+
"found 0 deleted tweets in this search.\n",
|
| 451 |
+
"#depression\n"
|
| 452 |
+
]
|
| 453 |
+
},
|
| 454 |
+
{
|
| 455 |
+
"output_type": "error",
|
| 456 |
+
"ename": "TypeError",
|
| 457 |
+
"evalue": "ignored",
|
| 458 |
+
"traceback": [
|
| 459 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
| 460 |
+
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
|
| 461 |
+
"\u001b[0;32m<ipython-input-6-d584c0441bfc>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLowercase\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFilter_retweets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 21\u001b[0;31m \u001b[0mtwint\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
| 462 |
+
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/run.py\u001b[0m in \u001b[0;36mSearch\u001b[0;34m(config, callback)\u001b[0m\n\u001b[1;32m 408\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFollowers\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 409\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mProfile\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 410\u001b[0;31m \u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 411\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPandas_au\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 412\u001b[0m \u001b[0mstorage\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpanda\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_autoget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"tweet\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 463 |
+
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/run.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(config, callback)\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[0;32mraise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 328\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 329\u001b[0;31m \u001b[0mget_event_loop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_until_complete\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mTwint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcallback\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 330\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 331\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 464 |
+
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/nest_asyncio.py\u001b[0m in \u001b[0;36mrun_until_complete\u001b[0;34m(self, future)\u001b[0m\n\u001b[1;32m 68\u001b[0m raise RuntimeError(\n\u001b[1;32m 69\u001b[0m 'Event loop stopped before Future completed.')\n\u001b[0;32m---> 70\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 71\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_run_once\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 465 |
+
"\u001b[0;32m/usr/lib/python3.7/asyncio/futures.py\u001b[0m in \u001b[0;36mresult\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__log_traceback\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 180\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_exception\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 181\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_exception\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 182\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_result\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 466 |
+
"\u001b[0;32m/usr/lib/python3.7/asyncio/tasks.py\u001b[0m in \u001b[0;36m__step\u001b[0;34m(***failed resolving arguments***)\u001b[0m\n\u001b[1;32m 249\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcoro\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 250\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 251\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcoro\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mthrow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 252\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_must_cancel\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 467 |
+
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/run.py\u001b[0m in \u001b[0;36mmain\u001b[0;34m(self, callback)\u001b[0m\n\u001b[1;32m 233\u001b[0m \u001b[0mtask\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_done_callback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcallback\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 234\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 235\u001b[0;31m \u001b[0;32mawait\u001b[0m \u001b[0mtask\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 236\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 237\u001b[0m \u001b[0;32masync\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 468 |
+
"\u001b[0;32m/usr/lib/python3.7/asyncio/futures.py\u001b[0m in \u001b[0;36m__await__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 261\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdone\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_asyncio_future_blocking\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 263\u001b[0;31m \u001b[0;32myield\u001b[0m \u001b[0mself\u001b[0m \u001b[0;31m# This tells Task to wait for completion.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 264\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdone\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 265\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"await wasn't used with future\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 469 |
+
"\u001b[0;32m/usr/lib/python3.7/asyncio/tasks.py\u001b[0m in \u001b[0;36m__wakeup\u001b[0;34m(self, future)\u001b[0m\n\u001b[1;32m 316\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__wakeup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfuture\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 318\u001b[0;31m \u001b[0mfuture\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 319\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 320\u001b[0m \u001b[0;31m# This may also be a cancellation.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 470 |
+
"\u001b[0;32m/usr/lib/python3.7/asyncio/futures.py\u001b[0m in \u001b[0;36mresult\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__log_traceback\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 180\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_exception\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 181\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_exception\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 182\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_result\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 471 |
+
"\u001b[0;32m/usr/lib/python3.7/asyncio/tasks.py\u001b[0m in \u001b[0;36m__step\u001b[0;34m(***failed resolving arguments***)\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;31m# We use the `send` method directly, because coroutines\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 248\u001b[0m \u001b[0;31m# don't have `__iter__` and `__next__` methods.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 249\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcoro\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 250\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 251\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcoro\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mthrow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 472 |
+
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/run.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 284\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTwitterSearch\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 285\u001b[0m \u001b[0mlogme\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m':Twint:main:twitter-search'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 286\u001b[0;31m \u001b[0;32mawait\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtweets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 287\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 288\u001b[0m \u001b[0mlogme\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m':Twint:main:no-more-tweets'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 473 |
+
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/run.py\u001b[0m in \u001b[0;36mtweets\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 224\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtweet\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeed\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 225\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 226\u001b[0;31m \u001b[0;32mawait\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTweets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtweet\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 227\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 228\u001b[0m \u001b[0;32masync\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 474 |
+
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/output.py\u001b[0m in \u001b[0;36mTweets\u001b[0;34m(tweets, config, conn)\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTwitterSearch\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mProfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 165\u001b[0m \u001b[0mlogme\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m':Tweets:TwitterSearch'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 166\u001b[0;31m \u001b[0;32mawait\u001b[0m \u001b[0mcheckData\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtweets\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 167\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[0mlogme\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m':Tweets:else'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 475 |
+
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/output.py\u001b[0m in \u001b[0;36mcheckData\u001b[0;34m(tweet, config, conn)\u001b[0m\n\u001b[1;32m 135\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 136\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdatecheck\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtweet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatestamp\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\" \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtweet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtimestamp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 137\u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTweet\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtweet\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 138\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDatabase\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 139\u001b[0m \u001b[0mlogme\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m':checkData:Database'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 476 |
+
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/twint/format.py\u001b[0m in \u001b[0;36mTweet\u001b[0;34m(config, t)\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{hashtags}\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\",\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhashtags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{cashtags}\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\",\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcashtags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 23\u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{replies}\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplies_count\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 24\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{retweets}\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretweets_count\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{likes}\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlikes_count\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
| 477 |
+
"\u001b[0;31mTypeError\u001b[0m: replace() argument 2 must be str, not int"
|
| 478 |
+
]
|
| 479 |
+
}
|
| 480 |
+
]
|
| 481 |
+
}
|
| 482 |
+
]
|
| 483 |
+
}
|
source_code/assets/notebooks/data_gathering_twitter_API.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
source_code/assets/notebooks/modeling.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
source_code/core/clean.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==============================================================================
|
| 2 |
+
# PROJECT: DEPRESSION-DETECTION-USING-TWEETS
|
| 3 |
+
# AUTHORS: AMEY THAKUR & MEGA SATISH
|
| 4 |
+
# GITHUB (AMEY): https://github.com/Amey-Thakur
|
| 5 |
+
# GITHUB (MEGA): https://github.com/msatmod
|
| 6 |
+
# REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
|
| 7 |
+
# RELEASE DATE: June 5, 2022
|
| 8 |
+
# LICENSE: MIT License
|
| 9 |
+
# DESCRIPTION: Utility for cleaning raw tweet data for analysis.
|
| 10 |
+
# ==============================================================================
|
| 11 |
+
|
| 12 |
+
import argparse
|
| 13 |
+
import warnings
|
| 14 |
+
import clean_utilities as CU
|
| 15 |
+
|
| 16 |
+
# Suppression of non-critical runtime warnings to ensure output clarity
|
| 17 |
+
warnings.filterwarnings("ignore")
|
| 18 |
+
|
| 19 |
+
def main():
    """
    Primary execution routine for the tweet cleaning utility.

    Reads a raw tweet from the file named on the command line, runs it
    through the cleaning pipeline in ``clean_utilities``, and writes the
    sanitized result to ``clean_tweet.txt`` in the working directory.
    """
    # Configuration of the command-line argument parser
    parser = argparse.ArgumentParser(
        description="Twitter Depression Detection: Text Cleaning Utility"
    )

    # Mandatory positional argument: path to the raw tweet text file
    parser.add_argument(
        'filename',
        help="Path to the raw text file containing the tweet to be sanitized"
    )

    args = parser.parse_args()

    # NOTE: argparse exits with a usage error when the positional argument is
    # missing, so `args.filename` is always set here — no None check needed.
    print(f"Targeting file for preprocessing: {args.filename}")

    try:
        # Read the whole tweet file as UTF-8 text
        with open(args.filename, 'r', encoding='utf-8') as file:
            raw_tweet = file.read()

        # Run the cleaning pipeline: contraction expansion, tokenization,
        # stop-word removal, and lemmatization (see clean_utilities)
        print("Linguistic cleaning in progress...")
        sanitized_tweet = CU.tweets_cleaner(raw_tweet)

        # Persist the sanitized result next to the script
        with open('clean_tweet.txt', 'w', encoding='utf-8') as output_file:
            print("Sanitization complete. Persistence target: clean_tweet.txt")
            output_file.write(sanitized_tweet)

    except FileNotFoundError:
        print(f"Error: The specified file '{args.filename}' was not discovered.")
    except Exception as e:
        # Catch-all boundary for a CLI tool: report instead of crashing
        print(f"An unexpected analytical error occurred: {e}")


if __name__ == '__main__':
    main()
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
|
source_code/core/clean_utilities.py
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==============================================================================
|
| 2 |
+
# PROJECT: DEPRESSION-DETECTION-USING-TWEETS
|
| 3 |
+
# AUTHORS: AMEY THAKUR & MEGA SATISH
|
| 4 |
+
# GITHUB (AMEY): https://github.com/Amey-Thakur
|
| 5 |
+
# GITHUB (MEGA): https://github.com/msatmod
|
| 6 |
+
# REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
|
| 7 |
+
# RELEASE DATE: June 5, 2022
|
| 8 |
+
# LICENSE: MIT License
|
| 9 |
+
# DESCRIPTION: Core NLP logic for cleaning and normalizing tweet text.
|
| 10 |
+
# ==============================================================================
|
| 11 |
+
|
| 12 |
+
import re
|
| 13 |
+
import warnings
|
| 14 |
+
import nltk
|
| 15 |
+
import ftfy
|
| 16 |
+
from nltk.stem import WordNetLemmatizer
|
| 17 |
+
from nltk.corpus import stopwords
|
| 18 |
+
|
| 19 |
+
# Suppression of non-critical warnings to ensure a streamlined algorithmic log
|
| 20 |
+
warnings.filterwarnings("ignore")
|
| 21 |
+
|
| 22 |
+
# Dictionary of standard English contractions for lexical expansion
|
| 23 |
+
# This facilitates uniform tokenization by resolving ambiguous shorthand
|
| 24 |
+
# Mapping of English contractions to their expanded forms.
# Fixes over the original table:
#   * "you'll" / "you'll've" previously expanded to "you you will ..."
#     (duplicated word) — corrected to "you will ...".
#   * Lowercase variants of the first-person keys ("i'd", "i'm", ...) are
#     added because the cleaning pipeline lowercases text BEFORE expansion,
#     so the uppercase "I'd"-style keys could never match. The original
#     uppercase keys are kept for backward compatibility.
CONTRACTIONS_LIST = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it had",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'alls": "you alls",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}
|
| 144 |
+
|
| 145 |
+
# Pre-compiled regular expression for efficient contraction matching.
# Alternatives are sorted longest-first so that a longer key such as
# "can't've" is preferred over its prefix "can't" (Python's regex engine
# takes the first alternative that matches, not the longest one).
# Keys are escaped defensively; for the current table this is a no-op.
CONTRACTIONS_RE = re.compile('(%s)' % '|'.join(
    sorted(map(re.escape, CONTRACTIONS_LIST.keys()), key=len, reverse=True)
))
|
| 147 |
+
|
| 148 |
+
def expand_contractions(text: str, contractions_re=CONTRACTIONS_RE) -> str:
    """
    Replace every known English contraction in *text* with its long form.

    Args:
        text (str): Input string that may contain contractions.
        contractions_re: Compiled pattern whose alternatives are exactly
            the keys of ``CONTRACTIONS_LIST``.

    Returns:
        str: The text with each matched contraction expanded.
    """
    # The matched substring is guaranteed to be a dictionary key, since the
    # pattern is built from the dictionary's own keys.
    return contractions_re.sub(
        lambda hit: CONTRACTIONS_LIST[hit.group(0)],
        text,
    )
|
| 163 |
+
|
| 164 |
+
def tweets_cleaner(tweet: str) -> str:
    """
    Linguistically normalize a raw tweet for downstream vectorization.

    Pipeline:
        1. Case normalization (lowercasing).
        2. Relevance check: tweets that start with a URL skip artifact removal.
        3. Noise reduction: strip @mentions, #hashtags, ``<Emoji:...>``
           markers, and ``pic.twitter.com`` links.
        4. Encoding repair of malformed Unicode sequences (via ftfy).
        5. Contraction expansion for token uniformity.
        6. Punctuation / non-alphanumeric removal.
        7. Stop-word filtration and WordNet lemmatization.
        8. Re-assembly into a single whitespace-normalized string.

    Args:
        tweet (str): Raw input tweet captured from the platform.

    Returns:
        str: Sanitized and normalized string ready for vectorization.
    """
    # Phase 1: case uniformity
    tweet = tweet.lower()

    # Phase 2/3: strip Twitter-specific artifacts unless the tweet begins
    # with a URL. Raw strings are used for every regex below: the original
    # patterns relied on invalid escape sequences (\w, \/, \#) inside plain
    # strings, which emit DeprecationWarning/SyntaxWarning on modern Python.
    # NOTE(review): the rendered source makes the guard's scope ambiguous;
    # this reading applies the guard to artifact removal only — confirm
    # against the original repository if pure-URL tweets must be dropped.
    if re.match(r"(\w+:\/\/\S+)", tweet) is None:
        tweet = ' '.join(
            re.sub(
                r"(@[A-Za-z0-9]+)|(\#[A-Za-z0-9]+)|(<Emoji:.*>)|(pic\.twitter\.com\/.*)",
                " ",
                tweet
            ).split()
        )

    # Phase 4: resolution of malformed character encodings (mojibake)
    tweet = ftfy.fix_text(tweet)

    # Phase 5: contraction expansion for token consistency
    tweet = expand_contractions(tweet)

    # Phase 6: punctuation and non-essential character pruning
    tweet = ' '.join(re.sub(r"([^0-9A-Za-z \t])", " ", tweet).split())

    # Phase 7: stop-word filtration and lemmatization — reduce inflectional
    # forms to a common base word (lemma)
    stop_words_set = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(tweet)

    lemmatizer_engine = WordNetLemmatizer()
    filtered_lexicon = [
        lemmatizer_engine.lemmatize(word)
        for word in tokens
        if word not in stop_words_set
    ]

    # Phase 8: re-assembly of the normalized semantic string
    tweet = ' '.join(filtered_lexicon)

    return tweet
|
| 226 |
+
|
source_code/core/predict.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==============================================================================
|
| 2 |
+
# PROJECT: DEPRESSION-DETECTION-USING-TWEETS
|
| 3 |
+
# AUTHORS: AMEY THAKUR & MEGA SATISH
|
| 4 |
+
# GITHUB (AMEY): https://github.com/Amey-Thakur
|
| 5 |
+
# GITHUB (MEGA): https://github.com/msatmod
|
| 6 |
+
# REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
|
| 7 |
+
# RELEASE DATE: June 5, 2022
|
| 8 |
+
# LICENSE: MIT License
|
| 9 |
+
# DESCRIPTION: Utility for predicting depression levels in tweets using SVM.
|
| 10 |
+
# ==============================================================================
|
| 11 |
+
|
| 12 |
+
import argparse
|
| 13 |
+
import pickle
|
| 14 |
+
import warnings
|
| 15 |
+
import numpy as np
|
| 16 |
+
import pandas as pd
|
| 17 |
+
import spacy
|
| 18 |
+
import en_core_web_lg
|
| 19 |
+
import clean_utilities as CU
|
| 20 |
+
|
| 21 |
+
# Suppression of non-critical runtime warnings to maintain output integrity
|
| 22 |
+
warnings.filterwarnings("ignore")
|
| 23 |
+
|
| 24 |
+
def _document_centroid(nlp_engine, text):
    """Return the 300-d mean token vector of *text* (zero vector if no tokens).

    A tweet that was fully stripped during cleaning yields no tokens; taking
    the mean of an empty array would raise / produce NaN, so fall back to the
    zero vector instead.
    """
    token_vectors = [token.vector for token in nlp_engine(text)]
    if not token_vectors:
        return np.zeros(300)
    return np.mean(token_vectors, axis=0)


def main():
    """
    Main entry point for the prediction utility.

    End-to-end inference pipeline:
        1. Argument parsing: captures the input text file and model selection.
        2. Text preprocessing: normalization via clean_utilities.
        3. Feature extraction: centroid word embeddings via spaCy
           ('en_core_web_lg', 300 dimensions).
        4. Classification: binary prediction via the pre-trained SVM.
    """
    parser = argparse.ArgumentParser(
        description="Twitter Depression Detection: Machine Learning Inference Utility"
    )
    parser.add_argument(
        'filename',
        help="Path to the text file containing the tweet for classification"
    )
    parser.add_argument(
        'model',
        help="Target model architecture (currently optimized for 'SVM')"
    )
    args = parser.parse_args()

    # argparse guarantees both positionals are present, so only the model
    # name needs validating here (the old `args.filename is not None` test
    # was always true).
    if args.model != "SVM":
        print("Usage Error: Please provide an input file and specify 'SVM' as the target model.")
        return

    print(f"Loading input source: {args.filename}")
    try:
        # Step 1: Data acquisition
        with open(args.filename, 'r', encoding='utf-8') as file:
            raw_test_tweet = file.read()
        print(f"Captured Content: \"{raw_test_tweet}\"")

        # Step 2: Linguistic preprocessing via the shared cleaning pipeline
        print("Executing linguistic cleaning pipeline...")
        cleaned_input = [CU.tweets_cleaner(raw_test_tweet)]
        print(f"Normalized Form: {cleaned_input}")

        # Step 3: Feature-space transformation (dense word embeddings).
        # The previous `* np.ones((300))` broadcast was a no-op and is gone.
        print("Transforming text to 300-dimensional semantic vectors...")
        nlp_engine = en_core_web_lg.load()
        semantic_features = np.array(
            [_document_centroid(nlp_engine, s) for s in cleaned_input]
        )

        # Step 4: Model artifact loading (serialized SVM classifier)
        model_artifact_path = "../assets/models/model_svm1.pkl"
        with open(model_artifact_path, 'rb') as model_file:
            classifier = pickle.load(model_file)

        # Step 5: Algorithmic inference
        print("Performing binary classification...")
        prediction_bin = classifier.predict(semantic_features)

        # Step 6: Result interpretation and user communication
        if prediction_bin[0] == 1:
            print("\n>>> CLASSIFICATION RESULT: The analyzed content exhibits depressive characteristics.")
        else:
            print("\n>>> CLASSIFICATION RESULT: The analyzed content is classified as non-depressive.")

    except FileNotFoundError:
        print(f"Error: The input file {args.filename} could not be located.")
    except Exception as e:
        # Broad catch is acceptable at this top-level CLI boundary.
        print(f"An error occurred during the inference process: {e}")


if __name__ == '__main__':
    main()
|
| 109 |
+
|
| 110 |
+
|
source_code/core/train.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==============================================================================
|
| 2 |
+
# PROJECT: DEPRESSION-DETECTION-USING-TWEETS
|
| 3 |
+
# AUTHORS: AMEY THAKUR & MEGA SATISH
|
| 4 |
+
# GITHUB (AMEY): https://github.com/Amey-Thakur
|
| 5 |
+
# GITHUB (MEGA): https://github.com/msatmod
|
| 6 |
+
# REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
|
| 7 |
+
# RELEASE DATE: June 5, 2022
|
| 8 |
+
# LICENSE: MIT License
|
| 9 |
+
# DESCRIPTION: Script for training machine learning models for tweet analysis.
|
| 10 |
+
# ==============================================================================
|
| 11 |
+
|
| 12 |
+
import argparse
|
| 13 |
+
import warnings
|
| 14 |
+
import train_utilities as TU
|
| 15 |
+
|
| 16 |
+
# Suppression of non-critical runtime warnings to ensure output clarity during training
|
| 17 |
+
warnings.filterwarnings("ignore")
|
| 18 |
+
|
| 19 |
+
def main():
    """
    Primary execution routine for the model training utility.

    Pipeline:
        1. Dataset ingestion: loading and splitting the training data
           (delegated to train_utilities.load_prepare_split_df).
        2. Algorithmic training: executed via train_utilities.classification
           (scikit-learn models) or train_utilities.LSTM (deep learning).
        3. Model serialization: handled inside the training helpers.
    """
    parser = argparse.ArgumentParser(
        description="Twitter Depression Detection: Model Training Utility"
    )
    parser.add_argument(
        'filename',
        help="Path to the training dataset (TSV/CSV format with 'label' and 'clean_text')"
    )
    parser.add_argument(
        'model',
        help="Target model architecture for training"
    )
    args = parser.parse_args()

    model_type = args.model
    dataset_path = args.filename

    # Accept both user-facing spellings of k-nearest-neighbors.
    if model_type in ("DT", "LR", "kNN", "KNN", "SVM", "RF", "NN"):
        print(f"Initializing {model_type} training pipeline...")

        # Step 1: Data acquisition and validation splitting
        X_train, X_test, Y_train, Y_test = TU.load_prepare_split_df(dataset_path)

        # Step 2: Training. BUG FIX: TU.classification dispatches on the key
        # "KNN" (upper case); the previously-accepted "kNN" spelling fell
        # through every branch and silently trained nothing, so it is
        # normalized here before dispatch.
        dispatch_key = "KNN" if model_type == "kNN" else model_type
        TU.classification(X_train=X_train, Y_train=Y_train, model=dispatch_key)

        print(f"Training for {model_type} successful.")

    elif model_type == "LSTM":
        # Specialized pipeline: LSTMs capture long-range temporal
        # dependencies in the token sequences.
        print("Initializing LSTM deep learning pipeline...")
        TU.LSTM(dataset_path)

    else:
        print(f"Error: Model architecture '{model_type}' is not currently recognized.")
        print("Supported architectures: DT, LR, kNN, SVM, RF, NN, LSTM")


if __name__ == '__main__':
    main()
|
source_code/core/train_utilities.py
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==============================================================================
|
| 2 |
+
# PROJECT: DEPRESSION-DETECTION-USING-TWEETS
|
| 3 |
+
# AUTHORS: AMEY THAKUR & MEGA SATISH
|
| 4 |
+
# GITHUB (AMEY): https://github.com/Amey-Thakur
|
| 5 |
+
# GITHUB (MEGA): https://github.com/msatmod
|
| 6 |
+
# REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
|
| 7 |
+
# RELEASE DATE: June 5, 2022
|
| 8 |
+
# LICENSE: MIT License
|
| 9 |
+
# DESCRIPTION: Utility module for the model training pipeline.
|
| 10 |
+
# ==============================================================================
|
| 11 |
+
|
| 12 |
+
import pickle
|
| 13 |
+
import warnings
|
| 14 |
+
import numpy as np
|
| 15 |
+
import pandas as pd
|
| 16 |
+
import spacy
|
| 17 |
+
import en_core_web_lg
|
| 18 |
+
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
|
| 19 |
+
from sklearn.metrics import confusion_matrix, accuracy_score
|
| 20 |
+
from sklearn.linear_model import LogisticRegression
|
| 21 |
+
from sklearn.neighbors import KNeighborsClassifier
|
| 22 |
+
from sklearn.svm import SVC
|
| 23 |
+
from sklearn.tree import DecisionTreeClassifier
|
| 24 |
+
from sklearn.neural_network import MLPClassifier
|
| 25 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 26 |
+
|
| 27 |
+
# Suppression of non-critical runtime warnings to maintain algorithmic output integrity
|
| 28 |
+
warnings.filterwarnings("ignore")
|
| 29 |
+
|
| 30 |
+
def load_prepare_split_df(filename: str, targets=None, validation_size=0.3, seed=7):
    """
    Ingest raw data, extract features via spaCy word embeddings, and
    partition the dataset into training/validation subsets.

    Methodology:
        - TSV ingestion: data is loaded from the specified file.
        - Semantic vectorization: 300-dimensional spaCy embeddings
          (centroid of the token vectors of each document).
        - Validation partitioning: random (non-stratified) train/test split.

    Args:
        filename (str): Path to the tab-separated dataset; must contain a
            'clean_text' column plus the target column(s).
        targets (list | None): Target column name(s). Defaults to ['label'].
        validation_size (float): Proportion of data reserved for testing.
        seed (int): Random seed for reproducible splits.

    Returns:
        tuple: (X_train, X_test, Y_train, Y_test).
    """
    # Avoid a mutable default argument; the effective default stays ['label'].
    if targets is None:
        targets = ['label']

    # BUG FIX: the progress message previously printed the literal text
    # "(unknown)" instead of the dataset path.
    print(f"Acquiring dataset from: {filename}")
    df_all = pd.read_csv(filename, sep='\t', encoding='utf-8')

    # Step 1: Initialize the linguistic engine (300-d token vectors).
    nlp_engine = en_core_web_lg.load()

    # Step 2: Compute dense centroid embeddings per document.
    # The former `* np.ones((300))` broadcast was a no-op and is removed;
    # documents with no tokens fall back to the zero vector instead of
    # crashing on a mean over an empty array.
    print("Extracting semantic features via spaCy embeddings...")
    zero_vector = np.zeros(300)
    feature_rows = []
    for text in df_all['clean_text']:
        token_vectors = [token.vector for token in nlp_engine(text)]
        feature_rows.append(
            np.mean(token_vectors, axis=0) if token_vectors else zero_vector
        )
    feature_vectors = np.array(feature_rows)

    # Step 3: Dataset splitting
    y_labels = df_all.loc[:, targets]
    return train_test_split(
        feature_vectors, y_labels, test_size=validation_size, random_state=seed
    )
|
| 73 |
+
|
| 74 |
+
def _persist_model(model_obj, save_path):
    """Serialize a trained model artifact to *save_path* via pickle."""
    with open(save_path, 'wb') as file:
        pickle.dump(model_obj, file)


def classification(X_train, Y_train, model=""):
    """
    Train and serialize one of several classification architectures.

    Architectures supported:
        - SVM: Support Vector Machine (the production primary).
        - LR:  Logistic Regression.
        - DT:  Decision Tree Classifier.
        - KNN (or kNN): k-Nearest Neighbors with automated k-optimization.
        - RF:  Random Forest Classifier.
        - NN:  Multi-layer Perceptron (MLP) Neural Network.

    Args:
        X_train: Training feature matrix.
        Y_train: Training labels.
        model (str): Target architecture identifier.

    Returns:
        object: The trained scikit-learn model instance, or None for an
        unrecognized identifier (a message is printed in that case).
    """
    if model == "SVM":
        # SVMs are effective in high-dimensional semantic spaces.
        print("Initializing SVM (Support Vector Machine) training...")
        clf = SVC(probability=True)
        clf.fit(X_train, Y_train)

        # Performance evaluation on the training set (accuracy metric).
        train_accuracy = accuracy_score(Y_train, clf.predict(X_train))
        print(f"Training Convergence Accuracy: {train_accuracy:.4f}")

        _persist_model(clf, "../assets/models/model_svm_pc.pkl")
        return clf

    elif model == "LR":
        # Logistic Regression: a robust linear baseline.
        print("Initializing Logistic Regression training...")
        lr_model = LogisticRegression()
        lr_model.fit(X_train, Y_train)

        _persist_model(lr_model, "../assets/models/model_LogReg.pkl")
        return lr_model

    elif model == "DT":
        # Decision Trees provide hierarchical decision boundaries.
        print("Initializing Decision Tree training...")
        dt_model = DecisionTreeClassifier()
        dt_model.fit(X_train, Y_train)

        _persist_model(dt_model, "../assets/models/model_DTC.pkl")
        return dt_model

    elif model in ("KNN", "kNN"):
        # CONSISTENCY FIX: train.py advertises "kNN" while this function
        # previously matched only "KNN"; both spellings are now accepted.
        print("Initializing kNN training with automated k-optimization...")
        k_values = range(1, 32)
        k_scores = []

        # 10-fold cross-validation to select the optimal neighbor count.
        for k in k_values:
            knn = KNeighborsClassifier(n_neighbors=k)
            k_scores.append(np.mean(cross_val_score(knn, X_train, Y_train, cv=10)))

        optimal_k = k_values[np.argmax(k_scores)]
        print(f"Optimized Hyperparameter discovered: k = {optimal_k}")

        best_knn = KNeighborsClassifier(n_neighbors=optimal_k)
        best_knn.fit(X_train, Y_train)

        _persist_model(best_knn, "../assets/models/model_KNN.pkl")
        return best_knn

    elif model == "RF":
        # Random Forest: bagged decision trees for variance reduction.
        print("Initializing Random Forest training...")
        rf_model = RandomForestClassifier()
        rf_model.fit(X_train, Y_train)

        _persist_model(rf_model, "../assets/models/model_RF.pkl")
        return rf_model

    elif model == "NN":
        # MLP (Multi-layer Perceptron): basic artificial neural network.
        print("Initializing Neural Network (MLP) training...")
        nn_model = MLPClassifier()
        nn_model.fit(X_train, Y_train)

        _persist_model(nn_model, "../assets/models/model_NN.pkl")
        return nn_model

    # Previously an unknown identifier fell off the end and returned None
    # silently; keep the None return for compatibility but say so.
    print(f"Error: Model architecture '{model}' is not recognized; nothing was trained.")
    return None
|
| 177 |
+
|
| 178 |
+
def LSTM(filename: str):
    """
    Execute a deep-learning pipeline using a Long Short-Term Memory (LSTM)
    recurrent neural network for capturing temporal linguistic patterns.

    Methodology:
        - Tokenization: integer encoding of the text sequences.
        - Padding: uniform sequence-length normalization (max_len tokens).
        - Architecture: Embedding layer followed by an LSTM with dropouts
          and a sigmoid output for binary classification.

    Args:
        filename (str): Path to the tab-separated dataset; must contain
            'clean_text' and 'label' columns.
    """
    # Keras imports kept function-local so importing this module does not
    # require TensorFlow. The unused KerasClassifier import was removed.
    from keras.models import Sequential
    from keras.layers import Dense, Embedding, LSTM
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences

    # BUG FIX: the progress message previously printed the literal text
    # "(unknown)" instead of the dataset path.
    print(f"Acquiring data for Deep Learning (LSTM): {filename}")
    df_dl = pd.read_csv(filename, sep='\t', encoding='utf-8')

    # Step 1: Sequence tokenization and padding
    vocab_size = 20000
    max_len = 50
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(df_dl['clean_text'])
    seqs = tokenizer.texts_to_sequences(df_dl['clean_text'])
    x_lstm = pad_sequences(seqs, maxlen=max_len)
    y_lstm = df_dl["label"]

    # Step 2: Architecture definition
    print("Constructing LSTM topology...")
    model = Sequential()
    model.add(Embedding(vocab_size, 300, input_length=max_len))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Step 3: Model execution and persistence
    print("Commencing Deep Learning Convergence (LSTM)...")
    model.fit(x_lstm, y_lstm, epochs=3, verbose=1, validation_split=0.3)

    # Persistence: JSON topology and H5 weights
    with open("model_LSTM.json", "w") as json_file:
        json_file.write(model.to_json())
    model.save_weights("model_LSTM.h5")
    print("Deep Learning model (LSTM) artifacts successfully persisted.")
|
| 226 |
+
|
source_code/notebooks/data_cleaning_exploration.py
ADDED
|
@@ -0,0 +1,505 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""data_cleaning_exploration.ipynb
|
| 3 |
+
|
| 4 |
+
Automatically generated by Colaboratory.
|
| 5 |
+
|
| 6 |
+
Original file is located at
|
| 7 |
+
https://colab.research.google.com/drive/1jU6I53BYSZ2kX-qcxcWP_1bPNYWvB24f
|
| 8 |
+
|
| 9 |
+
# Data Cleaning
|
| 10 |
+
|
| 11 |
+
Data cleaning is the process of detecting and removing errors and inconsistencies from the data to improve its quality. Improper data cleaning process can lead to errors, faulty analysis, distortion in dataset and eventually incompatible datasets for machine learning purposes. There is no absolute way to prescribe the exact steps in the data cleaning process because the processes will vary from dataset to dataset. My data cleaning process includes:
|
| 12 |
+
|
| 13 |
+
* Check the data types
|
| 14 |
+
* Check for duplicates - Primary key ('tweets.id')
|
| 15 |
+
* Check missing values
|
| 16 |
+
* Make text all lower case
|
| 17 |
+
* Remove links and images
|
| 18 |
+
* Remove hashtags
|
| 19 |
+
* Remove @ mentions
|
| 20 |
+
* Remove emojis
|
| 21 |
+
* Remove stop words
|
| 22 |
+
* Remove punctuation
|
| 23 |
+
* Expand contractions, e.g. turning "what's" into "what is"
|
| 24 |
+
* Stemming / lemmatization
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
from google.colab import drive
|
| 28 |
+
drive.mount('/content/drive')
|
| 29 |
+
|
| 30 |
+
!pip install -qqq ftfy
|
| 31 |
+
|
| 32 |
+
## Import required libraries
|
| 33 |
+
|
| 34 |
+
## warnings
|
| 35 |
+
import warnings
|
| 36 |
+
warnings.filterwarnings("ignore")
|
| 37 |
+
|
| 38 |
+
## for data
|
| 39 |
+
import numpy as np
|
| 40 |
+
import pandas as pd
|
| 41 |
+
|
| 42 |
+
## for plotting
|
| 43 |
+
import matplotlib.pyplot as plt
|
| 44 |
+
import seaborn as sns
|
| 45 |
+
|
| 46 |
+
## for processing
|
| 47 |
+
import nltk
|
| 48 |
+
import re
|
| 49 |
+
import ftfy
|
| 50 |
+
from nltk.stem import WordNetLemmatizer
|
| 51 |
+
from nltk.corpus import stopwords
|
| 52 |
+
nltk.download('stopwords')
|
| 53 |
+
nltk.download('punkt')
|
| 54 |
+
nltk.download('wordnet')
|
| 55 |
+
nltk.download('averaged_perceptron_tagger')
|
| 56 |
+
|
| 57 |
+
## for opening, manipulating, and saving many different image file formats
|
| 58 |
+
from PIL import Image
|
| 59 |
+
|
| 60 |
+
## WordCloud - Python library for creating image wordclouds
|
| 61 |
+
from wordcloud import WordCloud
|
| 62 |
+
from nltk import pos_tag ## For Parts of Speech tagging
|
| 63 |
+
import random ## generating random numbers
|
| 64 |
+
|
| 65 |
+
"""## Load the datasets"""
|
| 66 |
+
|
| 67 |
+
## Load the raw datasets: scraped depressive tweets plus a public sentiment dataset
depressive_tweets_df = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/output/depressive_tweets.csv")
random_tweets_df = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Depression_tweets_Data/Data1/Sentiment Analysis Dataset 2.csv",
                               encoding="ISO-8859-1", usecols=range(0, 4), nrows=40000)

depressive_tweets_df

random_tweets_df

## Slicing the random tweets to have sentiment == 1
new_rand_df = random_tweets_df[random_tweets_df.Sentiment == 1]
new_rand_df.reset_index(inplace=True)

new_rand_df.shape

new_rand_df.head()

"""20952 random tweets with sentiment == 1

## Data Cleaning-Processing:
"""

print(depressive_tweets_df.shape)
print(new_rand_df.shape)

## Check the data type of each column
depressive_tweets_df.dtypes.to_frame().rename(columns={0: 'data_type'})

## Check the data type of each column
new_rand_df.dtypes.to_frame().rename(columns={0: 'data_type'})

## Drop unnecessary columns
depressive_tweets_df.drop(columns=['Unnamed: 0'], inplace=True)
new_rand_df.drop(columns=['ItemID', 'index', 'Sentiment', 'SentimentSource'], inplace=True)

"""Since we are mostly dealing with the tweets in our datasets, it is not necessary to change the data types at this stage."""

## Finding unique values in each column
for col in depressive_tweets_df:
    print("There are ", len(depressive_tweets_df[col].unique()), "unique values in ", col)

"""By considering **tweet.id** as our primary key, we have **18190** unique tweets, so we need to get rid of the duplicates."""

## Finding unique values in each column
for col in new_rand_df:
    print("There are ", len(new_rand_df[col].unique()), "unique values in ", col)

"""No duplicates in random tweets dataset"""

## drop duplicate values in tweet.id (our primary key)
depressive_tweets_df.drop_duplicates(subset=['tweet.id'], inplace=True)

depressive_tweets_df.reset_index(inplace=True)

depressive_tweets_df.shape

## Find the number of Null values in each columns
depressive_tweets_df.isnull().sum().to_frame().rename(columns={0: 'Null values'})

"""There are **6384** Null values in the **location** columns but since location will not be used in our analysis or as a feature in our model, we don't need to replace them."""

## Find the number of Null values in each columns
new_rand_df.isnull().sum().to_frame().rename(columns={0: 'Null values'})

"""No Null values in random tweets dataset."""

## Drop all the columns except index, tweet.id and text
new_dep_df = depressive_tweets_df[['text']]

## Add label to both datasets (0 is non-depressive and 1 is depressive)
new_dep_df['label'] = pd.Series([1] * len(new_dep_df.index))
new_rand_df['label'] = pd.Series([0] * len(new_rand_df.index))

new_dep_df

## Change the column name to be aligned with depressive dataset
new_rand_df.rename(columns={'SentimentText': 'text'}, inplace=True)

new_rand_df

## Combine two dataframes together
df_all = pd.concat([new_dep_df, new_rand_df], ignore_index=True)

df_all
|
| 150 |
+
|
| 151 |
+
# Expand Contraction
#
# FIXES over the original:
#  * "you'll" / "you'll've" mapped to "you you will (have)" — duplicated word removed.
#  * The alternation regex is now built longest-key-first so e.g. "can't've"
#    expands as a whole instead of its prefix "can't" matching first and
#    leaving a dangling "'ve".
#  * Keys are re.escape()d and matching is case-insensitive, so contractions
#    like "I'm" still expand after tweets have been lowercased upstream.
cList = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it had",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'alls": "you alls",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

# Longest-first so multi-part contractions win over their prefixes.
c_re = re.compile(
    '(%s)' % '|'.join(re.escape(k) for k in sorted(cList, key=len, reverse=True)),
    re.IGNORECASE)

# Lowercase-keyed view for case-insensitive replacement lookup.
_cList_lower = {k.lower(): v for k, v in cList.items()}


def expandContractions(text, c_re=c_re):
    """Return *text* with every known English contraction expanded.

    Matching is case-insensitive; unknown text is left untouched.
    """
    def replace(match):
        return _cList_lower[match.group(0).lower()]
    return c_re.sub(replace, text)
|
| 279 |
+
|
| 280 |
+
## Function to perform stepwise cleaning process
def tweets_cleaner(tweets):
    """Clean an iterable of raw tweet strings and return a list of cleaned ones.

    Pipeline per tweet: lowercase -> skip bare-URL or very short (<= 5 chars)
    tweets -> strip hashtags/@mentions/emoji markers/image URLs -> repair
    mojibake via ftfy -> expand contractions -> strip punctuation -> remove
    stop words and lemmatize.

    Note: tweets failing the URL/length filter are dropped entirely, so the
    output can be shorter than the input.
    """
    cleaned_tweets = []
    # PERF FIX: the stop-word set and the lemmatizer are loop invariants;
    # the original rebuilt both for every single tweet.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    for tweet in tweets:
        tweet = tweet.lower()  # lowercase

        # if url links then don't append to avoid news articles
        # also check tweet length, save those > 5
        if re.match(r"(\w+:\/\/\S+)", tweet) is None and len(tweet) > 5:

            # remove hashtag, @mention, emoji and image URLs
            tweet = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|(\#[A-Za-z0-9]+)|(<Emoji:.*>)|(pic\.twitter\.com\/.*)", " ", tweet).split())

            # fix weirdly encoded texts
            tweet = ftfy.fix_text(tweet)

            # expand contraction
            tweet = expandContractions(tweet)

            # remove punctuation
            tweet = ' '.join(re.sub(r"([^0-9A-Za-z \t])", " ", tweet).split())

            # stop words and lemmatization
            word_tokens = nltk.word_tokenize(tweet)
            filtered_sentence = [lemmatizer.lemmatize(word) for word in word_tokens
                                 if word not in stop_words]
            # back to string from list
            tweet = ' '.join(filtered_sentence)  # join words with a space in between them

            cleaned_tweets.append(tweet)

    return cleaned_tweets
|
| 315 |
+
|
| 316 |
+
"""## Word Cloud:
|
| 317 |
+
|
| 318 |
+
To get the most common words used in depressive and random datasets, the POS-tag (Parts of Speech tagging) module in the NLTK library was used. Using the WordCloud library, one can generate a Word Cloud based on word frequency and superimpose these words on any image. In this case, I used the Twitter logo and Matplotlib to display the image. The Word Cloud shows the words with higher frequency in bigger text size while the "not-so" common words are in smaller text sizes.
|
| 319 |
+
"""
|
| 320 |
+
|
| 321 |
+
## Materialize the tweet columns as plain lists, then run the cleaner on each.
depressive_tweets_arr = list(new_dep_df['text'])
random_tweets_arr = list(new_rand_df['text'])
X_d = tweets_cleaner(depressive_tweets_arr)
X_r = tweets_cleaner(random_tweets_arr)
|
| 325 |
+
|
| 326 |
+
## function to obtain adjectives from tweets
def getadjectives(tweet):
    """Return only the adjectives of *tweet*, joined by single spaces."""
    tokens = nltk.word_tokenize(tweet)  # convert string to tokens
    # Keep tokens whose POS tag is "JJ" (adjective) according to NLTK's tagger.
    adjectives = (word for (word, tag) in pos_tag(tokens) if tag == "JJ")
    return " ".join(adjectives)  # join words with a space in between them
|
| 332 |
+
|
| 333 |
+
"""### Depressive Tweets Exploration"""
|
| 334 |
+
|
| 335 |
+
## Apply getadjectives function to the processed depressive tweets and
## collapse everything into one long space-separated string for the word cloud.
tweets_long_string = " ".join(getadjectives(x) for x in X_d)

# Import Twitter Logo to use as the word-cloud mask
image = np.array(Image.open('/content/drive/MyDrive/NLP/Depression_Detection/data_cleaning/logo.jpeg'))

fig = plt.figure()    # Instantiate the figure object
fig.set_figwidth(14)  # set width
fig.set_figheight(18)  # set height

plt.imshow(image, cmap=plt.cm.gray, interpolation='bilinear')  # Display data as an image
plt.axis('off')  # Remove axis
plt.show()       # Display image
| 350 |
+
|
| 351 |
+
## Colour callback for WordCloud: a random mid-to-light blue per word.
def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return an HSL colour string with hue 210 and random lightness in 50-70%."""
    lightness = random.randint(50, 70)
    return "hsl(210, 100%%, %d%%)" % lightness
|
| 355 |
+
|
| 356 |
+
## Instantiate the Twitter word cloud object
twitter_wc = WordCloud(background_color='white', max_words=1500, mask=image)

## generate the word cloud
twitter_wc.generate(tweets_long_string)

## display the word cloud
fig = plt.figure()
fig.set_figwidth(14)   # set width
fig.set_figheight(18)  # set height

plt.imshow(twitter_wc.recolor(color_func=blue_color_func, random_state=3),
           interpolation="bilinear")
plt.axis('off')
plt.show()

twitter_wc.to_file("/content/drive/MyDrive/NLP/Depression_Detection/data_cleaning/wordcloud.png")  # save to a png file

"""**Analyzing Top Words in the Word Cloud for depressive dataset**"""

## Flatten every adjective from the depressive tweets into a single list
tweets_long_string = [getadjectives(x) for x in X_d]
tweets_list = []
for item in tweets_long_string:
    for token in item.split():
        tweets_list.append(token)

# Use the Built-in Python Collections module to determine Word frequency
from collections import Counter
counts = Counter(tweets_list)
df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
df.columns = ['Words', 'Count']
df.sort_values(by='Count', ascending=False, inplace=True)

df.head(10)  # Check dataframe first 10 rows
|
| 392 |
+
|
| 393 |
+
"""### Random Tweets Exploration
|
| 394 |
+
|
| 395 |
+
"""
|
| 396 |
+
|
| 397 |
+
## Apply getadjectives function to the processed random tweets and collapse
## everything into one long space-separated string for the word cloud.
tweets_long_string_rand = " ".join(getadjectives(x) for x in X_r)

# Import Twitter Logo to use as the word-cloud mask
image = np.array(Image.open('/content/drive/MyDrive/NLP/Depression_Detection/data_cleaning/logo.jpeg'))

fig = plt.figure()    # Instantiate the figure object
fig.set_figwidth(14)  # set width
fig.set_figheight(18)  # set height

plt.imshow(image, cmap=plt.cm.gray, interpolation='bilinear')  # Display data as an image
plt.axis('off')  # Remove axis
plt.show()       # Display image
|
| 412 |
+
|
| 413 |
+
## Colour callback for WordCloud (redefinition, identical to the one above):
## a random mid-to-light blue per word.
def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return an HSL colour string with hue 210 and random lightness in 50-70%."""
    lightness = random.randint(50, 70)
    return "hsl(210, 100%%, %d%%)" % lightness
|
| 417 |
+
|
| 418 |
+
## Instantiate the Twitter word cloud object
twitter_wc = WordCloud(background_color='white', max_words=1500, mask=image)

## generate the word cloud
twitter_wc.generate(tweets_long_string_rand)

## display the word cloud
fig = plt.figure()
fig.set_figwidth(14)   # set width
fig.set_figheight(18)  # set height

plt.imshow(twitter_wc.recolor(color_func=blue_color_func, random_state=3),
           interpolation="bilinear")
plt.axis('off')
plt.show()

twitter_wc.to_file("/content/drive/MyDrive/NLP/Depression_Detection/data_cleaning/wordcloud_rand.png")  # save to a png file

"""**Analyzing Top Words in the Word Cloud for random dataset**"""

## Flatten every adjective from the random tweets into a single list
tweets_long_string_rand = [getadjectives(x) for x in X_r]
tweets_list_rand = []
for item in tweets_long_string_rand:
    for token in item.split():
        tweets_list_rand.append(token)

## Use the Built-in Python Collections module to determine Word frequency
from collections import Counter
counts = Counter(tweets_list_rand)
df_rand = pd.DataFrame.from_dict(counts, orient='index').reset_index()
df_rand.columns = ['Words', 'Count']
df_rand.sort_values(by='Count', ascending=False, inplace=True)

df_rand.head(10)  # Check dataframe first 10 rows
|
| 454 |
+
|
| 455 |
+
"""## Data Analysis:"""
|
| 456 |
+
|
| 457 |
+
## distribution of classes for prediction
def create_distribution(dataFile):
    """Draw and return a seaborn count plot of the 'label' column of *dataFile*."""
    ax = sns.countplot(x='label', data=dataFile, palette='hls')
    return ax

create_distribution(df_all)
|
| 462 |
+
|
| 463 |
+
"""Depreesive and random (Non-depressive) tweets are almost evenly distributed.
|
| 464 |
+
|
| 465 |
+
**Finding distribution of tweet lengths**
|
| 466 |
+
"""
|
| 467 |
+
|
| 468 |
+
## Distribution of tweet lengths in the depressive dataset
dep_line_lengths = [len(statement) for statement in new_dep_df['text']]
plt.plot(dep_line_lengths)
plt.show()

## Distribution of tweet lengths in the random dataset
rand_line_lengths = [len(statement) for statement in new_rand_df['text']]
# BUG FIX: the original re-plotted dep_line_lengths here, so the random
# dataset's length distribution was never actually displayed.
plt.plot(rand_line_lengths)
plt.show()
|
| 475 |
+
|
| 476 |
+
"""From the distributions above, it is clear that there is no outliers in our depressive and random datasets.
|
| 477 |
+
|
| 478 |
+
## Cleaning combined dataset and save it
|
| 479 |
+
"""
|
| 480 |
+
|
| 481 |
+
## Clean the combined dataset and persist it.
tweets_arr = list(df_all['text'])

corpus = tweets_cleaner(tweets_arr)

corpus[:10]

## Adding clean tweets as a new column
# NOTE(review): tweets_cleaner can drop tweets, so corpus may be shorter than
# df_all — this assignment assumes nothing was dropped; verify upstream.
df_all['clean_text'] = corpus

"""We have to remove those rows with tweets that has been completely deleted in the cleaning process."""

# replace field that's entirely space (or empty) with NaN
df_all.replace(r'^\s*$', np.nan, regex=True, inplace=True)

df_all[df_all['clean_text'].isnull()]

## Deleting the rows with nan
df_all.dropna(subset=['clean_text'], inplace=True)

## Double_check for nan
df_all[df_all['clean_text'].isnull()]

## Save cleaned_dataset
df_all.to_csv('/content/drive/MyDrive/NLP/Depression_Detection/data_cleaning/processed_data/processed_data.csv',
              sep='\t', encoding='utf-8', index=False)
|
source_code/notebooks/data_gathering_twint.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""Data_Gathering_Twint.ipynb
|
| 3 |
+
|
| 4 |
+
Automatically generated by Colaboratory.
|
| 5 |
+
|
| 6 |
+
Original file is located at
|
| 7 |
+
https://colab.research.google.com/drive/1zV6s2FhvQCmyNh0uyknfm47WATAOihbU
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from google.colab import drive
|
| 11 |
+
drive.mount('/content/drive')
|
| 12 |
+
|
| 13 |
+
!git clone https://github.com/twintproject/twint.git
|
| 14 |
+
|
| 15 |
+
import os
|
| 16 |
+
os.chdir("/content/twint")
|
| 17 |
+
|
| 18 |
+
!pip freeze > requirements.txt
|
| 19 |
+
|
| 20 |
+
!pip install .
|
| 21 |
+
|
| 22 |
+
!pip install -U git+https://github.com/cyxv/twint.git@master
|
| 23 |
+
|
| 24 |
+
!pip install nest_asyncio
|
| 25 |
+
|
| 26 |
+
!pip3 install twint
|
| 27 |
+
|
| 28 |
+
# Import required libraries
|
| 29 |
+
import nest_asyncio
|
| 30 |
+
nest_asyncio.apply()
|
| 31 |
+
import pandas as pd
|
| 32 |
+
import twint
|
| 33 |
+
import pandas as pd
|
| 34 |
+
import re
|
| 35 |
+
|
| 36 |
+
# add some tweets with depressed and depression tags, for a particular year (2019)

# BUG FIX: the original list was missing a comma after "#mentalhealth", so
# Python's implicit string concatenation fused it with "#loneliness" into the
# single, never-matching tag "#mentalhealth#loneliness" (7 tags instead of 8).
depress_tags = ["#depressed", "#anxiety", "#depression", "#suicide",
                "#mentalhealth", "#loneliness", "#hopelessness",
                "#itsokaynottobeokay"]

content = {}
for tag in depress_tags:
    print(tag)
    c = twint.Config()

    c.Format = "Tweet id: {id} | Tweet: {tweet}"
    c.Search = tag
    c.Limit = 1000
    c.Year = 2019
    c.Lang = "en"
    c.Store_csv = True
    c.Store_Object = True
    c.Output = "/content/drive/MyDrive/NLP/Depression_Detection/depressive_en_2019.csv"
    c.Hide_output = True
    c.Stats = True
    c.Lowercase = True
    c.Filter_retweets = True
    twint.run.Search(c)
|
| 59 |
+
|
| 60 |
+
# add some tweets with depressed and depression tags, for a particular year (2020)

depress_tags = ["#depressed", "#depression", "#loneliness", "#hopelessness"]

content = {}
for tag in depress_tags:
    print(tag)
    c = twint.Config()

    c.Format = "Tweet id: {id} | Tweet: {tweet}"
    c.Search = tag
    c.Limit = 1000
    c.Year = 2020
    c.Store_csv = True
    c.Store_json = True
    c.Output = "/content/drive/MyDrive/NLP/Depression_Detection/dataset_depression.json"
    c.Hide_output = True
    c.Stats = True
    c.Lowercase = True
    c.Filter_retweets = True
    twint.run.Search(c)
|
source_code/notebooks/data_gathering_twitter_API.py
ADDED
|
@@ -0,0 +1,388 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""Twitter_API.ipynb
|
| 3 |
+
|
| 4 |
+
Automatically generated by Colaboratory.
|
| 5 |
+
|
| 6 |
+
Original file is located at
|
| 7 |
+
https://colab.research.google.com/drive/1UAilj_PmxYbwHsc_s79d9UyBvawBVZAS
|
| 8 |
+
|
| 9 |
+
# Tweet mining using Twitter API via Tweepy:
|
| 10 |
+
|
| 11 |
+
In this notebook I am using Tweepy python library to tweets using relevant hashtags. I was able to retrieve around 19000 unique tweets via twitter API. At the end, all the datasets with different depressive hashtags will be combined, cleaned and saved as depressive_tweets.csv.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from google.colab import drive
|
| 15 |
+
drive.mount('/content/drive')
|
| 16 |
+
|
| 17 |
+
"""## Tweets mining"""
|
| 18 |
+
|
| 19 |
+
!pip install -qqq tweepy
|
| 20 |
+
|
| 21 |
+
## Import required libraries
|
| 22 |
+
import tweepy
|
| 23 |
+
from tweepy.streaming import StreamListener
|
| 24 |
+
from tweepy import OAuthHandler
|
| 25 |
+
from tweepy import Stream
|
| 26 |
+
import csv
|
| 27 |
+
import pandas as pd
|
| 28 |
+
|
| 29 |
+
## Access to twitter API consumer_key and access_secret
|
| 30 |
+
## Twitter API credentials live in a local config.py module (kept out of
## version control). BUG FIX: the import was commented out in the original
## ("#import config.ipynb"), so every config.* reference below raised
## NameError at runtime.
import config

## Twitter API related information
consumer_key = config.API_KEY
consumer_secret = config.API_KEY_SECRET
access_key = config.ACCESS_TOKEN
access_secret = config.ACCESS_TOKEN_SECRET

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)  # Pass in Consumer key and secret for authentication by API
auth.set_access_token(access_key, access_secret)  # Pass in Access key and secret for authentication by API
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)  # Sleeps when API limit is reached
|
| 41 |
+
|
| 42 |
+
## depress_tags = ["#depressed", "#anxiety", "#depression", "#suicide", "#mentalhealth"
|
| 43 |
+
## "#loneliness", "#hopelessness", "#itsokaynottobeokay", "#sad"]
|
| 44 |
+
|
| 45 |
+
"""## "#depressed""""
|
| 46 |
+
|
| 47 |
+
## Create a function for tweets mining
def tweets_mining1(search_query1, num_tweets1, since_id_num1):
    """Fetch up to *num_tweets1* tweets matching *search_query1* that are newer
    than tweet id *since_id_num1*, and append them to the #depressed CSV.

    Rows are written oldest-first so the newest tweet id ends up on the last
    line (the driver below reads that last id as the next since_id).
    """
    # Collect tweets using the Cursor object; each item exposes the tweet's
    # attributes (id, created_at, full_text, user, counts, ...).
    tweet_list1 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query1, lang="en", since_id=since_id_num1,
                                                      tweet_mode='extended').items(num_tweets1)]

    # PERF FIX: open the CSV once for the whole batch instead of re-opening
    # (and re-creating a writer for) it once per tweet as the original did.
    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv', 'a',
              newline='', encoding='utf-8') as csvFile1:
        csv_writer1 = csv.writer(csvFile1, delimiter=',')  # create an instance of csv object
        for tweet in tweet_list1[::-1]:
            csv_writer1.writerow([tweet.id, tweet.created_at, tweet.full_text,
                                  tweet.user.location, tweet.retweet_count, tweet.favorite_count])

search_words1 = "#depressed"  # Specifying exact phrase to search
# Exclude Links, retweets, replies
search_query1 = search_words1 + " -filter:links AND -filter:retweets AND -filter:replies"
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv', encoding='utf-8') as data:
    latest_tweet = int(list(csv.reader(data))[-1][0])
tweets_mining1(search_query1, 1000, latest_tweet)

df_depressed_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv",
                             names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"])

df_depressed_1

## Finding unique values in each column
for col in df_depressed_1:
    print("There are ", len(df_depressed_1[col].unique()), "unique values in ", col)
|
| 81 |
+
|
| 82 |
+
"""### Anxiety and suicide """
|
| 83 |
+
|
| 84 |
+
## Create a function for tweets mining
def tweets_mining2(search_query2, num_tweets2, since_id_num2):
    """Fetch up to *num_tweets2* tweets matching *search_query2* that are newer
    than tweet id *since_id_num2*, and append them to the #anxiety CSV.

    Rows are written oldest-first so the newest tweet id ends up on the last
    line (the driver below reads that last id as the next since_id).
    """
    # Collect tweets using the Cursor object; each item exposes the tweet's
    # attributes (id, created_at, full_text, user, counts, ...).
    tweet_list2 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query2, lang="en", since_id=since_id_num2,
                                                      tweet_mode='extended').items(num_tweets2)]

    # PERF FIX: open the CSV once for the whole batch instead of re-opening
    # (and re-creating a writer for) it once per tweet as the original did.
    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv', 'a',
              newline='', encoding='utf-8') as csvFile2:
        csv_writer2 = csv.writer(csvFile2, delimiter=',')  # create an instance of csv object
        for tweet in tweet_list2[::-1]:
            csv_writer2.writerow([tweet.id, tweet.created_at, tweet.full_text,
                                  tweet.user.location, tweet.retweet_count, tweet.favorite_count])

search_words2 = "#anxiety"  # Specifying exact phrase to search
# Exclude Links, retweets, replies
search_query2 = search_words2 + " -filter:links AND -filter:retweets AND -filter:replies"
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv', encoding='utf-8') as data:
    latest_tweet = int(list(csv.reader(data))[-1][0])
tweets_mining2(search_query2, 2000, latest_tweet)

df_anxiety_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv",
                           names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"])

df_anxiety_1

## Finding unique values in each column
for col in df_anxiety_1:
    print("There are ", len(df_anxiety_1[col].unique()), "unique values in ", col)
|
| 118 |
+
|
| 119 |
+
"""## "#Suicide""""
|
| 120 |
+
|
| 121 |
+
## Create a function for tweets mining
def tweets_mining3(search_query3, num_tweets3, since_id_num3):
    """Fetch up to *num_tweets3* tweets matching *search_query3* that are newer
    than tweet id *since_id_num3*, and append them to the #suicide CSV.

    Rows are written oldest-first so the newest tweet id ends up on the last
    line (the driver below reads that last id as the next since_id).
    """
    # Collect tweets using the Cursor object; each item exposes the tweet's
    # attributes (id, created_at, full_text, user, counts, ...).
    tweet_list3 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query3, lang="en", since_id=since_id_num3,
                                                      tweet_mode='extended').items(num_tweets3)]

    # PERF FIX: open the CSV once for the whole batch instead of re-opening
    # (and re-creating a writer for) it once per tweet as the original did.
    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv', 'a',
              newline='', encoding='utf-8') as csvFile3:
        csv_writer3 = csv.writer(csvFile3, delimiter=',')  # create an instance of csv object
        for tweet in tweet_list3[::-1]:
            csv_writer3.writerow([tweet.id, tweet.created_at, tweet.full_text,
                                  tweet.user.location, tweet.retweet_count, tweet.favorite_count])

search_words3 = "#suicide"  # Specifying exact phrase to search
# Exclude Links, retweets, replies
search_query3 = search_words3 + " -filter:links AND -filter:retweets AND -filter:replies"
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv', encoding='utf-8') as data:
    latest_tweet = int(list(csv.reader(data))[-1][0])
tweets_mining3(search_query3, 10000, latest_tweet)

df_suicide_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv",
                           names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"])
|
| 149 |
+
|
| 150 |
+
df_suicide_1
|
| 151 |
+
|
| 152 |
+
"""## "#hopelessness""""
|
| 153 |
+
|
| 154 |
+
## Create a function for tweets mining
|
| 155 |
+
def tweets_mining4(search_query4, num_tweets4, since_id_num4):
|
| 156 |
+
# Collect tweets using the Cursor object
|
| 157 |
+
# Each item in the iterator has various attributes that you can access to get information about each tweet
|
| 158 |
+
tweet_list4 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query4, lang="en", since_id=since_id_num4,
|
| 159 |
+
tweet_mode='extended').items(num_tweets4)]
|
| 160 |
+
|
| 161 |
+
# Begin scraping the tweets individually:
|
| 162 |
+
for tweet in tweet_list4[::-1]:
|
| 163 |
+
tweet_id = tweet.id # get Tweet ID result
|
| 164 |
+
created_at = tweet.created_at # get time tweet was created
|
| 165 |
+
text = tweet.full_text # retrieve full tweet text
|
| 166 |
+
location = tweet.user.location # retrieve user location
|
| 167 |
+
retweet = tweet.retweet_count # retrieve number of retweets
|
| 168 |
+
favorite = tweet.favorite_count # retrieve number of likes
|
| 169 |
+
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv','a', newline='', encoding='utf-8') as csvFile4:
|
| 170 |
+
csv_writer4 = csv.writer(csvFile4, delimiter=',') # create an instance of csv object
|
| 171 |
+
csv_writer4.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row
|
| 172 |
+
|
| 173 |
+
search_words4 = "#hopelessness" # Specifying exact phrase to search
|
| 174 |
+
# Exclude Links, retweets, replies
|
| 175 |
+
search_query4 = search_words4 + " -filter:links AND -filter:retweets AND -filter:replies"
|
| 176 |
+
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv', encoding='utf-8') as data:
|
| 177 |
+
latest_tweet = int(list(csv.reader(data))[-1][0])
|
| 178 |
+
tweets_mining4(search_query4, 10000, latest_tweet)
|
| 179 |
+
|
| 180 |
+
df_hopeless_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv",
|
| 181 |
+
names=['tweet.id', "created_at","text", "location", "retweet", "favorite"])
|
| 182 |
+
|
| 183 |
+
df_hopeless_1
|
| 184 |
+
|
| 185 |
+
"""## "#mentalhealth""""
|
| 186 |
+
|
| 187 |
+
## Create a function for tweets mining
|
| 188 |
+
def tweets_mining5(search_query5, num_tweets5, since_id_num5):
|
| 189 |
+
# Collect tweets using the Cursor object
|
| 190 |
+
# Each item in the iterator has various attributes that you can access to get information about each tweet
|
| 191 |
+
tweet_list5 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query5, lang="en", since_id=since_id_num5,
|
| 192 |
+
tweet_mode='extended').items(num_tweets5)]
|
| 193 |
+
|
| 194 |
+
# Begin scraping the tweets individually:
|
| 195 |
+
for tweet in tweet_list5[::-1]:
|
| 196 |
+
tweet_id = tweet.id # get Tweet ID result
|
| 197 |
+
created_at = tweet.created_at # get time tweet was created
|
| 198 |
+
text = tweet.full_text # retrieve full tweet text
|
| 199 |
+
location = tweet.user.location # retrieve user location
|
| 200 |
+
retweet = tweet.retweet_count # retrieve number of retweets
|
| 201 |
+
favorite = tweet.favorite_count # retrieve number of likes
|
| 202 |
+
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv','a', newline='', encoding='utf-8') as csvFile5:
|
| 203 |
+
csv_writer5 = csv.writer(csvFile5, delimiter=',') # create an instance of csv object
|
| 204 |
+
csv_writer5.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row
|
| 205 |
+
|
| 206 |
+
search_words5 = "#mentalhealth" # Specifying exact phrase to search
|
| 207 |
+
# Exclude Links, retweets, replies
|
| 208 |
+
search_query5 = search_words5 + " -filter:links AND -filter:retweets AND -filter:replies"
|
| 209 |
+
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv', encoding='utf-8') as data:
|
| 210 |
+
latest_tweet = int(list(csv.reader(data))[-1][0])
|
| 211 |
+
tweets_mining5(search_query5, 1000, latest_tweet)
|
| 212 |
+
|
| 213 |
+
df_mentalhealth_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv",
|
| 214 |
+
names=['tweet.id', "created_at","text", "location", "retweet", "favorite"])
|
| 215 |
+
|
| 216 |
+
df_mentalhealth_1
|
| 217 |
+
|
| 218 |
+
"""## "#loneliness""""
|
| 219 |
+
|
| 220 |
+
## Create a function for tweets mining
|
| 221 |
+
def tweets_mining6(search_query6, num_tweets6, since_id_num6):
|
| 222 |
+
# Collect tweets using the Cursor object
|
| 223 |
+
# Each item in the iterator has various attributes that you can access to get information about each tweet
|
| 224 |
+
tweet_list6 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query6, lang="en", since_id=since_id_num6,
|
| 225 |
+
tweet_mode='extended').items(num_tweets6)]
|
| 226 |
+
|
| 227 |
+
# Begin scraping the tweets individually:
|
| 228 |
+
for tweet in tweet_list6[::-1]:
|
| 229 |
+
tweet_id = tweet.id # get Tweet ID result
|
| 230 |
+
created_at = tweet.created_at # get time tweet was created
|
| 231 |
+
text = tweet.full_text # retrieve full tweet text
|
| 232 |
+
location = tweet.user.location # retrieve user location
|
| 233 |
+
retweet = tweet.retweet_count # retrieve number of retweets
|
| 234 |
+
favorite = tweet.favorite_count # retrieve number of likes
|
| 235 |
+
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv','a', newline='', encoding='utf-8') as csvFile6:
|
| 236 |
+
csv_writer6 = csv.writer(csvFile6, delimiter=',') # create an instance of csv object
|
| 237 |
+
csv_writer6.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row
|
| 238 |
+
|
| 239 |
+
search_words6 = "#loneliness" # Specifying exact phrase to search
|
| 240 |
+
# Exclude Links, retweets, replies
|
| 241 |
+
search_query6 = search_words6 + " -filter:links AND -filter:retweets AND -filter:replies"
|
| 242 |
+
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv', encoding='utf-8') as data:
|
| 243 |
+
latest_tweet = int(list(csv.reader(data))[-1][0])
|
| 244 |
+
tweets_mining6(search_query6, 10000, latest_tweet)
|
| 245 |
+
|
| 246 |
+
df_loneliness_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv",
|
| 247 |
+
names=['tweet.id', "created_at","text", "location", "retweet", "favorite"])
|
| 248 |
+
|
| 249 |
+
df_loneliness_1
|
| 250 |
+
|
| 251 |
+
"""## "#itsokaynottobeokay""""
|
| 252 |
+
|
| 253 |
+
## Create a function for tweets mining
|
| 254 |
+
def tweets_mining7(search_query7, num_tweets7, since_id_num7):
|
| 255 |
+
# Collect tweets using the Cursor object
|
| 256 |
+
# Each item in the iterator has various attributes that you can access to get information about each tweet
|
| 257 |
+
tweet_list7 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query7, lang="en", since_id=since_id_num7,
|
| 258 |
+
tweet_mode='extended').items(num_tweets7)]
|
| 259 |
+
|
| 260 |
+
# Begin scraping the tweets individually:
|
| 261 |
+
for tweet in tweet_list7[::-1]:
|
| 262 |
+
tweet_id = tweet.id # get Tweet ID result
|
| 263 |
+
created_at = tweet.created_at # get time tweet was created
|
| 264 |
+
text = tweet.full_text # retrieve full tweet text
|
| 265 |
+
location = tweet.user.location # retrieve user location
|
| 266 |
+
retweet = tweet.retweet_count # retrieve number of retweets
|
| 267 |
+
favorite = tweet.favorite_count # retrieve number of likes
|
| 268 |
+
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv','a', newline='', encoding='utf-8') as csvFile7:
|
| 269 |
+
csv_writer7 = csv.writer(csvFile7, delimiter=',') # create an instance of csv object
|
| 270 |
+
csv_writer7.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row
|
| 271 |
+
|
| 272 |
+
search_words7 = "#itsokaynottobeokay" # Specifying exact phrase to search
|
| 273 |
+
# Exclude Links, retweets, replies
|
| 274 |
+
search_query7 = search_words7 + " -filter:links AND -filter:retweets AND -filter:replies"
|
| 275 |
+
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv', encoding='utf-8') as data:
|
| 276 |
+
latest_tweet = int(list(csv.reader(data))[-1][0])
|
| 277 |
+
tweets_mining7(search_query7, 2000, latest_tweet)
|
| 278 |
+
|
| 279 |
+
df_itsok_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv",
|
| 280 |
+
names=['tweet.id', "created_at","text", "location", "retweet", "favorite"])
|
| 281 |
+
|
| 282 |
+
df_itsok_1
|
| 283 |
+
|
| 284 |
+
"""## "#depression""""
|
| 285 |
+
|
| 286 |
+
## Create a function for tweets mining
|
| 287 |
+
def tweets_mining8(search_query8, num_tweets8, since_id_num8):
|
| 288 |
+
# Collect tweets using the Cursor object
|
| 289 |
+
# Each item in the iterator has various attributes that you can access to get information about each tweet
|
| 290 |
+
tweet_list8 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query8, lang="en", since_id=since_id_num8,
|
| 291 |
+
tweet_mode='extended').items(num_tweets8)]
|
| 292 |
+
|
| 293 |
+
# Begin scraping the tweets individually:
|
| 294 |
+
for tweet in tweet_list8[::-1]:
|
| 295 |
+
tweet_id = tweet.id # get Tweet ID result
|
| 296 |
+
created_at = tweet.created_at # get time tweet was created
|
| 297 |
+
text = tweet.full_text # retrieve full tweet text
|
| 298 |
+
location = tweet.user.location # retrieve user location
|
| 299 |
+
retweet = tweet.retweet_count # retrieve number of retweets
|
| 300 |
+
favorite = tweet.favorite_count # retrieve number of likes
|
| 301 |
+
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv','a', newline='', encoding='utf-8') as csvFile8:
|
| 302 |
+
csv_writer8 = csv.writer(csvFile8, delimiter=',') # create an instance of csv object
|
| 303 |
+
csv_writer8.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row
|
| 304 |
+
|
| 305 |
+
search_words8 = "#depression" # Specifying exact phrase to search
|
| 306 |
+
# Exclude Links, retweets, replies
|
| 307 |
+
search_query8 = search_words8 + " -filter:links AND -filter:retweets AND -filter:replies"
|
| 308 |
+
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv', encoding='utf-8') as data:
|
| 309 |
+
latest_tweet = int(list(csv.reader(data))[-1][0])
|
| 310 |
+
tweets_mining8(search_query8, 1000, latest_tweet)
|
| 311 |
+
|
| 312 |
+
df_depression_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv",
|
| 313 |
+
names=['tweet.id', "created_at","text", "location", "retweet", "favorite"])
|
| 314 |
+
|
| 315 |
+
df_depression_1
|
| 316 |
+
|
| 317 |
+
## Finding unique values in each column
|
| 318 |
+
for col in df_depression_1:
|
| 319 |
+
print("There are ", len(df_depression_1[col].unique()), "unique values in ", col)
|
| 320 |
+
|
| 321 |
+
"""## "#sad""""
|
| 322 |
+
|
| 323 |
+
## Create a function for tweets mining
|
| 324 |
+
def tweets_mining9(search_query9, num_tweets9, since_id_num9):
|
| 325 |
+
# Collect tweets using the Cursor object
|
| 326 |
+
# Each item in the iterator has various attributes that you can access to get information about each tweet
|
| 327 |
+
tweet_list9 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query9, lang="en", since_id=since_id_num9,
|
| 328 |
+
tweet_mode='extended').items(num_tweets9)]
|
| 329 |
+
|
| 330 |
+
# Begin scraping the tweets individually:
|
| 331 |
+
for tweet in tweet_list9[::-1]:
|
| 332 |
+
tweet_id = tweet.id # get Tweet ID result
|
| 333 |
+
created_at = tweet.created_at # get time tweet was created
|
| 334 |
+
text = tweet.full_text # retrieve full tweet text
|
| 335 |
+
location = tweet.user.location # retrieve user location
|
| 336 |
+
retweet = tweet.retweet_count # retrieve number of retweets
|
| 337 |
+
favorite = tweet.favorite_count # retrieve number of likes
|
| 338 |
+
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv','a', newline='', encoding='utf-8') as csvFile9:
|
| 339 |
+
csv_writer9 = csv.writer(csvFile9, delimiter=',') # create an instance of csv object
|
| 340 |
+
csv_writer9.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row
|
| 341 |
+
|
| 342 |
+
search_words9 = "#sad" # Specifying exact phrase to search
|
| 343 |
+
# Exclude Links, retweets, replies
|
| 344 |
+
search_query9 = search_words9 + " -filter:links AND -filter:retweets AND -filter:replies"
|
| 345 |
+
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv', encoding='utf-8') as data:
|
| 346 |
+
latest_tweet = int(list(csv.reader(data))[-1][0])
|
| 347 |
+
tweets_mining9(search_query9, 2000, latest_tweet)
|
| 348 |
+
|
| 349 |
+
df_sad_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv",
|
| 350 |
+
names=['tweet.id', "created_at","text", "location", "retweet", "favorite"])
|
| 351 |
+
|
| 352 |
+
df_sad_1
|
| 353 |
+
|
| 354 |
+
"""# Combining all the tweets"""
|
| 355 |
+
|
| 356 |
+
import glob
|
| 357 |
+
|
| 358 |
+
path = r'/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API' # use your path
|
| 359 |
+
all_files = glob.glob(path + "/*.csv")
|
| 360 |
+
|
| 361 |
+
tweets = []
|
| 362 |
+
|
| 363 |
+
for filename in all_files:
|
| 364 |
+
df = pd.read_csv(filename,
|
| 365 |
+
names=['tweet.id', "created_at","text", "location", "retweet", "favorite"]) # Convert each csv to a dataframe
|
| 366 |
+
tweets.append(df)
|
| 367 |
+
|
| 368 |
+
tweets_df = pd.concat(tweets, ignore_index=True) # Merge all dataframes
|
| 369 |
+
#tweets_df.columns=['tweet.id', "created_at","text", "location", "retweet", "favorite"]
|
| 370 |
+
tweets_df.head()
|
| 371 |
+
|
| 372 |
+
tweets_df
|
| 373 |
+
|
| 374 |
+
tweets_df.to_csv('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/output/depressive_tweets.csv')
|
| 375 |
+
|
| 376 |
+
"""## Data cleaning
|
| 377 |
+
|
| 378 |
+
Data cleaning is one of the essential steps because without a proper cleaning procedure you will have errors in your analysis and eventually your data-driven results. Here I try to eliminate duplicates tweets by using the Primary key ('tweets.id'), checked for empty rows and replaced “NaN” if there is any.
|
| 379 |
+
"""
|
| 380 |
+
|
| 381 |
+
tweets_df.shape #Get number of rows and columns
|
| 382 |
+
|
| 383 |
+
## Check the data type of each column
|
| 384 |
+
tweets_df.dtypes.to_frame().rename(columns={0:'data_type'})
|
| 385 |
+
|
| 386 |
+
## Finding unique values in each column
|
| 387 |
+
for col in tweets_df:
|
| 388 |
+
print("There are ", len(tweets_df[col].unique()), "unique values in ", col)
|
source_code/notebooks/modeling.py
ADDED
|
@@ -0,0 +1,378 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""modeling.ipynb
|
| 3 |
+
|
| 4 |
+
Automatically generated by Colaboratory.
|
| 5 |
+
|
| 6 |
+
Original file is located at
|
| 7 |
+
https://colab.research.google.com/drive/1x78fRDZAuK5FaSTKHPGy8eSbZ_gYAFr6
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from google.colab import drive
|
| 11 |
+
drive.mount('/content/drive')
|
| 12 |
+
|
| 13 |
+
#!pip install -qqq h5py
|
| 14 |
+
|
| 15 |
+
#!pip install --upgrade -qqq gensim
|
| 16 |
+
|
| 17 |
+
!python -m spacy download en_core_web_lg
|
| 18 |
+
|
| 19 |
+
!pip install -U SpaCy==2.2.0
|
| 20 |
+
|
| 21 |
+
## Import required libraries
|
| 22 |
+
|
| 23 |
+
## warnings
|
| 24 |
+
import warnings
|
| 25 |
+
warnings.filterwarnings("ignore")
|
| 26 |
+
|
| 27 |
+
## for data
|
| 28 |
+
import numpy as np
|
| 29 |
+
import pandas as pd
|
| 30 |
+
|
| 31 |
+
## for plotting
|
| 32 |
+
import matplotlib.pyplot as plt
|
| 33 |
+
import seaborn as sns
|
| 34 |
+
|
| 35 |
+
## TF-IDF
|
| 36 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 37 |
+
|
| 38 |
+
## T-Sne
|
| 39 |
+
from yellowbrick.text import TSNEVisualizer
|
| 40 |
+
from sklearn import manifold
|
| 41 |
+
|
| 42 |
+
## Train-Test Split
|
| 43 |
+
from sklearn.model_selection import train_test_split
|
| 44 |
+
|
| 45 |
+
## Feature selection
|
| 46 |
+
from sklearn import feature_selection
|
| 47 |
+
|
| 48 |
+
## libraries for classification
|
| 49 |
+
from sklearn.pipeline import Pipeline
|
| 50 |
+
import sklearn.metrics as skm
|
| 51 |
+
from sklearn.metrics import confusion_matrix, accuracy_score
|
| 52 |
+
from sklearn.linear_model import LogisticRegression
|
| 53 |
+
from sklearn.neighbors import KNeighborsClassifier
|
| 54 |
+
from sklearn.svm import SVC
|
| 55 |
+
from sklearn.tree import DecisionTreeClassifier
|
| 56 |
+
from sklearn.neural_network import MLPClassifier
|
| 57 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 58 |
+
|
| 59 |
+
## for saving model
|
| 60 |
+
import pickle
|
| 61 |
+
|
| 62 |
+
## for explainer
|
| 63 |
+
#from lime import lime_text
|
| 64 |
+
|
| 65 |
+
## detokenization
|
| 66 |
+
from nltk.tokenize.treebank import TreebankWordDetokenizer
|
| 67 |
+
|
| 68 |
+
## for word embedding with gensim
|
| 69 |
+
import gensim
|
| 70 |
+
import gensim.downloader as gensim_api
|
| 71 |
+
from gensim.models import Word2Vec
|
| 72 |
+
from gensim.models import KeyedVectors
|
| 73 |
+
from keras.preprocessing.text import Tokenizer
|
| 74 |
+
from keras.preprocessing.sequence import pad_sequences
|
| 75 |
+
|
| 76 |
+
## for word embedding with Spacy
|
| 77 |
+
import spacy
|
| 78 |
+
import en_core_web_lg
|
| 79 |
+
|
| 80 |
+
## for deep learning
|
| 81 |
+
from keras.models import load_model
|
| 82 |
+
from keras.models import Model, Sequential
|
| 83 |
+
from keras.callbacks import EarlyStopping, ModelCheckpoint
|
| 84 |
+
from keras.layers import Conv1D, Dense, Input, LSTM, Embedding, Dropout, Activation, MaxPooling1D
|
| 85 |
+
from tensorflow.keras import models, layers, preprocessing as kprocessing
|
| 86 |
+
from tensorflow.keras import backend as K
|
| 87 |
+
import tensorflow as tf
|
| 88 |
+
import keras
|
| 89 |
+
from keras.layers import Lambda
|
| 90 |
+
import tensorflow as tf
|
| 91 |
+
from keras.models import model_from_json
|
| 92 |
+
|
| 93 |
+
## for bert language model
|
| 94 |
+
#import transformers
|
| 95 |
+
|
| 96 |
+
"""## Loading the dataset:"""
|
| 97 |
+
|
| 98 |
+
df_all = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/data_cleaning/processed_data/processed_data.csv",
|
| 99 |
+
sep='\t', encoding='utf-8')
|
| 100 |
+
|
| 101 |
+
df_all
|
| 102 |
+
|
| 103 |
+
"""## Classification models as well as LSTM with pretrained model(Spacy):
|
| 104 |
+
|
| 105 |
+
In order to run a supervised learning model, we first need to convert the clean_text into feature representation.
|
| 106 |
+
"""
|
| 107 |
+
|
| 108 |
+
nlp = en_core_web_lg.load()
|
| 109 |
+
|
| 110 |
+
## word-embedding
|
| 111 |
+
all_vectors = pd.np.array([pd.np.array([token.vector for token in nlp(s)]).mean(axis=0) * pd.np.ones((300)) \
|
| 112 |
+
for s in df_all['clean_text']])
|
| 113 |
+
|
| 114 |
+
# split out validation dataset for the end
|
| 115 |
+
Y= df_all["label"]
|
| 116 |
+
X = all_vectors
|
| 117 |
+
|
| 118 |
+
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
|
| 119 |
+
validation_size = 0.3
|
| 120 |
+
seed = 7
|
| 121 |
+
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=validation_size, random_state=seed)
|
| 122 |
+
|
| 123 |
+
# test options for classification
|
| 124 |
+
num_folds = 10
|
| 125 |
+
seed = 7
|
| 126 |
+
scoring = 'accuracy'
|
| 127 |
+
|
| 128 |
+
## spot check the algorithms
|
| 129 |
+
models = []
|
| 130 |
+
models.append(('LR', LogisticRegression()))
|
| 131 |
+
models.append(('KNN', KNeighborsClassifier()))
|
| 132 |
+
models.append(('CART', DecisionTreeClassifier()))
|
| 133 |
+
models.append(('SVM', SVC()))
|
| 134 |
+
## Neural Network
|
| 135 |
+
models.append(('NN', MLPClassifier()))
|
| 136 |
+
## Ensable Models
|
| 137 |
+
models.append(('RF', RandomForestClassifier()))
|
| 138 |
+
|
| 139 |
+
## Running the classification models
|
| 140 |
+
results = []
|
| 141 |
+
names = []
|
| 142 |
+
kfold_results = []
|
| 143 |
+
test_results = []
|
| 144 |
+
train_results = []
|
| 145 |
+
for name, model in models:
|
| 146 |
+
kfold = KFold(n_splits=num_folds, random_state=seed)
|
| 147 |
+
cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
|
| 148 |
+
results.append(cv_results)
|
| 149 |
+
names.append(name)
|
| 150 |
+
#msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
|
| 151 |
+
#print(msg)
|
| 152 |
+
|
| 153 |
+
# Full Training period
|
| 154 |
+
res = model.fit(X_train, Y_train)
|
| 155 |
+
train_result = accuracy_score(res.predict(X_train), Y_train)
|
| 156 |
+
train_results.append(train_result)
|
| 157 |
+
|
| 158 |
+
# Test results
|
| 159 |
+
test_result = accuracy_score(res.predict(X_test), Y_test)
|
| 160 |
+
test_results.append(test_result)
|
| 161 |
+
|
| 162 |
+
msg = "%s: %f (%f) %f %f" % (name, cv_results.mean(), cv_results.std(), train_result, test_result)
|
| 163 |
+
print(msg)
|
| 164 |
+
print(confusion_matrix(res.predict(X_test), Y_test))
|
| 165 |
+
#print(classification_report(res.predict(X_test), Y_test))
|
| 166 |
+
|
| 167 |
+
# Compare algorithms: grouped bar chart of per-model train vs. test scores
from matplotlib import pyplot
fig = pyplot.figure()
ind = np.arange(len(names))  # the x locations for the groups
width = 0.35  # the width of the bars
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
# NOTE(review): despite the legend labels, these are accuracy scores, not errors
pyplot.bar(ind - width/2, train_results, width=width, label='Train Error')
pyplot.bar(ind + width/2, test_results, width=width, label='Test Error')
fig.set_size_inches(15,8)
pyplot.legend()
ax.set_xticks(ind)
ax.set_xticklabels(names)
pyplot.show()
|
| 181 |
+
|
| 182 |
+
"""The best model with the highest accuracy is **Support Vector Machine(SVM)** with **85.79**% accuracy on test dataset. Logistic Regression performed good as well but we see overfitting problem with CART, NN and RF.
|
| 183 |
+
|
| 184 |
+
### LSTM model:
|
| 185 |
+
"""
|
| 186 |
+
|
| 187 |
+
### Create sequences: map each text to a padded list of token ids
vocabulary_size = 20000
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(df_all['clean_text'])
sequences = tokenizer.texts_to_sequences(df_all['clean_text'])
X_LSTM = pad_sequences(sequences, maxlen=50)

## Split the data into train and test (same split parameters as the classical models)
Y_LSTM = df_all["label"]
X_train_LSTM, X_test_LSTM, Y_train_LSTM, Y_test_LSTM = train_test_split(
    X_LSTM, Y_LSTM, test_size=validation_size, random_state=seed)

from keras.wrappers.scikit_learn import KerasClassifier

def create_model(input_length=50):
    """Build and compile a binary LSTM classifier over padded id sequences.

    Parameters
    ----------
    input_length : int
        Length of the padded input sequences (must match pad_sequences' maxlen).

    Returns
    -------
    A compiled keras Sequential model.
    """
    model = Sequential()
    # FIX: the original hard-coded 20000/50 here, silently ignoring the
    # `input_length` parameter; use the parameter and vocabulary_size instead
    # (same values by default, so behavior is unchanged for existing callers).
    model.add(Embedding(vocabulary_size, 300, input_length=input_length))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model_LSTM = KerasClassifier(build_fn=create_model, epochs=3, verbose=1, validation_split=0.4)
model_LSTM.fit(X_train_LSTM, Y_train_LSTM)

train_result_LSTM = accuracy_score(model_LSTM.predict(X_train_LSTM), Y_train_LSTM)
# Test results
test_result_LSTM = accuracy_score(model_LSTM.predict(X_test_LSTM), Y_test_LSTM)

print("train result:", train_result_LSTM)
print("test result:", test_result_LSTM)

confusion_matrix(model_LSTM.predict(X_test_LSTM), Y_test_LSTM)
|
| 218 |
+
|
| 219 |
+
"""### Compare all the models:"""
|
| 220 |
+
|
| 221 |
+
train_results.append(train_result_LSTM);test_results.append(test_result_LSTM)
|
| 222 |
+
names.append("LSTM")
|
| 223 |
+
|
| 224 |
+
# compare algorithms
|
| 225 |
+
from matplotlib import pyplot
|
| 226 |
+
fig = pyplot.figure()
|
| 227 |
+
ind = np.arange(len(names)) # the x locations for the groups
|
| 228 |
+
width = 0.35 # the width of the bars
|
| 229 |
+
fig.suptitle('Algorithm Comparison')
|
| 230 |
+
ax = fig.add_subplot(111)
|
| 231 |
+
pyplot.bar(ind - width/2, train_results, width=width, label='Train Error')
|
| 232 |
+
pyplot.bar(ind + width/2, test_results, width=width, label='Test Error')
|
| 233 |
+
fig.set_size_inches(15,8)
|
| 234 |
+
pyplot.legend()
|
| 235 |
+
ax.set_xticks(ind)
|
| 236 |
+
ax.set_xticklabels(names)
|
| 237 |
+
pyplot.show()
|
| 238 |
+
plt.savefig('/content/drive/MyDrive/NLP/Depression_Detection/modeling/classification_comparision.png')
|
| 239 |
+
|
| 240 |
+
"""## Evaluate the performance:
|
| 241 |
+
|
| 242 |
+
* **Accuracy:** the fraction of predictions the model got right.
|
| 243 |
+
* **Confusion Matrix:** a summary table that breaks down the number of correct and incorrect predictions by each class.
|
| 244 |
+
* **ROC:** a plot that illustrates the true positive rate against the false positive rate at various threshold settings. The area under the curve (AUC) indicates the probability that the classifier will rank a randomly chosen positive observation higher than a randomly chosen negative one.
|
| 245 |
+
* **Precision:** the fraction of relevant instances among the retrieved instances.
|
| 246 |
+
* **Recall:** the fraction of the total amount of relevant instances that were actually retrieved.
|
| 247 |
+
"""
|
| 248 |
+
|
| 249 |
+
def conf_matrix_acc(y_true, y_pred):
    """Plot a confusion-matrix heatmap and print accuracy plus a classification report.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class labels, in the same order.

    NOTE(review): relies on a module-level `classes` sequence for the axis
    tick labels — it is not defined in this chunk; confirm it is assigned
    before calling.
    """
    ## Plot confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues,
                cbar=False)
    ax.set(xlabel="Pred", ylabel="True", xticklabels=classes,
           yticklabels=classes, title="Confusion matrix")
    plt.yticks(rotation=0)  # keep the y tick labels horizontal
    print("=========================================")
    print(f'Accuracy score is : {accuracy_score(y_true, y_pred)}')
    print("=========================================")
    print("Detail:")
    print(skm.classification_report(y_true, y_pred))
|
| 263 |
+
|
| 264 |
+
## Plot ROC and precision-recall curve
def roc_precision_auc():
    """Plot per-class ROC and precision-recall curves side by side, then print AUC.

    NOTE(review): takes no arguments — it reads the module-level globals
    `classes`, `y_test_array`, `probs` and `Y_test`, none of which are defined
    in this chunk; confirm they are assigned before calling. `probs` is
    expected to be per-class predicted probabilities aligned with
    `y_test_array` columns.
    """
    fig, ax = plt.subplots(nrows=1, ncols=2)
    ## Plot roc (one curve per class, with its AUC in the legend)
    for i in range(len(classes)):
        fpr, tpr, thresholds = skm.roc_curve(y_test_array[:,i],
                                             probs[:,i])
        ax[0].plot(fpr, tpr, lw=3,
                   label='{0} (area={1:0.2f})'.format(classes[i],
                                                      skm.auc(fpr, tpr))
                   )
    # Diagonal = performance of a random classifier
    ax[0].plot([0,1], [0,1], color='navy', lw=3, linestyle='--')
    ax[0].set(xlim=[-0.05,1.0], ylim=[0.0,1.05],
              xlabel='False Positive Rate',
              ylabel="True Positive Rate (Recall)",
              title="Receiver operating characteristic")
    ax[0].legend(loc="lower right")
    ax[0].grid(True)

    ## Plot precision-recall curve (one per class, with area under it)
    for i in range(len(classes)):
        precision, recall, thresholds = skm.precision_recall_curve(
            y_test_array[:,i], probs[:,i])
        ax[1].plot(recall, precision, lw=3,
                   label='{0} (area={1:0.2f})'.format(classes[i],
                                                      skm.auc(recall, precision))
                   )
    ax[1].set(xlim=[0.0,1.05], ylim=[0.0,1.05], xlabel='Recall',
              ylabel="Precision", title="Precision-Recall curve")
    ax[1].legend(loc="best")
    ax[1].grid(True)
    plt.show()
    #plt.savefig('/content/drive/MyDrive/NLP/Depression_Detection/modeling/ROC_Precision_LR.png')
    #plt.savefig('/content/drive/MyDrive/NLP/Depression_Detection/modeling/ROC_Precision_SVM.png')
    ## AUC score (positive-class probabilities against the held-out labels)
    print(f'AUC score is : {skm.roc_auc_score(Y_test, probs[:,1])}')
|
| 300 |
+
|
| 301 |
+
"""## Support Vector Machine(SVM) with word embedding:"""

nlp = en_core_web_lg.load()

## word-embedding: one 300-d vector per tweet — the mean of its spaCy
## token vectors. Fixed: the deprecated `pd.np` alias (removed in
## pandas >= 2.0) is replaced with the numpy module itself.
all_vectors = np.array([np.array([token.vector for token in nlp(s)]).mean(axis=0) * np.ones((300)) \
               for s in df_all['clean_text']])

# split out validation dataset for the end
Y = df_all["label"]
X = all_vectors

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
validation_size = 0.3
seed = 7  # fixed seed so the split is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=validation_size, random_state=seed)
|
| 317 |
+
|
| 318 |
+
# test options for classification
num_folds = 10
seed = 7
scoring = 'accuracy'

# Create a svm Classifier; probability=True enables predict_proba,
# which the ROC/PR plotting below requires.
clf = SVC(probability=True)

## Running the svm Classifier

# Full Training period
res = clf.fit(X_train, Y_train)
train_result = accuracy_score(res.predict(X_train), Y_train)
test_result = accuracy_score(res.predict(X_test), Y_test)

# Fixed: the original line printed both labels first and both values
# last ("train_result: test_resuld: 0.99 0.95") and misspelled
# "test_result"; pair each label with its value instead.
print("train_result:", train_result, "test_result:", test_result, sep=" ")
|
| 334 |
+
|
| 335 |
+
## Save the model to file on the mounted Drive
SVM = "/content/drive/MyDrive/NLP/Depression_Detection/modeling/model_svm1.pkl"

with open(SVM, 'wb') as file:
    pickle.dump(clf, file)

## Load the Model back from file (round-trip sanity check of the pickle)
with open(SVM, 'rb') as file:
    clf = pickle.load(file)

# Bare expression: in the notebook this displays the estimator repr
clf

## Test results
##
y_pred_svm = res.predict(X_test)
# Globals consumed by conf_matrix_acc / roc_precision_auc below
classes = np.unique(Y_test.to_list())
y_test_array = pd.get_dummies(Y_test, drop_first=False).values
probs = res.predict_proba(X_test)
conf_matrix_acc(Y_test.to_list(),y_pred_svm)
roc_precision_auc()
|
| 355 |
+
|
| 356 |
+
"""## Exploring False positive and False negative:"""

## creating lists of true values and predictions
y_test_1 = [x for x in y_test]
y_pred_lr_1 = [x for x in y_pred_lr]

## Find the indices of wrong predictions. (The original loop also did
## a stray `i += 1`, which is a no-op inside a `for` — removed.)
idx = [i for i in range(len(y_test_1)) if y_test_1[i] != y_pred_lr_1[i]]

# Fixed: the original print statement had mismatched quotes and never
# substituted the count into the message.
print(f'There are {len(idx)} wrong predictions')

# Map the misclassified TF-IDF rows back to their vocabulary terms
wrong_arr = cv.inverse_transform(X_test_tfidf[idx])

## detokenize the wrong array for readable inspection
detokenized = [TreebankWordDetokenizer().detokenize(x) for x in wrong_arr]

detokenized[:50]

"""There is no specific patterns between false positive and false negative predictions."""
|
source_code/notebooks/old_models.py
ADDED
|
@@ -0,0 +1,637 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""old_models.ipynb
|
| 3 |
+
|
| 4 |
+
Automatically generated by Colaboratory.
|
| 5 |
+
|
| 6 |
+
Original file is located at
|
| 7 |
+
https://colab.research.google.com/drive/1Oc7A5TaGLg1qkYXzf0qLGIe0_ZxyAnXE
|
| 8 |
+
|
| 9 |
+
This notebook contains Feature selection with Chi-Square test, Logistic Regression with TFIDF as well as Bidirectional LSTM with gensim to classifies a given tweet into depressive or non-depressive ones.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from google.colab import drive
|
| 13 |
+
drive.mount('/content/drive')
|
| 14 |
+
|
| 15 |
+
## Import required libraries
|
| 16 |
+
|
| 17 |
+
## warnings
|
| 18 |
+
import warnings
|
| 19 |
+
warnings.filterwarnings("ignore")
|
| 20 |
+
|
| 21 |
+
## for data
|
| 22 |
+
import numpy as np
|
| 23 |
+
import pandas as pd
|
| 24 |
+
|
| 25 |
+
## for plotting
|
| 26 |
+
import matplotlib.pyplot as plt
|
| 27 |
+
import seaborn as sns
|
| 28 |
+
|
| 29 |
+
## TF-IDF
|
| 30 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 31 |
+
|
| 32 |
+
## T-Sne
|
| 33 |
+
from yellowbrick.text import TSNEVisualizer
|
| 34 |
+
from sklearn import manifold
|
| 35 |
+
|
| 36 |
+
## Train-Test Split
|
| 37 |
+
from sklearn.model_selection import train_test_split
|
| 38 |
+
|
| 39 |
+
## Feature selection
|
| 40 |
+
from sklearn import feature_selection
|
| 41 |
+
|
| 42 |
+
## libraries for classification
|
| 43 |
+
from sklearn.pipeline import Pipeline
|
| 44 |
+
import sklearn.metrics as skm
|
| 45 |
+
from sklearn.metrics import confusion_matrix, accuracy_score
|
| 46 |
+
from sklearn.linear_model import LogisticRegression
|
| 47 |
+
from sklearn.neighbors import KNeighborsClassifier
|
| 48 |
+
from sklearn.svm import SVC
|
| 49 |
+
from sklearn.tree import DecisionTreeClassifier
|
| 50 |
+
from sklearn.neural_network import MLPClassifier
|
| 51 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 52 |
+
|
| 53 |
+
## for saving model
|
| 54 |
+
import pickle
|
| 55 |
+
|
| 56 |
+
## detokenization
|
| 57 |
+
from nltk.tokenize.treebank import TreebankWordDetokenizer
|
| 58 |
+
|
| 59 |
+
## for word embedding with gensim
|
| 60 |
+
import gensim
|
| 61 |
+
import gensim.downloader as gensim_api
|
| 62 |
+
from gensim.models import Word2Vec
|
| 63 |
+
from gensim.models import KeyedVectors
|
| 64 |
+
from keras.preprocessing.text import Tokenizer
|
| 65 |
+
from keras.preprocessing.sequence import pad_sequences
|
| 66 |
+
|
| 67 |
+
## for word embedding with Spacy
|
| 68 |
+
# import spacy
|
| 69 |
+
# import en_core_web_lg
|
| 70 |
+
|
| 71 |
+
## for deep learning
|
| 72 |
+
from keras.models import load_model
|
| 73 |
+
from keras.models import Model, Sequential
|
| 74 |
+
from keras.callbacks import EarlyStopping, ModelCheckpoint
|
| 75 |
+
from keras.layers import Conv1D, Dense, Input, LSTM, Embedding, Dropout, Activation, MaxPooling1D
|
| 76 |
+
from tensorflow.keras import models, layers, preprocessing as kprocessing
|
| 77 |
+
from tensorflow.keras import backend as K
|
| 78 |
+
import tensorflow as tf
|
| 79 |
+
import keras
|
| 80 |
+
from keras.layers import Lambda
|
| 81 |
+
import tensorflow as tf
|
| 82 |
+
from keras.models import model_from_json
|
| 83 |
+
|
| 84 |
+
## for bert language model
|
| 85 |
+
#import transformers
|
| 86 |
+
|
| 87 |
+
"""## Loading the dataset:"""

# Pre-processed tweets produced by the data-cleaning pipeline
# (tab-separated). Path is Colab/Drive specific.
df_all = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/data_cleaning/processed_data/processed_data.csv",
                     sep='\t', encoding='utf-8')

# Bare expression: displays the frame in the notebook
df_all
|
| 93 |
+
|
| 94 |
+
"""## Feature selection
|
| 95 |
+
|
| 96 |
+
In order to drop some columns and reduce the matrix dimensionality, we can carry out some Feature Selection, the process of selecting a subset of relevant variables. I will proceed as follows:
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
1. treat each category as binary (for example, the “depressive” category is 1 for the depressive tweets and 0 for non_depressive);
|
| 101 |
+
2. perform a Chi-Square test to determine whether a feature and the (binary) target are independent;
|
| 102 |
+
3. keep only the features with a certain p-value from the Chi-Square test.
|
| 103 |
+
|
| 104 |
+
This snippet of code is derived from https://towardsdatascience.com/text-classification-with-nlp-tf-idf-vs-word2vec-vs-bert-41ff868d1794
|
| 105 |
+
"""
|
| 106 |
+
|
| 107 |
+
## Chi-square feature selection: keep only TF-IDF features that are
## statistically associated with some class (1 - p > p_value_limit).
## NOTE(review): this cell reads `y_train`, `cv` and `X_train_tfidf`,
## which are created by cells further down the file — it only runs
## correctly with the original notebook execution order.
y = y_train
X_names = cv.get_feature_names()
p_value_limit = 0.95
frames = []
for cat in np.unique(y):
    # Treat the category as binary and test each feature against it
    chi2, p = feature_selection.chi2(X_train_tfidf, y == cat)
    frames.append(pd.DataFrame(
        {"feature": X_names, "score": 1 - p, "y": cat}))
# Fixed: DataFrame.append is deprecated (removed in pandas 2.0);
# collect the per-category frames and concat once.
df_features = pd.concat(frames)
df_features = df_features.sort_values(["y", "score"],
                                      ascending=[True, False])
df_features = df_features[df_features["score"] > p_value_limit]
X_names = df_features["feature"].unique().tolist()

print(len(X_names))
|
| 121 |
+
|
| 122 |
+
"""I reduced the number of features from 20018 to 688 by keeping the most statistically relevant ones. Let’s print some:"""
|
| 123 |
+
|
| 124 |
+
# Summarise the chi-square-selected vocabulary, one section per class:
# how many features survived, and the ten highest-scoring ones.
for cat in np.unique(y):
    subset = df_features[df_features["y"] == cat]
    print("# {}:".format(cat))
    print(" . selected features:",
          len(subset))
    print(" . top features:", ",".join(subset["feature"].values[:10]))
    print(" ")
|
| 130 |
+
|
| 131 |
+
"""## Logistic Regression with TFIDF:

### Splitting data to train and test datasets:
"""

## split dataset to train and test (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(df_all['clean_text'], df_all['label'], test_size=0.3, random_state= 42)

# Bare expression: displays the four split shapes in the notebook
X_train.shape, X_test.shape, y_train.shape, y_test.shape
|
| 140 |
+
|
| 141 |
+
"""### TF-IDF

TF-IDF (term frequency and inverse document frequency):
"""

## Creating the TF-IDF model; fit on the training split only so the
## test split stays unseen by the vectorizer.
cv = TfidfVectorizer()
cv.fit(X_train.to_list())
dic_vocabulary = cv.vocabulary_

X_train_tfidf = cv.transform(X_train.to_list())

X_test_tfidf = cv.transform(X_test.to_list())

# Sanity check: map the first test vector back to its vocabulary terms
cv.inverse_transform(X_test_tfidf[0])

X_train_tfidf.shape

# ## Adding clean tweets to a list called corpus
# corpus = []
# corpus = [x for x in df_train['clean_text']]
# # corpus = df_train["clean_text"]

"""The feature matrix X_train_tfidf has a shape of 16,464 (Number of documents in training) x 20018 (Length of vocabulary) and it’s pretty sparse:"""

# Visualise sparsity on a random sample of 100 vocabulary columns
sns.heatmap(X_train_tfidf.todense()[:,np.random.randint(0,X_train_tfidf.shape[1],100)]==0, vmin=0, vmax=1, cbar=False).set_title('Sparse Matrix Sample')

"""In order to know the position of a certain word, we can look it up in the vocabulary:"""

word = "mental"
dic_vocabulary[word]
|
| 172 |
+
|
| 173 |
+
"""Build a scikit-learn pipeline: a sequential application of a list of transformations and a final estimator. Putting the Tf-Idf vectorizer and Logistic Regression classifier in a pipeline allows us to transform and predict test data in just one step."""

# classifier = LogisticRegression(solver='liblinear', penalty='l1')

# ## pipeline
# model = Pipeline([("vectorizer", cv),
#                   ("classifier", classifier)])
# ## train classifier
# model["classifier"].fit(X_train, y_train)
# ## test
# predicted = model.predict(X_test)
# predicted_prob = model.predict_proba(X_test)

# ## creating the instance of the models
## L1-regularised logistic regression (liblinear supports the l1 penalty)
lr = LogisticRegression(solver='liblinear', penalty='l1')
## fitting the model on the TF-IDF training matrix (print shows the
## fitted estimator's repr)
print(lr.fit(X_train_tfidf, y_train.to_list()))

## Save the model to file on the mounted Drive
LogisticReg = "/content/drive/MyDrive/NLP/Depression_Detection/modeling/model_LogReg.pkl"

with open(LogisticReg, 'wb') as file:
    pickle.dump(lr, file)

## Load the Model back from file (round-trip sanity check of the pickle)
with open(LogisticReg, 'rb') as file:
    lr = pickle.load(file)

# Bare expression: displays the estimator repr in the notebook
lr

## Test: hard predictions, class probabilities, plus the globals
## (classes, y_test_array) consumed by the evaluation helpers below.
y_pred_lr = lr.predict(X_test_tfidf)
probs = lr.predict_proba(X_test_tfidf)
classes = np.unique(y_test.to_list())
y_test_array = pd.get_dummies(y_test, drop_first=False).values
|
| 208 |
+
|
| 209 |
+
"""## Evaluate the performance:

* **Accuracy:** the fraction of predictions the model got right.
* **Confusion Matrix:** a summary table that breaks down the number of correct and incorrect predictions by each class.
* **ROC:** a plot that illustrates the true positive rate against the false positive rate at various threshold settings. The area under the curve (AUC) indicates the probability that the classifier will rank a randomly chosen positive observation higher than a randomly chosen negative one.
* **Precision:** the fraction of relevant instances among the retrieved instances.
* **Recall:** the fraction of the total amount of relevant instances that were actually retrieved.
"""

def conf_matrix_acc(y_true, y_pred):
    """Draw a confusion-matrix heatmap and print accuracy plus the
    full classification report.

    Tick labels come from the module-level ``classes`` global, which
    must be set before calling.
    """
    counts = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots()
    sns.heatmap(counts, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues,
                cbar=False)
    ax.set(xlabel="Pred", ylabel="True", xticklabels=classes,
           yticklabels=classes, title="Confusion matrix")
    plt.yticks(rotation=0)
    rule = "========================================="
    print(rule)
    print(f'Accuracy score is : {accuracy_score(y_true, y_pred)}')
    print(rule)
    print("Detail:")
    print(skm.classification_report(y_true, y_pred))
|
| 232 |
+
|
| 233 |
+
## Plot ROC and precision-recall curve
def roc_precision_auc():
    """Plot per-class ROC and precision-recall curves side by side,
    save the figure, and print the AUC of the positive class.

    Reads module-level globals set before the call: ``classes``,
    ``y_test_array`` (one-hot true labels), ``probs`` (predicted
    probabilities) and ``y_test``.
    """
    fig, ax = plt.subplots(nrows=1, ncols=2)
    ## Plot roc: one curve per class, its AUC shown in the legend
    for i in range(len(classes)):
        fpr, tpr, thresholds = skm.roc_curve(y_test_array[:,i],
                               probs[:,i])
        ax[0].plot(fpr, tpr, lw=3,
                   label='{0} (area={1:0.2f})'.format(classes[i],
                                     skm.auc(fpr, tpr))
                   )
    # Diagonal = random-classifier baseline
    ax[0].plot([0,1], [0,1], color='navy', lw=3, linestyle='--')
    ax[0].set(xlim=[-0.05,1.0], ylim=[0.0,1.05],
              xlabel='False Positive Rate',
              ylabel="True Positive Rate (Recall)",
              title="Receiver operating characteristic")
    ax[0].legend(loc="lower right")
    ax[0].grid(True)

    ## Plot precision-recall curve (one per class)
    for i in range(len(classes)):
        precision, recall, thresholds = skm.precision_recall_curve(
                     y_test_array[:,i], probs[:,i])
        ax[1].plot(recall, precision, lw=3,
                   label='{0} (area={1:0.2f})'.format(classes[i],
                                      skm.auc(recall, precision))
                   )
    ax[1].set(xlim=[0.0,1.05], ylim=[0.0,1.05], xlabel='Recall',
              ylabel="Precision", title="Precision-Recall curve")
    ax[1].legend(loc="best")
    ax[1].grid(True)
    # Fixed: savefig must run BEFORE plt.show() — show() finalises and
    # clears the current figure, so the original order wrote an empty
    # image to disk.
    #plt.savefig('/content/drive/MyDrive/NLP/Depression_Detection/modeling/ROC_Precision_LR.png')
    plt.savefig('/content/drive/MyDrive/NLP/Depression_Detection/modeling/ROC_Precision_SVM.png')
    plt.show()
    ## AUC score — fixed NameError: this file defines ``y_test``
    ## (lowercase); ``Y_test`` only exists in the other notebook.
    print(f'AUC score is : {skm.roc_auc_score(y_test, probs[:,1])}')
|
| 269 |
+
|
| 270 |
+
# Evaluate the logistic-regression predictions: confusion matrix and
# accuracy report, then the ROC / precision-recall curves.
conf_matrix_acc(y_test.to_list(),y_pred_lr)

roc_precision_auc()
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
"""## Bidirectional LSTM:

In Python, you can load a pre-trained Word Embedding model from gensim-data like this:
"""

# Pre-trained Google News word2vec vectors via the gensim downloader
nlp_pre = gensim_api.load("word2vec-google-news-300")

word = "anxiety"
fig = plt.figure()
## word embedding: the query word plus its 20 nearest neighbours
tot_words = [word] + [tupla[0] for tupla in
                 nlp_pre.most_similar(word, topn=20)]
X = nlp_pre[tot_words]
## pca to reduce dimensionality from 300 to 3
## (despite the variable name, this is t-SNE with PCA initialisation)
pca = manifold.TSNE(perplexity=40, n_components=3, init='pca')
X = pca.fit_transform(X)
## create dtf: one row per word; input==1 flags the query word
dtf_ = pd.DataFrame(X, index=tot_words, columns=["x","y","z"])
dtf_["input"] = 0
# NOTE(review): chained assignment — may raise SettingWithCopyWarning
# on newer pandas; `.loc` would be safer. Verify before upgrading.
dtf_["input"].iloc[0:1] = 1
## plot 3d: neighbours in black, the query word in red
from mpl_toolkits.mplot3d import Axes3D
ax = fig.add_subplot(111, projection='3d')
ax.scatter(dtf_[dtf_["input"]==0]['x'],
           dtf_[dtf_["input"]==0]['y'],
           dtf_[dtf_["input"]==0]['z'], c="black")
ax.scatter(dtf_[dtf_["input"]==1]['x'],
           dtf_[dtf_["input"]==1]['y'],
           dtf_[dtf_["input"]==1]['z'], c="red")
ax.set(xlabel=None, ylabel=None, zlabel=None, xticklabels=[],
       yticklabels=[], zticklabels=[])
## annotate each point with its word
for label, row in dtf_[["x","y","z"]].iterrows():
    x, y, z = row
    ax.text(x, y, z, s=label)
|
| 310 |
+
|
| 311 |
+
"""Instead of using a pre-trained model, I am going to fit my own Word2Vec on the training data corpus with gensim. Before fitting the model, the corpus needs to be transformed into a list of lists of n-grams. In this particular case, I’ll try to capture unigrams (“york”), bigrams (“new york”), and trigrams (“new york city”)."""

## split dataset (70/30; no random_state — resplits differently each run)
dtf_train, dtf_test = train_test_split(df_all, test_size=0.3)
## get target
y_train = dtf_train["label"].values
y_test = dtf_test["label"].values

corpus = []
corpus = [x for x in dtf_train['clean_text']]

## create list of lists of unigrams (each tweet -> list of tokens)
lst_corpus = []
for string in corpus:
    lst_words = str(string).split()
    lst_grams = [" ".join(lst_words[i:i+1])
                 for i in range(0, len(lst_words), 1)]
    lst_corpus.append(lst_grams)

## detect bigrams and trigrams; phrases are space-joined. The bytes
## delimiter matches this gensim version's Phrases API — TODO confirm
## against the pinned gensim release.
bigrams_detector = gensim.models.phrases.Phrases(lst_corpus,
                 delimiter=" ".encode(), min_count=5, threshold=10)
bigrams_detector = gensim.models.phrases.Phraser(bigrams_detector)
trigrams_detector = gensim.models.phrases.Phrases(bigrams_detector[lst_corpus],
                  delimiter=" ".encode(), min_count=5, threshold=10)
trigrams_detector = gensim.models.phrases.Phraser(trigrams_detector)

"""When fitting the Word2Vec, you need to specify:

* the target size of the word vectors, I’ll use 300;
* the window, or the maximum distance between the current and predicted word within a sentence, I’ll use the mean length of text in the corpus;
* the training algorithm, I’ll use skip-grams (sg=1) as in general it has better results.
"""

## fit w2v (size/iter are gensim-3 parameter names; gensim 4 renamed
## them to vector_size/epochs — confirm the pinned version)
nlp = gensim.models.word2vec.Word2Vec(lst_corpus, size=300,
            window=8, min_count=1, sg=1, iter=30)

"""We have our embedding model, so we can select any word from the corpus and transform it into a vector."""

word = "anxiety"
nlp[word].shape
|
| 353 |
+
|
| 354 |
+
"""We can even use it to visualize a word and its context into a smaller dimensional space (2D or 3D) by applying any dimensionality reduction algorithm (i.e. TSNE)."""

word = "anxiety"
fig = plt.figure()
## word embedding: the query word plus its 20 nearest neighbours in
## the freshly fitted model
tot_words = [word] + [tupla[0] for tupla in
                 nlp.most_similar(word, topn=20)]
X = nlp[tot_words]
## pca to reduce dimensionality from 300 to 3
## (despite the variable name, this is t-SNE with PCA initialisation)
pca = manifold.TSNE(perplexity=40, n_components=3, init='pca')
X = pca.fit_transform(X)
## create dtf: one row per word; input==1 flags the query word
dtf_ = pd.DataFrame(X, index=tot_words, columns=["x","y","z"])
dtf_["input"] = 0
# NOTE(review): chained assignment — may raise SettingWithCopyWarning
# on newer pandas; `.loc` would be safer. Verify before upgrading.
dtf_["input"].iloc[0:1] = 1
## plot 3d: neighbours in black, the query word in red
from mpl_toolkits.mplot3d import Axes3D
ax = fig.add_subplot(111, projection='3d')
ax.scatter(dtf_[dtf_["input"]==0]['x'],
           dtf_[dtf_["input"]==0]['y'],
           dtf_[dtf_["input"]==0]['z'], c="black")
ax.scatter(dtf_[dtf_["input"]==1]['x'],
           dtf_[dtf_["input"]==1]['y'],
           dtf_[dtf_["input"]==1]['z'], c="red")
ax.set(xlabel=None, ylabel=None, zlabel=None, xticklabels=[],
       yticklabels=[], zticklabels=[])
## annotate each point with its word
for label, row in dtf_[["x","y","z"]].iterrows():
    x, y, z = row
    ax.text(x, y, z, s=label)
|
| 383 |
+
|
| 384 |
+
"""The word vectors can be used in a neural network as weights in the following procedure:
1. Transform the corpus into padded sequences of word ids to get a feature matrix.
2. Create an embedding matrix so that the vector of the word with id N is located at the Nth row.
3. Build a neural network with an embedding layer that weighs every word in the sequences with the corresponding vector.

**Feature Engineering:** by transforming the same preprocessed corpus (list of lists of n-grams) given to the Word2Vec into a list of sequences using tensorflow/keras:
"""

## tokenize text; out-of-vocabulary words are mapped to the "NaN" token
tokenizer = kprocessing.text.Tokenizer(lower=True, split=' ',
                     oov_token="NaN",
                     filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(lst_corpus)
dic_vocabulary = tokenizer.word_index

## create sequence of word ids per tweet
lst_text2seq= tokenizer.texts_to_sequences(lst_corpus)

## padding sequence: every sequence becomes exactly 35 ids, padded or
## truncated at the end ("post")
X_train = kprocessing.sequence.pad_sequences(lst_text2seq,
                    maxlen=35, padding="post", truncating="post")

X_train.shape

"""The feature matrix X_train has a shape of 16559 x 35 (Number of sequences x Sequences max length). Let’s visualize it:"""

# Heatmap of the padding (zeros) pattern across the sequences
sns.heatmap(X_train==0, vmin=0, vmax=1, cbar=False)
plt.show()
|
| 412 |
+
|
| 413 |
+
"""Every text in the corpus is now an id sequence with length 35. For instance, if a text had 20 tokens in it, then the sequence is composed of 20 ids + 15 0s, which is the padding element (while the id for word not in the vocabulary is 1)

Let’s print how a text from the train set has been transformed into a sequence with the padding and the vocabulary.
"""

# Inspect one arbitrary training example end to end
i = 8

## list of text: ["I like this", ...]
len_txt = len(dtf_train["clean_text"].iloc[i].split())
print("from: ", dtf_train["clean_text"].iloc[i], "| len:", len_txt)

## sequence of token ids: [[1, 2, 3], ...]
len_tokens = len(X_train[i])
print("to: ", X_train[i], "| len:", len(X_train[i]))

## vocabulary: {"I":1, "like":2, "this":3, ...} — confirm the first
## word of the text maps to the id shown in the sequence
print("check: ", dtf_train["clean_text"].iloc[i].split()[0],
      " -- idx in vocabulary -->",
      dic_vocabulary[dtf_train["clean_text"].iloc[i].split()[0]])

print("vocabulary: ", dict(list(dic_vocabulary.items())[0:5]), "... (padding element, 0)")
|
| 434 |
+
|
| 435 |
+
# Apply the same feature engineering to the held-out test split
corpus = dtf_test["clean_text"]

## create list of n-grams (unigram token lists, as for training)
lst_corpus = []
for string in corpus:
    lst_words = str(string).split()
    lst_grams = [" ".join(lst_words[i:i+1]) for i in range(0,
                 len(lst_words), 1)]
    lst_corpus.append(lst_grams)

## detect common bigrams and trigrams using the fitted detectors
lst_corpus = list(bigrams_detector[lst_corpus])
lst_corpus = list(trigrams_detector[lst_corpus])
## text to sequence with the fitted tokenizer (no refitting on test!)
lst_text2seq = tokenizer.texts_to_sequences(lst_corpus)

## padding sequence (same length and strategy as the training matrix)
X_test = kprocessing.sequence.pad_sequences(lst_text2seq, maxlen=35,
             padding="post", truncating="post")

X_test.shape

# Visualise the padding pattern on the test matrix
sns.heatmap(X_test==0, vmin=0, vmax=1, cbar=False)
plt.show()
|
| 459 |
+
|
| 460 |
+
"""We’ve got our X_train and X_test, now we need to create the embedding matrix that will be used as a weight matrix in the neural network."""

## start the matrix (length of vocabulary x vector size) with all 0s;
## row 0 stays zero for the padding id.
embeddings = np.zeros((len(dic_vocabulary)+1, 300))
for word,idx in dic_vocabulary.items():
    ## update the row with the word's vector
    try:
        embeddings[idx] = nlp[word]
    ## if word not in model then skip and the row stays all 0s.
    ## Fixed: catch KeyError (what gensim raises for out-of-vocabulary
    ## words) instead of a bare `except:`, which also swallowed
    ## KeyboardInterrupt/SystemExit and any real bug.
    except KeyError:
        pass

embeddings.shape

"""That code generates a matrix of shape 20,050 x 300 (Length of vocabulary extracted from the corpus x Vector size). It can be navigated by word id, which can be obtained from the vocabulary."""

word = "anxiety"
print("dic[word]:", dic_vocabulary[word], "|idx")
print("embeddings[idx]:", embeddings[dic_vocabulary[word]].shape,
      "|vector")
|
| 480 |
+
|
| 481 |
+
"""### Deep Learning:
|
| 482 |
+
|
| 483 |
+
It’s finally time to build a deep learning model. I’m going to use the embedding matrix in the first Embedding layer of the neural network that I will build and train to classify the news. Each id in the input sequence will be used as the index to access the embedding matrix. The output of this Embedding layer will be a 2D matrix with a word vector for each word id in the input sequence (Sequence length x Vector size). Let’s use the sentence “I like this article” as an example:
|
| 484 |
+
|
| 485 |
+
My neural network shall be structured as follows:
|
| 486 |
+
|
| 487 |
+
* An Embedding layer that takes the sequences as input and the word vectors as weights, just as described before.
|
| 488 |
+
|
| 489 |
+
* A simple Attention layer that won’t affect the predictions but it’s going to capture the weights of each instance and allow us to build a nice explainer (it isn't necessary for the predictions, just for the explainability, so you can skip it).
|
| 490 |
+
|
| 491 |
+
* Two layers of Bidirectional LSTM to model the order of words in a sequence in both directions.
|
| 492 |
+
|
| 493 |
+
* Two final dense layers that will predict the probability of each category.
|
| 494 |
+
"""
|
| 495 |
+
|
| 496 |
+
## code attention layer
def attention_layer(inputs, neurons):
    """Soft attention over the time axis.

    Learns a softmax weight per timestep and rescales ``inputs`` by
    it; the weights are exposed via the layer named "attention" so
    they can be inspected later for explainability.
    """
    transposed = layers.Permute((2,1))(inputs)
    scores = layers.Dense(neurons, activation="softmax")(transposed)
    weights = layers.Permute((2,1), name="attention")(scores)
    return layers.multiply([inputs, weights])
|
| 503 |
+
|
| 504 |
+
## input: padded id sequences of length 35 (assumed from the tokenizer
## step earlier in this notebook -- TODO confirm)
x_in = layers.Input(shape=(35,))
## embedding: frozen (trainable=False) lookup initialised with the
## pre-computed word-vector matrix; rows are indexed by word id
x = layers.Embedding(input_dim=embeddings.shape[0],
                     output_dim=embeddings.shape[1],
                     weights=[embeddings],
                     input_length=35, trainable=False)(x_in)
## apply attention (does not change tensor shape; used for explainability)
x = attention_layer(x, neurons=35)
## 2 layers of bidirectional lstm; the first returns the full sequence so
## the second LSTM can consume it step by step
x = layers.Bidirectional(layers.LSTM(units=35, dropout=0.2,
                                     return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(units=35, dropout=0.2))(x)
## final dense layers: ReLU hidden layer, then a single sigmoid unit for
## binary (depressive / non-depressive) probability
x = layers.Dense(64, activation='relu')(x)
y_out = layers.Dense(1, activation='sigmoid')(x)
## compile with binary cross-entropy to match the sigmoid output
model = models.Model(x_in, y_out)
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])

model.summary()
|
| 526 |
+
|
| 527 |
+
## encode y: map the (possibly non-numeric) labels to contiguous integer
## ids 0..k-1; dic_y_mapping keeps id -> original label for decoding later
dic_y_mapping = {n:label for n,label in
                 enumerate(np.unique(y_train))}
inverse_dic = {v:k for k,v in dic_y_mapping.items()}
y_train = np.array([inverse_dic[y] for y in y_train])
## train silently (verbose=0); 30% of the training data is held out as the
## validation split used by the plots below
training = model.fit(x=X_train, y=y_train, batch_size=256,
                     epochs=30, shuffle=True, verbose=0,
                     validation_split=0.3)
|
| 536 |
+
|
| 537 |
+
## plot loss and accuracy
## Two side-by-side panels (training / validation); each panel draws the
## loss in black on the left axis and every non-loss metric (e.g. accuracy)
## on a twinned right axis.
metrics = [k for k in training.history.keys() if ("loss" not in k) and ("val" not in k)]
fig, ax = plt.subplots(nrows=1, ncols=2, sharey=True)

## left panel: training curves
ax[0].set(title="Training")
ax11 = ax[0].twinx()
ax[0].plot(training.history['loss'], color='black')
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('Loss', color='black')
for metric in metrics:
    ax11.plot(training.history[metric], label=metric)
ax11.set_ylabel("Score", color='steelblue')
ax11.legend()

## right panel: validation curves
ax[1].set(title="Validation")
ax22 = ax[1].twinx()
ax[1].plot(training.history['val_loss'], color='black')
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Loss', color='black')
for metric in metrics:
    ax22.plot(training.history['val_'+metric], label=metric)
ax22.set_ylabel("Score", color="steelblue")
ax22.legend()  # fix: was missing -- keeps both panels labelled like ax11
## save before show() so the figure is not cleared first (Colab path)
plt.savefig('/content/drive/MyDrive/NLP/Depression_Detection/modeling/loss_accuracy_LSTM_3.png')
plt.show()
|
| 559 |
+
|
| 560 |
+
# serialize model to JSON (architecture only; weights go in the .h5 below)
model_json = model.to_json()
with open("/content/drive/MyDrive/NLP/Depression_Detection/modeling/model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("/content/drive/MyDrive/NLP/Depression_Detection/modeling/model.h5")
print("Saved model to disk")

# Reload the architecture. Fixes over the original:
#  - the JSON is read inside a `with` block instead of a bare
#    open(...).read(), which leaked the file handle;
#  - the stray `json_file.close()` (a no-op on the handle already closed
#    by the writing `with` above, and a NameError if this cell is run
#    standalone) is removed.
with open("/content/drive/MyDrive/NLP/Depression_Detection/modeling/model.json", "r") as json_file:
    loaded_model = model_from_json(json_file.read(),
                                   custom_objects={'tf': tf})
# load weights into new model
loaded_model.load_weights("/content/drive/MyDrive/NLP/Depression_Detection/modeling/model.h5")
print("Loaded model from disk")
|
| 574 |
+
|
| 575 |
+
## Evaluate on the held-out test set: flatten the (n, 1) sigmoid outputs
## and round them to hard 0/1 labels before scoring.
labels_pred = np.round(model.predict(X_test).flatten())
accuracy = accuracy_score(y_test, labels_pred)
classes = np.unique(y_test)  # label set, kept at module level for reuse below
print("Accuracy: %.2f%%" % (accuracy*100))
|
| 580 |
+
|
| 581 |
+
def conf_matrix_acc2(y_true, y_pred):
    """Plot a confusion matrix and print accuracy plus a classification report.

    Parameters
    ----------
    y_true : array-like of ground-truth labels
    y_pred : array-like of predicted labels (same length as y_true)

    Fix over the original: the body now uses the y_true/y_pred parameters
    throughout; it previously read the globals `y_test` and `classes`,
    silently ignoring `y_true` and breaking any call with other data.
    """
    labels = np.unique(y_true)
    ## Plot confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues,
                cbar=False)
    ax.set(xlabel="Pred", ylabel="True", xticklabels=labels,
           yticklabels=labels, title="Confusion matrix")
    plt.yticks(rotation=0)
    print("=========================================")
    print(f'Accuracy score is : {accuracy_score(y_true, y_pred)}')
    print("=========================================")
    print("Detail:")
    print(skm.classification_report(y_true, y_pred))

conf_matrix_acc2(y_test, labels_pred)
|
| 597 |
+
|
| 598 |
+
# classes = np.unique(y_test)
|
| 599 |
+
# y_test_array = pd.get_dummies(y_test, drop_first=False).values
|
| 600 |
+
# predicted_prob = model.predict_on_batch(X_test)
|
| 601 |
+
|
| 602 |
+
# ## Plot ROC and precision-recall curve
|
| 603 |
+
# def roc_precision_auc2():
|
| 604 |
+
# fig, ax = plt.subplots(nrows=1, ncols=2)
|
| 605 |
+
# ## Plot roc
|
| 606 |
+
# for i in range(len(classes)):
|
| 607 |
+
# fpr, tpr, thresholds = skm.roc_curve(y_test_array[:,i],
|
| 608 |
+
# predicted_prob[:,i])
|
| 609 |
+
# ax[0].plot(fpr, tpr, lw=3,
|
| 610 |
+
# label='{0} (area={1:0.2f})'.format(classes[i],
|
| 611 |
+
# skm.auc(fpr, tpr))
|
| 612 |
+
# )
|
| 613 |
+
# ax[0].plot([0,1], [0,1], color='navy', lw=3, linestyle='--')
|
| 614 |
+
# ax[0].set(xlim=[-0.05,1.0], ylim=[0.0,1.05],
|
| 615 |
+
# xlabel='False Positive Rate',
|
| 616 |
+
# ylabel="True Positive Rate (Recall)",
|
| 617 |
+
# title="Receiver operating characteristic")
|
| 618 |
+
# ax[0].legend(loc="lower right")
|
| 619 |
+
# ax[0].grid(True)
|
| 620 |
+
|
| 621 |
+
# ## Plot precision-recall curve
|
| 622 |
+
# for i in range(len(classes)):
|
| 623 |
+
# precision, recall, thresholds = skm.precision_recall_curve(
|
| 624 |
+
# y_test_array[:,i], probs[:,i])
|
| 625 |
+
# ax[1].plot(recall, precision, lw=3,
|
| 626 |
+
# label='{0} (area={1:0.2f})'.format(classes[i],
|
| 627 |
+
# skm.auc(recall, precision))
|
| 628 |
+
# )
|
| 629 |
+
# ax[1].set(xlim=[0.0,1.05], ylim=[0.0,1.05], xlabel='Recall',
|
| 630 |
+
# ylabel="Precision", title="Precision-Recall curve")
|
| 631 |
+
# ax[1].legend(loc="best")
|
| 632 |
+
# ax[1].grid(True)
|
| 633 |
+
# plt.show()
|
| 634 |
+
# #plt.savefig('/content/drive/MyDrive/NLP/Depression_Detection/modeling/ROC_Precision_LR.png')
|
| 635 |
+
# #plt.savefig('/content/drive/MyDrive/NLP/Depression_Detection/modeling/ROC_Precision_LSTM.png')
|
| 636 |
+
# ## AUC score
|
| 637 |
+
# print(f'AUC score is : {skm.roc_auc_score(y_test, probs[:,1])}')
|
source_code/notebooks/testing.py
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""testing.ipynb
|
| 3 |
+
|
| 4 |
+
Automatically generated by Colaboratory.
|
| 5 |
+
|
| 6 |
+
Original file is located at
|
| 7 |
+
https://colab.research.google.com/drive/1MCstbEJ_U20yRJDGRmZTjIpGTCzTFL_o
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from google.colab import drive
|
| 11 |
+
drive.mount('/content/drive')
|
| 12 |
+
|
| 13 |
+
!pip install -qqq ftfy
|
| 14 |
+
|
| 15 |
+
!pip install -qqq json_file
|
| 16 |
+
|
| 17 |
+
!python -m spacy download en_core_web_lg
|
| 18 |
+
|
| 19 |
+
!pip install -U SpaCy==2.2.0
|
| 20 |
+
|
| 21 |
+
## Import required libraries
|
| 22 |
+
|
| 23 |
+
## warnings
|
| 24 |
+
import warnings
|
| 25 |
+
warnings.filterwarnings("ignore")
|
| 26 |
+
|
| 27 |
+
## for data
|
| 28 |
+
import numpy as np
|
| 29 |
+
import pandas as pd
|
| 30 |
+
|
| 31 |
+
## for plotting
|
| 32 |
+
import matplotlib.pyplot as plt
|
| 33 |
+
import seaborn as sns
|
| 34 |
+
|
| 35 |
+
## Bag of Words
|
| 36 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
| 37 |
+
|
| 38 |
+
## TF-IDF
|
| 39 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 40 |
+
|
| 41 |
+
## Train-Test Split
|
| 42 |
+
from sklearn.model_selection import train_test_split
|
| 43 |
+
|
| 44 |
+
## for processing
|
| 45 |
+
import nltk
|
| 46 |
+
import re
|
| 47 |
+
import ftfy
|
| 48 |
+
from nltk.stem import WordNetLemmatizer
|
| 49 |
+
from nltk.corpus import stopwords
|
| 50 |
+
nltk.download('stopwords')
|
| 51 |
+
nltk.download('punkt')
|
| 52 |
+
nltk.download('wordnet')
|
| 53 |
+
nltk.download('averaged_perceptron_tagger')
|
| 54 |
+
|
| 55 |
+
## Feature selection
|
| 56 |
+
from sklearn import feature_selection
|
| 57 |
+
|
| 58 |
+
## Support vector machine
|
| 59 |
+
from sklearn.pipeline import Pipeline
|
| 60 |
+
import sklearn.metrics as skm
|
| 61 |
+
from sklearn.metrics import confusion_matrix, accuracy_score
|
| 62 |
+
from sklearn.svm import SVC
|
| 63 |
+
|
| 64 |
+
## for saving and loading model
|
| 65 |
+
import pickle
|
| 66 |
+
|
| 67 |
+
## for word embedding with Spacy
|
| 68 |
+
import spacy
|
| 69 |
+
import en_core_web_lg
|
| 70 |
+
|
| 71 |
+
# ## for word embedding
|
| 72 |
+
# import gensim
|
| 73 |
+
# import gensim.downloader as gensim_api
|
| 74 |
+
# from gensim.models import Word2Vec
|
| 75 |
+
# from gensim.models import KeyedVectors
|
| 76 |
+
# from keras.preprocessing.text import Tokenizer
|
| 77 |
+
# from keras.preprocessing.sequence import pad_sequences
|
| 78 |
+
|
| 79 |
+
# ## for deep learning
|
| 80 |
+
# from keras.models import load_model
|
| 81 |
+
# from keras.models import Model, Sequential
|
| 82 |
+
# from keras.callbacks import EarlyStopping, ModelCheckpoint
|
| 83 |
+
# from keras.layers import Conv1D, Dense, Input, LSTM, Embedding, Dropout, Activation, MaxPooling1D
|
| 84 |
+
# from tensorflow.keras import models, layers, preprocessing as kprocessing
|
| 85 |
+
# from tensorflow.keras import backend as K
|
| 86 |
+
# from keras.models import model_from_json
|
| 87 |
+
# from keras.layers import Lambda
|
| 88 |
+
# import tensorflow as tf
|
| 89 |
+
# import json
|
| 90 |
+
# import json_file
|
| 91 |
+
|
| 92 |
+
# Expand Contraction
|
| 93 |
+
cList = {
|
| 94 |
+
"ain't": "am not",
|
| 95 |
+
"aren't": "are not",
|
| 96 |
+
"can't": "cannot",
|
| 97 |
+
"can't've": "cannot have",
|
| 98 |
+
"'cause": "because",
|
| 99 |
+
"could've": "could have",
|
| 100 |
+
"couldn't": "could not",
|
| 101 |
+
"couldn't've": "could not have",
|
| 102 |
+
"didn't": "did not",
|
| 103 |
+
"doesn't": "does not",
|
| 104 |
+
"don't": "do not",
|
| 105 |
+
"hadn't": "had not",
|
| 106 |
+
"hadn't've": "had not have",
|
| 107 |
+
"hasn't": "has not",
|
| 108 |
+
"haven't": "have not",
|
| 109 |
+
"he'd": "he would",
|
| 110 |
+
"he'd've": "he would have",
|
| 111 |
+
"he'll": "he will",
|
| 112 |
+
"he'll've": "he will have",
|
| 113 |
+
"he's": "he is",
|
| 114 |
+
"how'd": "how did",
|
| 115 |
+
"how'd'y": "how do you",
|
| 116 |
+
"how'll": "how will",
|
| 117 |
+
"how's": "how is",
|
| 118 |
+
"I'd": "I would",
|
| 119 |
+
"I'd've": "I would have",
|
| 120 |
+
"I'll": "I will",
|
| 121 |
+
"I'll've": "I will have",
|
| 122 |
+
"I'm": "I am",
|
| 123 |
+
"I've": "I have",
|
| 124 |
+
"isn't": "is not",
|
| 125 |
+
"it'd": "it had",
|
| 126 |
+
"it'd've": "it would have",
|
| 127 |
+
"it'll": "it will",
|
| 128 |
+
"it'll've": "it will have",
|
| 129 |
+
"it's": "it is",
|
| 130 |
+
"let's": "let us",
|
| 131 |
+
"ma'am": "madam",
|
| 132 |
+
"mayn't": "may not",
|
| 133 |
+
"might've": "might have",
|
| 134 |
+
"mightn't": "might not",
|
| 135 |
+
"mightn't've": "might not have",
|
| 136 |
+
"must've": "must have",
|
| 137 |
+
"mustn't": "must not",
|
| 138 |
+
"mustn't've": "must not have",
|
| 139 |
+
"needn't": "need not",
|
| 140 |
+
"needn't've": "need not have",
|
| 141 |
+
"o'clock": "of the clock",
|
| 142 |
+
"oughtn't": "ought not",
|
| 143 |
+
"oughtn't've": "ought not have",
|
| 144 |
+
"shan't": "shall not",
|
| 145 |
+
"sha'n't": "shall not",
|
| 146 |
+
"shan't've": "shall not have",
|
| 147 |
+
"she'd": "she would",
|
| 148 |
+
"she'd've": "she would have",
|
| 149 |
+
"she'll": "she will",
|
| 150 |
+
"she'll've": "she will have",
|
| 151 |
+
"she's": "she is",
|
| 152 |
+
"should've": "should have",
|
| 153 |
+
"shouldn't": "should not",
|
| 154 |
+
"shouldn't've": "should not have",
|
| 155 |
+
"so've": "so have",
|
| 156 |
+
"so's": "so is",
|
| 157 |
+
"that'd": "that would",
|
| 158 |
+
"that'd've": "that would have",
|
| 159 |
+
"that's": "that is",
|
| 160 |
+
"there'd": "there had",
|
| 161 |
+
"there'd've": "there would have",
|
| 162 |
+
"there's": "there is",
|
| 163 |
+
"they'd": "they would",
|
| 164 |
+
"they'd've": "they would have",
|
| 165 |
+
"they'll": "they will",
|
| 166 |
+
"they'll've": "they will have",
|
| 167 |
+
"they're": "they are",
|
| 168 |
+
"they've": "they have",
|
| 169 |
+
"to've": "to have",
|
| 170 |
+
"wasn't": "was not",
|
| 171 |
+
"we'd": "we had",
|
| 172 |
+
"we'd've": "we would have",
|
| 173 |
+
"we'll": "we will",
|
| 174 |
+
"we'll've": "we will have",
|
| 175 |
+
"we're": "we are",
|
| 176 |
+
"we've": "we have",
|
| 177 |
+
"weren't": "were not",
|
| 178 |
+
"what'll": "what will",
|
| 179 |
+
"what'll've": "what will have",
|
| 180 |
+
"what're": "what are",
|
| 181 |
+
"what's": "what is",
|
| 182 |
+
"what've": "what have",
|
| 183 |
+
"when's": "when is",
|
| 184 |
+
"when've": "when have",
|
| 185 |
+
"where'd": "where did",
|
| 186 |
+
"where's": "where is",
|
| 187 |
+
"where've": "where have",
|
| 188 |
+
"who'll": "who will",
|
| 189 |
+
"who'll've": "who will have",
|
| 190 |
+
"who's": "who is",
|
| 191 |
+
"who've": "who have",
|
| 192 |
+
"why's": "why is",
|
| 193 |
+
"why've": "why have",
|
| 194 |
+
"will've": "will have",
|
| 195 |
+
"won't": "will not",
|
| 196 |
+
"won't've": "will not have",
|
| 197 |
+
"would've": "would have",
|
| 198 |
+
"wouldn't": "would not",
|
| 199 |
+
"wouldn't've": "would not have",
|
| 200 |
+
"y'all": "you all",
|
| 201 |
+
"y'alls": "you alls",
|
| 202 |
+
"y'all'd": "you all would",
|
| 203 |
+
"y'all'd've": "you all would have",
|
| 204 |
+
"y'all're": "you all are",
|
| 205 |
+
"y'all've": "you all have",
|
| 206 |
+
"you'd": "you had",
|
| 207 |
+
"you'd've": "you would have",
|
| 208 |
+
"you'll": "you you will",
|
| 209 |
+
"you'll've": "you you will have",
|
| 210 |
+
"you're": "you are",
|
| 211 |
+
"you've": "you have"
|
| 212 |
+
}
|
| 213 |
+
|
| 214 |
+
## Single alternation pattern matching any contraction key of cList.
c_re = re.compile('(%s)' % '|'.join(cList.keys()))

def expandContractions(text, c_re=c_re):
    """Return *text* with every contraction replaced by its cList expansion.

    Matching is verbatim against cList keys, so text is assumed to already
    be in the casing the keys use (callers lower-case first -- TODO confirm).
    """
    return c_re.sub(lambda match: cList[match.group(0)], text)
|
| 220 |
+
|
| 221 |
+
## Function to perform stepwise cleaning process
def tweets_cleaner(tweet):
    """Normalise one raw tweet for the classifier.

    Returns a list containing the cleaned tweet, or an empty list when the
    tweet is dropped (starts with a URL, or is 5 characters or shorter).
    Cleaning steps, in order: lower-case; strip mentions/hashtags/emoji
    tags/image links; fix mojibake; expand contractions; drop punctuation;
    remove stop words and lemmatise.
    """
    cleaned_tweets = []
    lowered = tweet.lower()  # lowercase first so later matching is uniform

    # if url links then don't append to avoid news articles
    # also check tweet length, save those > 5
    if re.match("(\w+:\/\/\S+)", lowered) == None and len(lowered) > 5:

        # remove hashtag, @mention, emoji and image URLs, collapsing the
        # leftover whitespace in the same pass
        lowered = ' '.join(re.sub("(@[A-Za-z0-9]+)|(\#[A-Za-z0-9]+)|(<Emoji:.*>)|(pic\.twitter\.com\/.*)", " ", lowered).split())

        # fix weirdly encoded texts (mojibake)
        lowered = ftfy.fix_text(lowered)

        # expand contractions ("can't" -> "cannot", ...)
        lowered = expandContractions(lowered)

        # remove punctuation: keep only alphanumerics and whitespace
        lowered = ' '.join(re.sub("([^0-9A-Za-z \t])", " ", lowered).split())

        # stop-word removal + lemmatization
        stop_words = set(stopwords.words('english'))
        word_tokens = nltk.word_tokenize(lowered)
        lemmatizer = WordNetLemmatizer()
        kept = [lemmatizer.lemmatize(token) for token in word_tokens if not token in stop_words]

        # back to a single space-joined string
        lowered = ' '.join(kept)

        cleaned_tweets.append(lowered)

    return cleaned_tweets
|
| 255 |
+
|
| 256 |
+
## Load the large English SpaCy model (provides 300-dim word vectors)
nlp = en_core_web_lg.load()

## Load the pickled SVM classifier from Google Drive
SVM = "/content/drive/MyDrive/NLP/Depression_Detection/modeling/model_svm.pkl"
with open(SVM, 'rb') as file:
    clf = pickle.load(file)

clf

test_tweet = "I hate my life"

corpus = tweets_cleaner(test_tweet)

corpus

## word-embedding: mean of the SpaCy token vectors for each cleaned tweet,
## broadcast to a 300-dim row so the result is (n_tweets, 300).
## Fix: `pd.np` (deprecated in pandas 1.0, removed in 2.0) replaced by the
## numpy module imported at the top of this file.
test = np.array([np.array([token.vector for token in nlp(s)]).mean(axis=0) * np.ones((300)) \
                 for s in corpus])

labels_pred = clf.predict(test)

labels_pred[0]

# loaded_model = model_from_json(open("/content/drive/MyDrive/NLP/Depression_Detection/modeling/model.json", "r").read(),
#                                custom_objects={'tf': tf})
# # load weights into new model
# loaded_model.load_weights("/content/drive/MyDrive/NLP/Depression_Detection/modeling/model.h5")
# print("Loaded model from disk")
|
source_code/requirements.txt
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
blis==0.4.1
|
| 2 |
+
certifi==2021.10.8
|
| 3 |
+
charset-normalizer==2.0.7
|
| 4 |
+
click==8.0.3
|
| 5 |
+
cycler==0.11.0
|
| 6 |
+
cymem==2.0.6
|
| 7 |
+
fonttools==4.28.1
|
| 8 |
+
ftfy==6.0.3
|
| 9 |
+
idna==3.3
|
| 10 |
+
joblib==1.1.0
|
| 11 |
+
kiwisolver==1.3.2
|
| 12 |
+
matplotlib==3.5.0
|
| 13 |
+
murmurhash==1.0.6
|
| 14 |
+
nltk==3.6.5
|
| 15 |
+
numpy==1.21.4
|
| 16 |
+
packaging==21.2
|
| 17 |
+
pandas==1.3.4
|
| 18 |
+
Pillow==8.4.0
|
| 19 |
+
plac==0.9.6
|
| 20 |
+
preshed==3.0.6
|
| 21 |
+
pyparsing==2.4.7
|
| 22 |
+
python-dateutil==2.8.2
|
| 23 |
+
pytz==2021.3
|
| 24 |
+
regex==2021.11.10
|
| 25 |
+
requests==2.26.0
|
| 26 |
+
scikit-learn==1.0.1
|
| 27 |
+
scipy==1.7.2
|
| 28 |
+
seaborn==0.11.2
|
| 29 |
+
setuptools-scm==6.3.2
|
| 30 |
+
six==1.16.0
|
| 31 |
+
spacy==2.2.0
|
| 32 |
+
srsly==1.0.5
|
| 33 |
+
thinc==7.1.1
|
| 34 |
+
threadpoolctl==3.0.0
|
| 35 |
+
tomli==1.2.2
|
| 36 |
+
tqdm==4.62.3
|
| 37 |
+
urllib3==1.26.7
|
| 38 |
+
wasabi==0.8.2
|
| 39 |
+
wcwidth==0.2.5
|
| 40 |
+
wordcloud==1.8.1
|
| 41 |
+
Flask==1.1.2
|
| 42 |
+
Flask-Bootstrap==3.3.7.1
|
source_code/static/brain.svg
ADDED
|
|
source_code/static/overlay.css
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Cinematic Overlay Styles */
|
| 2 |
+
.cinematic-overlay {
|
| 3 |
+
position: fixed;
|
| 4 |
+
top: 0;
|
| 5 |
+
left: 0;
|
| 6 |
+
width: 100vw;
|
| 7 |
+
height: 100vh;
|
| 8 |
+
background: rgba(29, 161, 242, 0.98);
|
| 9 |
+
/* Classic Twitter Blue #1DA1F2 */
|
| 10 |
+
z-index: 9999;
|
| 11 |
+
display: none;
|
| 12 |
+
align-items: center;
|
| 13 |
+
justify-content: center;
|
| 14 |
+
flex-direction: column;
|
| 15 |
+
opacity: 0;
|
| 16 |
+
transition: opacity 0.8s ease-in-out;
|
| 17 |
+
backdrop-filter: blur(10px);
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
.cinematic-overlay.active {
|
| 21 |
+
opacity: 1;
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
.overlay-content {
|
| 25 |
+
text-align: center;
|
| 26 |
+
color: var(--clr-white);
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
.overlay-logo {
|
| 30 |
+
font-size: 7rem;
|
| 31 |
+
color: #ffffff;
|
| 32 |
+
margin-bottom: 30px;
|
| 33 |
+
opacity: 0;
|
| 34 |
+
transform: scale(0.5);
|
| 35 |
+
transition: all 1s cubic-bezier(0.175, 0.885, 0.32, 1.275);
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
.cinematic-overlay.active .overlay-logo {
|
| 39 |
+
opacity: 1;
|
| 40 |
+
transform: scale(1);
|
| 41 |
+
filter: drop-shadow(0 0 20px rgba(29, 174, 255, 0.6));
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
.overlay-title {
|
| 45 |
+
font-family: 'Play', sans-serif;
|
| 46 |
+
font-size: 3rem;
|
| 47 |
+
font-weight: 700;
|
| 48 |
+
letter-spacing: 2px;
|
| 49 |
+
margin-bottom: 10px;
|
| 50 |
+
opacity: 0;
|
| 51 |
+
transform: translateY(20px);
|
| 52 |
+
transition: all 0.8s ease 0.4s;
|
| 53 |
+
color: #ffffff;
|
| 54 |
+
/* Solid white for max contrast */
|
| 55 |
+
text-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
.cinematic-overlay.active .overlay-title {
|
| 59 |
+
opacity: 1;
|
| 60 |
+
transform: translateY(0);
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
.overlay-author {
|
| 64 |
+
font-family: 'Play', sans-serif;
|
| 65 |
+
font-size: 1rem;
|
| 66 |
+
font-size: 1rem;
|
| 67 |
+
color: #ffffff;
|
| 68 |
+
/* Solid white */
|
| 69 |
+
text-shadow: 0 1px 2px rgba(0, 0, 0, 0.2);
|
| 70 |
+
text-transform: uppercase;
|
| 71 |
+
letter-spacing: 3px;
|
| 72 |
+
margin-top: 40px;
|
| 73 |
+
margin-bottom: 15px;
|
| 74 |
+
opacity: 0;
|
| 75 |
+
transition: opacity 1s ease 0.8s;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
.cinematic-overlay.active .overlay-author {
|
| 79 |
+
opacity: 1;
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
.overlay-names {
|
| 83 |
+
font-size: 1.5rem;
|
| 84 |
+
font-weight: 300;
|
| 85 |
+
opacity: 0;
|
| 86 |
+
transform: scale(0.9);
|
| 87 |
+
transition: all 1s ease 1s;
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
.cinematic-overlay.active .overlay-names {
|
| 91 |
+
opacity: 1;
|
| 92 |
+
transform: scale(1);
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
.overlay-names .separator {
|
| 96 |
+
color: #ffffff;
|
| 97 |
+
margin: 0 15px;
|
| 98 |
+
font-weight: 700;
|
| 99 |
+
}
|
source_code/static/security.js
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
 * security.js
 * Implements low-level security features and Easter eggs.
 *
 * NOTE(review): these client-side guards only deter casual users; they are
 * trivially bypassed (e.g. devtools opened from the browser menu, or JS
 * disabled) and must not be treated as a real security boundary.
 */

// Disable Right Click (suppresses the context menu everywhere on the page)
document.addEventListener('contextmenu', function (e) {
    e.preventDefault();
});

// Disable F12, Ctrl+Shift+I, Ctrl+Shift+J, Ctrl+U
// NOTE(review): e.keyCode is deprecated in favour of e.key, but is kept
// unchanged here to preserve the original behaviour.
document.onkeydown = function (e) {
    // F12 (devtools)
    if (e.keyCode == 123) {
        return false;
    }
    // Ctrl+Shift+I / Ctrl+Shift+J (devtools / console)
    if (e.ctrlKey && e.shiftKey && (e.keyCode == 'I'.charCodeAt(0) || e.keyCode == 'J'.charCodeAt(0))) {
        return false;
    }
    // Ctrl+U (view source)
    if (e.ctrlKey && e.keyCode == 'U'.charCodeAt(0)) {
        return false;
    }
}

// Easter Egg - Console Warning
console.log("%cStop snooping around!", "color: red; font-family: sans-serif; font-size: 4.5em; font-weight: bolder; text-shadow: #000 1px 1px;");
console.log("%cThis is a project by Amey Thakur & Mega Satish.", "color: #1daeff; font-family: sans-serif; font-size: 1.5em;");
|
source_code/static/styles.css
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* ==============================================================================
|
| 2 |
+
* PROJECT: DEPRESSION-DETECTION-USING-TWEETS
|
| 3 |
+
* AUTHORS: AMEY THAKUR & MEGA SATISH
|
| 4 |
+
* GITHUB (AMEY): https://github.com/Amey-Thakur
|
| 5 |
+
* GITHUB (MEGA): https://github.com/msatmod
|
| 6 |
+
* REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
|
| 7 |
+
* RELEASE DATE: June 5, 2022
|
| 8 |
+
* LICENSE: MIT License
|
| 9 |
+
* DESCRIPTION: Global style sheet with a professional and personal design.
|
| 10 |
+
* ============================================================================== */
|
| 11 |
+
|
| 12 |
+
@import url('https://fonts.googleapis.com/css2?family=Play:wght@400;700&display=swap');
|
| 13 |
+
|
| 14 |
+
:root {
|
| 15 |
+
/* Color Palette */
|
| 16 |
+
--clr-navy: #0A192F;
|
| 17 |
+
--clr-slate: #8892B0;
|
| 18 |
+
--clr-light-slate: #A8B2D1;
|
| 19 |
+
--clr-white: #E6F1FF;
|
| 20 |
+
--clr-ivory: #F8F9FA;
|
| 21 |
+
--clr-accent: #64FFDA;
|
| 22 |
+
--clr-accent-dark: #1daeff;
|
| 23 |
+
--clr-bg: #F4F7FB;
|
| 24 |
+
|
| 25 |
+
/* Typography */
|
| 26 |
+
--font-main: 'Play', sans-serif;
|
| 27 |
+
|
| 28 |
+
/* Layout */
|
| 29 |
+
--header-height: 80px;
|
| 30 |
+
--max-width: 900px;
|
| 31 |
+
--transition: all 0.3s cubic-bezier(0.645, 0.045, 0.355, 1);
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
* {
|
| 35 |
+
box-sizing: border-box;
|
| 36 |
+
margin: 0;
|
| 37 |
+
padding: 0;
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
body {
|
| 41 |
+
background-color: var(--clr-bg);
|
| 42 |
+
color: var(--clr-navy);
|
| 43 |
+
font-family: var(--font-main);
|
| 44 |
+
line-height: 1.6;
|
| 45 |
+
overflow-x: hidden;
|
| 46 |
+
user-select: none;
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
.container {
|
| 50 |
+
max-width: var(--max-width);
|
| 51 |
+
margin: 0 auto;
|
| 52 |
+
padding: 0 40px;
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
/* Header & Typography */
|
| 56 |
+
h1,
|
| 57 |
+
h2,
|
| 58 |
+
h3 {
|
| 59 |
+
font-family: var(--font-main);
|
| 60 |
+
color: var(--clr-navy);
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
.hero-section {
|
| 64 |
+
padding: 80px 0 40px;
|
| 65 |
+
text-align: center;
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
.hero-section h1 {
|
| 69 |
+
font-size: 3rem;
|
| 70 |
+
margin-bottom: 20px;
|
| 71 |
+
font-weight: 700;
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
.hero-section p {
|
| 75 |
+
font-size: 1.2rem;
|
| 76 |
+
color: var(--clr-slate);
|
| 77 |
+
max-width: 600px;
|
| 78 |
+
margin: 0 auto;
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
/* Analysis Card */
|
| 82 |
+
.analysis-card {
|
| 83 |
+
background: white;
|
| 84 |
+
border-radius: 12px;
|
| 85 |
+
box-shadow: 0 10px 30px -15px rgba(2, 12, 27, 0.1);
|
| 86 |
+
padding: 40px;
|
| 87 |
+
margin-bottom: 30px;
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
.input-group {
|
| 91 |
+
margin-top: 30px;
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
textarea.analysis-input {
|
| 95 |
+
width: 100%;
|
| 96 |
+
min-height: 150px;
|
| 97 |
+
padding: 20px;
|
| 98 |
+
border: 2px solid #E2E8F0;
|
| 99 |
+
border-radius: 8px;
|
| 100 |
+
font-family: var(--font-main);
|
| 101 |
+
font-size: 1.1rem;
|
| 102 |
+
transition: var(--transition);
|
| 103 |
+
resize: vertical;
|
| 104 |
+
outline: none;
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
textarea.analysis-input:focus {
|
| 108 |
+
border-color: var(--clr-accent-dark);
|
| 109 |
+
box-shadow: 0 0 0 4px rgba(29, 174, 255, 0.1);
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
.btn-primary {
|
| 113 |
+
display: inline-block;
|
| 114 |
+
background-color: var(--clr-accent-dark);
|
| 115 |
+
color: white;
|
| 116 |
+
padding: 15px 35px;
|
| 117 |
+
border-radius: 6px;
|
| 118 |
+
font-weight: 600;
|
| 119 |
+
text-decoration: none;
|
| 120 |
+
border: 2px solid var(--clr-accent-dark);
|
| 121 |
+
cursor: pointer;
|
| 122 |
+
transition: var(--transition);
|
| 123 |
+
margin-top: 20px;
|
| 124 |
+
font-size: 1rem;
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
.btn-primary:hover {
|
| 128 |
+
background-color: white;
|
| 129 |
+
color: var(--clr-accent-dark);
|
| 130 |
+
transform: translateY(-3px);
|
| 131 |
+
box-shadow: 0 5px 15px rgba(29, 174, 255, 0.3);
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
/* Result Section */
|
| 135 |
+
.result-display {
|
| 136 |
+
text-align: center;
|
| 137 |
+
padding: 40px 0;
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
.result-badge {
|
| 141 |
+
display: inline-block;
|
| 142 |
+
padding: 10px 25px;
|
| 143 |
+
border-radius: 50px;
|
| 144 |
+
font-weight: 700;
|
| 145 |
+
text-transform: uppercase;
|
| 146 |
+
letter-spacing: 1px;
|
| 147 |
+
margin-top: 20px;
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
.badge-depressive {
|
| 151 |
+
background-color: #FFF5F5;
|
| 152 |
+
color: #C53030;
|
| 153 |
+
border: 1px solid #FEB2B2;
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
.badge-non-depressive {
|
| 157 |
+
background-color: #F0FFF4;
|
| 158 |
+
color: #276749;
|
| 159 |
+
border: 1px solid #9AE6B4;
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
/* Footer & Authorship */
|
| 163 |
+
footer {
|
| 164 |
+
padding: 60px 0;
|
| 165 |
+
text-align: center;
|
| 166 |
+
border-top: 1px solid #E2E8F0;
|
| 167 |
+
margin-top: 60px;
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
.authorship {
|
| 171 |
+
color: var(--clr-slate);
|
| 172 |
+
font-size: 0.9rem;
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
.authorship a {
|
| 176 |
+
color: var(--clr-accent-dark);
|
| 177 |
+
text-decoration: none;
|
| 178 |
+
font-weight: 600;
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
.authorship a:hover {
|
| 182 |
+
text-decoration: underline;
|
| 183 |
+
}
|
| 184 |
+
|
| 185 |
+
.metadata {
|
| 186 |
+
margin-top: 10px;
|
| 187 |
+
font-size: 0.8rem;
|
| 188 |
+
color: var(--clr-light-slate);
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
/* Animations */
|
| 192 |
+
@keyframes fadeIn {
|
| 193 |
+
from {
|
| 194 |
+
opacity: 0;
|
| 195 |
+
transform: translateY(20px);
|
| 196 |
+
}
|
| 197 |
+
|
| 198 |
+
to {
|
| 199 |
+
opacity: 1;
|
| 200 |
+
transform: translateY(0);
|
| 201 |
+
}
|
| 202 |
+
}
|
| 203 |
+
|
| 204 |
+
.animate-fade {
|
| 205 |
+
animation: fadeIn 0.8s ease forwards;
|
| 206 |
+
}
|
| 207 |
+
|
| 208 |
+
/* Brain Icon Animation */
|
| 209 |
+
.brain-trigger {
|
| 210 |
+
color: var(--clr-accent-dark);
|
| 211 |
+
margin-bottom: 20px;
|
| 212 |
+
cursor: pointer;
|
| 213 |
+
transition: all 0.5s ease;
|
| 214 |
+
filter: drop-shadow(0 0 5px rgba(29, 174, 255, 0.3));
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
.brain-trigger:hover {
|
| 218 |
+
animation: brainPulse 1.5s infinite ease-in-out;
|
| 219 |
+
color: #4dc4ff;
|
| 220 |
+
/* Slightly brighter on hover */
|
| 221 |
+
filter: drop-shadow(0 0 15px rgba(29, 174, 255, 0.8));
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
@keyframes brainPulse {
|
| 225 |
+
0% {
|
| 226 |
+
transform: scale(1);
|
| 227 |
+
}
|
| 228 |
+
|
| 229 |
+
50% {
|
| 230 |
+
transform: scale(1.15);
|
| 231 |
+
filter: drop-shadow(0 0 25px rgba(29, 174, 255, 0.9));
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
100% {
|
| 235 |
+
transform: scale(1);
|
| 236 |
+
}
|
| 237 |
+
}
|
source_code/static/tweet-sound.mp3
ADDED
|
Binary file (10.4 kB). View file
|
|
|
source_code/templates/404.html
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
|
| 4 |
+
<head>
|
| 5 |
+
<meta charset="UTF-8">
|
| 6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 7 |
+
<title>404 - Page Not Found | Tweet Depression Detection</title>
|
| 8 |
+
<!-- Simple Brain Icon as Favicon -->
|
| 9 |
+
<link rel="icon" href="{{url_for('.static', filename='brain.svg')}}" type="image/svg+xml">
|
| 10 |
+
|
| 11 |
+
<!-- Modern Typography & Iconography -->
|
| 12 |
+
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
|
| 13 |
+
<link rel="stylesheet" href="{{url_for('.static', filename='styles.css')}}">
|
| 14 |
+
<script src="{{url_for('.static', filename='security.js')}}"></script>
|
| 15 |
+
|
| 16 |
+
<style>
|
| 17 |
+
body {
|
| 18 |
+
overflow-x: hidden;
|
| 19 |
+
display: flex;
|
| 20 |
+
flex-direction: column;
|
| 21 |
+
min-height: 100vh;
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
.error-container {
|
| 25 |
+
flex: 1;
|
| 26 |
+
display: flex;
|
| 27 |
+
flex-direction: column;
|
| 28 |
+
justify-content: center;
|
| 29 |
+
align-items: center;
|
| 30 |
+
text-align: center;
|
| 31 |
+
padding: 20px;
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
.error-code {
|
| 35 |
+
font-size: 10rem;
|
| 36 |
+
color: var(--clr-accent-dark);
|
| 37 |
+
font-weight: 700;
|
| 38 |
+
margin: 0;
|
| 39 |
+
line-height: 1;
|
| 40 |
+
position: relative;
|
| 41 |
+
animation: glitch 1s infinite alternate-reverse;
|
| 42 |
+
text-shadow: 2px 2px 0 var(--clr-accent), -2px -2px 0 #ff0055;
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
.error-message {
|
| 46 |
+
font-size: 1.5rem;
|
| 47 |
+
color: var(--clr-slate);
|
| 48 |
+
margin-top: 10px;
|
| 49 |
+
margin-bottom: 40px;
|
| 50 |
+
letter-spacing: 1px;
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
@keyframes glitch {
|
| 54 |
+
0% {
|
| 55 |
+
transform: skew(0deg);
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
20% {
|
| 59 |
+
transform: skew(-2deg);
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
40% {
|
| 63 |
+
transform: skew(2deg);
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
60% {
|
| 67 |
+
transform: skew(-1deg);
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
80% {
|
| 71 |
+
transform: skew(3deg);
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
100% {
|
| 75 |
+
transform: skew(0deg);
|
| 76 |
+
}
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
.broken-icon {
|
| 80 |
+
font-size: 5rem;
|
| 81 |
+
color: var(--clr-light-slate);
|
| 82 |
+
margin-bottom: 20px;
|
| 83 |
+
animation: float 3s ease-in-out infinite;
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
@keyframes float {
|
| 87 |
+
0% {
|
| 88 |
+
transform: translateY(0px) rotate(0deg);
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
50% {
|
| 92 |
+
transform: translateY(-15px) rotate(5deg);
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
100% {
|
| 96 |
+
transform: translateY(0px) rotate(0deg);
|
| 97 |
+
}
|
| 98 |
+
}
|
| 99 |
+
</style>
|
| 100 |
+
</head>
|
| 101 |
+
|
| 102 |
+
<body>
|
| 103 |
+
|
| 104 |
+
<header class="hero-section container animate-fade">
|
| 105 |
+
<i class="fas fa-brain fa-3x" style="color: #1daeff; margin-bottom: 20px;"></i>
|
| 106 |
+
<h1>Tweet Depression Detection</h1>
|
| 107 |
+
</header>
|
| 108 |
+
|
| 109 |
+
<main class="container animate-fade" style="animation-delay: 0.2s;">
|
| 110 |
+
<div class="error-container">
|
| 111 |
+
<i class="fas fa-unlink broken-icon"></i>
|
| 112 |
+
<h1 class="error-code">404</h1>
|
| 113 |
+
<p class="error-message">Oops! This tweet seems to have disappeared.</p>
|
| 114 |
+
|
| 115 |
+
<a href="{{ url_for('index') }}" class="btn-primary">
|
| 116 |
+
<i class="fas fa-home" style="margin-right: 8px;"></i> Return to Home
|
| 117 |
+
</a>
|
| 118 |
+
</div>
|
| 119 |
+
</main>
|
| 120 |
+
|
| 121 |
+
<footer class="container"
|
| 122 |
+
style="padding: 60px 0 40px; text-align: center; border-top: 1px solid rgba(136, 146, 176, 0.1);">
|
| 123 |
+
<p style="color: var(--clr-slate); font-size: 0.95rem; margin-bottom: 12px;">
|
| 124 |
+
Developed by <a href="https://github.com/Amey-Thakur"
|
| 125 |
+
style="color: var(--clr-accent-dark); font-weight: 700;">Amey Thakur</a> & <a
|
| 126 |
+
href="https://github.com/msatmod" style="color: var(--clr-accent-dark); font-weight: 700;">Mega
|
| 127 |
+
Satish</a>
|
| 128 |
+
</p>
|
| 129 |
+
<p style="color: var(--clr-light-slate); font-size: 0.85rem; letter-spacing: 0.5px;">
|
| 130 |
+
© 2022 • MIT License • <a href="https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS"
|
| 131 |
+
target="_blank" style="margin-left: 5px;"><i class="fab fa-github"></i> GitHub</a>
|
| 132 |
+
</p>
|
| 133 |
+
</footer>
|
| 134 |
+
|
| 135 |
+
</body>
|
| 136 |
+
|
| 137 |
+
</html>
|
source_code/templates/index.html
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
|
| 4 |
+
<head>
|
| 5 |
+
<meta charset="UTF-8">
|
| 6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 7 |
+
<title>Tweet Depression Detection | AMEY & MEGA</title>
|
| 8 |
+
<!-- Simple Brain Icon as Favicon -->
|
| 9 |
+
<link rel="icon" href="{{url_for('.static', filename='brain.svg')}}" type="image/svg+xml">
|
| 10 |
+
|
| 11 |
+
<!-- Modern Typography & Iconography -->
|
| 12 |
+
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
|
| 13 |
+
<link rel="stylesheet" href="{{url_for('.static', filename='styles.css')}}">
|
| 14 |
+
<link rel="stylesheet" href="{{url_for('.static', filename='overlay.css')}}">
|
| 15 |
+
<script src="{{url_for('.static', filename='security.js')}}"></script>
|
| 16 |
+
</head>
|
| 17 |
+
|
| 18 |
+
<!--
|
| 19 |
+
==============================================================================
|
| 20 |
+
PROJECT: DEPRESSION-DETECTION-USING-TWEETS
|
| 21 |
+
AUTHORS: AMEY THAKUR & MEGA SATISH
|
| 22 |
+
GITHUB (AMEY): https://github.com/Amey-Thakur
|
| 23 |
+
GITHUB (MEGA): https://github.com/msatmod
|
| 24 |
+
REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
|
| 25 |
+
RELEASE DATE: June 5, 2022
|
| 26 |
+
LICENSE: MIT License
|
| 27 |
+
DESCRIPTION: Main interface for tweet analysis.
|
| 28 |
+
==============================================================================
|
| 29 |
+
-->
|
| 30 |
+
|
| 31 |
+
<body>
|
| 32 |
+
|
| 33 |
+
<header class="hero-section container animate-fade">
|
| 34 |
+
<i class="fas fa-brain fa-3x brain-trigger" onclick="triggerCinematic()"></i>
|
| 35 |
+
<h1>Tweet Depression Detection</h1>
|
| 36 |
+
<p>Using Machine Learning to predict sentiment in tweets.</p>
|
| 37 |
+
</header>
|
| 38 |
+
|
| 39 |
+
<main class="container animate-fade" style="animation-delay: 0.2s;">
|
| 40 |
+
<section class="analysis-card">
|
| 41 |
+
<h3><i class="fab fa-twitter" style="margin-right: 10px; color: #1daeff;"></i> Try it Out</h3>
|
| 42 |
+
<p style="font-size: 0.9rem; color: #8892B0; margin-bottom: 20px;">Paste a tweet below to see how our model
|
| 43 |
+
classifies its sentiment.</p>
|
| 44 |
+
|
| 45 |
+
<form id="analysisForm" action="{{ url_for('predict')}}" method="POST">
|
| 46 |
+
<div class="input-group">
|
| 47 |
+
<textarea name="tweet" class="analysis-input"
|
| 48 |
+
placeholder="Paste tweet content here for sentiment analysis..." required></textarea>
|
| 49 |
+
</div>
|
| 50 |
+
<div style="text-align: right;">
|
| 51 |
+
<button type="submit" class="btn-primary">
|
| 52 |
+
<i class="fab fa-twitter" style="margin-right: 8px;"></i> Analyze Tweet
|
| 53 |
+
</button>
|
| 54 |
+
</div>
|
| 55 |
+
</form>
|
| 56 |
+
|
| 57 |
+
<audio id="tweetSound" src="{{url_for('.static', filename='tweet-sound.mp3')}}" preload="auto"></audio>
|
| 58 |
+
|
| 59 |
+
<script>
|
| 60 |
+
// Cinematic Interaction Logic
|
| 61 |
+
function triggerCinematic() {
|
| 62 |
+
const overlay = document.getElementById('cinematicOverlay');
|
| 63 |
+
const sound = document.getElementById('tweetSound');
|
| 64 |
+
|
| 65 |
+
overlay.style.display = 'flex';
|
| 66 |
+
// Force reflow
|
| 67 |
+
void overlay.offsetWidth;
|
| 68 |
+
overlay.classList.add('active');
|
| 69 |
+
|
| 70 |
+
setTimeout(() => {
|
| 71 |
+
sound.play().catch(err => console.log("Audio playback failed:", err));
|
| 72 |
+
}, 200);
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
function closeOverlay() {
|
| 76 |
+
const overlay = document.getElementById('cinematicOverlay');
|
| 77 |
+
overlay.classList.remove('active');
|
| 78 |
+
setTimeout(() => {
|
| 79 |
+
overlay.style.display = 'none';
|
| 80 |
+
}, 800);
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
document.getElementById('analysisForm').addEventListener('submit', function (e) {
|
| 84 |
+
const form = this;
|
| 85 |
+
const sound = document.getElementById('tweetSound');
|
| 86 |
+
|
| 87 |
+
e.preventDefault();
|
| 88 |
+
sound.play().catch(err => console.log("Audio playback failed:", err));
|
| 89 |
+
|
| 90 |
+
setTimeout(() => {
|
| 91 |
+
form.submit();
|
| 92 |
+
}, 400); // 400ms delay to let the chirp start
|
| 93 |
+
});
|
| 94 |
+
</script>
|
| 95 |
+
</section>
|
| 96 |
+
|
| 97 |
+
<section class="analysis-card" style="background: rgba(29, 174, 255, 0.05);">
|
| 98 |
+
<h4>How it Works</h4>
|
| 99 |
+
<p style="font-size: 0.95rem;">This project uses an <strong>SVM (Support Vector Machine)</strong> model
|
| 100 |
+
combined with <strong>spaCy</strong> word embeddings to process and classify text. It was developed to
|
| 101 |
+
experiment with modern Machine Learning workflows.</p>
|
| 102 |
+
</section>
|
| 103 |
+
</main>
|
| 104 |
+
|
| 105 |
+
<footer class="container"
|
| 106 |
+
style="padding: 60px 0 40px; text-align: center; border-top: 1px solid rgba(136, 146, 176, 0.1);">
|
| 107 |
+
<p style="color: var(--clr-slate); font-size: 0.95rem; margin-bottom: 12px;">
|
| 108 |
+
Developed by <a href="https://github.com/Amey-Thakur"
|
| 109 |
+
style="color: var(--clr-accent-dark); font-weight: 700;">Amey Thakur</a> & <a
|
| 110 |
+
href="https://github.com/msatmod" style="color: var(--clr-accent-dark); font-weight: 700;">Mega
|
| 111 |
+
Satish</a>
|
| 112 |
+
</p>
|
| 113 |
+
<p style="color: var(--clr-light-slate); font-size: 0.85rem; letter-spacing: 0.5px;">
|
| 114 |
+
© 2022 • MIT License • <a href="https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS"
|
| 115 |
+
target="_blank" style="margin-left: 5px;"><i class="fab fa-github"></i> GitHub</a>
|
| 116 |
+
</p>
|
| 117 |
+
</footer>
|
| 118 |
+
|
| 119 |
+
<!-- Cinematic Overlay (Moved to root to avoid transform conflicts) -->
|
| 120 |
+
<div id="cinematicOverlay" class="cinematic-overlay" onclick="closeOverlay()">
|
| 121 |
+
<div class="overlay-content">
|
| 122 |
+
<!-- Cinematic Icon -->
|
| 123 |
+
<i class="fab fa-twitter overlay-logo"></i>
|
| 124 |
+
<h1 class="overlay-title">Tweet Depression Detection</h1>
|
| 125 |
+
<p class="overlay-author">Developed by</p>
|
| 126 |
+
<div class="overlay-names">
|
| 127 |
+
<span>Amey Thakur</span>
|
| 128 |
+
<span class="separator">&</span>
|
| 129 |
+
<span>Mega Satish</span>
|
| 130 |
+
</div>
|
| 131 |
+
</div>
|
| 132 |
+
</div>
|
| 133 |
+
|
| 134 |
+
</body>
|
| 135 |
+
|
| 136 |
+
</html>
|
source_code/templates/result.html
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!--
|
| 2 |
+
==============================================================================
|
| 3 |
+
PROJECT: DEPRESSION-DETECTION-USING-TWEETS
|
| 4 |
+
AUTHORS: AMEY THAKUR & MEGA SATISH
|
| 5 |
+
GITHUB (AMEY): https://github.com/Amey-Thakur
|
| 6 |
+
GITHUB (MEGA): https://github.com/msatmod
|
| 7 |
+
REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
|
| 8 |
+
RELEASE DATE: June 5, 2022
|
| 9 |
+
LICENSE: MIT License
|
| 10 |
+
DESCRIPTION: Result page for the tweet analysis.
|
| 11 |
+
==============================================================================
|
| 12 |
+
-->
|
| 13 |
+
|
| 14 |
+
<!DOCTYPE html>
|
| 15 |
+
<html lang="en">
|
| 16 |
+
|
| 17 |
+
<head>
|
| 18 |
+
<meta charset="UTF-8">
|
| 19 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 20 |
+
<title>Analysis Result | AMEY & MEGA </title>
|
| 21 |
+
    <link rel="icon" href="{{url_for('.static', filename='brain.svg')}}" type="image/svg+xml">
|
| 22 |
+
|
| 23 |
+
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
|
| 24 |
+
<link rel="stylesheet" href="{{url_for('.static', filename='styles.css')}}">
|
| 25 |
+
<script src="{{url_for('.static', filename='security.js')}}"></script>
|
| 26 |
+
</head>
|
| 27 |
+
|
| 28 |
+
<body>
|
| 29 |
+
|
| 30 |
+
<header class="hero-section container animate-fade">
|
| 31 |
+
<i class="fas fa-chart-line fa-3x" style="color: #1daeff; margin-bottom: 20px;"></i>
|
| 32 |
+
<h1>Analysis Result</h1>
|
| 33 |
+
<p>Here is what our model predicted for the tweet you provided.</p>
|
| 34 |
+
</header>
|
| 35 |
+
|
| 36 |
+
<main class="container animate-fade" style="animation-delay: 0.2s;">
|
| 37 |
+
<section class="analysis-card">
|
| 38 |
+
<h3><i class="fas fa-quote-left" style="color: var(--clr-slate); margin-right: 10px;"></i> The Tweet
|
| 39 |
+
</h3>
|
| 40 |
+
<div
|
| 41 |
+
style="background: #F8F9FA; padding: 25px; border-radius: 8px; margin: 20px 0; border-left: 4px solid var(--clr-slate);">
|
| 42 |
+
<p
|
| 43 |
+
style="font-family: var(--font-serif); font-style: italic; font-size: 1.1rem; color: var(--clr-navy);">
|
| 44 |
+
{{ name }}
|
| 45 |
+
</p>
|
| 46 |
+
</div>
|
| 47 |
+
</section>
|
| 48 |
+
|
| 49 |
+
<section class="analysis-card result-display">
|
| 50 |
+
<h3>Our Prediction</h3>
|
| 51 |
+
|
| 52 |
+
{% if prediction == 0 %}
|
| 53 |
+
<div class="result-badge badge-non-depressive">
|
| 54 |
+
<i class="fas fa-check-circle"></i> Non-Depressive
|
| 55 |
+
</div>
|
| 56 |
+
<p style="margin-top: 25px; color: var(--clr-slate);">Our model didn't find any significant signs of
|
| 57 |
+
depression in this text.</p>
|
| 58 |
+
{% elif prediction == 1 %}
|
| 59 |
+
<div class="result-badge badge-depressive">
|
| 60 |
+
<i class="fas fa-exclamation-triangle"></i> Depressive
|
| 61 |
+
</div>
|
| 62 |
+
<p style="margin-top: 25px; color: var(--clr-slate);">Our model identified patterns that are often
|
| 63 |
+
associated with depression in this text.</p>
|
| 64 |
+
{% endif %}
|
| 65 |
+
|
| 66 |
+
<div style="margin-top: 40px; border-top: 1px solid #E2E8F0; padding-top: 30px;">
|
| 67 |
+
<a href="{{ url_for('index') }}" class="btn-primary">
|
| 68 |
+
<i class="fab fa-twitter" style="margin-right: 8px;"></i> Analyze Another Tweet
|
| 69 |
+
</a>
|
| 70 |
+
</div>
|
| 71 |
+
</section>
|
| 72 |
+
</main>
|
| 73 |
+
|
| 74 |
+
<footer class="container"
|
| 75 |
+
style="padding: 60px 0 40px; text-align: center; border-top: 1px solid rgba(136, 146, 176, 0.1);">
|
| 76 |
+
<p style="color: var(--clr-slate); font-size: 0.95rem; margin-bottom: 12px;">
|
| 77 |
+
Developed by <a href="https://github.com/Amey-Thakur"
|
| 78 |
+
style="color: var(--clr-accent-dark); font-weight: 700;">Amey Thakur</a> & <a
|
| 79 |
+
href="https://github.com/msatmod" style="color: var(--clr-accent-dark); font-weight: 700;">Mega
|
| 80 |
+
Satish</a>
|
| 81 |
+
</p>
|
| 82 |
+
<p style="color: var(--clr-light-slate); font-size: 0.85rem; letter-spacing: 0.5px;">
|
| 83 |
+
© 2022 • MIT License • <a href="https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS"
|
| 84 |
+
target="_blank" style="margin-left: 5px;"><i class="fab fa-github"></i> GitHub</a>
|
| 85 |
+
</p>
|
| 86 |
+
</footer>
|
| 87 |
+
|
| 88 |
+
</body>
|
| 89 |
+
|
| 90 |
+
</html>
|
source_code/test_app.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
|
| 3 |
+
def test_app():
    """Smoke-test the running Flask app's ``/predict`` endpoint.

    Posts a clearly depressive sample tweet to the local dev server and
    prints whether the rendered result page flags it as depressive.
    Requires the app to already be running on http://127.0.0.1:5000.
    """
    url = "http://127.0.0.1:5000/predict"
    data = {"tweet": "I feel very sad and hopeless today"}
    try:
        # A timeout keeps the script from hanging forever if the server
        # accepts the connection but never responds.
        response = requests.post(url, data=data, timeout=10)
    except requests.exceptions.RequestException as e:
        # Base class for all requests failures (connection, timeout, ...).
        print(f"Error connecting to server: {e}")
        return

    if response.status_code == 200:
        print("Successfully connected to the server.")
        # result.html marks a depressive prediction with the
        # "badge-depressive" CSS class; the literal text
        # "Outcome: Depressive" never appears in the template, so the
        # original check could never pass. "badge-depressive" is not a
        # substring of "badge-non-depressive", so this is unambiguous.
        if "badge-depressive" in response.text:
            print("Prediction test PASSED: Correctly identified depressive sentiment.")
        else:
            print("Prediction test FAILED: Outcome not found in response.")
    else:
        print(f"Server returned status code: {response.status_code}")


if __name__ == "__main__":
    test_app()
|