Upload 15 files
Browse files- scraper/.DS_Store +0 -0
- scraper/.gitignore +169 -0
- scraper/README.md +77 -0
- scraper/__pycache__/topic_crawling.cpython-310.pyc +0 -0
- scraper/auto_crawl.sh +7 -0
- scraper/auto_crawl_topic.sh +2 -0
- scraper/main.ipynb +416 -0
- scraper/main.py +258 -0
- scraper/preprocessing/__pycache__/preprocessing_sub_functions.cpython-310.pyc +0 -0
- scraper/preprocessing/preprocessing.py +130 -0
- scraper/preprocessing/preprocessing.sh +7 -0
- scraper/preprocessing/preprocessing_sub_functions.py +248 -0
- scraper/sort.py +53 -0
- scraper/topic_crawling.py +243 -0
- scraper/website_format.json +130 -0
scraper/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
scraper/.gitignore
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py,cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# poetry
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 102 |
+
#poetry.lock
|
| 103 |
+
|
| 104 |
+
# pdm
|
| 105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 106 |
+
#pdm.lock
|
| 107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
| 108 |
+
# in version control.
|
| 109 |
+
# https://pdm.fming.dev/#use-with-ide
|
| 110 |
+
.pdm.toml
|
| 111 |
+
|
| 112 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 113 |
+
__pypackages__/
|
| 114 |
+
|
| 115 |
+
# Celery stuff
|
| 116 |
+
celerybeat-schedule
|
| 117 |
+
celerybeat.pid
|
| 118 |
+
|
| 119 |
+
# SageMath parsed files
|
| 120 |
+
*.sage.py
|
| 121 |
+
|
| 122 |
+
# Environments
|
| 123 |
+
.env
|
| 124 |
+
.venv
|
| 125 |
+
env/
|
| 126 |
+
venv/
|
| 127 |
+
ENV/
|
| 128 |
+
env.bak/
|
| 129 |
+
venv.bak/
|
| 130 |
+
|
| 131 |
+
# Spyder project settings
|
| 132 |
+
.spyderproject
|
| 133 |
+
.spyproject
|
| 134 |
+
|
| 135 |
+
# Rope project settings
|
| 136 |
+
.ropeproject
|
| 137 |
+
|
| 138 |
+
# mkdocs documentation
|
| 139 |
+
/site
|
| 140 |
+
|
| 141 |
+
# mypy
|
| 142 |
+
.mypy_cache/
|
| 143 |
+
.dmypy.json
|
| 144 |
+
dmypy.json
|
| 145 |
+
|
| 146 |
+
# Pyre type checker
|
| 147 |
+
.pyre/
|
| 148 |
+
|
| 149 |
+
# pytype static type analyzer
|
| 150 |
+
.pytype/
|
| 151 |
+
|
| 152 |
+
# Cython debug symbols
|
| 153 |
+
cython_debug/
|
| 154 |
+
|
| 155 |
+
# PyCharm
|
| 156 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 157 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 158 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 159 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 160 |
+
#.idea/
|
| 161 |
+
|
| 162 |
+
.vscode
|
| 163 |
+
|
| 164 |
+
data/
|
| 165 |
+
preprocessed-data/
|
| 166 |
+
raw-data/
|
| 167 |
+
sorted-preprocessed-data/
|
| 168 |
+
sorted-raw-data/
|
| 169 |
+
*.zip
|
scraper/README.md
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# bitcointalk_crawler
|
| 2 |
+
|
| 3 |
+
---
|
| 4 |
+
|
| 5 |
+
## DataFrame Columns Description
|
| 6 |
+
|
| 7 |
+
### 1. `start_edit`
|
| 8 |
+
- **Description**: This column represents the date when the post or content was initially created.
|
| 9 |
+
- **Type**: Date (format: YYYY-MM-DD)
|
| 10 |
+
- **Example**: `2013-11-02`
|
| 11 |
+
|
| 12 |
+
### 2. `last_edit`
|
| 13 |
+
- **Description**: This column represents the last date when the post or content was edited.
|
| 14 |
+
- **Type**: Date (format: YYYY-MM-DD)
|
| 15 |
+
- **Example**: `2013-11-02`
|
| 16 |
+
|
| 17 |
+
### 3. `author`
|
| 18 |
+
- **Description**: The user who created the post.
|
| 19 |
+
- **Type**: String
|
| 20 |
+
- **Example**: `guyver`
|
| 21 |
+
|
| 22 |
+
### 4. `post`
|
| 23 |
+
- **Description**: The actual content or message of the post.
|
| 24 |
+
- **Type**: String
|
| 25 |
+
- **Example**: `before we all get excited about the second batch...`
|
| 26 |
+
|
| 27 |
+
### 5. `topic`
|
| 28 |
+
- **Description**: The topic or title of the thread in which the post was made.
|
| 29 |
+
- **Type**: String
|
| 30 |
+
- **Example**: `[EU/UK GROUP BUY] Blue Fury USB miner 2.2 ...`
|
| 31 |
+
|
| 32 |
+
### 6. `attachment`
|
| 33 |
+
- **Description**: Indicates whether the post has an attachment or not. A value of `1` means there's an attachment(image or video), and `0` means there isn't. In the website, it using img tag to show the emoji but seems not to be an attachment, such that it also ignring the emojis.
|
| 34 |
+
- **Type**: Integer (0 or 1)
|
| 35 |
+
- **Example**: `0`
|
| 36 |
+
- **Note**: The script 'attachment_fix.py' is run subsequent to the crawling process, as the initial values populated in this column post-crawling are not accurate.
|
| 37 |
+
|
| 38 |
+
### 7. `link`
|
| 39 |
+
- **Description**: Indicates whether the post contains a link or not. A value of `1` means there's a link, and `0` means there isn't.
|
| 40 |
+
- **Type**: Integer (0 or 1)
|
| 41 |
+
- **Example**: `0`
|
| 42 |
+
|
| 43 |
+
### 8. `original_info`
|
| 44 |
+
- **Description**: This column contains raw HTML or metadata related to the post. It may contain styling and layout information.
|
| 45 |
+
- **Type**: String (HTML format)
|
| 46 |
+
- **Example**: `<td class="td_headerandpost" height="100%" sty...`
|
| 47 |
+
|
| 48 |
+
### 9. `preprocessed_post`
|
| 49 |
+
- **Description**: Preprocessed of `post` column that for analysis or other tasks.
|
| 50 |
+
- **Type**: String
|
| 51 |
+
- **Example**: `get excited second batch.let us wait first bat...`
|
| 52 |
+
|
| 53 |
+
---
|
| 54 |
+
|
| 55 |
+
## Usage
|
| 56 |
+
|
| 57 |
+
### 1. `main.py` and `auto_crawl.sh`
|
| 58 |
+
- **Description**: The `main.py` script is the full script that is used to crawl the Bitcointalk forum with given the first board page. The `auto_crawl.sh` script is used to automate the process of running the `main.py` script.
|
| 59 |
+
- **example**:
|
| 60 |
+
```python
|
| 61 |
+
python main.py
|
| 62 |
+
https://bitcointalk.org/index.php?board=40.0 # board url
|
| 63 |
+
--board mining_support # board name
|
| 64 |
+
-pages 183 # number of pages in the board
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
### 2. `topic_craawling.py` and `auto_crawl_topic.sh`
|
| 68 |
+
|
| 69 |
+
- **Description**: The `topic_crawling.py` script is used to crawl exact topic from Bitcointalk forum with given the first page url of the topic. The `auto_crawl_topic.sh` script is used to automate the process of running the `topic_craawling.py` script.
|
| 70 |
+
|
| 71 |
+
- **example**:
|
| 72 |
+
```python
|
| 73 |
+
python topic_crawling.py
|
| 74 |
+
https://bitcointalk.org/index.php?topic=168174.0 # topic url
|
| 75 |
+
--board miners # board name that topic belongs to
|
| 76 |
+
--num_of_pages 165 # total pages of this topic
|
| 77 |
+
```
|
scraper/__pycache__/topic_crawling.cpython-310.pyc
ADDED
|
Binary file (6.34 kB). View file
|
|
|
scraper/auto_crawl.sh
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# python main.py https://bitcointalk.org/index.php?board=42.0 --board miners -pages 41 -posts 523
|
| 2 |
+
# python main.py https://bitcointalk.org/index.php?board=40.0 --board mining_support -pages 183
|
| 3 |
+
# python main.py https://bitcointalk.org/index.php?board=76.0 --board hardware -pages 145
|
| 4 |
+
# python main.py https://bitcointalk.org/index.php?board=137.0 --board groupbuys -pages 24 -posts 322
|
| 5 |
+
# python main.py https://bitcointalk.org/index.php?board=81.0 --board mining_speculation -pages 95 --update
|
| 6 |
+
# python main.py https://bitcointalk.org/index.php?board=41.0 --board pools -pages 52 -posts 32
|
| 7 |
+
# python main.py https://bitcointalk.org/index.php?board=14.0 --board mining -pages 143 -posts 1524
|
scraper/auto_crawl_topic.sh
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# python topic_crawling.py https://bitcointalk.org/index.php?topic=168174.0 --board miners --num_of_pages 165
|
| 2 |
+
# python topic_crawling.py https://bitcointalk.org/index.php?topic=6458.0 --board miners --num_of_pages 57
|
scraper/main.ipynb
ADDED
|
@@ -0,0 +1,416 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [],
|
| 8 |
+
"source": [
|
| 9 |
+
"DEMO_MODE = True"
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"cell_type": "code",
|
| 14 |
+
"execution_count": 2,
|
| 15 |
+
"metadata": {},
|
| 16 |
+
"outputs": [],
|
| 17 |
+
"source": [
|
| 18 |
+
"# Importing necessary libraries:\n",
|
| 19 |
+
"# - os, json, time for file, data and time operations respectively.\n",
|
| 20 |
+
"# - requests for making HTTP requests.\n",
|
| 21 |
+
"# - BeautifulSoup for parsing HTML content.\n",
|
| 22 |
+
"# - Other imports for logging, data manipulation, progress indication, and more.\n",
|
| 23 |
+
"import os\n",
|
| 24 |
+
"import json\n",
|
| 25 |
+
"import time\n",
|
| 26 |
+
"import munch\n",
|
| 27 |
+
"import requests\n",
|
| 28 |
+
"import argparse\n",
|
| 29 |
+
"import pandas as pd\n",
|
| 30 |
+
"from tqdm import tqdm\n",
|
| 31 |
+
"from datetime import date\n",
|
| 32 |
+
"from loguru import logger\n",
|
| 33 |
+
"from random import randint\n",
|
| 34 |
+
"from bs4 import BeautifulSoup, NavigableString\n",
|
| 35 |
+
"\n",
|
| 36 |
+
"from preprocessing.preprocessing_sub_functions import remove_emojis\n",
|
| 37 |
+
"from topic_crawling import loop_through_posts\n",
|
| 38 |
+
"\n",
|
| 39 |
+
"\n",
|
| 40 |
+
"# This function reads a JSON file named \"website_format.json\".\n",
|
| 41 |
+
"# The file contain a list of user agents.\n",
|
| 42 |
+
"# User agents are strings that browsers send to websites to identify themselves.\n",
|
| 43 |
+
"# This list is likely used to rotate between different user agents when making requests,\n",
|
| 44 |
+
"# making the scraper seem like different browsers and reducing the chances of being blocked.\n",
|
| 45 |
+
"def get_web_component():\n",
|
| 46 |
+
" with open(\"website_format.json\") as json_file:\n",
|
| 47 |
+
" website_format = json.load(json_file)\n",
|
| 48 |
+
" website_format = munch.munchify(website_format)\n",
|
| 49 |
+
" return website_format.USER_AGENTS\n",
|
| 50 |
+
"\n",
|
| 51 |
+
"\n",
|
| 52 |
+
"# This function fetches a webpage's content.\n",
|
| 53 |
+
"# It randomly selects a user agent from the provided list to make the request.\n",
|
| 54 |
+
"# After fetching, it uses BeautifulSoup to parse the page's HTML content.\n",
|
| 55 |
+
"# def get_web_content(url, USER_AGENTS):\n",
|
| 56 |
+
"# random_agent = USER_AGENTS[randint(0, len(USER_AGENTS) - 1)]\n",
|
| 57 |
+
"# headers = {\"User-Agent\": random_agent}\n",
|
| 58 |
+
"# req = requests.get(url, headers=headers)\n",
|
| 59 |
+
"# req.encoding = req.apparent_encoding\n",
|
| 60 |
+
"# soup = BeautifulSoup(req.text, features=\"lxml\")\n",
|
| 61 |
+
"# return soup\n",
|
| 62 |
+
"from topic_crawling import get_web_content\n",
|
| 63 |
+
"\n",
|
| 64 |
+
"\n",
|
| 65 |
+
"# This function extracts pagination links from a page.\n",
|
| 66 |
+
"# These links point to other pages of content, often seen at the bottom of forums or search results.\n",
|
| 67 |
+
"# The function returns both the individual page links and the \"next\" link,\n",
|
| 68 |
+
"# which points to the next set of results.\n",
|
| 69 |
+
"def get_pages_urls(url, USER_AGENTS):\n",
|
| 70 |
+
" time.sleep(1)\n",
|
| 71 |
+
" soup = get_web_content(url, USER_AGENTS)\n",
|
| 72 |
+
" # Finding the pagination links based on their HTML structure and CSS classes.\n",
|
| 73 |
+
" first_td = soup.find(\"td\", class_=\"middletext\", id=\"toppages\")\n",
|
| 74 |
+
" nav_pages_links = first_td.find_all(\"a\", class_=\"navPages\")\n",
|
| 75 |
+
" href_links = [link[\"href\"] for link in nav_pages_links]\n",
|
| 76 |
+
" next_50_link = href_links[-3] # Assuming the third-last link is the \"next\" link.\n",
|
| 77 |
+
" href_links.insert(0, url)\n",
|
| 78 |
+
" return href_links, next_50_link\n",
|
| 79 |
+
"\n",
|
| 80 |
+
"\n",
|
| 81 |
+
"# This function extracts individual post URLs from a page.\n",
|
| 82 |
+
"# It's likely targeting a forum or blog structure, where multiple posts or threads are listed on one page.\n",
|
| 83 |
+
"def get_post_urls(url, USER_AGENTS):\n",
|
| 84 |
+
" time.sleep(1)\n",
|
| 85 |
+
" soup = get_web_content(url, USER_AGENTS)\n",
|
| 86 |
+
" # Finding post links based on their HTML structure and CSS classes.\n",
|
| 87 |
+
" links_elements = soup.select(\"td.windowbg span a\")\n",
|
| 88 |
+
" links = [link[\"href\"] for link in links_elements]\n",
|
| 89 |
+
"\n",
|
| 90 |
+
" # # If including rules and announcements posts\n",
|
| 91 |
+
" # links_elements = soup.select('td.windowbg3 span a')\n",
|
| 92 |
+
" # links_ = [link['href'] for link in links_elements]\n",
|
| 93 |
+
" # links.extend(links_)\n",
|
| 94 |
+
"\n",
|
| 95 |
+
" return links\n",
|
| 96 |
+
"\n",
|
| 97 |
+
"\n",
|
| 98 |
+
"# This function loops through the main page and its paginated versions to collect URLs.\n",
|
| 99 |
+
"# It repeatedly calls 'get_pages_urls' to fetch batches of URLs until the desired number (num_of_pages) is reached.\n",
|
| 100 |
+
"def loop_through_source_url(USER_AGENTS, url, num_of_pages):\n",
|
| 101 |
+
" pages_urls = []\n",
|
| 102 |
+
" counter = 0\n",
|
| 103 |
+
" while len(pages_urls) < num_of_pages:\n",
|
| 104 |
+
" print(\"loop_through_source_url: \", len(pages_urls))\n",
|
| 105 |
+
" href_links, next_50_link = get_pages_urls(url, USER_AGENTS)\n",
|
| 106 |
+
" pages_urls.extend(href_links)\n",
|
| 107 |
+
" pages_urls = list(dict.fromkeys(pages_urls)) # Remove any duplicate URLs.\n",
|
| 108 |
+
" url = next_50_link\n",
|
| 109 |
+
" return pages_urls\n",
|
| 110 |
+
"\n",
|
| 111 |
+
"\n",
|
| 112 |
+
"# This function loops through the provided list of page URLs and extracts post URLs from each of these pages.\n",
|
| 113 |
+
"# It ensures that there are no duplicate post URLs by converting the list into a dictionary and back to a list.\n",
|
| 114 |
+
"# It returns a list of unique post URLs.\n",
|
| 115 |
+
"def loop_through_pages(USER_AGENTS, pages_urls):\n",
|
| 116 |
+
" post_urls = []\n",
|
| 117 |
+
" for url in tqdm(pages_urls):\n",
|
| 118 |
+
" herf_links = get_post_urls(url, USER_AGENTS)\n",
|
| 119 |
+
" post_urls.extend(herf_links)\n",
|
| 120 |
+
" post_urls = list(dict.fromkeys(post_urls))\n",
|
| 121 |
+
" if DEMO_MODE:\n",
|
| 122 |
+
" break\n",
|
| 123 |
+
" return post_urls\n",
|
| 124 |
+
"\n",
|
| 125 |
+
"# This function processes a post page. It extracts various details like timestamps, author information, post content, topic, attachments, links, and original HTML information.\n",
|
| 126 |
+
"# The function returns a dictionary containing all this extracted data.\n",
|
| 127 |
+
"def read_subject_page(USER_AGENTS, post_url, df, remove_emoji):\n",
|
| 128 |
+
" time.sleep(1)\n",
|
| 129 |
+
" soup = get_web_content(post_url, USER_AGENTS)\n",
|
| 130 |
+
" form_tag = soup.find(\"form\", id=\"quickModForm\")\n",
|
| 131 |
+
" table_tag = form_tag.find(\"table\", class_=\"bordercolor\")\n",
|
| 132 |
+
" td_tag = table_tag.find_all(\"td\", class_=\"windowbg\")\n",
|
| 133 |
+
" td_tag.extend(table_tag.find_all(\"td\", class_=\"windowbg2\"))\n",
|
| 134 |
+
"\n",
|
| 135 |
+
" for comment in tqdm(td_tag):\n",
|
| 136 |
+
" res = extract_useful_content_windowbg(comment, remove_emoji)\n",
|
| 137 |
+
" if res is not None:\n",
|
| 138 |
+
" df = pd.concat([df, pd.DataFrame([res])])\n",
|
| 139 |
+
"\n",
|
| 140 |
+
" return df\n",
|
| 141 |
+
"\n",
|
| 142 |
+
"# This function extracts meaningful content from a given HTML element (`tr_tag`). This tag is likely a row in a table, given its name.\n",
|
| 143 |
+
"# The function checks the presence of specific tags and classes within this row to extract information such as timestamps, author, post content, topic, attachments, and links.\n",
|
| 144 |
+
"# The extracted data is returned as a dictionary.\n",
|
| 145 |
+
"def extract_useful_content_windowbg(tr_tag, remove_emoji=True):\n",
|
| 146 |
+
" \"\"\"\n",
|
| 147 |
+
" Timestamp of the post (ex: September 11, 2023, 07:49:45 AM; but if you want just 11/09/2023 is enough)\n",
|
| 148 |
+
" Author of the post (ex: SupermanBitcoin)\n",
|
| 149 |
+
" The post itself\n",
|
| 150 |
+
"\n",
|
| 151 |
+
" The topic where the post was posted (ex: [INFO - DISCUSSION] Security Budget Problem) eg. Whats your thoughts: Next-Gen Bitcoin Mining Machine With 1X Efficiency Rating.\n",
|
| 152 |
+
" Number of characters in the post --> so this is an integer\n",
|
| 153 |
+
" Does the post contain at least one attachment (image, video etc.) --> if yes put '1' in the column, if no, just put '0'\n",
|
| 154 |
+
" Does the post contain at least one link --> if yes put '1' in the column, if no, just put '0'\n",
|
| 155 |
+
" \"\"\"\n",
|
| 156 |
+
" headerandpost = tr_tag.find(\"td\", class_=\"td_headerandpost\")\n",
|
| 157 |
+
" if not headerandpost:\n",
|
| 158 |
+
" return None\n",
|
| 159 |
+
"\n",
|
| 160 |
+
" timestamp = headerandpost.find(\"div\", class_=\"smalltext\").get_text()\n",
|
| 161 |
+
" timestamps = timestamp.split(\"Last edit: \")\n",
|
| 162 |
+
" timestamp = timestamps[0].strip()\n",
|
| 163 |
+
" last_edit = None\n",
|
| 164 |
+
" if len(timestamps) > 1:\n",
|
| 165 |
+
" if 'Today ' in timestamps[1]:\n",
|
| 166 |
+
" last_edit = date.today().strftime(\"%B %d, %Y\")+', '+timestamps[1].split('by')[0].split(\"Today at\")[1].strip()\n",
|
| 167 |
+
" last_edit = timestamps[1].split('by')[0].strip()\n",
|
| 168 |
+
"\n",
|
| 169 |
+
" poster_info_tag = tr_tag.find('td', class_='poster_info')\n",
|
| 170 |
+
" anchor_tag = poster_info_tag.find('a')\n",
|
| 171 |
+
" author = \"Anonymous\" if anchor_tag is None else anchor_tag.get_text()\n",
|
| 172 |
+
"\n",
|
| 173 |
+
" link = 0\n",
|
| 174 |
+
"\n",
|
| 175 |
+
" post_ = tr_tag.find('div', class_='post')\n",
|
| 176 |
+
" texts = []\n",
|
| 177 |
+
" for child in post_.children:\n",
|
| 178 |
+
" if isinstance(child, NavigableString):\n",
|
| 179 |
+
" texts.append(child.strip())\n",
|
| 180 |
+
" elif child.has_attr('class') and 'ul' in child['class']:\n",
|
| 181 |
+
" link = 1\n",
|
| 182 |
+
" texts.append(child.get_text(strip=True))\n",
|
| 183 |
+
" post = ' '.join(texts)\n",
|
| 184 |
+
"\n",
|
| 185 |
+
" topic = headerandpost.find('div', class_='subject').get_text()\n",
|
| 186 |
+
"\n",
|
| 187 |
+
" image = headerandpost.find('div', class_='post').find_all('img')\n",
|
| 188 |
+
" if remove_emoji:\n",
|
| 189 |
+
" image = remove_emojis(image)\n",
|
| 190 |
+
" image_ = min(len(image), 1)\n",
|
| 191 |
+
" \n",
|
| 192 |
+
" video = headerandpost.find('div', class_='post').find('video')\n",
|
| 193 |
+
" video_ = 0 if video is None else 1\n",
|
| 194 |
+
" attachment = max(image_, video_)\n",
|
| 195 |
+
"\n",
|
| 196 |
+
" original_info = headerandpost\n",
|
| 197 |
+
"\n",
|
| 198 |
+
" return {\n",
|
| 199 |
+
" \"timestamp\": timestamp,\n",
|
| 200 |
+
" \"last_edit\": last_edit,\n",
|
| 201 |
+
" \"author\": author.strip(),\n",
|
| 202 |
+
" \"post\": post.strip(),\n",
|
| 203 |
+
" \"topic\": topic.strip(),\n",
|
| 204 |
+
" \"attachment\": attachment,\n",
|
| 205 |
+
" \"link\": link,\n",
|
| 206 |
+
" \"original_info\": original_info,\n",
|
| 207 |
+
" }\n",
|
| 208 |
+
"\n",
|
| 209 |
+
"\n",
|
| 210 |
+
"# A utility function to save a list (e.g., URLs) to a text file.\n",
|
| 211 |
+
"# Each item in the list gets its own line in the file.\n",
|
| 212 |
+
"def save_page_file(data, file_name):\n",
|
| 213 |
+
" with open(file_name, \"w\") as filehandle:\n",
|
| 214 |
+
" for listitem in data:\n",
|
| 215 |
+
" filehandle.write(\"%s\\n\" % listitem)\n",
|
| 216 |
+
"\n",
|
| 217 |
+
"def get_post_max_page(url, USER_AGENTS):\n",
|
| 218 |
+
" soup = get_web_content(url, USER_AGENTS)\n",
|
| 219 |
+
" # Finding the pagination links based on their HTML structure and CSS classes.\n",
|
| 220 |
+
" first_td = soup.find('td', class_='middletext')\n",
|
| 221 |
+
" nav_pages_links = first_td.find_all('a', class_='navPages')\n",
|
| 222 |
+
"\n",
|
| 223 |
+
" href_links = [int(link.text) if link.text.isdigit() else 0 for link in nav_pages_links]\n",
|
| 224 |
+
" if len(href_links) == 0:\n",
|
| 225 |
+
" # print('No pagination links found: ', url)\n",
|
| 226 |
+
" return 1\n",
|
| 227 |
+
" m = max(href_links)\n",
|
| 228 |
+
" # we can't use more than 10 pages\n",
|
| 229 |
+
" m = m if m < 10 else 10\n",
|
| 230 |
+
" return m\n",
|
| 231 |
+
"\n",
|
| 232 |
+
"\n",
|
| 233 |
+
"# def parse_args():\n",
|
| 234 |
+
"# parser = argparse.ArgumentParser()\n",
|
| 235 |
+
"# parser.add_argument(\"url\", help=\"url for the extraction\")\n",
|
| 236 |
+
"# parser.add_argument(\"--update\", help=\"extract updated data\", action=\"store_true\")\n",
|
| 237 |
+
"# parser.add_argument(\"--board\", help=\"board name\")\n",
|
| 238 |
+
"# parser.add_argument(\"--num_of_pages\", '-pages', help=\"number of pages to extract\", type=int)\n",
|
| 239 |
+
"# parser.add_argument(\"--num_of_posts_start\", '-posts', help=\"the number of posts start to extract\", type=int, default=0)\n",
|
| 240 |
+
"\n",
|
| 241 |
+
"# parser.add_argument(\"remove_emoji\", help=\"remove emoji from the post\", action=\"store_true\")\n",
|
| 242 |
+
"# return vars(parser.parse_args())\n",
|
| 243 |
+
"\n",
|
| 244 |
+
"# \n",
|
| 245 |
+
"\n"
|
| 246 |
+
]
|
| 247 |
+
},
|
| 248 |
+
{
|
| 249 |
+
"cell_type": "code",
|
| 250 |
+
"execution_count": 3,
|
| 251 |
+
"metadata": {},
|
| 252 |
+
"outputs": [],
|
| 253 |
+
"source": [
|
| 254 |
+
"mining_section = True\n",
|
| 255 |
+
"if mining_section:\n",
|
| 256 |
+
" url = \"https://bitcointalk.org/index.php?board=14.0\"\n",
|
| 257 |
+
"else:\n",
|
| 258 |
+
" url = \"https://bitcointalk.org/index.php?board=1.0\"\n",
|
| 259 |
+
"update = False\n",
|
| 260 |
+
"\n",
|
| 261 |
+
"if DEMO_MODE:\n",
|
| 262 |
+
" board = \"Demo\"\n",
|
| 263 |
+
" num_of_pages = 1\n",
|
| 264 |
+
" num_of_posts_start = 0\n",
|
| 265 |
+
"else:\n",
|
| 266 |
+
" board = \"Bitcoin\"\n",
|
| 267 |
+
" num_of_pages = 1528\n",
|
| 268 |
+
" num_of_posts_start = 248\n",
|
| 269 |
+
"\n",
|
| 270 |
+
"\n",
|
| 271 |
+
"\n",
|
| 272 |
+
"remove_emoji = True\n",
|
| 273 |
+
"\n",
|
| 274 |
+
"USER_AGENTS = get_web_component()\n",
|
| 275 |
+
"# Ensuring the data directory exists.\n",
|
| 276 |
+
"os.makedirs(f\"data/{board}/\", exist_ok=True)\n",
|
| 277 |
+
"pages_file_path = f\"data/{board}/pages_urls.txt\"\n",
|
| 278 |
+
"post_file_path = f\"data/{board}/post_urls.txt\"\n",
|
| 279 |
+
"# If the user chose to update the data, existing files are deleted to make way for new data.\n",
|
| 280 |
+
"if update:\n",
|
| 281 |
+
" if os.path.exists(pages_file_path):\n",
|
| 282 |
+
" os.remove(pages_file_path)\n",
|
| 283 |
+
" if os.path.exists(post_file_path):\n",
|
| 284 |
+
" os.remove(post_file_path)\n",
|
| 285 |
+
" \n",
|
| 286 |
+
"\n"
|
| 287 |
+
]
|
| 288 |
+
},
|
| 289 |
+
{
|
| 290 |
+
"cell_type": "code",
|
| 291 |
+
"execution_count": 4,
|
| 292 |
+
"metadata": {},
|
| 293 |
+
"outputs": [
|
| 294 |
+
{
|
| 295 |
+
"name": "stdout",
|
| 296 |
+
"output_type": "stream",
|
| 297 |
+
"text": [
|
| 298 |
+
"loop_through_source_url: 0\n"
|
| 299 |
+
]
|
| 300 |
+
}
|
| 301 |
+
],
|
| 302 |
+
"source": [
|
| 303 |
+
"# If the pages file doesn't exist, the script collects page URLs.\n",
|
| 304 |
+
"if not os.path.exists(pages_file_path):\n",
|
| 305 |
+
" pages_urls = loop_through_source_url(USER_AGENTS, url, num_of_pages)\n",
|
| 306 |
+
" save_page_file(pages_urls, pages_file_path)\n",
|
| 307 |
+
"# Reading the existing page URLs from the file.\n",
|
| 308 |
+
"with open(pages_file_path, \"r\") as filehandle:\n",
|
| 309 |
+
" pages_urls = [\n",
|
| 310 |
+
" current_place.rstrip() for current_place in filehandle.readlines()\n",
|
| 311 |
+
" ]\n"
|
| 312 |
+
]
|
| 313 |
+
},
|
| 314 |
+
{
|
| 315 |
+
"cell_type": "code",
|
| 316 |
+
"execution_count": 5,
|
| 317 |
+
"metadata": {},
|
| 318 |
+
"outputs": [
|
| 319 |
+
{
|
| 320 |
+
"name": "stderr",
|
| 321 |
+
"output_type": "stream",
|
| 322 |
+
"text": [
|
| 323 |
+
" 0%| | 0/52 [00:01<?, ?it/s]\n"
|
| 324 |
+
]
|
| 325 |
+
}
|
| 326 |
+
],
|
| 327 |
+
"source": [
|
| 328 |
+
"\n",
|
| 329 |
+
"# If the posts file doesn't exist, the script collects post URLs.\n",
|
| 330 |
+
"if not os.path.exists(post_file_path):\n",
|
| 331 |
+
" post_urls = loop_through_pages(USER_AGENTS, pages_urls)\n",
|
| 332 |
+
" save_page_file(post_urls, post_file_path)\n",
|
| 333 |
+
"# Reading the existing post URLs from the file.\n",
|
| 334 |
+
"with open(post_file_path, \"r\") as filehandle:\n",
|
| 335 |
+
" post_urls = [current_place.rstrip() for current_place in filehandle.readlines()]\n",
|
| 336 |
+
"\n"
|
| 337 |
+
]
|
| 338 |
+
},
|
| 339 |
+
{
|
| 340 |
+
"cell_type": "code",
|
| 341 |
+
"execution_count": 6,
|
| 342 |
+
"metadata": {},
|
| 343 |
+
"outputs": [],
|
| 344 |
+
"source": [
|
| 345 |
+
"# # for post_url in [\"https://bitcointalk.org/index.php?topic=1306983.0\"]:\n",
|
| 346 |
+
"# for post_url in [\"https://bitcointalk.org/index.php?topic=5489570.0\"]:\n",
|
| 347 |
+
"# time.sleep(0.8)\n",
|
| 348 |
+
"# num_of_post_pages = get_post_max_page(post_url, USER_AGENTS)\n",
|
| 349 |
+
"# loop_through_posts(USER_AGENTS, post_url, board, num_of_post_pages, remove_emoji)"
|
| 350 |
+
]
|
| 351 |
+
},
|
| 352 |
+
{
|
| 353 |
+
"cell_type": "code",
|
| 354 |
+
"execution_count": 7,
|
| 355 |
+
"metadata": {},
|
| 356 |
+
"outputs": [
|
| 357 |
+
{
|
| 358 |
+
"name": "stderr",
|
| 359 |
+
"output_type": "stream",
|
| 360 |
+
"text": [
|
| 361 |
+
"100%|██████████| 39/39 [01:02<00:00, 1.60s/it]\n"
|
| 362 |
+
]
|
| 363 |
+
}
|
| 364 |
+
],
|
| 365 |
+
"source": [
|
| 366 |
+
"post_urls_to_process = []\n",
|
| 367 |
+
"for post_url in post_urls:\n",
|
| 368 |
+
" topic_id = post_url.split('topic=')[1]\n",
|
| 369 |
+
" if os.path.exists(f'data/{board}/data_{topic_id}.csv'):\n",
|
| 370 |
+
" # print(f'data/{board}/data_{topic_id}.csv already exists')\n",
|
| 371 |
+
" continue\n",
|
| 372 |
+
" post_urls_to_process.append(post_url)\n",
|
| 373 |
+
"\n",
|
| 374 |
+
"\n",
|
| 375 |
+
"# for (i,post_url) in enumerate(post_urls_to_process):\n",
|
| 376 |
+
"for post_url in tqdm(post_urls_to_process):\n",
|
| 377 |
+
" num_of_post_pages = get_post_max_page(post_url, USER_AGENTS)\n",
|
| 378 |
+
" loop_through_posts(USER_AGENTS, post_url, board, num_of_post_pages, remove_emoji)\n",
|
| 379 |
+
" # print(f'{i+1}/{len(post_urls_to_process)} urls done')"
|
| 380 |
+
]
|
| 381 |
+
},
|
| 382 |
+
{
|
| 383 |
+
"cell_type": "code",
|
| 384 |
+
"execution_count": 8,
|
| 385 |
+
"metadata": {},
|
| 386 |
+
"outputs": [],
|
| 387 |
+
"source": [
|
| 388 |
+
"# import time\n",
|
| 389 |
+
"# import winsound\n",
|
| 390 |
+
"# from tqdm import tqdm\n",
|
| 391 |
+
"# winsound.MessageBeep(winsound.MB_ICONEXCLAMATION)"
|
| 392 |
+
]
|
| 393 |
+
}
|
| 394 |
+
],
|
| 395 |
+
"metadata": {
|
| 396 |
+
"kernelspec": {
|
| 397 |
+
"display_name": "py310",
|
| 398 |
+
"language": "python",
|
| 399 |
+
"name": "python3"
|
| 400 |
+
},
|
| 401 |
+
"language_info": {
|
| 402 |
+
"codemirror_mode": {
|
| 403 |
+
"name": "ipython",
|
| 404 |
+
"version": 3
|
| 405 |
+
},
|
| 406 |
+
"file_extension": ".py",
|
| 407 |
+
"mimetype": "text/x-python",
|
| 408 |
+
"name": "python",
|
| 409 |
+
"nbconvert_exporter": "python",
|
| 410 |
+
"pygments_lexer": "ipython3",
|
| 411 |
+
"version": "3.10.13"
|
| 412 |
+
}
|
| 413 |
+
},
|
| 414 |
+
"nbformat": 4,
|
| 415 |
+
"nbformat_minor": 2
|
| 416 |
+
}
|
scraper/main.py
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Importing necessary libraries:
|
| 2 |
+
# - os, json, time for file, data and time operations respectively.
|
| 3 |
+
# - requests for making HTTP requests.
|
| 4 |
+
# - BeautifulSoup for parsing HTML content.
|
| 5 |
+
# - Other imports for logging, data manipulation, progress indication, and more.
|
| 6 |
+
import os
|
| 7 |
+
import json
|
| 8 |
+
import time
|
| 9 |
+
import munch
|
| 10 |
+
import requests
|
| 11 |
+
import argparse
|
| 12 |
+
import pandas as pd
|
| 13 |
+
from tqdm import tqdm
|
| 14 |
+
from datetime import date
|
| 15 |
+
from loguru import logger
|
| 16 |
+
from random import randint
|
| 17 |
+
from bs4 import BeautifulSoup, NavigableString
|
| 18 |
+
|
| 19 |
+
from preprocessing.preprocessing_sub_functions import remove_emojis
|
| 20 |
+
from topic_crawling import loop_through_posts
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# This function reads a JSON file named "website_format.json".
|
| 24 |
+
# The file contain a list of user agents.
|
| 25 |
+
# User agents are strings that browsers send to websites to identify themselves.
|
| 26 |
+
# This list is likely used to rotate between different user agents when making requests,
|
| 27 |
+
# making the scraper seem like different browsers and reducing the chances of being blocked.
|
| 28 |
+
def get_web_component():
    """Load the scraper configuration and return the list of user-agent strings.

    Reads ``website_format.json`` from the working directory and exposes its
    ``USER_AGENTS`` entry via attribute access (munch).
    """
    with open("website_format.json") as fh:
        config = munch.munchify(json.load(fh))
    return config.USER_AGENTS
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# This function fetches a webpage's content.
|
| 36 |
+
# It randomly selects a user agent from the provided list to make the request.
|
| 37 |
+
# After fetching, it uses BeautifulSoup to parse the page's HTML content.
|
| 38 |
+
def get_web_content(url, USER_AGENTS):
    """Fetch *url* with a randomly chosen user agent and return parsed HTML.

    The response encoding is set from the apparent encoding before parsing
    so non-ASCII forum content decodes correctly.
    """
    agent = USER_AGENTS[randint(0, len(USER_AGENTS) - 1)]
    response = requests.get(url, headers={"User-Agent": agent})
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, features="lxml")
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# This function extracts pagination links from a page.
|
| 48 |
+
# These links point to other pages of content, often seen at the bottom of forums or search results.
|
| 49 |
+
# The function returns both the individual page links and the "next" link,
|
| 50 |
+
# which points to the next set of results.
|
| 51 |
+
def get_pages_urls(url, USER_AGENTS):
    """Return (page links including *url*, link to the next batch of pages).

    Scrapes the pagination bar of a board page.  The "next" link is assumed
    to be the third-last anchor in the bar -- TODO confirm against the site
    markup for boards with very few pages.
    """
    time.sleep(1)  # be polite to the server
    soup = get_web_content(url, USER_AGENTS)
    pagination = soup.find("td", class_="middletext", id="toppages")
    anchors = pagination.find_all("a", class_="navPages")
    links = [anchor["href"] for anchor in anchors]
    next_batch = links[-3]
    return [url] + links, next_batch
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# This function extracts individual post URLs from a page.
|
| 64 |
+
# It's likely targeting a forum or blog structure, where multiple posts or threads are listed on one page.
|
| 65 |
+
def get_post_urls(url, USER_AGENTS):
    """Return the topic (post) links listed on one board page."""
    time.sleep(1)  # be polite to the server
    soup = get_web_content(url, USER_AGENTS)
    # Topic rows live under 'td.windowbg span a'.  Sticky rules/announcement
    # rows ('td.windowbg3') are intentionally excluded.
    return [anchor["href"] for anchor in soup.select("td.windowbg span a")]
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# This function loops through the main page and its paginated versions to collect URLs.
|
| 81 |
+
# It repeatedly calls 'get_pages_urls' to fetch batches of URLs until the desired number (num_of_pages) is reached.
|
| 82 |
+
def loop_through_source_url(USER_AGENTS, url, num_of_pages):
    """Collect at least *num_of_pages* unique board-page URLs.

    Starting from *url*, repeatedly follows the "next" pagination link and
    accumulates page URLs, removing duplicates while preserving order.

    Args:
        USER_AGENTS: list of user-agent strings to rotate through.
        url: board URL to start from.
        num_of_pages: target number of page URLs to collect.

    Returns:
        List of unique page URLs.
    """
    pages_urls = []
    # BUGFIX: '<' instead of '!=' so the loop terminates even when one
    # pagination batch overshoots the requested count (the '!=' form could
    # spin forever).  The unused 'counter' variable was removed.
    while len(pages_urls) < num_of_pages:
        href_links, next_50_link = get_pages_urls(url, USER_AGENTS)
        pages_urls.extend(href_links)
        pages_urls = list(dict.fromkeys(pages_urls))  # de-dup, keep order
        url = next_50_link
    return pages_urls
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
# This function loops through the provided list of page URLs and extracts post URLs from each of these pages.
|
| 94 |
+
# It ensures that there are no duplicate post URLs by converting the list into a dictionary and back to a list.
|
| 95 |
+
# It returns a list of unique post URLs.
|
| 96 |
+
def loop_through_pages(USER_AGENTS, pages_urls):
    """Visit every board page and return the unique topic URLs found.

    Duplicates are removed while preserving first-seen order.
    """
    collected = []
    for page_url in tqdm(pages_urls):
        collected.extend(get_post_urls(page_url, USER_AGENTS))
    return list(dict.fromkeys(collected))
|
| 103 |
+
|
| 104 |
+
# This function processes a post page. It extracts various details like timestamps, author information, post content, topic, attachments, links, and original HTML information.
|
| 105 |
+
# The function returns a dictionary containing all this extracted data.
|
| 106 |
+
def read_subject_page(USER_AGENTS, post_url, df, remove_emoji):
    """Scrape one page of a topic and append each parsed post to *df*.

    Posts live in 'td.windowbg' / 'td.windowbg2' cells inside the
    quick-moderation form; cells that do not parse to a post are skipped.

    Returns:
        The dataframe with one new row per parsed post.
    """
    time.sleep(1)  # be polite to the server
    soup = get_web_content(post_url, USER_AGENTS)
    table = soup.find("form", id="quickModForm").find("table", class_="bordercolor")
    cells = table.find_all("td", class_="windowbg")
    cells += table.find_all("td", class_="windowbg2")

    for cell in tqdm(cells):
        parsed = extract_useful_content_windowbg(cell, remove_emoji)
        if parsed is not None:
            df = pd.concat([df, pd.DataFrame([parsed])])
    return df
|
| 120 |
+
|
| 121 |
+
# This function extracts meaningful content from a given HTML element (`tr_tag`). This tag is likely a row in a table, given its name.
|
| 122 |
+
# The function checks the presence of specific tags and classes within this row to extract information such as timestamps, author, post content, topic, attachments, and links.
|
| 123 |
+
# The extracted data is returned as a dictionary.
|
| 124 |
+
def extract_useful_content_windowbg(tr_tag, remove_emoji=True):
    """Parse one post row and return its fields as a dict.

    Extracted fields:
        timestamp     -- when the post was made (e.g. "September 11, 2023, 07:49:45 AM")
        last_edit     -- last-edit time, or None if the post was never edited
        author        -- poster name, "Anonymous" when no profile link exists
        post          -- the post's own text (quote blocks excluded)
        topic         -- subject line of the post
        attachment    -- 1 if the post embeds a non-emoji image or a video, else 0
        link          -- 1 if the post body contains a 'ul' link block, else 0
        original_info -- the raw 'td_headerandpost' tag for later reference

    Returns None when the row has no header/post cell (not a post row).
    """
    headerandpost = tr_tag.find("td", class_="td_headerandpost")
    if not headerandpost:
        return None

    # Header text looks like "<posted>Last edit: <edited> by <user>".
    timestamp = headerandpost.find("div", class_="smalltext").get_text()
    timestamps = timestamp.split("Last edit: ")
    timestamp = timestamps[0].strip()
    last_edit = None
    if len(timestamps) > 1:
        if 'Today ' in timestamps[1]:
            # Resolve the site's relative "Today at HH:MM:SS" to an absolute
            # date.  BUGFIX: this branch used to be unconditionally clobbered
            # by the assignment below; it is now a proper if/else.
            last_edit = date.today().strftime("%B %d, %Y") + ', ' + \
                timestamps[1].split('by')[0].split("Today at")[1].strip()
        else:
            last_edit = timestamps[1].split('by')[0].strip()

    poster_info_tag = tr_tag.find('td', class_='poster_info')
    anchor_tag = poster_info_tag.find('a')
    author = "Anonymous" if anchor_tag is None else anchor_tag.get_text()

    link = 0

    # Collect the post's own text: direct strings belong to the post itself
    # (quote blocks are nested tags and are skipped); a child carrying the
    # 'ul' class is a link block and flips the link flag.
    post_ = tr_tag.find('div', class_='post')
    texts = []
    for child in post_.children:
        if isinstance(child, NavigableString):
            texts.append(child.strip())
        elif child.has_attr('class') and 'ul' in child['class']:
            link = 1
            texts.append(child.get_text(strip=True))
    post = ' '.join(texts)

    topic = headerandpost.find('div', class_='subject').get_text()

    # Attachment flag: any image left after optional emoji filtering, or any
    # video, counts as an attachment.
    image = headerandpost.find('div', class_='post').find_all('img')
    if remove_emoji:
        image = remove_emojis(image)
    image_ = min(len(image), 1)

    video = headerandpost.find('div', class_='post').find('video')
    video_ = 0 if video is None else 1
    attachment = max(image_, video_)

    original_info = headerandpost

    return {
        "timestamp": timestamp,
        "last_edit": last_edit,
        "author": author.strip(),
        "post": post.strip(),
        "topic": topic.strip(),
        "attachment": attachment,
        "link": link,
        "original_info": original_info,
    }
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
# A utility function to save a list (e.g., URLs) to a text file.
|
| 190 |
+
# Each item in the list gets its own line in the file.
|
| 191 |
+
def save_page_file(data, file_name):
    """Write each item of *data* to *file_name*, one item per line."""
    lines = [f"{item}\n" for item in data]
    with open(file_name, "w") as fh:
        fh.writelines(lines)
|
| 195 |
+
|
| 196 |
+
def get_post_max_page(url, USER_AGENTS):
    """Return the highest page number shown in a topic's pagination bar.

    Non-numeric pagination anchors (e.g. "All", arrows) count as 0.
    ROBUSTNESS: a single-page topic has no 'navPages' anchors at all, which
    previously made max() raise on an empty sequence; default to 1 instead.
    """
    soup = get_web_content(url, USER_AGENTS)
    # Finding the pagination links based on their HTML structure and CSS classes.
    first_td = soup.find('td', class_='middletext')
    nav_pages_links = first_td.find_all('a', class_='navPages')

    page_numbers = [int(a.text) if a.text.isdigit() else 0 for a in nav_pages_links]
    return max(page_numbers, default=1)
|
| 204 |
+
|
| 205 |
+
# This function sets up command-line arguments for the script, allowing users to provide input without modifying the code.
|
| 206 |
+
# Possible inputs include the starting URL, whether or not to update data, the board's name, and how many pages or posts to process.
|
| 207 |
+
def parse_args():
    """Build the crawler CLI and return the parsed options as a dict.

    Returned keys match main()'s keyword parameters: url, update, board,
    num_of_pages, num_of_posts_start, remove_emoji.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="url for the extraction")
    parser.add_argument("--update", help="extract updated data", action="store_true")
    parser.add_argument("--board", help="board name")
    parser.add_argument("--num_of_pages", '-pages', help="number of pages to extract", type=int)
    parser.add_argument("--num_of_posts_start", '-posts',
                        help="the number of posts start to extract", type=int, default=0)
    # BUGFIX: this was a positional argument with action="store_true", which
    # argparse rejects at definition time (store_true implies nargs=0, invalid
    # for positionals).  It is now an optional flag defaulting to False.
    parser.add_argument("--remove_emoji", help="remove emoji from the post", action="store_true")
    return vars(parser.parse_args())
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def main(url, update, board, num_of_pages, num_of_posts_start, remove_emoji):
    """Crawl a forum board: gather page URLs, then topic URLs, then scrape posts.

    Both URL lists are cached as text files under data/<board>/ so repeated
    runs skip the collection phase unless *update* is set.

    Args:
        url: board URL to start crawling from.
        update: when True, delete the cached URL lists and re-collect them.
        board: board name; used as the data subdirectory.
        num_of_pages: number of board pages to collect.
        num_of_posts_start: index into the topic list to resume from.
        remove_emoji: forwarded to the post parser to drop smiley images.
    """
    USER_AGENTS = get_web_component()
    # Ensuring the data directory exists.
    os.makedirs(f"data/{board}/", exist_ok=True)
    pages_file_path = f"data/{board}/pages_urls.txt"
    post_file_path = f"data/{board}/post_urls.txt"
    # If the user chose to update the data, existing files are deleted to make way for new data.
    if update:
        if os.path.exists(pages_file_path):
            os.remove(pages_file_path)
        if os.path.exists(post_file_path):
            os.remove(post_file_path)

    # If the pages file doesn't exist, the script collects page URLs.
    if not os.path.exists(pages_file_path):
        pages_urls = loop_through_source_url(USER_AGENTS, url, num_of_pages)
        save_page_file(pages_urls, pages_file_path)
    # Reading the existing page URLs from the file (cache written just above
    # on a fresh run).
    with open(pages_file_path, "r") as filehandle:
        pages_urls = [
            current_place.rstrip() for current_place in filehandle.readlines()
        ]

    # If the posts file doesn't exist, the script collects post URLs.
    if not os.path.exists(post_file_path):
        post_urls = loop_through_pages(USER_AGENTS, pages_urls)
        save_page_file(post_urls, post_file_path)
    # Reading the existing post URLs from the file.
    with open(post_file_path, "r") as filehandle:
        post_urls = [current_place.rstrip() for current_place in filehandle.readlines()]

    # Scrape every topic, resuming at num_of_posts_start; the sleep throttles
    # requests between topics.
    for post_url in tqdm(post_urls[num_of_posts_start:]):
        time.sleep(0.8)
        num_of_post_pages = get_post_max_page(post_url, USER_AGENTS)
        loop_through_posts(USER_AGENTS, post_url, board, num_of_post_pages, remove_emoji)
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
# Script entry point: run the crawler with options parsed from the command line.
if __name__ == "__main__":
    main(**parse_args())
|
scraper/preprocessing/__pycache__/preprocessing_sub_functions.cpython-310.pyc
ADDED
|
Binary file (7.91 kB). View file
|
|
|
scraper/preprocessing/preprocessing.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Importing standard libraries
|
| 2 |
+
import os
|
| 3 |
+
import glob
|
| 4 |
+
import argparse
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from tqdm import tqdm
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
# Additional preprocessing functions are imported from another module.
|
| 10 |
+
from preprocessing_sub_functions import *
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# This function returns a list of all CSV files in the given directory path.
|
| 14 |
+
def get_files(path):
    """Return the paths of all CSV files directly inside *path*."""
    pattern = "{}/*.csv".format(path)
    return glob.glob(pattern)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# This function aims to remove meta information from the text.
|
| 19 |
+
# The specifics of what meta information is removed depends on the function 'remove_meta_info'.
|
| 20 |
+
def raw_preprocess(text):
    """Light cleaning for the 'raw-data' output: strip quote meta headers only."""
    return remove_meta_info(text)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# A comprehensive text preprocessing function that applies several common preprocessing steps:
|
| 26 |
+
# - URLs are removed from the text.
|
| 27 |
+
# - The entire text is converted to lowercase to ensure uniformity.
|
| 28 |
+
# - Punctuation is stripped from the text.
|
| 29 |
+
# - Extra whitespaces (if any) are removed.
|
| 30 |
+
# - The text is tokenized (split into individual words or tokens).
|
| 31 |
+
# - Contractions (like "can't" or "won't") are expanded to their full forms.
|
| 32 |
+
# - Common words (stopwords) that don't add significant meaning are removed.
|
| 33 |
+
# Finally, the cleaned tokens are joined back into a string.
|
| 34 |
+
def text_preprocess(text):
    """Full cleaning pipeline used for the 'preprocessed_post' column.

    Steps, in order: drop URLs, lowercase, blank out non-math punctuation,
    collapse whitespace, tokenize, expand contractions, drop English
    stopwords, rejoin with single spaces.
    """
    cleaned = remove_extra_whitespace(
        remove_sentence_punctuation(to_lowercase(remove_urls(text)))
    )
    tokens = remove_stopwords(expand_contractions(tokenize(cleaned)))
    return " ".join(tokens)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# This function preprocesses a dataframe.
|
| 47 |
+
# Specific preprocessing steps include:
|
| 48 |
+
# - Removing rows marked as 'deleted'.
|
| 49 |
+
# - Removing posts marked as 'deleted'.
|
| 50 |
+
# - Updating the 'lastEdit' column.
|
| 51 |
+
# - Converting timestamps to a datetime format.
|
| 52 |
+
# - Renaming the 'timestamp' column to 'start_edit'.
|
| 53 |
+
def csv_preprocess(df):
    """Dataframe-level cleanup for one crawled CSV.

    Drops deleted rows/posts, backfills 'last_edit' from 'timestamp',
    converts both columns to dates, then renames 'timestamp' to 'start_edit'.
    """
    for step in (remove_deleted, remove_deleted_post, update_lastEdit, convert_to_datetime):
        df = step(df)
    df.rename(columns={"timestamp": "start_edit"}, inplace=True)
    return df
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# This function processes individual CSV files:
|
| 63 |
+
# - Reads the CSV into a DataFrame.
|
| 64 |
+
# - Applies dataframe preprocessing.
|
| 65 |
+
# - Applies raw text preprocessing to the 'post' column.
|
| 66 |
+
# - Saves the raw preprocessed data into a 'raw-data' folder.
|
| 67 |
+
# - Applies comprehensive text preprocessing to the 'post' column.
|
| 68 |
+
# - Saves the fully preprocessed data into a 'preprocessed-data' folder.
|
| 69 |
+
def loop_through_csvs(filePath):
    """Preprocess one crawled CSV and write a raw and a fully-cleaned copy.

    Writes two files, mirroring the input's board subfolder:
        raw-data/<board>/<file>          -- after dataframe cleanup plus
                                            quote-meta removal on 'post'
        preprocessed-data/<board>/<file> -- adds a 'preprocessed_post' column
                                            with the full text pipeline

    Args:
        filePath: path to a crawled CSV (data/<board>/<file>.csv).

    Returns:
        The final dataframe (with 'preprocessed_post').
    """
    file = os.path.basename(filePath)
    folder = os.path.basename(os.path.dirname(filePath))
    df = pd.read_csv(filePath)
    df = csv_preprocess(df)

    # Create a directory for raw data if it doesn't exist.
    raw_folder = Path(f"raw-data/{folder}")
    raw_folder.mkdir(parents=True, exist_ok=True)

    # Apply raw preprocessing (quote-meta removal) to the 'post' column.
    df["post"] = df["post"].apply(raw_preprocess)

    # Sort the dataframe chronologically by the 'last_edit' column.
    df.sort_values(by=["last_edit"], inplace=True)

    # Save the raw preprocessed dataframe to a CSV file.
    df.to_csv(f"{raw_folder}/{file}", index=False)

    # Create a directory for fully preprocessed data if it doesn't exist.
    clean_folder = Path(f"preprocessed-data/{folder}")
    clean_folder.mkdir(parents=True, exist_ok=True)

    # Apply the comprehensive text preprocessing to the (already raw-cleaned)
    # 'post' column and store the result in a new column.
    df["preprocessed_post"] = df["post"].apply(text_preprocess)

    # Sort the dataframe by the 'last_edit' column again.
    df.sort_values(by=["last_edit"], inplace=True)

    # Save the fully preprocessed dataframe to a CSV file.
    df.to_csv(f"{clean_folder}/{file}", index=False)

    return df
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# A function to parse command-line arguments.
|
| 105 |
+
# The script expects a 'path' argument which indicates the directory where the raw CSV files are located.
|
| 106 |
+
def parse_args():
    """Parse the single CLI argument: the directory holding raw CSV files."""
    parser = argparse.ArgumentParser()
    parser.add_argument("path", help="path for the extraction")
    namespace = parser.parse_args()
    return vars(namespace)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# The main function of the script:
|
| 113 |
+
# - It retrieves all the CSV files from the specified directory.
|
| 114 |
+
# - Loops through each file, applying the preprocessing steps.
|
| 115 |
+
# - If an error occurs during processing, the error message is appended to an 'error_log.txt' file.
|
| 116 |
+
def main(path):
    """Preprocess every CSV under *path*; failures are logged, not fatal.

    Errors are appended to <path>/error_log.txt so one bad file does not
    abort the whole batch.

    Args:
        path: directory containing the crawled CSV files.
    """
    print(f'Preprocessing data in {path}')
    raw_files = get_files(path)
    for file_path in tqdm(raw_files):
        try:
            loop_through_csvs(file_path)
        except Exception as e:
            # BUGFIX: the log line previously ended with a literal
            # backslash-n ("\\n"), producing one run-on line; write a real
            # newline.  The unused 'df =' binding was also dropped.
            with open(f"{path}/error_log.txt", "a") as f:
                f.write(f"{file_path} -- {e}\n")
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
# Script entry point: preprocess all CSVs in the directory given on the CLI.
if __name__ == "__main__":
    main(**parse_args())
|
scraper/preprocessing/preprocessing.sh
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Run the preprocessing pipeline over each crawled board's data directory.
python preprocessing.py /local/home/puwong/bitcoin/bitcointalk_crawler/data/mining_support
python preprocessing.py /local/home/puwong/bitcoin/bitcointalk_crawler/data/mining_speculation
python preprocessing.py /local/home/puwong/bitcoin/bitcointalk_crawler/data/miners
python preprocessing.py /local/home/puwong/bitcoin/bitcointalk_crawler/data/hardware
python preprocessing.py /local/home/puwong/bitcoin/bitcointalk_crawler/data/groupbuys
python preprocessing.py /local/home/puwong/bitcoin/bitcointalk_crawler/data/pools
python preprocessing.py /local/home/puwong/bitcoin/bitcointalk_crawler/data/mining
|
scraper/preprocessing/preprocessing_sub_functions.py
ADDED
|
@@ -0,0 +1,248 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# preprocessing sub functions
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
import os
|
| 5 |
+
import glob
|
| 6 |
+
import string
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
import nltk
|
| 10 |
+
from nltk.corpus import stopwords
|
| 11 |
+
from nltk.stem import WordNetLemmatizer
|
| 12 |
+
import contractions
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def remove_deleted(df):
    """Drop rows whose 'timestamp' is purely numeric (deleted-post marker).

    Args:
        df: dataframe of crawled posts.

    Returns:
        The dataframe without deleted rows, reindexed from 0.
    """
    keep = ~df['timestamp'].str.isnumeric()
    df = df[keep]
    df.reset_index(drop=True, inplace=True)
    return df
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def remove_deleted_post(df):
    """Drop rows whose post body is the literal 'del' (alternate deletion
    marker) and reindex from 0.

    Args:
        df: dataframe of crawled posts.

    Returns:
        The dataframe without deleted posts.
    """
    keep = df['post'] != 'del'
    df = df[keep]
    df.reset_index(drop=True, inplace=True)
    return df
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def update_lastEdit(df):
    """Backfill missing 'last_edit' values from the 'timestamp' column.

    A post that was never edited has NaN in 'last_edit'; treating its
    creation time as the last edit simplifies downstream date handling.

    Args:
        df: dataframe of crawled posts.

    Returns:
        The dataframe with 'last_edit' fully populated.
    """
    filled = df['last_edit'].where(df['last_edit'].notna(), df['timestamp'])
    df.loc[:, 'last_edit'] = filled
    return df
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def preprocess_date(date_str):
    """Replace the site's relative 'Today' token with the current date.

    E.g. "Today at 10:00:00 AM" becomes "<Month> <day>, <year> at 10:00:00 AM".
    Strings without 'Today ' are returned unchanged.

    Args:
        date_str: str that contains date information.

    Returns:
        The date string with 'Today' resolved to an absolute date.
    """
    if "Today " not in date_str:
        return date_str
    today = datetime.now().strftime("%B %d, %Y")
    return date_str.replace("Today", today)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def convert_datetime_with_multiple_formats(date_str, formats):
    """Parse *date_str* trying each layout in *formats* in order.

    Args:
        date_str: str that contains date information.
        formats: list of strptime-style format strings to try.

    Returns:
        The first successfully parsed pandas Timestamp.

    Raises:
        ValueError: when no format matches.
    """
    for candidate in formats:
        try:
            parsed = pd.to_datetime(date_str, format=candidate)
        except ValueError:
            continue
        return parsed
    raise ValueError(f"Time data {date_str} doesn't match provided formats")
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def convert_to_datetime(df_):
    """Return a copy of *df_* with 'timestamp' and 'last_edit' as date objects.

    'Today' tokens are resolved to absolute dates first, then each column is
    parsed against the two datetime layouts the forum uses, and finally
    truncated to calendar dates.

    Args:
        df_: dataframe of crawled posts (not modified).

    Returns:
        A new dataframe with both columns converted.
    """
    frame = df_.copy()

    # The two header layouts observed on the site.
    layouts = ["%B %d, %Y at %I:%M:%S %p", "%B %d, %Y, %I:%M:%S %p"]

    for column in ('timestamp', 'last_edit'):
        frame[column] = frame[column].apply(preprocess_date)
        frame[column] = frame[column].apply(
            convert_datetime_with_multiple_formats, formats=layouts)
        frame[column] = frame[column].dt.date

    return frame
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def remove_urls(text):
    """Delete every whitespace-delimited token starting with 'http' (URLs)."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)
|
| 136 |
+
|
| 137 |
+
#
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def remove_extra_whitespace(text):
    """Collapse runs of whitespace into single spaces and trim the ends."""
    tokens = text.split()
    return " ".join(tokens)
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def remove_special_characters(text):
    """Strip everything except word characters and whitespace."""
    special = re.compile(r'[^\w\s]')
    return special.sub('', text)
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def to_lowercase(text):
    """Return *text* with every character lowercased."""
    lowered = text.lower()
    return lowered
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def remove_meta_info(text):
    """Strip 'Quote from: <user> on <date> AM/PM' headers left by quoting.

    Non-string input is stringified first (the crawler sometimes yields NaN).
    """
    quote_header = re.compile(
        r'Quote from: [a-zA-Z0-9_]+ on [a-zA-Z0-9, :]+ (AM|PM)')
    return quote_header.sub('', str(text))
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def tokenize(text):
    """Split *text* on single spaces into a list of tokens.

    Note: splitting on ' ' (not arbitrary whitespace) preserves empty tokens
    for consecutive spaces, matching the rest of the pipeline.
    """
    return text.split(' ')
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def remove_sentence_punctuation(text):
    """Replace punctuation with spaces while keeping math symbols intact.

    Symbols like + - * / = ( ) are preserved so numeric expressions in
    posts survive cleaning.
    """
    keep = set("+-×*÷/=()[]{},.<>%^")
    to_blank = ''.join(ch for ch in string.punctuation if ch not in keep)
    table = str.maketrans(to_blank, ' ' * len(to_blank))
    return text.translate(table)
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def lemmatize_text(text):
    """Reduce each whitespace-separated word to its WordNet base form."""
    lemmatizer = WordNetLemmatizer()
    base_forms = (lemmatizer.lemmatize(word) for word in text.split())
    return ' '.join(base_forms)
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
def replace_numbers(text, replace_with="<NUM>"):
    """Substitute each standalone integer in *text* with *replace_with*.

    Digits embedded in words (e.g. 'x12y') are left alone.
    """
    standalone_int = re.compile(r'\b\d+\b')
    return standalone_int.sub(replace_with, text)
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def remove_stopwords(tokens):
    """Filter English stopwords out of a token list."""
    stop_words = set(stopwords.words('english'))
    return [tok for tok in tokens if tok not in stop_words]
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def expand_contractions(tokens):
    r"""
    Expand English contractions token by token (e.g. "isn't" -> "is not").
    """
    expanded = []
    for token in tokens:
        expanded.append(contractions.fix(token))
    return expanded
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
def remove_repeated_phrases(text):
    r"""
    Keep only the first occurrence of each whitespace-separated word.

    eg. "hello hello world" -> "hello world"
    """
    # dict.fromkeys preserves insertion order, so duplicates are dropped
    # while every first occurrence keeps its position.
    return ' '.join(dict.fromkeys(text.split()))
|
| 240 |
+
|
| 241 |
+
def remove_emojis(images):
    r"""
    Filter forum smiley images out of a list of <img> elements.

    An image counts as an emoji when its ``src`` attribute points into
    the bitcointalk.org default Smileys directory.
    """
    emoji_src = re.compile(
        r"https://bitcointalk\.org/Smileys/default/[a-zA-Z0-9_-]+\.gif")
    return [img for img in images if not emoji_src.search(img["src"])]
|
scraper/sort.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from tqdm import tqdm
|
| 5 |
+
|
| 6 |
+
# Function to process and sort CSV files within a given folder
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def process_csvs(folder_path, new_folder_name):
    """Merge, sort and re-chunk every CSV file found in *folder_path*.

    All CSVs in the folder are concatenated, sorted by the "last_edit"
    column, split into chunks of 10,000 rows and written under
    *new_folder_name* with filenames derived from each chunk's date range.
    """
    chunk_size = 10000
    # The board name is the source folder's basename.
    board = os.path.basename(folder_path)

    # Destination directory for the sorted output.
    out_dir = Path(new_folder_name)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Load every CSV in the source folder.
    csv_paths = [
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.endswith(".csv")
    ]
    frames = [pd.read_csv(path) for path in csv_paths]
    merged = pd.concat(frames, ignore_index=True)

    # Order rows chronologically by their last edit time.
    merged = merged.sort_values(by="last_edit")

    # Slice into fixed-size chunks (the final chunk may be smaller).
    num_chunks = len(merged) // chunk_size + (1 if len(merged) % chunk_size else 0)
    pieces = [merged.iloc[i * chunk_size:(i + 1) * chunk_size]
              for i in range(num_chunks)]

    # Name each output file after the date span it covers.
    for piece in tqdm(pieces):
        first = pd.to_datetime(piece["last_edit"].iloc[0]).strftime("%d%m%y")
        last = pd.to_datetime(piece["last_edit"].iloc[-1]).strftime("%d%m%y")
        out_name = f"BitcoinForum_{board}_{first}_to_{last}.csv"
        piece.to_csv(os.path.join(out_dir, out_name), index=False)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# Data directories whose board sub-folders should be sorted.
folder_paths = [
    "./raw-data",
    "./preprocessed-data",
]

# Process every board sub-folder of each data directory, writing the
# output under a "sorted-" prefixed sibling directory.
for folder_path in folder_paths:
    base_name = os.path.basename(folder_path)
    target_dir = f"sorted-{base_name}"
    for entry in tqdm(os.listdir(folder_path)):
        candidate = os.path.join(folder_path, entry)
        if os.path.isdir(candidate):
            process_csvs(candidate, target_dir)
|
scraper/topic_crawling.py
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Importing necessary libraries:
|
| 2 |
+
# - os, json, time for file, data and time operations respectively.
|
| 3 |
+
# - requests for making HTTP requests.
|
| 4 |
+
# - BeautifulSoup for parsing HTML content.
|
| 5 |
+
# - Other imports for logging, data manipulation, progress indication, and more.
|
| 6 |
+
import os
|
| 7 |
+
import json
|
| 8 |
+
import time
|
| 9 |
+
import munch
|
| 10 |
+
import requests
|
| 11 |
+
import argparse
|
| 12 |
+
import pandas as pd
|
| 13 |
+
from tqdm import tqdm
|
| 14 |
+
from datetime import date
|
| 15 |
+
from loguru import logger
|
| 16 |
+
from random import randint
|
| 17 |
+
from bs4 import BeautifulSoup, NavigableString
|
| 18 |
+
import random
|
| 19 |
+
|
| 20 |
+
from preprocessing.preprocessing_sub_functions import remove_emojis
|
| 21 |
+
from torch import save,load
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# This function reads a JSON file named "website_format.json".
|
| 25 |
+
# The file contain a list of user agents.
|
| 26 |
+
# User agents are strings that browsers send to websites to identify themselves.
|
| 27 |
+
# This list is likely used to rotate between different user agents when making requests,
|
| 28 |
+
# making the scraper seem like different browsers and reducing the chances of being blocked.
|
| 29 |
+
def get_web_component():
    """Load the pool of user-agent strings from website_format.json.

    The JSON file holds a "USER_AGENTS" array; the parsed dict is
    munchified so the key can be read as an attribute.
    """
    with open("website_format.json") as handle:
        config = munch.munchify(json.load(handle))
    return config.USER_AGENTS
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# This function fetches a webpage's content.
|
| 38 |
+
# It randomly selects a user agent from the provided list to make the request.
|
| 39 |
+
# After fetching, it uses BeautifulSoup to parse the page's HTML content.
|
| 40 |
+
from collections import OrderedDict

# Tiny FIFO cache of the most recently fetched pages, so consecutive
# calls for the same URL do not hit the network again.
cache = OrderedDict()


def get_web_content(url, USER_AGENTS):
    """Fetch *url* and return it parsed as a BeautifulSoup document.

    Results are served from the module-level ``cache`` when present;
    otherwise the request is throttled, sent with a randomly chosen
    user agent from *USER_AGENTS*, parsed with lxml, and stored in the
    cache (evicting the oldest entry once three pages are held).
    """
    global cache  # module-level FIFO cache

    if url in cache:
        return cache[url]

    # Throttle outgoing requests a little to be polite to the server.
    time.sleep(0.7)

    # Pick a random user agent so the scraper looks like many browsers.
    agent = USER_AGENTS[random.randint(0, len(USER_AGENTS) - 1)]

    response = requests.get(url, headers={"User-Agent": agent})
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, features="lxml")

    # FIFO eviction: keep at most three cached pages.
    if len(cache) >= 3:
        cache.popitem(last=False)
    cache[url] = soup

    return soup
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# This function extracts pagination links from a page.
|
| 75 |
+
# These links point to other pages of content, often seen at the bottom of forums or search results.
|
| 76 |
+
# The function returns both the individual page links and the "next" link,
|
| 77 |
+
# which points to the next set of results.
|
| 78 |
+
def get_pages_urls(url, USER_AGENTS, next_50_pages):
    """Collect pagination links from the navigation bar of *url*.

    Returns ``(page_links, next_50_link)`` where *page_links* starts with
    *url* itself followed by every visible page link, and *next_50_link*
    is the link jumping ahead by 50 pages (``None`` when *next_50_pages*
    is false).
    """
    soup = get_web_content(url, USER_AGENTS)

    # The pagination bar lives in the first 'middletext' table cell.
    nav_cell = soup.find('td', class_='middletext')
    anchors = nav_cell.find_all('a', class_='navPages')
    links = [anchor['href'] for anchor in anchors]

    # HACK: assume the third-last anchor is the "next 50 pages" jump.
    jump_link = links[-3] if next_50_pages else None

    return [url] + links, jump_link
|
| 91 |
+
|
| 92 |
+
# This function loops through the main page and its paginated versions to collect URLs.
|
| 93 |
+
# It repeatedly calls 'get_pages_urls' to fetch batches of URLs until the desired number (num_of_pages) is reached.
|
| 94 |
+
def loop_through_source_url(USER_AGENTS, url, num_of_pages):
    """Accumulate page URLs by repeatedly following "next 50 pages" jumps.

    Keeps requesting pagination batches, starting at *url*, until at
    least *num_of_pages* unique URLs have been collected.
    """
    collected = []
    while len(collected) < num_of_pages:
        follow_jump = num_of_pages >= 50
        batch, jump = get_pages_urls(url, USER_AGENTS, follow_jump)
        collected.extend(batch)
        # De-duplicate while keeping first-seen order.
        collected = list(dict.fromkeys(collected))
        url = jump
    return collected
|
| 103 |
+
|
| 104 |
+
def get_subpages_urls(url, USER_AGENTS):
    """Return the pagination anchors of *url*, excluding the last one."""
    soup = get_web_content(url, USER_AGENTS)
    nav_cell = soup.find('td', class_='middletext')
    anchors = nav_cell.find_all('a', class_='navPages')
    return anchors[:-1]
|
| 110 |
+
|
| 111 |
+
def loop_through_posts(USER_AGENTS, post_url, board, num_of_pages, remove_emoji):
    """Scrape one topic and write its first page of posts to a CSV.

    Any failure is recorded in ``data/<board>/error_log.txt`` instead of
    aborting the whole crawl (best-effort behaviour).
    """
    columns = ['timestamp', 'last_edit', 'author', 'post', 'topic',
               'attachment', 'link', 'original_info']
    try:
        page_urls = loop_through_source_url(USER_AGENTS, post_url, num_of_pages)
        frame = pd.DataFrame(columns=columns)

        # Only the first page is needed to analyse the thread's category.
        for page_url in page_urls[:1]:
            frame = read_subject_page(USER_AGENTS, page_url, frame, remove_emoji)

        topic_id = post_url.split('topic=')[1]
        frame.to_csv(f'data/{board}/data_{topic_id}.csv', mode='w', index=False)

    except Exception as e:
        # Best-effort: log the failing URL and keep crawling.
        print(e)
        with open(f"data/{board}/error_log.txt", "a") as log:
            log.write(f"{post_url}\n -- {e}\n")
|
| 128 |
+
|
| 129 |
+
# This function processes a post page. It extracts various details like timestamps, author information, post content, topic, attachments, links, and original HTML information.
|
| 130 |
+
# The function returns a dictionary containing all this extracted data.
|
| 131 |
+
def read_subject_page(USER_AGENTS, post_url, df, remove_emoji):
    """Parse every post on *post_url* and append the extracted rows to *df*.

    Posts live in 'windowbg'/'windowbg2' table cells inside the quick-mod
    form; cells that do not contain a post are skipped.
    """
    soup = get_web_content(post_url, USER_AGENTS)
    table = soup.find('form', id='quickModForm').find('table', class_='bordercolor')

    cells = table.find_all('td', class_='windowbg')
    cells += table.find_all('td', class_='windowbg2')

    for cell in cells:
        row = extract_useful_content_windowbg(cell, remove_emoji)
        if row is not None:
            df = pd.concat([df, pd.DataFrame([row])])

    return df
|
| 144 |
+
|
| 145 |
+
# This function extracts meaningful content from a given HTML element (`tr_tag`). This tag is likely a row in a table, given its name.
|
| 146 |
+
# The function checks the presence of specific tags and classes within this row to extract information such as timestamps, author, post content, topic, attachments, and links.
|
| 147 |
+
# The extracted data is returned as a dictionary.
|
| 148 |
+
def extract_useful_content_windowbg(tr_tag, remove_emoji=True):
    """Extract one post's fields from a 'windowbg' table row.

    Returns a dict with:
      - timestamp: posting time (ex: September 11, 2023, 07:49:45 AM)
      - last_edit: last-edit time, or None when the post was never edited
      - author: poster name, "Anonymous" when no profile link exists
      - post: the post text
      - topic: the thread subject (ex: [INFO - DISCUSSION] Security Budget Problem)
      - attachment: 1 if the post embeds at least one image/video, else 0
      - link: 1 if the post contains at least one link, else 0
      - original_info: the raw header-and-post HTML element
    Returns None when the row holds no post.
    """
    headerandpost = tr_tag.find('td', class_='td_headerandpost')
    if not headerandpost:
        return None

    timestamp = headerandpost.find('div', class_='smalltext').get_text()
    timestamps = timestamp.split('Last edit: ')
    timestamp = timestamps[0].strip()
    last_edit = None
    if len(timestamps) > 1:
        if 'Today ' in timestamps[1]:
            # Resolve the relative "Today at HH:MM" form to a full date.
            # BUG FIX: this value used to be unconditionally overwritten by
            # the assignment below (missing `else`), making this branch a
            # dead store.
            last_edit = (date.today().strftime("%B %d, %Y") + ', '
                         + timestamps[1].split('by')[0].split("Today at")[1].strip())
        else:
            last_edit = timestamps[1].split('by')[0].strip()

    poster_info_tag = tr_tag.find('td', class_='poster_info')
    anchor_tag = poster_info_tag.find('a')
    author = "Anonymous" if anchor_tag is None else anchor_tag.get_text()

    link = 0

    post_ = tr_tag.find('div', class_='post')
    texts = []
    for child in post_.children:
        if isinstance(child, NavigableString):
            texts.append(child.strip())
        elif child.has_attr('class') and 'ul' in child['class']:
            # Link elements carry the 'ul' class on this forum.
            link = 1
            texts.append(child.get_text(strip=True))
    post = ' '.join(texts)

    topic = headerandpost.find('div', class_='subject').get_text()

    # Attachment flag: any image (optionally excluding smileys) or video.
    image = headerandpost.find('div', class_='post').find_all('img')
    if remove_emoji:
        image = remove_emojis(image)
    image_ = min(len(image), 1)

    video = headerandpost.find('div', class_='post').find('video')
    video_ = 0 if video is None else 1
    attachment = max(image_, video_)

    original_info = headerandpost

    return {
        'timestamp': timestamp,
        'last_edit': last_edit,
        'author': author.strip(),
        'post': post.strip(),
        'topic': topic.strip(),
        'attachment': attachment,
        'link': link,
        'original_info': original_info,
    }
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
# A utility function to save a list (e.g., URLs) to a text file.
|
| 215 |
+
# Each item in the list gets its own line in the file.
|
| 216 |
+
def save_page_file(data, file_name):
    """Write each item of *data* to *file_name*, one item per line."""
    with open(file_name, 'w') as handle:
        handle.writelines('%s\n' % item for item in data)
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
# This function sets up command-line arguments for the script, allowing users to provide input without modifying the code.
|
| 223 |
+
# Possible inputs include the starting URL, whether or not to update data, the board's name, and how many pages or posts to process.
|
| 224 |
+
def parse_args():
    """Parse command-line arguments.

    Returns a dict with keys ``url``, ``board``, ``num_of_pages`` and
    ``remove_emoji`` suitable for ``main(**parse_args())``.

    BUG FIX: ``remove_emoji`` was declared as a positional argument with
    ``action="store_true"``; argparse rejects zero-arity positional
    store actions at parser-build time. It is now the optional flag
    ``--remove_emoji``, matching the documented invocation
    (``python topic_crawling.py URL --board ... --num_of_pages ...``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="url for the extraction")
    parser.add_argument("--board", help="board name")
    parser.add_argument("--num_of_pages", '-pages', help="number of pages to extract", type=int)
    parser.add_argument("--remove_emoji", help="remove emoji from the post", action="store_true")
    return vars(parser.parse_args())
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
def main(url, board, num_of_pages, remove_emoji):
    """Entry point: load the user-agent pool and crawl one forum topic.

    Args:
        url: topic URL to crawl.
        board: board name, used to build the output directory path.
        num_of_pages: number of pages whose URLs are collected.
        remove_emoji: whether forum smiley images are filtered out.
    """
    USER_AGENTS = get_web_component()
    loop_through_posts(USER_AGENTS, url, board, num_of_pages, remove_emoji)
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
if __name__ == "__main__":
    # Build keyword arguments from the CLI and run the crawler.
    main(**parse_args())
|
| 242 |
+
|
| 243 |
+
# python topic_crawling.py https://bitcointalk.org/index.php?topic=28402.0 --board miners --num_of_pages 843
|
scraper/website_format.json
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"USER_AGENTS": [
|
| 3 |
+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
|
| 4 |
+
|
| 5 |
+
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
|
| 6 |
+
|
| 7 |
+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
|
| 8 |
+
|
| 9 |
+
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
|
| 10 |
+
|
| 11 |
+
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
|
| 12 |
+
|
| 13 |
+
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
|
| 14 |
+
|
| 15 |
+
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
|
| 16 |
+
|
| 17 |
+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
|
| 18 |
+
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
|
| 19 |
+
|
| 20 |
+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
|
| 21 |
+
|
| 22 |
+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
|
| 23 |
+
|
| 24 |
+
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
|
| 25 |
+
|
| 26 |
+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
|
| 27 |
+
|
| 28 |
+
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
|
| 29 |
+
|
| 30 |
+
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
|
| 31 |
+
|
| 32 |
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
|
| 33 |
+
|
| 34 |
+
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
|
| 35 |
+
|
| 36 |
+
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
|
| 37 |
+
|
| 38 |
+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
|
| 39 |
+
|
| 40 |
+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
|
| 41 |
+
|
| 42 |
+
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
|
| 43 |
+
|
| 44 |
+
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12",
|
| 45 |
+
|
| 46 |
+
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
|
| 47 |
+
|
| 48 |
+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
|
| 49 |
+
|
| 50 |
+
"Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.3 Mobile/14E277 Safari/603.1.30",
|
| 51 |
+
|
| 52 |
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
|
| 53 |
+
|
| 54 |
+
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
|
| 55 |
+
|
| 56 |
+
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
|
| 57 |
+
|
| 58 |
+
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
|
| 59 |
+
|
| 60 |
+
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
|
| 61 |
+
|
| 62 |
+
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
|
| 63 |
+
|
| 64 |
+
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
|
| 65 |
+
|
| 66 |
+
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
|
| 67 |
+
|
| 68 |
+
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
|
| 69 |
+
|
| 70 |
+
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
|
| 71 |
+
|
| 72 |
+
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
|
| 73 |
+
|
| 74 |
+
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
|
| 75 |
+
|
| 76 |
+
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
|
| 77 |
+
|
| 78 |
+
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
|
| 79 |
+
|
| 80 |
+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
|
| 81 |
+
|
| 82 |
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
|
| 83 |
+
|
| 84 |
+
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
|
| 85 |
+
|
| 86 |
+
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
|
| 87 |
+
|
| 88 |
+
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
|
| 89 |
+
|
| 90 |
+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
|
| 91 |
+
|
| 92 |
+
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
|
| 93 |
+
|
| 94 |
+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
|
| 95 |
+
|
| 96 |
+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
|
| 97 |
+
|
| 98 |
+
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
|
| 99 |
+
|
| 100 |
+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
|
| 101 |
+
|
| 102 |
+
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
|
| 103 |
+
|
| 104 |
+
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
|
| 105 |
+
|
| 106 |
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
|
| 107 |
+
|
| 108 |
+
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
|
| 109 |
+
|
| 110 |
+
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
|
| 111 |
+
|
| 112 |
+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
|
| 113 |
+
|
| 114 |
+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
|
| 115 |
+
|
| 116 |
+
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
|
| 117 |
+
|
| 118 |
+
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12",
|
| 119 |
+
|
| 120 |
+
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
|
| 121 |
+
|
| 122 |
+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
|
| 123 |
+
|
| 124 |
+
"Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.3 Mobile/14E277 Safari/603.1.30",
|
| 125 |
+
|
| 126 |
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
|
| 127 |
+
|
| 128 |
+
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)"
|
| 129 |
+
]
|
| 130 |
+
}
|