Spaces:
Sleeping
Sleeping
Commit
·
79d285f
0
Parent(s):
Initial commit
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .dockerignore +0 -0
- .github/workflows/ci.yml +34 -0
- .gitignore +162 -0
- .idx/dev.nix +55 -0
- Dockerfile +19 -0
- README.md +12 -0
- app.py +205 -0
- crawler.py +222 -0
- demo.py +38 -0
- docs/design.md +0 -0
- docs/requirements.md +0 -0
- main.py +40 -0
- requirements-dev.txt +0 -0
- requirements.txt +22 -0
- setup.py +18 -0
- src/introlix_api/app/__init__.py +0 -0
- src/introlix_api/app/algolia.py +82 -0
- src/introlix_api/app/appwrite.py +179 -0
- src/introlix_api/app/database.py +23 -0
- src/introlix_api/app/introlix_spider/introlix_spider/__init__.py +0 -0
- src/introlix_api/app/introlix_spider/introlix_spider/items.py +12 -0
- src/introlix_api/app/introlix_spider/introlix_spider/middlewares.py +103 -0
- src/introlix_api/app/introlix_spider/introlix_spider/pipelines.py +13 -0
- src/introlix_api/app/introlix_spider/introlix_spider/settings.py +100 -0
- src/introlix_api/app/introlix_spider/introlix_spider/spiders/__init__.py +4 -0
- src/introlix_api/app/introlix_spider/introlix_spider/spiders/generic.py +286 -0
- src/introlix_api/app/introlix_spider/scrapy.cfg +11 -0
- src/introlix_api/app/model.py +37 -0
- src/introlix_api/app/routes/__init__.py +0 -0
- src/introlix_api/app/routes/auth.py +109 -0
- src/introlix_api/app/routes/posts.py +208 -0
- src/introlix_api/app/routes/run_spider.py +23 -0
- src/introlix_api/app/routes/similarity.py +83 -0
- src/introlix_api/crawler/__init__.py +0 -0
- src/introlix_api/crawler/bot.py +390 -0
- src/introlix_api/engine/__init__.py +0 -0
- src/introlix_api/engine/api_data.py +101 -0
- src/introlix_api/engine/discussion.py +41 -0
- src/introlix_api/engine/graphql.py +69 -0
- src/introlix_api/engine/third_party_apis.py +108 -0
- src/introlix_api/engine/youtube.py +54 -0
- src/introlix_api/exception/__init__.py +34 -0
- src/introlix_api/logger/__init__.py +22 -0
- src/introlix_api/ml/__init__.py +0 -0
- src/introlix_api/ml/model.py +0 -0
- src/introlix_api/ml/recommendation.py +89 -0
- src/introlix_api/pipeline/__init__.py +0 -0
- src/introlix_api/pipeline/common_pipeline.py +0 -0
- src/introlix_api/pipeline/periodic_pipeline.py +0 -0
- src/introlix_api/utils/__init__.py +0 -0
.dockerignore
ADDED
|
File without changes
|
.github/workflows/ci.yml
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Sync to Hugging Face hub
|
| 2 |
+
on:
|
| 3 |
+
push:
|
| 4 |
+
branches: [main]
|
| 5 |
+
|
| 6 |
+
# To run this workflow manually from the Actions tab
|
| 7 |
+
workflow_dispatch:
|
| 8 |
+
|
| 9 |
+
jobs:
|
| 10 |
+
sync-to-hub:
|
| 11 |
+
runs-on: ubuntu-latest
|
| 12 |
+
steps:
|
| 13 |
+
- uses: actions/checkout@v3
|
| 14 |
+
with:
|
| 15 |
+
fetch-depth: 0
|
| 16 |
+
lfs: true
|
| 17 |
+
|
| 18 |
+
- name: Set up Git user
|
| 19 |
+
run: |
|
| 20 |
+
git config --global user.email "tubex998@gmail.com"
|
| 21 |
+
git config --global user.name "satyam998"
|
| 22 |
+
|
| 23 |
+
- name: Create a new branch
|
| 24 |
+
run: |
|
| 25 |
+
git checkout --orphan temp
|
| 26 |
+
git add -A
|
| 27 |
+
git commit -m "Initial commit"
|
| 28 |
+
git branch -D main
|
| 29 |
+
git branch -m main
|
| 30 |
+
|
| 31 |
+
- name: Force push to hub
|
| 32 |
+
env:
|
| 33 |
+
HF: ${{ secrets.HF_TOKEN }}
|
| 34 |
+
run: git push --force https://satyam998:$HF@huggingface.co/spaces/satyam998/introlix_api main
|
.gitignore
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py,cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# poetry
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 102 |
+
#poetry.lock
|
| 103 |
+
|
| 104 |
+
# pdm
|
| 105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 106 |
+
#pdm.lock
|
| 107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
| 108 |
+
# in version control.
|
| 109 |
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
| 110 |
+
.pdm.toml
|
| 111 |
+
.pdm-python
|
| 112 |
+
.pdm-build/
|
| 113 |
+
|
| 114 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 115 |
+
__pypackages__/
|
| 116 |
+
|
| 117 |
+
# Celery stuff
|
| 118 |
+
celerybeat-schedule
|
| 119 |
+
celerybeat.pid
|
| 120 |
+
|
| 121 |
+
# SageMath parsed files
|
| 122 |
+
*.sage.py
|
| 123 |
+
|
| 124 |
+
# Environments
|
| 125 |
+
.env
|
| 126 |
+
.venv
|
| 127 |
+
env/
|
| 128 |
+
venv/
|
| 129 |
+
ENV/
|
| 130 |
+
env.bak/
|
| 131 |
+
venv.bak/
|
| 132 |
+
|
| 133 |
+
# Spyder project settings
|
| 134 |
+
.spyderproject
|
| 135 |
+
.spyproject
|
| 136 |
+
|
| 137 |
+
# Rope project settings
|
| 138 |
+
.ropeproject
|
| 139 |
+
|
| 140 |
+
# mkdocs documentation
|
| 141 |
+
/site
|
| 142 |
+
|
| 143 |
+
# mypy
|
| 144 |
+
.mypy_cache/
|
| 145 |
+
.dmypy.json
|
| 146 |
+
dmypy.json
|
| 147 |
+
|
| 148 |
+
# Pyre type checker
|
| 149 |
+
.pyre/
|
| 150 |
+
|
| 151 |
+
# pytype static type analyzer
|
| 152 |
+
.pytype/
|
| 153 |
+
|
| 154 |
+
# Cython debug symbols
|
| 155 |
+
cython_debug/
|
| 156 |
+
|
| 157 |
+
# PyCharm
|
| 158 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 159 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 160 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 161 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 162 |
+
#.idea/
|
.idx/dev.nix
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# To learn more about how to use Nix to configure your environment
|
| 2 |
+
# see: https://developers.google.com/idx/guides/customize-idx-env
|
| 3 |
+
{ pkgs, ... }: {
|
| 4 |
+
# Which nixpkgs channel to use.
|
| 5 |
+
channel = "stable-23.11"; # or "unstable"
|
| 6 |
+
|
| 7 |
+
# Use https://search.nixos.org/packages to find packages
|
| 8 |
+
packages = [
|
| 9 |
+
# pkgs.go
|
| 10 |
+
pkgs.python311
|
| 11 |
+
pkgs.python311Packages.pip
|
| 12 |
+
# pkgs.nodejs_20
|
| 13 |
+
# pkgs.nodePackages.nodemon
|
| 14 |
+
];
|
| 15 |
+
|
| 16 |
+
# Sets environment variables in the workspace
|
| 17 |
+
env = {};
|
| 18 |
+
idx = {
|
| 19 |
+
# Search for the extensions you want on https://open-vsx.org/ and use "publisher.id"
|
| 20 |
+
extensions = [
|
| 21 |
+
# "vscodevim.vim"
|
| 22 |
+
];
|
| 23 |
+
|
| 24 |
+
# Enable previews
|
| 25 |
+
previews = {
|
| 26 |
+
enable = true;
|
| 27 |
+
previews = {
|
| 28 |
+
# web = {
|
| 29 |
+
# # Example: run "npm run dev" with PORT set to IDX's defined port for previews,
|
| 30 |
+
# # and show it in IDX's web preview panel
|
| 31 |
+
# command = ["npm" "run" "dev"];
|
| 32 |
+
# manager = "web";
|
| 33 |
+
# env = {
|
| 34 |
+
# # Environment variables to set for your server
|
| 35 |
+
# PORT = "$PORT";
|
| 36 |
+
# };
|
| 37 |
+
# };
|
| 38 |
+
};
|
| 39 |
+
};
|
| 40 |
+
|
| 41 |
+
# Workspace lifecycle hooks
|
| 42 |
+
workspace = {
|
| 43 |
+
# Runs when a workspace is first created
|
| 44 |
+
onCreate = {
|
| 45 |
+
# Example: install JS dependencies from NPM
|
| 46 |
+
# npm-install = "npm install";
|
| 47 |
+
};
|
| 48 |
+
# Runs when the workspace is (re)started
|
| 49 |
+
onStart = {
|
| 50 |
+
# Example: start a background task to watch and re-build backend code
|
| 51 |
+
# watch-backend = "npm run watch-backend";
|
| 52 |
+
};
|
| 53 |
+
};
|
| 54 |
+
};
|
| 55 |
+
}
|
Dockerfile
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10
|
| 2 |
+
|
| 3 |
+
RUN useradd -m -u 1000 user
|
| 4 |
+
|
| 5 |
+
WORKDIR /app
|
| 6 |
+
|
| 7 |
+
COPY --chown=user . /app
|
| 8 |
+
|
| 9 |
+
RUN pip install -r requirements.txt
|
| 10 |
+
|
| 11 |
+
RUN mkdir -p /app/logs
|
| 12 |
+
RUN chmod 777 /app/logs
|
| 13 |
+
|
| 14 |
+
# Copy the shell script into the container
|
| 15 |
+
COPY start.sh /app/start.sh
|
| 16 |
+
RUN chmod +x /app/start.sh
|
| 17 |
+
|
| 18 |
+
# Use the shell script to start both processes
|
| 19 |
+
CMD ["/bin/bash", "/app/start.sh"]
|
README.md
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Introlix API
|
| 3 |
+
emoji: 🔥
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
license: apache-2.0
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# Introlix API
|
| 12 |
+
<p>Introlix API offers a comprehensive suite of tools and APIs utilized in Introlix Feed. It is an advanced API that integrates multiple external APIs, RSS feed crawlers, and other data sources to provide a robust and efficient backend service.</p>
|
app.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, Query, HTTPException
|
| 2 |
+
from bson import ObjectId
|
| 3 |
+
import sys
|
| 4 |
+
import httpx
|
| 5 |
+
import os
|
| 6 |
+
import crawler
|
| 7 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
+
from starlette.responses import RedirectResponse
|
| 9 |
+
from introlix_api.app.routes import auth, posts, run_spider, similarity
|
| 10 |
+
from typing import List
|
| 11 |
+
from dotenv import load_dotenv, dotenv_values
|
| 12 |
+
|
| 13 |
+
from introlix_api.app.appwrite import databases, APPWRITE_DATABASE_ID, ID, APPWRITE_ACCOUNT_COLLECTION_ID, get_interests
|
| 14 |
+
from introlix_api.app.database import startup_db_client, shutdown_db_client
|
| 15 |
+
from introlix_api.ml.recommendation import Recommendation
|
| 16 |
+
from introlix_api.utils.tags import fetch_tags
|
| 17 |
+
|
| 18 |
+
from introlix_api.exception import CustomException
|
| 19 |
+
|
| 20 |
+
from contextlib import asynccontextmanager
|
| 21 |
+
|
| 22 |
+
from pydantic import BaseModel, Field
|
| 23 |
+
|
| 24 |
+
load_dotenv()
|
| 25 |
+
|
| 26 |
+
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
|
| 27 |
+
|
| 28 |
+
class FeedModel(BaseModel):
|
| 29 |
+
id: str = Field(..., alias="_id")
|
| 30 |
+
title: str
|
| 31 |
+
desc: str
|
| 32 |
+
url: str
|
| 33 |
+
publication_date: str
|
| 34 |
+
image_url: str
|
| 35 |
+
category: str
|
| 36 |
+
source: str
|
| 37 |
+
|
| 38 |
+
@asynccontextmanager
|
| 39 |
+
async def lifespan(app: FastAPI):
|
| 40 |
+
# Start the database connection
|
| 41 |
+
await startup_db_client(app)
|
| 42 |
+
yield
|
| 43 |
+
# Close the database connection
|
| 44 |
+
await shutdown_db_client(app)
|
| 45 |
+
|
| 46 |
+
app = FastAPI(lifespan=lifespan)
|
| 47 |
+
|
| 48 |
+
origins = [
|
| 49 |
+
"http://localhost:3000",
|
| 50 |
+
"http://192.168.1.64:3000",
|
| 51 |
+
"https://introlixfeed.vercel.app/",
|
| 52 |
+
"https://introlixfeed.vercel.com/"
|
| 53 |
+
# Add other allowed origins here if needed
|
| 54 |
+
]
|
| 55 |
+
|
| 56 |
+
app.add_middleware(
|
| 57 |
+
CORSMiddleware,
|
| 58 |
+
allow_origins=origins, # Specify allowed origins
|
| 59 |
+
allow_credentials=True,
|
| 60 |
+
allow_methods=["*"],
|
| 61 |
+
allow_headers=["*"],
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
@app.get("/", tags=["authentication"])
|
| 65 |
+
async def index():
|
| 66 |
+
return RedirectResponse(url='/docs')
|
| 67 |
+
|
| 68 |
+
@app.get("/feed_data", response_model=List[FeedModel])
|
| 69 |
+
async def get_feed_data(page: int = 1, limit: int = 20, user_id: str = Query(...), category=None):
|
| 70 |
+
try:
|
| 71 |
+
skip = (page - 1) * limit
|
| 72 |
+
|
| 73 |
+
response = get_interests()
|
| 74 |
+
user_interests = []
|
| 75 |
+
# getting only the interests not keywords
|
| 76 |
+
for interest in response:
|
| 77 |
+
user_interests.append(interest['interest'])
|
| 78 |
+
|
| 79 |
+
users = databases.list_documents(
|
| 80 |
+
database_id=APPWRITE_DATABASE_ID,
|
| 81 |
+
collection_id=APPWRITE_ACCOUNT_COLLECTION_ID
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
for doc in users['documents']:
|
| 85 |
+
if user_id == doc['$id']:
|
| 86 |
+
user_interests = doc['interests']
|
| 87 |
+
|
| 88 |
+
user_interests = [item.split(':')[0] for item in user_interests]
|
| 89 |
+
# response = await app.mongodb['feedData'].find({"category": {"$in": user_interests}}).skip(skip).limit(limit).to_list(limit)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
# Perform the aggregation
|
| 94 |
+
if category == None:
|
| 95 |
+
response = await app.mongodb['feedData'].find({"category": {"$in": user_interests}}).skip(skip).limit(limit).to_list(limit)
|
| 96 |
+
else:
|
| 97 |
+
response = await app.mongodb['feedData'].find({"category": category}).skip(skip).limit(limit).to_list(limit)
|
| 98 |
+
|
| 99 |
+
# random.shuffle(response)
|
| 100 |
+
|
| 101 |
+
# Filter out items that do not have a title
|
| 102 |
+
response = [item for item in response if item.get('title')]
|
| 103 |
+
response = [item for item in response if item.get('desc')]
|
| 104 |
+
|
| 105 |
+
article_titles = [item['title'] for item in response]
|
| 106 |
+
recommendation_system = Recommendation(user_interests, article_titles)
|
| 107 |
+
recommended_titles = recommendation_system.recommend()
|
| 108 |
+
|
| 109 |
+
response = [post for post in response if post['title'] in recommended_titles]
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
for item in response:
|
| 113 |
+
item['_id'] = str(item['_id'])
|
| 114 |
+
item['title'] = item.get('title') or ''
|
| 115 |
+
item['desc'] = item.get('desc') or ''
|
| 116 |
+
item['url'] = item.get('url') or ''
|
| 117 |
+
item['publication_date'] = item.get('publication_date') or ''
|
| 118 |
+
item['image_url'] = item.get('image_url') or ''
|
| 119 |
+
item['category'] = item.get('category') or ''
|
| 120 |
+
item['source'] = item.get('source') or ''
|
| 121 |
+
|
| 122 |
+
return response
|
| 123 |
+
except Exception as e:
|
| 124 |
+
raise CustomException(e, sys) from e
|
| 125 |
+
|
| 126 |
+
@app.get("/fetch_post", response_model=FeedModel)
|
| 127 |
+
async def get_feed_data(post_id: str = Query(...)):
|
| 128 |
+
try:
|
| 129 |
+
post_id = ObjectId(post_id)
|
| 130 |
+
response = await app.mongodb['feedData'].find_one({"_id": post_id})
|
| 131 |
+
|
| 132 |
+
if not response:
|
| 133 |
+
raise HTTPException(status_code=404, detail="Post not found")
|
| 134 |
+
|
| 135 |
+
# Convert _id to string
|
| 136 |
+
response["_id"] = str(response["_id"])
|
| 137 |
+
|
| 138 |
+
# Check for null values and set defaults if needed
|
| 139 |
+
response["desc"] = (response.get("desc") or "No Description")[:90]
|
| 140 |
+
response["publication_date"] = response.get("publication_date") or "Unknown Date"
|
| 141 |
+
response["image_url"] = response.get("image_url") or "No Image URL"
|
| 142 |
+
response["category"] = response.get("category") or "Uncategorized"
|
| 143 |
+
response["source"] = response.get("source") or "Unknown Source"
|
| 144 |
+
|
| 145 |
+
# for item in response:
|
| 146 |
+
# item['title'] = item.get('title') or ''
|
| 147 |
+
# item['desc'] = item.get('desc') or ''
|
| 148 |
+
# item['url'] = item.get('url') or ''
|
| 149 |
+
# item['publication_date'] = item.get('publication_date') or ''
|
| 150 |
+
# item['image_url'] = item.get('image_url') or ''
|
| 151 |
+
# item['category'] = item.get('category') or ''
|
| 152 |
+
# item['source'] = item.get('source') or ''
|
| 153 |
+
|
| 154 |
+
return response
|
| 155 |
+
except Exception as e:
|
| 156 |
+
raise CustomException(e, sys) from e
|
| 157 |
+
|
| 158 |
+
@app.get("/test_recommendation")
|
| 159 |
+
async def test_recommendation(
|
| 160 |
+
user_interests: list[str] = Query(..., description="Comma-separated list of user interests"),
|
| 161 |
+
articles: list[str] = Query(..., description="Comma-separated list of articles")
|
| 162 |
+
):
|
| 163 |
+
"""
|
| 164 |
+
Test endpoint for recommendations.
|
| 165 |
+
Takes user interests and articles as query parameters and returns recommended articles.
|
| 166 |
+
"""
|
| 167 |
+
|
| 168 |
+
# Create a recommendation instance
|
| 169 |
+
recommendation = Recommendation(user_interests, articles)
|
| 170 |
+
|
| 171 |
+
# Get the recommended articles
|
| 172 |
+
recommended_articles = recommendation.recommend()
|
| 173 |
+
|
| 174 |
+
return {
|
| 175 |
+
"user_interests": user_interests,
|
| 176 |
+
"recommended_articles": recommended_articles,
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
@app.get("/youtube/videos")
|
| 180 |
+
async def get_youtube_videos(query: str = None):
|
| 181 |
+
url = "https://www.googleapis.com/youtube/v3/search"
|
| 182 |
+
params = {
|
| 183 |
+
"key": YOUTUBE_API_KEY,
|
| 184 |
+
"part": "snippet",
|
| 185 |
+
"q": query or "trending",
|
| 186 |
+
"type": "video",
|
| 187 |
+
"maxResults": 10,
|
| 188 |
+
"order": "viewCount" # You can change this to 'date' for recent uploads
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
async with httpx.AsyncClient() as client:
|
| 192 |
+
response = await client.get(url, params=params)
|
| 193 |
+
response.raise_for_status() # Raise an error for bad responses
|
| 194 |
+
return response.json()
|
| 195 |
+
|
| 196 |
+
@app.get("/tags")
|
| 197 |
+
async def get_tags():
|
| 198 |
+
tags = fetch_tags()
|
| 199 |
+
return tags
|
| 200 |
+
|
| 201 |
+
app.include_router(auth.router, prefix="/auth")
|
| 202 |
+
app.include_router(run_spider.router, prefix="/spider")
|
| 203 |
+
app.include_router(similarity.router, prefix="/feed")
|
| 204 |
+
app.include_router(crawler.router)
|
| 205 |
+
app.include_router(posts.router)
|
crawler.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import sys
|
| 3 |
+
import time
|
| 4 |
+
from urllib.parse import urlparse
|
| 5 |
+
from fastapi import APIRouter, HTTPException, Query
|
| 6 |
+
from introlix_api.crawler.bot import IntrolixBot, BotArgs
|
| 7 |
+
from introlix_api.exception import CustomException
|
| 8 |
+
from introlix_api.logger import logger
|
| 9 |
+
from introlix_api.utils.root_sites import root_sites
|
| 10 |
+
from introlix_api.app.database import search_data, db
|
| 11 |
+
from introlix_api.app.appwrite import fetch_root_sites, fetch_saved_urls, save_urls
|
| 12 |
+
from pymongo import ASCENDING
|
| 13 |
+
from pymongo.errors import DuplicateKeyError
|
| 14 |
+
|
| 15 |
+
router = APIRouter()
|
| 16 |
+
|
| 17 |
+
BATCH_SIZE = 10
|
| 18 |
+
urls_batch = []
|
| 19 |
+
storage_threshold = 500 * 1024 * 1024
|
| 20 |
+
delete_batch = 1000
|
| 21 |
+
|
| 22 |
+
def filter_urls(url: str) -> bool:
|
| 23 |
+
"""
|
| 24 |
+
A function to filter non article urls from the scraped urls
|
| 25 |
+
Args:
|
| 26 |
+
url (list): url
|
| 27 |
+
Returns:
|
| 28 |
+
bool: True if the url is article url else False
|
| 29 |
+
"""
|
| 30 |
+
parsed_url = urlparse(url)
|
| 31 |
+
|
| 32 |
+
if parsed_url.path in ('', '/'):
|
| 33 |
+
return False
|
| 34 |
+
|
| 35 |
+
non_article_keywords = [
|
| 36 |
+
"/product", "/products", "/home", "/item", "/items", "/category", "/categories",
|
| 37 |
+
"/login", "/signin", "/logout", "/signup", "/register", "/account", "/user",
|
| 38 |
+
"/profile", "/dashboard", "/settings", "/preferences", "/order", "/orders",
|
| 39 |
+
"/cart", "/checkout", "/payment", "/subscribe", "/subscription",
|
| 40 |
+
"/contact", "/support", "/help", "/faq", "/about", "/privacy", "/terms",
|
| 41 |
+
"/policy", "/conditions", "/legal", "/service", "/services", "/guide",
|
| 42 |
+
"/how-to", "/pricing", "/price", "fees", "/plans", "/features", "/partners",
|
| 43 |
+
"/team", "/careers", "/jobs", "/join", "/apply", "/training", "/demo",
|
| 44 |
+
"/trial", "/download", "/install", "/app", "/apps", "/software", "/portal",
|
| 45 |
+
"/index", "/main", "/video", "/videos", "/photo", "/photos",
|
| 46 |
+
"/image", "/images", "/gallery", "/portfolio", "/showcase", "/testimonials",
|
| 47 |
+
"/reviews", "/search", "/find", "/browse", "/list", "/tags", "/explore",
|
| 48 |
+
"/new", "/trending", "/latest", "/promotions", "/offers", "/deals", "/discount",
|
| 49 |
+
"/coupon", "/coupons", "/gift", "/store", "/stores", "/locator", "/locations",
|
| 50 |
+
"/branches", "/events", "/webinar", "/calendar", "/schedule",
|
| 51 |
+
"/class", "/classes", "/lesson", "/lessons", "/training", "/activity",
|
| 52 |
+
"/activities", "/workshop", "/exhibit", "/performance", "/map", "/directions",
|
| 53 |
+
"/weather", "/traffic", "/rates", "/auction", "/bid", "/tender", "/investment",
|
| 54 |
+
"/loan", "/mortgage", "/property", "/real-estate", "/construction", "/project",
|
| 55 |
+
"/client", "/clients", "/partner", "/sponsor", "/media", "/press", "/releases",
|
| 56 |
+
"/announcements", "/newsroom", "/resources", "courses", "collections", "/u/", "/members/",
|
| 57 |
+
"/@", "/shop", "/wiki", "/author", "/dynamic", "/image", "/submit" # TODO: need to add more
|
| 58 |
+
]
|
| 59 |
+
|
| 60 |
+
article_keywords = [
|
| 61 |
+
"/blog/", "post", "article", "insights", "guide", "tutorial",
|
| 62 |
+
"how-to", "what", "how", "introduction", "/news/"
|
| 63 |
+
]
|
| 64 |
+
|
| 65 |
+
article_pattern = [
|
| 66 |
+
r'/(/blog/|article|articles|post|posts|blogs|news|)/\d{4}/\d{2}/+[a-z0-9-]+/?',
|
| 67 |
+
r'/(/blog/|article|articles|post|posts|blogs|news|)/[a-z0-9-]+/[a-z0-9-]+',
|
| 68 |
+
r'(?<!\/\/www)(/blog/|article|articles|post|posts|blogs|news|)/[a-z0-9-]+',
|
| 69 |
+
r'^(?!.*\/category\/).*\/[a-z0-9-]+\/[a-z0-9-]+(-[a-z0-9-]+)+$',
|
| 70 |
+
r'/[^/]+/\d{4}/\d{2}/\d{2}/+[a-z0-9]+/?',
|
| 71 |
+
r'/[^/]+/\d{4}/\d{2}/+[a-z0-9]+/?'
|
| 72 |
+
r'/[a-z0-9-]+/\d{4}/\d{2}/+/?',
|
| 73 |
+
r'/[a-z0-9-]+/\d{4}/\d{2}/\d{2}/+/?'
|
| 74 |
+
]
|
| 75 |
+
|
| 76 |
+
for pattern in article_pattern:
|
| 77 |
+
if re.search(pattern, url):
|
| 78 |
+
if not any(keyword in url for keyword in non_article_keywords):
|
| 79 |
+
return True
|
| 80 |
+
|
| 81 |
+
if any (keyword in url for keyword in article_keywords):
|
| 82 |
+
return True
|
| 83 |
+
|
| 84 |
+
last_segment = parsed_url.path.strip('/').split('/')[-1]
|
| 85 |
+
if '-' in last_segment and len(last_segment.split('-')) > 2:
|
| 86 |
+
return True
|
| 87 |
+
|
| 88 |
+
return False
|
| 89 |
+
|
| 90 |
+
def save_to_db(data):
|
| 91 |
+
global urls_batch
|
| 92 |
+
try:
|
| 93 |
+
# Check database storage size and delete old documents if needed
|
| 94 |
+
stats = db.command("collStats", "search_data")
|
| 95 |
+
storage_size = stats['size']
|
| 96 |
+
|
| 97 |
+
if storage_size >= storage_threshold:
|
| 98 |
+
oldest_docs = search_data.find().sort("createdAt", ASCENDING).limit(delete_batch)
|
| 99 |
+
oldest_ids = [doc['_id'] for doc in oldest_docs]
|
| 100 |
+
search_data.delete_many({"_id": {"$in": oldest_ids}})
|
| 101 |
+
|
| 102 |
+
# Prepare list of URLs to check in the database
|
| 103 |
+
urls = [d["url"] for d in data if filter_urls(d["url"])]
|
| 104 |
+
|
| 105 |
+
# Retrieve existing URLs from the database to filter out duplicates
|
| 106 |
+
existing_urls = set(search_data.find({"url": {"$in": urls}}).distinct("url"))
|
| 107 |
+
|
| 108 |
+
# Filter out documents with URLs that already exist in the database
|
| 109 |
+
unique_data = [
|
| 110 |
+
{"url": d["url"], "content": d["content"], "type": "article"}
|
| 111 |
+
for d in data
|
| 112 |
+
if d["url"] not in existing_urls and d.get("content") is not None
|
| 113 |
+
]
|
| 114 |
+
|
| 115 |
+
# Insert only unique documents
|
| 116 |
+
if unique_data:
|
| 117 |
+
try:
|
| 118 |
+
search_data.insert_many(unique_data)
|
| 119 |
+
except DuplicateKeyError as e:
|
| 120 |
+
logger.info("Duplicate URL detected during insertion. Skipping duplicate entries.")
|
| 121 |
+
|
| 122 |
+
# Process URLs in `urls_batch` if it has URLs
|
| 123 |
+
if urls_batch:
|
| 124 |
+
try:
|
| 125 |
+
save_urls(urls_batch)
|
| 126 |
+
except Exception as e:
|
| 127 |
+
logger.error(f"Error saving URLs to Appwrite: {str(e)}")
|
| 128 |
+
urls_batch.clear()
|
| 129 |
+
|
| 130 |
+
except Exception as e:
|
| 131 |
+
raise CustomException(e, sys) from e
|
| 132 |
+
|
| 133 |
+
def extract_urls(batch_size=BATCH_SIZE):
|
| 134 |
+
# Fetch documents with required fields only, reducing memory footprint per document
|
| 135 |
+
documents = search_data.find({}, {"content.links": 1})
|
| 136 |
+
|
| 137 |
+
# Initialize a list to store URLs in batches
|
| 138 |
+
batch_urls = []
|
| 139 |
+
|
| 140 |
+
for doc in documents:
|
| 141 |
+
# Extract URLs only if 'content' and 'links' exist
|
| 142 |
+
links = doc.get("content", {}).get("links")
|
| 143 |
+
if links:
|
| 144 |
+
# Use a generator to iterate over links directly
|
| 145 |
+
for url in links:
|
| 146 |
+
batch_urls.append(url)
|
| 147 |
+
# Yield URLs in batches to control memory usage
|
| 148 |
+
if len(batch_urls) >= batch_size:
|
| 149 |
+
yield batch_urls
|
| 150 |
+
batch_urls = [] # Clear the batch after yielding
|
| 151 |
+
|
| 152 |
+
# Yield any remaining URLs
|
| 153 |
+
if batch_urls:
|
| 154 |
+
yield batch_urls
|
| 155 |
+
|
| 156 |
+
def crawler(urls_batch):
|
| 157 |
+
try:
|
| 158 |
+
bot = IntrolixBot(urls=urls_batch, args=BotArgs)
|
| 159 |
+
|
| 160 |
+
# Process each batch of scraped data
|
| 161 |
+
for data_batch in bot.scrape_parallel(batch_size=BATCH_SIZE):
|
| 162 |
+
save_to_db(data_batch)
|
| 163 |
+
|
| 164 |
+
except Exception as e:
|
| 165 |
+
raise CustomException(e, sys) from e
|
| 166 |
+
|
| 167 |
+
def run_crawler_continuously():
    """Run the crawl loop forever in (nominally) 10-minute sessions.

    Each session refreshes the URL frontier from Appwrite, crawls it, and
    then extends the shared module-level ``urls_batch`` with links extracted
    from documents already stored in MongoDB.
    """
    global urls_batch
    try:
        while True:
            start_time = time.time()  # Record the start of this session

            while (time.time() - start_time) < 600:  # Run for 10 minutes (600 seconds)
                try:
                    root_urls = fetch_root_sites()
                    saved_urls = fetch_saved_urls()
                except Exception as e:
                    # Appwrite being unreachable is non-fatal: fall back below.
                    logger.info("Error fetching URLs from Appwrite: %s", str(e))
                    root_urls = []
                    saved_urls = []

                if root_urls and saved_urls:
                    # Merge and de-duplicate the two frontiers.
                    urls = root_urls + saved_urls
                    urls = list(set(urls))
                else:
                    # NOTE(review): if only ONE of the two fetches succeeds, its
                    # result is discarded here — confirm that is intentional.
                    urls = root_sites() + urls_batch

                if urls:
                    logger.info(f"Starting crawler with {len(urls)} root URLs")
                    # URLs are crawled in reverse order — reason not documented.
                    crawler(urls[::-1])

                # Extract previously discovered links and queue them for the
                # next iteration's fallback frontier.
                for extracted_urls in extract_urls(batch_size=BATCH_SIZE):
                    urls_batch.extend(list(set(extracted_urls)))
                    time.sleep(1)

                time.sleep(1)

            # After 10 minutes, the outer while loop restarts without pause.
            logger.info("Restarting the crawler for another 10-minute session.")
    except Exception as e:
        raise CustomException(e, sys) from e
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
@router.post('/crawler')
def run_crawler():
    """HTTP endpoint that starts the continuous crawl loop.

    Note: run_crawler_continuously() loops forever, so this request blocks
    until an error occurs.
    """
    try:
        run_crawler_continuously()
    except Exception as exc:
        # Surface any crawler failure as a client-visible 400 response.
        raise HTTPException(status_code=400, detail=str(exc))
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
if __name__ == "__main__":
    # NOTE(review): run_crawler_continuously() already loops forever and
    # manages its own 10-minute sessions, so these outer timing loops only
    # take effect if it ever raises/returns — confirm they are needed.
    while True:
        start_time = time.time()
        while (time.time() - start_time) < 600:
            run_crawler_continuously()
            # # urls = extract_urls()
            # # print(urls)
|
demo.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#import csv
|
| 2 |
+
# from introlix_api.app.database import feed_data
|
| 3 |
+
|
| 4 |
+
# data = feed_data.find({}, {"_id": 0, "title": 1}) # Exclude _id, include only title
|
| 5 |
+
|
| 6 |
+
# # Specify the CSV file to write to
|
| 7 |
+
# csv_file = 'feed_data_titles.csv'
|
| 8 |
+
|
| 9 |
+
# # Write data to a CSV file
|
| 10 |
+
# with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
|
| 11 |
+
# writer = csv.writer(file)
|
| 12 |
+
|
| 13 |
+
# # Write header (just the title field)
|
| 14 |
+
# writer.writerow(["title"])
|
| 15 |
+
|
| 16 |
+
# # Write each document's title to the CSV
|
| 17 |
+
# for document in data:
|
| 18 |
+
# writer.writerow([document.get("title")])
|
| 19 |
+
|
| 20 |
+
# print(f"Title data successfully saved to {csv_file}")
|
| 21 |
+
# from introlix_api.crawler.bot import IntrolixBot, BotArgs
|
| 22 |
+
# import time
|
| 23 |
+
|
| 24 |
+
# start = time.time()
|
| 25 |
+
# inbot = IntrolixBot(args=BotArgs, urls=["https://www.wikipedia.org/", "https://medium.com/", "https://www.bbc.com/"])
|
| 26 |
+
|
| 27 |
+
# print(inbot.crawl(batch_size=1048))
|
| 28 |
+
# # end = time.time()
|
| 29 |
+
# print(f"Time taken: {end - start}")
|
| 30 |
+
|
| 31 |
+
# from introlix_api.app.appwrite import fetch_root_sites
|
| 32 |
+
|
| 33 |
+
# print(len(set(fetch_root_sites())))
|
| 34 |
+
# Access the scraped data
|
| 35 |
+
# for index, page_data in enumerate(inbot.data):
|
| 36 |
+
# print(f"Page {index + 1}:")
|
| 37 |
+
# print(page_data)
|
| 38 |
+
# print('-' * 40)
|
docs/design.md
ADDED
|
File without changes
|
docs/requirements.md
ADDED
|
File without changes
|
main.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import subprocess
|
| 2 |
+
|
| 3 |
+
def run_app():
    """Run the Scrapy 'generic' spider as a subprocess.

    Executes ``scrapy crawl generic`` from inside the spider project
    directory, echoes the child's stdout/stderr, and returns the
    ``CompletedProcess`` so callers can inspect ``returncode`` (previously
    the result was discarded, hiding crawl failures).

    Returns:
        subprocess.CompletedProcess: result of the ``scrapy crawl`` run.
    """
    command = ["scrapy", "crawl", "generic"]
    working_directory = "src/introlix_api/app/introlix_spider"

    # argv list + shell=False (the default) avoids shell-injection issues.
    result = subprocess.run(command, cwd=working_directory, capture_output=True, text=True)

    print("Output:", result.stdout)
    print("Error:", result.stderr)
    return result

if __name__ == "__main__":
    # Running the spider once when invoked as a script.
    run_app()
|
| 15 |
+
|
| 16 |
+
# def run_get_urls_from_page_parallel(self, urls: list, max_workers: int=10) -> list:
|
| 17 |
+
# """
|
| 18 |
+
# Running get_urls_from_page function in parallel for many runs.
|
| 19 |
+
|
| 20 |
+
# Args:
|
| 21 |
+
# urls (list): list of urls
|
| 22 |
+
# max_workers (int, optional): number of workers. Defaults to 10.
|
| 23 |
+
# Returns:
|
| 24 |
+
# list: list of fetched urls
|
| 25 |
+
# """
|
| 26 |
+
# fetched_urls = []
|
| 27 |
+
|
| 28 |
+
# with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
| 29 |
+
# futures = {executor.submit(self.get_urls_from_page, url): url for url in urls}
|
| 30 |
+
|
| 31 |
+
# for future in concurrent.futures.as_completed(futures):
|
| 32 |
+
# url = futures[future]
|
| 33 |
+
|
| 34 |
+
# try:
|
| 35 |
+
# result = future.result()
|
| 36 |
+
# fetched_urls.append(result)
|
| 37 |
+
# except Exception as e:
|
| 38 |
+
# raise CustomException(e, sys) from e
|
| 39 |
+
|
| 40 |
+
# return list(set(list(url for sublist in fetched_urls if sublist is not None for url in sublist)))
|
requirements-dev.txt
ADDED
|
File without changes
|
requirements.txt
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy
|
| 2 |
+
pandas
|
| 3 |
+
scrapy
|
| 4 |
+
fastapi
|
| 5 |
+
uvicorn
|
| 6 |
+
Jinja2
|
| 7 |
+
appwrite
|
| 8 |
+
python-dotenv
|
| 9 |
+
pymongo
|
| 10 |
+
aiohttp
|
| 11 |
+
motor
|
| 12 |
+
httpx
|
| 13 |
+
torch
|
| 14 |
+
scikit-learn
|
| 15 |
+
beautifulsoup4
|
| 16 |
+
sentence-transformers
|
| 17 |
+
nltk
|
| 18 |
+
algoliasearch
|
| 19 |
+
apscheduler
|
| 20 |
+
cachetools
|
| 21 |
+
|
| 22 |
+
-e .
|
setup.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from setuptools import setup
|
| 2 |
+
import setuptools
|
| 3 |
+
|
| 4 |
+
# Long description for the package index comes straight from the README.
with open("README.md", "r", encoding="utf-8") as f:
    long_description = f.read()

setup(
    name="introlix_api",
    version="0.0.1",
    author="Satyam Mishra",
    author_email="tubex998@gmail.com",
    description="Introlix API offers a comprehensive suite of tools and APIs utilized in Introlix Feed.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    # src-layout: the importable package lives under src/.
    package_dir={"": "src"},
    packages=setuptools.find_packages(where="src"),
    python_requires=">=3.10",
)
|
src/introlix_api/app/__init__.py
ADDED
|
File without changes
|
src/introlix_api/app/algolia.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import asyncio
|
| 4 |
+
from bson import ObjectId
|
| 5 |
+
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
| 6 |
+
from introlix_api.app.database import search_data
|
| 7 |
+
from algoliasearch.search.client import SearchClientSync
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
|
| 10 |
+
load_dotenv()
|
| 11 |
+
|
| 12 |
+
ALGOLIA_USER = os.getenv("ALGOLIA_USER")
|
| 13 |
+
ALGOLIA_KEY = os.getenv("ALGOLIA_KEY")
|
| 14 |
+
INDEX_NAME = "introlix_data"
|
| 15 |
+
|
| 16 |
+
# Initialize the Algolia client
|
| 17 |
+
_client = SearchClientSync(ALGOLIA_USER, ALGOLIA_KEY)
|
| 18 |
+
|
| 19 |
+
def convert_object_ids(doc):
    """Recursively replace every ObjectId value in *doc* with its string form.

    Mutates nested dicts in place (including dicts found inside lists) and
    returns the same top-level mapping for convenience.
    """
    for field, value in doc.items():
        if isinstance(value, ObjectId):
            doc[field] = str(value)
        elif isinstance(value, dict):
            # Descend into nested documents.
            convert_object_ids(value)
        elif isinstance(value, list):
            # Only dict elements inside lists are inspected for ObjectIds.
            for element in value:
                if isinstance(element, dict):
                    convert_object_ids(element)
    return doc
|
| 31 |
+
|
| 32 |
+
async def upload_data():
    """Upload MongoDB search documents to Algolia in batches of 1,000.

    Sets ``objectID`` from the Mongo ``_id`` so re-uploads update records in
    place instead of duplicating them; documents larger than Algolia's 10 KB
    record limit are skipped.
    """
    batch_size = 1000
    batch = []

    cursor = search_data.find()
    for doc in cursor:
        # Convert any ObjectId fields to strings for JSON compatibility.
        doc = convert_object_ids(doc)

        # Set `objectID` to ensure uniqueness and prevent duplicates.
        doc['objectID'] = str(doc['_id'])  # Using MongoDB _id as `objectID`

        # Serialize once to measure the record size in bytes.
        doc_json = json.dumps(doc)
        doc_size = len(doc_json.encode('utf-8'))

        # Only add to batch if size is within Algolia's 10 KB record limit.
        if doc_size <= 10000:
            batch.append(doc)

        # Flush a full batch to Algolia.
        if len(batch) >= batch_size:
            _client.save_objects(index_name=INDEX_NAME, objects=batch)
            batch.clear()  # Clear the batch after sending

        # Send any remaining documents.
        if batch:
            _client.save_objects(index_name=INDEX_NAME, objects=batch)

    print("Uploaded data to Algolia.")
|
| 63 |
+
|
| 64 |
+
async def main():
    """Upload once immediately, then re-upload on a fixed 4-hour schedule."""
    # Run the upload function immediately.
    await upload_data()

    scheduler = AsyncIOScheduler()
    # Schedule `upload_data` to run every 4 hours.
    scheduler.add_job(upload_data, 'interval', hours=4)
    scheduler.start()

    print("Scheduler started. Uploading data to Algolia every 4 hours.")

    # Block forever so the scheduled jobs keep running; shut the scheduler
    # down cleanly on interrupt.
    try:
        await asyncio.Event().wait()
    except (KeyboardInterrupt, SystemExit):
        scheduler.shutdown()
|
| 80 |
+
|
| 81 |
+
if __name__ == "__main__":
    # Entry point: run the initial upload, then keep the scheduler alive.
    asyncio.run(main())
|
src/introlix_api/app/appwrite.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
from appwrite.client import Client
|
| 4 |
+
from appwrite.query import Query
|
| 5 |
+
from appwrite.services.databases import Databases
|
| 6 |
+
from appwrite.id import ID
|
| 7 |
+
from dotenv import load_dotenv, dotenv_values
|
| 8 |
+
|
| 9 |
+
from introlix_api.logger import logger
|
| 10 |
+
from introlix_api.exception import CustomException
|
| 11 |
+
from introlix_api.utils.common import is_valid_url, sanitize_url
|
| 12 |
+
|
| 13 |
+
from pydantic import HttpUrl
|
| 14 |
+
|
| 15 |
+
load_dotenv()
|
| 16 |
+
|
| 17 |
+
# Appwrite connection settings, all supplied via environment variables
# (loaded above with load_dotenv()).
APPWRITE_PROJECT_ID = os.getenv("APPWRITE_PROJECT_ID")
APPWRITE_API_KEY = os.getenv("APPWRITE_API_KEY")
APPWRITE_DATABASE_ID = os.getenv("APPWRITE_DATABASE_ID")
APPWRITE_ROOTSITES_COLLECTION_ID = os.getenv("APPWRITE_ROOTSITES_COLLECTION_ID")
APPWRITE_SAVED_URLS_COLLECTION_ID = os.getenv("APPWRITE_SAVED_URLS_COLLECTION_ID")
APPWRITE_INTERESTS_TO_PICK_COLLECTION_ID = os.getenv("APPWRITE_INTERESTS_TO_PICK_COLLECTION_ID")
APPWRITE_ACCOUNT_COLLECTION_ID = os.getenv("APPWRITE_ACCOUNT_COLLECTION_ID")

# Module-level Appwrite client shared by every helper in this module.
client = Client()
client.set_endpoint('https://cloud.appwrite.io/v1')
client.set_project(APPWRITE_PROJECT_ID)
client.set_key(APPWRITE_API_KEY)

databases = Databases(client)
|
| 31 |
+
|
| 32 |
+
# models for database
class RootSitesModel:
    # NOTE(review): this is a plain class, not a pydantic BaseModel, so the
    # HttpUrl annotation below is never validated — confirm whether it was
    # meant to inherit from pydantic.BaseModel.
    url: HttpUrl
|
| 35 |
+
|
| 36 |
+
# fetching the data from appwrite
|
| 37 |
+
def fetch_root_sites():
    """Return the URL of every document in the Appwrite root-sites collection.

    Pages through the collection 100 documents at a time; a page shorter
    than the limit marks the end of the collection.
    """
    try:
        logger.info("Fetching all of the root sites...")
        page_size = 100
        cursor = 0
        urls = []

        while True:
            page = databases.list_documents(
                database_id=APPWRITE_DATABASE_ID,
                collection_id=APPWRITE_ROOTSITES_COLLECTION_ID,
                queries=[Query.limit(page_size), Query.offset(cursor)],
            )

            urls.extend(entry['url'] for entry in page['documents'])

            # A short page means we have read everything.
            if len(page['documents']) < page_size:
                return urls

            cursor += page_size

    except Exception as e:
        raise CustomException(e, sys) from e
|
| 65 |
+
|
| 66 |
+
def fetch_saved_urls():
    """
    Function to fetch the previously saved crawl URLs from Appwrite.

    Returns only the most recent 4,000 entries to bound the crawl frontier.
    """
    try:
        logger.info("Fetching all of the saved urls...")
        limit = 100
        offset = 0

        # NOTE: named root_sites only for symmetry with fetch_root_sites();
        # it actually accumulates saved URLs.
        root_sites = []

        while True:
            # Fetch one page (100 documents) of saved URLs.
            response = databases.list_documents(database_id=APPWRITE_DATABASE_ID,collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID, queries=[Query.limit(limit), Query.offset(offset)])

            for root_site in response['documents']:
                root_sites.append(root_site['url'])

            # A page shorter than the limit means the collection is exhausted.
            if len(response['documents']) < limit:
                break

            offset += limit

        # Cap the result at the 4,000 most recently listed URLs.
        return root_sites[-4000:]

    except Exception as e:
        raise CustomException(e, sys) from e
|
| 94 |
+
|
| 95 |
+
def get_interests():
    """Fetch the pickable interests (name + keywords) from Appwrite.

    Returns a list of ``{"interest": ..., "keywords": ...}`` dicts, reading
    at most the first 100 documents of the collection.
    """
    try:
        response = databases.list_documents(
            database_id=APPWRITE_DATABASE_ID,
            collection_id=APPWRITE_INTERESTS_TO_PICK_COLLECTION_ID,
            queries=[Query.limit(100), Query.offset(0)],
        )

        interests = []
        for document in response['documents']:
            interests.append({
                "interest": document['interest'],
                "keywords": document['keywords'],
            })

        return interests
    except Exception as e:
        raise CustomException(e, sys) from e
|
| 107 |
+
|
| 108 |
+
def save_urls(urls):
    """
    Function to save the URLs in Appwrite. Handles large collections efficiently.

    Dedupes against URLs already stored; if the collection has grown past
    20,000 documents it is emptied first, then the new URLs are written.
    """
    try:
        limit = 10
        offset = 0
        existing_urls = set()  # Set to store unique URLs

        # Check the total number of documents in the collection (only the
        # 'total' field of this response is used).
        total_count_response = databases.list_documents(
            database_id=APPWRITE_DATABASE_ID,
            collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID,
            queries=[Query.limit(1)]
        )
        total_count = total_count_response['total']

        # Delete all documents if the count exceeds 20,000.
        if total_count > 20000:
            logger.info("URL count exceeded 20,000. Deleting all documents in the collection.")
            while True:
                # Repeatedly fetch the first page and delete it until empty.
                response = databases.list_documents(
                    database_id=APPWRITE_DATABASE_ID,
                    collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID,
                    queries=[Query.limit(limit)]
                )

                if not response['documents']:
                    break  # All documents have been deleted

                for doc in response['documents']:
                    databases.delete_document(
                        database_id=APPWRITE_DATABASE_ID,
                        collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID,
                        document_id=doc['$id']
                    )

        # Fetch and process all documents in chunks to populate existing_urls set.
        offset = 0  # Reset offset after deletion
        while True:
            # Fetch a chunk of documents from the database.
            response = databases.list_documents(
                database_id=APPWRITE_DATABASE_ID,
                collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID,
                queries=[Query.limit(limit), Query.offset(offset)]
            )

            # Add the fetched URLs to the set.
            for doc in response['documents']:
                existing_urls.add(doc['url'])

            # Check if we have fetched all documents.
            if len(response['documents']) < limit:
                break  # No more documents to fetch, exit the loop

            # Move to the next batch.
            offset += limit

        # Save only unique URLs that are not already in the set.
        for url in urls:
            if url not in existing_urls:
                if is_valid_url(url):
                    # Sanitize the URL before storing (see utils.common).
                    sanitized_url = sanitize_url(url)
                    databases.create_document(
                        database_id=APPWRITE_DATABASE_ID,
                        collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID,
                        document_id=ID.unique(),
                        data={'url': sanitized_url}
                    )
    except Exception as e:
        raise CustomException(e, sys) from e
|
| 179 |
+
|
src/introlix_api/app/database.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pymongo import MongoClient
|
| 3 |
+
from motor.motor_asyncio import AsyncIOMotorClient
|
| 4 |
+
|
| 5 |
+
# MongoDB connection string supplied via the environment.
MONGODB_CLIENT_ID = os.getenv("MONGODB_CLIENT_ID")

# Synchronous client used by the batch/crawler code paths.
client = MongoClient(MONGODB_CLIENT_ID)

db = client.IntrolixDb

# Collection handles shared across the application.
feed_data = db.feedData
search_data = db.search_data
votes = db.votes
|
| 14 |
+
|
| 15 |
+
async def startup_db_client(app):
    """Attach an async Motor client and database handle to the app on startup."""
    app.mongodb_client = AsyncIOMotorClient(MONGODB_CLIENT_ID)
    app.mongodb = app.mongodb_client.get_database("IntrolixDb")
    print("MongoDB connected.")
|
| 19 |
+
|
| 20 |
+
async def shutdown_db_client(app):
    """Close the async Motor client attached by startup_db_client()."""
    app.mongodb_client.close()
    print("Database disconnected.")
|
| 23 |
+
|
src/introlix_api/app/introlix_spider/introlix_spider/__init__.py
ADDED
|
File without changes
|
src/introlix_api/app/introlix_spider/introlix_spider/items.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Define here the models for your scraped items
|
| 2 |
+
#
|
| 3 |
+
# See documentation in:
|
| 4 |
+
# https://docs.scrapy.org/en/latest/topics/items.html
|
| 5 |
+
|
| 6 |
+
import scrapy
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class IntrolixSpiderItem(scrapy.Item):
    """Placeholder Scrapy item; no fields are defined yet."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
|
src/introlix_api/app/introlix_spider/introlix_spider/middlewares.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Define here the models for your spider middleware
|
| 2 |
+
#
|
| 3 |
+
# See documentation in:
|
| 4 |
+
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
| 5 |
+
|
| 6 |
+
from scrapy import signals
|
| 7 |
+
|
| 8 |
+
# useful for handling different item types with a single interface
|
| 9 |
+
from itemadapter import is_item, ItemAdapter
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class IntrolixSpiderSpiderMiddleware:
    """Scrapy-generated spider middleware skeleton; every hook is a no-op."""
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider; works like
        # process_spider_output() except there is no associated response.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class IntrolixSpiderDownloaderMiddleware:
    """Scrapy-generated downloader middleware skeleton; every hook is a no-op."""
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
|
src/introlix_api/app/introlix_spider/introlix_spider/pipelines.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Define your item pipelines here
|
| 2 |
+
#
|
| 3 |
+
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
| 4 |
+
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# useful for handling different item types with a single interface
|
| 8 |
+
from itemadapter import ItemAdapter
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class IntrolixSpiderPipeline:
    """Default pass-through pipeline; items are returned unmodified."""
    def process_item(self, item, spider):
        return item
|
src/introlix_api/app/introlix_spider/introlix_spider/settings.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Scrapy settings for introlix_spider project.
#
# Only commonly used settings are set here; full reference:
# https://docs.scrapy.org/en/latest/topics/settings.html

BOT_NAME = "introlix_spider"

SPIDER_MODULES = ["introlix_spider.spiders"]
NEWSPIDER_MODULE = "introlix_spider.spiders"

# Obey robots.txt rules.
ROBOTSTXT_OBEY = True

# Set settings whose default value is deprecated to a future-proof value.
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

# Increase the number of concurrent requests (Scrapy default is 16).
CONCURRENT_REQUESTS = 32

# No delay between requests to the same site.
DOWNLOAD_DELAY = 0
|
src/introlix_api/app/introlix_spider/introlix_spider/spiders/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This package will contain the spiders of your Scrapy project
|
| 2 |
+
#
|
| 3 |
+
# Please refer to the documentation for information on how to create and manage
|
| 4 |
+
# your spiders.
|
src/introlix_api/app/introlix_spider/introlix_spider/spiders/generic.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import scrapy
|
| 4 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 5 |
+
import aiohttp
|
| 6 |
+
import asyncio
|
| 7 |
+
from dotenv import load_dotenv, dotenv_values
|
| 8 |
+
from introlix_api.app.database import feed_data, db
|
| 9 |
+
from introlix_api.app.appwrite import fetch_root_sites
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
load_dotenv()
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class GenericSpider(scrapy.Spider):
    """
    Spider that crawls the web, starting from root sites stored in Appwrite,
    and collects article metadata to display on the Introlix feed.
    """
    name = "generic"

    def __init__(self, *args, **kwargs):
        super(GenericSpider, self).__init__(*args, **kwargs)
        self.executor = ThreadPoolExecutor(max_workers=10)  # Control parallelism

        self.data = []  # buffered feed items, persisted to MongoDB when the spider closes

        self.all_urls = fetch_root_sites()
        # Captures the registrable domain (e.g. "example.com") from a URL.
        self.domain_pattern = r'(?:[a-z0-9-]+\.)?([a-z0-9-]+\.[a-z]{2,})(?:\/|$)'

        self.allowed_domains = []
        self.start_urls = []
        self.CLASSIFICATION_API = os.getenv('CLASSIFICATION_API')

        for url in self.all_urls:
            result = re.search(self.domain_pattern, url)

            if result:
                self.allowed_domains.append(result.group(1))
                self.start_urls.append(result.group(1))

    def start_requests(self):
        # Crawl every root site; parse() then discovers article links on each page.
        for url in self.all_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def is_this_article(self, url):
        """
        Return True when *url* looks like an article page: it matches one of
        the known article URL patterns and contains no blacklisted keyword.
        """

        # list of article url patterns
        article_pattern = [
            r'/(blog|article|articles|post|blog|posts|blogs|)/\d{4}/\d{2}/+[a-z0-9-]+/?',
            r'/(blog|article|articles|post|blog|posts|blogs|)/[a-z0-9-]+/[a-z0-9-]+',
            r'(?<!\/\/www)(blog|article|articles|post|posts|blogs)/[a-z0-9-]+',
            r'^(?!.*\/category\/).*\/[a-z0-9-]+\/[a-z0-9-]+(-[a-z0-9-]+)+$',
            r'/[^/]+/\d{4}/\d{2}/\d{2}/+[a-z0-9]+/?',
            # BUG FIX: a missing comma after the next pattern previously caused
            # implicit string concatenation, silently merging it with the
            # following pattern into a single regex that could never match.
            r'/[^/]+/\d{4}/\d{2}/+[a-z0-9]+/?',
            r'/[a-z0-9-]+/\d{4}/\d{2}/+/?',
            r'/[a-z0-9-]+/\d{4}/\d{2}/\d{2}/+/?'
        ]

        # list of non article keywords
        non_article_words = [
            "category", "signup", "login", "about", "contact",  # Add more non-article keywords...
        ]

        # Check if the url matches any of the article patterns
        for pattern in article_pattern:
            if re.search(pattern, url):
                if not any(word in url for word in non_article_words):
                    return True
        return False

    def parse(self, response):
        """Collect article links from a crawled page and schedule them."""
        # Get all the urls from the response
        urls = response.css('a::attr(href)').extract()

        # Filter out the urls that are not article urls (query strings stripped)
        article_urls = [response.urljoin(url.split("?")[0]) for url in urls if self.is_this_article(url)]

        # Send a request to each article url
        for url in article_urls:
            yield scrapy.Request(url=url, callback=self.parse_article)

    async def classify_article(self, text):
        """
        Classify *text* via the external classification API.

        Returns the predicted category string, 'Unknown' when the API answers
        without a category, or 'Error' when the request fails.
        """
        classify_ai = self.CLASSIFICATION_API
        payload = {"text": text}

        # Send a request to the classification API
        async with aiohttp.ClientSession() as session:
            try:
                async with session.post(classify_ai, json=payload) as response:
                    response.raise_for_status()
                    result = await response.json()
                    return result.get('category', 'Unknown')
            except aiohttp.ClientError as e:
                self.logger.error(f"Error making request to classification API: {e}")
                return 'Error'

    async def parse_article(self, response):
        """
        Extract the article's metadata, classify its title, and buffer the
        resulting feed item in self.data.
        """
        hostname = response.url.split("/")[2]  # getting the website name of the article
        title = response.css("h1::text").get()  # getting the title of the article
        url = response.url  # getting the url of the article
        desc = response.css('meta[name="description"]::attr(content)').get()  # getting the description of the article
        publication_date = response.css('span::text, time::text').re_first(r'(\w+ \d+|\d+\s?\w+,? \w+)')  # getting the publication date of the article
        image_url = response.css('meta[property="og:image"]::attr(content)').get()  # getting the image url of the article

        # Classify article title asynchronously
        category = await self.classify_article(title)

        # Prepare feed item
        feed_items = {
            "title": title,
            "desc": desc,
            "url": url,
            "publication_date": publication_date,
            "image_url": image_url,
            "category": category,
            "source": hostname
        }

        self.data.append(feed_items)

    def closed(self, reason):
        """Scrapy hook: invoked once when the spider shuts down."""
        print(f"Spider closed: {reason}")
        print("Saving ----")
        self.save_data()

    def save_data(self):
        """Persist every buffered feed item to the MongoDB feed collection."""
        for feed_items in self.data:
            feed_data.insert_one(feed_items)
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
# import re
|
| 144 |
+
# import scrapy
|
| 145 |
+
# from pathlib import Path
|
| 146 |
+
# import requests
|
| 147 |
+
# from concurrent.futures import ThreadPoolExecutor
|
| 148 |
+
# from twisted.internet.defer import ensureDeferred
|
| 149 |
+
# from introlix_api.app.database import feed_data, db
|
| 150 |
+
# from introlix_api.app.appwrite import fetch_root_sites
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
# class GenericSpider(scrapy.Spider):
|
| 154 |
+
# name = "generic"
|
| 155 |
+
|
| 156 |
+
# def __init__(self, *args, **kwargs):
|
| 157 |
+
# super(GenericSpider, self).__init__(*args, **kwargs)
|
| 158 |
+
# self.executor = ThreadPoolExecutor(max_workers=10)
|
| 159 |
+
|
| 160 |
+
# self.data = []
|
| 161 |
+
|
| 162 |
+
# self.all_urls = fetch_root_sites()
|
| 163 |
+
# self.domain_pattern = r'(?:[a-z0-9-]+\.)?([a-z0-9-]+\.[a-z]{2,})(?:\/|$)'
|
| 164 |
+
|
| 165 |
+
# self.allowed_domains = []
|
| 166 |
+
# self.start_urls = []
|
| 167 |
+
|
| 168 |
+
# for url in self.all_urls:
|
| 169 |
+
# result = re.search(self.domain_pattern, url)
|
| 170 |
+
|
| 171 |
+
# if result:
|
| 172 |
+
# self.allowed_domains.append(result.group(1))
|
| 173 |
+
# self.start_urls.append(result.group(1))
|
| 174 |
+
|
| 175 |
+
# def start_requests(self):
|
| 176 |
+
|
| 177 |
+
# for url in self.all_urls:
|
| 178 |
+
# yield scrapy.Request(url=url, callback=self.parse)
|
| 179 |
+
|
| 180 |
+
# def is_this_article(self, url):
|
| 181 |
+
# article_pattern = [
|
| 182 |
+
# r'/(blog|article|articles|post|blog|posts|blogs|)/\d{4}/\d{2}/+[a-z0-9-]+/?',
|
| 183 |
+
# r'/(blog|article|articles|post|blog|posts|blogs|)/[a-z0-9-]+/[a-z0-9-]+',
|
| 184 |
+
# r'(?<!\/\/www)(blog|article|articles|post|posts|blogs)/[a-z0-9-]+',
|
| 185 |
+
# r'^(?!.*\/category\/).*\/[a-z0-9-]+\/[a-z0-9-]+(-[a-z0-9-]+)+$',
|
| 186 |
+
# r'/[^/]+/\d{4}/\d{2}/\d{2}/+[a-z0-9]+/?',
|
| 187 |
+
# r'/[^/]+/\d{4}/\d{2}/+[a-z0-9]+/?'
|
| 188 |
+
# r'/[a-z0-9-]+/\d{4}/\d{2}/+/?',
|
| 189 |
+
# r'/[a-z0-9-]+/\d{4}/\d{2}/\d{2}/+/?'
|
| 190 |
+
# ]
|
| 191 |
+
|
| 192 |
+
# # List of non-article keywords
|
| 193 |
+
# non_article_words = [
|
| 194 |
+
# "category",
|
| 195 |
+
# "signup",
|
| 196 |
+
# "login",
|
| 197 |
+
# "about",
|
| 198 |
+
# "contact",
|
| 199 |
+
# "privacy",
|
| 200 |
+
# "terms",
|
| 201 |
+
# "faq",
|
| 202 |
+
# "help",
|
| 203 |
+
# "support",
|
| 204 |
+
# "user",
|
| 205 |
+
# "account",
|
| 206 |
+
# "settings",
|
| 207 |
+
# "profile",
|
| 208 |
+
# "admin",
|
| 209 |
+
# "dashboard",
|
| 210 |
+
# "search",
|
| 211 |
+
# "index",
|
| 212 |
+
# "topics",
|
| 213 |
+
# "rss",
|
| 214 |
+
# "solutions",
|
| 215 |
+
# "shows",
|
| 216 |
+
# "author"
|
| 217 |
+
# ]
|
| 218 |
+
|
| 219 |
+
# for pattern in article_pattern:
|
| 220 |
+
# if re.search(pattern, url):
|
| 221 |
+
# for word in non_article_words:
|
| 222 |
+
# if word in url:
|
| 223 |
+
# return False
|
| 224 |
+
# return True
|
| 225 |
+
# return False
|
| 226 |
+
|
| 227 |
+
# def parse(self, response):
|
| 228 |
+
# urls = response.css('a::attr(href)').extract()
|
| 229 |
+
|
| 230 |
+
# article_urls = [response.urljoin(url.split("?")[0]) for url in urls if self.is_this_article(url)]
|
| 231 |
+
|
| 232 |
+
# for url in article_urls:
|
| 233 |
+
# yield scrapy.Request(url=url, callback=self.parse_article)
|
| 234 |
+
|
| 235 |
+
# def classify_article(self, text):
|
| 236 |
+
# classify_ai = "dont show api"
|
| 237 |
+
# payload = {"text": text}
|
| 238 |
+
|
| 239 |
+
# try:
|
| 240 |
+
# response = requests.post(classify_ai, json=payload)
|
| 241 |
+
# response.raise_for_status()
|
| 242 |
+
# return response.json().get('category', 'Unknown')
|
| 243 |
+
# except requests.RequestException as e:
|
| 244 |
+
# self.logger.error(f"Error making request to classification API: {e}")
|
| 245 |
+
# return 'Error'
|
| 246 |
+
|
| 247 |
+
# def parse_article(self, response):
|
| 248 |
+
# # getting all the infomation from the article
|
| 249 |
+
|
| 250 |
+
# hostname = response.url.split("/")[2]
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
# title = response.css("h1::text").get()
|
| 254 |
+
# url = response.url
|
| 255 |
+
# desc = response.css('meta[name="description"]::attr(content)').get()
|
| 256 |
+
# publication_date = response.css('span::text, time::text').re_first(r'(\w+ \d+|\d+\s?\w+,? \w+)')
|
| 257 |
+
# image_url = response.css('meta[property="og:image"]::attr(content)').get()
|
| 258 |
+
|
| 259 |
+
# # Using ThreadPoolExecutor to classify the title in a separate thread
|
| 260 |
+
# future = self.executor.submit(self.classify_article, title)
|
| 261 |
+
# category = future.result()
|
| 262 |
+
|
| 263 |
+
# # storing the infomation on mongodb
|
| 264 |
+
# feed_items = {
|
| 265 |
+
# "title": title,
|
| 266 |
+
# "desc": desc,
|
| 267 |
+
# "url": url,
|
| 268 |
+
# "publication_date": publication_date,
|
| 269 |
+
# "image_url": image_url,
|
| 270 |
+
# "category": category,
|
| 271 |
+
# "source": hostname
|
| 272 |
+
# }
|
| 273 |
+
|
| 274 |
+
# self.data.append(feed_items)
|
| 275 |
+
|
| 276 |
+
# def closed(self, reason):
|
| 277 |
+
# print(f"Spider closed: {reason}")
|
| 278 |
+
# print("Saving ----")
|
| 279 |
+
# self.save_data()
|
| 280 |
+
|
| 281 |
+
# def save_data(self):
|
| 282 |
+
# if "feed_Data" in db.list_collection_names():
|
| 283 |
+
# feed_data.drop()
|
| 284 |
+
|
| 285 |
+
# for feed_items in self.data:
|
| 286 |
+
# feed_data.insert_one(feed_items)
|
src/introlix_api/app/introlix_spider/scrapy.cfg
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Automatically created by: scrapy startproject
|
| 2 |
+
#
|
| 3 |
+
# For more information about the [deploy] section see:
|
| 4 |
+
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
| 5 |
+
|
| 6 |
+
[settings]
|
| 7 |
+
default = introlix_spider.settings
|
| 8 |
+
|
| 9 |
+
[deploy]
|
| 10 |
+
#url = http://localhost:6800/
|
| 11 |
+
project = introlix_spider
|
src/introlix_api/app/model.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field
|
| 2 |
+
from typing import List, Optional
|
| 3 |
+
from datetime import date
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
|
| 6 |
+
# signup model
|
| 7 |
+
class UserSignup(BaseModel):
    """Request body for POST /signup."""
    name: str
    email: str
    # NOTE(review): the signup route stores this value as-is — it looks like
    # plaintext; confirm whether hashing is planned.
    password: str
    dob: date
    # Interests selected at registration; persisted on the Appwrite account document.
    interestList: List[str]
|
| 13 |
+
|
| 14 |
+
# login model
|
| 15 |
+
class UserLogin(BaseModel):
    """Request body for POST /login."""
    email: str
    password: str
|
| 18 |
+
|
| 19 |
+
# feed model
|
| 20 |
+
class FeedModel(BaseModel):
    """Response schema for feed posts returned by GET /posts."""
    # Mongo ObjectId serialised as a string; populated from the document's "_id".
    id: str = Field(..., alias="_id")
    title: str
    desc: str
    url: str
    image_url: str
    tags: list
    vote: int
    # None when the source document carries no recognisable date.
    created_at: Optional[datetime]
|
| 29 |
+
|
| 30 |
+
class DiscussionModel(BaseModel):
    """Response schema for discussion posts returned by GET /discussion."""
    # Mongo ObjectId serialised as a string; populated from the document's "_id".
    id: str = Field(..., alias="_id")
    title: str
    url: str
    tags: list
    vote: int
    # None when the source document carries no recognisable date.
    created_at: Optional[datetime]
    answer_count: int
|
src/introlix_api/app/routes/__init__.py
ADDED
|
File without changes
|
src/introlix_api/app/routes/auth.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
from fastapi import APIRouter, HTTPException, Query
|
| 3 |
+
|
| 4 |
+
from introlix_api.exception import CustomException
|
| 5 |
+
from introlix_api.app.model import UserSignup, UserLogin
|
| 6 |
+
from introlix_api.app.appwrite import databases, APPWRITE_DATABASE_ID, ID, APPWRITE_ACCOUNT_COLLECTION_ID
|
| 7 |
+
from introlix_api.logger import logger
|
| 8 |
+
|
| 9 |
+
router = APIRouter()
|
| 10 |
+
|
| 11 |
+
@router.post("/test")
|
| 12 |
+
async def test(data: dict):
|
| 13 |
+
return {"message": f"POST request works with data {data}"}
|
| 14 |
+
|
| 15 |
+
@router.post('/signup')
async def signup(user: UserSignup):
    """
    Register a new user in the Appwrite account collection.

    Rejects the request with a 400 when the email is already registered;
    otherwise creates the account document with a randomly chosen avatar
    colour and returns basic profile data.
    """
    try:
        # List of avatar colors
        avatar_colors = [
            "#FF4500",  # Orange Red
            "#FF6347",  # Tomato
            "#FF7F50",  # Coral
            "#FF8C00",  # Dark Orange
            "#FFD700",  # Gold
            "#ADFF2F",  # Green Yellow
            "#32CD32",  # Lime Green
            "#00FA9A",  # Medium Spring Green
            "#40E0D0",  # Turquoise
            "#1E90FF",  # Dodger Blue
            "#4682B4",  # Steel Blue
            "#8A2BE2",  # Blue Violet
            "#FF69B4",  # Hot Pink
            "#FF1493",  # Deep Pink
            "#C71585"   # Medium Violet Red
        ]
        # Check if the email is already registered
        existing_users = databases.list_documents(
            database_id=APPWRITE_DATABASE_ID,
            collection_id=APPWRITE_ACCOUNT_COLLECTION_ID,
        )

        # Iterate through existing users to check if the email already exists
        for doc in existing_users['documents']:
            if doc['Email'] == user.email:
                raise HTTPException(status_code=400, detail="Email is already registered")

        # If email is not found, proceed with signup.
        # NOTE(review): the password is persisted as-is (plaintext) — confirm
        # and consider hashing before storing.
        result = databases.create_document(
            database_id=APPWRITE_DATABASE_ID,
            collection_id=APPWRITE_ACCOUNT_COLLECTION_ID,
            document_id=ID.unique(),
            data={
                "Name": user.name,
                "Email": user.email,
                "Password": user.password,
                "DOB": user.dob.isoformat(),
                "interests": user.interestList,
                "profileColor": random.choice(avatar_colors)
            }
        )
        # NOTE(review): result["Name"][0] returns only the first character of
        # the stored name — confirm this is intended (e.g. an avatar initial).
        return {"message": "User created successfully", "document_id": result['$id'], "interests": result["interests"], "name": result["Name"][0], "profileColor": result["profileColor"]}

    except HTTPException:
        # BUG FIX: re-raise HTTPException untouched so the deliberate
        # "Email is already registered" 400 keeps its message instead of
        # being re-wrapped by the generic handler below.
        raise
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
|
| 69 |
+
|
| 70 |
+
@router.post('/login')
async def login(user: UserLogin):
    """
    Authenticate a user by matching email and password against the
    Appwrite account collection.

    Returns basic profile data on success; raises a 400 with
    "Invalid credentials" when no account matches.
    """
    try:
        # List of users
        users = databases.list_documents(
            database_id=APPWRITE_DATABASE_ID,
            collection_id=APPWRITE_ACCOUNT_COLLECTION_ID
        )
        # Find user with matching email and password.
        # NOTE(review): passwords are compared in plaintext — they appear to be
        # stored unhashed; confirm and consider a password hash (e.g. bcrypt).
        for doc in users['documents']:
            if doc['Email'] == user.email and doc['Password'] == user.password:
                # NOTE(review): doc["Name"][0] returns only the first character
                # of the name — confirm this is intended (e.g. avatar initial).
                return {"message": "Login successful", "document_id": doc['$id'], "interests": doc["interests"], "name": doc["Name"][0], "profileColor": doc["profileColor"]}
        raise HTTPException(status_code=400, detail="Invalid credentials")
    except HTTPException:
        # BUG FIX: re-raise untouched so the "Invalid credentials" detail is
        # not re-wrapped (and garbled) by the generic handler below.
        raise
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
|
| 88 |
+
|
| 89 |
+
@router.post("/verify_it_user")
|
| 90 |
+
async def verify_user_exist(user_id: str = Query(...)):
|
| 91 |
+
"""
|
| 92 |
+
Function to verify if the user exists
|
| 93 |
+
"""
|
| 94 |
+
try:
|
| 95 |
+
# List of users
|
| 96 |
+
users = databases.list_documents(
|
| 97 |
+
database_id=APPWRITE_DATABASE_ID,
|
| 98 |
+
collection_id=APPWRITE_ACCOUNT_COLLECTION_ID
|
| 99 |
+
)
|
| 100 |
+
|
| 101 |
+
# Find user with matching id
|
| 102 |
+
for doc in users['documents']:
|
| 103 |
+
if user_id == doc['$id']:
|
| 104 |
+
return {"message": "It's User", "interests": doc["interests"], "name": doc["Name"][0], "profileColor": doc["profileColor"]}
|
| 105 |
+
|
| 106 |
+
# If no matching user found
|
| 107 |
+
raise HTTPException(status_code=404, detail="User not found")
|
| 108 |
+
except Exception as e:
|
| 109 |
+
raise HTTPException(status_code=500, detail=str(e))
|
src/introlix_api/app/routes/posts.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bson import ObjectId
|
| 2 |
+
import pytz
|
| 3 |
+
from dateutil import parser
|
| 4 |
+
from datetime import datetime, timezone
|
| 5 |
+
from fastapi import FastAPI, APIRouter, HTTPException, Request, Query
|
| 6 |
+
from introlix_api.app.database import votes
|
| 7 |
+
from introlix_api.exception import CustomException
|
| 8 |
+
from introlix_api.app.database import startup_db_client, shutdown_db_client
|
| 9 |
+
from introlix_api.app.model import FeedModel, DiscussionModel
|
| 10 |
+
from contextlib import asynccontextmanager
|
| 11 |
+
from typing import List
|
| 12 |
+
|
| 13 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: opens the MongoDB connection on startup and
    closes it again when the application shuts down."""
    # Start the database connection
    await startup_db_client(app)
    yield
    # Close the database connection
    await shutdown_db_client(app)
|
| 20 |
+
|
| 21 |
+
router = APIRouter()
|
| 22 |
+
|
| 23 |
+
def normalize_date(date_str):
    """
    Parse *date_str* into a timezone-aware datetime converted to UTC.

    Tries the fast ISO-8601 path first (accepting a trailing "Z"), then
    falls back to dateutil's flexible parser. Returns None when the string
    cannot be parsed at all.
    """
    try:
        # Attempt to parse as ISO format with timezone
        date_obj = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
    except ValueError:
        # If fromisoformat fails, fall back to a more flexible parser
        try:
            date_obj = parser.parse(date_str)
        except (ValueError, TypeError):
            print(f"Warning: Unrecognized date format for '{date_str}'")
            return None  # Return None or handle the invalid date as needed

    # Convert to UTC and return. stdlib timezone.utc (already imported in
    # this module) is equivalent to pytz.UTC here and drops the third-party
    # dependency from this code path.
    return date_obj.astimezone(timezone.utc)
|
| 37 |
+
|
| 38 |
+
@router.get('/posts', response_model=List[FeedModel])
async def fetch_data(request: Request, tags: List[str] = Query(...), page: int = 1, limit: int = 20):
    """
    Function to fetch posts based on pagination, query, and sorting options.

    Fetches up to *limit* "article" documents from the `search_data`
    collection whose content.tags intersect *tags*, flattens the nested
    `content` fields onto each item, computes a hotness rank from vote count
    and age, and returns the page sorted by that rank.
    """
    try:
        skip = (page - 1) * limit
        # Match articles sharing at least one of the requested tags.
        query = {
            "content.tags": {"$in": tags},
            "type": "article"
        }
        response = await request.app.mongodb['search_data'].find(query).skip(skip).limit(limit).to_list(limit)

        current_date = datetime.now(timezone.utc)
        hotness_ranked_posts = []

        for item in response:
            # Flatten the nested `content` sub-document so the response
            # matches the FeedModel schema.
            item["_id"] = str(item['_id'])
            item["title"] = item['content'].get('title', '')
            item["desc"] = item['content'].get('desc', '')
            item["url"] = item.get('url', '')
            item["image_url"] = item['content'].get('image', '') or ""
            item["tags"] = item['content'].get('tags', [])
            item["vote"] = item['content'].get('vote', 0)

            # Handle created_at normalization
            created_at_str = item['content'].get('created_at', '')
            if created_at_str in [None, "No date found"]:
                # Missing dates are treated as "now" (rank as brand new).
                created_at = current_date
            else:
                created_at = normalize_date(created_at_str)

            # Ensure created_at is a datetime object; if None, skip the calculation
            if created_at:
                # Calculate age in hours
                age_hours = (current_date - created_at).total_seconds() / 3600

                # Hotness ranking formula (Hacker-News style: votes decayed by age)
                rank = (item["vote"] - 1) / ((age_hours + 2) ** 1.5)
                item["rank"] = rank
            else:
                # If created_at is invalid, set rank low
                item["rank"] = float('-inf')

            item["created_at"] = created_at.isoformat() if created_at else "Unknown"
            hotness_ranked_posts.append(item)

        # NOTE(review): reverse=False puts the LOWEST-ranked (coldest) posts
        # first; hotness feeds usually want reverse=True — confirm intended order.
        hotness_ranked_posts.sort(key=lambda x: x["rank"], reverse=False)
        return hotness_ranked_posts

    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
|
| 90 |
+
|
| 91 |
+
@router.get('/discussion', response_model=List[DiscussionModel])
async def fetch_disscussion(request: Request, tags: List[str] = Query(...), page: int = 1, limit: int = 20):
    """
    Fetch "discussion" posts matching *tags*, ranked by hotness.

    Pagination is driven by *page*/*limit*. For discussions `created_at`
    is stored as a unix timestamp and is converted to an aware UTC datetime
    before ranking.
    """
    try:
        skip = (page - 1) * limit
        query = {
            "content.tags": {"$in": tags},
            "type": "discussion"
        }
        response = await request.app.mongodb['search_data'].find(query).skip(skip).limit(limit).to_list(limit)

        current_date = datetime.now(timezone.utc)
        hotness_ranked_posts = []

        for item in response:
            item["_id"] = str(item['_id'])
            item["title"] = item['content'].get('title', '')
            item["url"] = item.get('url', '')
            item["tags"] = item['content'].get('tags', [])
            item["vote"] = item['content'].get('vote', 0)
            item["answer_count"] = item['content'].get('answer_count', 0)

            # Handle created_at normalization.
            # BUG FIX: the missing/invalid check previously ran *after*
            # datetime.utcfromtimestamp(), so it could never match and any
            # non-numeric value (e.g. '' or "No date found") raised a
            # TypeError that failed the whole request with a 400.
            created_at_raw = item['content'].get('created_at', '')
            if created_at_raw in [None, "", "No date found"]:
                created_at = current_date
            else:
                try:
                    # fromtimestamp(..., timezone.utc) yields an *aware* UTC
                    # datetime. (utcfromtimestamp is deprecated and returned a
                    # naive one, which normalize_date then shifted by the
                    # server's local offset.)
                    created_at = datetime.fromtimestamp(created_at_raw, timezone.utc)
                except (TypeError, ValueError, OSError):
                    # Fall back to string parsing for non-numeric values.
                    created_at = normalize_date(str(created_at_raw))

            # Ensure created_at is a datetime object; if None, skip the calculation
            if created_at:
                # Calculate age in hours
                age_hours = (current_date - created_at).total_seconds() / 3600

                # Hotness ranking formula (votes decayed by age)
                rank = (item["vote"] - 1) / ((age_hours + 2) ** 1.5)
                item["rank"] = rank
            else:
                # If created_at is invalid, set rank low
                item["rank"] = float('-inf')

            item["created_at"] = created_at.isoformat() if created_at else "Unknown"
            hotness_ranked_posts.append(item)

        # NOTE(review): reverse=False keeps the original ascending order
        # (coldest first); hotness feeds usually want reverse=True — confirm.
        hotness_ranked_posts.sort(key=lambda x: x["rank"], reverse=False)
        return hotness_ranked_posts

    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
|
| 143 |
+
|
| 144 |
+
@router.post('/vote')
async def vote(request: Request, vote: int, post_id: str = Query(...), user_id: str = Query(...)):
    """
    Function to vote for a post.

    Toggle semantics: submitting the same vote again removes it; submitting a
    different vote replaces the existing one. Afterwards the post's total
    vote count is recomputed and written back onto the post document.

    NOTE(review): reads go through the async driver
    (`request.app.mongodb['votes']`, awaited) while writes use the
    module-level `votes` collection without `await` — presumably a separate
    synchronous client. Confirm both point at the same collection, and that
    mixing sync writes into this async handler is intentional.
    """
    try:
        post_id = ObjectId(post_id)

        # Check if the user has already voted for the post
        result = await request.app.mongodb['votes'].find_one({"user_id": user_id, "post_id": post_id, "vote": vote})

        if result:
            # Same vote submitted again -> treat as an "un-vote" and remove it.
            votes.delete_one({
                "_id": result["_id"]
            })
        else:
            # A different (or no) prior vote: drop any existing vote first ...
            existing_vote = await request.app.mongodb['votes'].find_one({"user_id": user_id, "post_id": post_id})
            if existing_vote:
                votes.delete_one({
                    "_id": existing_vote["_id"]
                })

            # ... then record the new one.
            votes.insert_one({
                "post_id": post_id,
                "user_id": user_id,
                "vote": vote
            })

        # counting total vote
        # Calculate the total vote count for the post
        total_votes = await request.app.mongodb['votes'].aggregate([
            {"$match": {"post_id": post_id}},
            {"$group": {"_id": "$post_id", "total_votes": {"$sum": "$vote"}}}
        ]).to_list(length=1)

        # Extract the total vote count or default to 0 if no votes are found
        vote_count = total_votes[0]["total_votes"] if total_votes else 0

        # Update the vote count in the post document
        await request.app.mongodb['search_data'].update_one(
            {"_id": post_id},
            {"$set": {"content.vote": vote_count}}
        )

        return {"message": f"Vote submitted successfully with total vote {vote_count}"}
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
@router.get('/hasvoted')
async def hasVote(request: Request, post_id: str = Query(...), user_id: str = Query(...)):
    """Report whether *user_id* already has a vote on *post_id*, and its value."""
    try:
        post_id = ObjectId(post_id)

        existing_vote = await request.app.mongodb['votes'].find_one({"user_id": user_id, "post_id": post_id})

        if existing_vote is None:
            return {"has_voted": False}
        return {"has_voted": True, "vote": existing_vote['vote']}
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
|
src/introlix_api/app/routes/run_spider.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import subprocess
|
| 3 |
+
from fastapi import APIRouter, HTTPException, Query
|
| 4 |
+
|
| 5 |
+
from introlix_api.exception import CustomException
|
| 6 |
+
from introlix_api.logger import logger
|
| 7 |
+
|
| 8 |
+
router = APIRouter()
|
| 9 |
+
|
| 10 |
+
@router.post('/run_spider')
async def run_spider():
    """
    Run the "generic" Scrapy spider as a subprocess and return its
    captured stdout and stderr.
    """
    try:
        # Command and working directory for the Scrapy project.
        spider_command = ["scrapy", "crawl", "generic"]
        spider_directory = "src/introlix_api/app/introlix_spider"

        completed = subprocess.run(
            spider_command,
            cwd=spider_directory,
            capture_output=True,
            text=True,
        )

        return completed.stdout, completed.stderr
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
|
src/introlix_api/app/routes/similarity.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
import re
|
| 3 |
+
from fastapi import FastAPI, APIRouter, HTTPException, Request
|
| 4 |
+
|
| 5 |
+
from introlix_api.exception import CustomException
|
| 6 |
+
from introlix_api.app.database import startup_db_client, shutdown_db_client
|
| 7 |
+
from introlix_api.logger import logger
|
| 8 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 9 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 10 |
+
from contextlib import asynccontextmanager
|
| 11 |
+
|
| 12 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: open the MongoDB client before serving and close it afterwards.

    NOTE(review): there is no try/finally around the yield, so
    shutdown_db_client is skipped if the application exits abnormally —
    confirm this is intentional.
    """
    # Start the database connection
    await startup_db_client(app)
    yield
    # Close the database connection
    await shutdown_db_client(app)
|
| 19 |
+
|
| 20 |
+
router = APIRouter()
|
| 21 |
+
|
| 22 |
+
def preprocess_text(text):
    """Normalize raw text for TF-IDF comparison.

    Lowercases the input, collapses every run of whitespace (spaces,
    newlines, tabs) into a single space, then strips punctuation.
    """
    lowered = text.lower()
    collapsed = re.sub(r'\s+', ' ', lowered)
    return re.sub(r'[^\w\s]', '', collapsed)
|
| 31 |
+
|
| 32 |
+
@router.get('/similarity')
async def similarity(request: Request, page: int = 1, limit: int = 20, query: str = None):
    """
    Rank one page of feed posts by TF-IDF cosine similarity to *query*.

    Args:
        request: FastAPI request; ``request.app.mongodb`` holds the client.
        page: 1-based page number into the ``feedData`` collection.
        limit: number of documents fetched for the page.
        query: free-text query to compare posts against (required).

    Returns:
        list[dict]: posts whose similarity to the query is >= 0.05.

    Raises:
        HTTPException: 400 when ``query`` is missing or on any other error.
    """
    try:
        # Ensure the query is provided
        if not query:
            raise HTTPException(status_code=400, detail="Query parameter is required")

        skip = (page - 1) * limit

        # Fetch posts from MongoDB
        response = await request.app.mongodb['feedData'].find().skip(skip).limit(limit).to_list(limit)

        # Filter out items that do not have both title and description
        response = [item for item in response if item.get('title') and item.get('desc')]

        # Guard: TfidfVectorizer raises on an empty corpus; with no usable
        # posts there is nothing similar to return.
        if not response:
            return []

        # Convert ObjectId to string for JSON serialization
        for item in response:
            item['_id'] = str(item['_id'])

        # Prepare document texts (title + desc) for similarity calculation
        posts_texts = [preprocess_text(item['title'] + ' ' + item['desc']) for item in response]

        # Preprocess the query and put it first so row 0 of the matrix is the query
        query = preprocess_text(query)
        documents = [query] + posts_texts

        # Apply TF-IDF vectorization over query + posts together
        vectorizer = TfidfVectorizer(stop_words='english')
        tfidf = vectorizer.fit_transform(documents)

        # Cosine similarity between the query (row 0) and each post (rows 1..n)
        cosine_similarities = cosine_similarity(tfidf[0:1], tfidf[1:]).flatten()

        # Low threshold: title+description texts are short, so scores are small
        similarity_threshold = 0.05

        similar_posts = [
            response[i] for i in range(len(response)) if cosine_similarities[i] >= similarity_threshold
        ]

        return similar_posts
    except HTTPException:
        # Bug fix: the original broad handler re-wrapped the deliberate 400
        # "Query parameter is required" response; let it propagate untouched.
        raise
    except Exception as e:
        # Bug fix: the original passed the builtin `str` (detail=str) instead
        # of the error message str(e).
        raise HTTPException(status_code=400, detail=str(e))
|
src/introlix_api/crawler/__init__.py
ADDED
|
File without changes
|
src/introlix_api/crawler/bot.py
ADDED
|
@@ -0,0 +1,390 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, sys, re, time
|
| 2 |
+
import errno
|
| 3 |
+
import string
|
| 4 |
+
import requests
|
| 5 |
+
import multiprocessing
|
| 6 |
+
from bs4 import BeautifulSoup
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
from introlix_api.logger import logger
|
| 9 |
+
from urllib.parse import urlparse, urlunsplit, urljoin
|
| 10 |
+
from urllib.robotparser import RobotFileParser
|
| 11 |
+
from introlix_api.exception import CustomException
|
| 12 |
+
from urllib.robotparser import RobotFileParser
|
| 13 |
+
|
| 14 |
+
from requests import ReadTimeout
|
| 15 |
+
from introlix_api.utils.core import html_to_dom
|
| 16 |
+
from introlix_api.utils.tags import fetch_tags
|
| 17 |
+
from introlix_api.utils.root_sites import root_sites
|
| 18 |
+
from ssl import SSLCertVerificationError
|
| 19 |
+
from urllib3.exceptions import NewConnectionError, MaxRetryError
|
| 20 |
+
|
| 21 |
+
@dataclass
class BotArgs:
    """Crawler configuration shared by IntrolixBot instances.

    NOTE(review): none of these attributes carry type annotations, so
    @dataclass treats them as plain class attributes rather than dataclass
    fields — confirm that is intended before annotating them.
    """
    # Per-request (and per-download) time budget, in seconds.
    TIMEOUT_SECONDS = 3
    # Stop downloading a response body after 1 MiB.
    MAX_FETCH_SIZE = 1024*1024
    # URLs matching this are uncrawlable: localhost links, assets, archives, source files.
    BAD_URL_REGEX = re.compile(r'\/\/localhost\b|\.jpg$|\.png$|\.js$|\.gz$|\.zip$|\.pdf$|\.bz2$|\.ipynb$|\.py$')
    # Loose pattern for a well-formed absolute http(s) URL.
    GOOD_URL_REGEX = re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
    # Fallback text encoding and decode-error policy for fetched pages.
    DEFAULT_ENCODING = 'utf8'
    DEFAULT_ENC_ERRORS = 'replace'
    # Network/SSL failures that scrape() treats as recoverable per-URL errors.
    ALLOWED_EXCEPTIONS = (ValueError, ConnectionError, ReadTimeout, TimeoutError,
                          OSError, NewConnectionError, MaxRetryError, SSLCertVerificationError)
|
| 31 |
+
|
| 32 |
+
class IntrolixBot:
    """Polite web crawler: fetches pages, extracts metadata and outbound links,
    restricted to a configured set of root sites and (optionally) robots.txt rules."""

    def __init__(self, urls: list, args: BotArgs, obey_robots_txt: bool = True):
        """
        Initialize the IntrolixBot.

        Args:
            urls (list): List of URLs to scrape.
            args (BotArgs): crawl configuration (timeouts, size caps, URL regexes).
            obey_robots_txt (bool, optional): Whether to obey robots.txt. Defaults to True.
        """
        self.urls = urls
        self.obey_robots_txt = obey_robots_txt
        self.root_sites = root_sites()
        # Pre-computed hostnames of the allowed root sites, for O(1)
        # membership tests when filtering links in get_urls_from_page().
        self.root_sites_netlocs = {urlparse(root_url).netloc for root_url in self.root_sites}
        self.good_tags = fetch_tags()

        # bot args — copied onto the instance so methods don't reach back into `args`
        self.TIMEOUT_SECONDS = args.TIMEOUT_SECONDS
        self.MAX_FETCH_SIZE = args.MAX_FETCH_SIZE
        self.BAD_URL_REGEX = args.BAD_URL_REGEX
        self.GOOD_URL_REGEX = args.GOOD_URL_REGEX
        self.DEFAULT_ENCODING = args.DEFAULT_ENCODING
        self.DEFAULT_ENC_ERRORS = args.DEFAULT_ENC_ERRORS
        self.ALLOWED_EXCEPTIONS = args.ALLOWED_EXCEPTIONS
|
| 55 |
+
|
| 56 |
+
def fetch(self, url:str) -> tuple[int, bytes]:
|
| 57 |
+
"""
|
| 58 |
+
Function to fetch a URL.
|
| 59 |
+
|
| 60 |
+
Args:
|
| 61 |
+
url (str): URL to fetch.
|
| 62 |
+
Returns:
|
| 63 |
+
tuple[int, bytes]: status code and content.
|
| 64 |
+
"""
|
| 65 |
+
|
| 66 |
+
r = requests.get(url, stream=True, timeout=self.TIMEOUT_SECONDS)
|
| 67 |
+
|
| 68 |
+
size = 0
|
| 69 |
+
start = time.time()
|
| 70 |
+
|
| 71 |
+
content = b""
|
| 72 |
+
for chunk in r.iter_content(1024):
|
| 73 |
+
if time.time() - start > self.TIMEOUT_SECONDS:
|
| 74 |
+
raise ValueError('Timeout reached')
|
| 75 |
+
|
| 76 |
+
content += chunk
|
| 77 |
+
|
| 78 |
+
size += len(chunk)
|
| 79 |
+
if size > self.MAX_FETCH_SIZE:
|
| 80 |
+
logger.debug(f"Maximum size reached for URL {url}")
|
| 81 |
+
break
|
| 82 |
+
|
| 83 |
+
return r.status_code, content
|
| 84 |
+
|
| 85 |
+
    def see_robots_txt(self, url: str) -> bool:
        """
        Check whether robots.txt allows this bot ('IntrolixBot') to crawl *url*.

        Fail policy: an unparseable URL is treated as disallowed (False);
        a robots.txt that cannot be fetched or decoded is treated as
        allowed (True), i.e. fail-open.

        Args:
            url (str): URL to check.
        Returns:
            bool: True if the bot is allowed to crawl, False otherwise.
        """
        try:
            try:
                parsed_url = urlparse(url)
            except ValueError:
                logger.debug(f"Unable to parse URL: {url}")
                return False

            # Build <scheme>://<host>/robots.txt (urlunsplit prepends the '/')
            robots_url = urlunsplit((parsed_url.scheme, parsed_url.netloc, 'robots.txt', '', ''))
            parse_robots = RobotFileParser(robots_url)

            try:
                status_code, content = self.fetch(robots_url)
            except Exception as e:  # Catch all exceptions for now
                # Fail-open: if robots.txt is unreachable, assume crawling is allowed.
                logger.debug(f"Robots error: {robots_url}, {e}")
                return True

            # Try a couple of common encodings; robots files are rarely exotic.
            decoded = None
            for encoding in ['utf-8', 'iso-8859-1']:
                try:
                    decoded = content.decode(encoding).splitlines()
                    break
                except UnicodeDecodeError:
                    pass

            if decoded is None:
                # Fail-open on undecodable robots.txt as well.
                logger.debug(f"Unable to decode robots file {robots_url}")
                return True

            parse_robots.parse(decoded)
            allowed = parse_robots.can_fetch('IntrolixBot', url)  # Your bot's name
            logger.debug(f"Robots allowed for {url}: {allowed} and {decoded} is decoded with {robots_url}")
            return allowed
        except Exception as e:
            raise CustomException(e, sys) from e
|
| 129 |
+
|
| 130 |
+
def get_urls_from_page(self, url: str) -> list:
|
| 131 |
+
"""
|
| 132 |
+
Function to get all URLs from a page.
|
| 133 |
+
|
| 134 |
+
Args:
|
| 135 |
+
url (str): URL of the page.
|
| 136 |
+
Returns:
|
| 137 |
+
list: List of URLs from the page.
|
| 138 |
+
"""
|
| 139 |
+
try:
|
| 140 |
+
status_code, content = self.fetch(url)
|
| 141 |
+
|
| 142 |
+
if status_code != 200:
|
| 143 |
+
return []
|
| 144 |
+
|
| 145 |
+
soup = BeautifulSoup(content, 'html.parser')
|
| 146 |
+
urls = []
|
| 147 |
+
|
| 148 |
+
for link in soup.find_all('a'):
|
| 149 |
+
href = link.get('href')
|
| 150 |
+
if href:
|
| 151 |
+
if not href.startswith('http'):
|
| 152 |
+
href = urljoin(url, href)
|
| 153 |
+
# if not self.BAD_URL_REGEX.search(href):
|
| 154 |
+
# href = href
|
| 155 |
+
if self.GOOD_URL_REGEX.search(href):
|
| 156 |
+
href_netloc = urlparse(href).netloc
|
| 157 |
+
|
| 158 |
+
logger.debug(f"Checking href domain: {href_netloc} against root domains")
|
| 159 |
+
|
| 160 |
+
if href_netloc in self.root_sites_netlocs:
|
| 161 |
+
urls.append(href)
|
| 162 |
+
|
| 163 |
+
return list(set(urls))
|
| 164 |
+
|
| 165 |
+
except Exception as e:
|
| 166 |
+
logger.info(f"Error occured while getting urls from page {e}")
|
| 167 |
+
return []
|
| 168 |
+
# raise CustomException(e, sys) from e
|
| 169 |
+
|
| 170 |
+
    def scrape(self, url: str) -> dict:
        """
        Crawl one URL and extract its metadata.

        Pipeline: robots.txt check (optional) -> fetch -> DOM parse ->
        title/description/image extraction -> link collection -> tag matching
        -> publish-date heuristics. Each failure mode returns a dict with an
        'error' entry instead of raising, so batch crawls keep going.

        Args:
            url (str): URL to scrape.
        Returns:
            dict: on success {'url', 'content': {...}}; on failure
            {'url', 'status', 'timestamp', 'content': None, 'error': {...}}.
        Raises:
            CustomException: only for unexpected errors outside the handled cases.
        """
        try:
            logger.info(f"Crawling URL {url}")
            # Millisecond epoch timestamp (JavaScript convention), stored on error records.
            js_timestamp = int(time.time() * 1000)

            if self.obey_robots_txt:
                allowed = self.see_robots_txt(url)

                if not allowed:
                    return {
                        'url': url,
                        'status': None,
                        'timestamp': js_timestamp,
                        'content': None,
                        'error': {
                            'name': 'RobotsDenied',
                            'message': 'Robots do not allow this URL',
                        }
                    }

            try:
                status_code, content = self.fetch(url)
            except self.ALLOWED_EXCEPTIONS as e:
                # Recoverable network/SSL failures become error records, not crashes.
                logger.debug(f"Exception crawling URl {url}: {e}")
                return {
                    'url': url,
                    'status': None,
                    'timestamp': js_timestamp,
                    'content': None,
                    'error': {
                        'name': 'AbortError',
                        'message': str(e),
                    }
                }

            if len(content) == 0:
                return {
                    'url': url,
                    'status': status_code,
                    'timestamp': js_timestamp,
                    'content': None,
                    'error': {
                        'name': 'NoResponseText',
                        'message': 'No response found',
                    }
                }

            try:
                dom = html_to_dom(content, self.DEFAULT_ENCODING, None, self.DEFAULT_ENC_ERRORS)
            except Exception as e:
                logger.exception(f"Error parsing dom: {url}")
                return {
                    'url': url,
                    'status': status_code,
                    'timestamp': js_timestamp,
                    'content': None,
                    'error': {
                        'name': e.__class__.__name__,
                        'message': str(e),
                    }
                }

            # <title> text, if present and non-empty.
            title_element = dom.xpath("//title")
            title = ""
            if len(title_element) > 0:
                title_text = title_element[0].text
                if title_text is not None:
                    title = title_text.strip()


            # <meta name="description"> content, if present.
            desc_element = dom.xpath("//meta[@name='description']")
            desc = ""
            if len(desc_element) > 0:
                desc_text = desc_element[0].get('content')
                if desc_text is not None:
                    desc = desc_text.strip()

            # Preferred image: og:image; fallback: first <img src> resolved against the page URL.
            og_image_element = dom.xpath("//meta[@property='og:image']/@content")
            if og_image_element:
                image = og_image_element[0]
            else:
                image_elements = dom.xpath("//img")
                image_urls = [urljoin(url, img.get("src")) for img in image_elements if img.get("src")]
                if len(image_urls) > 0:
                    image = image_urls[0]
                else:
                    image = ""

            new_links = self.get_urls_from_page(url)
            new_links = list(set(new_links))

            # Normalize extracted keywords to match the format in good_tags
            # (lowercase, punctuation stripped, split on whitespace/hyphens).
            normalized_title = re.split(r'[\s-]+', title.lower().translate(str.maketrans('', '',
                                                                                         string.punctuation)))
            # Filter based on good_tags
            tags = [tag for tag in self.good_tags if tag in normalized_title]
            if not tags:
                tags = ['general']


            # Primary publish-date source: Open Graph article metadata.
            date = dom.xpath("string(//meta[@property='article:published_time']/@content)")

            # Fallback: Check JSON-LD for datePublished in <script>
            if not date:
                json_ld_date = dom.xpath("string(//script[@type='application/ld+json'])")
                if json_ld_date:
                    import json
                    try:
                        data = json.loads(json_ld_date)
                        date = data.get("datePublished", "").split("T")[0]
                    except json.JSONDecodeError:
                        pass

            # Fallback: Look for <time> tag with datetime attribute
            if not date:
                date = dom.xpath("string(//time/@datetime)")

            # Fallback: Check for common patterns with 'Last Updated'
            if not date:
                date = dom.xpath("string(//span[contains(text(), 'Last Updated')])")

            # Clean up date format if necessary (for example, strip out extra text)
            if date:
                # Extract date pattern YYYY-MM-DD or similar
                match = re.search(r"\d{4}-\d{2}-\d{2}", date) or re.search(r"\d{2} \w{3}, \d{4}", date)
                date = match.group(0) if match else date


            return {
                'url': url,
                'content': {
                    'title': title,
                    'desc': desc,
                    'image': image,
                    'tags': tags,
                    'vote': 0,
                    'links': sorted(new_links),
                    'created_at': date if date else 'No date found'
                },
            }

        except Exception as e:
            raise CustomException(e, sys) from e
|
| 321 |
+
|
| 322 |
+
def batch_converter(self, lst: list, batch_size: int):
|
| 323 |
+
"""
|
| 324 |
+
Convert list into batches of a specified size.
|
| 325 |
+
|
| 326 |
+
Args:
|
| 327 |
+
list (list): list to convert
|
| 328 |
+
batch_size (int): size of the batch
|
| 329 |
+
"""
|
| 330 |
+
for i in range(0, len(lst), batch_size):
|
| 331 |
+
yield lst[i:i + batch_size]
|
| 332 |
+
|
| 333 |
+
def scrape_parallel(self, batch_size: int):
|
| 334 |
+
"""
|
| 335 |
+
Process scrape in parallel using multiprocessing.
|
| 336 |
+
|
| 337 |
+
Args:
|
| 338 |
+
urls (list): List of site URLs to process.
|
| 339 |
+
batch_size (int): Number of URLs to process in each batch.
|
| 340 |
+
Returns:
|
| 341 |
+
|
| 342 |
+
"""
|
| 343 |
+
num_workers = max(1, os.cpu_count() - 1)
|
| 344 |
+
# getting urls in batch
|
| 345 |
+
batch_url = list(self.batch_converter(self.urls, batch_size))
|
| 346 |
+
|
| 347 |
+
try:
|
| 348 |
+
# Create a multiprocessing pool
|
| 349 |
+
with multiprocessing.Pool(processes=num_workers) as pool:
|
| 350 |
+
for batch in batch_url:
|
| 351 |
+
results = pool.map(self.scrape, batch)
|
| 352 |
+
# data = list([sublist for sublist in results])
|
| 353 |
+
|
| 354 |
+
yield results
|
| 355 |
+
time.sleep(0.1)
|
| 356 |
+
except IOError as e:
|
| 357 |
+
if e.errno == errno.EPIPE:
|
| 358 |
+
pass
|
| 359 |
+
|
| 360 |
+
def get_urls_from_page_parallel(self, urls: list, batch_size: int):
|
| 361 |
+
"""
|
| 362 |
+
Process get_urls_from_page in parallel using multiprocessing.
|
| 363 |
+
|
| 364 |
+
Args:
|
| 365 |
+
urls (list): List of site URLs to process.
|
| 366 |
+
batch_size (int): Number of URLs to process in each batch.
|
| 367 |
+
"""
|
| 368 |
+
num_workers = max(1, os.cpu_count() - 1)
|
| 369 |
+
|
| 370 |
+
# getting urls in batch
|
| 371 |
+
batch_url = list(self.batch_converter(urls, batch_size))
|
| 372 |
+
|
| 373 |
+
try:
|
| 374 |
+
# Create a multiprocessing pool
|
| 375 |
+
with multiprocessing.Pool(processes=num_workers) as pool:
|
| 376 |
+
for batch in batch_url:
|
| 377 |
+
results = pool.map(self.get_urls_from_page, batch)
|
| 378 |
+
# return list([url for sublist in results for url in sublist])
|
| 379 |
+
for sublist in results:
|
| 380 |
+
for url in sublist:
|
| 381 |
+
yield url # Yield each URL incrementally
|
| 382 |
+
time.sleep(0.1)
|
| 383 |
+
|
| 384 |
+
|
| 385 |
+
except IOError as e:
|
| 386 |
+
if e.errno == errno.EPIPE:
|
| 387 |
+
pass
|
| 388 |
+
|
| 389 |
+
    def fetch_tags(self):
        """Return the list of known-good tags cached at construction time."""
        return self.good_tags
|
src/introlix_api/engine/__init__.py
ADDED
|
File without changes
|
src/introlix_api/engine/api_data.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from introlix_api.engine.third_party_apis import get_devDotTo_data
|
| 2 |
+
from introlix_api.engine.graphql import fetch_hashnode_posts
|
| 3 |
+
from introlix_api.app.database import search_data
|
| 4 |
+
from introlix_api.logger import logger
|
| 5 |
+
|
| 6 |
+
def _to_entry(item):
    """Map one raw API item (dev.to or Hashnode shape) to the feed document schema."""
    return {
        "url": item["url"],
        "content": {
            "title": item["title"],
            "desc": item["description"],
            "image": item["image"],
            "tags": item["tags"],
            "vote": 0,
            "created_at": item["created_at"],
        },
        "type": item["type"]
    }


def fetch_data(page: int = 1, per_page: int = 10, tag=''):
    """
    Fetch posts from dev.to and Hashnode and normalize them into feed entries.

    Args:
        page: 1-based page number passed to both upstream APIs.
        per_page: number of items requested per source.
        tag: optional tag filter passed to both sources.

    Returns:
        list[dict]: normalized entries; empty list when both sources are empty.
    """
    devDotTo_data = get_devDotTo_data(page, per_page, tag)
    hashnode_posts = fetch_hashnode_posts(page=page, per_page=per_page, tag=tag)

    # The original duplicated the exact same dict-building loop in every
    # branch; the branches only differ in which source list feeds it.
    if hashnode_posts and devDotTo_data:
        combined_data = devDotTo_data + hashnode_posts
    elif hashnode_posts:
        combined_data = hashnode_posts
    elif devDotTo_data:
        combined_data = devDotTo_data
    else:
        return []

    return [_to_entry(item) for item in combined_data]
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def batch_converter(lst: list, batch_size: int):
    """
    Yield successive slices of *lst* of length *batch_size*.

    Args:
        lst (list): list to convert
        batch_size (int): size of each batch (the last one may be shorter)
    """
    start = 0
    while start < len(lst):
        yield lst[start:start + batch_size]
        start += batch_size
|
| 86 |
+
|
| 87 |
+
if __name__ == '__main__':
    # Seed the search_data collection page by page, skipping URLs already stored.
    for page_no in range(1, 1001):
        data = fetch_data(page=page_no)
        if data:
            for batch in batch_converter(data, batch_size=100):
                # Bug fix: the original read [d["url"] for d in data] here,
                # re-querying every URL of the whole page on each batch
                # iteration instead of only this batch's URLs.
                urls = [d["url"] for d in batch]

                # One batched lookup for all candidate URLs.
                existing_urls = {doc["url"] for doc in search_data.find({"url": {"$in": urls}})}

                for d in batch:
                    if d["url"] not in existing_urls:
                        search_data.insert_one(d)
        else:
            logger.debug("No data to save")
|
src/introlix_api/engine/discussion.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from introlix_api.engine.third_party_apis import get_stack_overflow_data
|
| 2 |
+
from introlix_api.utils.tags import fetch_tags
|
| 3 |
+
from introlix_api.logger import logger
|
| 4 |
+
from introlix_api.app.database import search_data
|
| 5 |
+
|
| 6 |
+
def fetch_discussion(page: int = 1, per_page: int = 10, tag: str = ''):
    """
    Fetch Stack Overflow questions and normalize them into feed entries.

    Args:
        page: 1-based page number for the Stack Exchange API.
        per_page: number of questions to fetch.
        tag: Stack Overflow tag to filter by.

    Returns:
        list[dict]: entries with url, content (title/tags/vote/created_at/answer_count), and type.
    """
    raw_questions = get_stack_overflow_data(page=page, per_page=per_page, tag=tag)

    return [
        {
            "url": item["url"],
            "content": {
                "title": item["title"],
                "tags": item["tags"],
                "vote": 0,
                "created_at": item["created_at"],
                "answer_count": item["answer_count"],
            },
            "type": item["type"]
        }
        for item in raw_questions
    ]
|
| 28 |
+
|
| 29 |
+
if __name__ == '__main__':
    # Seed the search_data collection with discussions for every known tag,
    # skipping URLs that are already stored.
    for tag in fetch_tags():
        data = fetch_discussion(page=1, per_page=10, tag=tag)
        if data:
            urls = [d["url"] for d in data]

            # One batched lookup for all candidate URLs instead of per-document queries.
            existing_urls = {doc["url"] for doc in search_data.find({"url": {"$in": urls}})}

            for d in data:
                if d["url"] not in existing_urls:
                    search_data.insert_one(d)
        else:
            logger.debug("No data to save")
|
src/introlix_api/engine/graphql.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
|
| 3 |
+
def fetch_hashnode_posts(page=1, per_page=10, tag=''):
    """
    Fetch one page of posts from the Hashnode GraphQL API (blog.developerdao.com).

    Only the 'bitcoin' and 'web3' tags are served from this source.

    Args:
        page: 1-based page number; earlier pages are walked via cursors to reach it.
        per_page: number of posts per page.
        tag: feed tag being requested.

    Returns:
        list[dict]: post nodes for the requested page, or [] for other tags /
        on API errors. (The original returned None for non-matching tags — and
        only after fetching every page over the network; both callers only
        truth-test the result, so [] is compatible.)
    """
    # Gate before doing any network work for tags this source doesn't serve.
    if tag != 'bitcoin' and tag != 'web3':
        return []

    all_posts = []
    has_next_page = True
    end_cursor = None
    posts_per_page = per_page  # Number of posts per page

    # Calculate the number of posts to skip based on the requested page
    skip_count = (page - 1) * posts_per_page

    while has_next_page:
        # Construct the GraphQL query with the specified number of posts per page
        query = {
            "query": f"""
            query Publication {{
                publication(host: "blog.developerdao.com") {{
                    title
                    posts(first: {posts_per_page}, after: {f'"{end_cursor}"' if end_cursor else 'null'}) {{
                        edges {{
                            node {{
                                title
                                brief
                                url
                                publishedAt
                                tags {{
                                    id
                                    name
                                }}
                                coverImage {{
                                    url
                                }}
                            }}
                        }}
                        pageInfo {{
                            endCursor
                            hasNextPage
                        }}
                    }}
                }}
            }}"""
        }

        # Make the POST request to the Hashnode GraphQL endpoint
        response = requests.post("https://gql.hashnode.com/", json=query)

        # Check for request success
        if response.status_code == 200:
            data = response.json()
            posts = data['data']['publication']['posts']['edges']

            # Append fetched posts to the all_posts list
            all_posts.extend([edge['node'] for edge in posts])

            # Update pagination info
            page_info = data['data']['publication']['posts']['pageInfo']
            end_cursor = page_info['endCursor']
            has_next_page = page_info['hasNextPage']

            # Stop if we've fetched enough posts
            if len(all_posts) >= skip_count + posts_per_page:
                break
        else:
            print(f"Error: {response.status_code} - {response.text}")
            break

    # Return only the posts for the requested page
    return all_posts[skip_count:skip_count + posts_per_page]
|
src/introlix_api/engine/third_party_apis.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import requests
|
| 4 |
+
from introlix_api.logger import logger
|
| 5 |
+
from introlix_api.exception import CustomException
|
| 6 |
+
|
| 7 |
+
# Define the URL of the API endpoint
|
| 8 |
+
DEV_DOT_TO_API = "https://dev.to/api/articles?tag={}&page={}&per_page={}"
|
| 9 |
+
|
| 10 |
+
def get_devDotTo_data(page: int = 1, per_page: int = 10, tag: str = '') -> list:
    """
    Fetch articles from the dev.to REST API.

    Args:
        page: 1-based page number.
        per_page: number of articles per page.
        tag: dev.to tag to filter by ('' means no tag filter).
        (Annotation fixed: the original declared `tag: int` and `-> dict`.)

    Returns:
        list[dict]: normalized articles (title, description, url, tags, image,
        created_at, type); empty list when the API call fails.

    Raises:
        CustomException: on any unexpected error while parsing the response.
    """
    try:
        # Construct the URL with the provided parameters
        url = DEV_DOT_TO_API.format(tag, page, per_page)
        response = requests.get(url)

        if response.status_code != 200:
            logger.debug(f"Failed to fetch data from dev.to: {response.status_code}")
            # Bug fix: the original fell through and tried to parse the error
            # body as article JSON; report "no data" instead.
            return []

        # Convert the response to JSON
        articles = response.json()

        extracted_articles = [
            {
                "title": article["title"],
                "description": article["description"],
                "url": article["url"],
                "tags": article["tag_list"],
                "image": article["cover_image"],
                "created_at": article["created_at"],
                "type": "article"
            }
            for article in articles
        ]

        return extracted_articles

    except Exception as e:
        raise CustomException(e, sys) from e
|
| 42 |
+
|
| 43 |
+
def get_github_repo(page: int = 1, per_page: int = 10, tag: str = ''):
    """
    Fetch repositories for a topic from the GitHub search API, sorted by stars.

    Args:
        page: 1-based page number.
        per_page: number of repositories per page.
        tag: GitHub topic to search for. (Annotation fixed: was `tag: int`.)

    Returns:
        list[dict]: normalized repos (name, description, url, stars,
        created_at, type); empty list when the API call fails.

    Raises:
        CustomException: on any unexpected error while parsing the response.
    """
    try:
        # Construct the URL with the provided parameters
        url = f"https://api.github.com/search/repositories?q=topic:{tag}&sort=stars&page={page}&per_page={per_page}"
        response = requests.get(url)

        if response.status_code != 200:
            logger.debug(f"Failed to fetch data from GitHub: {response.status_code}")
            # Bug fix: the original fell through; on failure the error body has
            # no "items" key and would raise KeyError below.
            return []

        # Convert the response to JSON
        repos = response.json()

        extracted_repos = [
            {
                "name": repo["name"],
                "description": repo["description"],
                "url": repo["html_url"],
                "stars": repo["stargazers_count"],
                "created_at": repo["created_at"],
                "type": "article"
            }
            for repo in repos["items"]
        ]

        return extracted_repos

    except Exception as e:
        raise CustomException(e, sys) from e
|
| 74 |
+
|
| 75 |
+
def get_stack_overflow_data(page: int = 1, per_page: int = 10, tag: str = ''):
    """
    Fetch recently active questions for a tag from the Stack Exchange API.

    Args:
        page (int): result page to fetch (1-based).
        per_page (int): number of questions per page.
        tag (str): Stack Overflow tag to filter by.

    Returns:
        list[dict]: one dict per question with title, url, tags,
        created_at, answer_count and a constant "type" field.

    Raises:
        CustomException: wraps any request or parsing failure.
    """
    try:
        # Construct the URL with the provided parameters
        url = f"https://api.stackexchange.com/2.3/questions?order=desc&sort=activity&tagged={tag}&site=stackoverflow&page={page}&pagesize={per_page}"
        response = requests.get(url)

        if response.status_code != 200:
            logger.debug(f"Failed to fetch data from Stack Overflow: {response.status_code}")
            # Previously execution fell through and tried to parse the error
            # body as results; raise so the failure is reported via
            # CustomException instead of a confusing KeyError downstream.
            response.raise_for_status()

        # Convert the response to JSON
        questions = response.json()

        extracted_questions = [
            {
                "title": question["title"],
                "url": question["link"],
                "tags": question["tags"],
                # creation_date is a Unix epoch timestamp in the API payload.
                "created_at": question["creation_date"],
                "answer_count": question["answer_count"],
                "type": "discussion"
            }
            for question in questions["items"]
        ]

        return extracted_questions

    except Exception as e:
        raise CustomException(e, sys) from e
|
| 106 |
+
|
| 107 |
+
if __name__ == "__main__":
    # Manual smoke test: dump one page of Python-tagged questions.
    sample = get_stack_overflow_data(page=1, per_page=10, tag='python')
    print(sample)
|
src/introlix_api/engine/youtube.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import httpx
|
| 3 |
+
import asyncio
|
| 4 |
+
import time
|
| 5 |
+
from introlix_api.utils.tags import fetch_tags
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
from cachetools import TTLCache
|
| 8 |
+
|
| 9 |
+
load_dotenv()
|
| 10 |
+
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
|
| 11 |
+
|
| 12 |
+
# Cache with TTL of 6 hours (21600 seconds)
|
| 13 |
+
cache = TTLCache(maxsize=100, ttl=21600)
|
| 14 |
+
|
| 15 |
+
async def get_youtube_videos():
    """
    Fetch the most-viewed YouTube videos for every configured tag.

    Per-tag results are cached for 6 hours via the module-level TTLCache
    so repeated calls do not burn API quota.

    Returns:
        list: raw YouTube Data API "search.list" JSON responses, one per tag
        (cached entries are reused as-is).
    """
    url = "https://www.googleapis.com/youtube/v3/search"
    videos = []

    # Reuse a single HTTP client (one connection pool) for all tags instead
    # of constructing and tearing down a new AsyncClient per iteration.
    async with httpx.AsyncClient() as client:
        for tag in fetch_tags():
            if tag in cache:
                videos.append(cache[tag])  # Use cached data
                continue

            params = {
                "key": YOUTUBE_API_KEY,
                "part": "snippet",
                "q": tag,
                "type": "video",
                "maxResults": 5,
                "order": "viewCount"
            }

            try:
                response = await client.get(url, params=params)
                response.raise_for_status()
                result = response.json()
                videos.append(result)
                cache[tag] = result  # Cache the result
            except httpx.HTTPStatusError as e:
                print(f"HTTP error for tag '{tag}': {e}")
                await asyncio.sleep(1)  # brief back-off after an API error
            except Exception as e:
                print(f"Unexpected error: {e}")

            await asyncio.sleep(0.5)  # throttle between uncached requests

    return videos
|
| 49 |
+
|
| 50 |
+
async def main():
    """Ad-hoc driver: fetch videos for every tag and print the raw payload."""
    data = await get_youtube_videos()
    print(data)

if __name__ == "__main__":
    # Guard the event-loop entry point: previously asyncio.run() executed at
    # import time, so merely importing this module triggered a full
    # network-bound YouTube crawl (and would crash inside a running loop).
    asyncio.run(main())
|
src/introlix_api/exception/__init__.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
from introlix_api.logger import logger
|
| 4 |
+
|
| 5 |
+
def error_message_detail(error, error_detail):
    """
    Build a human-readable error message (source file, line number, error
    text) from the active exception, log it, and return it.

    Args:
        error: the caught exception or error message.
        error_detail: module exposing ``exc_info()`` (typically ``sys``).

    Returns:
        str: the formatted error message.
    """
    _, _, tb = error_detail.exc_info()
    frame_file = tb.tb_frame.f_code.co_filename
    frame_line = tb.tb_lineno
    # The format string is program output and is kept exactly as before.
    message = "Error occured in file called [{0}] line number: [{1}] error message: [{2}]".format(
        frame_file, frame_line, str(error)
    )
    logger.info(message)
    return message
|
| 26 |
+
|
| 27 |
+
class CustomException(Exception):
    """Exception carrying a file/line-annotated, pre-logged error message."""

    def __init__(self, error_message, error_detail):
        super().__init__(error_message)
        # Enrich the message with file/line info from the active traceback
        # (this also logs it as a side effect of error_message_detail).
        self.error_message = error_message_detail(
            error_message, error_detail=error_detail
        )

    def __str__(self):
        # Show the enriched message wherever the exception is rendered.
        return self.error_message
|
| 34 |
+
|
src/introlix_api/logger/__init__.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
import os
# NOTE(review): datetime appears unused in this module — confirm nothing
# imports it from introlix_api.logger before removing.
from datetime import datetime

"""
Configures a single shared logger that appends every record to
logs/running_logs.log under the current working directory.
"""

LOG_FILE = "running_logs.log"  # plain literal; the f-prefix was spurious
logs_path = os.path.join(os.getcwd(), "logs")
os.makedirs(logs_path, exist_ok=True)  # create logs/ on first import


LOG_FILE_PATH = os.path.join(logs_path, LOG_FILE)

logging.basicConfig(
    filename=LOG_FILE_PATH,
    format="[ %(asctime)s ] %(lineno)d %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Import this logger everywhere instead of calling logging.getLogger ad hoc.
logger = logging.getLogger("introlixLogger")
|
src/introlix_api/ml/__init__.py
ADDED
|
File without changes
|
src/introlix_api/ml/model.py
ADDED
|
File without changes
|
src/introlix_api/ml/recommendation.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import numpy as np
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 6 |
+
from introlix_api.exception import CustomException
|
| 7 |
+
from introlix_api.logger import logger
|
| 8 |
+
from introlix_api.app.appwrite import get_interests
|
| 9 |
+
|
| 10 |
+
class Recommendation:
    def __init__(self, user_interests: list, articles: list):
        """
        Recommendation system for articles using sentence-transformers and cosine similarity

        Args:
            user_interests (list): list of user interests
            articles (list): list of all articles

        NOTE(review): the ``user_interests`` constructor argument is assigned
        and then immediately overwritten below with data from
        ``get_interests()`` — confirm whether the parameter is still needed.
        """
        self.user_interests = user_interests
        self.articles = articles
        self.recommendations = []
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        # Fetch interest records from Appwrite; each record appears to carry
        # an 'interest' string and a 'keywords' list — TODO confirm schema.
        self.response = get_interests()
        self.user_interests = [interest['interest'] for interest in self.response]
        # Maps the part after ':' in the interest string to its keywords;
        # assumes every interest string contains a ':' (raises IndexError
        # otherwise) — TODO confirm the "prefix:name" format upstream.
        self.interest_keywords = {item['interest'].split(':')[1]: item['keywords'] for item in self.response}

    def encode(self, texts: list):
        """
        Function to encode text into embeddings using sentence-transformers

        Args:
            texts (list): list of text to be encoded
        Returns:
            encoded embedding values
        """
        try:
            return self.model.encode(texts)
        except Exception as e:
            raise CustomException(e, sys)

    def recommend(self):
        """
        Function to recommend aritcles based on user interests

        Args:
            None
        Returns:
            list of recommended articles (all input articles, sorted by
            descending cosine similarity to the averaged interest embedding)
        """

        # Initialize new interests
        new_interests = self.user_interests.copy()  # Start with the old
        # Keep only the part before ':' of each interest string.
        new_interests = [item.split(':')[0] for item in new_interests]


        # Adding keywords to user interests based on existing interests
        # NOTE(review): lookups use the full "prefix:name" string while
        # interest_keywords is keyed by the part AFTER ':' — these may never
        # match; verify which key format is intended.
        for interest in self.user_interests:
            if interest in self.interest_keywords:
                # Append related keywords to new_interests
                new_interests.extend(self.interest_keywords[interest])

        # Remove duplicates if needed (set() does not preserve order)
        new_interests = list(set(new_interests))


        # encoding user interests into embeddings
        # print(f"Here is user interest keywords: {self.interest_keywords}")
        user_interests_embeddings = self.encode(new_interests)
        user_interests_embeddings = np.mean(user_interests_embeddings, axis=0)  # Averaging embeddings

        # Reshape user embedding to (1, -1) for compatibility with cosine_similarity
        user_interests_embeddings = user_interests_embeddings.reshape(1, -1)

        # encoding all articles into embeddings
        article_embeddings = self.encode(self.articles)

        # print(f"Shape of user_interests_embeddings: {user_interests_embeddings.shape}")
        # print(f"Shape of article_embeddings: {article_embeddings.shape}")

        # calculate cosine similarity between user interests and all article embeddings
        similarities = cosine_similarity(user_interests_embeddings, article_embeddings).flatten()

        # sort articles based on similarity (descending)
        recommended_indices = np.argsort(similarities)[::-1]

        # Get all recommended articles sorted by similarity
        recommended_articles = [self.articles[i] for i in recommended_indices]

        return recommended_articles
|
src/introlix_api/pipeline/__init__.py
ADDED
|
File without changes
|
src/introlix_api/pipeline/common_pipeline.py
ADDED
|
File without changes
|
src/introlix_api/pipeline/periodic_pipeline.py
ADDED
|
File without changes
|
src/introlix_api/utils/__init__.py
ADDED
|
File without changes
|