satyam998 commited on
Commit
79d285f
·
0 Parent(s):

Initial commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +0 -0
  2. .github/workflows/ci.yml +34 -0
  3. .gitignore +162 -0
  4. .idx/dev.nix +55 -0
  5. Dockerfile +19 -0
  6. README.md +12 -0
  7. app.py +205 -0
  8. crawler.py +222 -0
  9. demo.py +38 -0
  10. docs/design.md +0 -0
  11. docs/requirements.md +0 -0
  12. main.py +40 -0
  13. requirements-dev.txt +0 -0
  14. requirements.txt +22 -0
  15. setup.py +18 -0
  16. src/introlix_api/app/__init__.py +0 -0
  17. src/introlix_api/app/algolia.py +82 -0
  18. src/introlix_api/app/appwrite.py +179 -0
  19. src/introlix_api/app/database.py +23 -0
  20. src/introlix_api/app/introlix_spider/introlix_spider/__init__.py +0 -0
  21. src/introlix_api/app/introlix_spider/introlix_spider/items.py +12 -0
  22. src/introlix_api/app/introlix_spider/introlix_spider/middlewares.py +103 -0
  23. src/introlix_api/app/introlix_spider/introlix_spider/pipelines.py +13 -0
  24. src/introlix_api/app/introlix_spider/introlix_spider/settings.py +100 -0
  25. src/introlix_api/app/introlix_spider/introlix_spider/spiders/__init__.py +4 -0
  26. src/introlix_api/app/introlix_spider/introlix_spider/spiders/generic.py +286 -0
  27. src/introlix_api/app/introlix_spider/scrapy.cfg +11 -0
  28. src/introlix_api/app/model.py +37 -0
  29. src/introlix_api/app/routes/__init__.py +0 -0
  30. src/introlix_api/app/routes/auth.py +109 -0
  31. src/introlix_api/app/routes/posts.py +208 -0
  32. src/introlix_api/app/routes/run_spider.py +23 -0
  33. src/introlix_api/app/routes/similarity.py +83 -0
  34. src/introlix_api/crawler/__init__.py +0 -0
  35. src/introlix_api/crawler/bot.py +390 -0
  36. src/introlix_api/engine/__init__.py +0 -0
  37. src/introlix_api/engine/api_data.py +101 -0
  38. src/introlix_api/engine/discussion.py +41 -0
  39. src/introlix_api/engine/graphql.py +69 -0
  40. src/introlix_api/engine/third_party_apis.py +108 -0
  41. src/introlix_api/engine/youtube.py +54 -0
  42. src/introlix_api/exception/__init__.py +34 -0
  43. src/introlix_api/logger/__init__.py +22 -0
  44. src/introlix_api/ml/__init__.py +0 -0
  45. src/introlix_api/ml/model.py +0 -0
  46. src/introlix_api/ml/recommendation.py +89 -0
  47. src/introlix_api/pipeline/__init__.py +0 -0
  48. src/introlix_api/pipeline/common_pipeline.py +0 -0
  49. src/introlix_api/pipeline/periodic_pipeline.py +0 -0
  50. src/introlix_api/utils/__init__.py +0 -0
.dockerignore ADDED
File without changes
.github/workflows/ci.yml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face hub
2
+ on:
3
+ push:
4
+ branches: [main]
5
+
6
+ # To run this workflow manually from the Actions tab
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ sync-to-hub:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v3
14
+ with:
15
+ fetch-depth: 0
16
+ lfs: true
17
+
18
+ - name: Set up Git user
19
+ run: |
20
+ git config --global user.email "tubex998@gmail.com"
21
+ git config --global user.name "satyam998"
22
+
23
+ - name: Create a new branch
24
+ run: |
25
+ git checkout --orphan temp
26
+ git add -A
27
+ git commit -m "Initial commit"
28
+ git branch -D main
29
+ git branch -m main
30
+
31
+ - name: Force push to hub
32
+ env:
33
+ HF: ${{ secrets.HF_TOKEN }}
34
+ run: git push --force https://satyam998:$HF@huggingface.co/spaces/satyam998/introlix_api main
.gitignore ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
.idx/dev.nix ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # To learn more about how to use Nix to configure your environment
2
+ # see: https://developers.google.com/idx/guides/customize-idx-env
3
+ { pkgs, ... }: {
4
+ # Which nixpkgs channel to use.
5
+ channel = "stable-23.11"; # or "unstable"
6
+
7
+ # Use https://search.nixos.org/packages to find packages
8
+ packages = [
9
+ # pkgs.go
10
+ pkgs.python311
11
+ pkgs.python311Packages.pip
12
+ # pkgs.nodejs_20
13
+ # pkgs.nodePackages.nodemon
14
+ ];
15
+
16
+ # Sets environment variables in the workspace
17
+ env = {};
18
+ idx = {
19
+ # Search for the extensions you want on https://open-vsx.org/ and use "publisher.id"
20
+ extensions = [
21
+ # "vscodevim.vim"
22
+ ];
23
+
24
+ # Enable previews
25
+ previews = {
26
+ enable = true;
27
+ previews = {
28
+ # web = {
29
+ # # Example: run "npm run dev" with PORT set to IDX's defined port for previews,
30
+ # # and show it in IDX's web preview panel
31
+ # command = ["npm" "run" "dev"];
32
+ # manager = "web";
33
+ # env = {
34
+ # # Environment variables to set for your server
35
+ # PORT = "$PORT";
36
+ # };
37
+ # };
38
+ };
39
+ };
40
+
41
+ # Workspace lifecycle hooks
42
+ workspace = {
43
+ # Runs when a workspace is first created
44
+ onCreate = {
45
+ # Example: install JS dependencies from NPM
46
+ # npm-install = "npm install";
47
+ };
48
+ # Runs when the workspace is (re)started
49
+ onStart = {
50
+ # Example: start a background task to watch and re-build backend code
51
+ # watch-backend = "npm run watch-backend";
52
+ };
53
+ };
54
+ };
55
+ }
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10
2
+
3
+ RUN useradd -m -u 1000 user
4
+
5
+ WORKDIR /app
6
+
7
+ COPY --chown=user . /app
8
+
9
+ RUN pip install -r requirements.txt
10
+
11
+ RUN mkdir -p /app/logs
12
+ RUN chmod 777 /app/logs
13
+
14
+ # Copy the shell script into the container
15
+ COPY start.sh /app/start.sh
16
+ RUN chmod +x /app/start.sh
17
+
18
+ # Use the shell script to start both processes
19
+ CMD ["/bin/bash", "/app/start.sh"]
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Introlix API
3
+ emoji: 🔥
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: docker
7
+ pinned: false
8
+ license: apache-2.0
9
+ ---
10
+
11
+ # Introlix API
12
+ <p>Introlix API offers a comprehensive suite of tools and APIs utilized in Introlix Feed. It is an advanced API that integrates multiple external APIs, RSS feed crawlers, and other data sources to provide a robust and efficient backend service.</p>
app.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, Query, HTTPException
2
+ from bson import ObjectId
3
+ import sys
4
+ import httpx
5
+ import os
6
+ import crawler
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from starlette.responses import RedirectResponse
9
+ from introlix_api.app.routes import auth, posts, run_spider, similarity
10
+ from typing import List
11
+ from dotenv import load_dotenv, dotenv_values
12
+
13
+ from introlix_api.app.appwrite import databases, APPWRITE_DATABASE_ID, ID, APPWRITE_ACCOUNT_COLLECTION_ID, get_interests
14
+ from introlix_api.app.database import startup_db_client, shutdown_db_client
15
+ from introlix_api.ml.recommendation import Recommendation
16
+ from introlix_api.utils.tags import fetch_tags
17
+
18
+ from introlix_api.exception import CustomException
19
+
20
+ from contextlib import asynccontextmanager
21
+
22
+ from pydantic import BaseModel, Field
23
+
24
+ load_dotenv()
25
+
26
+ YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
27
+
28
class FeedModel(BaseModel):
    """Response schema for a single feed item.

    MongoDB's ``_id`` is exposed to clients as ``id`` via the field alias.
    """

    # populated from the Mongo document's "_id" (stringified by the endpoint)
    id: str = Field(..., alias="_id")
    title: str             # article headline
    desc: str              # short description / summary text
    url: str               # link to the original article
    publication_date: str  # stored as a plain string, not a datetime
    image_url: str         # preview image, may be a placeholder string
    category: str          # feed category the item belongs to
    source: str            # originating site / data source name
37
+
38
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook.

    Opens the MongoDB connection before the app starts serving and
    closes it again when the app shuts down.
    """
    await startup_db_client(app)   # attach DB client to the app instance
    yield                          # application serves requests here
    await shutdown_db_client(app)  # release the DB connection
45
+
46
app = FastAPI(lifespan=lifespan)

# NOTE: browsers send the Origin header WITHOUT a trailing slash, so entries
# like "https://introlixfeed.vercel.app/" can never match and were effectively
# dead. Trailing slashes removed so the allow-list actually works.
origins = [
    "http://localhost:3000",
    "http://192.168.1.64:3000",
    "https://introlixfeed.vercel.app",
    "https://introlixfeed.vercel.com",
    # Add other allowed origins here if needed
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,       # only these origins may make credentialed requests
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
63
+
64
@app.get("/", tags=["authentication"])
async def index():
    """Redirect the bare root URL to the interactive API docs."""
    docs_redirect = RedirectResponse(url='/docs')
    return docs_redirect
67
+
68
@app.get("/feed_data", response_model=List[FeedModel])
async def get_feed_data(page: int = 1, limit: int = 20, user_id: str = Query(...), category=None):
    """Return a page of personalised feed items.

    Args:
        page: 1-based page number.
        limit: maximum number of items per page.
        user_id: Appwrite account document id; if found, that user's
            interests override the global interest list.
        category: optional explicit category filter; when given, it
            replaces interest-based filtering entirely.

    Raises:
        CustomException: wraps any error raised while building the feed.
    """
    try:
        skip = (page - 1) * limit

        # Default interests: the global interest list (interest names only).
        response = get_interests()
        user_interests = [interest['interest'] for interest in response]

        users = databases.list_documents(
            database_id=APPWRITE_DATABASE_ID,
            collection_id=APPWRITE_ACCOUNT_COLLECTION_ID
        )

        # If this user has an account document, their own interests win.
        for doc in users['documents']:
            if user_id == doc['$id']:
                user_interests = doc['interests']

        # Interests are stored as "name:keywords"; keep only the name part.
        user_interests = [item.split(':')[0] for item in user_interests]

        # Fixed `category == None` -> `is None` (PEP 8 identity comparison).
        if category is None:
            response = await app.mongodb['feedData'].find({"category": {"$in": user_interests}}).skip(skip).limit(limit).to_list(limit)
        else:
            response = await app.mongodb['feedData'].find({"category": category}).skip(skip).limit(limit).to_list(limit)

        # Drop items missing a title or a description.
        response = [item for item in response if item.get('title') and item.get('desc')]

        # Rank the remaining items against the user's interests.
        article_titles = [item['title'] for item in response]
        recommendation_system = Recommendation(user_interests, article_titles)
        recommended_titles = recommendation_system.recommend()
        response = [post for post in response if post['title'] in recommended_titles]

        # Normalise fields so every item serialises cleanly through FeedModel.
        for item in response:
            item['_id'] = str(item['_id'])
            for field in ('title', 'desc', 'url', 'publication_date',
                          'image_url', 'category', 'source'):
                item[field] = item.get(field) or ''

        return response
    except Exception as e:
        raise CustomException(e, sys) from e
125
+
126
@app.get("/fetch_post", response_model=FeedModel)
async def get_post(post_id: str = Query(...)):
    """Fetch a single feed post by its MongoDB id.

    Renamed from ``get_feed_data`` — the original name redefined (and
    shadowed) the /feed_data handler at module level (ruff/flake8 F811).
    The route path and query parameter are unchanged, so HTTP clients
    are unaffected.

    Raises:
        HTTPException: 404 when no post matches ``post_id``.
        CustomException: wraps any other failure.
    """
    try:
        oid = ObjectId(post_id)
        response = await app.mongodb['feedData'].find_one({"_id": oid})

        if not response:
            raise HTTPException(status_code=404, detail="Post not found")

        # Convert _id to string for JSON serialisation.
        response["_id"] = str(response["_id"])

        # Fill null fields with display defaults; desc is truncated to 90 chars.
        response["desc"] = (response.get("desc") or "No Description")[:90]
        response["publication_date"] = response.get("publication_date") or "Unknown Date"
        response["image_url"] = response.get("image_url") or "No Image URL"
        response["category"] = response.get("category") or "Uncategorized"
        response["source"] = response.get("source") or "Unknown Source"

        return response
    except HTTPException:
        # Let the intended 404 pass through; previously the broad handler
        # below swallowed it and re-raised it as a CustomException.
        raise
    except Exception as e:
        raise CustomException(e, sys) from e
157
+
158
@app.get("/test_recommendation")
async def test_recommendation(
    user_interests: list[str] = Query(..., description="Comma-separated list of user interests"),
    articles: list[str] = Query(..., description="Comma-separated list of articles")
):
    """
    Test endpoint for recommendations.
    Takes user interests and articles as query parameters and returns recommended articles.
    """
    # Run the recommender over the ad-hoc inputs supplied in the query string.
    recommender = Recommendation(user_interests, articles)
    recommended = recommender.recommend()

    return {
        "user_interests": user_interests,
        "recommended_articles": recommended,
    }
178
+
179
@app.get("/youtube/videos")
async def get_youtube_videos(query: str = None):
    """Proxy a search against the YouTube Data API v3 and return its JSON."""
    search_url = "https://www.googleapis.com/youtube/v3/search"
    search_params = {
        "key": YOUTUBE_API_KEY,
        "part": "snippet",
        "q": query or "trending",      # default topic when no query is given
        "type": "video",
        "maxResults": 10,
        # 'viewCount' surfaces popular videos; change to 'date' for recent uploads
        "order": "viewCount",
    }

    async with httpx.AsyncClient() as client:
        api_response = await client.get(search_url, params=search_params)
        api_response.raise_for_status()  # propagate 4xx/5xx from the API
        return api_response.json()
195
+
196
@app.get("/tags")
async def get_tags():
    """Return the list of known tags."""
    return fetch_tags()
200
+
201
+ app.include_router(auth.router, prefix="/auth")
202
+ app.include_router(run_spider.router, prefix="/spider")
203
+ app.include_router(similarity.router, prefix="/feed")
204
+ app.include_router(crawler.router)
205
+ app.include_router(posts.router)
crawler.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import sys
3
+ import time
4
+ from urllib.parse import urlparse
5
+ from fastapi import APIRouter, HTTPException, Query
6
+ from introlix_api.crawler.bot import IntrolixBot, BotArgs
7
+ from introlix_api.exception import CustomException
8
+ from introlix_api.logger import logger
9
+ from introlix_api.utils.root_sites import root_sites
10
+ from introlix_api.app.database import search_data, db
11
+ from introlix_api.app.appwrite import fetch_root_sites, fetch_saved_urls, save_urls
12
+ from pymongo import ASCENDING
13
+ from pymongo.errors import DuplicateKeyError
14
+
15
+ router = APIRouter()
16
+
17
+ BATCH_SIZE = 10
18
+ urls_batch = []
19
+ storage_threshold = 500 * 1024 * 1024
20
+ delete_batch = 1000
21
+
22
def filter_urls(url: str) -> bool:
    """
    A function to filter non article urls from the scraped urls

    Args:
        url (str): url to classify
    Returns:
        bool: True if the url looks like an article url else False
    """
    parsed_url = urlparse(url)

    # Bare domain / homepage is never an article.
    if parsed_url.path in ('', '/'):
        return False

    non_article_keywords = [
        "/product", "/products", "/home", "/item", "/items", "/category", "/categories",
        "/login", "/signin", "/logout", "/signup", "/register", "/account", "/user",
        "/profile", "/dashboard", "/settings", "/preferences", "/order", "/orders",
        "/cart", "/checkout", "/payment", "/subscribe", "/subscription",
        "/contact", "/support", "/help", "/faq", "/about", "/privacy", "/terms",
        "/policy", "/conditions", "/legal", "/service", "/services", "/guide",
        "/how-to", "/pricing", "/price", "fees", "/plans", "/features", "/partners",
        "/team", "/careers", "/jobs", "/join", "/apply", "/training", "/demo",
        "/trial", "/download", "/install", "/app", "/apps", "/software", "/portal",
        "/index", "/main", "/video", "/videos", "/photo", "/photos",
        "/image", "/images", "/gallery", "/portfolio", "/showcase", "/testimonials",
        "/reviews", "/search", "/find", "/browse", "/list", "/tags", "/explore",
        "/new", "/trending", "/latest", "/promotions", "/offers", "/deals", "/discount",
        "/coupon", "/coupons", "/gift", "/store", "/stores", "/locator", "/locations",
        "/branches", "/events", "/webinar", "/calendar", "/schedule",
        "/class", "/classes", "/lesson", "/lessons", "/training", "/activity",
        "/activities", "/workshop", "/exhibit", "/performance", "/map", "/directions",
        "/weather", "/traffic", "/rates", "/auction", "/bid", "/tender", "/investment",
        "/loan", "/mortgage", "/property", "/real-estate", "/construction", "/project",
        "/client", "/clients", "/partner", "/sponsor", "/media", "/press", "/releases",
        "/announcements", "/newsroom", "/resources", "courses", "collections", "/u/", "/members/",
        "/@", "/shop", "/wiki", "/author", "/dynamic", "/image", "/submit"  # TODO: need to add more
    ]

    article_keywords = [
        "/blog/", "post", "article", "insights", "guide", "tutorial",
        "how-to", "what", "how", "introduction", "/news/"
    ]

    # BUGFIX: a missing comma after the 6th pattern caused implicit string
    # concatenation of patterns 6 and 7 into one broken regex.
    # NOTE(review): several alternations below end with '|' and therefore also
    # match the empty string, making those patterns very permissive — kept
    # as-is to preserve behaviour; worth tightening separately.
    article_pattern = [
        r'/(/blog/|article|articles|post|posts|blogs|news|)/\d{4}/\d{2}/+[a-z0-9-]+/?',
        r'/(/blog/|article|articles|post|posts|blogs|news|)/[a-z0-9-]+/[a-z0-9-]+',
        r'(?<!\/\/www)(/blog/|article|articles|post|posts|blogs|news|)/[a-z0-9-]+',
        r'^(?!.*\/category\/).*\/[a-z0-9-]+\/[a-z0-9-]+(-[a-z0-9-]+)+$',
        r'/[^/]+/\d{4}/\d{2}/\d{2}/+[a-z0-9]+/?',
        r'/[^/]+/\d{4}/\d{2}/+[a-z0-9]+/?',
        r'/[a-z0-9-]+/\d{4}/\d{2}/+/?',
        r'/[a-z0-9-]+/\d{4}/\d{2}/\d{2}/+/?'
    ]

    # Structural match (date / blog-style paths), vetoed by blacklist keywords.
    for pattern in article_pattern:
        if re.search(pattern, url):
            if not any(keyword in url for keyword in non_article_keywords):
                return True

    # Keyword whitelist match.
    if any(keyword in url for keyword in article_keywords):
        return True

    # Heuristic: slug-like last path segment ("some-long-title") => article.
    last_segment = parsed_url.path.strip('/').split('/')[-1]
    if '-' in last_segment and len(last_segment.split('-')) > 2:
        return True

    return False
89
+
90
def save_to_db(data):
    """Persist scraped pages into the `search_data` collection.

    Evicts the oldest documents when the collection exceeds the storage
    threshold, skips URLs already stored or that fail `filter_urls`, and
    flushes any URLs queued in the module-level `urls_batch` to Appwrite.

    Raises:
        CustomException: wraps any unexpected failure.
    """
    global urls_batch
    try:
        # Evict the oldest documents once storage grows past the cap.
        stats = db.command("collStats", "search_data")
        if stats['size'] >= storage_threshold:
            oldest = search_data.find().sort("createdAt", ASCENDING).limit(delete_batch)
            stale_ids = [doc['_id'] for doc in oldest]
            search_data.delete_many({"_id": {"$in": stale_ids}})

        # Candidate article URLs from this batch.
        candidate_urls = [entry["url"] for entry in data if filter_urls(entry["url"])]

        # URLs already present in the database (to skip duplicates).
        existing_urls = set(search_data.find({"url": {"$in": candidate_urls}}).distinct("url"))

        fresh_docs = [
            {"url": entry["url"], "content": entry["content"], "type": "article"}
            for entry in data
            if entry["url"] not in existing_urls and entry.get("content") is not None
        ]

        if fresh_docs:
            try:
                search_data.insert_many(fresh_docs)
            except DuplicateKeyError:
                logger.info("Duplicate URL detected during insertion. Skipping duplicate entries.")

        # Flush any URLs queued for Appwrite, then clear the queue.
        if urls_batch:
            try:
                save_urls(urls_batch)
            except Exception as e:
                logger.error(f"Error saving URLs to Appwrite: {str(e)}")
            urls_batch.clear()

    except Exception as e:
        raise CustomException(e, sys) from e
132
+
133
def extract_urls(batch_size=BATCH_SIZE):
    """Yield URLs harvested from stored page content, in fixed-size batches.

    Only projects `content.links` from each document to keep the per-document
    memory footprint small; batching bounds overall memory usage.

    Args:
        batch_size: maximum number of URLs per yielded list.

    Yields:
        list[str]: up to `batch_size` URLs; the final batch may be shorter.
    """
    documents = search_data.find({}, {"content.links": 1})

    pending = []
    for doc in documents:
        # `or []` covers documents with no content or no links field.
        for url in doc.get("content", {}).get("links") or []:
            pending.append(url)
            if len(pending) >= batch_size:
                yield pending
                pending = []  # start a fresh batch

    # Flush whatever is left over.
    if pending:
        yield pending
155
+
156
def crawler(urls_batch):
    """Run IntrolixBot over the given URLs, persisting each scraped batch.

    Args:
        urls_batch: list of seed URLs to crawl.

    Raises:
        CustomException: wraps any failure from the bot or persistence layer.
    """
    try:
        bot = IntrolixBot(urls=urls_batch, args=BotArgs)
        for scraped_batch in bot.scrape_parallel(batch_size=BATCH_SIZE):
            save_to_db(scraped_batch)
    except Exception as e:
        raise CustomException(e, sys) from e
166
+
167
def run_crawler_continuously():
    """Crawl forever in repeated 10-minute sessions.

    Each session pulls seed URLs from Appwrite (root sites + previously
    saved URLs) or falls back to the built-in seed list plus the local
    `urls_batch` queue, crawls them, then queues URLs extracted from
    already-stored pages for later sessions.

    Raises:
        CustomException: wraps any unexpected failure.
    """
    global urls_batch
    try:
        while True:
            session_start = time.time()

            # One 10-minute (600 s) crawling session.
            while (time.time() - session_start) < 600:
                try:
                    root_urls = fetch_root_sites()
                    saved_urls = fetch_saved_urls()
                except Exception as e:
                    logger.info("Error fetching URLs from Appwrite: %s", str(e))
                    root_urls = []
                    saved_urls = []

                if root_urls and saved_urls:
                    # De-duplicate the combined Appwrite seed lists.
                    urls = list(set(root_urls + saved_urls))
                else:
                    # Fallback: built-in seeds plus locally queued URLs.
                    urls = root_sites() + urls_batch

                if urls:
                    logger.info(f"Starting crawler with {len(urls)} root URLs")
                    crawler(urls[::-1])  # newest seeds first

                # Queue URLs discovered inside previously stored pages.
                for extracted_urls in extract_urls(batch_size=BATCH_SIZE):
                    urls_batch.extend(list(set(extracted_urls)))
                    time.sleep(1)

                time.sleep(1)

            # After 10 minutes, the outer loop restarts without any pause.
            logger.info("Restarting the crawler for another 10-minute session.")
    except Exception as e:
        raise CustomException(e, sys) from e
206
+
207
+
208
@router.post('/crawler')
def run_crawler():
    """Kick off the continuous crawler.

    NOTE(review): this blocks the request worker indefinitely because
    `run_crawler_continuously` loops forever — presumably intended to be
    triggered once as a background job; confirm with the deployment setup.
    """
    try:
        run_crawler_continuously()
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
214
+
215
+
216
+ if __name__ == "__main__":
217
+ while True:
218
+ start_time = time.time()
219
+ while (time.time() - start_time) < 600:
220
+ run_crawler_continuously()
221
+ # # urls = extract_urls()
222
+ # # print(urls)
demo.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #import csv
2
+ # from introlix_api.app.database import feed_data
3
+
4
+ # data = feed_data.find({}, {"_id": 0, "title": 1}) # Exclude _id, include only title
5
+
6
+ # # Specify the CSV file to write to
7
+ # csv_file = 'feed_data_titles.csv'
8
+
9
+ # # Write data to a CSV file
10
+ # with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
11
+ # writer = csv.writer(file)
12
+
13
+ # # Write header (just the title field)
14
+ # writer.writerow(["title"])
15
+
16
+ # # Write each document's title to the CSV
17
+ # for document in data:
18
+ # writer.writerow([document.get("title")])
19
+
20
+ # print(f"Title data successfully saved to {csv_file}")
21
+ # from introlix_api.crawler.bot import IntrolixBot, BotArgs
22
+ # import time
23
+
24
+ # start = time.time()
25
+ # inbot = IntrolixBot(args=BotArgs, urls=["https://www.wikipedia.org/", "https://medium.com/", "https://www.bbc.com/"])
26
+
27
+ # print(inbot.crawl(batch_size=1048))
28
+ # # end = time.time()
29
+ # print(f"Time taken: {end - start}")
30
+
31
+ # from introlix_api.app.appwrite import fetch_root_sites
32
+
33
+ # print(len(set(fetch_root_sites())))
34
+ # Access the scraped data
35
+ # for index, page_data in enumerate(inbot.data):
36
+ # print(f"Page {index + 1}:")
37
+ # print(page_data)
38
+ # print('-' * 40)
docs/design.md ADDED
File without changes
docs/requirements.md ADDED
File without changes
main.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+
3
def run_app():
    """Launch the `generic` scrapy spider from its project directory and
    echo the process output."""
    spider_command = ["scrapy", "crawl", "generic"]
    spider_directory = "src/introlix_api/app/introlix_spider"

    # Capture both streams so the spider's logs are visible to the caller.
    result = subprocess.run(
        spider_command,
        cwd=spider_directory,
        capture_output=True,
        text=True,
    )

    print("Output:", result.stdout)
    print("Error:", result.stderr)
11
+
12
+ if __name__ == "__main__":
13
+ # running the spider
14
+ run_app()
15
+
16
+ # def run_get_urls_from_page_parallel(self, urls: list, max_workers: int=10) -> list:
17
+ # """
18
+ # Running get_urls_from_page function in parallel for many runs.
19
+
20
+ # Args:
21
+ # urls (list): list of urls
22
+ # max_workers (int, optional): number of workers. Defaults to 10.
23
+ # Returns:
24
+ # list: list of fetched urls
25
+ # """
26
+ # fetched_urls = []
27
+
28
+ # with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
29
+ # futures = {executor.submit(self.get_urls_from_page, url): url for url in urls}
30
+
31
+ # for future in concurrent.futures.as_completed(futures):
32
+ # url = futures[future]
33
+
34
+ # try:
35
+ # result = future.result()
36
+ # fetched_urls.append(result)
37
+ # except Exception as e:
38
+ # raise CustomException(e, sys) from e
39
+
40
+ # return list(set(list(url for sublist in fetched_urls if sublist is not None for url in sublist)))
requirements-dev.txt ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy
2
+ pandas
3
+ scrapy
4
+ fastapi
5
+ uvicorn
6
+ Jinja2
7
+ appwrite
8
+ python-dotenv
9
+ pymongo
10
+ aiohttp
11
+ motor
12
+ httpx
13
+ torch
14
+ scikit-learn
15
+ beautifulsoup4
16
+ sentence-transformers
17
+ nltk
18
+ algoliasearch
19
+ apscheduler
20
+ cachetools
21
+
22
+ -e .
setup.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from setuptools import setup
import setuptools

# The package's long description comes straight from the project README.
with open("README.md", "r", encoding="utf-8") as f:
    long_description = f.read()

setup(
    name="introlix_api",
    version="0.0.1",
    author="Satyam Mishra",
    author_email="tubex998@gmail.com",
    description="Introlix API offers a comprehensive suite of tools and APIs utilized in Introlix Feed.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    # src-layout: importable packages live under src/
    package_dir={"": "src"},
    packages=setuptools.find_packages(where="src"),
    python_requires=">=3.10",
)
src/introlix_api/app/__init__.py ADDED
File without changes
src/introlix_api/app/algolia.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import asyncio
4
+ from bson import ObjectId
5
+ from apscheduler.schedulers.asyncio import AsyncIOScheduler
6
+ from introlix_api.app.database import search_data
7
+ from algoliasearch.search.client import SearchClientSync
8
+ from dotenv import load_dotenv
9
+
10
load_dotenv()

# Algolia credentials from .env — presumably the application id / admin API
# key pair; confirm against the Algolia dashboard.
ALGOLIA_USER = os.getenv("ALGOLIA_USER")
ALGOLIA_KEY = os.getenv("ALGOLIA_KEY")
INDEX_NAME = "introlix_data"

# Initialize the Algolia client (module-private; used by upload_data below)
_client = SearchClientSync(ALGOLIA_USER, ALGOLIA_KEY)
18
+
19
def convert_object_ids(doc):
    """Recursively convert every ObjectId in *doc* to its string form, in place.

    Walks nested dicts and lists so the document becomes JSON-serializable
    before it is pushed to Algolia. Mutates *doc* and also returns it for
    convenient chaining.

    Args:
        doc (dict): MongoDB document, possibly containing nested ObjectIds.

    Returns:
        dict: the same dict instance with all ObjectIds stringified.
    """
    for key, value in doc.items():
        if isinstance(value, ObjectId):
            doc[key] = str(value)
        elif isinstance(value, dict):
            convert_object_ids(value)  # Recursively convert in nested dicts
        elif isinstance(value, list):
            # Fix: lists may hold ObjectIds (or nested lists) directly, not
            # only dicts — the original version left those unconverted and
            # json.dumps() would fail on them later.
            doc[key] = _convert_id_list(value)
    return doc


def _convert_id_list(items):
    """Stringify ObjectIds inside a list, recursing into nested dicts/lists."""
    converted = []
    for item in items:
        if isinstance(item, ObjectId):
            converted.append(str(item))
        elif isinstance(item, dict):
            converted.append(convert_object_ids(item))
        elif isinstance(item, list):
            converted.append(_convert_id_list(item))
        else:
            converted.append(item)
    return converted
31
+
32
async def upload_data():
    """Push every document from MongoDB to the Algolia index in batches.

    Each record reuses the Mongo ``_id`` as Algolia ``objectID``, so repeated
    uploads update existing records rather than creating duplicates.
    Documents over Algolia's ~10 KB per-record limit are skipped.
    """
    BATCH_SIZE = 1000
    pending = []

    for document in search_data.find():
        # ObjectIds are not JSON-serializable; stringify them first.
        document = convert_object_ids(document)
        document['objectID'] = str(document['_id'])

        # Skip records that would exceed Algolia's 10 KB record limit.
        encoded = json.dumps(document).encode('utf-8')
        if len(encoded) > 10000:
            continue

        pending.append(document)
        if len(pending) >= BATCH_SIZE:
            _client.save_objects(index_name=INDEX_NAME, objects=pending)
            pending = []

    # Flush whatever is left after the last full batch.
    if pending:
        _client.save_objects(index_name=INDEX_NAME, objects=pending)

    print("Uploaded data to Algolia.")
63
+
64
async def main():
    """Run one upload immediately, then re-run upload_data every 4 hours."""
    # Run the upload function immediately
    await upload_data()

    scheduler = AsyncIOScheduler()
    # Schedule `upload_data` to run every 4 hours
    scheduler.add_job(upload_data, 'interval', hours=4)
    scheduler.start()

    print("Scheduler started. Uploading data to Algolia every 4 hours.")

    # Keep the main thread alive forever so scheduled jobs keep firing;
    # shut the scheduler down cleanly on Ctrl-C / interpreter exit.
    try:
        await asyncio.Event().wait()
    except (KeyboardInterrupt, SystemExit):
        scheduler.shutdown()

if __name__ == "__main__":
    asyncio.run(main())
src/introlix_api/app/appwrite.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from appwrite.client import Client
4
+ from appwrite.query import Query
5
+ from appwrite.services.databases import Databases
6
+ from appwrite.id import ID
7
+ from dotenv import load_dotenv, dotenv_values
8
+
9
+ from introlix_api.logger import logger
10
+ from introlix_api.exception import CustomException
11
+ from introlix_api.utils.common import is_valid_url, sanitize_url
12
+
13
+ from pydantic import HttpUrl
14
+
15
load_dotenv()

# Appwrite credentials and collection ids, all sourced from .env.
APPWRITE_PROJECT_ID = os.getenv("APPWRITE_PROJECT_ID")
APPWRITE_API_KEY = os.getenv("APPWRITE_API_KEY")
APPWRITE_DATABASE_ID = os.getenv("APPWRITE_DATABASE_ID")
APPWRITE_ROOTSITES_COLLECTION_ID = os.getenv("APPWRITE_ROOTSITES_COLLECTION_ID")
APPWRITE_SAVED_URLS_COLLECTION_ID = os.getenv("APPWRITE_SAVED_URLS_COLLECTION_ID")
APPWRITE_INTERESTS_TO_PICK_COLLECTION_ID = os.getenv("APPWRITE_INTERESTS_TO_PICK_COLLECTION_ID")
APPWRITE_ACCOUNT_COLLECTION_ID = os.getenv("APPWRITE_ACCOUNT_COLLECTION_ID")

# Shared Appwrite client pointed at the hosted Appwrite Cloud endpoint.
client = Client()
client.set_endpoint('https://cloud.appwrite.io/v1')
client.set_project(APPWRITE_PROJECT_ID)
client.set_key(APPWRITE_API_KEY)

databases = Databases(client)

# models for database
class RootSitesModel:
    # NOTE(review): annotated but not a pydantic BaseModel subclass, so the
    # HttpUrl annotation performs no validation; appears unused in this
    # module — confirm intent.
    url: HttpUrl
35
+
36
+ # fetching the data from appwrite
37
def fetch_root_sites():
    """Return the URL of every root site stored in Appwrite.

    Pages through the root-sites collection 100 documents at a time; a short
    page signals the end of the collection.

    Returns:
        list: root-site URLs.

    Raises:
        CustomException: wrapping any Appwrite/client error.
    """
    try:
        logger.info("Fetching all of the root sites...")
        page_size = 100
        skip = 0
        urls = []

        while True:
            page = databases.list_documents(
                database_id=APPWRITE_DATABASE_ID,
                collection_id=APPWRITE_ROOTSITES_COLLECTION_ID,
                queries=[Query.limit(page_size), Query.offset(skip)],
            )

            urls.extend(doc['url'] for doc in page['documents'])

            # A page shorter than the limit means we've read everything.
            if len(page['documents']) < page_size:
                break
            skip += page_size

        return urls

    except Exception as e:
        raise CustomException(e, sys) from e
65
+
66
def fetch_saved_urls():
    """Return the most recent (up to 4000) crawler-saved URLs from Appwrite.

    Pages through the saved-URLs collection 100 documents at a time, then
    returns only the last 4000 entries so the downstream crawl frontier
    stays bounded. (The original docstring was a copy-paste from
    fetch_root_sites and wrongly described root sites.)

    Returns:
        list: up to 4000 most recently stored URLs.

    Raises:
        CustomException: wrapping any Appwrite/client error.
    """
    try:
        logger.info("Fetching all of the saved urls...")
        page_size = 100
        skip = 0
        saved_urls = []

        while True:
            page = databases.list_documents(
                database_id=APPWRITE_DATABASE_ID,
                collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID,
                queries=[Query.limit(page_size), Query.offset(skip)],
            )

            saved_urls.extend(doc['url'] for doc in page['documents'])

            # A page shorter than the limit means we've read everything.
            if len(page['documents']) < page_size:
                break
            skip += page_size

        # Bound the result to the newest 4000 URLs.
        return saved_urls[-4000:]

    except Exception as e:
        raise CustomException(e, sys) from e
94
+
95
def get_interests():
    """Return the selectable interests (name plus keywords) from Appwrite.

    Returns:
        list[dict]: one {"interest": ..., "keywords": ...} entry per document
        (first 100 documents only).

    Raises:
        CustomException: wrapping any Appwrite/client error.
    """
    try:
        response = databases.list_documents(
            database_id=APPWRITE_DATABASE_ID,
            collection_id=APPWRITE_INTERESTS_TO_PICK_COLLECTION_ID,
            queries=[Query.limit(100), Query.offset(0)],
        )

        interests = []
        for doc in response['documents']:
            interests.append({
                "interest": doc['interest'],
                "keywords": doc['keywords'],
            })
        return interests
    except Exception as e:
        raise CustomException(e, sys) from e
107
+
108
def save_urls(urls):
    """
    Persist new crawler-discovered URLs to Appwrite, skipping duplicates.

    If the collection has grown past 20,000 documents it is emptied first.
    Then every remaining URL is read back into a set so only unseen, valid
    URLs from *urls* are inserted (sanitized).

    Args:
        urls (iterable): candidate URLs to store.

    Raises:
        CustomException: wrapping any Appwrite/client error.
    """
    try:
        # NOTE(review): a page size of 10 means thousands of API round-trips
        # for a near-20k collection — consider raising it; verify Appwrite's
        # maximum page size first.
        limit = 10
        offset = 0
        existing_urls = set()  # Set to store unique URLs

        # Check the total number of documents in the collection
        total_count_response = databases.list_documents(
            database_id=APPWRITE_DATABASE_ID,
            collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID,
            queries=[Query.limit(1)]
        )
        total_count = total_count_response['total']

        # Delete all documents if the count exceeds 20,000
        if total_count > 20000:
            logger.info("URL count exceeded 20,000. Deleting all documents in the collection.")
            while True:
                # Re-list from the start each pass; deletions shrink the
                # collection until the page comes back empty.
                response = databases.list_documents(
                    database_id=APPWRITE_DATABASE_ID,
                    collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID,
                    queries=[Query.limit(limit)]
                )

                if not response['documents']:
                    break  # All documents have been deleted

                for doc in response['documents']:
                    databases.delete_document(
                        database_id=APPWRITE_DATABASE_ID,
                        collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID,
                        document_id=doc['$id']
                    )

        # Fetch and process all documents in chunks to populate existing_urls set
        offset = 0  # Reset offset after deletion
        while True:
            # Fetch a chunk of documents from the database
            response = databases.list_documents(
                database_id=APPWRITE_DATABASE_ID,
                collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID,
                queries=[Query.limit(limit), Query.offset(offset)]
            )

            # Add the fetched URLs to the set
            for doc in response['documents']:
                existing_urls.add(doc['url'])

            # Check if we have fetched all documents
            if len(response['documents']) < limit:
                break  # No more documents to fetch, exit the loop

            # Move to the next batch
            offset += limit

        # Save only unique URLs that are not already in the set
        for url in urls:
            if url not in existing_urls:
                if is_valid_url(url):
                    sanitized_url = sanitize_url(url)
                    databases.create_document(
                        database_id=APPWRITE_DATABASE_ID,
                        collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID,
                        document_id=ID.unique(),
                        data={'url': sanitized_url}
                    )
    except Exception as e:
        raise CustomException(e, sys) from e
179
+
src/introlix_api/app/database.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from pymongo import MongoClient
from motor.motor_asyncio import AsyncIOMotorClient

# NOTE(review): read directly from the environment — this module never calls
# load_dotenv(), so the variable must already be exported before import
# (other modules in this package do load .env). Confirm intended.
MONGODB_CLIENT_ID = os.getenv("MONGODB_CLIENT_ID")

# Synchronous client, used by the crawler / batch jobs.
client = MongoClient(MONGODB_CLIENT_ID)

db = client.IntrolixDb

# Collections shared across the app.
feed_data = db.feedData        # crawled feed articles
search_data = db.search_data   # documents mirrored into Algolia (see algolia.py)
votes = db.votes               # vote documents (schema not visible here)
14
+
15
async def startup_db_client(app):
    """FastAPI startup hook: attach an async Motor client and database to *app*."""
    app.mongodb_client = AsyncIOMotorClient(MONGODB_CLIENT_ID)
    app.mongodb = app.mongodb_client.get_database("IntrolixDb")
    print("MongoDB connected.")
19
+
20
async def shutdown_db_client(app):
    """FastAPI shutdown hook: close the Motor client created at startup."""
    app.mongodb_client.close()
    print("Database disconnected.")
23
+
src/introlix_api/app/introlix_spider/introlix_spider/__init__.py ADDED
File without changes
src/introlix_api/app/introlix_spider/introlix_spider/items.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Define here the models for your scraped items
2
+ #
3
+ # See documentation in:
4
+ # https://docs.scrapy.org/en/latest/topics/items.html
5
+
6
+ import scrapy
7
+
8
+
9
class IntrolixSpiderItem(scrapy.Item):
    """Scrapy-generated item stub; currently empty (the spider builds plain dicts)."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
src/introlix_api/app/introlix_spider/introlix_spider/middlewares.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Define here the models for your spider middleware
2
+ #
3
+ # See documentation in:
4
+ # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
5
+
6
+ from scrapy import signals
7
+
8
+ # useful for handling different item types with a single interface
9
+ from itemadapter import is_item, ItemAdapter
10
+
11
+
12
class IntrolixSpiderSpiderMiddleware:
    """Scrapy-generated spider-middleware stub; every hook keeps default behavior."""
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
57
+
58
+
59
class IntrolixSpiderDownloaderMiddleware:
    """Scrapy-generated downloader-middleware stub; every hook keeps default behavior."""
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
src/introlix_api/app/introlix_spider/introlix_spider/pipelines.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Define your item pipelines here
2
+ #
3
+ # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4
+ # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5
+
6
+
7
+ # useful for handling different item types with a single interface
8
+ from itemadapter import ItemAdapter
9
+
10
+
11
class IntrolixSpiderPipeline:
    """No-op item pipeline: passes every scraped item through unchanged."""

    def process_item(self, item, spider):
        # Identity pass-through; persistence happens in the spider itself.
        return item
src/introlix_api/app/introlix_spider/introlix_spider/settings.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Scrapy settings for introlix_spider project
2
+ #
3
+ # For simplicity, this file contains only settings considered important or
4
+ # commonly used. You can find more settings consulting the documentation:
5
+ #
6
+ # https://docs.scrapy.org/en/latest/topics/settings.html
7
+ # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8
+ # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9
+
10
# Project identity and where Scrapy discovers spider classes.
BOT_NAME = "introlix_spider"

SPIDER_MODULES = ["introlix_spider.spiders"]
NEWSPIDER_MODULE = "introlix_spider.spiders"
14
+
15
+
16
+ # Crawl responsibly by identifying yourself (and your website) on the user-agent
17
+ #USER_AGENT = "introlix_spider (+http://www.yourdomain.com)"
18
+
19
+ # Obey robots.txt rules
20
ROBOTSTXT_OBEY = True  # respect robots.txt when crawling third-party sites
21
+
22
+ # Configure maximum concurrent requests performed by Scrapy (default: 16)
23
+ #CONCURRENT_REQUESTS = 32
24
+
25
+ # Configure a delay for requests for the same website (default: 0)
26
+ # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
27
+ # See also autothrottle settings and docs
28
+ #DOWNLOAD_DELAY = 3
29
+ # The download delay setting will honor only one of:
30
+ #CONCURRENT_REQUESTS_PER_DOMAIN = 16
31
+ #CONCURRENT_REQUESTS_PER_IP = 16
32
+
33
+ # Disable cookies (enabled by default)
34
+ #COOKIES_ENABLED = False
35
+
36
+ # Disable Telnet Console (enabled by default)
37
+ #TELNETCONSOLE_ENABLED = False
38
+
39
+ # Override the default request headers:
40
+ #DEFAULT_REQUEST_HEADERS = {
41
+ # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
42
+ # "Accept-Language": "en",
43
+ #}
44
+
45
+ # Enable or disable spider middlewares
46
+ # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
47
+ #SPIDER_MIDDLEWARES = {
48
+ # "introlix_spider.middlewares.IntrolixSpiderSpiderMiddleware": 543,
49
+ #}
50
+
51
+ # Enable or disable downloader middlewares
52
+ # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
53
+ #DOWNLOADER_MIDDLEWARES = {
54
+ # "introlix_spider.middlewares.IntrolixSpiderDownloaderMiddleware": 543,
55
+ #}
56
+
57
+ # Enable or disable extensions
58
+ # See https://docs.scrapy.org/en/latest/topics/extensions.html
59
+ #EXTENSIONS = {
60
+ # "scrapy.extensions.telnet.TelnetConsole": None,
61
+ #}
62
+
63
+ # Configure item pipelines
64
+ # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
65
+ #ITEM_PIPELINES = {
66
+ # "introlix_spider.pipelines.IntrolixSpiderPipeline": 300,
67
+ #}
68
+
69
+ # Enable and configure the AutoThrottle extension (disabled by default)
70
+ # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
71
+ #AUTOTHROTTLE_ENABLED = True
72
+ # The initial download delay
73
+ #AUTOTHROTTLE_START_DELAY = 5
74
+ # The maximum download delay to be set in case of high latencies
75
+ #AUTOTHROTTLE_MAX_DELAY = 60
76
+ # The average number of requests Scrapy should be sending in parallel to
77
+ # each remote server
78
+ #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
79
+ # Enable showing throttling stats for every response received:
80
+ #AUTOTHROTTLE_DEBUG = False
81
+
82
+ # Enable and configure HTTP caching (disabled by default)
83
+ # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
84
+ #HTTPCACHE_ENABLED = True
85
+ #HTTPCACHE_EXPIRATION_SECS = 0
86
+ #HTTPCACHE_DIR = "httpcache"
87
+ #HTTPCACHE_IGNORE_HTTP_CODES = []
88
+ #HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
89
+
90
+ # Set settings whose default value is deprecated to a future-proof value
91
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"


# Increase the number of concurrent requests
# NOTE(review): 32 concurrent requests combined with DOWNLOAD_DELAY = 0 is
# aggressive toward crawled sites even with ROBOTSTXT_OBEY on — consider
# enabling AutoThrottle instead.
CONCURRENT_REQUESTS = 32

# Set to 0 for no delay between requests
DOWNLOAD_DELAY = 0
src/introlix_api/app/introlix_spider/introlix_spider/spiders/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # This package will contain the spiders of your Scrapy project
2
+ #
3
+ # Please refer to the documentation for information on how to create and manage
4
+ # your spiders.
src/introlix_api/app/introlix_spider/introlix_spider/spiders/generic.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import scrapy
4
+ from concurrent.futures import ThreadPoolExecutor
5
+ import aiohttp
6
+ import asyncio
7
+ from dotenv import load_dotenv, dotenv_values
8
+ from introlix_api.app.database import feed_data, db
9
+ from introlix_api.app.appwrite import fetch_root_sites
10
+
11
+
12
+ load_dotenv()
13
+
14
+
15
class GenericSpider(scrapy.Spider):
    """
    Spider that crawls the root sites registered in Appwrite, follows links
    that look like articles, and collects article metadata for the Introlix
    feed (flushed to MongoDB when the spider closes).
    """
    name = "generic"

    def __init__(self, *args, **kwargs):
        super(GenericSpider, self).__init__(*args, **kwargs)
        self.executor = ThreadPoolExecutor(max_workers=10)  # Control parallelism

        # Scraped article dicts, written to MongoDB in save_data().
        self.data = []

        self.all_urls = fetch_root_sites()
        # Captures the registrable domain in group 1, e.g. "example.com".
        self.domain_pattern = r'(?:[a-z0-9-]+\.)?([a-z0-9-]+\.[a-z]{2,})(?:\/|$)'

        self.allowed_domains = []
        self.start_urls = []
        self.CLASSIFICATION_API = os.getenv('CLASSIFICATION_API')

        for url in self.all_urls:
            result = re.search(self.domain_pattern, url)

            if result:
                self.allowed_domains.append(result.group(1))
                self.start_urls.append(result.group(1))

    def start_requests(self):
        # Crawl the full root URLs; the bare domains collected above only
        # feed allowed_domains filtering.
        for url in self.all_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def is_this_article(self, url):
        """
        Return True when *url* matches an article-like URL pattern and does
        not contain a known non-article keyword.
        """

        # list of article url patterns
        # NOTE(review): the trailing "|" inside (blog|...|) allows an empty
        # alternative — confirm intent before tightening the groups.
        article_pattern = [
            r'/(blog|article|articles|post|blog|posts|blogs|)/\d{4}/\d{2}/+[a-z0-9-]+/?',
            r'/(blog|article|articles|post|blog|posts|blogs|)/[a-z0-9-]+/[a-z0-9-]+',
            r'(?<!\/\/www)(blog|article|articles|post|posts|blogs)/[a-z0-9-]+',
            r'^(?!.*\/category\/).*\/[a-z0-9-]+\/[a-z0-9-]+(-[a-z0-9-]+)+$',
            r'/[^/]+/\d{4}/\d{2}/\d{2}/+[a-z0-9]+/?',
            # Bug fix: a missing comma after this entry previously made Python
            # concatenate it with the next literal into one regex that could
            # never match, silently disabling both patterns.
            r'/[^/]+/\d{4}/\d{2}/+[a-z0-9]+/?',
            r'/[a-z0-9-]+/\d{4}/\d{2}/+/?',
            r'/[a-z0-9-]+/\d{4}/\d{2}/\d{2}/+/?'
        ]

        # list of non article keywords
        non_article_words = [
            "category", "signup", "login", "about", "contact",  # Add more non-article keywords...
        ]

        # Check if the url matches any of the article patterns
        for pattern in article_pattern:
            if re.search(pattern, url):
                if not any(word in url for word in non_article_words):
                    return True
        return False

    def parse(self, response):
        """Extract candidate article links from a page and follow them."""
        # Get all the urls from the response
        urls = response.css('a::attr(href)').extract()

        # Keep only article-looking links; strip query strings and resolve
        # relative URLs against the current page.
        article_urls = [response.urljoin(url.split("?")[0]) for url in urls if self.is_this_article(url)]

        # Send a request to each article url
        for url in article_urls:
            yield scrapy.Request(url=url, callback=self.parse_article)

    async def classify_article(self, text):
        """
        Ask the external classification API for *text*'s category.

        Returns the category string ('Unknown' when the response has no
        'category' key, 'Error' when the request fails).
        """
        classify_ai = self.CLASSIFICATION_API
        payload = {"text": text}

        # Send a request to the classification API
        async with aiohttp.ClientSession() as session:
            try:
                async with session.post(classify_ai, json=payload) as response:
                    response.raise_for_status()
                    result = await response.json()
                    return result.get('category', 'Unknown')
            except aiohttp.ClientError as e:
                self.logger.error(f"Error making request to classification API: {e}")
                return 'Error'

    async def parse_article(self, response):
        """
        Collect title/description/date/image/category for one article page
        and queue it for persistence at spider close.
        """
        hostname = response.url.split("/")[2]  # site host, stored as the source name
        title = response.css("h1::text").get()  # may be None if the page has no <h1>
        url = response.url
        desc = response.css('meta[name="description"]::attr(content)').get()
        publication_date = response.css('span::text, time::text').re_first(r'(\w+ \d+|\d+\s?\w+,? \w+)')
        image_url = response.css('meta[property="og:image"]::attr(content)').get()

        # Classify article title asynchronously
        category = await self.classify_article(title)

        # Prepare feed item
        feed_items = {
            "title": title,
            "desc": desc,
            "url": url,
            "publication_date": publication_date,
            "image_url": image_url,
            "category": category,
            "source": hostname
        }

        self.data.append(feed_items)

    def closed(self, reason):
        """Scrapy hook: flush collected items to MongoDB once the crawl ends."""
        print(f"Spider closed: {reason}")
        print("Saving ----")
        self.save_data()

    def save_data(self):
        """Insert every collected article dict into the MongoDB feed collection."""
        for feed_items in self.data:
            feed_data.insert_one(feed_items)
141
+
142
+
143
+ # import re
144
+ # import scrapy
145
+ # from pathlib import Path
146
+ # import requests
147
+ # from concurrent.futures import ThreadPoolExecutor
148
+ # from twisted.internet.defer import ensureDeferred
149
+ # from introlix_api.app.database import feed_data, db
150
+ # from introlix_api.app.appwrite import fetch_root_sites
151
+
152
+
153
+ # class GenericSpider(scrapy.Spider):
154
+ # name = "generic"
155
+
156
+ # def __init__(self, *args, **kwargs):
157
+ # super(GenericSpider, self).__init__(*args, **kwargs)
158
+ # self.executor = ThreadPoolExecutor(max_workers=10)
159
+
160
+ # self.data = []
161
+
162
+ # self.all_urls = fetch_root_sites()
163
+ # self.domain_pattern = r'(?:[a-z0-9-]+\.)?([a-z0-9-]+\.[a-z]{2,})(?:\/|$)'
164
+
165
+ # self.allowed_domains = []
166
+ # self.start_urls = []
167
+
168
+ # for url in self.all_urls:
169
+ # result = re.search(self.domain_pattern, url)
170
+
171
+ # if result:
172
+ # self.allowed_domains.append(result.group(1))
173
+ # self.start_urls.append(result.group(1))
174
+
175
+ # def start_requests(self):
176
+
177
+ # for url in self.all_urls:
178
+ # yield scrapy.Request(url=url, callback=self.parse)
179
+
180
+ # def is_this_article(self, url):
181
+ # article_pattern = [
182
+ # r'/(blog|article|articles|post|blog|posts|blogs|)/\d{4}/\d{2}/+[a-z0-9-]+/?',
183
+ # r'/(blog|article|articles|post|blog|posts|blogs|)/[a-z0-9-]+/[a-z0-9-]+',
184
+ # r'(?<!\/\/www)(blog|article|articles|post|posts|blogs)/[a-z0-9-]+',
185
+ # r'^(?!.*\/category\/).*\/[a-z0-9-]+\/[a-z0-9-]+(-[a-z0-9-]+)+$',
186
+ # r'/[^/]+/\d{4}/\d{2}/\d{2}/+[a-z0-9]+/?',
187
+ # r'/[^/]+/\d{4}/\d{2}/+[a-z0-9]+/?'
188
+ # r'/[a-z0-9-]+/\d{4}/\d{2}/+/?',
189
+ # r'/[a-z0-9-]+/\d{4}/\d{2}/\d{2}/+/?'
190
+ # ]
191
+
192
+ # # List of non-article keywords
193
+ # non_article_words = [
194
+ # "category",
195
+ # "signup",
196
+ # "login",
197
+ # "about",
198
+ # "contact",
199
+ # "privacy",
200
+ # "terms",
201
+ # "faq",
202
+ # "help",
203
+ # "support",
204
+ # "user",
205
+ # "account",
206
+ # "settings",
207
+ # "profile",
208
+ # "admin",
209
+ # "dashboard",
210
+ # "search",
211
+ # "index",
212
+ # "topics",
213
+ # "rss",
214
+ # "solutions",
215
+ # "shows",
216
+ # "author"
217
+ # ]
218
+
219
+ # for pattern in article_pattern:
220
+ # if re.search(pattern, url):
221
+ # for word in non_article_words:
222
+ # if word in url:
223
+ # return False
224
+ # return True
225
+ # return False
226
+
227
+ # def parse(self, response):
228
+ # urls = response.css('a::attr(href)').extract()
229
+
230
+ # article_urls = [response.urljoin(url.split("?")[0]) for url in urls if self.is_this_article(url)]
231
+
232
+ # for url in article_urls:
233
+ # yield scrapy.Request(url=url, callback=self.parse_article)
234
+
235
+ # def classify_article(self, text):
236
+ # classify_ai = "dont show api"
237
+ # payload = {"text": text}
238
+
239
+ # try:
240
+ # response = requests.post(classify_ai, json=payload)
241
+ # response.raise_for_status()
242
+ # return response.json().get('category', 'Unknown')
243
+ # except requests.RequestException as e:
244
+ # self.logger.error(f"Error making request to classification API: {e}")
245
+ # return 'Error'
246
+
247
+ # def parse_article(self, response):
248
+ # # getting all the infomation from the article
249
+
250
+ # hostname = response.url.split("/")[2]
251
+
252
+
253
+ # title = response.css("h1::text").get()
254
+ # url = response.url
255
+ # desc = response.css('meta[name="description"]::attr(content)').get()
256
+ # publication_date = response.css('span::text, time::text').re_first(r'(\w+ \d+|\d+\s?\w+,? \w+)')
257
+ # image_url = response.css('meta[property="og:image"]::attr(content)').get()
258
+
259
+ # # Using ThreadPoolExecutor to classify the title in a separate thread
260
+ # future = self.executor.submit(self.classify_article, title)
261
+ # category = future.result()
262
+
263
+ # # storing the infomation on mongodb
264
+ # feed_items = {
265
+ # "title": title,
266
+ # "desc": desc,
267
+ # "url": url,
268
+ # "publication_date": publication_date,
269
+ # "image_url": image_url,
270
+ # "category": category,
271
+ # "source": hostname
272
+ # }
273
+
274
+ # self.data.append(feed_items)
275
+
276
+ # def closed(self, reason):
277
+ # print(f"Spider closed: {reason}")
278
+ # print("Saving ----")
279
+ # self.save_data()
280
+
281
+ # def save_data(self):
282
+ # if "feed_Data" in db.list_collection_names():
283
+ # feed_data.drop()
284
+
285
+ # for feed_items in self.data:
286
+ # feed_data.insert_one(feed_items)
src/introlix_api/app/introlix_spider/scrapy.cfg ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Automatically created by: scrapy startproject
2
+ #
3
+ # For more information about the [deploy] section see:
4
+ # https://scrapyd.readthedocs.io/en/latest/deploy.html
5
+
6
+ [settings]
7
+ default = introlix_spider.settings
8
+
9
+ [deploy]
10
+ #url = http://localhost:6800/
11
+ project = introlix_spider
src/introlix_api/app/model.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+ from typing import List, Optional
3
+ from datetime import date
4
+ from datetime import datetime
5
+
6
# signup model
class UserSignup(BaseModel):
    """Request body for the signup endpoint."""
    name: str
    email: str  # NOTE(review): plain str — no email-format validation here
    password: str
    dob: date
    interestList: List[str]  # interests chosen at signup
13
+
14
# login model
class UserLogin(BaseModel):
    """Request body for the login endpoint."""
    email: str
    password: str
18
+
19
# feed model
class FeedModel(BaseModel):
    """A feed post as returned by the posts API."""
    id: str = Field(..., alias="_id")  # MongoDB _id, populated via the `_id` alias
    title: str
    desc: str
    url: str
    image_url: str
    tags: list
    vote: int
    created_at: Optional[datetime]
29
+
30
class DiscussionModel(BaseModel):
    """A discussion thread: like FeedModel but with an answer count and no desc/image."""
    id: str = Field(..., alias="_id")  # MongoDB _id, populated via the `_id` alias
    title: str
    url: str
    tags: list
    vote: int
    created_at: Optional[datetime]
    answer_count: int
src/introlix_api/app/routes/__init__.py ADDED
File without changes
src/introlix_api/app/routes/auth.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ from fastapi import APIRouter, HTTPException, Query
3
+
4
+ from introlix_api.exception import CustomException
5
+ from introlix_api.app.model import UserSignup, UserLogin
6
+ from introlix_api.app.appwrite import databases, APPWRITE_DATABASE_ID, ID, APPWRITE_ACCOUNT_COLLECTION_ID
7
+ from introlix_api.logger import logger
8
+
9
+ router = APIRouter()
10
+
11
@router.post("/test")
async def test(data: dict):
    """Echo endpoint used to confirm that POST routing works."""
    return {"message": "POST request works with data {}".format(data)}
14
+
15
@router.post('/signup')
async def signup(user: UserSignup):
    """
    Register a new user account in the Appwrite accounts collection.

    Rejects the request when the email is already registered; otherwise
    stores the user document with a randomly assigned avatar color.

    Args:
        user (UserSignup): signup payload (name, email, password, dob, interests).

    Returns:
        dict: success message plus document id, interests, the first
        character of the stored name, and the assigned profile color.

    Raises:
        HTTPException: 400 when the email is taken or creation fails.
    """
    try:
        # List of avatar colors randomly assigned to new accounts.
        avatar_colors = [
            "#FF4500",  # Orange Red
            "#FF6347",  # Tomato
            "#FF7F50",  # Coral
            "#FF8C00",  # Dark Orange
            "#FFD700",  # Gold
            "#ADFF2F",  # Green Yellow
            "#32CD32",  # Lime Green
            "#00FA9A",  # Medium Spring Green
            "#40E0D0",  # Turquoise
            "#1E90FF",  # Dodger Blue
            "#4682B4",  # Steel Blue
            "#8A2BE2",  # Blue Violet
            "#FF69B4",  # Hot Pink
            "#FF1493",  # Deep Pink
            "#C71585"   # Medium Violet Red
        ]
        # Check if the email is already registered.
        # NOTE(review): this lists every document and scans client-side;
        # an Appwrite equality query on Email would scale better.
        existing_users = databases.list_documents(
            database_id=APPWRITE_DATABASE_ID,
            collection_id=APPWRITE_ACCOUNT_COLLECTION_ID,
        )

        for doc in existing_users['documents']:
            if doc['Email'] == user.email:
                raise HTTPException(status_code=400, detail="Email is already registered")

        # SECURITY(review): the password is persisted in plain text here —
        # it should be hashed (e.g. bcrypt) before storage.
        result = databases.create_document(
            database_id=APPWRITE_DATABASE_ID,
            collection_id=APPWRITE_ACCOUNT_COLLECTION_ID,
            document_id=ID.unique(),
            data={
                "Name": user.name,
                "Email": user.email,
                "Password": user.password,
                "DOB": user.dob.isoformat(),
                "interests": user.interestList,
                "profileColor": random.choice(avatar_colors)
            }
        )
        # "Name"[0] returns only the first character of the name — matches
        # the login/verify endpoints' behavior.
        return {"message": "User created successfully", "document_id": result['$id'], "interests": result["interests"], "name": result["Name"][0], "profileColor": result["profileColor"]}

    except HTTPException:
        # BUG FIX: propagate deliberate HTTP errors unchanged. Previously the
        # generic handler below caught them and re-raised with a mangled
        # detail string ("400: Email is already registered").
        raise
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
69
+
70
@router.post('/login')
async def login(user: UserLogin):
    """
    Authenticate a user by email and password.

    Args:
        user (UserLogin): login payload (email, password).

    Returns:
        dict: success message plus document id, interests, the first
        character of the stored name, and the profile color.

    Raises:
        HTTPException: 400 on invalid credentials or lookup failure.
    """
    try:
        # SECURITY(review): passwords are compared in plain text — should be
        # hashed and verified with a constant-time comparison.
        users = databases.list_documents(
            database_id=APPWRITE_DATABASE_ID,
            collection_id=APPWRITE_ACCOUNT_COLLECTION_ID
        )
        # Find the user with matching email and password.
        for doc in users['documents']:
            if doc['Email'] == user.email and doc['Password'] == user.password:
                return {"message": "Login successful", "document_id": doc['$id'], "interests": doc["interests"], "name": doc["Name"][0], "profileColor": doc["profileColor"]}
        raise HTTPException(status_code=400, detail="Invalid credentials")
    except HTTPException:
        # BUG FIX: keep the "Invalid credentials" detail intact instead of
        # letting the generic handler re-wrap it as "400: Invalid credentials".
        raise
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
88
+
89
@router.post("/verify_it_user")
async def verify_user_exist(user_id: str = Query(...)):
    """
    Verify that a user document with the given id exists.

    Args:
        user_id (str): Appwrite document id of the user.

    Returns:
        dict: confirmation message plus interests, the first character of
        the stored name, and the profile color.

    Raises:
        HTTPException: 404 when the user is not found, 500 on lookup failure.
    """
    try:
        users = databases.list_documents(
            database_id=APPWRITE_DATABASE_ID,
            collection_id=APPWRITE_ACCOUNT_COLLECTION_ID
        )

        # Find the user with a matching document id.
        for doc in users['documents']:
            if user_id == doc['$id']:
                return {"message": "It's User", "interests": doc["interests"], "name": doc["Name"][0], "profileColor": doc["profileColor"]}

        # If no matching user is found.
        raise HTTPException(status_code=404, detail="User not found")
    except HTTPException:
        # BUG FIX: previously the 404 above was caught by the generic handler
        # and surfaced to clients as a 500 — re-raise it unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
src/introlix_api/app/routes/posts.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bson import ObjectId
2
+ import pytz
3
+ from dateutil import parser
4
+ from datetime import datetime, timezone
5
+ from fastapi import FastAPI, APIRouter, HTTPException, Request, Query
6
+ from introlix_api.app.database import votes
7
+ from introlix_api.exception import CustomException
8
+ from introlix_api.app.database import startup_db_client, shutdown_db_client
9
+ from introlix_api.app.model import FeedModel, DiscussionModel
10
+ from contextlib import asynccontextmanager
11
+ from typing import List
12
+
13
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: open the MongoDB client before the app serves
    requests and close it again on shutdown."""
    # Start the database connection
    await startup_db_client(app)
    yield
    # Close the database connection
    await shutdown_db_client(app)
20
+
21
+ router = APIRouter()
22
+
23
def normalize_date(date_str):
    """
    Parse *date_str* into a timezone-aware UTC datetime.

    Strict ISO-8601 parsing is attempted first (a trailing 'Z' is rewritten
    as '+00:00'); on failure dateutil's flexible parser is tried. Returns
    None when the string cannot be parsed at all.
    """
    iso_candidate = date_str.replace("Z", "+00:00")
    try:
        parsed = datetime.fromisoformat(iso_candidate)
    except ValueError:
        # ISO parsing failed — fall back to the permissive parser.
        try:
            parsed = parser.parse(date_str)
        except (ValueError, TypeError):
            print(f"Warning: Unrecognized date format for '{date_str}'")
            return None
    # Normalize to UTC before returning.
    return parsed.astimezone(pytz.UTC)
37
+
38
@router.get('/posts', response_model=List[FeedModel])
async def fetch_data(request: Request, tags: List[str] = Query(...), page: int = 1, limit: int = 20):
    """
    Fetch article posts matching any of *tags*, ranked by "hotness".

    Args:
        request: FastAPI request (carries the app's MongoDB client).
        tags: tags to match against content.tags ($in).
        page: 1-based page number.
        limit: page size.

    Returns:
        List[FeedModel]: posts sorted hottest-first.

    Raises:
        HTTPException: 400 on any failure.
    """
    try:
        skip = (page - 1) * limit
        query = {
            "content.tags": {"$in": tags},
            "type": "article"
        }
        response = await request.app.mongodb['search_data'].find(query).skip(skip).limit(limit).to_list(limit)

        current_date = datetime.now(timezone.utc)
        hotness_ranked_posts = []

        for item in response:
            # Flatten the nested `content` document into the FeedModel shape.
            item["_id"] = str(item['_id'])
            item["title"] = item['content'].get('title', '')
            item["desc"] = item['content'].get('desc', '')
            item["url"] = item.get('url', '')
            item["image_url"] = item['content'].get('image', '') or ""
            item["tags"] = item['content'].get('tags', [])
            item["vote"] = item['content'].get('vote', 0)

            # Handle created_at normalization.
            created_at_str = item['content'].get('created_at', '')
            if created_at_str in [None, "No date found"]:
                created_at = current_date
            else:
                created_at = normalize_date(created_at_str)

            if created_at:
                # Age in hours drives the decay term of the ranking.
                age_hours = (current_date - created_at).total_seconds() / 3600
                # Hacker-News-style hotness: votes decayed by age^1.5.
                rank = (item["vote"] - 1) / ((age_hours + 2) ** 1.5)
                item["rank"] = rank
            else:
                # Unparseable dates sink to the bottom of the feed.
                item["rank"] = float('-inf')

            # NOTE(review): "Unknown" may not pass FeedModel's
            # Optional[datetime] validation — confirm against response_model.
            item["created_at"] = created_at.isoformat() if created_at else "Unknown"
            hotness_ranked_posts.append(item)

        # BUG FIX: sort descending so the hottest posts come first.
        # Previously reverse=False returned the coldest (and -inf ranked
        # invalid) posts at the top of the feed.
        hotness_ranked_posts.sort(key=lambda x: x["rank"], reverse=True)
        return hotness_ranked_posts

    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
90
+
91
@router.get('/discussion', response_model=List[DiscussionModel])
async def fetch_disscussion(request: Request, tags: List[str] = Query(...), page: int = 1, limit: int = 20):
    """
    Fetch discussion posts matching any of *tags*, ranked by "hotness".

    Args:
        request: FastAPI request (carries the app's MongoDB client).
        tags: tags to match against content.tags ($in).
        page: 1-based page number.
        limit: page size.

    Returns:
        List[DiscussionModel]: discussions sorted hottest-first.

    Raises:
        HTTPException: 400 on any failure.
    """
    try:
        skip = (page - 1) * limit
        query = {
            "content.tags": {"$in": tags},
            "type": "discussion"
        }
        response = await request.app.mongodb['search_data'].find(query).skip(skip).limit(limit).to_list(limit)

        current_date = datetime.now(timezone.utc)
        hotness_ranked_posts = []

        for item in response:
            # Flatten the nested `content` document into the DiscussionModel shape.
            item["_id"] = str(item['_id'])
            item["title"] = item['content'].get('title', '')
            item["url"] = item.get('url', '')
            item["tags"] = item['content'].get('tags', [])
            item["vote"] = item['content'].get('vote', 0)
            item["answer_count"] = item['content'].get('answer_count', 0)

            # Handle created_at normalization. Discussion dates are stored as
            # epoch seconds (Stack Overflow style).
            created_at_raw = item['content'].get('created_at', '')
            if created_at_raw in [None, "No date found", ""]:
                created_at = current_date
            else:
                try:
                    # BUG FIX: the timestamp conversion used to run *before*
                    # the missing-value check, so None / "No date found"
                    # crashed with a TypeError and the check was dead code.
                    # Also build an aware UTC datetime directly instead of a
                    # naive one that was later localized as local time.
                    created_at = datetime.fromtimestamp(created_at_raw, tz=timezone.utc)
                except (TypeError, ValueError, OSError, OverflowError):
                    created_at = current_date

            if created_at:
                # Age in hours drives the decay term of the ranking.
                age_hours = (current_date - created_at).total_seconds() / 3600
                # Hacker-News-style hotness: votes decayed by age^1.5.
                rank = (item["vote"] - 1) / ((age_hours + 2) ** 1.5)
                item["rank"] = rank
            else:
                # If created_at is invalid, rank the item last.
                item["rank"] = float('-inf')

            item["created_at"] = created_at.isoformat() if created_at else "Unknown"
            hotness_ranked_posts.append(item)

        # BUG FIX: sort descending so the hottest discussions come first
        # (was reverse=False, i.e. coldest-first).
        hotness_ranked_posts.sort(key=lambda x: x["rank"], reverse=True)
        return hotness_ranked_posts

    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
143
+
144
@router.post('/vote')
async def vote(request: Request, vote: int, post_id: str = Query(...), user_id: str = Query(...)):
    """
    Toggle or switch a user's vote on a post, then refresh the post total.

    Behaviour:
      * the same vote already exists  -> remove it (toggle off)
      * a different vote exists       -> replace it
      * no vote exists                -> insert it
    Afterwards the post's total vote is recomputed and written back to
    search_data as content.vote.

    Args:
        request: FastAPI request (carries the app's MongoDB client).
        vote (int): vote value to apply (e.g. +1 / -1).
        post_id (str): hex ObjectId of the post.
        user_id (str): id of the voting user.

    Raises:
        HTTPException: 400 on any failure (including an invalid post_id).
    """
    try:
        post_id = ObjectId(post_id)

        # Was this exact vote already cast? (toggle semantics)
        result = await request.app.mongodb['votes'].find_one({"user_id": user_id, "post_id": post_id, "vote": vote})

        if result:
            # CONSISTENCY FIX: all vote writes now go through the request's
            # async MongoDB client; previously deletes/inserts used the
            # separate synchronous `votes` handle while reads were async.
            await request.app.mongodb['votes'].delete_one({
                "_id": result["_id"]
            })
        else:
            # Replace any previous (different) vote by this user on the post.
            existing_vote = await request.app.mongodb['votes'].find_one({"user_id": user_id, "post_id": post_id})
            if existing_vote:
                await request.app.mongodb['votes'].delete_one({
                    "_id": existing_vote["_id"]
                })

            await request.app.mongodb['votes'].insert_one({
                "post_id": post_id,
                "user_id": user_id,
                "vote": vote
            })

        # Calculate the total vote count for the post.
        total_votes = await request.app.mongodb['votes'].aggregate([
            {"$match": {"post_id": post_id}},
            {"$group": {"_id": "$post_id", "total_votes": {"$sum": "$vote"}}}
        ]).to_list(length=1)

        # Extract the total vote count or default to 0 if no votes are found.
        vote_count = total_votes[0]["total_votes"] if total_votes else 0

        # Update the denormalized vote count in the post document.
        await request.app.mongodb['search_data'].update_one(
            {"_id": post_id},
            {"$set": {"content.vote": vote_count}}
        )

        return {"message": f"Vote submitted successfully with total vote {vote_count}"}
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
191
+
192
+
193
@router.get('/hasvoted')
async def hasVote(request: Request, post_id: str = Query(...), user_id: str = Query(...)):
    """
    Report whether *user_id* has an existing vote on *post_id*.

    Returns:
        dict: {"has_voted": bool} plus the stored vote value when present.

    Raises:
        HTTPException: 400 on any failure (including an invalid post_id).
    """
    try:
        oid = ObjectId(post_id)

        record = await request.app.mongodb['votes'].find_one(
            {"user_id": user_id, "post_id": oid}
        )

        if not record:
            return {"has_voted": False}
        return {"has_voted": True, "vote": record['vote']}
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
src/introlix_api/app/routes/run_spider.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import subprocess
3
+ from fastapi import APIRouter, HTTPException, Query
4
+
5
+ from introlix_api.exception import CustomException
6
+ from introlix_api.logger import logger
7
+
8
+ router = APIRouter()
9
+
10
@router.post('/run_spider')
async def run_spider():
    """
    Launch the Scrapy "generic" spider and return its console output.

    Returns:
        tuple: (stdout, stderr) of the crawl process; FastAPI serializes
        this as a two-element JSON array.

    Raises:
        HTTPException: 400 if starting the process fails.
    """
    try:
        command = ["scrapy", "crawl", "generic"] # command to run the spider
        working_directory = "src/introlix_api/app/introlix_spider" # directory to run the spider

        # NOTE(review): subprocess.run blocks the event loop for the entire
        # crawl — consider run_in_executor or asyncio.create_subprocess_exec.
        # The relative cwd also assumes the server was started from the repo
        # root; verify deployment working directory.
        result = subprocess.run(command, cwd=working_directory, capture_output=True, text=True) # run the spider

        return result.stdout, result.stderr
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
src/introlix_api/app/routes/similarity.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import re
3
+ from fastapi import FastAPI, APIRouter, HTTPException, Request
4
+
5
+ from introlix_api.exception import CustomException
6
+ from introlix_api.app.database import startup_db_client, shutdown_db_client
7
+ from introlix_api.logger import logger
8
+ from sklearn.feature_extraction.text import TfidfVectorizer
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+ from contextlib import asynccontextmanager
11
+
12
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: open the MongoDB client before the app serves
    requests and close it again on shutdown."""
    # Start the database connection
    await startup_db_client(app)
    yield
    # Close the database connection
    await shutdown_db_client(app)
19
+
20
+ router = APIRouter()
21
+
22
def preprocess_text(text):
    """
    Normalize free text for TF-IDF comparison.

    Lowercases the input, collapses every whitespace run (spaces, tabs,
    newlines) to a single space, then strips punctuation — anything that is
    not a word character or whitespace.
    """
    lowered = text.lower()
    collapsed = re.sub(r'\s+', ' ', lowered)
    return re.sub(r'[^\w\s]', '', collapsed)
31
+
32
@router.get('/similarity')
async def similarity(request: Request, page: int = 1, limit: int = 20, query: str = None):
    """
    Return feed posts whose title+description are TF-IDF cosine-similar
    to *query*.

    Args:
        request: FastAPI request (carries the app's MongoDB client).
        page: 1-based page number over the feedData collection.
        limit: page size.
        query: free-text search string (required).

    Returns:
        list: posts whose cosine similarity to the query is >= 0.05.

    Raises:
        HTTPException: 400 when the query is missing or processing fails.
    """
    try:
        # Ensure the query is provided.
        if not query:
            raise HTTPException(status_code=400, detail="Query parameter is required")

        skip = (page - 1) * limit

        # Fetch posts from MongoDB.
        response = await request.app.mongodb['feedData'].find().skip(skip).limit(limit).to_list(limit)

        # Keep only items that have both a title and a description.
        response = [item for item in response if item.get('title') and item.get('desc')]

        # Convert ObjectId to string for JSON serialization.
        for item in response:
            item['_id'] = str(item['_id'])

        # Prepare document texts (title + desc) for similarity calculation.
        posts_texts = [preprocess_text(item['title'] + ' ' + item['desc']) for item in response]

        # Preprocess the query the same way as the documents.
        query = preprocess_text(query)

        # Document 0 is the query; documents 1..n are the posts, so
        # cosine_similarities[i] corresponds to response[i].
        documents = [query] + posts_texts

        vectorizer = TfidfVectorizer(stop_words='english')
        tfidf = vectorizer.fit_transform(documents)

        # Cosine similarity between the query row and every post row.
        cosine_similarities = cosine_similarity(tfidf[0:1], tfidf[1:]).flatten()

        # Debugging: print cosine similarity scores for better understanding.
        print("Cosine Similarities:", cosine_similarities)

        # Low threshold because titles/descriptions are short texts.
        similarity_threshold = 0.05

        similar_posts = [
            response[i] for i in range(len(response)) if cosine_similarities[i] >= similarity_threshold
        ]

        return similar_posts
    except HTTPException:
        # Keep the deliberate 400 ("Query parameter is required") intact
        # instead of letting the generic handler below re-wrap it.
        raise
    except Exception as e:
        # BUG FIX: was `detail=str` — passing the builtin type itself as the
        # error detail instead of the exception message.
        raise HTTPException(status_code=400, detail=str(e))
src/introlix_api/crawler/__init__.py ADDED
File without changes
src/introlix_api/crawler/bot.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys, re, time
2
+ import errno
3
+ import string
4
+ import requests
5
+ import multiprocessing
6
+ from bs4 import BeautifulSoup
7
+ from dataclasses import dataclass
8
+ from introlix_api.logger import logger
9
+ from urllib.parse import urlparse, urlunsplit, urljoin
10
+ from urllib.robotparser import RobotFileParser
11
+ from introlix_api.exception import CustomException
12
+ from urllib.robotparser import RobotFileParser
13
+
14
+ from requests import ReadTimeout
15
+ from introlix_api.utils.core import html_to_dom
16
+ from introlix_api.utils.tags import fetch_tags
17
+ from introlix_api.utils.root_sites import root_sites
18
+ from ssl import SSLCertVerificationError
19
+ from urllib3.exceptions import NewConnectionError, MaxRetryError
20
+
21
+ @dataclass
22
+ class BotArgs:
23
+ TIMEOUT_SECONDS = 3
24
+ MAX_FETCH_SIZE = 1024*1024
25
+ BAD_URL_REGEX = re.compile(r'\/\/localhost\b|\.jpg$|\.png$|\.js$|\.gz$|\.zip$|\.pdf$|\.bz2$|\.ipynb$|\.py$')
26
+ GOOD_URL_REGEX = re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
27
+ DEFAULT_ENCODING = 'utf8'
28
+ DEFAULT_ENC_ERRORS = 'replace'
29
+ ALLOWED_EXCEPTIONS = (ValueError, ConnectionError, ReadTimeout, TimeoutError,
30
+ OSError, NewConnectionError, MaxRetryError, SSLCertVerificationError)
31
+
32
class IntrolixBot:
    """
    Polite web crawler restricted to a configured set of root sites.

    Fetches pages with size/time limits, optionally honors robots.txt,
    extracts title/description/image/date metadata and same-root links,
    and can fan work out over multiprocessing pools.
    """

    def __init__(self, urls: list, args: BotArgs, obey_robots_txt: bool = True):
        """
        Initialize the IntrolixBot.

        Args:
            urls (list): List of URLs to scrape.
            args (BotArgs): crawler tuning knobs (timeout, size cap, URL regexes,
                encodings, recoverable exception types).
            obey_robots_txt (bool, optional): Whether to obey robots.txt. Defaults to True.
        """
        self.urls = urls
        self.obey_robots_txt = obey_robots_txt
        self.root_sites = root_sites()
        # Pre-compute every root site's netloc so link filtering in
        # get_urls_from_page is a set lookup instead of repeated comparisons.
        self.root_sites_netlocs = {urlparse(root_url).netloc for root_url in self.root_sites}
        self.good_tags = fetch_tags()

        # bot args (copied onto the instance for convenience)
        self.TIMEOUT_SECONDS = args.TIMEOUT_SECONDS
        self.MAX_FETCH_SIZE = args.MAX_FETCH_SIZE
        self.BAD_URL_REGEX = args.BAD_URL_REGEX
        self.GOOD_URL_REGEX = args.GOOD_URL_REGEX
        self.DEFAULT_ENCODING = args.DEFAULT_ENCODING
        self.DEFAULT_ENC_ERRORS = args.DEFAULT_ENC_ERRORS
        self.ALLOWED_EXCEPTIONS = args.ALLOWED_EXCEPTIONS

    def fetch(self, url:str) -> tuple[int, bytes]:
        """
        Function to fetch a URL.

        Streams the body in 1 KiB chunks, aborting with ValueError when the
        wall-clock timeout elapses and truncating once MAX_FETCH_SIZE is
        exceeded (truncated bodies are returned, not discarded).

        Args:
            url (str): URL to fetch.
        Returns:
            tuple[int, bytes]: status code and (possibly truncated) content.
        """

        r = requests.get(url, stream=True, timeout=self.TIMEOUT_SECONDS)

        size = 0
        start = time.time()

        content = b""
        for chunk in r.iter_content(1024):
            # Enforce a total-download deadline, not just a connect timeout.
            if time.time() - start > self.TIMEOUT_SECONDS:
                raise ValueError('Timeout reached')

            content += chunk

            size += len(chunk)
            if size > self.MAX_FETCH_SIZE:
                logger.debug(f"Maximum size reached for URL {url}")
                break

        return r.status_code, content

    def see_robots_txt(self, url: str) -> bool:
        """
        Function to check if robots.txt allows this bot to crawl.

        Fail-open policy: when robots.txt cannot be fetched or decoded the
        URL is treated as allowed.

        Args:
            url (str): URL to check.
        Returns:
            bool: True if the bot is allowed to crawl, False otherwise.
        """
        try:
            try:
                parsed_url = urlparse(url)
            except ValueError:
                logger.debug(f"Unable to parse URL: {url}")
                return False

            # urlunsplit inserts the missing '/' before 'robots.txt' when a
            # netloc is present, yielding scheme://host/robots.txt.
            robots_url = urlunsplit((parsed_url.scheme, parsed_url.netloc, 'robots.txt', '', ''))
            parse_robots = RobotFileParser(robots_url)

            try:
                status_code, content = self.fetch(robots_url)
            except Exception as e: # Catch all exceptions for now
                logger.debug(f"Robots error: {robots_url}, {e}")
                # Fail open: unreachable robots.txt does not block crawling.
                return True

            # Try utf-8 first, then latin-1, before giving up on decoding.
            decoded = None
            for encoding in ['utf-8', 'iso-8859-1']:
                try:
                    decoded = content.decode(encoding).splitlines()
                    break
                except UnicodeDecodeError:
                    pass

            if decoded is None:
                logger.debug(f"Unable to decode robots file {robots_url}")
                return True

            parse_robots.parse(decoded)
            allowed = parse_robots.can_fetch('IntrolixBot', url) # Your bot's name
            logger.debug(f"Robots allowed for {url}: {allowed} and {decoded} is decoded with {robots_url}")
            return allowed
        except Exception as e:
            raise CustomException(e, sys) from e

    def get_urls_from_page(self, url: str) -> list:
        """
        Function to get all URLs from a page.

        Only anchors whose domain belongs to the configured root sites are
        kept; relative hrefs are resolved against *url* first. Any error
        yields an empty list rather than aborting the crawl.

        Args:
            url (str): URL of the page.
        Returns:
            list: de-duplicated list of same-root URLs from the page
                (order is unspecified because of the set round-trip).
        """
        try:
            status_code, content = self.fetch(url)

            if status_code != 200:
                return []

            soup = BeautifulSoup(content, 'html.parser')
            urls = []

            for link in soup.find_all('a'):
                href = link.get('href')
                if href:
                    # Resolve relative links against the page URL.
                    if not href.startswith('http'):
                        href = urljoin(url, href)
                    # if not self.BAD_URL_REGEX.search(href):
                    #     href = href
                    if self.GOOD_URL_REGEX.search(href):
                        href_netloc = urlparse(href).netloc

                        logger.debug(f"Checking href domain: {href_netloc} against root domains")

                        # Restrict the crawl frontier to known root sites.
                        if href_netloc in self.root_sites_netlocs:
                            urls.append(href)

            return list(set(urls))

        except Exception as e:
            logger.info(f"Error occured while getting urls from page {e}")
            return []
            # raise CustomException(e, sys) from e

    def scrape(self, url: str) -> dict:
        """
        Function to scrape the site.

        Produces either a content payload (title, desc, image, tags, links,
        created_at) or an error payload with a named error — RobotsDenied,
        AbortError, NoResponseText, or a DOM-parse failure.

        Args:
            url (str): URL to scrape.
        Returns:
            dict: scraped data.
        """
        try:
            logger.info(f"Crawling URL {url}")
            # Millisecond timestamp, JavaScript-style.
            js_timestamp = int(time.time() * 1000)

            if self.obey_robots_txt:
                allowed = self.see_robots_txt(url)

                if not allowed:
                    return {
                        'url': url,
                        'status': None,
                        'timestamp': js_timestamp,
                        'content': None,
                        'error': {
                            'name': 'RobotsDenied',
                            'message': 'Robots do not allow this URL',
                        }
                    }

            try:
                status_code, content = self.fetch(url)
            except self.ALLOWED_EXCEPTIONS as e:
                # Recoverable network/transport failure — report, don't raise.
                logger.debug(f"Exception crawling URl {url}: {e}")
                return {
                    'url': url,
                    'status': None,
                    'timestamp': js_timestamp,
                    'content': None,
                    'error': {
                        'name': 'AbortError',
                        'message': str(e),
                    }
                }

            if len(content) == 0:
                return {
                    'url': url,
                    'status': status_code,
                    'timestamp': js_timestamp,
                    'content': None,
                    'error': {
                        'name': 'NoResponseText',
                        'message': 'No response found',
                    }
                }

            try:
                dom = html_to_dom(content, self.DEFAULT_ENCODING, None, self.DEFAULT_ENC_ERRORS)
            except Exception as e:
                logger.exception(f"Error parsing dom: {url}")
                return {
                    'url': url,
                    'status': status_code,
                    'timestamp': js_timestamp,
                    'content': None,
                    'error': {
                        'name': e.__class__.__name__,
                        'message': str(e),
                    }
                }

            # <title> text, if present.
            title_element = dom.xpath("//title")
            title = ""
            if len(title_element) > 0:
                title_text = title_element[0].text
                if title_text is not None:
                    title = title_text.strip()


            # <meta name="description"> content, if present.
            desc_element = dom.xpath("//meta[@name='description']")
            desc = ""
            if len(desc_element) > 0:
                desc_text = desc_element[0].get('content')
                if desc_text is not None:
                    desc = desc_text.strip()

            # Prefer the og:image meta tag; fall back to the first <img src>.
            og_image_element = dom.xpath("//meta[@property='og:image']/@content")
            if og_image_element:
                image = og_image_element[0]
            else:
                image_elements = dom.xpath("//img")
                image_urls = [urljoin(url, img.get("src")) for img in image_elements if img.get("src")]
                if len(image_urls) > 0:
                    image = image_urls[0]
                else:
                    image = ""

            new_links = self.get_urls_from_page(url)
            new_links = list(set(new_links))

            # Normalize extracted keywords to match the format in good_tags:
            # lowercase, punctuation stripped, split on whitespace/hyphens.
            normalized_title = re.split(r'[\s-]+', title.lower().translate(str.maketrans('', '',
                                                                                         string.punctuation)))
            # Filter based on good_tags
            tags = [tag for tag in self.good_tags if tag in normalized_title]
            if not tags:
                tags = ['general']


            # Publication date: try article:published_time meta first.
            date = dom.xpath("string(//meta[@property='article:published_time']/@content)")

            # Fallback: Check JSON-LD for datePublished in <script>
            if not date:
                json_ld_date = dom.xpath("string(//script[@type='application/ld+json'])")
                if json_ld_date:
                    import json
                    try:
                        data = json.loads(json_ld_date)
                        date = data.get("datePublished", "").split("T")[0]
                    except json.JSONDecodeError:
                        pass

            # Fallback: Look for <time> tag with datetime attribute
            if not date:
                date = dom.xpath("string(//time/@datetime)")

            # Fallback: Check for common patterns with 'Last Updated'
            if not date:
                date = dom.xpath("string(//span[contains(text(), 'Last Updated')])")

            # Clean up date format if necessary (for example, strip out extra text)
            if date:
                # Extract date pattern YYYY-MM-DD or similar ("DD Mon, YYYY").
                match = re.search(r"\d{4}-\d{2}-\d{2}", date) or re.search(r"\d{2} \w{3}, \d{4}", date)
                date = match.group(0) if match else date


            return {
                'url': url,
                'content': {
                    'title': title,
                    'desc': desc,
                    'image': image,
                    'tags': tags,
                    'vote': 0,
                    'links': sorted(new_links),
                    'created_at': date if date else 'No date found'
                },
            }

        except Exception as e:
            raise CustomException(e, sys) from e

    def batch_converter(self, lst: list, batch_size: int):
        """
        Convert list into batches of a specified size.

        Args:
            lst (list): list to convert
            batch_size (int): size of the batch
        Yields:
            list: successive slices of *lst*, the last one possibly shorter.
        """
        for i in range(0, len(lst), batch_size):
            yield lst[i:i + batch_size]

    def scrape_parallel(self, batch_size: int):
        """
        Process scrape in parallel using multiprocessing.

        Yields one list of scrape() results per batch of self.urls.
        EPIPE errors (consumer went away) are silently ignored by design.

        Args:
            batch_size (int): Number of URLs to process in each batch.
        Yields:
            list: scrape() result dicts for one batch.
        """
        # Leave one core free for the parent process.
        num_workers = max(1, os.cpu_count() - 1)
        # getting urls in batch
        batch_url = list(self.batch_converter(self.urls, batch_size))

        try:
            # Create a multiprocessing pool
            with multiprocessing.Pool(processes=num_workers) as pool:
                for batch in batch_url:
                    results = pool.map(self.scrape, batch)
                    # data = list([sublist for sublist in results])

                    yield results
                    # Brief pause between batches to ease load on targets.
                    time.sleep(0.1)
        except IOError as e:
            if e.errno == errno.EPIPE:
                pass

    def get_urls_from_page_parallel(self, urls: list, batch_size: int):
        """
        Process get_urls_from_page in parallel using multiprocessing.

        Yields discovered URLs one at a time as batches complete. EPIPE
        errors (consumer went away) are silently ignored by design.

        Args:
            urls (list): List of site URLs to process.
            batch_size (int): Number of URLs to process in each batch.
        Yields:
            str: each discovered URL, incrementally.
        """
        # Leave one core free for the parent process.
        num_workers = max(1, os.cpu_count() - 1)

        # getting urls in batch
        batch_url = list(self.batch_converter(urls, batch_size))

        try:
            # Create a multiprocessing pool
            with multiprocessing.Pool(processes=num_workers) as pool:
                for batch in batch_url:
                    results = pool.map(self.get_urls_from_page, batch)
                    # return list([url for sublist in results for url in sublist])
                    for sublist in results:
                        for url in sublist:
                            yield url # Yield each URL incrementally
                    # Brief pause between batches to ease load on targets.
                    time.sleep(0.1)


        except IOError as e:
            if e.errno == errno.EPIPE:
                pass

    def fetch_tags(self):
        # Accessor for the tag whitelist loaded at construction time.
        return self.good_tags
src/introlix_api/engine/__init__.py ADDED
File without changes
src/introlix_api/engine/api_data.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from introlix_api.engine.third_party_apis import get_devDotTo_data
2
+ from introlix_api.engine.graphql import fetch_hashnode_posts
3
+ from introlix_api.app.database import search_data
4
+ from introlix_api.logger import logger
5
+
6
def _to_entry(item: dict) -> dict:
    """Map one raw API post into the search_data document schema."""
    return {
        "url": item["url"],
        "content": {
            "title": item["title"],
            "desc": item["description"],
            "image": item["image"],
            "tags": item["tags"],
            "vote": 0,
            "created_at": item["created_at"],
        },
        "type": item["type"]
    }


def fetch_data(page: int = 1, per_page: int = 10, tag = ''):
    """
    Function to fetch data from multiple sources and combine them.

    Pulls one page each from dev.to and Hashnode, converts every post to
    the search_data document schema, and returns the combined list
    (dev.to entries first, then Hashnode). Returns [] when both sources
    come back empty.

    Args:
        page (int): 1-based page number passed to both sources.
        per_page (int): page size passed to both sources.
        tag (str): optional tag filter passed to both sources.

    Returns:
        list[dict]: search_data-shaped documents.
    """
    devDotTo_data = get_devDotTo_data(page, per_page, tag)
    hashnode_posts = fetch_hashnode_posts(page=page, per_page=per_page, tag=tag)

    # The three original branches (both / only hashnode / only dev.to) all
    # built the exact same entry dict — collapse them into one pass while
    # preserving the combined ordering (dev.to first, then Hashnode).
    combined_data = (devDotTo_data or []) + (hashnode_posts or [])

    return [_to_entry(item) for item in combined_data]
+ return data
73
+
74
+
75
+
76
def batch_converter(lst: list, batch_size: int):
    """
    Convert list into batches of a specified size.

    Args:
        lst (list): list to split into batches.
        batch_size (int): maximum size of each yielded batch.

    Yields:
        list: successive slices of *lst*; the final slice may be shorter.
    """
    start = 0
    while start < len(lst):
        yield lst[start:start + batch_size]
        start += batch_size
86
+
87
if __name__ == '__main__':
    # One-shot ingest: walk up to 1000 pages of combined API data and insert
    # only documents whose URL is not already present in search_data.
    for page_no in range(1, 1001):
        data = fetch_data(page=page_no)
        if data:
            for batch in batch_converter(data, batch_size=100):
                # BUG FIX: build the lookup from the current batch only.
                # Previously `urls` came from the whole page's `data`, so the
                # same full URL set was re-queried for every batch.
                urls = [d["url"] for d in batch]

                # Single round-trip duplicate check for this batch.
                existing_urls = {doc["url"] for doc in search_data.find({"url": {"$in": urls}})}

                for d in batch:
                    if d["url"] not in existing_urls:
                        search_data.insert_one(d)
        else:
            logger.debug("No data to save")
src/introlix_api/engine/discussion.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from introlix_api.engine.third_party_apis import get_stack_overflow_data
2
+ from introlix_api.utils.tags import fetch_tags
3
+ from introlix_api.logger import logger
4
+ from introlix_api.app.database import search_data
5
+
6
def fetch_discussion(page: int = 1, per_page: int = 10, tag: str = ''):
    """
    Fetch Stack Overflow questions and reshape them into the document
    format used by the search-data collection.

    Args:
        page (int): page number to request from the Stack Overflow API
        per_page (int): number of questions per page
        tag (str): Stack Overflow tag to filter questions by

    Returns:
        list: documents with `url`, a nested `content` dict, and `type`.
    """
    questions = get_stack_overflow_data(page=page, per_page=per_page, tag=tag)

    return [
        {
            "url": question["url"],
            "content": {
                "title": question["title"],
                "tags": question["tags"],
                "vote": 0,
                "created_at": question["created_at"],
                "answer_count": question["answer_count"],
            },
            "type": question["type"],
        }
        for question in questions
    ]
28
+
29
if __name__ == '__main__':
    # Fetch discussions per interest tag and store only unseen URLs.
    for tag in fetch_tags():
        data = fetch_discussion(page=1, per_page=10, tag=tag)
        if not data:
            logger.debug("No data to save")
            continue

        fetched_urls = [entry["url"] for entry in data]

        # URLs already present in the collection for this tag's results.
        known_urls = {doc["url"] for doc in search_data.find({"url": {"$in": fetched_urls}})}

        for entry in data:
            if entry["url"] not in known_urls:
                search_data.insert_one(entry)
src/introlix_api/engine/graphql.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+
3
def fetch_hashnode_posts(page=1, per_page = 10, tag = ''):
    """
    Fetch posts from the Hashnode GraphQL API for the blog.developerdao.com
    publication, following pagination cursors until enough posts are
    collected to serve the requested page.

    Args:
        page (int): 1-based page of results to return
        per_page (int): number of posts per page
        tag (str): only 'bitcoin' and 'web3' are served from this
            publication; any other tag yields an empty list

    Returns:
        list: post nodes for the requested page. Empty on error or for
        unsupported tags (the original implicitly returned None for
        unsupported tags, which is equally falsy for callers).
    """
    all_posts = []
    has_next_page = True
    end_cursor = None
    posts_per_page = per_page  # Number of posts per page

    # Number of posts to skip based on the requested page.
    skip_count = (page - 1) * posts_per_page

    while has_next_page:
        # Construct the GraphQL query, resuming from the previous cursor.
        query = {
            "query": f"""
            query Publication {{
                publication(host: "blog.developerdao.com") {{
                    title
                    posts(first: {posts_per_page}, after: {f'"{end_cursor}"' if end_cursor else 'null'}) {{
                        edges {{
                            node {{
                                title
                                brief
                                url
                                publishedAt
                                tags {{
                                    id
                                    name
                                }}
                                coverImage {{
                                    url
                                }}
                            }}
                        }}
                        pageInfo {{
                            endCursor
                            hasNextPage
                        }}
                    }}
                }}
            }}"""
        }

        # Make the POST request to the Hashnode GraphQL endpoint.
        response = requests.post("https://gql.hashnode.com/", json=query)

        if response.status_code == 200:
            payload = response.json()
            # GraphQL errors come back with HTTP 200 but no 'data' payload;
            # stop gracefully instead of raising KeyError (original crashed).
            publication = (payload.get('data') or {}).get('publication')
            if publication is None:
                break

            posts = publication['posts']['edges']
            all_posts.extend(edge['node'] for edge in posts)

            # Update pagination info.
            page_info = publication['posts']['pageInfo']
            end_cursor = page_info['endCursor']
            has_next_page = page_info['hasNextPage']

            # Stop once we've fetched enough posts for the requested page.
            if len(all_posts) >= skip_count + posts_per_page:
                break
        else:
            print(f"Error: {response.status_code} - {response.text}")
            break

    # Only these topics are published on this host; everything else is empty.
    if tag in ('bitcoin', 'web3'):
        return all_posts[skip_count:skip_count + posts_per_page]
    return []
src/introlix_api/engine/third_party_apis.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import requests
4
+ from introlix_api.logger import logger
5
+ from introlix_api.exception import CustomException
6
+
7
+ # Define the URL of the API endpoint
8
+ DEV_DOT_TO_API = "https://dev.to/api/articles?tag={}&page={}&per_page={}"
9
+
10
def get_devDotTo_data(page: int = 1, per_page: int = 10, tag: str = '') -> list:
    """
    Fetch articles from the dev.to API.

    Args:
        page (int): page number to request
        per_page (int): number of articles per page
        tag (str): dev.to tag to filter articles by (annotation fixed from
            the erroneous `int`)

    Returns:
        list: article dicts (title, description, url, tags, image,
        created_at, type="article"); empty list when the request fails.

    Raises:
        CustomException: wraps any unexpected error (network, bad JSON, ...).
    """
    try:
        # Construct the URL with the provided parameters.
        url = DEV_DOT_TO_API.format(tag, page, per_page)
        response = requests.get(url)

        # Bail out on a failed request instead of attempting to parse the
        # error payload (the original only logged, then crashed downstream).
        if response.status_code != 200:
            logger.debug(f"Failed to fetch data from dev.to: {response.status_code}")
            return []

        # Convert the response to JSON.
        articles = response.json()

        extracted_articles = [
            {
                "title": article["title"],
                "description": article["description"],
                "url": article["url"],
                "tags": article["tag_list"],
                "image": article["cover_image"],
                "created_at": article["created_at"],
                "type": "article"
            }
            for article in articles
        ]

        return extracted_articles

    except Exception as e:
        raise CustomException(e, sys) from e
42
+
43
def get_github_repo(page: int = 1, per_page: int = 10, tag: str = '') -> list:
    """
    Fetch repositories from the GitHub search API, ordered by stars.

    Args:
        page (int): page number to request
        per_page (int): number of repositories per page
        tag (str): GitHub topic to search for (annotation fixed from the
            erroneous `int`)

    Returns:
        list: repo dicts (name, description, url, stars, created_at, type);
        empty list when the request fails.

    Raises:
        CustomException: wraps any unexpected error (network, bad JSON, ...).
    """
    try:
        # Construct the URL with the provided parameters.
        url = f"https://api.github.com/search/repositories?q=topic:{tag}&sort=stars&page={page}&per_page={per_page}"
        response = requests.get(url)

        # Bail out on a failed request instead of attempting to parse the
        # error payload (the original only logged, then crashed on "items").
        if response.status_code != 200:
            logger.debug(f"Failed to fetch data from GitHub: {response.status_code}")
            return []

        # Convert the response to JSON.
        repos = response.json()

        extracted_repos = [
            {
                "name": repo["name"],
                "description": repo["description"],
                "url": repo["html_url"],
                "stars": repo["stargazers_count"],
                "created_at": repo["created_at"],
                # NOTE(review): repositories are tagged "article" like dev.to
                # posts — confirm this is intentional rather than "repo".
                "type": "article"
            }
            for repo in repos["items"]
        ]

        return extracted_repos

    except Exception as e:
        raise CustomException(e, sys) from e
74
+
75
def get_stack_overflow_data(page: int = 1, per_page: int = 10, tag: str = '') -> list:
    """
    Fetch questions from the Stack Overflow (Stack Exchange) API.

    Args:
        page (int): page number to request
        per_page (int): number of questions per page
        tag (str): Stack Overflow tag to filter by (annotation fixed from
            the erroneous `int`)

    Returns:
        list: question dicts (title, url, tags, created_at, answer_count,
        type="discussion"); empty list when the request fails.

    Raises:
        CustomException: wraps any unexpected error (network, bad JSON, ...).
    """
    try:
        # Construct the URL with the provided parameters.
        url = f"https://api.stackexchange.com/2.3/questions?order=desc&sort=activity&tagged={tag}&site=stackoverflow&page={page}&pagesize={per_page}"
        response = requests.get(url)

        # Bail out on a failed request instead of attempting to parse the
        # error payload (the original only logged, then crashed on "items").
        if response.status_code != 200:
            logger.debug(f"Failed to fetch data from Stack Overflow: {response.status_code}")
            return []

        # Convert the response to JSON.
        questions = response.json()

        extracted_questions = [
            {
                "title": question["title"],
                "url": question["link"],
                "tags": question["tags"],
                # NOTE(review): creation_date is a Unix epoch int here, while
                # the other sources use ISO strings — confirm downstream
                # consumers handle both.
                "created_at": question["creation_date"],
                "answer_count": question["answer_count"],
                "type": "discussion"
            }
            for question in questions["items"]
        ]

        return extracted_questions

    except Exception as e:
        raise CustomException(e, sys) from e
106
+
107
if __name__ == "__main__":
    # Quick manual smoke test: print one page of Python-tagged questions.
    print(get_stack_overflow_data(1, 10, tag='python'))
src/introlix_api/engine/youtube.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import httpx
3
+ import asyncio
4
+ import time
5
+ from introlix_api.utils.tags import fetch_tags
6
+ from dotenv import load_dotenv
7
+ from cachetools import TTLCache
8
+
9
+ load_dotenv()
10
+ YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
11
+
12
+ # Cache with TTL of 6 hours (21600 seconds)
13
+ cache = TTLCache(maxsize=100, ttl=21600)
14
+
15
async def get_youtube_videos():
    """
    Query the YouTube search API once per interest tag and collect the raw
    JSON responses, serving repeat tags from the module-level 6-hour TTL
    cache.

    Returns:
        list: one YouTube search-API payload per tag (cached or fresh).
    """
    search_endpoint = "https://www.googleapis.com/youtube/v3/search"
    collected = []

    for tag in fetch_tags():
        # Serve from the TTL cache when this tag was fetched recently.
        if tag in cache:
            collected.append(cache[tag])
            continue

        query_params = {
            "key": YOUTUBE_API_KEY,
            "part": "snippet",
            "q": tag,
            "type": "video",
            "maxResults": 5,
            "order": "viewCount"
        }

        async with httpx.AsyncClient() as client:
            try:
                response = await client.get(search_endpoint, params=query_params)
                response.raise_for_status()
                payload = response.json()
                collected.append(payload)
                cache[tag] = payload  # remember for subsequent calls
            except httpx.HTTPStatusError as e:
                print(f"HTTP error for tag '{tag}': {e}")
                await asyncio.sleep(1)
            except Exception as e:
                print(f"Unexpected error: {e}")

        # Small delay between requests to stay under API rate limits.
        await asyncio.sleep(0.5)

    return collected
49
+
50
async def main():
    """Manual entry point: fetch the per-tag video payloads and print them."""
    data = await get_youtube_videos()
    print(data)

# Only run when executed as a script. The original called asyncio.run() at
# module level, so merely importing this module performed network I/O and
# would crash inside an already-running event loop (e.g. under FastAPI).
if __name__ == "__main__":
    asyncio.run(main())
src/introlix_api/exception/__init__.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from introlix_api.logger import logger
4
+
5
def error_message_detail(error, error_detail):
    """
    Build a descriptive error message (source file, line number, original
    error), log it, and return it.

    Args:
        error: the original exception or error message
        error_detail: the `sys` module (its exc_info() supplies the
            traceback of the currently handled exception)

    Returns:
        str: the formatted error message
    """
    _, _, traceback_obj = error_detail.exc_info()
    source_file = traceback_obj.tb_frame.f_code.co_filename
    source_line = traceback_obj.tb_lineno

    error_message = "Error occured in file called [{0}] line number: [{1}] error message: [{2}]".format(
        source_file, source_line, str(error)
    )

    # Record the failure in the shared application log.
    logger.info(error_message)

    return error_message
26
+
27
class CustomException(Exception):
    """
    Project-wide exception that enriches the original error with the file
    name and line number where it occurred (via error_message_detail),
    logging the detailed message as a side effect of construction.
    """

    def __init__(self, error_message, error_detail):
        # error_detail is expected to be the `sys` module so the active
        # traceback can be inspected.
        super().__init__(error_message)
        self.error_message = error_message_detail(error_message, error_detail=error_detail)

    def __str__(self):
        # Show the enriched message rather than the raw error text.
        return self.error_message
34
+
src/introlix_api/logger/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
import os
from datetime import datetime

"""
Configure the shared application logger, writing every record to a log
file inside the `logs` directory (created on import if missing).
"""

# Single rolling log file (stray f-string prefix removed: no placeholders).
LOG_FILE = "running_logs.log"
logs_path = os.path.join(os.getcwd(), "logs")
os.makedirs(logs_path, exist_ok=True)


LOG_FILE_PATH = os.path.join(logs_path, LOG_FILE)

logging.basicConfig(
    filename=LOG_FILE_PATH,
    format="[ %(asctime)s ] %(lineno)d %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Shared logger instance used across the package.
logger = logging.getLogger("introlixLogger")
src/introlix_api/ml/__init__.py ADDED
File without changes
src/introlix_api/ml/model.py ADDED
File without changes
src/introlix_api/ml/recommendation.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import numpy as np
4
+ from sentence_transformers import SentenceTransformer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ from introlix_api.exception import CustomException
7
+ from introlix_api.logger import logger
8
+ from introlix_api.app.appwrite import get_interests
9
+
10
class Recommendation:
    def __init__(self, user_interests: list, articles: list):
        """
        Recommendation system for articles using sentence-transformers and cosine similarity

        Args:
            user_interests (list): list of user interests
            articles (list): list of all articles
        """
        # NOTE(review): the `user_interests` constructor argument is
        # overwritten a few lines below by the values fetched from
        # get_interests(); the parameter is effectively ignored — confirm
        # whether callers rely on passing their own interests.
        self.user_interests = user_interests
        self.articles = articles
        self.recommendations = []
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.response = get_interests()
        self.user_interests = [interest['interest'] for interest in self.response]
        # Map keyed on the segment AFTER the ':' in each interest string
        # (assumes entries look like "prefix:name" — TODO confirm format).
        self.interest_keywords = {item['interest'].split(':')[1]: item['keywords'] for item in self.response}

    def encode(self, texts: list):
        """
        Function to encode text into embeddings using sentence-transformers

        Args:
            texts (list): list of text to be encoded
        Returns:
            encoded embedding values
        """
        try:
            return self.model.encode(texts)
        except Exception as e:
            raise CustomException(e, sys)

    def recommend(self):
        """
        Function to recommend aritcles based on user interests

        Args:
            None
        Returns:
            list of recommended articles
        """

        # Initialize new interests
        new_interests = self.user_interests.copy() # Start with the old
        # Keep only the segment BEFORE the ':' of each interest string.
        new_interests = [item.split(':')[0] for item in new_interests]


        # Adding keywords to user interests based on existing interests
        # NOTE(review): this membership test uses the FULL interest string
        # ("prefix:name"), but interest_keywords is keyed on the post-':'
        # segment only — the lookup may never match. Verify intended keys.
        for interest in self.user_interests:
            if interest in self.interest_keywords:
                # Append related keywords to new_interests
                new_interests.extend(self.interest_keywords[interest])

        # Remove duplicates if needed
        new_interests = list(set(new_interests))


        # encoding user interests into embeddings
        # print(f"Here is user interest keywords: {self.interest_keywords}")
        user_interests_embeddings = self.encode(new_interests)
        user_interests_embeddings = np.mean(user_interests_embeddings, axis=0) # Averaging embeddings

        # Reshape user embedding to (1, -1) for compatibility with cosine_similarity
        user_interests_embeddings = user_interests_embeddings.reshape(1, -1)

        # encoding all articles into embeddings
        article_embeddings = self.encode(self.articles)

        # print(f"Shape of user_interests_embeddings: {user_interests_embeddings.shape}")
        # print(f"Shape of article_embeddings: {article_embeddings.shape}")

        # calculate cosine similarity between user interests and all article embeddings
        similarities = cosine_similarity(user_interests_embeddings, article_embeddings).flatten()

        # sort articles based on similarity
        recommended_indices = np.argsort(similarities)[::-1]

        # Get all recommended articles sorted by similarity
        recommended_articles = [self.articles[i] for i in recommended_indices]

        return recommended_articles
src/introlix_api/pipeline/__init__.py ADDED
File without changes
src/introlix_api/pipeline/common_pipeline.py ADDED
File without changes
src/introlix_api/pipeline/periodic_pipeline.py ADDED
File without changes
src/introlix_api/utils/__init__.py ADDED
File without changes