diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..0cadb6c163974887290425c4c7b55ce8cef6f274 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,34 @@ +name: Sync to Hugging Face hub +on: + push: + branches: [main] + + # To run this workflow manually from the Actions tab + workflow_dispatch: + +jobs: + sync-to-hub: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + lfs: true + + - name: Set up Git user + run: | + git config --global user.email "tubex998@gmail.com" + git config --global user.name "satyam998" + + - name: Create a new branch + run: | + git checkout --orphan temp + git add -A + git commit -m "Initial commit" + git branch -D main + git branch -m main + + - name: Force push to hub + env: + HF: ${{ secrets.HF_TOKEN }} + run: git push --force https://satyam998:$HF@huggingface.co/spaces/satyam998/introlix_api main \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..fc82b55fe6fdd8072c9e9e03ecadc550890dacc3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,162 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ \ No newline at end of file diff --git a/.idx/dev.nix b/.idx/dev.nix new file mode 100644 index 0000000000000000000000000000000000000000..b85ca17d796f3d400031c06b65fdd6afe1ab38c8 --- /dev/null +++ b/.idx/dev.nix @@ -0,0 +1,55 @@ +# To learn more about how to use Nix to configure your environment +# see: https://developers.google.com/idx/guides/customize-idx-env +{ pkgs, ... }: { + # Which nixpkgs channel to use. 
+ channel = "stable-23.11"; # or "unstable" + + # Use https://search.nixos.org/packages to find packages + packages = [ + # pkgs.go + pkgs.python311 + pkgs.python311Packages.pip + # pkgs.nodejs_20 + # pkgs.nodePackages.nodemon + ]; + + # Sets environment variables in the workspace + env = {}; + idx = { + # Search for the extensions you want on https://open-vsx.org/ and use "publisher.id" + extensions = [ + # "vscodevim.vim" + ]; + + # Enable previews + previews = { + enable = true; + previews = { + # web = { + # # Example: run "npm run dev" with PORT set to IDX's defined port for previews, + # # and show it in IDX's web preview panel + # command = ["npm" "run" "dev"]; + # manager = "web"; + # env = { + # # Environment variables to set for your server + # PORT = "$PORT"; + # }; + # }; + }; + }; + + # Workspace lifecycle hooks + workspace = { + # Runs when a workspace is first created + onCreate = { + # Example: install JS dependencies from NPM + # npm-install = "npm install"; + }; + # Runs when the workspace is (re)started + onStart = { + # Example: start a background task to watch and re-build backend code + # watch-backend = "npm run watch-backend"; + }; + }; + }; +} diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..cea0d810f7a081c97fc1fb7ca45de7dffdb2fc79 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.10 + +RUN useradd -m -u 1000 user + +WORKDIR /app + +COPY --chown=user . 
/app + +RUN pip install -r requirements.txt + +RUN mkdir -p /app/logs +RUN chmod 777 /app/logs + +# Copy the shell script into the container +COPY start.sh /app/start.sh +RUN chmod +x /app/start.sh + +# Use the shell script to start both processes +CMD ["/bin/bash", "/app/start.sh"] \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a6e4e37c1a031ce258eceaeecb748e55465ee136 --- /dev/null +++ b/README.md @@ -0,0 +1,12 @@ +--- +title: Introlix API +emoji: 🔥 +colorFrom: green +colorTo: blue +sdk: docker +pinned: false +license: apache-2.0 +--- + +# Introlix API +

Introlix API offers a comprehensive suite of tools and APIs used by Introlix Feed. It integrates multiple external APIs, RSS feed crawlers, and other data sources to provide a robust and efficient backend service.

\ No newline at end of file diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..1e168dc11c91c0f8e6eff0159b06b8f251d4ed6f --- /dev/null +++ b/app.py @@ -0,0 +1,205 @@ +from fastapi import FastAPI, Query, HTTPException +from bson import ObjectId +import sys +import httpx +import os +import crawler +from fastapi.middleware.cors import CORSMiddleware +from starlette.responses import RedirectResponse +from introlix_api.app.routes import auth, posts, run_spider, similarity +from typing import List +from dotenv import load_dotenv, dotenv_values + +from introlix_api.app.appwrite import databases, APPWRITE_DATABASE_ID, ID, APPWRITE_ACCOUNT_COLLECTION_ID, get_interests +from introlix_api.app.database import startup_db_client, shutdown_db_client +from introlix_api.ml.recommendation import Recommendation +from introlix_api.utils.tags import fetch_tags + +from introlix_api.exception import CustomException + +from contextlib import asynccontextmanager + +from pydantic import BaseModel, Field + +load_dotenv() + +YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY") + +class FeedModel(BaseModel): + id: str = Field(..., alias="_id") + title: str + desc: str + url: str + publication_date: str + image_url: str + category: str + source: str + +@asynccontextmanager +async def lifespan(app: FastAPI): + # Start the database connection + await startup_db_client(app) + yield + # Close the database connection + await shutdown_db_client(app) + +app = FastAPI(lifespan=lifespan) + +origins = [ + "http://localhost:3000", + "http://192.168.1.64:3000", + "https://introlixfeed.vercel.app/", + "https://introlixfeed.vercel.com/" + # Add other allowed origins here if needed +] + +app.add_middleware( + CORSMiddleware, + allow_origins=origins, # Specify allowed origins + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +@app.get("/", tags=["authentication"]) +async def index(): + return RedirectResponse(url='/docs') + 
+@app.get("/feed_data", response_model=List[FeedModel]) +async def get_feed_data(page: int = 1, limit: int = 20, user_id: str = Query(...), category=None): + try: + skip = (page - 1) * limit + + response = get_interests() + user_interests = [] + # getting only the interests not keywords + for interest in response: + user_interests.append(interest['interest']) + + users = databases.list_documents( + database_id=APPWRITE_DATABASE_ID, + collection_id=APPWRITE_ACCOUNT_COLLECTION_ID + ) + + for doc in users['documents']: + if user_id == doc['$id']: + user_interests = doc['interests'] + + user_interests = [item.split(':')[0] for item in user_interests] + # response = await app.mongodb['feedData'].find({"category": {"$in": user_interests}}).skip(skip).limit(limit).to_list(limit) + + + + # Perform the aggregation + if category == None: + response = await app.mongodb['feedData'].find({"category": {"$in": user_interests}}).skip(skip).limit(limit).to_list(limit) + else: + response = await app.mongodb['feedData'].find({"category": category}).skip(skip).limit(limit).to_list(limit) + + # random.shuffle(response) + + # Filter out items that do not have a title + response = [item for item in response if item.get('title')] + response = [item for item in response if item.get('desc')] + + article_titles = [item['title'] for item in response] + recommendation_system = Recommendation(user_interests, article_titles) + recommended_titles = recommendation_system.recommend() + + response = [post for post in response if post['title'] in recommended_titles] + + + for item in response: + item['_id'] = str(item['_id']) + item['title'] = item.get('title') or '' + item['desc'] = item.get('desc') or '' + item['url'] = item.get('url') or '' + item['publication_date'] = item.get('publication_date') or '' + item['image_url'] = item.get('image_url') or '' + item['category'] = item.get('category') or '' + item['source'] = item.get('source') or '' + + return response + except Exception as e: + raise 
CustomException(e, sys) from e + +@app.get("/fetch_post", response_model=FeedModel) +async def get_feed_data(post_id: str = Query(...)): + try: + post_id = ObjectId(post_id) + response = await app.mongodb['feedData'].find_one({"_id": post_id}) + + if not response: + raise HTTPException(status_code=404, detail="Post not found") + + # Convert _id to string + response["_id"] = str(response["_id"]) + + # Check for null values and set defaults if needed + response["desc"] = (response.get("desc") or "No Description")[:90] + response["publication_date"] = response.get("publication_date") or "Unknown Date" + response["image_url"] = response.get("image_url") or "No Image URL" + response["category"] = response.get("category") or "Uncategorized" + response["source"] = response.get("source") or "Unknown Source" + + # for item in response: + # item['title'] = item.get('title') or '' + # item['desc'] = item.get('desc') or '' + # item['url'] = item.get('url') or '' + # item['publication_date'] = item.get('publication_date') or '' + # item['image_url'] = item.get('image_url') or '' + # item['category'] = item.get('category') or '' + # item['source'] = item.get('source') or '' + + return response + except Exception as e: + raise CustomException(e, sys) from e + +@app.get("/test_recommendation") +async def test_recommendation( + user_interests: list[str] = Query(..., description="Comma-separated list of user interests"), + articles: list[str] = Query(..., description="Comma-separated list of articles") +): + """ + Test endpoint for recommendations. + Takes user interests and articles as query parameters and returns recommended articles. 
+ """ + + # Create a recommendation instance + recommendation = Recommendation(user_interests, articles) + + # Get the recommended articles + recommended_articles = recommendation.recommend() + + return { + "user_interests": user_interests, + "recommended_articles": recommended_articles, + } + +@app.get("/youtube/videos") +async def get_youtube_videos(query: str = None): + url = "https://www.googleapis.com/youtube/v3/search" + params = { + "key": YOUTUBE_API_KEY, + "part": "snippet", + "q": query or "trending", + "type": "video", + "maxResults": 10, + "order": "viewCount" # You can change this to 'date' for recent uploads + } + + async with httpx.AsyncClient() as client: + response = await client.get(url, params=params) + response.raise_for_status() # Raise an error for bad responses + return response.json() + +@app.get("/tags") +async def get_tags(): + tags = fetch_tags() + return tags + +app.include_router(auth.router, prefix="/auth") +app.include_router(run_spider.router, prefix="/spider") +app.include_router(similarity.router, prefix="/feed") +app.include_router(crawler.router) +app.include_router(posts.router) \ No newline at end of file diff --git a/crawler.py b/crawler.py new file mode 100644 index 0000000000000000000000000000000000000000..da0b45597c3802b1bb3ba37099a6ea97ca7d2f22 --- /dev/null +++ b/crawler.py @@ -0,0 +1,222 @@ +import re +import sys +import time +from urllib.parse import urlparse +from fastapi import APIRouter, HTTPException, Query +from introlix_api.crawler.bot import IntrolixBot, BotArgs +from introlix_api.exception import CustomException +from introlix_api.logger import logger +from introlix_api.utils.root_sites import root_sites +from introlix_api.app.database import search_data, db +from introlix_api.app.appwrite import fetch_root_sites, fetch_saved_urls, save_urls +from pymongo import ASCENDING +from pymongo.errors import DuplicateKeyError + +router = APIRouter() + +BATCH_SIZE = 10 +urls_batch = [] +storage_threshold = 500 * 1024 * 
1024 +delete_batch = 1000 + +def filter_urls(url: str) -> bool: + """ + A function to filter non article urls from the scraped urls + Args: + url (list): url + Returns: + bool: True if the url is article url else False + """ + parsed_url = urlparse(url) + + if parsed_url.path in ('', '/'): + return False + + non_article_keywords = [ + "/product", "/products", "/home", "/item", "/items", "/category", "/categories", + "/login", "/signin", "/logout", "/signup", "/register", "/account", "/user", + "/profile", "/dashboard", "/settings", "/preferences", "/order", "/orders", + "/cart", "/checkout", "/payment", "/subscribe", "/subscription", + "/contact", "/support", "/help", "/faq", "/about", "/privacy", "/terms", + "/policy", "/conditions", "/legal", "/service", "/services", "/guide", + "/how-to", "/pricing", "/price", "fees", "/plans", "/features", "/partners", + "/team", "/careers", "/jobs", "/join", "/apply", "/training", "/demo", + "/trial", "/download", "/install", "/app", "/apps", "/software", "/portal", + "/index", "/main", "/video", "/videos", "/photo", "/photos", + "/image", "/images", "/gallery", "/portfolio", "/showcase", "/testimonials", + "/reviews", "/search", "/find", "/browse", "/list", "/tags", "/explore", + "/new", "/trending", "/latest", "/promotions", "/offers", "/deals", "/discount", + "/coupon", "/coupons", "/gift", "/store", "/stores", "/locator", "/locations", + "/branches", "/events", "/webinar", "/calendar", "/schedule", + "/class", "/classes", "/lesson", "/lessons", "/training", "/activity", + "/activities", "/workshop", "/exhibit", "/performance", "/map", "/directions", + "/weather", "/traffic", "/rates", "/auction", "/bid", "/tender", "/investment", + "/loan", "/mortgage", "/property", "/real-estate", "/construction", "/project", + "/client", "/clients", "/partner", "/sponsor", "/media", "/press", "/releases", + "/announcements", "/newsroom", "/resources", "courses", "collections", "/u/", "/members/", + "/@", "/shop", "/wiki", "/author", 
"/dynamic", "/image", "/submit" # TODO: need to add more + ] + + article_keywords = [ + "/blog/", "post", "article", "insights", "guide", "tutorial", + "how-to", "what", "how", "introduction", "/news/" + ] + + article_pattern = [ + r'/(/blog/|article|articles|post|posts|blogs|news|)/\d{4}/\d{2}/+[a-z0-9-]+/?', + r'/(/blog/|article|articles|post|posts|blogs|news|)/[a-z0-9-]+/[a-z0-9-]+', + r'(? 2: + return True + + return False + +def save_to_db(data): + global urls_batch + try: + # Check database storage size and delete old documents if needed + stats = db.command("collStats", "search_data") + storage_size = stats['size'] + + if storage_size >= storage_threshold: + oldest_docs = search_data.find().sort("createdAt", ASCENDING).limit(delete_batch) + oldest_ids = [doc['_id'] for doc in oldest_docs] + search_data.delete_many({"_id": {"$in": oldest_ids}}) + + # Prepare list of URLs to check in the database + urls = [d["url"] for d in data if filter_urls(d["url"])] + + # Retrieve existing URLs from the database to filter out duplicates + existing_urls = set(search_data.find({"url": {"$in": urls}}).distinct("url")) + + # Filter out documents with URLs that already exist in the database + unique_data = [ + {"url": d["url"], "content": d["content"], "type": "article"} + for d in data + if d["url"] not in existing_urls and d.get("content") is not None + ] + + # Insert only unique documents + if unique_data: + try: + search_data.insert_many(unique_data) + except DuplicateKeyError as e: + logger.info("Duplicate URL detected during insertion. 
Skipping duplicate entries.") + + # Process URLs in `urls_batch` if it has URLs + if urls_batch: + try: + save_urls(urls_batch) + except Exception as e: + logger.error(f"Error saving URLs to Appwrite: {str(e)}") + urls_batch.clear() + + except Exception as e: + raise CustomException(e, sys) from e + +def extract_urls(batch_size=BATCH_SIZE): + # Fetch documents with required fields only, reducing memory footprint per document + documents = search_data.find({}, {"content.links": 1}) + + # Initialize a list to store URLs in batches + batch_urls = [] + + for doc in documents: + # Extract URLs only if 'content' and 'links' exist + links = doc.get("content", {}).get("links") + if links: + # Use a generator to iterate over links directly + for url in links: + batch_urls.append(url) + # Yield URLs in batches to control memory usage + if len(batch_urls) >= batch_size: + yield batch_urls + batch_urls = [] # Clear the batch after yielding + + # Yield any remaining URLs + if batch_urls: + yield batch_urls + +def crawler(urls_batch): + try: + bot = IntrolixBot(urls=urls_batch, args=BotArgs) + + # Process each batch of scraped data + for data_batch in bot.scrape_parallel(batch_size=BATCH_SIZE): + save_to_db(data_batch) + + except Exception as e: + raise CustomException(e, sys) from e + +def run_crawler_continuously(): + global urls_batch + try: + while True: + start_time = time.time() # Record the start time + + while (time.time() - start_time) < 600: # Run for 10 minutes (600 seconds) + try: + root_urls = fetch_root_sites() + saved_urls = fetch_saved_urls() + except Exception as e: + logger.info("Error fetching URLs from Appwrite: %s", str(e)) + root_urls = [] + saved_urls = [] + + if root_urls and saved_urls: + urls = root_urls + saved_urls + urls = list(set(urls)) + else: + urls = root_sites() + urls_batch + + if urls: + logger.info(f"Starting crawler with {len(urls)} root URLs") + crawler(urls[::-1]) + + + # Extract and process URLs in batches + for extracted_urls in 
extract_urls(batch_size=BATCH_SIZE): + urls_batch.extend(list(set(extracted_urls))) + # logger.info(f"Starting crawler with {len(set(urls_batch))} extracted URLs from MongoDB") + # crawler(list(set(urls_batch))) + time.sleep(1) + + time.sleep(1) + + # After 10 minutes, the while loop will restart without any pause + logger.info("Restarting the crawler for another 10-minute session.") + except Exception as e: + raise CustomException(e, sys) from e + + +@router.post('/crawler') +def run_crawler(): + try: + run_crawler_continuously() + except Exception as e: + raise HTTPException(status_code=400, detail=str(e)) + + +if __name__ == "__main__": + while True: + start_time = time.time() + while (time.time() - start_time) < 600: + run_crawler_continuously() +# # urls = extract_urls() +# # print(urls) \ No newline at end of file diff --git a/demo.py b/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..801c64569cba6516f36fa7d0deafe68f6a0f60c3 --- /dev/null +++ b/demo.py @@ -0,0 +1,38 @@ +#import csv +# from introlix_api.app.database import feed_data + +# data = feed_data.find({}, {"_id": 0, "title": 1}) # Exclude _id, include only title + +# # Specify the CSV file to write to +# csv_file = 'feed_data_titles.csv' + +# # Write data to a CSV file +# with open(csv_file, mode='w', newline='', encoding='utf-8') as file: +# writer = csv.writer(file) + +# # Write header (just the title field) +# writer.writerow(["title"]) + +# # Write each document's title to the CSV +# for document in data: +# writer.writerow([document.get("title")]) + +# print(f"Title data successfully saved to {csv_file}") +# from introlix_api.crawler.bot import IntrolixBot, BotArgs +# import time + +# start = time.time() +# inbot = IntrolixBot(args=BotArgs, urls=["https://www.wikipedia.org/", "https://medium.com/", "https://www.bbc.com/"]) + +# print(inbot.crawl(batch_size=1048)) +# # end = time.time() +# print(f"Time taken: {end - start}") + +# from introlix_api.app.appwrite import 
fetch_root_sites + +# print(len(set(fetch_root_sites()))) +# Access the scraped data +# for index, page_data in enumerate(inbot.data): +# print(f"Page {index + 1}:") +# print(page_data) +# print('-' * 40) diff --git a/docs/design.md b/docs/design.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/requirements.md b/docs/requirements.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/main.py b/main.py new file mode 100644 index 0000000000000000000000000000000000000000..ee608fcf9b5b4638a38ab77ce1b6dee9126a5561 --- /dev/null +++ b/main.py @@ -0,0 +1,40 @@ +import subprocess + +def run_app(): + command = ["scrapy", "crawl", "generic"] + working_directory = "src/introlix_api/app/introlix_spider" + + result = subprocess.run(command, cwd=working_directory, capture_output=True, text=True) + + print("Output:", result.stdout) + print("Error:", result.stderr) + +if __name__ == "__main__": + # running the spider + run_app() + + # def run_get_urls_from_page_parallel(self, urls: list, max_workers: int=10) -> list: + # """ + # Running get_urls_from_page function in parallel for many runs. + + # Args: + # urls (list): list of urls + # max_workers (int, optional): number of workers. Defaults to 10. 
+ # Returns: + # list: list of fetched urls + # """ + # fetched_urls = [] + + # with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + # futures = {executor.submit(self.get_urls_from_page, url): url for url in urls} + + # for future in concurrent.futures.as_completed(futures): + # url = futures[future] + + # try: + # result = future.result() + # fetched_urls.append(result) + # except Exception as e: + # raise CustomException(e, sys) from e + + # return list(set(list(url for sublist in fetched_urls if sublist is not None for url in sublist))) \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..7cc2ce0ef2d0354f60fb0f56d5a1a915bbb90bb9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,22 @@ +numpy +pandas +scrapy +fastapi +uvicorn +Jinja2 +appwrite +python-dotenv +pymongo +aiohttp +motor +httpx +torch +scikit-learn +beautifulsoup4 +sentence-transformers +nltk +algoliasearch +apscheduler +cachetools + +-e . 
\ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..2a0e51a53b77df07d0ff7d04a0857dea7d9247d7 --- /dev/null +++ b/setup.py @@ -0,0 +1,18 @@ +from setuptools import setup +import setuptools + +with open("README.md", "r", encoding="utf-8") as f: + long_description = f.read() + +setup( + name="introlix_api", + version="0.0.1", + author="Satyam Mishra", + author_email="tubex998@gmail.com", + description="Introlix API offers a comprehensive suite of tools and APIs utilized in Introlix Feed.", + long_description=long_description, + long_description_content_type="text/markdown", + package_dir={"": "src"}, + packages=setuptools.find_packages(where="src"), + python_requires=">=3.10", +) \ No newline at end of file diff --git a/src/introlix_api/app/__init__.py b/src/introlix_api/app/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/introlix_api/app/algolia.py b/src/introlix_api/app/algolia.py new file mode 100644 index 0000000000000000000000000000000000000000..5bd6e6910a383510dadcb599b2ced15c2c799e94 --- /dev/null +++ b/src/introlix_api/app/algolia.py @@ -0,0 +1,82 @@ +import os +import json +import asyncio +from bson import ObjectId +from apscheduler.schedulers.asyncio import AsyncIOScheduler +from introlix_api.app.database import search_data +from algoliasearch.search.client import SearchClientSync +from dotenv import load_dotenv + +load_dotenv() + +ALGOLIA_USER = os.getenv("ALGOLIA_USER") +ALGOLIA_KEY = os.getenv("ALGOLIA_KEY") +INDEX_NAME = "introlix_data" + +# Initialize the Algolia client +_client = SearchClientSync(ALGOLIA_USER, ALGOLIA_KEY) + +def convert_object_ids(doc): + """Recursively convert ObjectId fields to strings in the document.""" + for key, value in doc.items(): + if isinstance(value, ObjectId): + doc[key] = str(value) + elif isinstance(value, dict): + convert_object_ids(value) # Recursively 
convert in nested dicts + elif isinstance(value, list): + for item in value: + if isinstance(item, dict): + convert_object_ids(item) # Recursively convert in dicts within lists + return doc + +async def upload_data(): + """Uploads data to Algolia in batches, updating records by setting `objectID` to prevent duplicates.""" + batch_size = 1000 + batch = [] + + cursor = search_data.find() + for doc in cursor: + # Convert any ObjectId fields to strings for JSON compatibility + doc = convert_object_ids(doc) + + # Set `objectID` to ensure uniqueness and prevent duplicates + doc['objectID'] = str(doc['_id']) # Using MongoDB _id as `objectID` + + # Convert document to JSON string and check its size + doc_json = json.dumps(doc) + doc_size = len(doc_json.encode('utf-8')) + + # Only add to batch if size is within Algolia's 10 KB limit + if doc_size <= 10000: + batch.append(doc) + + # Send batch to Algolia when the batch size is reached + if len(batch) >= batch_size: + _client.save_objects(index_name=INDEX_NAME, objects=batch) + batch.clear() # Clear the batch after sending + + # Send any remaining documents + if batch: + _client.save_objects(index_name=INDEX_NAME, objects=batch) + + print("Uploaded data to Algolia.") + +async def main(): + # Run the upload function immediately + await upload_data() + + scheduler = AsyncIOScheduler() + # Schedule `upload_data` to run every 4 hours + scheduler.add_job(upload_data, 'interval', hours=4) + scheduler.start() + + print("Scheduler started. 
Uploading data to Algolia every 4 hours.") + + # Keep the main thread alive to allow scheduled tasks to run + try: + await asyncio.Event().wait() + except (KeyboardInterrupt, SystemExit): + scheduler.shutdown() + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/src/introlix_api/app/appwrite.py b/src/introlix_api/app/appwrite.py new file mode 100644 index 0000000000000000000000000000000000000000..1da39a2d13d6acb3ef3e5557287000929fd6f934 --- /dev/null +++ b/src/introlix_api/app/appwrite.py @@ -0,0 +1,179 @@ +import os +import sys +from appwrite.client import Client +from appwrite.query import Query +from appwrite.services.databases import Databases +from appwrite.id import ID +from dotenv import load_dotenv, dotenv_values + +from introlix_api.logger import logger +from introlix_api.exception import CustomException +from introlix_api.utils.common import is_valid_url, sanitize_url + +from pydantic import HttpUrl + +load_dotenv() + +APPWRITE_PROJECT_ID = os.getenv("APPWRITE_PROJECT_ID") +APPWRITE_API_KEY = os.getenv("APPWRITE_API_KEY") +APPWRITE_DATABASE_ID = os.getenv("APPWRITE_DATABASE_ID") +APPWRITE_ROOTSITES_COLLECTION_ID = os.getenv("APPWRITE_ROOTSITES_COLLECTION_ID") +APPWRITE_SAVED_URLS_COLLECTION_ID = os.getenv("APPWRITE_SAVED_URLS_COLLECTION_ID") +APPWRITE_INTERESTS_TO_PICK_COLLECTION_ID = os.getenv("APPWRITE_INTERESTS_TO_PICK_COLLECTION_ID") +APPWRITE_ACCOUNT_COLLECTION_ID = os.getenv("APPWRITE_ACCOUNT_COLLECTION_ID") + +client = Client() +client.set_endpoint('https://cloud.appwrite.io/v1') +client.set_project(APPWRITE_PROJECT_ID) +client.set_key(APPWRITE_API_KEY) + +databases = Databases(client) + +# models for database +class RootSitesModel: + url: HttpUrl + +# fetching the data from appwrite +def fetch_root_sites(): + """ + Function to fetch the root sites from appwrite + """ + try: + logger.info("Fetching all of the root sites...") + limit = 100 + offset = 0 + + root_sites = [] + + while True: + response = 
databases.list_documents(database_id=APPWRITE_DATABASE_ID,collection_id=APPWRITE_ROOTSITES_COLLECTION_ID, queries=[Query.limit(limit), Query.offset(offset)]) # fetching all of the root sites + + for root_site in response['documents']: + root_sites.append(root_site['url']) + + if len(response['documents']) < limit: + break + + offset += limit + + # root_sites = [root_site['url'] for root_site in response['documents']] # extracting the urls + + return root_sites + + except Exception as e: + raise CustomException(e, sys) from e + +def fetch_saved_urls(): + """ + Function to fetch the root sites from appwrite + """ + try: + logger.info("Fetching all of the saved urls...") + limit = 100 + offset = 0 + + root_sites = [] + + while True: + response = databases.list_documents(database_id=APPWRITE_DATABASE_ID,collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID, queries=[Query.limit(limit), Query.offset(offset)]) # fetching all of the root sites + + for root_site in response['documents']: + root_sites.append(root_site['url']) + + if len(response['documents']) < limit: + break + + offset += limit + + # root_sites = [root_site['url'] for root_site in response['documents']] # extracting the urls + + return root_sites[-4000:] + + except Exception as e: + raise CustomException(e, sys) from e + +def get_interests(): + """ + Function to fetch the interests list from where user can choose its interests + """ + try: + response = databases.list_documents(database_id=APPWRITE_DATABASE_ID,collection_id=APPWRITE_INTERESTS_TO_PICK_COLLECTION_ID, queries=[Query.limit(100), Query.offset(0)]) + + interests = [{"interest": interest['interest'], "keywords": interest['keywords']} for interest in response['documents']] + + return interests + except Exception as e: + raise CustomException(e, sys) from e + +def save_urls(urls): + """ + Function to save the URLs in Appwrite. Handles large collections efficiently. 
+ """ + try: + limit = 10 + offset = 0 + existing_urls = set() # Set to store unique URLs + + # Check the total number of documents in the collection + total_count_response = databases.list_documents( + database_id=APPWRITE_DATABASE_ID, + collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID, + queries=[Query.limit(1)] + ) + total_count = total_count_response['total'] + + # Delete all documents if the count exceeds 20,000 + if total_count > 20000: + logger.info("URL count exceeded 20,000. Deleting all documents in the collection.") + while True: + response = databases.list_documents( + database_id=APPWRITE_DATABASE_ID, + collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID, + queries=[Query.limit(limit)] + ) + + if not response['documents']: + break # All documents have been deleted + + for doc in response['documents']: + databases.delete_document( + database_id=APPWRITE_DATABASE_ID, + collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID, + document_id=doc['$id'] + ) + + # Fetch and process all documents in chunks to populate existing_urls set + offset = 0 # Reset offset after deletion + while True: + # Fetch a chunk of documents from the database + response = databases.list_documents( + database_id=APPWRITE_DATABASE_ID, + collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID, + queries=[Query.limit(limit), Query.offset(offset)] + ) + + # Add the fetched URLs to the set + for doc in response['documents']: + existing_urls.add(doc['url']) + + # Check if we have fetched all documents + if len(response['documents']) < limit: + break # No more documents to fetch, exit the loop + + # Move to the next batch + offset += limit + + # Save only unique URLs that are not already in the set + for url in urls: + if url not in existing_urls: + if is_valid_url(url): + sanitized_url = sanitize_url(url) + databases.create_document( + database_id=APPWRITE_DATABASE_ID, + collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID, + document_id=ID.unique(), + data={'url': sanitized_url} + ) + except Exception as e: + 
raise CustomException(e, sys) from e + diff --git a/src/introlix_api/app/database.py b/src/introlix_api/app/database.py new file mode 100644 index 0000000000000000000000000000000000000000..f706ffb5080faebf394b0dbf514dd2a034a518be --- /dev/null +++ b/src/introlix_api/app/database.py @@ -0,0 +1,23 @@ +import os +from pymongo import MongoClient +from motor.motor_asyncio import AsyncIOMotorClient + +MONGODB_CLIENT_ID = os.getenv("MONGODB_CLIENT_ID") + +client = MongoClient(MONGODB_CLIENT_ID) + +db = client.IntrolixDb + +feed_data = db.feedData +search_data = db.search_data +votes = db.votes + +async def startup_db_client(app): + app.mongodb_client = AsyncIOMotorClient(MONGODB_CLIENT_ID) + app.mongodb = app.mongodb_client.get_database("IntrolixDb") + print("MongoDB connected.") + +async def shutdown_db_client(app): + app.mongodb_client.close() + print("Database disconnected.") + \ No newline at end of file diff --git a/src/introlix_api/app/introlix_spider/introlix_spider/__init__.py b/src/introlix_api/app/introlix_spider/introlix_spider/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/introlix_api/app/introlix_spider/introlix_spider/items.py b/src/introlix_api/app/introlix_spider/introlix_spider/items.py new file mode 100644 index 0000000000000000000000000000000000000000..6b6b4cee6f4e0a37e842082d319ae29da318fa35 --- /dev/null +++ b/src/introlix_api/app/introlix_spider/introlix_spider/items.py @@ -0,0 +1,12 @@ +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class IntrolixSpiderItem(scrapy.Item): + # define the fields for your item here like: + # name = scrapy.Field() + pass diff --git a/src/introlix_api/app/introlix_spider/introlix_spider/middlewares.py b/src/introlix_api/app/introlix_spider/introlix_spider/middlewares.py new file mode 100644 index 
0000000000000000000000000000000000000000..3d3f2689d085f36f07bc56eaaa9a85a54408bd92 --- /dev/null +++ b/src/introlix_api/app/introlix_spider/introlix_spider/middlewares.py @@ -0,0 +1,103 @@ +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + +# useful for handling different item types with a single interface +from itemadapter import is_item, ItemAdapter + + +class IntrolixSpiderSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request or item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). 
+ for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) + + +class IntrolixSpiderDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) diff --git a/src/introlix_api/app/introlix_spider/introlix_spider/pipelines.py b/src/introlix_api/app/introlix_spider/introlix_spider/pipelines.py new file mode 100644 index 0000000000000000000000000000000000000000..eba8f34bab8edabdfd4b9f0110e1d19077ca917c --- /dev/null +++ b/src/introlix_api/app/introlix_spider/introlix_spider/pipelines.py @@ -0,0 +1,13 @@ +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + + +# useful for handling different item types with a single interface +from itemadapter import ItemAdapter + + +class IntrolixSpiderPipeline: + def process_item(self, item, spider): + return item diff --git a/src/introlix_api/app/introlix_spider/introlix_spider/settings.py b/src/introlix_api/app/introlix_spider/introlix_spider/settings.py new file mode 100644 index 0000000000000000000000000000000000000000..283427c6a295174576805c0395301343f8cbe88b --- /dev/null +++ b/src/introlix_api/app/introlix_spider/introlix_spider/settings.py @@ -0,0 +1,100 @@ +# Scrapy settings for introlix_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = "introlix_spider" + +SPIDER_MODULES = ["introlix_spider.spiders"] +NEWSPIDER_MODULE = "introlix_spider.spiders" + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = "introlix_spider (+http://www.yourdomain.com)" + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", +# "Accept-Language": "en", +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# "introlix_spider.middlewares.IntrolixSpiderSpiderMiddleware": 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# "introlix_spider.middlewares.IntrolixSpiderDownloaderMiddleware": 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# "scrapy.extensions.telnet.TelnetConsole": None, +#} + +# Configure item pipelines +# 
See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +#ITEM_PIPELINES = { +# "introlix_spider.pipelines.IntrolixSpiderPipeline": 300, +#} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = "httpcache" +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" + +# Set settings whose default value is deprecated to a future-proof value +REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" +TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" +FEED_EXPORT_ENCODING = "utf-8" + + +# Increase the number of concurrent requests +CONCURRENT_REQUESTS = 32 + +# Set to 0 for no delay between requests +DOWNLOAD_DELAY = 0 \ No newline at end of file diff --git a/src/introlix_api/app/introlix_spider/introlix_spider/spiders/__init__.py b/src/introlix_api/app/introlix_spider/introlix_spider/spiders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5ca581dc70652bc451062dd6efa6d8b4d3848a75 --- /dev/null +++ b/src/introlix_api/app/introlix_spider/introlix_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# 
your spiders. diff --git a/src/introlix_api/app/introlix_spider/introlix_spider/spiders/generic.py b/src/introlix_api/app/introlix_spider/introlix_spider/spiders/generic.py new file mode 100644 index 0000000000000000000000000000000000000000..5df27af42192f75a3c24af760ef905af331bb4f8 --- /dev/null +++ b/src/introlix_api/app/introlix_spider/introlix_spider/spiders/generic.py @@ -0,0 +1,286 @@ +import os +import re +import scrapy +from concurrent.futures import ThreadPoolExecutor +import aiohttp +import asyncio +from dotenv import load_dotenv, dotenv_values +from introlix_api.app.database import feed_data, db +from introlix_api.app.appwrite import fetch_root_sites + + +load_dotenv() + + +class GenericSpider(scrapy.Spider): + """ + Spider to crawl internet to get data to display it on introlix feed + """ + name = "generic" + + def __init__(self, *args, **kwargs): + super(GenericSpider, self).__init__(*args, **kwargs) + self.executor = ThreadPoolExecutor(max_workers=10) # Control parallelism + + self.data = [] + + self.all_urls = fetch_root_sites() + self.domain_pattern = r'(?:[a-z0-9-]+\.)?([a-z0-9-]+\.[a-z]{2,})(?:\/|$)' + + self.allowed_domains = [] + self.start_urls = [] + self.CLASSIFICATION_API = os.getenv('CLASSIFICATION_API') + + for url in self.all_urls: + result = re.search(self.domain_pattern, url) + + if result: + self.allowed_domains.append(result.group(1)) + self.start_urls.append(result.group(1)) + + def start_requests(self): + for url in self.all_urls: + yield scrapy.Request(url=url, callback=self.parse) + + def is_this_article(self, url): + """ + Function to verify if the url is article url or not + """ + + # list of article url patterns + article_pattern = [ + r'/(blog|article|articles|post|blog|posts|blogs|)/\d{4}/\d{2}/+[a-z0-9-]+/?', + r'/(blog|article|articles|post|blog|posts|blogs|)/[a-z0-9-]+/[a-z0-9-]+', + r'(?= similarity_threshold + ] + + return similar_posts + except Exception as e: + raise HTTPException(status_code=400, detail=str) \ No 
newline at end of file diff --git a/src/introlix_api/crawler/__init__.py b/src/introlix_api/crawler/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/introlix_api/crawler/bot.py b/src/introlix_api/crawler/bot.py new file mode 100644 index 0000000000000000000000000000000000000000..46b53f39ce9c682754eb2caf7b17cc855b2cbaf0 --- /dev/null +++ b/src/introlix_api/crawler/bot.py @@ -0,0 +1,390 @@ +import os, sys, re, time +import errno +import string +import requests +import multiprocessing +from bs4 import BeautifulSoup +from dataclasses import dataclass +from introlix_api.logger import logger +from urllib.parse import urlparse, urlunsplit, urljoin +from urllib.robotparser import RobotFileParser +from introlix_api.exception import CustomException +from urllib.robotparser import RobotFileParser + +from requests import ReadTimeout +from introlix_api.utils.core import html_to_dom +from introlix_api.utils.tags import fetch_tags +from introlix_api.utils.root_sites import root_sites +from ssl import SSLCertVerificationError +from urllib3.exceptions import NewConnectionError, MaxRetryError + +@dataclass +class BotArgs: + TIMEOUT_SECONDS = 3 + MAX_FETCH_SIZE = 1024*1024 + BAD_URL_REGEX = re.compile(r'\/\/localhost\b|\.jpg$|\.png$|\.js$|\.gz$|\.zip$|\.pdf$|\.bz2$|\.ipynb$|\.py$') + GOOD_URL_REGEX = re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)') + DEFAULT_ENCODING = 'utf8' + DEFAULT_ENC_ERRORS = 'replace' + ALLOWED_EXCEPTIONS = (ValueError, ConnectionError, ReadTimeout, TimeoutError, + OSError, NewConnectionError, MaxRetryError, SSLCertVerificationError) + +class IntrolixBot: + def __init__(self, urls: list, args: BotArgs, obey_robots_txt: bool = True): + """ + Initialize the IntrolixBot. + + Args: + urls (list): List of URLs to scrape. + obey_robots_txt (bool, optional): Whether to obey robots.txt. Defaults to True. 
+ """ + self.urls = urls + self.obey_robots_txt = obey_robots_txt + self.root_sites = root_sites() + self.root_sites_netlocs = {urlparse(root_url).netloc for root_url in self.root_sites} + self.good_tags = fetch_tags() + + # bot args + self.TIMEOUT_SECONDS = args.TIMEOUT_SECONDS + self.MAX_FETCH_SIZE = args.MAX_FETCH_SIZE + self.BAD_URL_REGEX = args.BAD_URL_REGEX + self.GOOD_URL_REGEX = args.GOOD_URL_REGEX + self.DEFAULT_ENCODING = args.DEFAULT_ENCODING + self.DEFAULT_ENC_ERRORS = args.DEFAULT_ENC_ERRORS + self.ALLOWED_EXCEPTIONS = args.ALLOWED_EXCEPTIONS + + def fetch(self, url:str) -> tuple[int, bytes]: + """ + Function to fetch a URL. + + Args: + url (str): URL to fetch. + Returns: + tuple[int, bytes]: status code and content. + """ + + r = requests.get(url, stream=True, timeout=self.TIMEOUT_SECONDS) + + size = 0 + start = time.time() + + content = b"" + for chunk in r.iter_content(1024): + if time.time() - start > self.TIMEOUT_SECONDS: + raise ValueError('Timeout reached') + + content += chunk + + size += len(chunk) + if size > self.MAX_FETCH_SIZE: + logger.debug(f"Maximum size reached for URL {url}") + break + + return r.status_code, content + + def see_robots_txt(self, url: str) -> bool: + """ + Function to check if robots.txt allows this bot to crawl. + + Args: + main_url (str): main root url of the site. + url (str): URL to check. + Returns: + bool: True if the bot is allowed to crawl, False otherwise. 
+ """ + try: + try: + parsed_url = urlparse(url) + except ValueError: + logger.debug(f"Unable to parse URL: {url}") + return False + + robots_url = urlunsplit((parsed_url.scheme, parsed_url.netloc, 'robots.txt', '', '')) + parse_robots = RobotFileParser(robots_url) + + try: + status_code, content = self.fetch(robots_url) + except Exception as e: # Catch all exceptions for now + logger.debug(f"Robots error: {robots_url}, {e}") + return True + + decoded = None + for encoding in ['utf-8', 'iso-8859-1']: + try: + decoded = content.decode(encoding).splitlines() + break + except UnicodeDecodeError: + pass + + if decoded is None: + logger.debug(f"Unable to decode robots file {robots_url}") + return True + + parse_robots.parse(decoded) + allowed = parse_robots.can_fetch('IntrolixBot', url) # Your bot's name + logger.debug(f"Robots allowed for {url}: {allowed} and {decoded} is decoded with {robots_url}") + return allowed + except Exception as e: + raise CustomException(e, sys) from e + + def get_urls_from_page(self, url: str) -> list: + """ + Function to get all URLs from a page. + + Args: + url (str): URL of the page. + Returns: + list: List of URLs from the page. + """ + try: + status_code, content = self.fetch(url) + + if status_code != 200: + return [] + + soup = BeautifulSoup(content, 'html.parser') + urls = [] + + for link in soup.find_all('a'): + href = link.get('href') + if href: + if not href.startswith('http'): + href = urljoin(url, href) + # if not self.BAD_URL_REGEX.search(href): + # href = href + if self.GOOD_URL_REGEX.search(href): + href_netloc = urlparse(href).netloc + + logger.debug(f"Checking href domain: {href_netloc} against root domains") + + if href_netloc in self.root_sites_netlocs: + urls.append(href) + + return list(set(urls)) + + except Exception as e: + logger.info(f"Error occured while getting urls from page {e}") + return [] + # raise CustomException(e, sys) from e + + def scrape(self, url: str) -> dict: + """ + Function to scrape the site. 
+ + Args: + url (str): URL to scrape. + Returns: + dict: scraped data. + """ + try: + logger.info(f"Crawling URL {url}") + js_timestamp = int(time.time() * 1000) + + if self.obey_robots_txt: + allowed = self.see_robots_txt(url) + + if not allowed: + return { + 'url': url, + 'status': None, + 'timestamp': js_timestamp, + 'content': None, + 'error': { + 'name': 'RobotsDenied', + 'message': 'Robots do not allow this URL', + } + } + + try: + status_code, content = self.fetch(url) + except self.ALLOWED_EXCEPTIONS as e: + logger.debug(f"Exception crawling URl {url}: {e}") + return { + 'url': url, + 'status': None, + 'timestamp': js_timestamp, + 'content': None, + 'error': { + 'name': 'AbortError', + 'message': str(e), + } + } + + if len(content) == 0: + return { + 'url': url, + 'status': status_code, + 'timestamp': js_timestamp, + 'content': None, + 'error': { + 'name': 'NoResponseText', + 'message': 'No response found', + } + } + + try: + dom = html_to_dom(content, self.DEFAULT_ENCODING, None, self.DEFAULT_ENC_ERRORS) + except Exception as e: + logger.exception(f"Error parsing dom: {url}") + return { + 'url': url, + 'status': status_code, + 'timestamp': js_timestamp, + 'content': None, + 'error': { + 'name': e.__class__.__name__, + 'message': str(e), + } + } + + title_element = dom.xpath("//title") + title = "" + if len(title_element) > 0: + title_text = title_element[0].text + if title_text is not None: + title = title_text.strip() + + + desc_element = dom.xpath("//meta[@name='description']") + desc = "" + if len(desc_element) > 0: + desc_text = desc_element[0].get('content') + if desc_text is not None: + desc = desc_text.strip() + + og_image_element = dom.xpath("//meta[@property='og:image']/@content") + if og_image_element: + image = og_image_element[0] + else: + image_elements = dom.xpath("//img") + image_urls = [urljoin(url, img.get("src")) for img in image_elements if img.get("src")] + if len(image_urls) > 0: + image = image_urls[0] + else: + image = "" + + 
new_links = self.get_urls_from_page(url) + new_links = list(set(new_links)) + + # Normalize extracted keywords to match the format in good_tags + normalized_title = re.split(r'[\s-]+', title.lower().translate(str.maketrans('', '', + string.punctuation))) + # Filter based on good_tags + tags = [tag for tag in self.good_tags if tag in normalized_title] + if not tags: + tags = ['general'] + + + date = dom.xpath("string(//meta[@property='article:published_time']/@content)") + + # Fallback: Check JSON-LD for datePublished in