satyam998 commited on
Commit
79d285f
·
0 Parent(s):

Initial commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +0 -0
  2. .github/workflows/ci.yml +34 -0
  3. .gitignore +162 -0
  4. .idx/dev.nix +55 -0
  5. Dockerfile +19 -0
  6. README.md +12 -0
  7. app.py +205 -0
  8. crawler.py +222 -0
  9. demo.py +38 -0
  10. docs/design.md +0 -0
  11. docs/requirements.md +0 -0
  12. main.py +40 -0
  13. requirements-dev.txt +0 -0
  14. requirements.txt +22 -0
  15. setup.py +18 -0
  16. src/introlix_api/app/__init__.py +0 -0
  17. src/introlix_api/app/algolia.py +82 -0
  18. src/introlix_api/app/appwrite.py +179 -0
  19. src/introlix_api/app/database.py +23 -0
  20. src/introlix_api/app/introlix_spider/introlix_spider/__init__.py +0 -0
  21. src/introlix_api/app/introlix_spider/introlix_spider/items.py +12 -0
  22. src/introlix_api/app/introlix_spider/introlix_spider/middlewares.py +103 -0
  23. src/introlix_api/app/introlix_spider/introlix_spider/pipelines.py +13 -0
  24. src/introlix_api/app/introlix_spider/introlix_spider/settings.py +100 -0
  25. src/introlix_api/app/introlix_spider/introlix_spider/spiders/__init__.py +4 -0
  26. src/introlix_api/app/introlix_spider/introlix_spider/spiders/generic.py +286 -0
  27. src/introlix_api/app/introlix_spider/scrapy.cfg +11 -0
  28. src/introlix_api/app/model.py +37 -0
  29. src/introlix_api/app/routes/__init__.py +0 -0
  30. src/introlix_api/app/routes/auth.py +109 -0
  31. src/introlix_api/app/routes/posts.py +208 -0
  32. src/introlix_api/app/routes/run_spider.py +23 -0
  33. src/introlix_api/app/routes/similarity.py +83 -0
  34. src/introlix_api/crawler/__init__.py +0 -0
  35. src/introlix_api/crawler/bot.py +390 -0
  36. src/introlix_api/engine/__init__.py +0 -0
  37. src/introlix_api/engine/api_data.py +101 -0
  38. src/introlix_api/engine/discussion.py +41 -0
  39. src/introlix_api/engine/graphql.py +69 -0
  40. src/introlix_api/engine/third_party_apis.py +108 -0
  41. src/introlix_api/engine/youtube.py +54 -0
  42. src/introlix_api/exception/__init__.py +34 -0
  43. src/introlix_api/logger/__init__.py +22 -0
  44. src/introlix_api/ml/__init__.py +0 -0
  45. src/introlix_api/ml/model.py +0 -0
  46. src/introlix_api/ml/recommendation.py +89 -0
  47. src/introlix_api/pipeline/__init__.py +0 -0
  48. src/introlix_api/pipeline/common_pipeline.py +0 -0
  49. src/introlix_api/pipeline/periodic_pipeline.py +0 -0
  50. src/introlix_api/utils/__init__.py +0 -0
.dockerignore ADDED
File without changes
.github/workflows/ci.yml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face hub
2
+ on:
3
+ push:
4
+ branches: [main]
5
+
6
+ # To run this workflow manually from the Actions tab
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ sync-to-hub:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v3
14
+ with:
15
+ fetch-depth: 0
16
+ lfs: true
17
+
18
+ - name: Set up Git user
19
+ run: |
20
+ git config --global user.email "tubex998@gmail.com"
21
+ git config --global user.name "satyam998"
22
+
23
+ - name: Create a new branch
24
+ run: |
25
+ git checkout --orphan temp
26
+ git add -A
27
+ git commit -m "Initial commit"
28
+ git branch -D main
29
+ git branch -m main
30
+
31
+ - name: Force push to hub
32
+ env:
33
+ HF: ${{ secrets.HF_TOKEN }}
34
+ run: git push --force https://satyam998:$HF@huggingface.co/spaces/satyam998/introlix_api main
.gitignore ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
.idx/dev.nix ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # To learn more about how to use Nix to configure your environment
2
+ # see: https://developers.google.com/idx/guides/customize-idx-env
3
+ { pkgs, ... }: {
4
+ # Which nixpkgs channel to use.
5
+ channel = "stable-23.11"; # or "unstable"
6
+
7
+ # Use https://search.nixos.org/packages to find packages
8
+ packages = [
9
+ # pkgs.go
10
+ pkgs.python311
11
+ pkgs.python311Packages.pip
12
+ # pkgs.nodejs_20
13
+ # pkgs.nodePackages.nodemon
14
+ ];
15
+
16
+ # Sets environment variables in the workspace
17
+ env = {};
18
+ idx = {
19
+ # Search for the extensions you want on https://open-vsx.org/ and use "publisher.id"
20
+ extensions = [
21
+ # "vscodevim.vim"
22
+ ];
23
+
24
+ # Enable previews
25
+ previews = {
26
+ enable = true;
27
+ previews = {
28
+ # web = {
29
+ # # Example: run "npm run dev" with PORT set to IDX's defined port for previews,
30
+ # # and show it in IDX's web preview panel
31
+ # command = ["npm" "run" "dev"];
32
+ # manager = "web";
33
+ # env = {
34
+ # # Environment variables to set for your server
35
+ # PORT = "$PORT";
36
+ # };
37
+ # };
38
+ };
39
+ };
40
+
41
+ # Workspace lifecycle hooks
42
+ workspace = {
43
+ # Runs when a workspace is first created
44
+ onCreate = {
45
+ # Example: install JS dependencies from NPM
46
+ # npm-install = "npm install";
47
+ };
48
+ # Runs when the workspace is (re)started
49
+ onStart = {
50
+ # Example: start a background task to watch and re-build backend code
51
+ # watch-backend = "npm run watch-backend";
52
+ };
53
+ };
54
+ };
55
+ }
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10
2
+
3
+ RUN useradd -m -u 1000 user
4
+
5
+ WORKDIR /app
6
+
7
+ COPY --chown=user . /app
8
+
9
+ RUN pip install -r requirements.txt
10
+
11
+ RUN mkdir -p /app/logs
12
+ RUN chmod 777 /app/logs
13
+
14
+ # Copy the shell script into the container
15
+ COPY start.sh /app/start.sh
16
+ RUN chmod +x /app/start.sh
17
+
18
+ # Use the shell script to start both processes
19
+ CMD ["/bin/bash", "/app/start.sh"]
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Introlix API
3
+ emoji: 🔥
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: docker
7
+ pinned: false
8
+ license: apache-2.0
9
+ ---
10
+
11
+ # Introlix API
12
+ <p>Introlix API offers a comprehensive suite of tools and APIs utilized in Introlix Feed. It is an advanced API that integrates multiple external APIs, RSS feed crawlers, and other data sources to provide a robust and efficient backend service.</p>
app.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, Query, HTTPException
2
+ from bson import ObjectId
3
+ import sys
4
+ import httpx
5
+ import os
6
+ import crawler
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from starlette.responses import RedirectResponse
9
+ from introlix_api.app.routes import auth, posts, run_spider, similarity
10
+ from typing import List
11
+ from dotenv import load_dotenv, dotenv_values
12
+
13
+ from introlix_api.app.appwrite import databases, APPWRITE_DATABASE_ID, ID, APPWRITE_ACCOUNT_COLLECTION_ID, get_interests
14
+ from introlix_api.app.database import startup_db_client, shutdown_db_client
15
+ from introlix_api.ml.recommendation import Recommendation
16
+ from introlix_api.utils.tags import fetch_tags
17
+
18
+ from introlix_api.exception import CustomException
19
+
20
+ from contextlib import asynccontextmanager
21
+
22
+ from pydantic import BaseModel, Field
23
+
24
+ load_dotenv()
25
+
26
+ YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
27
+
28
class FeedModel(BaseModel):
    """Response schema for a single feed item.

    MongoDB's ``_id`` is exposed to clients as ``id`` via the field alias.
    """

    # populated from the Mongo document's "_id" (stringified by the endpoint)
    id: str = Field(..., alias="_id")
    title: str             # article headline
    desc: str              # short description / summary text
    url: str               # link to the original article
    publication_date: str  # stored as a plain string, not a datetime
    image_url: str         # preview image, may be a placeholder string
    category: str          # feed category the item belongs to
    source: str            # originating site / data source name
37
+
38
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook.

    Opens the MongoDB connection before the app starts serving and
    closes it again when the app shuts down.
    """
    await startup_db_client(app)   # attach DB client to the app instance
    yield                          # application serves requests here
    await shutdown_db_client(app)  # release the DB connection
45
+
46
app = FastAPI(lifespan=lifespan)

# NOTE: browsers send the Origin header WITHOUT a trailing slash, so entries
# like "https://introlixfeed.vercel.app/" can never match and were effectively
# dead. Trailing slashes removed so the allow-list actually works.
origins = [
    "http://localhost:3000",
    "http://192.168.1.64:3000",
    "https://introlixfeed.vercel.app",
    "https://introlixfeed.vercel.com",
    # Add other allowed origins here if needed
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,       # only these origins may make credentialed requests
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
63
+
64
@app.get("/", tags=["authentication"])
async def index():
    """Redirect the bare root URL to the interactive API docs."""
    docs_redirect = RedirectResponse(url='/docs')
    return docs_redirect
67
+
68
@app.get("/feed_data", response_model=List[FeedModel])
async def get_feed_data(page: int = 1, limit: int = 20, user_id: str = Query(...), category=None):
    """Return a page of personalised feed items.

    Args:
        page: 1-based page number.
        limit: maximum number of items per page.
        user_id: Appwrite account document id; if found, that user's
            interests override the global interest list.
        category: optional explicit category filter; when given, it
            replaces interest-based filtering entirely.

    Raises:
        CustomException: wraps any error raised while building the feed.
    """
    try:
        skip = (page - 1) * limit

        # Default interests: the global interest list (interest names only).
        response = get_interests()
        user_interests = [interest['interest'] for interest in response]

        users = databases.list_documents(
            database_id=APPWRITE_DATABASE_ID,
            collection_id=APPWRITE_ACCOUNT_COLLECTION_ID
        )

        # If this user has an account document, their own interests win.
        for doc in users['documents']:
            if user_id == doc['$id']:
                user_interests = doc['interests']

        # Interests are stored as "name:keywords"; keep only the name part.
        user_interests = [item.split(':')[0] for item in user_interests]

        # Fixed `category == None` -> `is None` (PEP 8 identity comparison).
        if category is None:
            response = await app.mongodb['feedData'].find({"category": {"$in": user_interests}}).skip(skip).limit(limit).to_list(limit)
        else:
            response = await app.mongodb['feedData'].find({"category": category}).skip(skip).limit(limit).to_list(limit)

        # Drop items missing a title or a description.
        response = [item for item in response if item.get('title') and item.get('desc')]

        # Rank the remaining items against the user's interests.
        article_titles = [item['title'] for item in response]
        recommendation_system = Recommendation(user_interests, article_titles)
        recommended_titles = recommendation_system.recommend()
        response = [post for post in response if post['title'] in recommended_titles]

        # Normalise fields so every item serialises cleanly through FeedModel.
        for item in response:
            item['_id'] = str(item['_id'])
            for field in ('title', 'desc', 'url', 'publication_date',
                          'image_url', 'category', 'source'):
                item[field] = item.get(field) or ''

        return response
    except Exception as e:
        raise CustomException(e, sys) from e
125
+
126
@app.get("/fetch_post", response_model=FeedModel)
async def get_post(post_id: str = Query(...)):
    """Fetch a single feed post by its MongoDB id.

    Renamed from ``get_feed_data`` — the original name redefined (and
    shadowed) the /feed_data handler at module level (ruff/flake8 F811).
    The route path and query parameter are unchanged, so HTTP clients
    are unaffected.

    Raises:
        HTTPException: 404 when no post matches ``post_id``.
        CustomException: wraps any other failure.
    """
    try:
        oid = ObjectId(post_id)
        response = await app.mongodb['feedData'].find_one({"_id": oid})

        if not response:
            raise HTTPException(status_code=404, detail="Post not found")

        # Convert _id to string for JSON serialisation.
        response["_id"] = str(response["_id"])

        # Fill null fields with display defaults; desc is truncated to 90 chars.
        response["desc"] = (response.get("desc") or "No Description")[:90]
        response["publication_date"] = response.get("publication_date") or "Unknown Date"
        response["image_url"] = response.get("image_url") or "No Image URL"
        response["category"] = response.get("category") or "Uncategorized"
        response["source"] = response.get("source") or "Unknown Source"

        return response
    except HTTPException:
        # Let the intended 404 pass through; previously the broad handler
        # below swallowed it and re-raised it as a CustomException.
        raise
    except Exception as e:
        raise CustomException(e, sys) from e
157
+
158
@app.get("/test_recommendation")
async def test_recommendation(
    user_interests: list[str] = Query(..., description="Comma-separated list of user interests"),
    articles: list[str] = Query(..., description="Comma-separated list of articles")
):
    """
    Test endpoint for recommendations.
    Takes user interests and articles as query parameters and returns recommended articles.
    """
    # Run the recommender over the ad-hoc inputs supplied in the query string.
    recommender = Recommendation(user_interests, articles)
    recommended = recommender.recommend()

    return {
        "user_interests": user_interests,
        "recommended_articles": recommended,
    }
178
+
179
@app.get("/youtube/videos")
async def get_youtube_videos(query: str = None):
    """Proxy a search against the YouTube Data API v3 and return its JSON."""
    search_url = "https://www.googleapis.com/youtube/v3/search"
    search_params = {
        "key": YOUTUBE_API_KEY,
        "part": "snippet",
        "q": query or "trending",      # default topic when no query is given
        "type": "video",
        "maxResults": 10,
        # 'viewCount' surfaces popular videos; change to 'date' for recent uploads
        "order": "viewCount",
    }

    async with httpx.AsyncClient() as client:
        api_response = await client.get(search_url, params=search_params)
        api_response.raise_for_status()  # propagate 4xx/5xx from the API
        return api_response.json()
195
+
196
@app.get("/tags")
async def get_tags():
    """Return the list of known tags."""
    return fetch_tags()
200
+
201
+ app.include_router(auth.router, prefix="/auth")
202
+ app.include_router(run_spider.router, prefix="/spider")
203
+ app.include_router(similarity.router, prefix="/feed")
204
+ app.include_router(crawler.router)
205
+ app.include_router(posts.router)
crawler.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import sys
3
+ import time
4
+ from urllib.parse import urlparse
5
+ from fastapi import APIRouter, HTTPException, Query
6
+ from introlix_api.crawler.bot import IntrolixBot, BotArgs
7
+ from introlix_api.exception import CustomException
8
+ from introlix_api.logger import logger
9
+ from introlix_api.utils.root_sites import root_sites
10
+ from introlix_api.app.database import search_data, db
11
+ from introlix_api.app.appwrite import fetch_root_sites, fetch_saved_urls, save_urls
12
+ from pymongo import ASCENDING
13
+ from pymongo.errors import DuplicateKeyError
14
+
15
+ router = APIRouter()
16
+
17
+ BATCH_SIZE = 10
18
+ urls_batch = []
19
+ storage_threshold = 500 * 1024 * 1024
20
+ delete_batch = 1000
21
+
22
def filter_urls(url: str) -> bool:
    """
    A function to filter non article urls from the scraped urls

    Args:
        url (str): url to classify
    Returns:
        bool: True if the url looks like an article url else False
    """
    parsed_url = urlparse(url)

    # Bare domain / homepage is never an article.
    if parsed_url.path in ('', '/'):
        return False

    non_article_keywords = [
        "/product", "/products", "/home", "/item", "/items", "/category", "/categories",
        "/login", "/signin", "/logout", "/signup", "/register", "/account", "/user",
        "/profile", "/dashboard", "/settings", "/preferences", "/order", "/orders",
        "/cart", "/checkout", "/payment", "/subscribe", "/subscription",
        "/contact", "/support", "/help", "/faq", "/about", "/privacy", "/terms",
        "/policy", "/conditions", "/legal", "/service", "/services", "/guide",
        "/how-to", "/pricing", "/price", "fees", "/plans", "/features", "/partners",
        "/team", "/careers", "/jobs", "/join", "/apply", "/training", "/demo",
        "/trial", "/download", "/install", "/app", "/apps", "/software", "/portal",
        "/index", "/main", "/video", "/videos", "/photo", "/photos",
        "/image", "/images", "/gallery", "/portfolio", "/showcase", "/testimonials",
        "/reviews", "/search", "/find", "/browse", "/list", "/tags", "/explore",
        "/new", "/trending", "/latest", "/promotions", "/offers", "/deals", "/discount",
        "/coupon", "/coupons", "/gift", "/store", "/stores", "/locator", "/locations",
        "/branches", "/events", "/webinar", "/calendar", "/schedule",
        "/class", "/classes", "/lesson", "/lessons", "/training", "/activity",
        "/activities", "/workshop", "/exhibit", "/performance", "/map", "/directions",
        "/weather", "/traffic", "/rates", "/auction", "/bid", "/tender", "/investment",
        "/loan", "/mortgage", "/property", "/real-estate", "/construction", "/project",
        "/client", "/clients", "/partner", "/sponsor", "/media", "/press", "/releases",
        "/announcements", "/newsroom", "/resources", "courses", "collections", "/u/", "/members/",
        "/@", "/shop", "/wiki", "/author", "/dynamic", "/image", "/submit"  # TODO: need to add more
    ]

    article_keywords = [
        "/blog/", "post", "article", "insights", "guide", "tutorial",
        "how-to", "what", "how", "introduction", "/news/"
    ]

    # BUGFIX: a missing comma after the 6th pattern caused implicit string
    # concatenation of patterns 6 and 7 into one broken regex.
    # NOTE(review): several alternations below end with '|' and therefore also
    # match the empty string, making those patterns very permissive — kept
    # as-is to preserve behaviour; worth tightening separately.
    article_pattern = [
        r'/(/blog/|article|articles|post|posts|blogs|news|)/\d{4}/\d{2}/+[a-z0-9-]+/?',
        r'/(/blog/|article|articles|post|posts|blogs|news|)/[a-z0-9-]+/[a-z0-9-]+',
        r'(?<!\/\/www)(/blog/|article|articles|post|posts|blogs|news|)/[a-z0-9-]+',
        r'^(?!.*\/category\/).*\/[a-z0-9-]+\/[a-z0-9-]+(-[a-z0-9-]+)+$',
        r'/[^/]+/\d{4}/\d{2}/\d{2}/+[a-z0-9]+/?',
        r'/[^/]+/\d{4}/\d{2}/+[a-z0-9]+/?',
        r'/[a-z0-9-]+/\d{4}/\d{2}/+/?',
        r'/[a-z0-9-]+/\d{4}/\d{2}/\d{2}/+/?'
    ]

    # Structural match (date / blog-style paths), vetoed by blacklist keywords.
    for pattern in article_pattern:
        if re.search(pattern, url):
            if not any(keyword in url for keyword in non_article_keywords):
                return True

    # Keyword whitelist match.
    if any(keyword in url for keyword in article_keywords):
        return True

    # Heuristic: slug-like last path segment ("some-long-title") => article.
    last_segment = parsed_url.path.strip('/').split('/')[-1]
    if '-' in last_segment and len(last_segment.split('-')) > 2:
        return True

    return False
89
+
90
def save_to_db(data):
    """Persist scraped pages into the `search_data` collection.

    Evicts the oldest documents when the collection exceeds the storage
    threshold, skips URLs already stored or that fail `filter_urls`, and
    flushes any URLs queued in the module-level `urls_batch` to Appwrite.

    Raises:
        CustomException: wraps any unexpected failure.
    """
    global urls_batch
    try:
        # Evict the oldest documents once storage grows past the cap.
        stats = db.command("collStats", "search_data")
        if stats['size'] >= storage_threshold:
            oldest = search_data.find().sort("createdAt", ASCENDING).limit(delete_batch)
            stale_ids = [doc['_id'] for doc in oldest]
            search_data.delete_many({"_id": {"$in": stale_ids}})

        # Candidate article URLs from this batch.
        candidate_urls = [entry["url"] for entry in data if filter_urls(entry["url"])]

        # URLs already present in the database (to skip duplicates).
        existing_urls = set(search_data.find({"url": {"$in": candidate_urls}}).distinct("url"))

        fresh_docs = [
            {"url": entry["url"], "content": entry["content"], "type": "article"}
            for entry in data
            if entry["url"] not in existing_urls and entry.get("content") is not None
        ]

        if fresh_docs:
            try:
                search_data.insert_many(fresh_docs)
            except DuplicateKeyError:
                logger.info("Duplicate URL detected during insertion. Skipping duplicate entries.")

        # Flush any URLs queued for Appwrite, then clear the queue.
        if urls_batch:
            try:
                save_urls(urls_batch)
            except Exception as e:
                logger.error(f"Error saving URLs to Appwrite: {str(e)}")
            urls_batch.clear()

    except Exception as e:
        raise CustomException(e, sys) from e
132
+
133
def extract_urls(batch_size=BATCH_SIZE):
    """Yield URLs harvested from stored page content, in fixed-size batches.

    Only projects `content.links` from each document to keep the per-document
    memory footprint small; batching bounds overall memory usage.

    Args:
        batch_size: maximum number of URLs per yielded list.

    Yields:
        list[str]: up to `batch_size` URLs; the final batch may be shorter.
    """
    documents = search_data.find({}, {"content.links": 1})

    pending = []
    for doc in documents:
        # `or []` covers documents with no content or no links field.
        for url in doc.get("content", {}).get("links") or []:
            pending.append(url)
            if len(pending) >= batch_size:
                yield pending
                pending = []  # start a fresh batch

    # Flush whatever is left over.
    if pending:
        yield pending
155
+
156
def crawler(urls_batch):
    """Run IntrolixBot over the given URLs, persisting each scraped batch.

    Args:
        urls_batch: list of seed URLs to crawl.

    Raises:
        CustomException: wraps any failure from the bot or persistence layer.
    """
    try:
        bot = IntrolixBot(urls=urls_batch, args=BotArgs)
        for scraped_batch in bot.scrape_parallel(batch_size=BATCH_SIZE):
            save_to_db(scraped_batch)
    except Exception as e:
        raise CustomException(e, sys) from e
166
+
167
def run_crawler_continuously():
    """Crawl forever in repeated 10-minute sessions.

    Each session pulls seed URLs from Appwrite (root sites + previously
    saved URLs) or falls back to the built-in seed list plus the local
    `urls_batch` queue, crawls them, then queues URLs extracted from
    already-stored pages for later sessions.

    Raises:
        CustomException: wraps any unexpected failure.
    """
    global urls_batch
    try:
        while True:
            session_start = time.time()

            # One 10-minute (600 s) crawling session.
            while (time.time() - session_start) < 600:
                try:
                    root_urls = fetch_root_sites()
                    saved_urls = fetch_saved_urls()
                except Exception as e:
                    logger.info("Error fetching URLs from Appwrite: %s", str(e))
                    root_urls = []
                    saved_urls = []

                if root_urls and saved_urls:
                    # De-duplicate the combined Appwrite seed lists.
                    urls = list(set(root_urls + saved_urls))
                else:
                    # Fallback: built-in seeds plus locally queued URLs.
                    urls = root_sites() + urls_batch

                if urls:
                    logger.info(f"Starting crawler with {len(urls)} root URLs")
                    crawler(urls[::-1])  # newest seeds first

                # Queue URLs discovered inside previously stored pages.
                for extracted_urls in extract_urls(batch_size=BATCH_SIZE):
                    urls_batch.extend(list(set(extracted_urls)))
                    time.sleep(1)

                time.sleep(1)

            # After 10 minutes, the outer loop restarts without any pause.
            logger.info("Restarting the crawler for another 10-minute session.")
    except Exception as e:
        raise CustomException(e, sys) from e
206
+
207
+
208
@router.post('/crawler')
def run_crawler():
    """Kick off the continuous crawler.

    NOTE(review): this blocks the request worker indefinitely because
    `run_crawler_continuously` loops forever — presumably intended to be
    triggered once as a background job; confirm with the deployment setup.
    """
    try:
        run_crawler_continuously()
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
214
+
215
+
216
+ if __name__ == "__main__":
217
+ while True:
218
+ start_time = time.time()
219
+ while (time.time() - start_time) < 600:
220
+ run_crawler_continuously()
221
+ # # urls = extract_urls()
222
+ # # print(urls)
demo.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #import csv
2
+ # from introlix_api.app.database import feed_data
3
+
4
+ # data = feed_data.find({}, {"_id": 0, "title": 1}) # Exclude _id, include only title
5
+
6
+ # # Specify the CSV file to write to
7
+ # csv_file = 'feed_data_titles.csv'
8
+
9
+ # # Write data to a CSV file
10
+ # with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
11
+ # writer = csv.writer(file)
12
+
13
+ # # Write header (just the title field)
14
+ # writer.writerow(["title"])
15
+
16
+ # # Write each document's title to the CSV
17
+ # for document in data:
18
+ # writer.writerow([document.get("title")])
19
+
20
+ # print(f"Title data successfully saved to {csv_file}")
21
+ # from introlix_api.crawler.bot import IntrolixBot, BotArgs
22
+ # import time
23
+
24
+ # start = time.time()
25
+ # inbot = IntrolixBot(args=BotArgs, urls=["https://www.wikipedia.org/", "https://medium.com/", "https://www.bbc.com/"])
26
+
27
+ # print(inbot.crawl(batch_size=1048))
28
+ # # end = time.time()
29
+ # print(f"Time taken: {end - start}")
30
+
31
+ # from introlix_api.app.appwrite import fetch_root_sites
32
+
33
+ # print(len(set(fetch_root_sites())))
34
+ # Access the scraped data
35
+ # for index, page_data in enumerate(inbot.data):
36
+ # print(f"Page {index + 1}:")
37
+ # print(page_data)
38
+ # print('-' * 40)
docs/design.md ADDED
File without changes
docs/requirements.md ADDED
File without changes
main.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+
3
def run_app():
    """Launch the `generic` scrapy spider from its project directory and
    echo the process output."""
    spider_command = ["scrapy", "crawl", "generic"]
    spider_directory = "src/introlix_api/app/introlix_spider"

    # Capture both streams so the spider's logs are visible to the caller.
    result = subprocess.run(
        spider_command,
        cwd=spider_directory,
        capture_output=True,
        text=True,
    )

    print("Output:", result.stdout)
    print("Error:", result.stderr)
11
+
12
+ if __name__ == "__main__":
13
+ # running the spider
14
+ run_app()
15
+
16
+ # def run_get_urls_from_page_parallel(self, urls: list, max_workers: int=10) -> list:
17
+ # """
18
+ # Running get_urls_from_page function in parallel for many runs.
19
+
20
+ # Args:
21
+ # urls (list): list of urls
22
+ # max_workers (int, optional): number of workers. Defaults to 10.
23
+ # Returns:
24
+ # list: list of fetched urls
25
+ # """
26
+ # fetched_urls = []
27
+
28
+ # with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
29
+ # futures = {executor.submit(self.get_urls_from_page, url): url for url in urls}
30
+
31
+ # for future in concurrent.futures.as_completed(futures):
32
+ # url = futures[future]
33
+
34
+ # try:
35
+ # result = future.result()
36
+ # fetched_urls.append(result)
37
+ # except Exception as e:
38
+ # raise CustomException(e, sys) from e
39
+
40
+ # return list(set(list(url for sublist in fetched_urls if sublist is not None for url in sublist)))
requirements-dev.txt ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy
2
+ pandas
3
+ scrapy
4
+ fastapi
5
+ uvicorn
6
+ Jinja2
7
+ appwrite
8
+ python-dotenv
9
+ pymongo
10
+ aiohttp
11
+ motor
12
+ httpx
13
+ torch
14
+ scikit-learn
15
+ beautifulsoup4
16
+ sentence-transformers
17
+ nltk
18
+ algoliasearch
19
+ apscheduler
20
+ cachetools
21
+
22
+ -e .
setup.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from setuptools import setup
import setuptools

# The package's long description comes straight from the project README.
with open("README.md", "r", encoding="utf-8") as f:
    long_description = f.read()

setup(
    name="introlix_api",
    version="0.0.1",
    author="Satyam Mishra",
    author_email="tubex998@gmail.com",
    description="Introlix API offers a comprehensive suite of tools and APIs utilized in Introlix Feed.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    # src-layout: importable packages live under src/
    package_dir={"": "src"},
    packages=setuptools.find_packages(where="src"),
    python_requires=">=3.10",
)
src/introlix_api/app/__init__.py ADDED
File without changes
src/introlix_api/app/algolia.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import asyncio
4
+ from bson import ObjectId
5
+ from apscheduler.schedulers.asyncio import AsyncIOScheduler
6
+ from introlix_api.app.database import search_data
7
+ from algoliasearch.search.client import SearchClientSync
8
+ from dotenv import load_dotenv
9
+
10
load_dotenv()

# Algolia credentials from .env — presumably the application id / admin API
# key pair; confirm against the Algolia dashboard.
ALGOLIA_USER = os.getenv("ALGOLIA_USER")
ALGOLIA_KEY = os.getenv("ALGOLIA_KEY")
INDEX_NAME = "introlix_data"

# Initialize the Algolia client (module-private; used by upload_data below)
_client = SearchClientSync(ALGOLIA_USER, ALGOLIA_KEY)
18
+
19
def convert_object_ids(doc):
    """Recursively convert every ObjectId in *doc* to its string form, in place.

    Walks nested dicts and lists so the document becomes JSON-serializable
    before it is pushed to Algolia. Mutates *doc* and also returns it for
    convenient chaining.

    Args:
        doc (dict): MongoDB document, possibly containing nested ObjectIds.

    Returns:
        dict: the same dict instance with all ObjectIds stringified.
    """
    for key, value in doc.items():
        if isinstance(value, ObjectId):
            doc[key] = str(value)
        elif isinstance(value, dict):
            convert_object_ids(value)  # Recursively convert in nested dicts
        elif isinstance(value, list):
            # Fix: lists may hold ObjectIds (or nested lists) directly, not
            # only dicts — the original version left those unconverted and
            # json.dumps() would fail on them later.
            doc[key] = _convert_id_list(value)
    return doc


def _convert_id_list(items):
    """Stringify ObjectIds inside a list, recursing into nested dicts/lists."""
    converted = []
    for item in items:
        if isinstance(item, ObjectId):
            converted.append(str(item))
        elif isinstance(item, dict):
            converted.append(convert_object_ids(item))
        elif isinstance(item, list):
            converted.append(_convert_id_list(item))
        else:
            converted.append(item)
    return converted
31
+
32
async def upload_data():
    """Push every document from MongoDB to the Algolia index in batches.

    Each record reuses the Mongo ``_id`` as Algolia ``objectID``, so repeated
    uploads update existing records rather than creating duplicates.
    Documents over Algolia's ~10 KB per-record limit are skipped.
    """
    BATCH_SIZE = 1000
    pending = []

    for document in search_data.find():
        # ObjectIds are not JSON-serializable; stringify them first.
        document = convert_object_ids(document)
        document['objectID'] = str(document['_id'])

        # Skip records that would exceed Algolia's 10 KB record limit.
        encoded = json.dumps(document).encode('utf-8')
        if len(encoded) > 10000:
            continue

        pending.append(document)
        if len(pending) >= BATCH_SIZE:
            _client.save_objects(index_name=INDEX_NAME, objects=pending)
            pending = []

    # Flush whatever is left after the last full batch.
    if pending:
        _client.save_objects(index_name=INDEX_NAME, objects=pending)

    print("Uploaded data to Algolia.")
63
+
64
async def main():
    """Run one upload immediately, then re-run upload_data every 4 hours."""
    # Run the upload function immediately
    await upload_data()

    scheduler = AsyncIOScheduler()
    # Schedule `upload_data` to run every 4 hours
    scheduler.add_job(upload_data, 'interval', hours=4)
    scheduler.start()

    print("Scheduler started. Uploading data to Algolia every 4 hours.")

    # Keep the main thread alive forever so scheduled jobs keep firing;
    # shut the scheduler down cleanly on Ctrl-C / interpreter exit.
    try:
        await asyncio.Event().wait()
    except (KeyboardInterrupt, SystemExit):
        scheduler.shutdown()

if __name__ == "__main__":
    asyncio.run(main())
src/introlix_api/app/appwrite.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from appwrite.client import Client
4
+ from appwrite.query import Query
5
+ from appwrite.services.databases import Databases
6
+ from appwrite.id import ID
7
+ from dotenv import load_dotenv, dotenv_values
8
+
9
+ from introlix_api.logger import logger
10
+ from introlix_api.exception import CustomException
11
+ from introlix_api.utils.common import is_valid_url, sanitize_url
12
+
13
+ from pydantic import HttpUrl
14
+
15
load_dotenv()

# Appwrite credentials and collection ids, all sourced from .env.
APPWRITE_PROJECT_ID = os.getenv("APPWRITE_PROJECT_ID")
APPWRITE_API_KEY = os.getenv("APPWRITE_API_KEY")
APPWRITE_DATABASE_ID = os.getenv("APPWRITE_DATABASE_ID")
APPWRITE_ROOTSITES_COLLECTION_ID = os.getenv("APPWRITE_ROOTSITES_COLLECTION_ID")
APPWRITE_SAVED_URLS_COLLECTION_ID = os.getenv("APPWRITE_SAVED_URLS_COLLECTION_ID")
APPWRITE_INTERESTS_TO_PICK_COLLECTION_ID = os.getenv("APPWRITE_INTERESTS_TO_PICK_COLLECTION_ID")
APPWRITE_ACCOUNT_COLLECTION_ID = os.getenv("APPWRITE_ACCOUNT_COLLECTION_ID")

# Shared Appwrite client pointed at the hosted Appwrite Cloud endpoint.
client = Client()
client.set_endpoint('https://cloud.appwrite.io/v1')
client.set_project(APPWRITE_PROJECT_ID)
client.set_key(APPWRITE_API_KEY)

databases = Databases(client)

# models for database
class RootSitesModel:
    # NOTE(review): annotated but not a pydantic BaseModel subclass, so the
    # HttpUrl annotation performs no validation; appears unused in this
    # module — confirm intent.
    url: HttpUrl
35
+
36
+ # fetching the data from appwrite
37
def fetch_root_sites():
    """Return the URL of every root site stored in Appwrite.

    Pages through the root-sites collection 100 documents at a time; a short
    page signals the end of the collection.

    Returns:
        list: root-site URLs.

    Raises:
        CustomException: wrapping any Appwrite/client error.
    """
    try:
        logger.info("Fetching all of the root sites...")
        page_size = 100
        skip = 0
        urls = []

        while True:
            page = databases.list_documents(
                database_id=APPWRITE_DATABASE_ID,
                collection_id=APPWRITE_ROOTSITES_COLLECTION_ID,
                queries=[Query.limit(page_size), Query.offset(skip)],
            )

            urls.extend(doc['url'] for doc in page['documents'])

            # A page shorter than the limit means we've read everything.
            if len(page['documents']) < page_size:
                break
            skip += page_size

        return urls

    except Exception as e:
        raise CustomException(e, sys) from e
65
+
66
def fetch_saved_urls():
    """Return the most recent (up to 4000) crawler-saved URLs from Appwrite.

    Pages through the saved-URLs collection 100 documents at a time, then
    returns only the last 4000 entries so the downstream crawl frontier
    stays bounded. (The original docstring was a copy-paste from
    fetch_root_sites and wrongly described root sites.)

    Returns:
        list: up to 4000 most recently stored URLs.

    Raises:
        CustomException: wrapping any Appwrite/client error.
    """
    try:
        logger.info("Fetching all of the saved urls...")
        page_size = 100
        skip = 0
        saved_urls = []

        while True:
            page = databases.list_documents(
                database_id=APPWRITE_DATABASE_ID,
                collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID,
                queries=[Query.limit(page_size), Query.offset(skip)],
            )

            saved_urls.extend(doc['url'] for doc in page['documents'])

            # A page shorter than the limit means we've read everything.
            if len(page['documents']) < page_size:
                break
            skip += page_size

        # Bound the result to the newest 4000 URLs.
        return saved_urls[-4000:]

    except Exception as e:
        raise CustomException(e, sys) from e
94
+
95
def get_interests():
    """Return the selectable interests (name plus keywords) from Appwrite.

    Returns:
        list[dict]: one {"interest": ..., "keywords": ...} entry per document
        (first 100 documents only).

    Raises:
        CustomException: wrapping any Appwrite/client error.
    """
    try:
        response = databases.list_documents(
            database_id=APPWRITE_DATABASE_ID,
            collection_id=APPWRITE_INTERESTS_TO_PICK_COLLECTION_ID,
            queries=[Query.limit(100), Query.offset(0)],
        )

        interests = []
        for doc in response['documents']:
            interests.append({
                "interest": doc['interest'],
                "keywords": doc['keywords'],
            })
        return interests
    except Exception as e:
        raise CustomException(e, sys) from e
107
+
108
def save_urls(urls):
    """
    Persist new crawler-discovered URLs to Appwrite, skipping duplicates.

    If the collection has grown past 20,000 documents it is emptied first.
    Then every remaining URL is read back into a set so only unseen, valid
    URLs from *urls* are inserted (sanitized).

    Args:
        urls (iterable): candidate URLs to store.

    Raises:
        CustomException: wrapping any Appwrite/client error.
    """
    try:
        # NOTE(review): a page size of 10 means thousands of API round-trips
        # for a near-20k collection — consider raising it; verify Appwrite's
        # maximum page size first.
        limit = 10
        offset = 0
        existing_urls = set()  # Set to store unique URLs

        # Check the total number of documents in the collection
        total_count_response = databases.list_documents(
            database_id=APPWRITE_DATABASE_ID,
            collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID,
            queries=[Query.limit(1)]
        )
        total_count = total_count_response['total']

        # Delete all documents if the count exceeds 20,000
        if total_count > 20000:
            logger.info("URL count exceeded 20,000. Deleting all documents in the collection.")
            while True:
                # Re-list from the start each pass; deletions shrink the
                # collection until the page comes back empty.
                response = databases.list_documents(
                    database_id=APPWRITE_DATABASE_ID,
                    collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID,
                    queries=[Query.limit(limit)]
                )

                if not response['documents']:
                    break  # All documents have been deleted

                for doc in response['documents']:
                    databases.delete_document(
                        database_id=APPWRITE_DATABASE_ID,
                        collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID,
                        document_id=doc['$id']
                    )

        # Fetch and process all documents in chunks to populate existing_urls set
        offset = 0  # Reset offset after deletion
        while True:
            # Fetch a chunk of documents from the database
            response = databases.list_documents(
                database_id=APPWRITE_DATABASE_ID,
                collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID,
                queries=[Query.limit(limit), Query.offset(offset)]
            )

            # Add the fetched URLs to the set
            for doc in response['documents']:
                existing_urls.add(doc['url'])

            # Check if we have fetched all documents
            if len(response['documents']) < limit:
                break  # No more documents to fetch, exit the loop

            # Move to the next batch
            offset += limit

        # Save only unique URLs that are not already in the set
        for url in urls:
            if url not in existing_urls:
                if is_valid_url(url):
                    sanitized_url = sanitize_url(url)
                    databases.create_document(
                        database_id=APPWRITE_DATABASE_ID,
                        collection_id=APPWRITE_SAVED_URLS_COLLECTION_ID,
                        document_id=ID.unique(),
                        data={'url': sanitized_url}
                    )
    except Exception as e:
        raise CustomException(e, sys) from e
179
+
src/introlix_api/app/database.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from pymongo import MongoClient
from motor.motor_asyncio import AsyncIOMotorClient

# NOTE(review): read directly from the environment — this module never calls
# load_dotenv(), so the variable must already be exported before import
# (other modules in this package do load .env). Confirm intended.
MONGODB_CLIENT_ID = os.getenv("MONGODB_CLIENT_ID")

# Synchronous client, used by the crawler / batch jobs.
client = MongoClient(MONGODB_CLIENT_ID)

db = client.IntrolixDb

# Collections shared across the app.
feed_data = db.feedData        # crawled feed articles
search_data = db.search_data   # documents mirrored into Algolia (see algolia.py)
votes = db.votes               # vote documents (schema not visible here)
14
+
15
async def startup_db_client(app):
    """FastAPI startup hook: attach an async Motor client and database to *app*."""
    app.mongodb_client = AsyncIOMotorClient(MONGODB_CLIENT_ID)
    app.mongodb = app.mongodb_client.get_database("IntrolixDb")
    print("MongoDB connected.")
19
+
20
async def shutdown_db_client(app):
    """FastAPI shutdown hook: close the Motor client created at startup."""
    app.mongodb_client.close()
    print("Database disconnected.")
23
+
src/introlix_api/app/introlix_spider/introlix_spider/__init__.py ADDED
File without changes
src/introlix_api/app/introlix_spider/introlix_spider/items.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Define here the models for your scraped items
2
+ #
3
+ # See documentation in:
4
+ # https://docs.scrapy.org/en/latest/topics/items.html
5
+
6
+ import scrapy
7
+
8
+
9
class IntrolixSpiderItem(scrapy.Item):
    """Scrapy-generated item stub; currently empty (the spider builds plain dicts)."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
src/introlix_api/app/introlix_spider/introlix_spider/middlewares.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Define here the models for your spider middleware
2
+ #
3
+ # See documentation in:
4
+ # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
5
+
6
+ from scrapy import signals
7
+
8
+ # useful for handling different item types with a single interface
9
+ from itemadapter import is_item, ItemAdapter
10
+
11
+
12
class IntrolixSpiderSpiderMiddleware:
    """Scrapy-generated spider-middleware stub; every hook keeps default behavior."""
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
57
+
58
+
59
class IntrolixSpiderDownloaderMiddleware:
    """Scrapy-generated downloader-middleware stub; every hook keeps default behavior."""
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
src/introlix_api/app/introlix_spider/introlix_spider/pipelines.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Define your item pipelines here
2
+ #
3
+ # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4
+ # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5
+
6
+
7
+ # useful for handling different item types with a single interface
8
+ from itemadapter import ItemAdapter
9
+
10
+
11
class IntrolixSpiderPipeline:
    """No-op item pipeline: passes every scraped item through unchanged."""

    def process_item(self, item, spider):
        # Identity pass-through; persistence happens in the spider itself.
        return item
src/introlix_api/app/introlix_spider/introlix_spider/settings.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Scrapy settings for introlix_spider project
2
+ #
3
+ # For simplicity, this file contains only settings considered important or
4
+ # commonly used. You can find more settings consulting the documentation:
5
+ #
6
+ # https://docs.scrapy.org/en/latest/topics/settings.html
7
+ # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8
+ # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9
+
10
# Project identity and where Scrapy discovers spider classes.
BOT_NAME = "introlix_spider"

SPIDER_MODULES = ["introlix_spider.spiders"]
NEWSPIDER_MODULE = "introlix_spider.spiders"
14
+
15
+
16
+ # Crawl responsibly by identifying yourself (and your website) on the user-agent
17
+ #USER_AGENT = "introlix_spider (+http://www.yourdomain.com)"
18
+
19
+ # Obey robots.txt rules
20
ROBOTSTXT_OBEY = True  # respect robots.txt when crawling third-party sites
21
+
22
+ # Configure maximum concurrent requests performed by Scrapy (default: 16)
23
+ #CONCURRENT_REQUESTS = 32
24
+
25
+ # Configure a delay for requests for the same website (default: 0)
26
+ # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
27
+ # See also autothrottle settings and docs
28
+ #DOWNLOAD_DELAY = 3
29
+ # The download delay setting will honor only one of:
30
+ #CONCURRENT_REQUESTS_PER_DOMAIN = 16
31
+ #CONCURRENT_REQUESTS_PER_IP = 16
32
+
33
+ # Disable cookies (enabled by default)
34
+ #COOKIES_ENABLED = False
35
+
36
+ # Disable Telnet Console (enabled by default)
37
+ #TELNETCONSOLE_ENABLED = False
38
+
39
+ # Override the default request headers:
40
+ #DEFAULT_REQUEST_HEADERS = {
41
+ # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
42
+ # "Accept-Language": "en",
43
+ #}
44
+
45
+ # Enable or disable spider middlewares
46
+ # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
47
+ #SPIDER_MIDDLEWARES = {
48
+ # "introlix_spider.middlewares.IntrolixSpiderSpiderMiddleware": 543,
49
+ #}
50
+
51
+ # Enable or disable downloader middlewares
52
+ # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
53
+ #DOWNLOADER_MIDDLEWARES = {
54
+ # "introlix_spider.middlewares.IntrolixSpiderDownloaderMiddleware": 543,
55
+ #}
56
+
57
+ # Enable or disable extensions
58
+ # See https://docs.scrapy.org/en/latest/topics/extensions.html
59
+ #EXTENSIONS = {
60
+ # "scrapy.extensions.telnet.TelnetConsole": None,
61
+ #}
62
+
63
+ # Configure item pipelines
64
+ # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
65
+ #ITEM_PIPELINES = {
66
+ # "introlix_spider.pipelines.IntrolixSpiderPipeline": 300,
67
+ #}
68
+
69
+ # Enable and configure the AutoThrottle extension (disabled by default)
70
+ # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
71
+ #AUTOTHROTTLE_ENABLED = True
72
+ # The initial download delay
73
+ #AUTOTHROTTLE_START_DELAY = 5
74
+ # The maximum download delay to be set in case of high latencies
75
+ #AUTOTHROTTLE_MAX_DELAY = 60
76
+ # The average number of requests Scrapy should be sending in parallel to
77
+ # each remote server
78
+ #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
79
+ # Enable showing throttling stats for every response received:
80
+ #AUTOTHROTTLE_DEBUG = False
81
+
82
+ # Enable and configure HTTP caching (disabled by default)
83
+ # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
84
+ #HTTPCACHE_ENABLED = True
85
+ #HTTPCACHE_EXPIRATION_SECS = 0
86
+ #HTTPCACHE_DIR = "httpcache"
87
+ #HTTPCACHE_IGNORE_HTTP_CODES = []
88
+ #HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
89
+
90
+ # Set settings whose default value is deprecated to a future-proof value
91
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"


# Increase the number of concurrent requests
# NOTE(review): 32 concurrent requests combined with DOWNLOAD_DELAY = 0 is
# aggressive toward crawled sites even with ROBOTSTXT_OBEY on — consider
# enabling AutoThrottle instead.
CONCURRENT_REQUESTS = 32

# Set to 0 for no delay between requests
DOWNLOAD_DELAY = 0
src/introlix_api/app/introlix_spider/introlix_spider/spiders/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # This package will contain the spiders of your Scrapy project
2
+ #
3
+ # Please refer to the documentation for information on how to create and manage
4
+ # your spiders.
src/introlix_api/app/introlix_spider/introlix_spider/spiders/generic.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import scrapy
4
+ from concurrent.futures import ThreadPoolExecutor
5
+ import aiohttp
6
+ import asyncio
7
+ from dotenv import load_dotenv, dotenv_values
8
+ from introlix_api.app.database import feed_data, db
9
+ from introlix_api.app.appwrite import fetch_root_sites
10
+
11
+
12
+ load_dotenv()
13
+
14
+
15
class GenericSpider(scrapy.Spider):
    """
    Spider that crawls the root sites registered in Appwrite, follows links
    that look like articles, and collects article metadata for the Introlix
    feed (flushed to MongoDB when the spider closes).
    """
    name = "generic"

    def __init__(self, *args, **kwargs):
        super(GenericSpider, self).__init__(*args, **kwargs)
        self.executor = ThreadPoolExecutor(max_workers=10)  # Control parallelism

        # Scraped article dicts, written to MongoDB in save_data().
        self.data = []

        self.all_urls = fetch_root_sites()
        # Captures the registrable domain in group 1, e.g. "example.com".
        self.domain_pattern = r'(?:[a-z0-9-]+\.)?([a-z0-9-]+\.[a-z]{2,})(?:\/|$)'

        self.allowed_domains = []
        self.start_urls = []
        self.CLASSIFICATION_API = os.getenv('CLASSIFICATION_API')

        for url in self.all_urls:
            result = re.search(self.domain_pattern, url)

            if result:
                self.allowed_domains.append(result.group(1))
                self.start_urls.append(result.group(1))

    def start_requests(self):
        # Crawl the full root URLs; the bare domains collected above only
        # feed allowed_domains filtering.
        for url in self.all_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def is_this_article(self, url):
        """
        Return True when *url* matches an article-like URL pattern and does
        not contain a known non-article keyword.
        """

        # list of article url patterns
        # NOTE(review): the trailing "|" inside (blog|...|) allows an empty
        # alternative — confirm intent before tightening the groups.
        article_pattern = [
            r'/(blog|article|articles|post|blog|posts|blogs|)/\d{4}/\d{2}/+[a-z0-9-]+/?',
            r'/(blog|article|articles|post|blog|posts|blogs|)/[a-z0-9-]+/[a-z0-9-]+',
            r'(?<!\/\/www)(blog|article|articles|post|posts|blogs)/[a-z0-9-]+',
            r'^(?!.*\/category\/).*\/[a-z0-9-]+\/[a-z0-9-]+(-[a-z0-9-]+)+$',
            r'/[^/]+/\d{4}/\d{2}/\d{2}/+[a-z0-9]+/?',
            # Bug fix: a missing comma after this entry previously made Python
            # concatenate it with the next literal into one regex that could
            # never match, silently disabling both patterns.
            r'/[^/]+/\d{4}/\d{2}/+[a-z0-9]+/?',
            r'/[a-z0-9-]+/\d{4}/\d{2}/+/?',
            r'/[a-z0-9-]+/\d{4}/\d{2}/\d{2}/+/?'
        ]

        # list of non article keywords
        non_article_words = [
            "category", "signup", "login", "about", "contact",  # Add more non-article keywords...
        ]

        # Check if the url matches any of the article patterns
        for pattern in article_pattern:
            if re.search(pattern, url):
                if not any(word in url for word in non_article_words):
                    return True
        return False

    def parse(self, response):
        """Extract candidate article links from a page and follow them."""
        # Get all the urls from the response
        urls = response.css('a::attr(href)').extract()

        # Keep only article-looking links; strip query strings and resolve
        # relative URLs against the current page.
        article_urls = [response.urljoin(url.split("?")[0]) for url in urls if self.is_this_article(url)]

        # Send a request to each article url
        for url in article_urls:
            yield scrapy.Request(url=url, callback=self.parse_article)

    async def classify_article(self, text):
        """
        Ask the external classification API for *text*'s category.

        Returns the category string ('Unknown' when the response has no
        'category' key, 'Error' when the request fails).
        """
        classify_ai = self.CLASSIFICATION_API
        payload = {"text": text}

        # Send a request to the classification API
        async with aiohttp.ClientSession() as session:
            try:
                async with session.post(classify_ai, json=payload) as response:
                    response.raise_for_status()
                    result = await response.json()
                    return result.get('category', 'Unknown')
            except aiohttp.ClientError as e:
                self.logger.error(f"Error making request to classification API: {e}")
                return 'Error'

    async def parse_article(self, response):
        """
        Collect title/description/date/image/category for one article page
        and queue it for persistence at spider close.
        """
        hostname = response.url.split("/")[2]  # site host, stored as the source name
        title = response.css("h1::text").get()  # may be None if the page has no <h1>
        url = response.url
        desc = response.css('meta[name="description"]::attr(content)').get()
        publication_date = response.css('span::text, time::text').re_first(r'(\w+ \d+|\d+\s?\w+,? \w+)')
        image_url = response.css('meta[property="og:image"]::attr(content)').get()

        # Classify article title asynchronously
        category = await self.classify_article(title)

        # Prepare feed item
        feed_items = {
            "title": title,
            "desc": desc,
            "url": url,
            "publication_date": publication_date,
            "image_url": image_url,
            "category": category,
            "source": hostname
        }

        self.data.append(feed_items)

    def closed(self, reason):
        """Scrapy hook: flush collected items to MongoDB once the crawl ends."""
        print(f"Spider closed: {reason}")
        print("Saving ----")
        self.save_data()

    def save_data(self):
        """Insert every collected article dict into the MongoDB feed collection."""
        for feed_items in self.data:
            feed_data.insert_one(feed_items)
141
+
142
+
143
+ # import re
144
+ # import scrapy
145
+ # from pathlib import Path
146
+ # import requests
147
+ # from concurrent.futures import ThreadPoolExecutor
148
+ # from twisted.internet.defer import ensureDeferred
149
+ # from introlix_api.app.database import feed_data, db
150
+ # from introlix_api.app.appwrite import fetch_root_sites
151
+
152
+
153
+ # class GenericSpider(scrapy.Spider):
154
+ # name = "generic"
155
+
156
+ # def __init__(self, *args, **kwargs):
157
+ # super(GenericSpider, self).__init__(*args, **kwargs)
158
+ # self.executor = ThreadPoolExecutor(max_workers=10)
159
+
160
+ # self.data = []
161
+
162
+ # self.all_urls = fetch_root_sites()
163
+ # self.domain_pattern = r'(?:[a-z0-9-]+\.)?([a-z0-9-]+\.[a-z]{2,})(?:\/|$)'
164
+
165
+ # self.allowed_domains = []
166
+ # self.start_urls = []
167
+
168
+ # for url in self.all_urls:
169
+ # result = re.search(self.domain_pattern, url)
170
+
171
+ # if result:
172
+ # self.allowed_domains.append(result.group(1))
173
+ # self.start_urls.append(result.group(1))
174
+
175
+ # def start_requests(self):
176
+
177
+ # for url in self.all_urls:
178
+ # yield scrapy.Request(url=url, callback=self.parse)
179
+
180
+ # def is_this_article(self, url):
181
+ # article_pattern = [
182
+ # r'/(blog|article|articles|post|blog|posts|blogs|)/\d{4}/\d{2}/+[a-z0-9-]+/?',
183
+ # r'/(blog|article|articles|post|blog|posts|blogs|)/[a-z0-9-]+/[a-z0-9-]+',
184
+ # r'(?<!\/\/www)(blog|article|articles|post|posts|blogs)/[a-z0-9-]+',
185
+ # r'^(?!.*\/category\/).*\/[a-z0-9-]+\/[a-z0-9-]+(-[a-z0-9-]+)+$',
186
+ # r'/[^/]+/\d{4}/\d{2}/\d{2}/+[a-z0-9]+/?',
187
+ # r'/[^/]+/\d{4}/\d{2}/+[a-z0-9]+/?'
188
+ # r'/[a-z0-9-]+/\d{4}/\d{2}/+/?',
189
+ # r'/[a-z0-9-]+/\d{4}/\d{2}/\d{2}/+/?'
190
+ # ]
191
+
192
+ # # List of non-article keywords
193
+ # non_article_words = [
194
+ # "category",
195
+ # "signup",
196
+ # "login",
197
+ # "about",
198
+ # "contact",
199
+ # "privacy",
200
+ # "terms",
201
+ # "faq",
202
+ # "help",
203
+ # "support",
204
+ # "user",
205
+ # "account",
206
+ # "settings",
207
+ # "profile",
208
+ # "admin",
209
+ # "dashboard",
210
+ # "search",
211
+ # "index",
212
+ # "topics",
213
+ # "rss",
214
+ # "solutions",
215
+ # "shows",
216
+ # "author"
217
+ # ]
218
+
219
+ # for pattern in article_pattern:
220
+ # if re.search(pattern, url):
221
+ # for word in non_article_words:
222
+ # if word in url:
223
+ # return False
224
+ # return True
225
+ # return False
226
+
227
+ # def parse(self, response):
228
+ # urls = response.css('a::attr(href)').extract()
229
+
230
+ # article_urls = [response.urljoin(url.split("?")[0]) for url in urls if self.is_this_article(url)]
231
+
232
+ # for url in article_urls:
233
+ # yield scrapy.Request(url=url, callback=self.parse_article)
234
+
235
+ # def classify_article(self, text):
236
+ # classify_ai = "dont show api"
237
+ # payload = {"text": text}
238
+
239
+ # try:
240
+ # response = requests.post(classify_ai, json=payload)
241
+ # response.raise_for_status()
242
+ # return response.json().get('category', 'Unknown')
243
+ # except requests.RequestException as e:
244
+ # self.logger.error(f"Error making request to classification API: {e}")
245
+ # return 'Error'
246
+
247
+ # def parse_article(self, response):
248
+ # # getting all the infomation from the article
249
+
250
+ # hostname = response.url.split("/")[2]
251
+
252
+
253
+ # title = response.css("h1::text").get()
254
+ # url = response.url
255
+ # desc = response.css('meta[name="description"]::attr(content)').get()
256
+ # publication_date = response.css('span::text, time::text').re_first(r'(\w+ \d+|\d+\s?\w+,? \w+)')
257
+ # image_url = response.css('meta[property="og:image"]::attr(content)').get()
258
+
259
+ # # Using ThreadPoolExecutor to classify the title in a separate thread
260
+ # future = self.executor.submit(self.classify_article, title)
261
+ # category = future.result()
262
+
263
+ # # storing the infomation on mongodb
264
+ # feed_items = {
265
+ # "title": title,
266
+ # "desc": desc,
267
+ # "url": url,
268
+ # "publication_date": publication_date,
269
+ # "image_url": image_url,
270
+ # "category": category,
271
+ # "source": hostname
272
+ # }
273
+
274
+ # self.data.append(feed_items)
275
+
276
+ # def closed(self, reason):
277
+ # print(f"Spider closed: {reason}")
278
+ # print("Saving ----")
279
+ # self.save_data()
280
+
281
+ # def save_data(self):
282
+ # if "feed_Data" in db.list_collection_names():
283
+ # feed_data.drop()
284
+
285
+ # for feed_items in self.data:
286
+ # feed_data.insert_one(feed_items)
src/introlix_api/app/introlix_spider/scrapy.cfg ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Automatically created by: scrapy startproject
2
+ #
3
+ # For more information about the [deploy] section see:
4
+ # https://scrapyd.readthedocs.io/en/latest/deploy.html
5
+
6
+ [settings]
7
+ default = introlix_spider.settings
8
+
9
+ [deploy]
10
+ #url = http://localhost:6800/
11
+ project = introlix_spider
src/introlix_api/app/model.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+ from typing import List, Optional
3
+ from datetime import date
4
+ from datetime import datetime
5
+
6
# signup model
class UserSignup(BaseModel):
    """Request body for the signup endpoint."""
    name: str
    email: str  # NOTE(review): plain str — no email-format validation here
    password: str
    dob: date
    interestList: List[str]  # interests chosen at signup
13
+
14
# login model
class UserLogin(BaseModel):
    """Request body for the login endpoint."""
    email: str
    password: str
18
+
19
# feed model
class FeedModel(BaseModel):
    """A feed post as returned by the posts API."""
    id: str = Field(..., alias="_id")  # MongoDB _id, populated via the `_id` alias
    title: str
    desc: str
    url: str
    image_url: str
    tags: list
    vote: int
    created_at: Optional[datetime]
29
+
30
class DiscussionModel(BaseModel):
    """A discussion thread: like FeedModel but with an answer count and no desc/image."""
    id: str = Field(..., alias="_id")  # MongoDB _id, populated via the `_id` alias
    title: str
    url: str
    tags: list
    vote: int
    created_at: Optional[datetime]
    answer_count: int
src/introlix_api/app/routes/__init__.py ADDED
File without changes
src/introlix_api/app/routes/auth.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ from fastapi import APIRouter, HTTPException, Query
3
+
4
+ from introlix_api.exception import CustomException
5
+ from introlix_api.app.model import UserSignup, UserLogin
6
+ from introlix_api.app.appwrite import databases, APPWRITE_DATABASE_ID, ID, APPWRITE_ACCOUNT_COLLECTION_ID
7
+ from introlix_api.logger import logger
8
+
9
+ router = APIRouter()
10
+
11
@router.post("/test")
async def test(data: dict):
    """Echo endpoint used to confirm that POST routing works."""
    return {"message": "POST request works with data {}".format(data)}
14
+
15
@router.post('/signup')
async def signup(user: UserSignup):
    """
    Register a new user account in the Appwrite accounts collection.

    Rejects the request when the email is already registered; otherwise
    stores the user document with a randomly assigned avatar color.

    Args:
        user (UserSignup): signup payload (name, email, password, dob, interests).

    Returns:
        dict: success message plus document id, interests, the first
        character of the stored name, and the assigned profile color.

    Raises:
        HTTPException: 400 when the email is taken or creation fails.
    """
    try:
        # List of avatar colors randomly assigned to new accounts.
        avatar_colors = [
            "#FF4500",  # Orange Red
            "#FF6347",  # Tomato
            "#FF7F50",  # Coral
            "#FF8C00",  # Dark Orange
            "#FFD700",  # Gold
            "#ADFF2F",  # Green Yellow
            "#32CD32",  # Lime Green
            "#00FA9A",  # Medium Spring Green
            "#40E0D0",  # Turquoise
            "#1E90FF",  # Dodger Blue
            "#4682B4",  # Steel Blue
            "#8A2BE2",  # Blue Violet
            "#FF69B4",  # Hot Pink
            "#FF1493",  # Deep Pink
            "#C71585"   # Medium Violet Red
        ]
        # Check if the email is already registered.
        # NOTE(review): this lists every document and scans client-side;
        # an Appwrite equality query on Email would scale better.
        existing_users = databases.list_documents(
            database_id=APPWRITE_DATABASE_ID,
            collection_id=APPWRITE_ACCOUNT_COLLECTION_ID,
        )

        for doc in existing_users['documents']:
            if doc['Email'] == user.email:
                raise HTTPException(status_code=400, detail="Email is already registered")

        # SECURITY(review): the password is persisted in plain text here —
        # it should be hashed (e.g. bcrypt) before storage.
        result = databases.create_document(
            database_id=APPWRITE_DATABASE_ID,
            collection_id=APPWRITE_ACCOUNT_COLLECTION_ID,
            document_id=ID.unique(),
            data={
                "Name": user.name,
                "Email": user.email,
                "Password": user.password,
                "DOB": user.dob.isoformat(),
                "interests": user.interestList,
                "profileColor": random.choice(avatar_colors)
            }
        )
        # "Name"[0] returns only the first character of the name — matches
        # the login/verify endpoints' behavior.
        return {"message": "User created successfully", "document_id": result['$id'], "interests": result["interests"], "name": result["Name"][0], "profileColor": result["profileColor"]}

    except HTTPException:
        # BUG FIX: propagate deliberate HTTP errors unchanged. Previously the
        # generic handler below caught them and re-raised with a mangled
        # detail string ("400: Email is already registered").
        raise
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
69
+
70
@router.post('/login')
async def login(user: UserLogin):
    """
    Authenticate a user by email and password.

    Args:
        user (UserLogin): login payload (email, password).

    Returns:
        dict: success message plus document id, interests, the first
        character of the stored name, and the profile color.

    Raises:
        HTTPException: 400 on invalid credentials or lookup failure.
    """
    try:
        # SECURITY(review): passwords are compared in plain text — should be
        # hashed and verified with a constant-time comparison.
        users = databases.list_documents(
            database_id=APPWRITE_DATABASE_ID,
            collection_id=APPWRITE_ACCOUNT_COLLECTION_ID
        )
        # Find the user with matching email and password.
        for doc in users['documents']:
            if doc['Email'] == user.email and doc['Password'] == user.password:
                return {"message": "Login successful", "document_id": doc['$id'], "interests": doc["interests"], "name": doc["Name"][0], "profileColor": doc["profileColor"]}
        raise HTTPException(status_code=400, detail="Invalid credentials")
    except HTTPException:
        # BUG FIX: keep the "Invalid credentials" detail intact instead of
        # letting the generic handler re-wrap it as "400: Invalid credentials".
        raise
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
88
+
89
@router.post("/verify_it_user")
async def verify_user_exist(user_id: str = Query(...)):
    """
    Verify that a user document with the given id exists.

    Args:
        user_id (str): Appwrite document id of the user.

    Returns:
        dict: confirmation message plus interests, the first character of
        the stored name, and the profile color.

    Raises:
        HTTPException: 404 when the user is not found, 500 on lookup failure.
    """
    try:
        users = databases.list_documents(
            database_id=APPWRITE_DATABASE_ID,
            collection_id=APPWRITE_ACCOUNT_COLLECTION_ID
        )

        # Find the user with a matching document id.
        for doc in users['documents']:
            if user_id == doc['$id']:
                return {"message": "It's User", "interests": doc["interests"], "name": doc["Name"][0], "profileColor": doc["profileColor"]}

        # If no matching user is found.
        raise HTTPException(status_code=404, detail="User not found")
    except HTTPException:
        # BUG FIX: previously the 404 above was caught by the generic handler
        # and surfaced to clients as a 500 — re-raise it unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
src/introlix_api/app/routes/posts.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bson import ObjectId
2
+ import pytz
3
+ from dateutil import parser
4
+ from datetime import datetime, timezone
5
+ from fastapi import FastAPI, APIRouter, HTTPException, Request, Query
6
+ from introlix_api.app.database import votes
7
+ from introlix_api.exception import CustomException
8
+ from introlix_api.app.database import startup_db_client, shutdown_db_client
9
+ from introlix_api.app.model import FeedModel, DiscussionModel
10
+ from contextlib import asynccontextmanager
11
+ from typing import List
12
+
13
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: open the MongoDB client before the app serves
    requests and close it again on shutdown."""
    # Start the database connection
    await startup_db_client(app)
    yield
    # Close the database connection
    await shutdown_db_client(app)
20
+
21
+ router = APIRouter()
22
+
23
def normalize_date(date_str):
    """
    Parse *date_str* into a timezone-aware UTC datetime.

    Strict ISO-8601 parsing is attempted first (a trailing 'Z' is rewritten
    as '+00:00'); on failure dateutil's flexible parser is tried. Returns
    None when the string cannot be parsed at all.
    """
    iso_candidate = date_str.replace("Z", "+00:00")
    try:
        parsed = datetime.fromisoformat(iso_candidate)
    except ValueError:
        # ISO parsing failed — fall back to the permissive parser.
        try:
            parsed = parser.parse(date_str)
        except (ValueError, TypeError):
            print(f"Warning: Unrecognized date format for '{date_str}'")
            return None
    # Normalize to UTC before returning.
    return parsed.astimezone(pytz.UTC)
37
+
38
@router.get('/posts', response_model=List[FeedModel])
async def fetch_data(request: Request, tags: List[str] = Query(...), page: int = 1, limit: int = 20):
    """
    Fetch article posts matching any of *tags*, ranked by "hotness".

    Args:
        request: FastAPI request (carries the app's MongoDB client).
        tags: tags to match against content.tags ($in).
        page: 1-based page number.
        limit: page size.

    Returns:
        List[FeedModel]: posts sorted hottest-first.

    Raises:
        HTTPException: 400 on any failure.
    """
    try:
        skip = (page - 1) * limit
        query = {
            "content.tags": {"$in": tags},
            "type": "article"
        }
        response = await request.app.mongodb['search_data'].find(query).skip(skip).limit(limit).to_list(limit)

        current_date = datetime.now(timezone.utc)
        hotness_ranked_posts = []

        for item in response:
            # Flatten the nested `content` document into the FeedModel shape.
            item["_id"] = str(item['_id'])
            item["title"] = item['content'].get('title', '')
            item["desc"] = item['content'].get('desc', '')
            item["url"] = item.get('url', '')
            item["image_url"] = item['content'].get('image', '') or ""
            item["tags"] = item['content'].get('tags', [])
            item["vote"] = item['content'].get('vote', 0)

            # Handle created_at normalization.
            created_at_str = item['content'].get('created_at', '')
            if created_at_str in [None, "No date found"]:
                created_at = current_date
            else:
                created_at = normalize_date(created_at_str)

            if created_at:
                # Age in hours drives the decay term of the ranking.
                age_hours = (current_date - created_at).total_seconds() / 3600
                # Hacker-News-style hotness: votes decayed by age^1.5.
                rank = (item["vote"] - 1) / ((age_hours + 2) ** 1.5)
                item["rank"] = rank
            else:
                # Unparseable dates sink to the bottom of the feed.
                item["rank"] = float('-inf')

            # NOTE(review): "Unknown" may not pass FeedModel's
            # Optional[datetime] validation — confirm against response_model.
            item["created_at"] = created_at.isoformat() if created_at else "Unknown"
            hotness_ranked_posts.append(item)

        # BUG FIX: sort descending so the hottest posts come first.
        # Previously reverse=False returned the coldest (and -inf ranked
        # invalid) posts at the top of the feed.
        hotness_ranked_posts.sort(key=lambda x: x["rank"], reverse=True)
        return hotness_ranked_posts

    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
90
+
91
@router.get('/discussion', response_model=List[DiscussionModel])
async def fetch_disscussion(request: Request, tags: List[str] = Query(...), page: int = 1, limit: int = 20):
    """
    Fetch discussion posts matching any of *tags*, ranked by "hotness".

    Args:
        request: FastAPI request (carries the app's MongoDB client).
        tags: tags to match against content.tags ($in).
        page: 1-based page number.
        limit: page size.

    Returns:
        List[DiscussionModel]: discussions sorted hottest-first.

    Raises:
        HTTPException: 400 on any failure.
    """
    try:
        skip = (page - 1) * limit
        query = {
            "content.tags": {"$in": tags},
            "type": "discussion"
        }
        response = await request.app.mongodb['search_data'].find(query).skip(skip).limit(limit).to_list(limit)

        current_date = datetime.now(timezone.utc)
        hotness_ranked_posts = []

        for item in response:
            # Flatten the nested `content` document into the DiscussionModel shape.
            item["_id"] = str(item['_id'])
            item["title"] = item['content'].get('title', '')
            item["url"] = item.get('url', '')
            item["tags"] = item['content'].get('tags', [])
            item["vote"] = item['content'].get('vote', 0)
            item["answer_count"] = item['content'].get('answer_count', 0)

            # Handle created_at normalization. Discussion dates are stored as
            # epoch seconds (Stack Overflow style).
            created_at_raw = item['content'].get('created_at', '')
            if created_at_raw in [None, "No date found", ""]:
                created_at = current_date
            else:
                try:
                    # BUG FIX: the timestamp conversion used to run *before*
                    # the missing-value check, so None / "No date found"
                    # crashed with a TypeError and the check was dead code.
                    # Also build an aware UTC datetime directly instead of a
                    # naive one that was later localized as local time.
                    created_at = datetime.fromtimestamp(created_at_raw, tz=timezone.utc)
                except (TypeError, ValueError, OSError, OverflowError):
                    created_at = current_date

            if created_at:
                # Age in hours drives the decay term of the ranking.
                age_hours = (current_date - created_at).total_seconds() / 3600
                # Hacker-News-style hotness: votes decayed by age^1.5.
                rank = (item["vote"] - 1) / ((age_hours + 2) ** 1.5)
                item["rank"] = rank
            else:
                # If created_at is invalid, rank the item last.
                item["rank"] = float('-inf')

            item["created_at"] = created_at.isoformat() if created_at else "Unknown"
            hotness_ranked_posts.append(item)

        # BUG FIX: sort descending so the hottest discussions come first
        # (was reverse=False, i.e. coldest-first).
        hotness_ranked_posts.sort(key=lambda x: x["rank"], reverse=True)
        return hotness_ranked_posts

    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
143
+
144
@router.post('/vote')
async def vote(request: Request, vote: int, post_id: str = Query(...), user_id: str = Query(...)):
    """
    Toggle or switch a user's vote on a post, then refresh the post total.

    Behaviour:
      * the same vote already exists  -> remove it (toggle off)
      * a different vote exists       -> replace it
      * no vote exists                -> insert it
    Afterwards the post's total vote is recomputed and written back to
    search_data as content.vote.

    Args:
        request: FastAPI request (carries the app's MongoDB client).
        vote (int): vote value to apply (e.g. +1 / -1).
        post_id (str): hex ObjectId of the post.
        user_id (str): id of the voting user.

    Raises:
        HTTPException: 400 on any failure (including an invalid post_id).
    """
    try:
        post_id = ObjectId(post_id)

        # Was this exact vote already cast? (toggle semantics)
        result = await request.app.mongodb['votes'].find_one({"user_id": user_id, "post_id": post_id, "vote": vote})

        if result:
            # CONSISTENCY FIX: all vote writes now go through the request's
            # async MongoDB client; previously deletes/inserts used the
            # separate synchronous `votes` handle while reads were async.
            await request.app.mongodb['votes'].delete_one({
                "_id": result["_id"]
            })
        else:
            # Replace any previous (different) vote by this user on the post.
            existing_vote = await request.app.mongodb['votes'].find_one({"user_id": user_id, "post_id": post_id})
            if existing_vote:
                await request.app.mongodb['votes'].delete_one({
                    "_id": existing_vote["_id"]
                })

            await request.app.mongodb['votes'].insert_one({
                "post_id": post_id,
                "user_id": user_id,
                "vote": vote
            })

        # Calculate the total vote count for the post.
        total_votes = await request.app.mongodb['votes'].aggregate([
            {"$match": {"post_id": post_id}},
            {"$group": {"_id": "$post_id", "total_votes": {"$sum": "$vote"}}}
        ]).to_list(length=1)

        # Extract the total vote count or default to 0 if no votes are found.
        vote_count = total_votes[0]["total_votes"] if total_votes else 0

        # Update the denormalized vote count in the post document.
        await request.app.mongodb['search_data'].update_one(
            {"_id": post_id},
            {"$set": {"content.vote": vote_count}}
        )

        return {"message": f"Vote submitted successfully with total vote {vote_count}"}
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
191
+
192
+
193
@router.get('/hasvoted')
async def hasVote(request: Request, post_id: str = Query(...), user_id: str = Query(...)):
    """
    Report whether *user_id* has an existing vote on *post_id*.

    Returns:
        dict: {"has_voted": bool} plus the stored vote value when present.

    Raises:
        HTTPException: 400 on any failure (including an invalid post_id).
    """
    try:
        oid = ObjectId(post_id)

        record = await request.app.mongodb['votes'].find_one(
            {"user_id": user_id, "post_id": oid}
        )

        if not record:
            return {"has_voted": False}
        return {"has_voted": True, "vote": record['vote']}
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
src/introlix_api/app/routes/run_spider.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import subprocess
3
+ from fastapi import APIRouter, HTTPException, Query
4
+
5
+ from introlix_api.exception import CustomException
6
+ from introlix_api.logger import logger
7
+
8
+ router = APIRouter()
9
+
10
@router.post('/run_spider')
async def run_spider():
    """
    Launch the Scrapy "generic" spider and return its console output.

    Returns:
        tuple: (stdout, stderr) of the crawl process; FastAPI serializes
        this as a two-element JSON array.

    Raises:
        HTTPException: 400 if starting the process fails.
    """
    try:
        command = ["scrapy", "crawl", "generic"] # command to run the spider
        working_directory = "src/introlix_api/app/introlix_spider" # directory to run the spider

        # NOTE(review): subprocess.run blocks the event loop for the entire
        # crawl — consider run_in_executor or asyncio.create_subprocess_exec.
        # The relative cwd also assumes the server was started from the repo
        # root; verify deployment working directory.
        result = subprocess.run(command, cwd=working_directory, capture_output=True, text=True) # run the spider

        return result.stdout, result.stderr
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
src/introlix_api/app/routes/similarity.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import re
3
+ from fastapi import FastAPI, APIRouter, HTTPException, Request
4
+
5
+ from introlix_api.exception import CustomException
6
+ from introlix_api.app.database import startup_db_client, shutdown_db_client
7
+ from introlix_api.logger import logger
8
+ from sklearn.feature_extraction.text import TfidfVectorizer
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+ from contextlib import asynccontextmanager
11
+
12
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: open the MongoDB client before the app serves
    requests and close it again on shutdown."""
    # Start the database connection
    await startup_db_client(app)
    yield
    # Close the database connection
    await shutdown_db_client(app)
19
+
20
+ router = APIRouter()
21
+
22
def preprocess_text(text):
    """
    Normalize free text for TF-IDF comparison.

    Lowercases the input, collapses every whitespace run (spaces, tabs,
    newlines) to a single space, then strips punctuation — anything that is
    not a word character or whitespace.
    """
    lowered = text.lower()
    collapsed = re.sub(r'\s+', ' ', lowered)
    return re.sub(r'[^\w\s]', '', collapsed)
31
+
32
@router.get('/similarity')
async def similarity(request: Request, page: int = 1, limit: int = 20, query: str = None):
    """
    Return feed posts whose title+description are TF-IDF cosine-similar
    to *query*.

    Args:
        request: FastAPI request (carries the app's MongoDB client).
        page: 1-based page number over the feedData collection.
        limit: page size.
        query: free-text search string (required).

    Returns:
        list: posts whose cosine similarity to the query is >= 0.05.

    Raises:
        HTTPException: 400 when the query is missing or processing fails.
    """
    try:
        # Ensure the query is provided.
        if not query:
            raise HTTPException(status_code=400, detail="Query parameter is required")

        skip = (page - 1) * limit

        # Fetch posts from MongoDB.
        response = await request.app.mongodb['feedData'].find().skip(skip).limit(limit).to_list(limit)

        # Keep only items that have both a title and a description.
        response = [item for item in response if item.get('title') and item.get('desc')]

        # Convert ObjectId to string for JSON serialization.
        for item in response:
            item['_id'] = str(item['_id'])

        # Prepare document texts (title + desc) for similarity calculation.
        posts_texts = [preprocess_text(item['title'] + ' ' + item['desc']) for item in response]

        # Preprocess the query the same way as the documents.
        query = preprocess_text(query)

        # Document 0 is the query; documents 1..n are the posts, so
        # cosine_similarities[i] corresponds to response[i].
        documents = [query] + posts_texts

        vectorizer = TfidfVectorizer(stop_words='english')
        tfidf = vectorizer.fit_transform(documents)

        # Cosine similarity between the query row and every post row.
        cosine_similarities = cosine_similarity(tfidf[0:1], tfidf[1:]).flatten()

        # Debugging: print cosine similarity scores for better understanding.
        print("Cosine Similarities:", cosine_similarities)

        # Low threshold because titles/descriptions are short texts.
        similarity_threshold = 0.05

        similar_posts = [
            response[i] for i in range(len(response)) if cosine_similarities[i] >= similarity_threshold
        ]

        return similar_posts
    except HTTPException:
        # Keep the deliberate 400 ("Query parameter is required") intact
        # instead of letting the generic handler below re-wrap it.
        raise
    except Exception as e:
        # BUG FIX: was `detail=str` — passing the builtin type itself as the
        # error detail instead of the exception message.
        raise HTTPException(status_code=400, detail=str(e))
src/introlix_api/crawler/__init__.py ADDED
File without changes
src/introlix_api/crawler/bot.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys, re, time
2
+ import errno
3
+ import string
4
+ import requests
5
+ import multiprocessing
6
+ from bs4 import BeautifulSoup
7
+ from dataclasses import dataclass
8
+ from introlix_api.logger import logger
9
+ from urllib.parse import urlparse, urlunsplit, urljoin
10
+ from urllib.robotparser import RobotFileParser
11
+ from introlix_api.exception import CustomException
12
+ from urllib.robotparser import RobotFileParser
13
+
14
+ from requests import ReadTimeout
15
+ from introlix_api.utils.core import html_to_dom
16
+ from introlix_api.utils.tags import fetch_tags
17
+ from introlix_api.utils.root_sites import root_sites
18
+ from ssl import SSLCertVerificationError
19
+ from urllib3.exceptions import NewConnectionError, MaxRetryError
20
+
21
+ @dataclass
22
+ class BotArgs:
23
+ TIMEOUT_SECONDS = 3
24
+ MAX_FETCH_SIZE = 1024*1024
25
+ BAD_URL_REGEX = re.compile(r'\/\/localhost\b|\.jpg$|\.png$|\.js$|\.gz$|\.zip$|\.pdf$|\.bz2$|\.ipynb$|\.py$')
26
+ GOOD_URL_REGEX = re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
27
+ DEFAULT_ENCODING = 'utf8'
28
+ DEFAULT_ENC_ERRORS = 'replace'
29
+ ALLOWED_EXCEPTIONS = (ValueError, ConnectionError, ReadTimeout, TimeoutError,
30
+ OSError, NewConnectionError, MaxRetryError, SSLCertVerificationError)
31
+
32
class IntrolixBot:
    """
    Polite web crawler restricted to a configured set of root sites.

    Fetches pages with size/time limits, optionally honors robots.txt,
    extracts title/description/image/date metadata and same-root links,
    and can fan work out over multiprocessing pools.
    """

    def __init__(self, urls: list, args: BotArgs, obey_robots_txt: bool = True):
        """
        Initialize the IntrolixBot.

        Args:
            urls (list): List of URLs to scrape.
            args (BotArgs): crawler tuning knobs (timeout, size cap, URL regexes,
                encodings, recoverable exception types).
            obey_robots_txt (bool, optional): Whether to obey robots.txt. Defaults to True.
        """
        self.urls = urls
        self.obey_robots_txt = obey_robots_txt
        self.root_sites = root_sites()
        # Pre-compute every root site's netloc so link filtering in
        # get_urls_from_page is a set lookup instead of repeated comparisons.
        self.root_sites_netlocs = {urlparse(root_url).netloc for root_url in self.root_sites}
        self.good_tags = fetch_tags()

        # bot args (copied onto the instance for convenience)
        self.TIMEOUT_SECONDS = args.TIMEOUT_SECONDS
        self.MAX_FETCH_SIZE = args.MAX_FETCH_SIZE
        self.BAD_URL_REGEX = args.BAD_URL_REGEX
        self.GOOD_URL_REGEX = args.GOOD_URL_REGEX
        self.DEFAULT_ENCODING = args.DEFAULT_ENCODING
        self.DEFAULT_ENC_ERRORS = args.DEFAULT_ENC_ERRORS
        self.ALLOWED_EXCEPTIONS = args.ALLOWED_EXCEPTIONS

    def fetch(self, url:str) -> tuple[int, bytes]:
        """
        Function to fetch a URL.

        Streams the body in 1 KiB chunks, aborting with ValueError when the
        wall-clock timeout elapses and truncating once MAX_FETCH_SIZE is
        exceeded (truncated bodies are returned, not discarded).

        Args:
            url (str): URL to fetch.
        Returns:
            tuple[int, bytes]: status code and (possibly truncated) content.
        """

        r = requests.get(url, stream=True, timeout=self.TIMEOUT_SECONDS)

        size = 0
        start = time.time()

        content = b""
        for chunk in r.iter_content(1024):
            # Enforce a total-download deadline, not just a connect timeout.
            if time.time() - start > self.TIMEOUT_SECONDS:
                raise ValueError('Timeout reached')

            content += chunk

            size += len(chunk)
            if size > self.MAX_FETCH_SIZE:
                logger.debug(f"Maximum size reached for URL {url}")
                break

        return r.status_code, content

    def see_robots_txt(self, url: str) -> bool:
        """
        Function to check if robots.txt allows this bot to crawl.

        Fail-open policy: when robots.txt cannot be fetched or decoded the
        URL is treated as allowed.

        Args:
            url (str): URL to check.
        Returns:
            bool: True if the bot is allowed to crawl, False otherwise.
        """
        try:
            try:
                parsed_url = urlparse(url)
            except ValueError:
                logger.debug(f"Unable to parse URL: {url}")
                return False

            # urlunsplit inserts the missing '/' before 'robots.txt' when a
            # netloc is present, yielding scheme://host/robots.txt.
            robots_url = urlunsplit((parsed_url.scheme, parsed_url.netloc, 'robots.txt', '', ''))
            parse_robots = RobotFileParser(robots_url)

            try:
                status_code, content = self.fetch(robots_url)
            except Exception as e: # Catch all exceptions for now
                logger.debug(f"Robots error: {robots_url}, {e}")
                # Fail open: unreachable robots.txt does not block crawling.
                return True

            # Try utf-8 first, then latin-1, before giving up on decoding.
            decoded = None
            for encoding in ['utf-8', 'iso-8859-1']:
                try:
                    decoded = content.decode(encoding).splitlines()
                    break
                except UnicodeDecodeError:
                    pass

            if decoded is None:
                logger.debug(f"Unable to decode robots file {robots_url}")
                return True

            parse_robots.parse(decoded)
            allowed = parse_robots.can_fetch('IntrolixBot', url) # Your bot's name
            logger.debug(f"Robots allowed for {url}: {allowed} and {decoded} is decoded with {robots_url}")
            return allowed
        except Exception as e:
            raise CustomException(e, sys) from e

    def get_urls_from_page(self, url: str) -> list:
        """
        Function to get all URLs from a page.

        Only anchors whose domain belongs to the configured root sites are
        kept; relative hrefs are resolved against *url* first. Any error
        yields an empty list rather than aborting the crawl.

        Args:
            url (str): URL of the page.
        Returns:
            list: de-duplicated list of same-root URLs from the page
                (order is unspecified because of the set round-trip).
        """
        try:
            status_code, content = self.fetch(url)

            if status_code != 200:
                return []

            soup = BeautifulSoup(content, 'html.parser')
            urls = []

            for link in soup.find_all('a'):
                href = link.get('href')
                if href:
                    # Resolve relative links against the page URL.
                    if not href.startswith('http'):
                        href = urljoin(url, href)
                    # if not self.BAD_URL_REGEX.search(href):
                    #     href = href
                    if self.GOOD_URL_REGEX.search(href):
                        href_netloc = urlparse(href).netloc

                        logger.debug(f"Checking href domain: {href_netloc} against root domains")

                        # Restrict the crawl frontier to known root sites.
                        if href_netloc in self.root_sites_netlocs:
                            urls.append(href)

            return list(set(urls))

        except Exception as e:
            logger.info(f"Error occured while getting urls from page {e}")
            return []
            # raise CustomException(e, sys) from e

    def scrape(self, url: str) -> dict:
        """
        Function to scrape the site.

        Produces either a content payload (title, desc, image, tags, links,
        created_at) or an error payload with a named error — RobotsDenied,
        AbortError, NoResponseText, or a DOM-parse failure.

        Args:
            url (str): URL to scrape.
        Returns:
            dict: scraped data.
        """
        try:
            logger.info(f"Crawling URL {url}")
            # Millisecond timestamp, JavaScript-style.
            js_timestamp = int(time.time() * 1000)

            if self.obey_robots_txt:
                allowed = self.see_robots_txt(url)

                if not allowed:
                    return {
                        'url': url,
                        'status': None,
                        'timestamp': js_timestamp,
                        'content': None,
                        'error': {
                            'name': 'RobotsDenied',
                            'message': 'Robots do not allow this URL',
                        }
                    }

            try:
                status_code, content = self.fetch(url)
            except self.ALLOWED_EXCEPTIONS as e:
                # Recoverable network/transport failure — report, don't raise.
                logger.debug(f"Exception crawling URl {url}: {e}")
                return {
                    'url': url,
                    'status': None,
                    'timestamp': js_timestamp,
                    'content': None,
                    'error': {
                        'name': 'AbortError',
                        'message': str(e),
                    }
                }

            if len(content) == 0:
                return {
                    'url': url,
                    'status': status_code,
                    'timestamp': js_timestamp,
                    'content': None,
                    'error': {
                        'name': 'NoResponseText',
                        'message': 'No response found',
                    }
                }

            try:
                dom = html_to_dom(content, self.DEFAULT_ENCODING, None, self.DEFAULT_ENC_ERRORS)
            except Exception as e:
                logger.exception(f"Error parsing dom: {url}")
                return {
                    'url': url,
                    'status': status_code,
                    'timestamp': js_timestamp,
                    'content': None,
                    'error': {
                        'name': e.__class__.__name__,
                        'message': str(e),
                    }
                }

            # <title> text, if present.
            title_element = dom.xpath("//title")
            title = ""
            if len(title_element) > 0:
                title_text = title_element[0].text
                if title_text is not None:
                    title = title_text.strip()


            # <meta name="description"> content, if present.
            desc_element = dom.xpath("//meta[@name='description']")
            desc = ""
            if len(desc_element) > 0:
                desc_text = desc_element[0].get('content')
                if desc_text is not None:
                    desc = desc_text.strip()

            # Prefer the og:image meta tag; fall back to the first <img src>.
            og_image_element = dom.xpath("//meta[@property='og:image']/@content")
            if og_image_element:
                image = og_image_element[0]
            else:
                image_elements = dom.xpath("//img")
                image_urls = [urljoin(url, img.get("src")) for img in image_elements if img.get("src")]
                if len(image_urls) > 0:
                    image = image_urls[0]
                else:
                    image = ""

            new_links = self.get_urls_from_page(url)
            new_links = list(set(new_links))

            # Normalize extracted keywords to match the format in good_tags:
            # lowercase, punctuation stripped, split on whitespace/hyphens.
            normalized_title = re.split(r'[\s-]+', title.lower().translate(str.maketrans('', '',
                                                                                         string.punctuation)))
            # Filter based on good_tags
            tags = [tag for tag in self.good_tags if tag in normalized_title]
            if not tags:
                tags = ['general']


            # Publication date: try article:published_time meta first.
            date = dom.xpath("string(//meta[@property='article:published_time']/@content)")

            # Fallback: Check JSON-LD for datePublished in <script>
            if not date:
                json_ld_date = dom.xpath("string(//script[@type='application/ld+json'])")
                if json_ld_date:
                    import json
                    try:
                        data = json.loads(json_ld_date)
                        date = data.get("datePublished", "").split("T")[0]
                    except json.JSONDecodeError:
                        pass

            # Fallback: Look for <time> tag with datetime attribute
            if not date:
                date = dom.xpath("string(//time/@datetime)")

            # Fallback: Check for common patterns with 'Last Updated'
            if not date:
                date = dom.xpath("string(//span[contains(text(), 'Last Updated')])")

            # Clean up date format if necessary (for example, strip out extra text)
            if date:
                # Extract date pattern YYYY-MM-DD or similar ("DD Mon, YYYY").
                match = re.search(r"\d{4}-\d{2}-\d{2}", date) or re.search(r"\d{2} \w{3}, \d{4}", date)
                date = match.group(0) if match else date


            return {
                'url': url,
                'content': {
                    'title': title,
                    'desc': desc,
                    'image': image,
                    'tags': tags,
                    'vote': 0,
                    'links': sorted(new_links),
                    'created_at': date if date else 'No date found'
                },
            }

        except Exception as e:
            raise CustomException(e, sys) from e

    def batch_converter(self, lst: list, batch_size: int):
        """
        Convert list into batches of a specified size.

        Args:
            lst (list): list to convert
            batch_size (int): size of the batch
        Yields:
            list: successive slices of *lst*, the last one possibly shorter.
        """
        for i in range(0, len(lst), batch_size):
            yield lst[i:i + batch_size]

    def scrape_parallel(self, batch_size: int):
        """
        Process scrape in parallel using multiprocessing.

        Yields one list of scrape() results per batch of self.urls.
        EPIPE errors (consumer went away) are silently ignored by design.

        Args:
            batch_size (int): Number of URLs to process in each batch.
        Yields:
            list: scrape() result dicts for one batch.
        """
        # Leave one core free for the parent process.
        num_workers = max(1, os.cpu_count() - 1)
        # getting urls in batch
        batch_url = list(self.batch_converter(self.urls, batch_size))

        try:
            # Create a multiprocessing pool
            with multiprocessing.Pool(processes=num_workers) as pool:
                for batch in batch_url:
                    results = pool.map(self.scrape, batch)
                    # data = list([sublist for sublist in results])

                    yield results
                    # Brief pause between batches to ease load on targets.
                    time.sleep(0.1)
        except IOError as e:
            if e.errno == errno.EPIPE:
                pass

    def get_urls_from_page_parallel(self, urls: list, batch_size: int):
        """
        Process get_urls_from_page in parallel using multiprocessing.

        Yields discovered URLs one at a time as batches complete. EPIPE
        errors (consumer went away) are silently ignored by design.

        Args:
            urls (list): List of site URLs to process.
            batch_size (int): Number of URLs to process in each batch.
        Yields:
            str: each discovered URL, incrementally.
        """
        # Leave one core free for the parent process.
        num_workers = max(1, os.cpu_count() - 1)

        # getting urls in batch
        batch_url = list(self.batch_converter(urls, batch_size))

        try:
            # Create a multiprocessing pool
            with multiprocessing.Pool(processes=num_workers) as pool:
                for batch in batch_url:
                    results = pool.map(self.get_urls_from_page, batch)
                    # return list([url for sublist in results for url in sublist])
                    for sublist in results:
                        for url in sublist:
                            yield url # Yield each URL incrementally
                    # Brief pause between batches to ease load on targets.
                    time.sleep(0.1)


        except IOError as e:
            if e.errno == errno.EPIPE:
                pass

    def fetch_tags(self):
        # Accessor for the tag whitelist loaded at construction time.
        return self.good_tags
src/introlix_api/engine/__init__.py ADDED
File without changes
src/introlix_api/engine/api_data.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from introlix_api.engine.third_party_apis import get_devDotTo_data
2
+ from introlix_api.engine.graphql import fetch_hashnode_posts
3
+ from introlix_api.app.database import search_data
4
+ from introlix_api.logger import logger
5
+
6
def _to_entry(item: dict) -> dict:
    """Map one raw API post into the search_data document schema."""
    return {
        "url": item["url"],
        "content": {
            "title": item["title"],
            "desc": item["description"],
            "image": item["image"],
            "tags": item["tags"],
            "vote": 0,
            "created_at": item["created_at"],
        },
        "type": item["type"]
    }


def fetch_data(page: int = 1, per_page: int = 10, tag = ''):
    """
    Function to fetch data from multiple sources and combine them.

    Pulls one page each from dev.to and Hashnode, converts every post to
    the search_data document schema, and returns the combined list
    (dev.to entries first, then Hashnode). Returns [] when both sources
    come back empty.

    Args:
        page (int): 1-based page number passed to both sources.
        per_page (int): page size passed to both sources.
        tag (str): optional tag filter passed to both sources.

    Returns:
        list[dict]: search_data-shaped documents.
    """
    devDotTo_data = get_devDotTo_data(page, per_page, tag)
    hashnode_posts = fetch_hashnode_posts(page=page, per_page=per_page, tag=tag)

    # The three original branches (both / only hashnode / only dev.to) all
    # built the exact same entry dict — collapse them into one pass while
    # preserving the combined ordering (dev.to first, then Hashnode).
    combined_data = (devDotTo_data or []) + (hashnode_posts or [])

    return [_to_entry(item) for item in combined_data]
+ return data
73
+
74
+
75
+
76
def batch_converter(lst: list, batch_size: int):
    """
    Convert list into batches of a specified size.

    Args:
        lst (list): list to split into batches.
        batch_size (int): maximum size of each yielded batch.

    Yields:
        list: successive slices of *lst*; the final slice may be shorter.
    """
    start = 0
    while start < len(lst):
        yield lst[start:start + batch_size]
        start += batch_size
86
+
87
if __name__ == '__main__':
    # One-shot ingest: walk up to 1000 pages of combined API data and insert
    # only documents whose URL is not already present in search_data.
    for page_no in range(1, 1001):
        data = fetch_data(page=page_no)
        if data:
            for batch in batch_converter(data, batch_size=100):
                # BUG FIX: build the lookup from the current batch only.
                # Previously `urls` came from the whole page's `data`, so the
                # same full URL set was re-queried for every batch.
                urls = [d["url"] for d in batch]

                # Single round-trip duplicate check for this batch.
                existing_urls = {doc["url"] for doc in search_data.find({"url": {"$in": urls}})}

                for d in batch:
                    if d["url"] not in existing_urls:
                        search_data.insert_one(d)
        else:
            logger.debug("No data to save")
src/introlix_api/engine/discussion.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from introlix_api.engine.third_party_apis import get_stack_overflow_data
2
+ from introlix_api.utils.tags import fetch_tags
3
+ from introlix_api.logger import logger
4
+ from introlix_api.app.database import search_data
5
+
6
def fetch_discussion(page: int = 1, per_page: int = 10, tag: str = ''):
    """
    Fetch Stack Overflow questions and reshape them into the document
    format used by the search-data collection.

    Args:
        page (int): page number to request from the Stack Overflow API
        per_page (int): number of questions per page
        tag (str): Stack Overflow tag to filter questions by

    Returns:
        list: documents with `url`, a nested `content` dict, and `type`.
    """
    questions = get_stack_overflow_data(page=page, per_page=per_page, tag=tag)

    return [
        {
            "url": question["url"],
            "content": {
                "title": question["title"],
                "tags": question["tags"],
                "vote": 0,
                "created_at": question["created_at"],
                "answer_count": question["answer_count"],
            },
            "type": question["type"],
        }
        for question in questions
    ]
28
+
29
if __name__ == '__main__':
    # Fetch discussions per interest tag and store only unseen URLs.
    for tag in fetch_tags():
        data = fetch_discussion(page=1, per_page=10, tag=tag)
        if not data:
            logger.debug("No data to save")
            continue

        fetched_urls = [entry["url"] for entry in data]

        # URLs already present in the collection for this tag's results.
        known_urls = {doc["url"] for doc in search_data.find({"url": {"$in": fetched_urls}})}

        for entry in data:
            if entry["url"] not in known_urls:
                search_data.insert_one(entry)
src/introlix_api/engine/graphql.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+
3
def fetch_hashnode_posts(page=1, per_page = 10, tag = ''):
    """
    Fetch posts from the Hashnode GraphQL API for the blog.developerdao.com
    publication, following pagination cursors until enough posts are
    collected to serve the requested page.

    Args:
        page (int): 1-based page of results to return
        per_page (int): number of posts per page
        tag (str): only 'bitcoin' and 'web3' are served from this
            publication; any other tag yields an empty list

    Returns:
        list: post nodes for the requested page. Empty on error or for
        unsupported tags (the original implicitly returned None for
        unsupported tags, which is equally falsy for callers).
    """
    all_posts = []
    has_next_page = True
    end_cursor = None
    posts_per_page = per_page  # Number of posts per page

    # Number of posts to skip based on the requested page.
    skip_count = (page - 1) * posts_per_page

    while has_next_page:
        # Construct the GraphQL query, resuming from the previous cursor.
        query = {
            "query": f"""
            query Publication {{
                publication(host: "blog.developerdao.com") {{
                    title
                    posts(first: {posts_per_page}, after: {f'"{end_cursor}"' if end_cursor else 'null'}) {{
                        edges {{
                            node {{
                                title
                                brief
                                url
                                publishedAt
                                tags {{
                                    id
                                    name
                                }}
                                coverImage {{
                                    url
                                }}
                            }}
                        }}
                        pageInfo {{
                            endCursor
                            hasNextPage
                        }}
                    }}
                }}
            }}"""
        }

        # Make the POST request to the Hashnode GraphQL endpoint.
        response = requests.post("https://gql.hashnode.com/", json=query)

        if response.status_code == 200:
            payload = response.json()
            # GraphQL errors come back with HTTP 200 but no 'data' payload;
            # stop gracefully instead of raising KeyError (original crashed).
            publication = (payload.get('data') or {}).get('publication')
            if publication is None:
                break

            posts = publication['posts']['edges']
            all_posts.extend(edge['node'] for edge in posts)

            # Update pagination info.
            page_info = publication['posts']['pageInfo']
            end_cursor = page_info['endCursor']
            has_next_page = page_info['hasNextPage']

            # Stop once we've fetched enough posts for the requested page.
            if len(all_posts) >= skip_count + posts_per_page:
                break
        else:
            print(f"Error: {response.status_code} - {response.text}")
            break

    # Only these topics are published on this host; everything else is empty.
    if tag in ('bitcoin', 'web3'):
        return all_posts[skip_count:skip_count + posts_per_page]
    return []
src/introlix_api/engine/third_party_apis.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import requests
4
+ from introlix_api.logger import logger
5
+ from introlix_api.exception import CustomException
6
+
7
+ # Define the URL of the API endpoint
8
+ DEV_DOT_TO_API = "https://dev.to/api/articles?tag={}&page={}&per_page={}"
9
+
10
def get_devDotTo_data(page: int = 1, per_page: int = 10, tag: str = '') -> list:
    """
    Fetch articles from the dev.to API.

    Args:
        page (int): page number to request
        per_page (int): number of articles per page
        tag (str): dev.to tag to filter articles by (annotation fixed from
            the erroneous `int`)

    Returns:
        list: article dicts (title, description, url, tags, image,
        created_at, type="article"); empty list when the request fails.

    Raises:
        CustomException: wraps any unexpected error (network, bad JSON, ...).
    """
    try:
        # Construct the URL with the provided parameters.
        url = DEV_DOT_TO_API.format(tag, page, per_page)
        response = requests.get(url)

        # Bail out on a failed request instead of attempting to parse the
        # error payload (the original only logged, then crashed downstream).
        if response.status_code != 200:
            logger.debug(f"Failed to fetch data from dev.to: {response.status_code}")
            return []

        # Convert the response to JSON.
        articles = response.json()

        extracted_articles = [
            {
                "title": article["title"],
                "description": article["description"],
                "url": article["url"],
                "tags": article["tag_list"],
                "image": article["cover_image"],
                "created_at": article["created_at"],
                "type": "article"
            }
            for article in articles
        ]

        return extracted_articles

    except Exception as e:
        raise CustomException(e, sys) from e
42
+
43
def get_github_repo(page: int = 1, per_page: int = 10, tag: str = '') -> list:
    """
    Fetch repositories from the GitHub search API, ordered by stars.

    Args:
        page (int): page number to request
        per_page (int): number of repositories per page
        tag (str): GitHub topic to search for (annotation fixed from the
            erroneous `int`)

    Returns:
        list: repo dicts (name, description, url, stars, created_at, type);
        empty list when the request fails.

    Raises:
        CustomException: wraps any unexpected error (network, bad JSON, ...).
    """
    try:
        # Construct the URL with the provided parameters.
        url = f"https://api.github.com/search/repositories?q=topic:{tag}&sort=stars&page={page}&per_page={per_page}"
        response = requests.get(url)

        # Bail out on a failed request instead of attempting to parse the
        # error payload (the original only logged, then crashed on "items").
        if response.status_code != 200:
            logger.debug(f"Failed to fetch data from GitHub: {response.status_code}")
            return []

        # Convert the response to JSON.
        repos = response.json()

        extracted_repos = [
            {
                "name": repo["name"],
                "description": repo["description"],
                "url": repo["html_url"],
                "stars": repo["stargazers_count"],
                "created_at": repo["created_at"],
                # NOTE(review): repositories are tagged "article" like dev.to
                # posts — confirm this is intentional rather than "repo".
                "type": "article"
            }
            for repo in repos["items"]
        ]

        return extracted_repos

    except Exception as e:
        raise CustomException(e, sys) from e
74
+
75
def get_stack_overflow_data(page: int = 1, per_page: int = 10, tag: str = '') -> list:
    """
    Fetch questions from the Stack Overflow (Stack Exchange) API.

    Args:
        page (int): page number to request
        per_page (int): number of questions per page
        tag (str): Stack Overflow tag to filter by (annotation fixed from
            the erroneous `int`)

    Returns:
        list: question dicts (title, url, tags, created_at, answer_count,
        type="discussion"); empty list when the request fails.

    Raises:
        CustomException: wraps any unexpected error (network, bad JSON, ...).
    """
    try:
        # Construct the URL with the provided parameters.
        url = f"https://api.stackexchange.com/2.3/questions?order=desc&sort=activity&tagged={tag}&site=stackoverflow&page={page}&pagesize={per_page}"
        response = requests.get(url)

        # Bail out on a failed request instead of attempting to parse the
        # error payload (the original only logged, then crashed on "items").
        if response.status_code != 200:
            logger.debug(f"Failed to fetch data from Stack Overflow: {response.status_code}")
            return []

        # Convert the response to JSON.
        questions = response.json()

        extracted_questions = [
            {
                "title": question["title"],
                "url": question["link"],
                "tags": question["tags"],
                # NOTE(review): creation_date is a Unix epoch int here, while
                # the other sources use ISO strings — confirm downstream
                # consumers handle both.
                "created_at": question["creation_date"],
                "answer_count": question["answer_count"],
                "type": "discussion"
            }
            for question in questions["items"]
        ]

        return extracted_questions

    except Exception as e:
        raise CustomException(e, sys) from e
106
+
107
if __name__ == "__main__":
    # Quick manual smoke test: print one page of Python-tagged questions.
    print(get_stack_overflow_data(1, 10, tag='python'))
src/introlix_api/engine/youtube.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import httpx
3
+ import asyncio
4
+ import time
5
+ from introlix_api.utils.tags import fetch_tags
6
+ from dotenv import load_dotenv
7
+ from cachetools import TTLCache
8
+
9
+ load_dotenv()
10
+ YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
11
+
12
+ # Cache with TTL of 6 hours (21600 seconds)
13
+ cache = TTLCache(maxsize=100, ttl=21600)
14
+
15
async def get_youtube_videos():
    """
    Query the YouTube search API once per interest tag and collect the raw
    JSON responses, serving repeat tags from the module-level 6-hour TTL
    cache.

    Returns:
        list: one YouTube search-API payload per tag (cached or fresh).
    """
    search_endpoint = "https://www.googleapis.com/youtube/v3/search"
    collected = []

    for tag in fetch_tags():
        # Serve from the TTL cache when this tag was fetched recently.
        if tag in cache:
            collected.append(cache[tag])
            continue

        query_params = {
            "key": YOUTUBE_API_KEY,
            "part": "snippet",
            "q": tag,
            "type": "video",
            "maxResults": 5,
            "order": "viewCount"
        }

        async with httpx.AsyncClient() as client:
            try:
                response = await client.get(search_endpoint, params=query_params)
                response.raise_for_status()
                payload = response.json()
                collected.append(payload)
                cache[tag] = payload  # remember for subsequent calls
            except httpx.HTTPStatusError as e:
                print(f"HTTP error for tag '{tag}': {e}")
                await asyncio.sleep(1)
            except Exception as e:
                print(f"Unexpected error: {e}")

        # Small delay between requests to stay under API rate limits.
        await asyncio.sleep(0.5)

    return collected
49
+
50
async def main():
    """Manual entry point: fetch the per-tag video payloads and print them."""
    data = await get_youtube_videos()
    print(data)

# Only run when executed as a script. The original called asyncio.run() at
# module level, so merely importing this module performed network I/O and
# would crash inside an already-running event loop (e.g. under FastAPI).
if __name__ == "__main__":
    asyncio.run(main())
src/introlix_api/exception/__init__.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from introlix_api.logger import logger
4
+
5
def error_message_detail(error, error_detail):
    """
    Build a descriptive error message (source file, line number, original
    error), log it, and return it.

    Args:
        error: the original exception or error message
        error_detail: the `sys` module (its exc_info() supplies the
            traceback of the currently handled exception)

    Returns:
        str: the formatted error message
    """
    _, _, traceback_obj = error_detail.exc_info()
    source_file = traceback_obj.tb_frame.f_code.co_filename
    source_line = traceback_obj.tb_lineno

    error_message = "Error occured in file called [{0}] line number: [{1}] error message: [{2}]".format(
        source_file, source_line, str(error)
    )

    # Record the failure in the shared application log.
    logger.info(error_message)

    return error_message
26
+
27
class CustomException(Exception):
    """
    Project-wide exception that enriches the original error with the file
    name and line number where it occurred (via error_message_detail),
    logging the detailed message as a side effect of construction.
    """

    def __init__(self, error_message, error_detail):
        # error_detail is expected to be the `sys` module so the active
        # traceback can be inspected.
        super().__init__(error_message)
        self.error_message = error_message_detail(error_message, error_detail=error_detail)

    def __str__(self):
        # Show the enriched message rather than the raw error text.
        return self.error_message
34
+
src/introlix_api/logger/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
import os
from datetime import datetime

"""
Configure the shared application logger, writing every record to a log
file inside the `logs` directory (created on import if missing).
"""

# Single rolling log file (stray f-string prefix removed: no placeholders).
LOG_FILE = "running_logs.log"
logs_path = os.path.join(os.getcwd(), "logs")
os.makedirs(logs_path, exist_ok=True)


LOG_FILE_PATH = os.path.join(logs_path, LOG_FILE)

logging.basicConfig(
    filename=LOG_FILE_PATH,
    format="[ %(asctime)s ] %(lineno)d %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Shared logger instance used across the package.
logger = logging.getLogger("introlixLogger")
src/introlix_api/ml/__init__.py ADDED
File without changes
src/introlix_api/ml/model.py ADDED
File without changes
src/introlix_api/ml/recommendation.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import numpy as np
4
+ from sentence_transformers import SentenceTransformer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ from introlix_api.exception import CustomException
7
+ from introlix_api.logger import logger
8
+ from introlix_api.app.appwrite import get_interests
9
+
10
class Recommendation:
    def __init__(self, user_interests: list, articles: list):
        """
        Recommendation system for articles using sentence-transformers and cosine similarity

        Args:
            user_interests (list): list of user interests
            articles (list): list of all articles
        """
        # NOTE(review): the `user_interests` constructor argument is
        # overwritten a few lines below by the values fetched from
        # get_interests(); the parameter is effectively ignored — confirm
        # whether callers rely on passing their own interests.
        self.user_interests = user_interests
        self.articles = articles
        self.recommendations = []
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.response = get_interests()
        self.user_interests = [interest['interest'] for interest in self.response]
        # Map keyed on the segment AFTER the ':' in each interest string
        # (assumes entries look like "prefix:name" — TODO confirm format).
        self.interest_keywords = {item['interest'].split(':')[1]: item['keywords'] for item in self.response}

    def encode(self, texts: list):
        """
        Function to encode text into embeddings using sentence-transformers

        Args:
            texts (list): list of text to be encoded
        Returns:
            encoded embedding values
        """
        try:
            return self.model.encode(texts)
        except Exception as e:
            raise CustomException(e, sys)

    def recommend(self):
        """
        Function to recommend aritcles based on user interests

        Args:
            None
        Returns:
            list of recommended articles
        """

        # Initialize new interests
        new_interests = self.user_interests.copy() # Start with the old
        # Keep only the segment BEFORE the ':' of each interest string.
        new_interests = [item.split(':')[0] for item in new_interests]


        # Adding keywords to user interests based on existing interests
        # NOTE(review): this membership test uses the FULL interest string
        # ("prefix:name"), but interest_keywords is keyed on the post-':'
        # segment only — the lookup may never match. Verify intended keys.
        for interest in self.user_interests:
            if interest in self.interest_keywords:
                # Append related keywords to new_interests
                new_interests.extend(self.interest_keywords[interest])

        # Remove duplicates if needed
        new_interests = list(set(new_interests))


        # encoding user interests into embeddings
        # print(f"Here is user interest keywords: {self.interest_keywords}")
        user_interests_embeddings = self.encode(new_interests)
        user_interests_embeddings = np.mean(user_interests_embeddings, axis=0) # Averaging embeddings

        # Reshape user embedding to (1, -1) for compatibility with cosine_similarity
        user_interests_embeddings = user_interests_embeddings.reshape(1, -1)

        # encoding all articles into embeddings
        article_embeddings = self.encode(self.articles)

        # print(f"Shape of user_interests_embeddings: {user_interests_embeddings.shape}")
        # print(f"Shape of article_embeddings: {article_embeddings.shape}")

        # calculate cosine similarity between user interests and all article embeddings
        similarities = cosine_similarity(user_interests_embeddings, article_embeddings).flatten()

        # sort articles based on similarity
        recommended_indices = np.argsort(similarities)[::-1]

        # Get all recommended articles sorted by similarity
        recommended_articles = [self.articles[i] for i in recommended_indices]

        return recommended_articles
src/introlix_api/pipeline/__init__.py ADDED
File without changes
src/introlix_api/pipeline/common_pipeline.py ADDED
File without changes
src/introlix_api/pipeline/periodic_pipeline.py ADDED
File without changes
src/introlix_api/utils/__init__.py ADDED
File without changes