Spaces:
Sleeping
Sleeping
init
Browse files- app/api/scraper/db_requests.py +76 -134
- app/api/scraper/models.py +8 -8
- app/api/scraper/schemas.py +6 -1
- app/api/scraper/services/__init__.py +2 -1
- app/api/scraper/services/background_tasks.py +10 -7
- app/api/scraper/services/parser/__init__.py +2 -2
- app/api/scraper/services/parser/equinix.json +0 -0
- app/api/scraper/services/parser/intelligent_retry.py +0 -594
- app/api/scraper/services/parser/parser.py +0 -935
- app/api/scraper/services/parser/parsers.py +293 -0
- app/api/scraper/utils.py +16 -0
- app/api/scraper/views.py +29 -3
- app/core/config.py +2 -0
- main.py +0 -0
- requirements.txt +42 -0
app/api/scraper/db_requests.py
CHANGED
|
@@ -4,12 +4,12 @@ Scraper database requests.
|
|
| 4 |
|
| 5 |
import asyncio
|
| 6 |
import re
|
| 7 |
-
from datetime import datetime
|
| 8 |
-
|
| 9 |
from app.api.common.schemas import FilterRequest
|
| 10 |
-
from app.api.scraper.dto import JobFilter
|
| 11 |
from app.api.scraper.models import JobModel
|
| 12 |
from app.core.config import settings
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
async def filter_jobs(
|
|
@@ -20,25 +20,31 @@ async def filter_jobs(
|
|
| 20 |
"""
|
| 21 |
query = {}
|
| 22 |
skip = filter_request.pageSize * filter_request.pageIndex
|
| 23 |
-
|
| 24 |
if filter_request.filter.titles:
|
| 25 |
query["title"] = {
|
| 26 |
-
"$regex": "|".join(
|
| 27 |
-
|
|
|
|
|
|
|
| 28 |
}
|
| 29 |
-
|
| 30 |
if filter_request.filter.companies:
|
| 31 |
query["company"] = {
|
| 32 |
-
"$regex": "|".join(
|
| 33 |
-
|
|
|
|
|
|
|
| 34 |
}
|
| 35 |
-
|
| 36 |
if filter_request.filter.locations:
|
| 37 |
query["location"] = {
|
| 38 |
-
"$regex": "|".join(
|
| 39 |
-
|
|
|
|
|
|
|
| 40 |
}
|
| 41 |
-
|
| 42 |
if filter_request.filter.minSalary:
|
| 43 |
query["salary.min"] = {"$gte": filter_request.filter.minSalary}
|
| 44 |
if filter_request.filter.maxSalary:
|
|
@@ -60,134 +66,70 @@ async def filter_jobs(
|
|
| 60 |
)
|
| 61 |
return [JobModel.from_mongo(job) for job in jobs], total_count
|
| 62 |
|
|
|
|
| 63 |
async def search_field_options(field: str, value: str) -> list[str]:
|
| 64 |
"""
|
| 65 |
Search field options based on the field and value.
|
| 66 |
"""
|
| 67 |
if field not in ["title", "company", "location"]:
|
| 68 |
return []
|
| 69 |
-
|
| 70 |
-
pipeline = [{
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
])
|
| 82 |
-
|
| 83 |
result = await settings.DB_CLIENT.jobs.aggregate(pipeline).to_list(length=5)
|
| 84 |
return [item["value"] for item in result if item["value"] and item["value"].strip()]
|
| 85 |
|
| 86 |
|
| 87 |
-
async def
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
),
|
| 132 |
-
|
| 133 |
-
JobModel(
|
| 134 |
-
title="Mobile App Developer",
|
| 135 |
-
company="Uber",
|
| 136 |
-
location="San Francisco, CA",
|
| 137 |
-
description="Develop and maintain mobile applications for iOS and Android platforms. Work on features that millions of users interact with daily, focusing on performance, user experience, and reliability.",
|
| 138 |
-
requirements="4+ years of mobile development experience with Swift/Objective-C for iOS or Kotlin/Java for Android. Experience with React Native is a plus. Strong understanding of mobile UI/UX principles.",
|
| 139 |
-
salary=Salary(min=115000, max=165000),
|
| 140 |
-
isTop5=False,
|
| 141 |
-
sourceUrl="https://www.uber.com/careers",
|
| 142 |
-
),
|
| 143 |
-
|
| 144 |
-
JobModel(
|
| 145 |
-
title="Product Manager",
|
| 146 |
-
company="Airbnb",
|
| 147 |
-
location="San Francisco, CA",
|
| 148 |
-
description="Lead product strategy and development for our host and guest experience platforms. Work cross-functionally with engineering, design, and data teams to deliver features that enhance user satisfaction and business growth.",
|
| 149 |
-
requirements="5+ years of product management experience, preferably in consumer-facing products. Strong analytical skills, experience with A/B testing, user research. MBA preferred but not required.",
|
| 150 |
-
salary=Salary(min=140000, max=220000),
|
| 151 |
-
isTop5=True,
|
| 152 |
-
sourceUrl="https://careers.airbnb.com",
|
| 153 |
-
),
|
| 154 |
-
|
| 155 |
-
JobModel(
|
| 156 |
-
title="Cybersecurity Analyst",
|
| 157 |
-
company="Microsoft",
|
| 158 |
-
location="Redmond, WA",
|
| 159 |
-
description="Protect our cloud infrastructure and services from security threats. Monitor security events, conduct threat analysis, and implement security measures to safeguard customer data and corporate assets.",
|
| 160 |
-
requirements="Bachelor's in Cybersecurity or related field. 3+ years of experience in security operations, threat detection, incident response. Certifications like CISSP, CEH, or equivalent preferred.",
|
| 161 |
-
salary=Salary(min=95000, max=140000),
|
| 162 |
-
isTop5=False,
|
| 163 |
-
sourceUrl="https://careers.microsoft.com",
|
| 164 |
-
),
|
| 165 |
-
|
| 166 |
-
JobModel(
|
| 167 |
-
title="UX Designer",
|
| 168 |
-
company="Adobe",
|
| 169 |
-
location="San Jose, CA",
|
| 170 |
-
description="Design intuitive and engaging user experiences for our creative software products. Collaborate with product managers and engineers to create user-centered designs that enhance creativity and productivity for millions of users.",
|
| 171 |
-
requirements="4+ years of UX design experience with a strong portfolio. Proficiency in design tools like Figma, Sketch, Adobe Creative Suite. Experience with user research, prototyping, and design systems.",
|
| 172 |
-
salary=Salary(min=105000, max=150000),
|
| 173 |
-
isTop5=False,
|
| 174 |
-
sourceUrl="https://adobe.wd5.myworkdayjobs.com",
|
| 175 |
-
),
|
| 176 |
-
|
| 177 |
-
JobModel(
|
| 178 |
-
title="Machine Learning Engineer",
|
| 179 |
-
company="Tesla",
|
| 180 |
-
location="Palo Alto, CA",
|
| 181 |
-
description="Develop and deploy machine learning models for autonomous driving systems. Work with computer vision, deep learning, and sensor fusion to advance the future of transportation and sustainable energy.",
|
| 182 |
-
requirements="Master's or PhD in Machine Learning, AI, or related field. 3+ years of experience with PyTorch, TensorFlow, computer vision. Experience with autonomous systems, robotics, or automotive industry preferred.",
|
| 183 |
-
salary=Salary(min=150000, max=220000),
|
| 184 |
-
isTop5=True,
|
| 185 |
-
sourceUrl="https://www.tesla.com/careers",
|
| 186 |
-
),
|
| 187 |
-
]
|
| 188 |
-
|
| 189 |
-
for job in jobs:
|
| 190 |
-
await settings.DB_CLIENT.jobs.insert_one(job.to_mongo())
|
| 191 |
-
|
| 192 |
-
if __name__ == "__main__":
|
| 193 |
-
asyncio.run(main())
|
|
|
|
| 4 |
|
| 5 |
import asyncio
|
| 6 |
import re
|
| 7 |
+
from datetime import timedelta, datetime
|
|
|
|
| 8 |
from app.api.common.schemas import FilterRequest
|
| 9 |
+
from app.api.scraper.dto import JobFilter
|
| 10 |
from app.api.scraper.models import JobModel
|
| 11 |
from app.core.config import settings
|
| 12 |
+
from app.api.scraper.schemas import StatisticsResponse
|
| 13 |
|
| 14 |
|
| 15 |
async def filter_jobs(
|
|
|
|
| 20 |
"""
|
| 21 |
query = {}
|
| 22 |
skip = filter_request.pageSize * filter_request.pageIndex
|
| 23 |
+
|
| 24 |
if filter_request.filter.titles:
|
| 25 |
query["title"] = {
|
| 26 |
+
"$regex": "|".join(
|
| 27 |
+
[re.escape(title) for title in filter_request.filter.titles]
|
| 28 |
+
),
|
| 29 |
+
"$options": "i",
|
| 30 |
}
|
| 31 |
+
|
| 32 |
if filter_request.filter.companies:
|
| 33 |
query["company"] = {
|
| 34 |
+
"$regex": "|".join(
|
| 35 |
+
[re.escape(company) for company in filter_request.filter.companies]
|
| 36 |
+
),
|
| 37 |
+
"$options": "i",
|
| 38 |
}
|
| 39 |
+
|
| 40 |
if filter_request.filter.locations:
|
| 41 |
query["location"] = {
|
| 42 |
+
"$regex": "|".join(
|
| 43 |
+
[re.escape(location) for location in filter_request.filter.locations]
|
| 44 |
+
),
|
| 45 |
+
"$options": "i",
|
| 46 |
}
|
| 47 |
+
|
| 48 |
if filter_request.filter.minSalary:
|
| 49 |
query["salary.min"] = {"$gte": filter_request.filter.minSalary}
|
| 50 |
if filter_request.filter.maxSalary:
|
|
|
|
| 66 |
)
|
| 67 |
return [JobModel.from_mongo(job) for job in jobs], total_count
|
| 68 |
|
| 69 |
+
|
| 70 |
async def search_field_options(field: str, value: str) -> list[str]:
|
| 71 |
"""
|
| 72 |
Search field options based on the field and value.
|
| 73 |
"""
|
| 74 |
if field not in ["title", "company", "location"]:
|
| 75 |
return []
|
| 76 |
+
|
| 77 |
+
pipeline = [{"$match": {field: {"$regex": re.escape(value), "$options": "i"}}}]
|
| 78 |
+
|
| 79 |
+
pipeline.extend(
|
| 80 |
+
[
|
| 81 |
+
{"$group": {"_id": f"${field}"}},
|
| 82 |
+
{"$project": {"_id": 0, "value": "$_id"}},
|
| 83 |
+
{"$sort": {"value": 1}},
|
| 84 |
+
{"$limit": 5},
|
| 85 |
+
]
|
| 86 |
+
)
|
| 87 |
+
|
|
|
|
|
|
|
| 88 |
result = await settings.DB_CLIENT.jobs.aggregate(pipeline).to_list(length=5)
|
| 89 |
return [item["value"] for item in result if item["value"] and item["value"].strip()]
|
| 90 |
|
| 91 |
|
| 92 |
+
async def get_statistics() -> StatisticsResponse:
|
| 93 |
+
"""
|
| 94 |
+
Get the statistics.
|
| 95 |
+
"""
|
| 96 |
+
result = (
|
| 97 |
+
await settings.DB_CLIENT.jobs.find({}, {"_id": 0, "datetimeInserted": 1})
|
| 98 |
+
.sort("_id", -1)
|
| 99 |
+
.limit(1)
|
| 100 |
+
.to_list(length=1)
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
lastUpdate = datetime.fromisoformat(result[0]["datetimeInserted"])
|
| 104 |
+
now = datetime.now()
|
| 105 |
+
days_until_sunday = (6 - now.weekday()) % 7
|
| 106 |
+
if days_until_sunday == 0 and now.hour < 23:
|
| 107 |
+
nextUpdate = now.replace(hour=23, minute=0, second=0, microsecond=0)
|
| 108 |
+
else:
|
| 109 |
+
if days_until_sunday == 0:
|
| 110 |
+
days_until_sunday = 7
|
| 111 |
+
nextUpdate = (now + timedelta(days=days_until_sunday)).replace(
|
| 112 |
+
hour=23, minute=0, second=0, microsecond=0
|
| 113 |
+
)
|
| 114 |
+
return StatisticsResponse(lastUpdate=lastUpdate, nextUpdate=nextUpdate)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
async def save_job_obj(job: JobModel):
|
| 118 |
+
"""
|
| 119 |
+
Save a job object.
|
| 120 |
+
"""
|
| 121 |
+
if await check_if_job_exists(job):
|
| 122 |
+
print(f"Skipped job [{job.title}]!")
|
| 123 |
+
return
|
| 124 |
+
await settings.DB_CLIENT.jobs.insert_one(job.to_mongo())
|
| 125 |
+
print(f"Job [{job.title}] saved!")
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
async def check_if_job_exists(job: JobModel):
|
| 129 |
+
"""
|
| 130 |
+
Check if a job exists.
|
| 131 |
+
"""
|
| 132 |
+
job = await settings.DB_CLIENT.jobs.find_one(
|
| 133 |
+
{"title": job.title, "description": job.description}, {"_id": 1}
|
| 134 |
+
)
|
| 135 |
+
return True if job else False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/api/scraper/models.py
CHANGED
|
@@ -7,12 +7,12 @@ from app.core.database import MongoBaseModel
|
|
| 7 |
|
| 8 |
|
| 9 |
class JobModel(MongoBaseModel):
|
| 10 |
-
title: str
|
| 11 |
-
company: str
|
| 12 |
-
location: str
|
| 13 |
-
description: str
|
| 14 |
-
requirements: str
|
| 15 |
-
salary: Salary
|
| 16 |
isTop5: bool = False
|
| 17 |
-
sourceUrl: str
|
| 18 |
-
datetimeInserted: datetime = Field(default_factory=datetime.now)
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
class JobModel(MongoBaseModel):
|
| 10 |
+
title: str | None = None
|
| 11 |
+
company: str | None = None
|
| 12 |
+
location: str | None = None
|
| 13 |
+
description: str | None = None
|
| 14 |
+
requirements: str | None = None
|
| 15 |
+
salary: Salary | None = None
|
| 16 |
isTop5: bool = False
|
| 17 |
+
sourceUrl: str | None = None
|
| 18 |
+
datetimeInserted: datetime = Field(default_factory=datetime.now)
|
app/api/scraper/schemas.py
CHANGED
|
@@ -1,5 +1,10 @@
|
|
|
|
|
| 1 |
from pydantic import BaseModel
|
| 2 |
|
| 3 |
|
| 4 |
class SearchOptionRequest(BaseModel):
|
| 5 |
-
value: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime
|
| 2 |
from pydantic import BaseModel
|
| 3 |
|
| 4 |
|
| 5 |
class SearchOptionRequest(BaseModel):
|
| 6 |
+
value: str
|
| 7 |
+
|
| 8 |
+
class StatisticsResponse(BaseModel):
|
| 9 |
+
lastUpdate: datetime
|
| 10 |
+
nextUpdate: datetime
|
app/api/scraper/services/__init__.py
CHANGED
|
@@ -4,5 +4,6 @@ Chat services.
|
|
| 4 |
|
| 5 |
from .workflows import generate_chat_name
|
| 6 |
from .background_tasks import schedule_update
|
|
|
|
| 7 |
|
| 8 |
-
__all__ = ["generate_chat_name", "schedule_update"]
|
|
|
|
| 4 |
|
| 5 |
from .workflows import generate_chat_name
|
| 6 |
from .background_tasks import schedule_update
|
| 7 |
+
from .background_tasks import run_update
|
| 8 |
|
| 9 |
+
__all__ = ["generate_chat_name", "schedule_update", "run_update"]
|
app/api/scraper/services/background_tasks.py
CHANGED
|
@@ -3,12 +3,7 @@ import asyncio
|
|
| 3 |
from apscheduler.schedulers.asyncio import AsyncIOScheduler # type: ignore
|
| 4 |
from apscheduler.triggers.cron import CronTrigger # type: ignore
|
| 5 |
|
| 6 |
-
|
| 7 |
-
async def run_update():
|
| 8 |
-
"""
|
| 9 |
-
Run periodic update tasks.
|
| 10 |
-
"""
|
| 11 |
-
print("Running scheduled update task...")
|
| 12 |
|
| 13 |
|
| 14 |
async def schedule_update():
|
|
@@ -20,4 +15,12 @@ async def schedule_update():
|
|
| 20 |
|
| 21 |
scheduler.start()
|
| 22 |
while True:
|
| 23 |
-
await asyncio.sleep(3600)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
from apscheduler.schedulers.asyncio import AsyncIOScheduler # type: ignore
|
| 4 |
from apscheduler.triggers.cron import CronTrigger # type: ignore
|
| 5 |
|
| 6 |
+
from .parser import EquinixParser, AmazonParser, LinkedinParser
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
async def schedule_update():
|
|
|
|
| 15 |
|
| 16 |
scheduler.start()
|
| 17 |
while True:
|
| 18 |
+
await asyncio.sleep(3600)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
async def run_update():
|
| 22 |
+
"""
|
| 23 |
+
Run periodic update tasks.
|
| 24 |
+
"""
|
| 25 |
+
parsers = [EquinixParser(), AmazonParser(), LinkedinParser()]
|
| 26 |
+
await asyncio.gather(*[parser.run() for parser in parsers])
|
app/api/scraper/services/parser/__init__.py
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
-
from .
|
| 2 |
|
| 3 |
-
__all__ = ["
|
|
|
|
| 1 |
+
from .parsers import EquinixParser, AmazonParser, LinkedinParser, GoogleCareersParser
|
| 2 |
|
| 3 |
+
__all__ = ["EquinixParser", "AmazonParser", "LinkedinParser", "GoogleCareersParser"]
|
app/api/scraper/services/parser/equinix.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app/api/scraper/services/parser/intelligent_retry.py
DELETED
|
@@ -1,594 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Intelligent Retry System with Exponential Backoff and Circuit Breakers
|
| 3 |
-
Provides domain-specific retry policies with health monitoring and adaptive rate limiting
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
-
import asyncio
|
| 7 |
-
import logging
|
| 8 |
-
import random
|
| 9 |
-
import time
|
| 10 |
-
from collections import deque
|
| 11 |
-
from dataclasses import dataclass, field
|
| 12 |
-
from datetime import datetime, timedelta
|
| 13 |
-
from enum import Enum
|
| 14 |
-
from typing import Dict, List, Optional, Any, Callable, Tuple
|
| 15 |
-
from urllib.parse import urlparse
|
| 16 |
-
|
| 17 |
-
import httpx
|
| 18 |
-
|
| 19 |
-
logger = logging.getLogger(__name__)
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
class RetryReason(Enum):
|
| 23 |
-
"""Reasons for retry attempts"""
|
| 24 |
-
NETWORK_ERROR = "network_error"
|
| 25 |
-
TIMEOUT = "timeout"
|
| 26 |
-
HTTP_ERROR = "http_error"
|
| 27 |
-
RATE_LIMIT = "rate_limit"
|
| 28 |
-
SERVER_ERROR = "server_error"
|
| 29 |
-
PARSING_ERROR = "parsing_error"
|
| 30 |
-
AUTHENTICATION_ERROR = "auth_error"
|
| 31 |
-
CLOUDFLARE_BLOCK = "cloudflare_block"
|
| 32 |
-
BOT_DETECTION = "bot_detection"
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
class CircuitState(Enum):
|
| 36 |
-
"""Circuit breaker states"""
|
| 37 |
-
CLOSED = "closed" # Normal operation
|
| 38 |
-
OPEN = "open" # Failing - block requests
|
| 39 |
-
HALF_OPEN = "half_open" # Testing - allow limited requests
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
@dataclass
|
| 43 |
-
class RetryConfig:
|
| 44 |
-
"""Configuration for retry behavior"""
|
| 45 |
-
max_attempts: int = 5
|
| 46 |
-
initial_delay: float = 1.0
|
| 47 |
-
max_delay: float = 60.0
|
| 48 |
-
backoff_multiplier: float = 2.0
|
| 49 |
-
jitter: bool = True
|
| 50 |
-
jitter_range: Tuple[float, float] = (0.1, 0.3)
|
| 51 |
-
|
| 52 |
-
# HTTP-specific settings
|
| 53 |
-
retry_http_codes: List[int] = field(default_factory=lambda: [500, 502, 503, 504, 429])
|
| 54 |
-
timeout: float = 30.0
|
| 55 |
-
|
| 56 |
-
# Domain-specific overrides
|
| 57 |
-
domain_overrides: Dict[str, Dict[str, Any]] = field(default_factory=dict)
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
@dataclass
|
| 61 |
-
class CircuitBreakerConfig:
|
| 62 |
-
"""Configuration for circuit breaker"""
|
| 63 |
-
failure_threshold: int = 5
|
| 64 |
-
recovery_timeout: float = 60.0
|
| 65 |
-
success_threshold: int = 3 # For half-open -> closed transition
|
| 66 |
-
timeout: float = 30.0
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
@dataclass
|
| 70 |
-
class RetryAttempt:
|
| 71 |
-
"""Information about a retry attempt"""
|
| 72 |
-
attempt: int
|
| 73 |
-
reason: RetryReason
|
| 74 |
-
delay: float
|
| 75 |
-
timestamp: datetime
|
| 76 |
-
error: Optional[str] = None
|
| 77 |
-
response_code: Optional[int] = None
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
@dataclass
|
| 81 |
-
class DomainHealth:
|
| 82 |
-
"""Health tracking for a domain"""
|
| 83 |
-
domain: str
|
| 84 |
-
success_count: int = 0
|
| 85 |
-
failure_count: int = 0
|
| 86 |
-
total_requests: int = 0
|
| 87 |
-
average_response_time: float = 0.0
|
| 88 |
-
last_success: Optional[datetime] = None
|
| 89 |
-
last_failure: Optional[datetime] = None
|
| 90 |
-
consecutive_failures: int = 0
|
| 91 |
-
recent_response_times: deque = field(default_factory=lambda: deque(maxlen=100))
|
| 92 |
-
|
| 93 |
-
@property
|
| 94 |
-
def success_rate(self) -> float:
|
| 95 |
-
"""Calculate success rate"""
|
| 96 |
-
if self.total_requests == 0:
|
| 97 |
-
return 1.0
|
| 98 |
-
return self.success_count / self.total_requests
|
| 99 |
-
|
| 100 |
-
@property
|
| 101 |
-
def health_score(self) -> float:
|
| 102 |
-
"""Calculate overall health score (0-1)"""
|
| 103 |
-
success_weight = 0.6
|
| 104 |
-
response_time_weight = 0.3
|
| 105 |
-
recency_weight = 0.1
|
| 106 |
-
|
| 107 |
-
# Success rate component
|
| 108 |
-
success_component = self.success_rate * success_weight
|
| 109 |
-
|
| 110 |
-
# Response time component (lower is better)
|
| 111 |
-
avg_time = self.average_response_time
|
| 112 |
-
time_score = max(0, 1 - (avg_time / 10)) # 10s = score 0
|
| 113 |
-
time_component = time_score * response_time_weight
|
| 114 |
-
|
| 115 |
-
# Recency component (recent failures hurt more)
|
| 116 |
-
recency_component = recency_weight
|
| 117 |
-
if self.last_failure and self.last_success:
|
| 118 |
-
if self.last_failure > self.last_success:
|
| 119 |
-
hours_since_failure = (datetime.now() - self.last_failure).total_seconds() / 3600
|
| 120 |
-
recency_component *= min(1.0, hours_since_failure / 24) # Recover over 24h
|
| 121 |
-
|
| 122 |
-
return success_component + time_component + recency_component
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
class UserAgentRotator:
|
| 126 |
-
"""Rotates user agents to avoid detection"""
|
| 127 |
-
|
| 128 |
-
USER_AGENTS = [
|
| 129 |
-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 130 |
-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 131 |
-
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 132 |
-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
|
| 133 |
-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',
|
| 134 |
-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:120.0) Gecko/20100101 Firefox/120.0',
|
| 135 |
-
'Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0',
|
| 136 |
-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
|
| 137 |
-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/120.0.0.0 Safari/537.36',
|
| 138 |
-
]
|
| 139 |
-
|
| 140 |
-
def __init__(self):
|
| 141 |
-
self.current_index = 0
|
| 142 |
-
|
| 143 |
-
def get_user_agent(self) -> str:
|
| 144 |
-
"""Get next user agent in rotation"""
|
| 145 |
-
user_agent = self.USER_AGENTS[self.current_index]
|
| 146 |
-
self.current_index = (self.current_index + 1) % len(self.USER_AGENTS)
|
| 147 |
-
return user_agent
|
| 148 |
-
|
| 149 |
-
def get_random_user_agent(self) -> str:
|
| 150 |
-
"""Get random user agent"""
|
| 151 |
-
return random.choice(self.USER_AGENTS)
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
class CircuitBreaker:
|
| 155 |
-
"""Circuit breaker implementation for domain protection"""
|
| 156 |
-
|
| 157 |
-
def __init__(self, config: CircuitBreakerConfig):
|
| 158 |
-
self.config = config
|
| 159 |
-
self.state = CircuitState.CLOSED
|
| 160 |
-
self.failure_count = 0
|
| 161 |
-
self.success_count = 0
|
| 162 |
-
self.last_failure_time: Optional[datetime] = None
|
| 163 |
-
self.next_attempt_time: Optional[datetime] = None
|
| 164 |
-
|
| 165 |
-
def can_execute(self) -> bool:
|
| 166 |
-
"""Check if request can be executed"""
|
| 167 |
-
now = datetime.now()
|
| 168 |
-
|
| 169 |
-
if self.state == CircuitState.CLOSED:
|
| 170 |
-
return True
|
| 171 |
-
elif self.state == CircuitState.OPEN:
|
| 172 |
-
if self.next_attempt_time and now >= self.next_attempt_time:
|
| 173 |
-
self.state = CircuitState.HALF_OPEN
|
| 174 |
-
self.success_count = 0
|
| 175 |
-
return True
|
| 176 |
-
return False
|
| 177 |
-
elif self.state == CircuitState.HALF_OPEN:
|
| 178 |
-
return True
|
| 179 |
-
|
| 180 |
-
return False
|
| 181 |
-
|
| 182 |
-
def record_success(self):
|
| 183 |
-
"""Record successful request"""
|
| 184 |
-
if self.state == CircuitState.HALF_OPEN:
|
| 185 |
-
self.success_count += 1
|
| 186 |
-
if self.success_count >= self.config.success_threshold:
|
| 187 |
-
self.state = CircuitState.CLOSED
|
| 188 |
-
self.failure_count = 0
|
| 189 |
-
elif self.state == CircuitState.CLOSED:
|
| 190 |
-
self.failure_count = max(0, self.failure_count - 1) # Recover gradually
|
| 191 |
-
|
| 192 |
-
def record_failure(self):
|
| 193 |
-
"""Record failed request"""
|
| 194 |
-
self.failure_count += 1
|
| 195 |
-
self.last_failure_time = datetime.now()
|
| 196 |
-
|
| 197 |
-
if self.failure_count >= self.config.failure_threshold:
|
| 198 |
-
self.state = CircuitState.OPEN
|
| 199 |
-
self.next_attempt_time = datetime.now() + timedelta(seconds=self.config.recovery_timeout)
|
| 200 |
-
elif self.state == CircuitState.HALF_OPEN:
|
| 201 |
-
self.state = CircuitState.OPEN
|
| 202 |
-
self.next_attempt_time = datetime.now() + timedelta(seconds=self.config.recovery_timeout)
|
| 203 |
-
|
| 204 |
-
@property
|
| 205 |
-
def status(self) -> Dict[str, Any]:
|
| 206 |
-
"""Get circuit breaker status"""
|
| 207 |
-
return {
|
| 208 |
-
'state': self.state.value,
|
| 209 |
-
'failure_count': self.failure_count,
|
| 210 |
-
'success_count': self.success_count,
|
| 211 |
-
'next_attempt_time': self.next_attempt_time.isoformat() if self.next_attempt_time else None
|
| 212 |
-
}
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
class IntelligentRetryManager:
|
| 216 |
-
"""
|
| 217 |
-
Intelligent retry system with domain-specific policies, circuit breakers, and adaptive rate limiting
|
| 218 |
-
"""
|
| 219 |
-
|
| 220 |
-
def __init__(self):
|
| 221 |
-
self.retry_configs: Dict[str, RetryConfig] = {}
|
| 222 |
-
self.circuit_breakers: Dict[str, CircuitBreaker] = {}
|
| 223 |
-
self.domain_health: Dict[str, DomainHealth] = {}
|
| 224 |
-
self.user_agent_rotator = UserAgentRotator()
|
| 225 |
-
self.rate_limiters: Dict[str, 'RateLimiter'] = {}
|
| 226 |
-
|
| 227 |
-
# Initialize default configurations
|
| 228 |
-
self._initialize_domain_configs()
|
| 229 |
-
|
| 230 |
-
def _initialize_domain_configs(self):
|
| 231 |
-
"""Initialize domain-specific configurations"""
|
| 232 |
-
|
| 233 |
-
# Workday ATS sites - more aggressive retry
|
| 234 |
-
workday_config = RetryConfig(
|
| 235 |
-
max_attempts=7,
|
| 236 |
-
initial_delay=2.0,
|
| 237 |
-
max_delay=120.0,
|
| 238 |
-
backoff_multiplier=2.5,
|
| 239 |
-
timeout=45.0,
|
| 240 |
-
retry_http_codes=[500, 502, 503, 504, 429, 403, 408]
|
| 241 |
-
)
|
| 242 |
-
|
| 243 |
-
workday_domains = [
|
| 244 |
-
'equinix.wd1.myworkdayjobs.com',
|
| 245 |
-
'globalswitch.wd103.myworkdayjobs.com',
|
| 246 |
-
'cyrusone.wd5.myworkdayjobs.com'
|
| 247 |
-
]
|
| 248 |
-
|
| 249 |
-
for domain in workday_domains:
|
| 250 |
-
self.retry_configs[domain] = workday_config
|
| 251 |
-
self.circuit_breakers[domain] = CircuitBreaker(
|
| 252 |
-
CircuitBreakerConfig(failure_threshold=8, recovery_timeout=90.0)
|
| 253 |
-
)
|
| 254 |
-
|
| 255 |
-
# Large tech companies - conservative retry
|
| 256 |
-
tech_config = RetryConfig(
|
| 257 |
-
max_attempts=4,
|
| 258 |
-
initial_delay=3.0,
|
| 259 |
-
max_delay=60.0,
|
| 260 |
-
timeout=30.0,
|
| 261 |
-
retry_http_codes=[500, 502, 503, 504, 429]
|
| 262 |
-
)
|
| 263 |
-
|
| 264 |
-
tech_domains = [
|
| 265 |
-
'amazon.jobs',
|
| 266 |
-
'careers.google.com',
|
| 267 |
-
'jobs.careers.microsoft.com'
|
| 268 |
-
]
|
| 269 |
-
|
| 270 |
-
for domain in tech_domains:
|
| 271 |
-
self.retry_configs[domain] = tech_config
|
| 272 |
-
self.circuit_breakers[domain] = CircuitBreaker(
|
| 273 |
-
CircuitBreakerConfig(failure_threshold=3, recovery_timeout=120.0)
|
| 274 |
-
)
|
| 275 |
-
|
| 276 |
-
# European datacenter companies - balanced approach
|
| 277 |
-
eu_config = RetryConfig(
|
| 278 |
-
max_attempts=6,
|
| 279 |
-
initial_delay=1.5,
|
| 280 |
-
max_delay=90.0,
|
| 281 |
-
timeout=35.0,
|
| 282 |
-
retry_http_codes=[500, 502, 503, 504, 429, 408]
|
| 283 |
-
)
|
| 284 |
-
|
| 285 |
-
eu_domains = [
|
| 286 |
-
'career.hetzner.com',
|
| 287 |
-
'careers.ovhcloud.com',
|
| 288 |
-
'scaleway.com',
|
| 289 |
-
'careers.digitalrealty.com',
|
| 290 |
-
'data4group.com',
|
| 291 |
-
'careers.ironmountain.com',
|
| 292 |
-
'datacenters.ntt'
|
| 293 |
-
]
|
| 294 |
-
|
| 295 |
-
for domain in eu_domains:
|
| 296 |
-
self.retry_configs[domain] = eu_config
|
| 297 |
-
self.circuit_breakers[domain] = CircuitBreaker(
|
| 298 |
-
CircuitBreakerConfig(failure_threshold=5, recovery_timeout=60.0)
|
| 299 |
-
)
|
| 300 |
-
|
| 301 |
-
def _get_domain(self, url: str) -> str:
|
| 302 |
-
"""Extract domain from URL"""
|
| 303 |
-
return urlparse(url).netloc.lower()
|
| 304 |
-
|
| 305 |
-
def _get_config(self, domain: str) -> RetryConfig:
|
| 306 |
-
"""Get retry configuration for domain"""
|
| 307 |
-
return self.retry_configs.get(domain, RetryConfig())
|
| 308 |
-
|
| 309 |
-
def _get_circuit_breaker(self, domain: str) -> CircuitBreaker:
|
| 310 |
-
"""Get or create circuit breaker for domain"""
|
| 311 |
-
if domain not in self.circuit_breakers:
|
| 312 |
-
self.circuit_breakers[domain] = CircuitBreaker(CircuitBreakerConfig())
|
| 313 |
-
return self.circuit_breakers[domain]
|
| 314 |
-
|
| 315 |
-
def _get_domain_health(self, domain: str) -> DomainHealth:
|
| 316 |
-
"""Get or create domain health tracker"""
|
| 317 |
-
if domain not in self.domain_health:
|
| 318 |
-
self.domain_health[domain] = DomainHealth(domain=domain)
|
| 319 |
-
return self.domain_health[domain]
|
| 320 |
-
|
| 321 |
-
def _calculate_delay(self, attempt: int, config: RetryConfig, reason: RetryReason) -> float:
|
| 322 |
-
"""Calculate retry delay with exponential backoff and jitter"""
|
| 323 |
-
|
| 324 |
-
# Base exponential backoff
|
| 325 |
-
delay = config.initial_delay * (config.backoff_multiplier ** (attempt - 1))
|
| 326 |
-
delay = min(delay, config.max_delay)
|
| 327 |
-
|
| 328 |
-
# Reason-specific adjustments
|
| 329 |
-
if reason == RetryReason.RATE_LIMIT:
|
| 330 |
-
delay *= 2.0 # Longer delays for rate limiting
|
| 331 |
-
elif reason == RetryReason.CLOUDFLARE_BLOCK:
|
| 332 |
-
delay *= 3.0 # Much longer for Cloudflare
|
| 333 |
-
elif reason == RetryReason.BOT_DETECTION:
|
| 334 |
-
delay *= 4.0 # Longest delays for bot detection
|
| 335 |
-
|
| 336 |
-
# Add jitter to avoid thundering herd
|
| 337 |
-
if config.jitter:
|
| 338 |
-
jitter_min, jitter_max = config.jitter_range
|
| 339 |
-
jitter_factor = 1.0 + random.uniform(jitter_min, jitter_max)
|
| 340 |
-
delay *= jitter_factor
|
| 341 |
-
|
| 342 |
-
return delay
|
| 343 |
-
|
| 344 |
-
def _classify_error(self, error: Exception, response_code: Optional[int] = None) -> RetryReason:
|
| 345 |
-
"""Classify error to determine retry strategy"""
|
| 346 |
-
|
| 347 |
-
if response_code:
|
| 348 |
-
if response_code == 429:
|
| 349 |
-
return RetryReason.RATE_LIMIT
|
| 350 |
-
elif response_code == 403:
|
| 351 |
-
error_msg = str(error).lower()
|
| 352 |
-
if 'cloudflare' in error_msg or 'cf-ray' in error_msg:
|
| 353 |
-
return RetryReason.CLOUDFLARE_BLOCK
|
| 354 |
-
elif 'bot' in error_msg or 'captcha' in error_msg:
|
| 355 |
-
return RetryReason.BOT_DETECTION
|
| 356 |
-
else:
|
| 357 |
-
return RetryReason.AUTHENTICATION_ERROR
|
| 358 |
-
elif response_code >= 500:
|
| 359 |
-
return RetryReason.SERVER_ERROR
|
| 360 |
-
elif response_code >= 400:
|
| 361 |
-
return RetryReason.HTTP_ERROR
|
| 362 |
-
|
| 363 |
-
error_msg = str(error).lower()
|
| 364 |
-
if 'timeout' in error_msg:
|
| 365 |
-
return RetryReason.TIMEOUT
|
| 366 |
-
elif 'connection' in error_msg or 'network' in error_msg:
|
| 367 |
-
return RetryReason.NETWORK_ERROR
|
| 368 |
-
else:
|
| 369 |
-
return RetryReason.NETWORK_ERROR
|
| 370 |
-
|
| 371 |
-
def _should_retry(self, error: Exception, attempt: int, config: RetryConfig,
|
| 372 |
-
response_code: Optional[int] = None) -> bool:
|
| 373 |
-
"""Determine if request should be retried"""
|
| 374 |
-
|
| 375 |
-
if attempt >= config.max_attempts:
|
| 376 |
-
return False
|
| 377 |
-
|
| 378 |
-
reason = self._classify_error(error, response_code)
|
| 379 |
-
|
| 380 |
-
# Don't retry authentication errors (except bot detection)
|
| 381 |
-
if reason == RetryReason.AUTHENTICATION_ERROR:
|
| 382 |
-
return False
|
| 383 |
-
|
| 384 |
-
# Always retry server errors and network issues
|
| 385 |
-
if reason in [RetryReason.SERVER_ERROR, RetryReason.NETWORK_ERROR, RetryReason.TIMEOUT]:
|
| 386 |
-
return True
|
| 387 |
-
|
| 388 |
-
# Retry rate limits and bot detection with longer delays
|
| 389 |
-
if reason in [RetryReason.RATE_LIMIT, RetryReason.CLOUDFLARE_BLOCK, RetryReason.BOT_DETECTION]:
|
| 390 |
-
return True
|
| 391 |
-
|
| 392 |
-
# Check HTTP codes
|
| 393 |
-
if response_code and response_code in config.retry_http_codes:
|
| 394 |
-
return True
|
| 395 |
-
|
| 396 |
-
return False
|
| 397 |
-
|
| 398 |
-
def _update_domain_health(self, domain: str, success: bool, response_time: float,
|
| 399 |
-
error: Optional[Exception] = None):
|
| 400 |
-
"""Update domain health metrics"""
|
| 401 |
-
health = self._get_domain_health(domain)
|
| 402 |
-
health.total_requests += 1
|
| 403 |
-
health.recent_response_times.append(response_time)
|
| 404 |
-
|
| 405 |
-
if success:
|
| 406 |
-
health.success_count += 1
|
| 407 |
-
health.last_success = datetime.now()
|
| 408 |
-
health.consecutive_failures = 0
|
| 409 |
-
else:
|
| 410 |
-
health.failure_count += 1
|
| 411 |
-
health.last_failure = datetime.now()
|
| 412 |
-
health.consecutive_failures += 1
|
| 413 |
-
|
| 414 |
-
# Update average response time
|
| 415 |
-
if health.recent_response_times:
|
| 416 |
-
health.average_response_time = sum(health.recent_response_times) / len(health.recent_response_times)
|
| 417 |
-
|
| 418 |
-
async def execute_with_retry(self, func: Callable, url: str, *args, **kwargs) -> Any:
|
| 419 |
-
"""
|
| 420 |
-
Execute function with intelligent retry logic
|
| 421 |
-
|
| 422 |
-
Args:
|
| 423 |
-
func: Async function to execute
|
| 424 |
-
url: URL being accessed (for domain-specific policies)
|
| 425 |
-
*args, **kwargs: Arguments to pass to function
|
| 426 |
-
|
| 427 |
-
Returns:
|
| 428 |
-
Result of successful function execution
|
| 429 |
-
|
| 430 |
-
Raises:
|
| 431 |
-
Exception: If all retry attempts fail
|
| 432 |
-
"""
|
| 433 |
-
|
| 434 |
-
domain = self._get_domain(url)
|
| 435 |
-
config = self._get_config(domain)
|
| 436 |
-
circuit_breaker = self._get_circuit_breaker(domain)
|
| 437 |
-
attempts = []
|
| 438 |
-
|
| 439 |
-
# Check circuit breaker
|
| 440 |
-
if not circuit_breaker.can_execute():
|
| 441 |
-
raise Exception(
|
| 442 |
-
f"Circuit breaker OPEN for domain {domain}. Next attempt at: {circuit_breaker.next_attempt_time}")
|
| 443 |
-
|
| 444 |
-
for attempt in range(1, config.max_attempts + 1):
|
| 445 |
-
start_time = time.time()
|
| 446 |
-
|
| 447 |
-
try:
|
| 448 |
-
# Rotate user agent for each attempt
|
| 449 |
-
if hasattr(kwargs, 'headers'):
|
| 450 |
-
kwargs['headers'] = kwargs.get('headers', {})
|
| 451 |
-
kwargs['headers']['User-Agent'] = self.user_agent_rotator.get_user_agent()
|
| 452 |
-
|
| 453 |
-
# Execute function
|
| 454 |
-
result = await func(*args, **kwargs)
|
| 455 |
-
|
| 456 |
-
# Record success
|
| 457 |
-
response_time = time.time() - start_time
|
| 458 |
-
self._update_domain_health(domain, True, response_time)
|
| 459 |
-
circuit_breaker.record_success()
|
| 460 |
-
|
| 461 |
-
logger.info(f"Request succeeded for {domain} on attempt {attempt}")
|
| 462 |
-
return result
|
| 463 |
-
|
| 464 |
-
except Exception as error:
|
| 465 |
-
response_time = time.time() - start_time
|
| 466 |
-
response_code = getattr(error, 'status_code', None) or getattr(error, 'status', None) or getattr(error, 'code', None)
|
| 467 |
-
reason = self._classify_error(error, response_code)
|
| 468 |
-
|
| 469 |
-
# Record failure
|
| 470 |
-
self._update_domain_health(domain, False, response_time, error)
|
| 471 |
-
circuit_breaker.record_failure()
|
| 472 |
-
|
| 473 |
-
# Log attempt
|
| 474 |
-
attempts.append(RetryAttempt(
|
| 475 |
-
attempt=attempt,
|
| 476 |
-
reason=reason,
|
| 477 |
-
delay=0, # Will be set below
|
| 478 |
-
timestamp=datetime.now(),
|
| 479 |
-
error=str(error)[:200], # Truncate error message
|
| 480 |
-
response_code=response_code
|
| 481 |
-
))
|
| 482 |
-
|
| 483 |
-
logger.warning(
|
| 484 |
-
f"Attempt {attempt}/{config.max_attempts} failed for {domain}: {reason.value} - {str(error)[:100]}")
|
| 485 |
-
|
| 486 |
-
# Check if should retry
|
| 487 |
-
if not self._should_retry(error, attempt, config, response_code):
|
| 488 |
-
logger.error(f"Not retrying {domain} due to error type: {reason.value}")
|
| 489 |
-
raise error
|
| 490 |
-
|
| 491 |
-
# Don't delay on last attempt
|
| 492 |
-
if attempt < config.max_attempts:
|
| 493 |
-
delay = self._calculate_delay(attempt, config, reason)
|
| 494 |
-
attempts[-1].delay = delay
|
| 495 |
-
|
| 496 |
-
logger.info(f"Retrying {domain} in {delay:.2f} seconds (reason: {reason.value})")
|
| 497 |
-
await asyncio.sleep(delay)
|
| 498 |
-
|
| 499 |
-
# All attempts failed
|
| 500 |
-
health = self._get_domain_health(domain)
|
| 501 |
-
error_msg = f"All {config.max_attempts} retry attempts failed for {domain}. Health score: {health.health_score:.2f}"
|
| 502 |
-
logger.error(error_msg)
|
| 503 |
-
|
| 504 |
-
# Re-raise the last error
|
| 505 |
-
if attempts:
|
| 506 |
-
raise Exception(f"{error_msg}. Last error: {attempts[-1].error}")
|
| 507 |
-
else:
|
| 508 |
-
raise Exception(error_msg)
|
| 509 |
-
|
| 510 |
-
async def execute_http_request(self, client: httpx.AsyncClient, method: str, url: str,
|
| 511 |
-
**kwargs) -> httpx.Response:
|
| 512 |
-
"""
|
| 513 |
-
Execute HTTP request with retry logic
|
| 514 |
-
|
| 515 |
-
Args:
|
| 516 |
-
client: httpx async client
|
| 517 |
-
method: HTTP method (GET, POST, etc.)
|
| 518 |
-
url: Request URL
|
| 519 |
-
**kwargs: Additional request parameters
|
| 520 |
-
|
| 521 |
-
Returns:
|
| 522 |
-
Response object
|
| 523 |
-
"""
|
| 524 |
-
|
| 525 |
-
async def make_request():
|
| 526 |
-
timeout_value = self._get_config(self._get_domain(url)).timeout
|
| 527 |
-
timeout = kwargs.get('timeout', httpx.Timeout(timeout_value))
|
| 528 |
-
kwargs['timeout'] = timeout
|
| 529 |
-
|
| 530 |
-
response = await client.request(method, url, **kwargs)
|
| 531 |
-
response.raise_for_status()
|
| 532 |
-
return response
|
| 533 |
-
|
| 534 |
-
return await self.execute_with_retry(make_request, url)
|
| 535 |
-
|
| 536 |
-
def get_domain_stats(self) -> Dict[str, Dict[str, Any]]:
|
| 537 |
-
"""Get statistics for all domains"""
|
| 538 |
-
stats = {}
|
| 539 |
-
|
| 540 |
-
for domain, health in self.domain_health.items():
|
| 541 |
-
circuit_breaker = self._get_circuit_breaker(domain)
|
| 542 |
-
|
| 543 |
-
stats[domain] = {
|
| 544 |
-
'health_score': round(health.health_score, 3),
|
| 545 |
-
'success_rate': round(health.success_rate, 3),
|
| 546 |
-
'total_requests': health.total_requests,
|
| 547 |
-
'success_count': health.success_count,
|
| 548 |
-
'failure_count': health.failure_count,
|
| 549 |
-
'average_response_time': round(health.average_response_time, 2),
|
| 550 |
-
'consecutive_failures': health.consecutive_failures,
|
| 551 |
-
'last_success': health.last_success.isoformat() if health.last_success else None,
|
| 552 |
-
'last_failure': health.last_failure.isoformat() if health.last_failure else None,
|
| 553 |
-
'circuit_breaker': circuit_breaker.status
|
| 554 |
-
}
|
| 555 |
-
|
| 556 |
-
return stats
|
| 557 |
-
|
| 558 |
-
def reset_domain_health(self, domain: Optional[str] = None):
|
| 559 |
-
"""Reset health tracking for a domain or all domains"""
|
| 560 |
-
if domain:
|
| 561 |
-
if domain in self.domain_health:
|
| 562 |
-
self.domain_health[domain] = DomainHealth(domain=domain)
|
| 563 |
-
if domain in self.circuit_breakers:
|
| 564 |
-
config = CircuitBreakerConfig() # Default config
|
| 565 |
-
self.circuit_breakers[domain] = CircuitBreaker(config)
|
| 566 |
-
else:
|
| 567 |
-
self.domain_health.clear()
|
| 568 |
-
self.circuit_breakers.clear()
|
| 569 |
-
self._initialize_domain_configs()
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
class RateLimiter:
|
| 573 |
-
"""Simple rate limiter for respecting site limits"""
|
| 574 |
-
|
| 575 |
-
def __init__(self, requests_per_minute: int = 60):
|
| 576 |
-
self.requests_per_minute = requests_per_minute
|
| 577 |
-
self.request_times: deque = deque(maxlen=requests_per_minute)
|
| 578 |
-
|
| 579 |
-
async def acquire(self):
|
| 580 |
-
"""Acquire rate limit slot, waiting if necessary"""
|
| 581 |
-
now = time.time()
|
| 582 |
-
|
| 583 |
-
# Remove old requests (older than 1 minute)
|
| 584 |
-
while self.request_times and now - self.request_times[0] > 60:
|
| 585 |
-
self.request_times.popleft()
|
| 586 |
-
|
| 587 |
-
# Check if we need to wait
|
| 588 |
-
if len(self.request_times) >= self.requests_per_minute:
|
| 589 |
-
wait_time = 60 - (now - self.request_times[0]) + 0.1 # Small buffer
|
| 590 |
-
if wait_time > 0:
|
| 591 |
-
await asyncio.sleep(wait_time)
|
| 592 |
-
|
| 593 |
-
# Record this request
|
| 594 |
-
self.request_times.append(now)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/api/scraper/services/parser/parser.py
DELETED
|
@@ -1,935 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Enhanced Multi-Engine Web Scraper
|
| 3 |
-
Combines BeautifulSoup, Playwright, and intelligent fallback strategies for robust data extraction
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
-
import asyncio
|
| 7 |
-
import hashlib
|
| 8 |
-
import logging
|
| 9 |
-
import re
|
| 10 |
-
import time
|
| 11 |
-
import traceback
|
| 12 |
-
from dataclasses import dataclass, field
|
| 13 |
-
from datetime import datetime
|
| 14 |
-
from enum import Enum
|
| 15 |
-
from typing import Dict, List, Optional, Any
|
| 16 |
-
from urllib.parse import urljoin, urlparse
|
| 17 |
-
|
| 18 |
-
import httpx
|
| 19 |
-
import html2text
|
| 20 |
-
from bs4 import BeautifulSoup, Tag
|
| 21 |
-
from playwright.async_api import async_playwright, Browser, Page, BrowserContext
|
| 22 |
-
from selectolax.parser import HTMLParser
|
| 23 |
-
|
| 24 |
-
from app.api.scraper.services.parser.intelligent_retry import IntelligentRetryManager
|
| 25 |
-
|
| 26 |
-
logger = logging.getLogger(__name__)
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
class ScrapingEngine(Enum):
|
| 30 |
-
"""Available scraping engines"""
|
| 31 |
-
BEAUTIFULSOUP = "beautifulsoup"
|
| 32 |
-
PLAYWRIGHT = "playwright"
|
| 33 |
-
SELECTOLAX = "selectolax"
|
| 34 |
-
HYBRID = "hybrid"
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
class ContentType(Enum):
|
| 38 |
-
"""Types of web content"""
|
| 39 |
-
STATIC_HTML = "static_html"
|
| 40 |
-
SPA_REACT = "spa_react"
|
| 41 |
-
SPA_ANGULAR = "spa_angular"
|
| 42 |
-
SPA_VUE = "spa_vue"
|
| 43 |
-
DYNAMIC_AJAX = "dynamic_ajax"
|
| 44 |
-
INFINITE_SCROLL = "infinite_scroll"
|
| 45 |
-
WORKDAY_ATS = "workday_ats"
|
| 46 |
-
GREENHOUSE_ATS = "greenhouse_ats"
|
| 47 |
-
LEVER_ATS = "lever_ats"
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
@dataclass
|
| 51 |
-
class SiteProfile:
|
| 52 |
-
"""Site-specific configuration and selectors"""
|
| 53 |
-
domain: str
|
| 54 |
-
name: str
|
| 55 |
-
content_type: ContentType
|
| 56 |
-
preferred_engine: ScrapingEngine
|
| 57 |
-
selectors: Dict[str, str] = field(default_factory=dict)
|
| 58 |
-
wait_conditions: List[str] = field(default_factory=list)
|
| 59 |
-
javascript_required: bool = False
|
| 60 |
-
rate_limit_delay: float = 1.0
|
| 61 |
-
custom_headers: Dict[str, str] = field(default_factory=dict)
|
| 62 |
-
login_required: bool = False
|
| 63 |
-
dynamic_loading: bool = False
|
| 64 |
-
infinite_scroll: bool = False
|
| 65 |
-
spa_detection_keywords: List[str] = field(default_factory=list)
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
@dataclass
|
| 69 |
-
class ScrapingResult:
|
| 70 |
-
"""Result of a scraping operation"""
|
| 71 |
-
success: bool
|
| 72 |
-
engine_used: ScrapingEngine
|
| 73 |
-
content: Optional[str] = None
|
| 74 |
-
jobs: List[Dict[str, Any]] = field(default_factory=list)
|
| 75 |
-
metadata: Dict[str, Any] = field(default_factory=dict)
|
| 76 |
-
errors: List[str] = field(default_factory=list)
|
| 77 |
-
performance: Dict[str, float] = field(default_factory=dict)
|
| 78 |
-
fallback_attempts: int = 0
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
class Parser:
|
| 82 |
-
"""
|
| 83 |
-
Enhanced multi-engine web scraper with intelligent fallback strategies
|
| 84 |
-
Handles static sites, SPAs, and dynamic content with multiple parsing engines
|
| 85 |
-
"""
|
| 86 |
-
|
| 87 |
-
def __init__(self, retry_manager: Optional[IntelligentRetryManager] = None):
|
| 88 |
-
self.retry_manager = retry_manager or IntelligentRetryManager()
|
| 89 |
-
self.site_profiles = self._initialize_site_profiles()
|
| 90 |
-
self.browser: Optional[Browser] = None
|
| 91 |
-
self.context: Optional[BrowserContext] = None
|
| 92 |
-
self.client: Optional[httpx.AsyncClient] = None
|
| 93 |
-
self.h2t = html2text.HTML2Text()
|
| 94 |
-
self.h2t.ignore_links = False
|
| 95 |
-
self.h2t.ignore_images = True
|
| 96 |
-
|
| 97 |
-
def _initialize_site_profiles(self) -> Dict[str, SiteProfile]:
|
| 98 |
-
"""Initialize site-specific configurations from the 16 working URLs"""
|
| 99 |
-
return {
|
| 100 |
-
"hetzner.com": SiteProfile(
|
| 101 |
-
domain="hetzner.com",
|
| 102 |
-
name="Hetzner",
|
| 103 |
-
content_type=ContentType.STATIC_HTML,
|
| 104 |
-
preferred_engine=ScrapingEngine.BEAUTIFULSOUP,
|
| 105 |
-
selectors={
|
| 106 |
-
"job_listings": ".job-offer",
|
| 107 |
-
"job_title": ".job-title",
|
| 108 |
-
"job_location": ".job-location",
|
| 109 |
-
"job_description": ".job-description",
|
| 110 |
-
"job_url": ".job-link"
|
| 111 |
-
},
|
| 112 |
-
wait_conditions=["networkidle"],
|
| 113 |
-
rate_limit_delay=2.0
|
| 114 |
-
),
|
| 115 |
-
|
| 116 |
-
"careers.ovhcloud.com": SiteProfile(
|
| 117 |
-
domain="careers.ovhcloud.com",
|
| 118 |
-
name="OVHcloud",
|
| 119 |
-
content_type=ContentType.SPA_REACT,
|
| 120 |
-
preferred_engine=ScrapingEngine.PLAYWRIGHT,
|
| 121 |
-
selectors={
|
| 122 |
-
"job_listings": "[data-testid='job-card']",
|
| 123 |
-
"job_title": "[data-testid='job-title']",
|
| 124 |
-
"job_location": "[data-testid='job-location']",
|
| 125 |
-
"job_description": "[data-testid='job-description']"
|
| 126 |
-
},
|
| 127 |
-
javascript_required=True,
|
| 128 |
-
wait_conditions=["load", "networkidle"],
|
| 129 |
-
spa_detection_keywords=["React", "react-dom"],
|
| 130 |
-
rate_limit_delay=3.0
|
| 131 |
-
),
|
| 132 |
-
|
| 133 |
-
"scaleway.com": SiteProfile(
|
| 134 |
-
domain="scaleway.com",
|
| 135 |
-
name="Scaleway",
|
| 136 |
-
content_type=ContentType.DYNAMIC_AJAX,
|
| 137 |
-
preferred_engine=ScrapingEngine.HYBRID,
|
| 138 |
-
selectors={
|
| 139 |
-
"job_listings": ".job-item",
|
| 140 |
-
"job_title": ".position-title",
|
| 141 |
-
"job_location": ".position-location"
|
| 142 |
-
},
|
| 143 |
-
javascript_required=True,
|
| 144 |
-
dynamic_loading=True,
|
| 145 |
-
rate_limit_delay=2.5
|
| 146 |
-
),
|
| 147 |
-
|
| 148 |
-
"equinix.wd1.myworkdayjobs.com": SiteProfile(
|
| 149 |
-
domain="equinix.wd1.myworkdayjobs.com",
|
| 150 |
-
name="Equinix",
|
| 151 |
-
content_type=ContentType.WORKDAY_ATS,
|
| 152 |
-
preferred_engine=ScrapingEngine.PLAYWRIGHT,
|
| 153 |
-
selectors={
|
| 154 |
-
"job_listings": "[data-automation-id='jobTitle']",
|
| 155 |
-
"job_title": "[data-automation-id='jobTitle'] a",
|
| 156 |
-
"job_location": "[data-automation-id='locations']"
|
| 157 |
-
},
|
| 158 |
-
javascript_required=True,
|
| 159 |
-
wait_conditions=["load", "networkidle", "domcontentloaded"],
|
| 160 |
-
rate_limit_delay=4.0,
|
| 161 |
-
custom_headers={
|
| 162 |
-
"Accept": "application/json, text/plain, */*",
|
| 163 |
-
"Accept-Language": "en-US,en;q=0.9"
|
| 164 |
-
}
|
| 165 |
-
),
|
| 166 |
-
|
| 167 |
-
"careers.digitalrealty.com": SiteProfile(
|
| 168 |
-
domain="careers.digitalrealty.com",
|
| 169 |
-
name="Digital Realty",
|
| 170 |
-
content_type=ContentType.STATIC_HTML,
|
| 171 |
-
preferred_engine=ScrapingEngine.BEAUTIFULSOUP,
|
| 172 |
-
selectors={
|
| 173 |
-
"job_listings": ".job-result",
|
| 174 |
-
"job_title": ".job-title a",
|
| 175 |
-
"job_location": ".job-location"
|
| 176 |
-
},
|
| 177 |
-
rate_limit_delay=2.0
|
| 178 |
-
),
|
| 179 |
-
|
| 180 |
-
"globalswitch.wd103.myworkdayjobs.com": SiteProfile(
|
| 181 |
-
domain="globalswitch.wd103.myworkdayjobs.com",
|
| 182 |
-
name="Global Switch",
|
| 183 |
-
content_type=ContentType.WORKDAY_ATS,
|
| 184 |
-
preferred_engine=ScrapingEngine.PLAYWRIGHT,
|
| 185 |
-
selectors={
|
| 186 |
-
"job_listings": "[data-automation-id='jobTitle']",
|
| 187 |
-
"job_title": "[data-automation-id='jobTitle'] a",
|
| 188 |
-
"job_location": "[data-automation-id='locations']"
|
| 189 |
-
},
|
| 190 |
-
javascript_required=True,
|
| 191 |
-
wait_conditions=["load", "networkidle"],
|
| 192 |
-
rate_limit_delay=4.0
|
| 193 |
-
),
|
| 194 |
-
|
| 195 |
-
"amazon.jobs": SiteProfile(
|
| 196 |
-
domain="amazon.jobs",
|
| 197 |
-
name="AWS",
|
| 198 |
-
content_type=ContentType.SPA_REACT,
|
| 199 |
-
preferred_engine=ScrapingEngine.PLAYWRIGHT,
|
| 200 |
-
selectors={
|
| 201 |
-
"job_listings": "[data-test='job-tile']",
|
| 202 |
-
"job_title": "[data-test='job-title']",
|
| 203 |
-
"job_location": "[data-test='job-location']"
|
| 204 |
-
},
|
| 205 |
-
javascript_required=True,
|
| 206 |
-
dynamic_loading=True,
|
| 207 |
-
infinite_scroll=True,
|
| 208 |
-
spa_detection_keywords=["React", "react-router"],
|
| 209 |
-
rate_limit_delay=3.0
|
| 210 |
-
),
|
| 211 |
-
|
| 212 |
-
"careers.google.com": SiteProfile(
|
| 213 |
-
domain="careers.google.com",
|
| 214 |
-
name="Google",
|
| 215 |
-
content_type=ContentType.SPA_ANGULAR,
|
| 216 |
-
preferred_engine=ScrapingEngine.PLAYWRIGHT,
|
| 217 |
-
selectors={
|
| 218 |
-
"job_listings": ".gc-card",
|
| 219 |
-
"job_title": ".gc-card__title",
|
| 220 |
-
"job_location": ".gc-card__location"
|
| 221 |
-
},
|
| 222 |
-
javascript_required=True,
|
| 223 |
-
wait_conditions=["load", "networkidle"],
|
| 224 |
-
spa_detection_keywords=["Angular", "ng-"],
|
| 225 |
-
rate_limit_delay=3.5
|
| 226 |
-
),
|
| 227 |
-
|
| 228 |
-
"jobs.careers.microsoft.com": SiteProfile(
|
| 229 |
-
domain="jobs.careers.microsoft.com",
|
| 230 |
-
name="Microsoft",
|
| 231 |
-
content_type=ContentType.SPA_REACT,
|
| 232 |
-
preferred_engine=ScrapingEngine.PLAYWRIGHT,
|
| 233 |
-
selectors={
|
| 234 |
-
"job_listings": "[data-ph-at-id='job-search-result-item']",
|
| 235 |
-
"job_title": "[data-ph-at-id='job-title']",
|
| 236 |
-
"job_location": "[data-ph-at-id='job-location']"
|
| 237 |
-
},
|
| 238 |
-
javascript_required=True,
|
| 239 |
-
dynamic_loading=True,
|
| 240 |
-
rate_limit_delay=3.0
|
| 241 |
-
),
|
| 242 |
-
|
| 243 |
-
"data4group.com": SiteProfile(
|
| 244 |
-
domain="data4group.com",
|
| 245 |
-
name="Data4",
|
| 246 |
-
content_type=ContentType.STATIC_HTML,
|
| 247 |
-
preferred_engine=ScrapingEngine.BEAUTIFULSOUP,
|
| 248 |
-
            selectors={
                "job_listings": ".career-item",
                "job_title": ".career-title",
                "job_location": ".career-location"
            },
            rate_limit_delay=2.0
        ),

        "careers.ironmountain.com": SiteProfile(
            domain="careers.ironmountain.com",
            name="Iron Mountain",
            content_type=ContentType.STATIC_HTML,
            preferred_engine=ScrapingEngine.SELECTOLAX,
            selectors={
                "job_listings": ".job-search-result",
                "job_title": ".job-title-link",
                "job_location": ".job-location"
            },
            rate_limit_delay=2.0
        ),

        "datacenters.ntt": SiteProfile(
            domain="datacenters.ntt",
            name="NTT Global Data Centers",
            content_type=ContentType.STATIC_HTML,
            preferred_engine=ScrapingEngine.BEAUTIFULSOUP,
            selectors={
                "job_listings": ".career-listing",
                "job_title": ".career-title",
                "job_location": ".career-location"
            },
            rate_limit_delay=2.5
        ),

        "cyrusone.wd5.myworkdayjobs.com": SiteProfile(
            domain="cyrusone.wd5.myworkdayjobs.com",
            name="CyrusOne",
            content_type=ContentType.WORKDAY_ATS,
            preferred_engine=ScrapingEngine.PLAYWRIGHT,
            selectors={
                "job_listings": "[data-automation-id='jobTitle']",
                "job_title": "[data-automation-id='jobTitle'] a",
                "job_location": "[data-automation-id='locations']"
            },
            javascript_required=True,
            wait_conditions=["load", "networkidle"],
            rate_limit_delay=4.0
        )
    }

    async def __aenter__(self):
        """Async context manager entry"""
        await self._initialize_clients()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit"""
        await self._cleanup_clients()

    async def _initialize_clients(self):
        """Initialize HTTP and browser clients"""
        # Initialize httpx client
        timeout = httpx.Timeout(timeout=30.0, connect=10.0)
        self.client = httpx.AsyncClient(
            timeout=timeout,
            headers={
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
            }
        )

        # Initialize Playwright browser
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=True,
            args=['--no-sandbox', '--disable-dev-shm-usage']
        )
        self.context = await self.browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        )

    async def _cleanup_clients(self):
        """Clean up clients and resources"""
        if self.client:
            await self.client.aclose()
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        if hasattr(self, 'playwright'):
            await self.playwright.stop()

    def _get_site_profile(self, url: str) -> Optional[SiteProfile]:
        """Get site profile for URL"""
        domain = urlparse(url).netloc.lower()

        # Direct domain match
        if domain in self.site_profiles:
            return self.site_profiles[domain]

        # Partial domain match
        for profile_domain, profile in self.site_profiles.items():
            if profile_domain in domain or domain in profile_domain:
                return profile

        return None

    async def _detect_content_type(self, url: str, html: str = None) -> ContentType:
        """Detect the type of content on a webpage"""
        if html:
            soup = BeautifulSoup(html, 'html.parser')

            # Check for SPA frameworks
            scripts = soup.find_all('script')
            script_content = ' '.join([s.get_text() for s in scripts if s.get_text()])

            if 'React' in script_content or 'react-dom' in script_content:
                return ContentType.SPA_REACT
            elif 'Angular' in script_content or 'ng-app' in html:
                return ContentType.SPA_ANGULAR
            elif 'Vue' in script_content or 'vue.js' in html:
                return ContentType.SPA_VUE
            elif 'workday' in url.lower():
                return ContentType.WORKDAY_ATS
            elif 'greenhouse' in url.lower():
                return ContentType.GREENHOUSE_ATS
            elif 'lever' in url.lower():
                return ContentType.LEVER_ATS
            elif soup.find_all(attrs={'data-testid': True}) or soup.find_all(attrs={'data-cy': True}):
                return ContentType.DYNAMIC_AJAX

        return ContentType.STATIC_HTML

    async def scrape_jobs(self, url: str, max_pages: int = 5) -> ScrapingResult:
        """
        Main scraping method with intelligent engine selection and fallback
        """
        start_time = time.time()
        site_profile = self._get_site_profile(url)

        logger.info(f"Scraping jobs from {url} with profile: {site_profile.name if site_profile else 'Unknown'}")

        # Try preferred engine first
        if site_profile:
            result = await self._scrape_with_engine(url, site_profile.preferred_engine, site_profile, max_pages)
            if result.success:
                result.performance['total_time'] = time.time() - start_time
                return result

        # Fallback strategy: try engines in order of complexity
        fallback_engines = [
            ScrapingEngine.PLAYWRIGHT,
            ScrapingEngine.BEAUTIFULSOUP,
            ScrapingEngine.SELECTOLAX
        ]

        for i, engine in enumerate(fallback_engines):
            if site_profile and engine == site_profile.preferred_engine:
                continue  # Already tried

            logger.info(f"Trying fallback engine {i + 1}/{len(fallback_engines)}: {engine.value}")
            result = await self._scrape_with_engine(url, engine, site_profile, max_pages)
            result.fallback_attempts = i + 1

            if result.success:
                result.performance['total_time'] = time.time() - start_time
                return result

        # All engines failed
        return ScrapingResult(
            success=False,
            engine_used=ScrapingEngine.HYBRID,
            errors=["All scraping engines failed"],
            performance={'total_time': time.time() - start_time},
            fallback_attempts=len(fallback_engines)
        )

    async def _scrape_with_engine(self, url: str, engine: ScrapingEngine,
                                  site_profile: Optional[SiteProfile], max_pages: int) -> ScrapingResult:
        """Scrape using specific engine"""
        try:
            if engine == ScrapingEngine.PLAYWRIGHT:
                return await self._scrape_with_playwright(url, site_profile, max_pages)
            elif engine == ScrapingEngine.BEAUTIFULSOUP:
                return await self._scrape_with_beautifulsoup(url, site_profile, max_pages)
            elif engine == ScrapingEngine.SELECTOLAX:
                return await self._scrape_with_selectolax(url, site_profile, max_pages)
            elif engine == ScrapingEngine.HYBRID:
                return await self._scrape_with_hybrid(url, site_profile, max_pages)
        except Exception as e:
            logger.error(f"Engine {engine.value} failed: {str(e)}")
            return ScrapingResult(
                success=False,
                engine_used=engine,
                errors=[f"Engine {engine.value} error: {str(e)}"]
            )

    async def _scrape_with_playwright(self, url: str, profile: Optional[SiteProfile], max_pages: int) -> ScrapingResult:
        """Scrape using Playwright for JavaScript-heavy sites"""
        page = await self.context.new_page()
        jobs = []
        errors = []

        try:
            # Set custom headers if provided
            if profile and profile.custom_headers:
                await page.set_extra_http_headers(profile.custom_headers)

            # Navigate to page
            response = await page.goto(url, wait_until='networkidle', timeout=30000)
            if response.status >= 400:
                raise Exception(f"HTTP {response.status}")

            # Wait for content to load
            if profile and profile.wait_conditions:
                for condition in profile.wait_conditions:
                    if condition == 'networkidle':
                        await page.wait_for_load_state('networkidle')
                    elif condition == 'load':
                        await page.wait_for_load_state('load')
                    elif condition == 'domcontentloaded':
                        await page.wait_for_load_state('domcontentloaded')

            # Handle infinite scroll
            if profile and profile.infinite_scroll:
                await self._handle_infinite_scroll(page)

            # Extract jobs
            html_content = await page.content()
            content_type = await self._detect_content_type(url, html_content)

            if profile and profile.selectors:
                jobs = await self._extract_jobs_with_selectors(page, profile.selectors, content_type)
            else:
                jobs = await self._extract_jobs_intelligent(page, content_type)

            return ScrapingResult(
                success=True,
                engine_used=ScrapingEngine.PLAYWRIGHT,
                content=html_content,
                jobs=jobs,
                metadata={
                    'content_type': content_type.value,
                    'page_title': await page.title(),
                    'final_url': page.url
                }
            )

        except Exception as e:
            errors.append(f"Playwright error: {str(e)}")
            return ScrapingResult(
                success=False,
                engine_used=ScrapingEngine.PLAYWRIGHT,
                errors=errors
            )
        finally:
            await page.close()

    async def _scrape_with_beautifulsoup(self, url: str, profile: Optional[SiteProfile],
                                         max_pages: int) -> ScrapingResult:
        """Scrape using BeautifulSoup for static HTML"""
        jobs = []
        errors = []

        try:
            # Apply rate limiting
            if profile:
                await asyncio.sleep(profile.rate_limit_delay)

            headers = {}
            if profile and profile.custom_headers:
                headers.update(profile.custom_headers)

            response = await self.client.get(url, headers=headers)
            response.raise_for_status()

            html_content = response.text
            soup = BeautifulSoup(html_content, 'html.parser')

            content_type = await self._detect_content_type(url, html_content)

            if profile and profile.selectors:
                jobs = self._extract_jobs_beautifulsoup(soup, profile.selectors)
            else:
                jobs = self._extract_jobs_beautifulsoup_intelligent(soup)

            return ScrapingResult(
                success=True,
                engine_used=ScrapingEngine.BEAUTIFULSOUP,
                content=html_content,
                jobs=jobs,
                metadata={
                    'content_type': content_type.value,
                    'page_title': soup.title.string if soup.title else '',
                    'final_url': url
                }
            )

        except Exception as e:
            traceback.print_exc()
            return ScrapingResult(
                success=False,
                engine_used=ScrapingEngine.BEAUTIFULSOUP,
                errors=errors
            )

    async def _scrape_with_selectolax(self, url: str, profile: Optional[SiteProfile], max_pages: int) -> ScrapingResult:
        """Scrape using Selectolax for fast HTML parsing"""
        jobs = []
        errors = []

        try:
            if profile:
                await asyncio.sleep(profile.rate_limit_delay)

            headers = {}
            if profile and profile.custom_headers:
                headers.update(profile.custom_headers)

            response = await self.client.get(url, headers=headers)
            response.raise_for_status()

            html_content = response.text
            tree = HTMLParser(html_content)

            content_type = await self._detect_content_type(url, html_content)

            if profile and profile.selectors:
                jobs = self._extract_jobs_selectolax(tree, profile.selectors)
            else:
                jobs = self._extract_jobs_selectolax_intelligent(tree)

            return ScrapingResult(
                success=True,
                engine_used=ScrapingEngine.SELECTOLAX,
                content=html_content,
                jobs=jobs,
                metadata={
                    'content_type': content_type.value,
                    'final_url': url
                }
            )

        except Exception as e:
            errors.append(f"Selectolax error: {str(e)}")
            return ScrapingResult(
                success=False,
                engine_used=ScrapingEngine.SELECTOLAX,
                errors=errors
            )

    async def _scrape_with_hybrid(self, url: str, profile: Optional[SiteProfile], max_pages: int) -> ScrapingResult:
        """Hybrid approach combining multiple engines"""
        # First, get initial content with BeautifulSoup
        bs_result = await self._scrape_with_beautifulsoup(url, profile, max_pages)

        # If JavaScript is likely required, try Playwright
        if bs_result.success and bs_result.content:
            content_type = await self._detect_content_type(url, bs_result.content)
            if content_type in [ContentType.SPA_REACT, ContentType.SPA_ANGULAR, ContentType.SPA_VUE,
                                ContentType.DYNAMIC_AJAX]:
                pw_result = await self._scrape_with_playwright(url, profile, max_pages)
                if pw_result.success:
                    return pw_result

        return bs_result

    async def _handle_infinite_scroll(self, page: Page, max_scrolls: int = 5):
        """Handle infinite scroll pages"""
        for _ in range(max_scrolls):
            # Scroll to bottom
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")

            # Wait for new content
            await asyncio.sleep(2)

            # Check if more content loaded
            prev_height = await page.evaluate("document.body.scrollHeight")
            await asyncio.sleep(1)
            new_height = await page.evaluate("document.body.scrollHeight")

            if prev_height == new_height:
                break

    async def _extract_jobs_with_selectors(self, page: Page, selectors: Dict[str, str], content_type: ContentType) -> \
            List[Dict[str, Any]]:
        """Extract jobs using specific selectors"""
        jobs = []

        try:
            # Wait for job listings to load
            if selectors.get('job_listings'):
                await page.wait_for_selector(selectors['job_listings'], timeout=10000)

            # Get all job elements
            job_elements = await page.query_selector_all(selectors.get('job_listings', '.job'))

            for element in job_elements:
                job_data = {}

                # Extract title
                if selectors.get('job_title'):
                    title_element = await element.query_selector(selectors['job_title'])
                    if title_element:
                        job_data['title'] = await title_element.inner_text()

                # Extract location
                if selectors.get('job_location'):
                    location_element = await element.query_selector(selectors['job_location'])
                    if location_element:
                        job_data['location'] = await location_element.inner_text()

                # Extract description
                if selectors.get('job_description'):
                    desc_element = await element.query_selector(selectors['job_description'])
                    if desc_element:
                        job_data['description'] = await desc_element.inner_text()

                # Extract URL
                if selectors.get('job_url'):
                    url_element = await element.query_selector(selectors['job_url'])
                    if url_element:
                        href = await url_element.get_attribute('href')
                        if href:
                            job_data['url'] = urljoin(page.url, href)

                if job_data.get('title'):  # Only add if we have at least a title
                    jobs.append(job_data)

        except Exception as e:
            logger.error(f"Error extracting jobs with selectors: {str(e)}")

        return jobs

    async def _extract_jobs_intelligent(self, page: Page, content_type: ContentType) -> List[Dict[str, Any]]:
        """Intelligent job extraction without specific selectors"""
        jobs = []

        try:
            # Get page content
            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')

            # Use intelligent extraction
            jobs = self._extract_jobs_beautifulsoup_intelligent(soup)

        except Exception as e:
            logger.error(f"Error in intelligent job extraction: {str(e)}")

        return jobs

    def _extract_jobs_beautifulsoup(self, soup: BeautifulSoup, selectors: Dict[str, str]) -> List[Dict[str, Any]]:
        """Extract jobs using BeautifulSoup with specific selectors"""
        jobs = []

        try:
            job_elements = soup.select(selectors.get('job_listings', '.job'))

            for element in job_elements:
                job_data = {}

                # Extract title
                if selectors.get('job_title'):
                    title_element = element.select_one(selectors['job_title'])
                    if title_element:
                        job_data['title'] = title_element.get_text(strip=True)

                # Extract location
                if selectors.get('job_location'):
                    location_element = element.select_one(selectors['job_location'])
                    if location_element:
                        job_data['location'] = location_element.get_text(strip=True)

                # Extract description
                if selectors.get('job_description'):
                    desc_element = element.select_one(selectors['job_description'])
                    if desc_element:
                        job_data['description'] = desc_element.get_text(strip=True)

                # Extract URL
                if selectors.get('job_url'):
                    url_element = element.select_one(selectors['job_url'])
                    if url_element and url_element.get('href'):
                        job_data['url'] = url_element['href']

                if job_data.get('title'):
                    jobs.append(job_data)

        except Exception as e:
            logger.error(f"Error extracting jobs with BeautifulSoup: {str(e)}")

        return jobs

    def _extract_jobs_beautifulsoup_intelligent(self, soup: BeautifulSoup) -> List[Dict[str, Any]]:
        """Intelligent job extraction using common patterns"""
        jobs = []

        # Common job-related selectors to try
        job_selectors = [
            '.job', '.job-item', '.job-listing', '.job-card', '.job-result',
            '.position', '.opening', '.career-item', '.vacancy',
            '[data-testid*="job"]', '[data-cy*="job"]', '[class*="job"]'
        ]

        for selector in job_selectors:
            elements = soup.select(selector)
            if elements and len(elements) > 0:
                for element in elements[:20]:  # Limit to 20 jobs per selector
                    job_data = self._extract_single_job_intelligent(element)
                    if job_data and job_data.get('title'):
                        jobs.append(job_data)

                if jobs:  # If we found jobs, return
                    break

        return jobs

    def _extract_single_job_intelligent(self, element: Tag) -> Dict[str, Any]:
        """Extract single job data intelligently"""
        job_data = {}

        # Extract title - look for links, headings, or job-related classes
        title_selectors = ['h1', 'h2', 'h3', 'h4', 'a', '.title', '.job-title', '.position-title']
        for selector in title_selectors:
            title_element = element.select_one(selector)
            if title_element:
                title_text = title_element.get_text(strip=True)
                if title_text and len(title_text) > 5:  # Basic validation
                    job_data['title'] = title_text
                    break

        # Extract location - look for location-related patterns
        location_patterns = [
            r'\b[A-Z][a-z]+,\s*[A-Z]{2,}\b',  # City, Country
            r'\b[A-Z][a-z]+,\s*[A-Z][a-z]+\b'  # City, State/Province
        ]

        element_text = element.get_text()
        for pattern in location_patterns:
            matches = re.findall(pattern, element_text)
            if matches:
                job_data['location'] = matches[0]
                break

        # Extract URL if element contains a link
        link = element.find('a')
        if link and link.get('href'):
            job_data['url'] = link['href']

        # Extract description (first paragraph or text content)
        description = element.get_text(strip=True)
        if description:
            job_data['description'] = description[:500]  # Limit length

        return job_data

    def _extract_jobs_selectolax(self, tree: HTMLParser, selectors: Dict[str, str]) -> List[Dict[str, Any]]:
        """Extract jobs using Selectolax with specific selectors"""
        jobs = []

        try:
            job_elements = tree.css(selectors.get('job_listings', '.job'))

            for element in job_elements:
                job_data = {}

                # Extract title
                if selectors.get('job_title'):
                    title_elements = element.css(selectors['job_title'])
                    if title_elements:
                        job_data['title'] = title_elements[0].text(strip=True)

                # Extract location
                if selectors.get('job_location'):
                    location_elements = element.css(selectors['job_location'])
                    if location_elements:
                        job_data['location'] = location_elements[0].text(strip=True)

                # Extract URL
                if selectors.get('job_url'):
                    url_elements = element.css(selectors['job_url'])
                    if url_elements and url_elements[0].attributes.get('href'):
                        job_data['url'] = url_elements[0].attributes['href']

                if job_data.get('title'):
                    jobs.append(job_data)

        except Exception as e:
            logger.error(f"Error extracting jobs with Selectolax: {str(e)}")

        return jobs

    def _extract_jobs_selectolax_intelligent(self, tree: HTMLParser) -> List[Dict[str, Any]]:
        """Intelligent job extraction using Selectolax"""
        jobs = []

        # Try common job selectors
        job_selectors = ['.job', '.job-item', '.job-listing', '.job-card', '.position']

        for selector in job_selectors:
            elements = tree.css(selector)
            if elements:
                for element in elements[:20]:  # Limit results
                    job_data = {}

                    # Extract title
                    title_elements = element.css('h1, h2, h3, a, .title')
                    if title_elements:
                        job_data['title'] = title_elements[0].text(strip=True)

                    # Extract text content for location
                    text = element.text()
                    location_match = re.search(r'\b[A-Z][a-z]+,\s*[A-Z]{2,}\b', text)
                    if location_match:
                        job_data['location'] = location_match.group()

                    # Extract URL
                    links = element.css('a')
                    if links and links[0].attributes.get('href'):
                        job_data['url'] = links[0].attributes['href']

                    if job_data.get('title'):
                        jobs.append(job_data)

                if jobs:
                    break

        return jobs

    def generate_content_hash(self, content: str) -> str:
        """Generate hash for content deduplication"""
        return hashlib.sha256(content.encode('utf-8')).hexdigest()

    async def validate_scraped_jobs(self, jobs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Validate and clean scraped job data"""
        validated_jobs = []

        for job in jobs:
            # Required fields validation
            if not job.get('title') or len(job['title'].strip()) < 5:
                continue

            # Clean and standardize data
            cleaned_job = {
                'title': job['title'].strip(),
                'company': job.get('company', '').strip(),
                'location': job.get('location', '').strip(),
                'description': job.get('description', '').strip()[:2000],  # Limit description
                'url': job.get('url', '').strip(),
                'scraped_at': datetime.now().isoformat(),
                'content_hash': self.generate_content_hash(f"{job['title']}{job.get('company', '')}")
            }

            # Only add if we have essential information
            if cleaned_job['title'] and (cleaned_job['company'] or cleaned_job['location']):
                validated_jobs.append(cleaned_job)

        return validated_jobs


# Helper functions for coordination hooks
async def store_scraping_progress(memory_key: str, data: Dict[str, Any]):
    """Store scraping progress in memory"""
    try:
        # This would integrate with the memory system
        logger.info(f"Storing progress to memory key: {memory_key}")
        pass
    except Exception as e:
        logger.error(f"Failed to store progress: {str(e)}")


async def main():
    """Example usage of the enhanced multi-engine scraper"""
    async with Parser() as scraper:
        # Test with a real working URL
        url = "https://career.hetzner.com/job-offers.html"
        result = await scraper.scrape_jobs(url)

        if result.success:
            logger.info(f"Successfully scraped {len(result.jobs)} jobs using {result.engine_used.value}")
            for job in result.jobs[:5]:  # Show first 5 jobs
                print(f"- {job.get('title', 'No title')} at {job.get('location', 'No location')}")
        else:
            logger.error(f"Scraping failed: {result.errors}")


if __name__ == "__main__":
    asyncio.run(main())
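The selector-free path of the deleted Parser boils down to a few BeautifulSoup lookups plus a "City, CC" location regex. A minimal, self-contained sketch of that heuristic on an invented listing (the sample HTML and variable names below are illustrative only, not taken from the repository):

import re
from bs4 import BeautifulSoup

SAMPLE = """
<div class="job-card">
  <h3>Data Center Technician</h3>
  <span>Amsterdam, NL</span>
  <a href="/jobs/dc-tech-123">Apply</a>
</div>
"""

card = BeautifulSoup(SAMPLE, "html.parser").select_one(".job-card")

# Mirror the removed _extract_single_job_intelligent: title from the first
# heading/link, URL from the first anchor, location from the regex sweep.
job = {
    "title": card.select_one("h1, h2, h3, h4, a, .title").get_text(strip=True),
    "url": card.find("a")["href"],
}
location = re.search(r"\b[A-Z][a-z]+,\s*[A-Z]{2,}\b", card.get_text())
if location:
    job["location"] = location.group()

# Should print something like:
# {'title': 'Data Center Technician', 'url': '/jobs/dc-tech-123', 'location': 'Amsterdam, NL'}
print(job)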
app/api/scraper/services/parser/parsers.py
ADDED
@@ -0,0 +1,293 @@
import asyncio
import json
from abc import abstractmethod, ABC

import httpx
import pydash
from linkedin_api import Linkedin
from requests.cookies import cookiejar_from_dict

from app.api.scraper.db_requests import save_job_obj
from app.api.scraper.dto import Salary
from app.api.scraper.models import JobModel
from app.api.scraper.utils import html_to_md
from app.core.config import settings


class BaseAPIScraper(ABC):
    def __init__(self):
        self.headers = {
            "Accept": "application/json",
            "Content-Type": "application/json",
            "Origin": "https://equinix.wd1.myworkdayjobs.com",
            "Referer": "https://equinix.wd1.myworkdayjobs.com/External",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15",
        }

    @abstractmethod
    async def _collect_all_jobs(self):
        pass

    @abstractmethod
    async def _get_job_details(self, job: dict):
        pass

    @abstractmethod
    async def _save_job_details(self, job_details: dict):
        pass

    async def _process_job(self, semaphore, job):
        async with semaphore:
            job_details = await self._get_job_details(job)
            await self._save_job_details(job_details)

    async def run(self):
        jobs = await self._collect_all_jobs()
        semaphore = asyncio.Semaphore(10)
        tasks = [self._process_job(semaphore, job) for job in jobs]
        await asyncio.gather(*tasks)

class EquinixParser(BaseAPIScraper):
    def __init__(self):
        super().__init__()
        self.url = "https://equinix.wd1.myworkdayjobs.com/wday/cxs/equinix/External"

    async def _collect_all_jobs(self):
        all_jobs = []
        offset = 1000
        limit = 20
        # with open('equinix.json', 'r', encoding='utf-8') as f:
        #     jobs = json.load(f)['jobs']
        # return jobs

        async with httpx.AsyncClient(timeout=httpx.Timeout(120)) as client:
            while True:
                payload = {
                    "appliedFacets": {
                        "locations": [
                            "9bde94053fcb019e5aef9196bc6b72ec",
                            "9bde94053fcb01c625460997bc6bd6ec",
                            "9bde94053fcb015852d08896bc6b68ec",
                            "9bde94053fcb0100f0725b97bc6b17ed",
                            "9bde94053fcb01bd68750c96bc6bebeb",
                            "9bde94053fcb01f9e9ff3d96bc6b1dec",
                            "9bde94053fcb01c9340b3996bc6b18ec",
                            "9bde94053fcb01c59f552696bc6b04ec",
                            "9bde94053fcb01d86d2eeb97bc6ba8ed",
                            "9ba19ba16c33103420c1c16fb36c32a5",
                            "9bde94053fcb01aca28fc095bc6ba5eb",
                            "9bde94053fcb01e99181b095bc6b96eb",
                            "9bde94053fcb01c6b749a795bc6b8ceb",
                            "9bde94053fcb0164431fe394bc6b0feb",
                        ]
                    },
                    "limit": 20,
                    "offset": 0,
                }

                response = await client.post(
                    f"{self.url}/jobs", headers=self.headers, json=payload
                )
                response.raise_for_status()
                data = response.json()

                jobs = data.get("jobPostings", [])
                if not jobs:
                    break
                all_jobs.extend(jobs)
                if len(all_jobs) >= 1000:
                    break
                offset += limit
                await asyncio.sleep(0.1)
        with open('equinix.json', 'w', encoding='utf-8') as f:
            f.write(json.dumps({"jobs": all_jobs}, ensure_ascii=False))
        return all_jobs

    async def _get_job_details(self, job: dict):
        async with httpx.AsyncClient(timeout=httpx.Timeout(120)) as client:
            response = await client.get(
                f"{self.url}{job['externalPath']}", headers=self.headers
            )
            response.raise_for_status()
            return response.json()

    async def _save_job_details(self, job_details: dict):
        job = JobModel(
            title=pydash.get(job_details, "jobPostingInfo.title"),
            company=pydash.get(job_details, "hiringOrganization.name"),
            location=pydash.get(job_details, "jobPostingInfo.location"),
            description=html_to_md(
                pydash.get(job_details, "jobPostingInfo.jobDescription", "")
            ),
            # salary=Salary(
            #     min=job_details["salary"]["min"], max=job_details["salary"]["max"]
            # ),
            sourceUrl=pydash.get(job_details, "jobPostingInfo.externalUrl"),
        )
        await save_job_obj(job)


class AmazonParser(BaseAPIScraper):
    def __init__(self):
        super().__init__()
        self.url = "https://www.amazon.jobs/en"

    async def _collect_all_jobs(self):
        all_jobs = []
        offset = 0
        limit = 100
        async with httpx.AsyncClient(timeout=httpx.Timeout(120)) as client:
            while True:
                response = await client.get(
                    f"{self.url}/search.json",
                    params={
                        "offset": offset,
                        "result_limit": limit,
                        "loc_query": "Europe",
                    },
                    headers=self.headers,
                )
                response.raise_for_status()
                data = response.json()
                jobs = data.get("jobs", [])
                if not jobs:
                    break
                all_jobs.extend(jobs)
                offset += limit
                await asyncio.sleep(0.5)
        return all_jobs

    async def _get_job_details(self, job: dict):
        async with httpx.AsyncClient(timeout=httpx.Timeout(120)) as client:
            response = await client.get(f"{self.url}/{job['jobId']}")
            response.raise_for_status()
            return response.json()

    async def _save_job_details(self, job_details: dict):
        job = JobModel(
            title=job_details["title"],
            company=job_details["company"],
            location=job_details["location"],
            description=job_details["description"],
            requirements=job_details["requirements"],
            salary=Salary(
                min=job_details["salary"]["min"], max=job_details["salary"]["max"]
            ),
            sourceUrl=job_details["url"],
            isTop5=job_details["isTop5"],
            datetimeInserted=job_details["datetimeInserted"],
        )
        await save_job_obj(job)

    async def run(self):
        jobs = await self._collect_all_jobs()
        for job in jobs:
            job_details = await self._get_job_details(job)
            await self._save_job_details(job_details)


class LinkedinParser(BaseAPIScraper):
    def __init__(self):
        super().__init__()
        self.client = Linkedin(
            "",
            "",
            cookies=cookiejar_from_dict(
                {
                    "liap": "true",
                    "li_at": settings.LINKEDIN_COOKIE_LI_AT,
                    "JSESSIONID": settings.LINKEDIN_COOKIE_JSESSIONID,
                }
            ),
        )

    async def _collect_all_jobs(self):
        offset = 0
        limit = 100
        while True:
            jobs = self.client.search_jobs(
                location_name="European Union", limit=limit, offset=offset
            )
            if not jobs:
                break
            offset += limit
            await asyncio.sleep(0.5)
        return jobs

    async def _get_job_details(self, job: dict):
        job = self.client.get_job(job["entityUrn"].split(":")[-1])
        return job

    async def _save_job_details(self, job_details: dict):
        print(job_details)
        job = JobModel(
            title=job_details["title"],
            company=job_details["company"],
            location=job_details["location"],
            description=job_details["description"],
            requirements=job_details["requirements"],
            salary=Salary(
                min=job_details["salary"]["min"], max=job_details["salary"]["max"]
            ),
            sourceUrl=job_details["url"],
            isTop5=job_details["isTop5"],
            datetimeInserted=job_details["datetimeInserted"],
        )
        await save_job_obj(job)


class GoogleCareersParser(BaseAPIScraper):
    def __init__(self):
        super().__init__()
        self.url = "https://www.google.com/about/careers/applications/jobs/results"

    async def _collect_all_jobs(self):
        all_jobs = []
        page = 1
        async with httpx.AsyncClient(timeout=httpx.Timeout(120)) as client:
            while True:
                response = await client.get(
                    f"{self.url}?location=Netherlands&location=France&location=England&location=Germany&page={page}",
                    headers=self.headers,
                )
                response.raise_for_status()
                data = response.json()
                jobs = data.get("jobs", [])
                if not jobs:
                    break
                all_jobs.extend(jobs)
                page += 1
                await asyncio.sleep(0.5)
        return all_jobs

    async def _get_job_details(self, job: dict):
        async with httpx.AsyncClient(timeout=httpx.Timeout(120)) as client:
            response = await client.get(f"{self.url}/{job['id']}")
            response.raise_for_status()
            return response.json()

    async def _save_job_details(self, job_details: dict):
        job = JobModel(
            title=job_details["title"],
            company=job_details["company"],
            location=job_details["location"],
            description=job_details["description"],
            requirements=job_details["requirements"],
            salary=Salary(
                min=job_details["salary"]["min"], max=job_details["salary"]["max"]
            ),
            sourceUrl=job_details["url"],
            isTop5=job_details["isTop5"],
            datetimeInserted=job_details["datetimeInserted"],
        )
        await save_job_obj(job)


async def main():
    parser = EquinixParser()
    jobs = await parser.run()
    print(jobs)


if __name__ == "__main__":
    asyncio.run(main())
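These API parsers are driven from the service layer. As a rough sketch only, assuming run_update (imported in views.py from app.api.scraper.services, with its real body in services/background_tasks.py, not shown here) simply iterates over the available parsers:

import asyncio

from app.api.scraper.services.parser.parsers import AmazonParser, EquinixParser


async def run_update() -> None:
    # Hypothetical shape: run each source in turn; a failure in one parser
    # should not abort the whole update cycle.
    for parser_cls in (EquinixParser, AmazonParser):
        try:
            await parser_cls().run()
        except Exception as exc:
            print(f"{parser_cls.__name__} failed: {exc}")


if __name__ == "__main__":
    asyncio.run(run_update())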
app/api/scraper/utils.py
ADDED
@@ -0,0 +1,16 @@
import html2text


def html_to_md(html_string: str) -> str:
    try:
        converter = html2text.HTML2Text()
        converter.ignore_links = True
        converter.ignore_images = True
        markdown = converter.handle(html_string)
        return markdown
    except Exception as e:
        return html_string

if __name__ == "__main__":
    t = {'canApply': True, 'country': {'descriptor': 'Nigeria', 'id': 'db69d536446c11de98360015c5e6daf6'}, 'externalUrl': 'https://equinix.wd1.myworkdayjobs.com/External/job/Lagos-Nigeria/IP-Projects-and-Implementation-Engineer_JR-155016-1', 'id': '4fbd87f4a0b41000b23edbb2fde00000', 'includeResumeParsing': True, 'jobDescription': '<p><b>Who are we?</b></p><div><p><span><span>Equinix is the world’s digital infrastructure company®, shortening the path to c...n, status as a victim or family member of a victim of crime or abuse, or any other status protected by applicable law.\xa0</p>', 'jobPostingId': 'IP-Projects-and-Implementation-Engineer_JR-155016-1', 'jobPostingSiteId': 'External', 'jobReqId': 'JR-155016', 'jobRequisitionLocation': {'country': {'alpha2Code': 'NG', 'descriptor': 'Nigeria', 'id': 'db69d536446c11de98360015c5e6daf6'}, 'descriptor': 'Lagos, Nigeria'}, 'location': 'Lagos, Nigeria', 'posted': True, 'postedOn': 'Posted Today', 'questionnaireId': '561a2ce86f57100202e03ac5c0880000', 'remoteType': 'Hybrid', 'startDate': '2025-09-15', 'timeType': 'Full time', 'title': 'IP Projects and Implementation Engineer'}
    print(html_to_md(t["jobDescription"]))
app/api/scraper/views.py
CHANGED
@@ -2,6 +2,7 @@
 Chat API views module.
 """

+import asyncio
 from fastapi import Query

 from app.api.common.db_requests import (
@@ -12,11 +13,16 @@ from app.api.common.db_requests import (
 from app.api.common.dto import Paging
 from app.api.common.schemas import AllObjectsResponse, FilterRequest
 from app.api.scraper import scraper_router
-from app.api.scraper.db_requests import
-
+from app.api.scraper.db_requests import (
+    filter_jobs,
+    search_field_options,
+    get_statistics,
+)
+from app.api.scraper.schemas import SearchOptionRequest, StatisticsResponse
 from app.api.scraper.dto import JobFilter
 from app.api.scraper.models import JobModel
 from app.core.wrappers import CbhResponseWrapper
+from app.api.scraper.services import run_update


 @scraper_router.get("/all")
@@ -58,7 +64,16 @@ async def scrape(
     )
 )

-
+
+@scraper_router.get("/statistics")
+async def get_update_statistics() -> CbhResponseWrapper[StatisticsResponse]:
+    """
+    Get the update statistics.
+    """
+    statistics = await get_statistics()
+    return CbhResponseWrapper(data=statistics)
+
+@scraper_router.post("/option/{field}/search")
 async def scrape_option_search(
     field: str,
     request: SearchOptionRequest,
@@ -78,6 +93,7 @@ async def scrape_option_search(
     )
 )

+
 @scraper_router.get("/{jobId}")
 async def get_job(
     jobId: str,  # pylint: disable=C0103
@@ -98,3 +114,13 @@ async def delete_job(
     """
     await delete_obj(JobModel, jobId)
     return CbhResponseWrapper()
+
+
+@scraper_router.post("/run")
+async def run_scraper() -> CbhResponseWrapper:
+    """
+    Run the scraper.
+    """
+    asyncio.create_task(run_update())
+    return CbhResponseWrapper()
+
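The committed /run endpoint schedules run_update() with a bare asyncio.create_task and discards the returned Task, so nothing outside the event loop holds a strong reference to it. An alternative sketch (not what this commit does) using FastAPI's BackgroundTasks, which keeps the reference and runs the work after the response is sent:

from fastapi import BackgroundTasks

# Assumes the same module context as views.py: scraper_router,
# CbhResponseWrapper and run_update are already imported there.
@scraper_router.post("/run")
async def run_scraper(background_tasks: BackgroundTasks) -> CbhResponseWrapper:
    """
    Run the scraper after the response has been returned.
    """
    background_tasks.add_task(run_update)
    return CbhResponseWrapper()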
app/core/config.py
CHANGED
@@ -29,6 +29,8 @@ class BaseConfig:
     SECRET_KEY: str = os.getenv("SECRET", "")

     DB_CLIENT: AsyncIOMotorDatabase = AsyncIOMotorClient(os.getenv("MONGO_DB_URL")).euscrapper
+    LINKEDIN_COOKIE_LI_AT = os.getenv("LINKEDIN_COOKIE_LI_AT")
+    LINKEDIN_COOKIE_JSESSIONID = os.getenv("LINKEDIN_COOKIE_JSESSIONID")

     @staticmethod
     def get_headers(api_key: str) -> dict:
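Both LinkedIn cookies now come from the environment and default to None when unset. A hypothetical pre-flight helper (not part of this commit) that callers could use before constructing LinkedinParser:

from app.core.config import settings


def linkedin_cookies_configured() -> bool:
    # Both cookies are required for linkedin_api to authenticate; missing
    # values here mean LinkedinParser should be skipped for this run.
    return bool(settings.LINKEDIN_COOKIE_LI_AT and settings.LINKEDIN_COOKIE_JSESSIONID)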
main.py
CHANGED
File without changes
requirements.txt
ADDED
@@ -0,0 +1,42 @@
# Web framework
fastapi==0.104.1
uvicorn==0.24.0

# Database
motor==3.3.2
pymongo==4.6.0

# HTTP requests and web scraping
requests==2.31.0
httpx==0.25.2
beautifulsoup4==4.12.2
selenium==4.15.2

# HTML parsing (alternative/faster option)
selectolax==0.3.17
html2text==2020.1.16

# Web automation
playwright==1.40.0

# Async job scheduling
apscheduler==3.10.4

# AI/ML
langchain-core==0.1.0
langchain-openai==0.0.2

# Configuration and environment
python-dotenv==1.0.0
pydantic==2.5.0

# CORS middleware for FastAPI
starlette==0.27.0

# Additional utilities
python-multipart==0.0.6

# Additional dependencies for enhanced scraping
lxml==4.9.3
fake-useragent==1.4.0
aiofiles==23.2.0