brestok committed on
Commit
ab0a73d
·
1 Parent(s): b60402f
app/api/scraper/db_requests.py CHANGED
@@ -4,12 +4,12 @@ Scraper database requests.
4
 
5
  import asyncio
6
  import re
7
- from datetime import datetime
8
-
9
  from app.api.common.schemas import FilterRequest
10
- from app.api.scraper.dto import JobFilter, Salary
11
  from app.api.scraper.models import JobModel
12
  from app.core.config import settings
 
13
 
14
 
15
  async def filter_jobs(
@@ -20,25 +20,31 @@ async def filter_jobs(
20
  """
21
  query = {}
22
  skip = filter_request.pageSize * filter_request.pageIndex
23
-
24
  if filter_request.filter.titles:
25
  query["title"] = {
26
- "$regex": "|".join([re.escape(title) for title in filter_request.filter.titles]),
27
- "$options": "i"
 
 
28
  }
29
-
30
  if filter_request.filter.companies:
31
  query["company"] = {
32
- "$regex": "|".join([re.escape(company) for company in filter_request.filter.companies]),
33
- "$options": "i"
 
 
34
  }
35
-
36
  if filter_request.filter.locations:
37
  query["location"] = {
38
- "$regex": "|".join([re.escape(location) for location in filter_request.filter.locations]),
39
- "$options": "i"
 
 
40
  }
41
-
42
  if filter_request.filter.minSalary:
43
  query["salary.min"] = {"$gte": filter_request.filter.minSalary}
44
  if filter_request.filter.maxSalary:
@@ -60,134 +66,70 @@ async def filter_jobs(
60
  )
61
  return [JobModel.from_mongo(job) for job in jobs], total_count
62
 
 
63
  async def search_field_options(field: str, value: str) -> list[str]:
64
  """
65
  Search field options based on the field and value.
66
  """
67
  if field not in ["title", "company", "location"]:
68
  return []
69
-
70
- pipeline = [{
71
- "$match": {
72
- field: {"$regex": re.escape(value), "$options": "i"}
73
- }
74
- }]
75
-
76
- pipeline.extend([
77
- {"$group": {"_id": f"${field}"}},
78
- {"$project": {"_id": 0, "value": "$_id"}},
79
- {"$sort": {"value": 1}},
80
- {"$limit": 5}
81
- ])
82
-
83
  result = await settings.DB_CLIENT.jobs.aggregate(pipeline).to_list(length=5)
84
  return [item["value"] for item in result if item["value"] and item["value"].strip()]
85
 
86
 
87
- async def main():
88
- jobs = [
89
- JobModel(
90
- title="Frontend Developer",
91
- company="Meta",
92
- location="Menlo Park, CA",
93
- description="Join our frontend team to build the next generation of social media experiences. You'll work with React, TypeScript, and cutting-edge web technologies to create engaging user interfaces that connect billions of people worldwide.",
94
- requirements="3+ years of experience with React, JavaScript/TypeScript, HTML5, CSS3. Experience with modern build tools and testing frameworks. Strong understanding of responsive design and cross-browser compatibility.",
95
- salary=Salary(min=110000, max=160000),
96
- isTop5=True,
97
- sourceUrl="https://www.metacareers.com",
98
- ),
99
-
100
- JobModel(
101
- title="DevOps Engineer",
102
- company="Amazon",
103
- location="Seattle, WA",
104
- description="We're looking for a DevOps Engineer to help scale our cloud infrastructure. You'll work with AWS services, Kubernetes, and automation tools to ensure reliable, scalable, and secure deployments across multiple environments.",
105
- requirements="5+ years of experience with AWS services, Docker, Kubernetes, Jenkins/GitLab CI, Terraform. Strong scripting skills in Python or Bash. Experience with monitoring and logging solutions.",
106
- salary=Salary(min=130000, max=190000),
107
- isTop5=True,
108
- sourceUrl="https://www.amazon.jobs",
109
- ),
110
-
111
- JobModel(
112
- title="Data Scientist",
113
- company="Netflix",
114
- location="Los Gatos, CA",
115
- description="Join our data science team to analyze user behavior and optimize content recommendations. You'll work with large datasets, machine learning models, and statistical analysis to drive business decisions and enhance user experience.",
116
- requirements="PhD or Master's in Statistics, Computer Science, or related field. 4+ years of experience with Python, R, SQL, machine learning libraries (scikit-learn, TensorFlow). Experience with A/B testing and statistical modeling.",
117
- salary=Salary(min=140000, max=200000),
118
- isTop5=True,
119
- sourceUrl="https://jobs.netflix.com",
120
- ),
121
-
122
- JobModel(
123
- title="Backend Developer",
124
- company="Spotify",
125
- location="Stockholm, Sweden",
126
- description="Build and maintain scalable backend services that power music streaming for millions of users. Work with microservices architecture, APIs, and distributed systems using modern technologies.",
127
- requirements="3+ years of backend development experience with Java, Python, or Go. Experience with microservices, REST APIs, databases (PostgreSQL, Redis). Knowledge of cloud platforms and containerization.",
128
- salary=Salary(min=80000, max=120000),
129
- isTop5=False,
130
- sourceUrl="https://www.lifeatspotify.com",
131
- ),
132
-
133
- JobModel(
134
- title="Mobile App Developer",
135
- company="Uber",
136
- location="San Francisco, CA",
137
- description="Develop and maintain mobile applications for iOS and Android platforms. Work on features that millions of users interact with daily, focusing on performance, user experience, and reliability.",
138
- requirements="4+ years of mobile development experience with Swift/Objective-C for iOS or Kotlin/Java for Android. Experience with React Native is a plus. Strong understanding of mobile UI/UX principles.",
139
- salary=Salary(min=115000, max=165000),
140
- isTop5=False,
141
- sourceUrl="https://www.uber.com/careers",
142
- ),
143
-
144
- JobModel(
145
- title="Product Manager",
146
- company="Airbnb",
147
- location="San Francisco, CA",
148
- description="Lead product strategy and development for our host and guest experience platforms. Work cross-functionally with engineering, design, and data teams to deliver features that enhance user satisfaction and business growth.",
149
- requirements="5+ years of product management experience, preferably in consumer-facing products. Strong analytical skills, experience with A/B testing, user research. MBA preferred but not required.",
150
- salary=Salary(min=140000, max=220000),
151
- isTop5=True,
152
- sourceUrl="https://careers.airbnb.com",
153
- ),
154
-
155
- JobModel(
156
- title="Cybersecurity Analyst",
157
- company="Microsoft",
158
- location="Redmond, WA",
159
- description="Protect our cloud infrastructure and services from security threats. Monitor security events, conduct threat analysis, and implement security measures to safeguard customer data and corporate assets.",
160
- requirements="Bachelor's in Cybersecurity or related field. 3+ years of experience in security operations, threat detection, incident response. Certifications like CISSP, CEH, or equivalent preferred.",
161
- salary=Salary(min=95000, max=140000),
162
- isTop5=False,
163
- sourceUrl="https://careers.microsoft.com",
164
- ),
165
-
166
- JobModel(
167
- title="UX Designer",
168
- company="Adobe",
169
- location="San Jose, CA",
170
- description="Design intuitive and engaging user experiences for our creative software products. Collaborate with product managers and engineers to create user-centered designs that enhance creativity and productivity for millions of users.",
171
- requirements="4+ years of UX design experience with a strong portfolio. Proficiency in design tools like Figma, Sketch, Adobe Creative Suite. Experience with user research, prototyping, and design systems.",
172
- salary=Salary(min=105000, max=150000),
173
- isTop5=False,
174
- sourceUrl="https://adobe.wd5.myworkdayjobs.com",
175
- ),
176
-
177
- JobModel(
178
- title="Machine Learning Engineer",
179
- company="Tesla",
180
- location="Palo Alto, CA",
181
- description="Develop and deploy machine learning models for autonomous driving systems. Work with computer vision, deep learning, and sensor fusion to advance the future of transportation and sustainable energy.",
182
- requirements="Master's or PhD in Machine Learning, AI, or related field. 3+ years of experience with PyTorch, TensorFlow, computer vision. Experience with autonomous systems, robotics, or automotive industry preferred.",
183
- salary=Salary(min=150000, max=220000),
184
- isTop5=True,
185
- sourceUrl="https://www.tesla.com/careers",
186
- ),
187
- ]
188
-
189
- for job in jobs:
190
- await settings.DB_CLIENT.jobs.insert_one(job.to_mongo())
191
-
192
- if __name__ == "__main__":
193
- asyncio.run(main())
 
4
 
5
  import asyncio
6
  import re
7
+ from datetime import timedelta, datetime
 
8
  from app.api.common.schemas import FilterRequest
9
+ from app.api.scraper.dto import JobFilter
10
  from app.api.scraper.models import JobModel
11
  from app.core.config import settings
12
+ from app.api.scraper.schemas import StatisticsResponse
13
 
14
 
15
  async def filter_jobs(
 
20
  """
21
  query = {}
22
  skip = filter_request.pageSize * filter_request.pageIndex
23
+
24
  if filter_request.filter.titles:
25
  query["title"] = {
26
+ "$regex": "|".join(
27
+ [re.escape(title) for title in filter_request.filter.titles]
28
+ ),
29
+ "$options": "i",
30
  }
31
+
32
  if filter_request.filter.companies:
33
  query["company"] = {
34
+ "$regex": "|".join(
35
+ [re.escape(company) for company in filter_request.filter.companies]
36
+ ),
37
+ "$options": "i",
38
  }
39
+
40
  if filter_request.filter.locations:
41
  query["location"] = {
42
+ "$regex": "|".join(
43
+ [re.escape(location) for location in filter_request.filter.locations]
44
+ ),
45
+ "$options": "i",
46
  }
47
+
48
  if filter_request.filter.minSalary:
49
  query["salary.min"] = {"$gte": filter_request.filter.minSalary}
50
  if filter_request.filter.maxSalary:
 
66
  )
67
  return [JobModel.from_mongo(job) for job in jobs], total_count
68
 
69
+
70
async def search_field_options(field: str, value: str) -> list[str]:
    """Return up to five distinct values of *field* that match *value*.

    Matching is case-insensitive substring search; only the whitelisted
    fields ``title``, ``company`` and ``location`` are searchable — any
    other field name yields an empty list.
    """
    if field not in ("title", "company", "location"):
        return []

    # One aggregation literal: match -> dedupe -> rename -> sort -> cap at 5.
    pipeline = [
        {"$match": {field: {"$regex": re.escape(value), "$options": "i"}}},
        {"$group": {"_id": f"${field}"}},
        {"$project": {"_id": 0, "value": "$_id"}},
        {"$sort": {"value": 1}},
        {"$limit": 5},
    ]

    docs = await settings.DB_CLIENT.jobs.aggregate(pipeline).to_list(length=5)
    # Drop empty or whitespace-only values before returning.
    return [doc["value"] for doc in docs if doc["value"] and doc["value"].strip()]
90
 
91
 
92
async def get_statistics() -> StatisticsResponse:
    """Return scrape statistics: the last insert time and the next update.

    The scraper is assumed to run every Sunday at 23:00 (local time), so
    ``nextUpdate`` is the upcoming Sunday 23:00 — today, if it is Sunday
    and 23:00 has not passed yet.

    Raises:
        ValueError: if the jobs collection is empty, so no last update exists.
    """
    result = (
        await settings.DB_CLIENT.jobs.find({}, {"_id": 0, "datetimeInserted": 1})
        .sort("_id", -1)
        .limit(1)
        .to_list(length=1)
    )
    # Guard against an empty collection instead of crashing with IndexError.
    if not result:
        raise ValueError("No jobs in the database; statistics are unavailable.")

    raw = result[0]["datetimeInserted"]
    # The stored value may come back as a datetime object or an ISO-8601
    # string depending on how the document was written — accept both.
    last_update = raw if isinstance(raw, datetime) else datetime.fromisoformat(raw)

    now = datetime.now()
    days_until_sunday = (6 - now.weekday()) % 7  # Monday=0 ... Sunday=6
    if days_until_sunday == 0 and now.hour < 23:
        # It is Sunday before 23:00: the update still happens today.
        next_update = now.replace(hour=23, minute=0, second=0, microsecond=0)
    else:
        if days_until_sunday == 0:
            # Sunday at/after 23:00: the next run is a full week away.
            days_until_sunday = 7
        next_update = (now + timedelta(days=days_until_sunday)).replace(
            hour=23, minute=0, second=0, microsecond=0
        )
    return StatisticsResponse(lastUpdate=last_update, nextUpdate=next_update)
115
+
116
+
117
async def save_job_obj(job: JobModel):
    """Persist *job* to the jobs collection, skipping duplicates.

    A job whose title and description already exist in the database
    is not inserted a second time.
    """
    already_stored = await check_if_job_exists(job)
    if already_stored:
        print(f"Skipped job [{job.title}]!")
    else:
        await settings.DB_CLIENT.jobs.insert_one(job.to_mongo())
        print(f"Job [{job.title}] saved!")
126
+
127
+
128
async def check_if_job_exists(job: JobModel) -> bool:
    """Return True when a job with the same title and description is stored.

    Only ``_id`` is projected so the duplicate check transfers a minimal
    document over the wire.
    """
    # Use a distinct name for the lookup result instead of shadowing the
    # ``job`` parameter, which made the original confusing to read.
    existing = await settings.DB_CLIENT.jobs.find_one(
        {"title": job.title, "description": job.description}, {"_id": 1}
    )
    return existing is not None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/api/scraper/models.py CHANGED
@@ -7,12 +7,12 @@ from app.core.database import MongoBaseModel
7
 
8
 
9
  class JobModel(MongoBaseModel):
10
- title: str
11
- company: str
12
- location: str
13
- description: str
14
- requirements: str
15
- salary: Salary
16
  isTop5: bool = False
17
- sourceUrl: str
18
- datetimeInserted: datetime = Field(default_factory=datetime.now)
 
7
 
8
 
9
class JobModel(MongoBaseModel):
    """A scraped job posting as stored in MongoDB.

    All scraped fields default to ``None`` — presumably so a parser that
    fails to extract a particular attribute can still save the job
    (TODO confirm against the parser implementations).
    """

    title: str | None = None  # job title as scraped
    company: str | None = None  # employer name
    location: str | None = None  # free-form location string
    description: str | None = None  # full job description text
    requirements: str | None = None  # listed requirements text
    salary: Salary | None = None  # min/max salary range, when published
    isTop5: bool = False  # presumably flags a featured/top-5 job — verify with callers
    sourceUrl: str | None = None  # URL of the original posting
    datetimeInserted: datetime = Field(default_factory=datetime.now)  # set at insert time
app/api/scraper/schemas.py CHANGED
@@ -1,5 +1,10 @@
 
1
  from pydantic import BaseModel
2
 
3
 
4
  class SearchOptionRequest(BaseModel):
5
- value: str
 
 
 
 
 
1
+ from datetime import datetime
2
  from pydantic import BaseModel
3
 
4
 
5
class SearchOptionRequest(BaseModel):
    """Request body for field-option search."""

    value: str  # text to match — presumably partial user input; confirm with frontend


class StatisticsResponse(BaseModel):
    """Response carrying scrape-schedule statistics."""

    lastUpdate: datetime  # when job data was last inserted
    nextUpdate: datetime  # when the next scheduled update will run
app/api/scraper/services/__init__.py CHANGED
@@ -4,5 +4,6 @@ Chat services.
4
 
5
  from .workflows import generate_chat_name
6
  from .background_tasks import schedule_update
 
7
 
8
- __all__ = ["generate_chat_name", "schedule_update"]
 
4
 
5
  from .workflows import generate_chat_name
6
  from .background_tasks import schedule_update
7
+ from .background_tasks import run_update
8
 
9
+ __all__ = ["generate_chat_name", "schedule_update", "run_update"]
app/api/scraper/services/background_tasks.py CHANGED
@@ -3,12 +3,7 @@ import asyncio
3
  from apscheduler.schedulers.asyncio import AsyncIOScheduler # type: ignore
4
  from apscheduler.triggers.cron import CronTrigger # type: ignore
5
 
6
-
7
- async def run_update():
8
- """
9
- Run periodic update tasks.
10
- """
11
- print("Running scheduled update task...")
12
 
13
 
14
  async def schedule_update():
@@ -20,4 +15,12 @@ async def schedule_update():
20
 
21
  scheduler.start()
22
  while True:
23
- await asyncio.sleep(3600)
 
 
 
 
 
 
 
 
 
3
  from apscheduler.schedulers.asyncio import AsyncIOScheduler # type: ignore
4
  from apscheduler.triggers.cron import CronTrigger # type: ignore
5
 
6
+ from .parser import EquinixParser, AmazonParser, LinkedinParser
 
 
 
 
 
7
 
8
 
9
  async def schedule_update():
 
15
 
16
  scheduler.start()
17
  while True:
18
+ await asyncio.sleep(3600)
19
+
20
+
21
async def run_update():
    """Run one full scrape cycle: launch every parser concurrently and
    wait until all of them finish."""
    tasks = [
        EquinixParser().run(),
        AmazonParser().run(),
        LinkedinParser().run(),
    ]
    await asyncio.gather(*tasks)
app/api/scraper/services/parser/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
- from .parser import Parser
2
 
3
- __all__ = ["Parser"]
 
1
+ from .parsers import EquinixParser, AmazonParser, LinkedinParser, GoogleCareersParser
2
 
3
+ __all__ = ["EquinixParser", "AmazonParser", "LinkedinParser", "GoogleCareersParser"]
app/api/scraper/services/parser/equinix.json ADDED
The diff for this file is too large to render. See raw diff
 
app/api/scraper/services/parser/intelligent_retry.py DELETED
@@ -1,594 +0,0 @@
1
- """
2
- Intelligent Retry System with Exponential Backoff and Circuit Breakers
3
- Provides domain-specific retry policies with health monitoring and adaptive rate limiting
4
- """
5
-
6
- import asyncio
7
- import logging
8
- import random
9
- import time
10
- from collections import deque
11
- from dataclasses import dataclass, field
12
- from datetime import datetime, timedelta
13
- from enum import Enum
14
- from typing import Dict, List, Optional, Any, Callable, Tuple
15
- from urllib.parse import urlparse
16
-
17
- import httpx
18
-
19
- logger = logging.getLogger(__name__)
20
-
21
-
22
class RetryReason(Enum):
    """Reasons for retry attempts, as assigned by ``_classify_error``."""
    NETWORK_ERROR = "network_error"        # connection/"network" errors (also the fallback)
    TIMEOUT = "timeout"                    # error message mentions a timeout
    HTTP_ERROR = "http_error"              # other 4xx responses
    RATE_LIMIT = "rate_limit"              # HTTP 429
    SERVER_ERROR = "server_error"          # 5xx responses
    PARSING_ERROR = "parsing_error"        # NOTE(review): not produced by _classify_error in this view
    AUTHENTICATION_ERROR = "auth_error"    # 403 without bot/Cloudflare markers
    CLOUDFLARE_BLOCK = "cloudflare_block"  # 403 whose message mentions cloudflare/cf-ray
    BOT_DETECTION = "bot_detection"        # 403 whose message mentions bot/captcha
33
-
34
-
35
class CircuitState(Enum):
    """Circuit breaker states (see ``CircuitBreaker`` for the transitions)."""
    CLOSED = "closed"  # Normal operation
    OPEN = "open"  # Failing - block requests
    HALF_OPEN = "half_open"  # Testing - allow limited requests
40
-
41
-
42
@dataclass
class RetryConfig:
    """Configuration for retry behavior."""
    max_attempts: int = 5                 # total tries, including the first attempt
    initial_delay: float = 1.0            # seconds before the first retry
    max_delay: float = 60.0               # ceiling applied to each backoff delay
    backoff_multiplier: float = 2.0       # exponential growth factor per attempt
    jitter: bool = True                   # randomize delays to avoid thundering herd
    jitter_range: Tuple[float, float] = (0.1, 0.3)  # extra-delay fraction drawn uniformly

    # HTTP-specific settings
    retry_http_codes: List[int] = field(default_factory=lambda: [500, 502, 503, 504, 429])
    timeout: float = 30.0                 # per-request timeout in seconds

    # Domain-specific overrides
    # NOTE(review): not read anywhere in this view — confirm it is still used.
    domain_overrides: Dict[str, Dict[str, Any]] = field(default_factory=dict)
58
-
59
-
60
@dataclass
class CircuitBreakerConfig:
    """Configuration for circuit breaker."""
    failure_threshold: int = 5      # accumulated failures (successes decrement) before tripping OPEN
    recovery_timeout: float = 60.0  # seconds to stay OPEN before allowing a HALF_OPEN probe
    success_threshold: int = 3  # For half-open -> closed transition
    timeout: float = 30.0           # NOTE(review): not read by CircuitBreaker in this view — confirm
67
-
68
-
69
@dataclass
class RetryAttempt:
    """Information about a single failed attempt, for logging/diagnostics."""
    attempt: int                         # 1-based attempt number
    reason: RetryReason                  # classified cause of the failure
    delay: float                         # backoff applied after this attempt (0 on the last)
    timestamp: datetime                  # when the attempt failed
    error: Optional[str] = None          # error message, truncated to 200 chars
    response_code: Optional[int] = None  # HTTP status, when one was available
79
-
80
@dataclass
class DomainHealth:
    """Running request metrics for one scraped domain."""

    domain: str
    success_count: int = 0
    failure_count: int = 0
    total_requests: int = 0
    average_response_time: float = 0.0
    last_success: Optional[datetime] = None
    last_failure: Optional[datetime] = None
    consecutive_failures: int = 0
    recent_response_times: deque = field(default_factory=lambda: deque(maxlen=100))

    @property
    def success_rate(self) -> float:
        """Fraction of requests that succeeded; 1.0 before any traffic."""
        if not self.total_requests:
            return 1.0
        return self.success_count / self.total_requests

    @property
    def health_score(self) -> float:
        """Weighted 0-1 score: 60% success rate, 30% latency, 10% recency."""
        score = self.success_rate * 0.6

        # Latency component: 0s earns the full 0.3, 10s or slower earns nothing.
        score += max(0, 1 - (self.average_response_time / 10)) * 0.3

        # Recency component: a failure newer than the last success discounts
        # the final 0.1, recovering linearly over 24 hours.
        recency = 0.1
        if self.last_failure and self.last_success and self.last_failure > self.last_success:
            hours_since_failure = (datetime.now() - self.last_failure).total_seconds() / 3600
            recency *= min(1.0, hours_since_failure / 24)

        return score + recency
123
-
124
-
125
class UserAgentRotator:
    """Cycles through a fixed pool of real browser user-agent strings so
    consecutive requests do not all present the same identity."""

    USER_AGENTS = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:120.0) Gecko/20100101 Firefox/120.0',
        'Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/120.0.0.0 Safari/537.36',
    ]

    def __init__(self):
        # Index of the next agent to hand out.
        self.current_index = 0

    def get_user_agent(self) -> str:
        """Return the next agent in round-robin order, wrapping at the end."""
        idx = self.current_index
        self.current_index = (idx + 1) % len(self.USER_AGENTS)
        return self.USER_AGENTS[idx]

    def get_random_user_agent(self) -> str:
        """Return a uniformly random agent from the pool."""
        return random.choice(self.USER_AGENTS)
152
-
153
-
154
class CircuitBreaker:
    """Per-domain circuit breaker.

    Trips OPEN after repeated failures, probes via HALF_OPEN once the
    recovery timeout has elapsed, and closes again after enough probe
    requests succeed.
    """

    def __init__(self, config: CircuitBreakerConfig):
        self.config = config
        self.state = CircuitState.CLOSED
        self.failure_count = 0
        self.success_count = 0
        self.last_failure_time: Optional[datetime] = None
        self.next_attempt_time: Optional[datetime] = None

    def can_execute(self) -> bool:
        """Return whether a request may be sent right now."""
        if self.state == CircuitState.CLOSED:
            return True
        if self.state == CircuitState.HALF_OPEN:
            return True
        if self.state == CircuitState.OPEN:
            cooled_down = (
                self.next_attempt_time is not None
                and datetime.now() >= self.next_attempt_time
            )
            if cooled_down:
                # Cool-down elapsed: allow a limited probe phase.
                self.state = CircuitState.HALF_OPEN
                self.success_count = 0
                return True
            return False
        return False

    def record_success(self):
        """Note a successful request and relax the breaker accordingly."""
        if self.state == CircuitState.HALF_OPEN:
            self.success_count += 1
            # Enough successful probes: close the circuit again.
            if self.success_count >= self.config.success_threshold:
                self.state = CircuitState.CLOSED
                self.failure_count = 0
        elif self.state == CircuitState.CLOSED:
            # Gradual recovery: each success forgives one past failure.
            self.failure_count = max(0, self.failure_count - 1)

    def record_failure(self):
        """Note a failed request, tripping the breaker when warranted."""
        self.failure_count += 1
        now = datetime.now()
        self.last_failure_time = now
        # Trip when the threshold is reached, and also on any failure during
        # the HALF_OPEN probe phase.
        if self.failure_count >= self.config.failure_threshold or self.state == CircuitState.HALF_OPEN:
            self.state = CircuitState.OPEN
            self.next_attempt_time = now + timedelta(seconds=self.config.recovery_timeout)

    @property
    def status(self) -> Dict[str, Any]:
        """Snapshot of breaker state for monitoring/debugging."""
        next_at = self.next_attempt_time
        return {
            'state': self.state.value,
            'failure_count': self.failure_count,
            'success_count': self.success_count,
            'next_attempt_time': next_at.isoformat() if next_at else None
        }
213
-
214
-
215
- class IntelligentRetryManager:
216
- """
217
- Intelligent retry system with domain-specific policies, circuit breakers, and adaptive rate limiting
218
- """
219
-
220
- def __init__(self):
221
- self.retry_configs: Dict[str, RetryConfig] = {}
222
- self.circuit_breakers: Dict[str, CircuitBreaker] = {}
223
- self.domain_health: Dict[str, DomainHealth] = {}
224
- self.user_agent_rotator = UserAgentRotator()
225
- self.rate_limiters: Dict[str, 'RateLimiter'] = {}
226
-
227
- # Initialize default configurations
228
- self._initialize_domain_configs()
229
-
230
- def _initialize_domain_configs(self):
231
- """Initialize domain-specific configurations"""
232
-
233
- # Workday ATS sites - more aggressive retry
234
- workday_config = RetryConfig(
235
- max_attempts=7,
236
- initial_delay=2.0,
237
- max_delay=120.0,
238
- backoff_multiplier=2.5,
239
- timeout=45.0,
240
- retry_http_codes=[500, 502, 503, 504, 429, 403, 408]
241
- )
242
-
243
- workday_domains = [
244
- 'equinix.wd1.myworkdayjobs.com',
245
- 'globalswitch.wd103.myworkdayjobs.com',
246
- 'cyrusone.wd5.myworkdayjobs.com'
247
- ]
248
-
249
- for domain in workday_domains:
250
- self.retry_configs[domain] = workday_config
251
- self.circuit_breakers[domain] = CircuitBreaker(
252
- CircuitBreakerConfig(failure_threshold=8, recovery_timeout=90.0)
253
- )
254
-
255
- # Large tech companies - conservative retry
256
- tech_config = RetryConfig(
257
- max_attempts=4,
258
- initial_delay=3.0,
259
- max_delay=60.0,
260
- timeout=30.0,
261
- retry_http_codes=[500, 502, 503, 504, 429]
262
- )
263
-
264
- tech_domains = [
265
- 'amazon.jobs',
266
- 'careers.google.com',
267
- 'jobs.careers.microsoft.com'
268
- ]
269
-
270
- for domain in tech_domains:
271
- self.retry_configs[domain] = tech_config
272
- self.circuit_breakers[domain] = CircuitBreaker(
273
- CircuitBreakerConfig(failure_threshold=3, recovery_timeout=120.0)
274
- )
275
-
276
- # European datacenter companies - balanced approach
277
- eu_config = RetryConfig(
278
- max_attempts=6,
279
- initial_delay=1.5,
280
- max_delay=90.0,
281
- timeout=35.0,
282
- retry_http_codes=[500, 502, 503, 504, 429, 408]
283
- )
284
-
285
- eu_domains = [
286
- 'career.hetzner.com',
287
- 'careers.ovhcloud.com',
288
- 'scaleway.com',
289
- 'careers.digitalrealty.com',
290
- 'data4group.com',
291
- 'careers.ironmountain.com',
292
- 'datacenters.ntt'
293
- ]
294
-
295
- for domain in eu_domains:
296
- self.retry_configs[domain] = eu_config
297
- self.circuit_breakers[domain] = CircuitBreaker(
298
- CircuitBreakerConfig(failure_threshold=5, recovery_timeout=60.0)
299
- )
300
-
301
- def _get_domain(self, url: str) -> str:
302
- """Extract domain from URL"""
303
- return urlparse(url).netloc.lower()
304
-
305
- def _get_config(self, domain: str) -> RetryConfig:
306
- """Get retry configuration for domain"""
307
- return self.retry_configs.get(domain, RetryConfig())
308
-
309
- def _get_circuit_breaker(self, domain: str) -> CircuitBreaker:
310
- """Get or create circuit breaker for domain"""
311
- if domain not in self.circuit_breakers:
312
- self.circuit_breakers[domain] = CircuitBreaker(CircuitBreakerConfig())
313
- return self.circuit_breakers[domain]
314
-
315
- def _get_domain_health(self, domain: str) -> DomainHealth:
316
- """Get or create domain health tracker"""
317
- if domain not in self.domain_health:
318
- self.domain_health[domain] = DomainHealth(domain=domain)
319
- return self.domain_health[domain]
320
-
321
- def _calculate_delay(self, attempt: int, config: RetryConfig, reason: RetryReason) -> float:
322
- """Calculate retry delay with exponential backoff and jitter"""
323
-
324
- # Base exponential backoff
325
- delay = config.initial_delay * (config.backoff_multiplier ** (attempt - 1))
326
- delay = min(delay, config.max_delay)
327
-
328
- # Reason-specific adjustments
329
- if reason == RetryReason.RATE_LIMIT:
330
- delay *= 2.0 # Longer delays for rate limiting
331
- elif reason == RetryReason.CLOUDFLARE_BLOCK:
332
- delay *= 3.0 # Much longer for Cloudflare
333
- elif reason == RetryReason.BOT_DETECTION:
334
- delay *= 4.0 # Longest delays for bot detection
335
-
336
- # Add jitter to avoid thundering herd
337
- if config.jitter:
338
- jitter_min, jitter_max = config.jitter_range
339
- jitter_factor = 1.0 + random.uniform(jitter_min, jitter_max)
340
- delay *= jitter_factor
341
-
342
- return delay
343
-
344
- def _classify_error(self, error: Exception, response_code: Optional[int] = None) -> RetryReason:
345
- """Classify error to determine retry strategy"""
346
-
347
- if response_code:
348
- if response_code == 429:
349
- return RetryReason.RATE_LIMIT
350
- elif response_code == 403:
351
- error_msg = str(error).lower()
352
- if 'cloudflare' in error_msg or 'cf-ray' in error_msg:
353
- return RetryReason.CLOUDFLARE_BLOCK
354
- elif 'bot' in error_msg or 'captcha' in error_msg:
355
- return RetryReason.BOT_DETECTION
356
- else:
357
- return RetryReason.AUTHENTICATION_ERROR
358
- elif response_code >= 500:
359
- return RetryReason.SERVER_ERROR
360
- elif response_code >= 400:
361
- return RetryReason.HTTP_ERROR
362
-
363
- error_msg = str(error).lower()
364
- if 'timeout' in error_msg:
365
- return RetryReason.TIMEOUT
366
- elif 'connection' in error_msg or 'network' in error_msg:
367
- return RetryReason.NETWORK_ERROR
368
- else:
369
- return RetryReason.NETWORK_ERROR
370
-
371
- def _should_retry(self, error: Exception, attempt: int, config: RetryConfig,
372
- response_code: Optional[int] = None) -> bool:
373
- """Determine if request should be retried"""
374
-
375
- if attempt >= config.max_attempts:
376
- return False
377
-
378
- reason = self._classify_error(error, response_code)
379
-
380
- # Don't retry authentication errors (except bot detection)
381
- if reason == RetryReason.AUTHENTICATION_ERROR:
382
- return False
383
-
384
- # Always retry server errors and network issues
385
- if reason in [RetryReason.SERVER_ERROR, RetryReason.NETWORK_ERROR, RetryReason.TIMEOUT]:
386
- return True
387
-
388
- # Retry rate limits and bot detection with longer delays
389
- if reason in [RetryReason.RATE_LIMIT, RetryReason.CLOUDFLARE_BLOCK, RetryReason.BOT_DETECTION]:
390
- return True
391
-
392
- # Check HTTP codes
393
- if response_code and response_code in config.retry_http_codes:
394
- return True
395
-
396
- return False
397
-
398
- def _update_domain_health(self, domain: str, success: bool, response_time: float,
399
- error: Optional[Exception] = None):
400
- """Update domain health metrics"""
401
- health = self._get_domain_health(domain)
402
- health.total_requests += 1
403
- health.recent_response_times.append(response_time)
404
-
405
- if success:
406
- health.success_count += 1
407
- health.last_success = datetime.now()
408
- health.consecutive_failures = 0
409
- else:
410
- health.failure_count += 1
411
- health.last_failure = datetime.now()
412
- health.consecutive_failures += 1
413
-
414
- # Update average response time
415
- if health.recent_response_times:
416
- health.average_response_time = sum(health.recent_response_times) / len(health.recent_response_times)
417
-
418
- async def execute_with_retry(self, func: Callable, url: str, *args, **kwargs) -> Any:
419
- """
420
- Execute function with intelligent retry logic
421
-
422
- Args:
423
- func: Async function to execute
424
- url: URL being accessed (for domain-specific policies)
425
- *args, **kwargs: Arguments to pass to function
426
-
427
- Returns:
428
- Result of successful function execution
429
-
430
- Raises:
431
- Exception: If all retry attempts fail
432
- """
433
-
434
- domain = self._get_domain(url)
435
- config = self._get_config(domain)
436
- circuit_breaker = self._get_circuit_breaker(domain)
437
- attempts = []
438
-
439
- # Check circuit breaker
440
- if not circuit_breaker.can_execute():
441
- raise Exception(
442
- f"Circuit breaker OPEN for domain {domain}. Next attempt at: {circuit_breaker.next_attempt_time}")
443
-
444
- for attempt in range(1, config.max_attempts + 1):
445
- start_time = time.time()
446
-
447
- try:
448
- # Rotate user agent for each attempt
449
- if hasattr(kwargs, 'headers'):
450
- kwargs['headers'] = kwargs.get('headers', {})
451
- kwargs['headers']['User-Agent'] = self.user_agent_rotator.get_user_agent()
452
-
453
- # Execute function
454
- result = await func(*args, **kwargs)
455
-
456
- # Record success
457
- response_time = time.time() - start_time
458
- self._update_domain_health(domain, True, response_time)
459
- circuit_breaker.record_success()
460
-
461
- logger.info(f"Request succeeded for {domain} on attempt {attempt}")
462
- return result
463
-
464
- except Exception as error:
465
- response_time = time.time() - start_time
466
- response_code = getattr(error, 'status_code', None) or getattr(error, 'status', None) or getattr(error, 'code', None)
467
- reason = self._classify_error(error, response_code)
468
-
469
- # Record failure
470
- self._update_domain_health(domain, False, response_time, error)
471
- circuit_breaker.record_failure()
472
-
473
- # Log attempt
474
- attempts.append(RetryAttempt(
475
- attempt=attempt,
476
- reason=reason,
477
- delay=0, # Will be set below
478
- timestamp=datetime.now(),
479
- error=str(error)[:200], # Truncate error message
480
- response_code=response_code
481
- ))
482
-
483
- logger.warning(
484
- f"Attempt {attempt}/{config.max_attempts} failed for {domain}: {reason.value} - {str(error)[:100]}")
485
-
486
- # Check if should retry
487
- if not self._should_retry(error, attempt, config, response_code):
488
- logger.error(f"Not retrying {domain} due to error type: {reason.value}")
489
- raise error
490
-
491
- # Don't delay on last attempt
492
- if attempt < config.max_attempts:
493
- delay = self._calculate_delay(attempt, config, reason)
494
- attempts[-1].delay = delay
495
-
496
- logger.info(f"Retrying {domain} in {delay:.2f} seconds (reason: {reason.value})")
497
- await asyncio.sleep(delay)
498
-
499
- # All attempts failed
500
- health = self._get_domain_health(domain)
501
- error_msg = f"All {config.max_attempts} retry attempts failed for {domain}. Health score: {health.health_score:.2f}"
502
- logger.error(error_msg)
503
-
504
- # Re-raise the last error
505
- if attempts:
506
- raise Exception(f"{error_msg}. Last error: {attempts[-1].error}")
507
- else:
508
- raise Exception(error_msg)
509
-
510
- async def execute_http_request(self, client: httpx.AsyncClient, method: str, url: str,
511
- **kwargs) -> httpx.Response:
512
- """
513
- Execute HTTP request with retry logic
514
-
515
- Args:
516
- client: httpx async client
517
- method: HTTP method (GET, POST, etc.)
518
- url: Request URL
519
- **kwargs: Additional request parameters
520
-
521
- Returns:
522
- Response object
523
- """
524
-
525
- async def make_request():
526
- timeout_value = self._get_config(self._get_domain(url)).timeout
527
- timeout = kwargs.get('timeout', httpx.Timeout(timeout_value))
528
- kwargs['timeout'] = timeout
529
-
530
- response = await client.request(method, url, **kwargs)
531
- response.raise_for_status()
532
- return response
533
-
534
- return await self.execute_with_retry(make_request, url)
535
-
536
- def get_domain_stats(self) -> Dict[str, Dict[str, Any]]:
537
- """Get statistics for all domains"""
538
- stats = {}
539
-
540
- for domain, health in self.domain_health.items():
541
- circuit_breaker = self._get_circuit_breaker(domain)
542
-
543
- stats[domain] = {
544
- 'health_score': round(health.health_score, 3),
545
- 'success_rate': round(health.success_rate, 3),
546
- 'total_requests': health.total_requests,
547
- 'success_count': health.success_count,
548
- 'failure_count': health.failure_count,
549
- 'average_response_time': round(health.average_response_time, 2),
550
- 'consecutive_failures': health.consecutive_failures,
551
- 'last_success': health.last_success.isoformat() if health.last_success else None,
552
- 'last_failure': health.last_failure.isoformat() if health.last_failure else None,
553
- 'circuit_breaker': circuit_breaker.status
554
- }
555
-
556
- return stats
557
-
558
- def reset_domain_health(self, domain: Optional[str] = None):
559
- """Reset health tracking for a domain or all domains"""
560
- if domain:
561
- if domain in self.domain_health:
562
- self.domain_health[domain] = DomainHealth(domain=domain)
563
- if domain in self.circuit_breakers:
564
- config = CircuitBreakerConfig() # Default config
565
- self.circuit_breakers[domain] = CircuitBreaker(config)
566
- else:
567
- self.domain_health.clear()
568
- self.circuit_breakers.clear()
569
- self._initialize_domain_configs()
570
-
571
-
572
class RateLimiter:
    """Simple sliding-window rate limiter for respecting site limits.

    Keeps a deque of recent request timestamps (bounded by
    ``requests_per_minute``) and sleeps when the 60-second window is full.
    """

    def __init__(self, requests_per_minute: int = 60):
        self.requests_per_minute = requests_per_minute
        # Timestamps of the most recent requests; maxlen bounds memory use.
        self.request_times: deque = deque(maxlen=requests_per_minute)

    async def acquire(self):
        """Acquire a rate-limit slot, sleeping until one frees up."""
        now = time.time()

        # Drop timestamps that have aged out of the 60-second window.
        while self.request_times and now - self.request_times[0] > 60:
            self.request_times.popleft()

        # If the window is full, wait until the oldest request expires.
        if len(self.request_times) >= self.requests_per_minute:
            wait_time = 60 - (now - self.request_times[0]) + 0.1  # small buffer
            if wait_time > 0:
                await asyncio.sleep(wait_time)
                # BUGFIX: refresh the timestamp after sleeping so the recorded
                # request time reflects when the request actually proceeds,
                # not when we started waiting.
                now = time.time()

        # Record this request
        self.request_times.append(now)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/api/scraper/services/parser/parser.py DELETED
@@ -1,935 +0,0 @@
1
- """
2
- Enhanced Multi-Engine Web Scraper
3
- Combines BeautifulSoup, Playwright, and intelligent fallback strategies for robust data extraction
4
- """
5
-
6
- import asyncio
7
- import hashlib
8
- import logging
9
- import re
10
- import time
11
- import traceback
12
- from dataclasses import dataclass, field
13
- from datetime import datetime
14
- from enum import Enum
15
- from typing import Dict, List, Optional, Any
16
- from urllib.parse import urljoin, urlparse
17
-
18
- import httpx
19
- import html2text
20
- from bs4 import BeautifulSoup, Tag
21
- from playwright.async_api import async_playwright, Browser, Page, BrowserContext
22
- from selectolax.parser import HTMLParser
23
-
24
- from app.api.scraper.services.parser.intelligent_retry import IntelligentRetryManager
25
-
26
- logger = logging.getLogger(__name__)
27
-
28
-
29
class ScrapingEngine(Enum):
    """Available scraping engines the parser can dispatch to."""
    BEAUTIFULSOUP = "beautifulsoup"  # static HTML via httpx + BeautifulSoup
    PLAYWRIGHT = "playwright"        # headless Chromium for JS-heavy pages
    SELECTOLAX = "selectolax"        # fast selectolax HTML parsing
    HYBRID = "hybrid"                # static fetch first, escalate to browser
-
36
-
37
class ContentType(Enum):
    """Types of web content the scraper distinguishes."""
    STATIC_HTML = "static_html"          # server-rendered HTML, no JS required
    SPA_REACT = "spa_react"              # React single-page application
    SPA_ANGULAR = "spa_angular"          # Angular single-page application
    SPA_VUE = "spa_vue"                  # Vue single-page application
    DYNAMIC_AJAX = "dynamic_ajax"        # content loaded via AJAX after page load
    INFINITE_SCROLL = "infinite_scroll"  # listings appended while scrolling
    WORKDAY_ATS = "workday_ats"          # Workday applicant-tracking system
    GREENHOUSE_ATS = "greenhouse_ats"    # Greenhouse applicant-tracking system
    LEVER_ATS = "lever_ats"              # Lever applicant-tracking system
-
49
-
50
@dataclass
class SiteProfile:
    """Site-specific scraping configuration: engine choice, CSS selectors
    and politeness settings for one careers site."""
    domain: str                          # host this profile applies to
    name: str                            # human-readable site name
    content_type: ContentType            # expected content category
    preferred_engine: ScrapingEngine     # engine tried first for this site
    selectors: Dict[str, str] = field(default_factory=dict)       # CSS selectors by role
    wait_conditions: List[str] = field(default_factory=list)      # Playwright load states to await
    javascript_required: bool = False    # page needs JS execution to render jobs
    rate_limit_delay: float = 1.0        # seconds to sleep before each fetch
    custom_headers: Dict[str, str] = field(default_factory=dict)  # extra HTTP headers
    login_required: bool = False         # NOTE(review): set but not acted on in visible code
    dynamic_loading: bool = False        # content arrives after initial load
    infinite_scroll: bool = False        # listings load while scrolling
    spa_detection_keywords: List[str] = field(default_factory=list)  # framework fingerprints
66
-
67
-
68
@dataclass
class ScrapingResult:
    """Outcome of a single scraping operation."""
    success: bool                        # whether any engine succeeded
    engine_used: ScrapingEngine          # engine that produced this result
    content: Optional[str] = None        # raw HTML snapshot, when captured
    jobs: List[Dict[str, Any]] = field(default_factory=list)      # extracted job records
    metadata: Dict[str, Any] = field(default_factory=dict)        # page title, final URL, content type
    errors: List[str] = field(default_factory=list)               # accumulated error messages
    performance: Dict[str, float] = field(default_factory=dict)   # timing info (e.g. total_time)
    fallback_attempts: int = 0           # how many fallback engines were tried
79
-
80
-
81
- class Parser:
82
- """
83
- Enhanced multi-engine web scraper with intelligent fallback strategies
84
- Handles static sites, SPAs, and dynamic content with multiple parsing engines
85
- """
86
-
87
- def __init__(self, retry_manager: Optional[IntelligentRetryManager] = None):
88
- self.retry_manager = retry_manager or IntelligentRetryManager()
89
- self.site_profiles = self._initialize_site_profiles()
90
- self.browser: Optional[Browser] = None
91
- self.context: Optional[BrowserContext] = None
92
- self.client: Optional[httpx.AsyncClient] = None
93
- self.h2t = html2text.HTML2Text()
94
- self.h2t.ignore_links = False
95
- self.h2t.ignore_images = True
96
-
97
    def _initialize_site_profiles(self) -> Dict[str, SiteProfile]:
        """Initialize site-specific configurations from the 16 working URLs.

        Returns:
            Mapping of domain -> SiteProfile with the preferred engine,
            CSS selectors and politeness settings for each known site.
        """
        return {
            # Static HTML site; plain BeautifulSoup parsing is sufficient.
            "hetzner.com": SiteProfile(
                domain="hetzner.com",
                name="Hetzner",
                content_type=ContentType.STATIC_HTML,
                preferred_engine=ScrapingEngine.BEAUTIFULSOUP,
                selectors={
                    "job_listings": ".job-offer",
                    "job_title": ".job-title",
                    "job_location": ".job-location",
                    "job_description": ".job-description",
                    "job_url": ".job-link"
                },
                wait_conditions=["networkidle"],
                rate_limit_delay=2.0
            ),

            # React SPA; requires a real browser and data-testid selectors.
            "careers.ovhcloud.com": SiteProfile(
                domain="careers.ovhcloud.com",
                name="OVHcloud",
                content_type=ContentType.SPA_REACT,
                preferred_engine=ScrapingEngine.PLAYWRIGHT,
                selectors={
                    "job_listings": "[data-testid='job-card']",
                    "job_title": "[data-testid='job-title']",
                    "job_location": "[data-testid='job-location']",
                    "job_description": "[data-testid='job-description']"
                },
                javascript_required=True,
                wait_conditions=["load", "networkidle"],
                spa_detection_keywords=["React", "react-dom"],
                rate_limit_delay=3.0
            ),

            # AJAX-driven listings; hybrid engine decides static vs browser.
            "scaleway.com": SiteProfile(
                domain="scaleway.com",
                name="Scaleway",
                content_type=ContentType.DYNAMIC_AJAX,
                preferred_engine=ScrapingEngine.HYBRID,
                selectors={
                    "job_listings": ".job-item",
                    "job_title": ".position-title",
                    "job_location": ".position-location"
                },
                javascript_required=True,
                dynamic_loading=True,
                rate_limit_delay=2.5
            ),

            # Workday ATS instances share data-automation-id selectors.
            "equinix.wd1.myworkdayjobs.com": SiteProfile(
                domain="equinix.wd1.myworkdayjobs.com",
                name="Equinix",
                content_type=ContentType.WORKDAY_ATS,
                preferred_engine=ScrapingEngine.PLAYWRIGHT,
                selectors={
                    "job_listings": "[data-automation-id='jobTitle']",
                    "job_title": "[data-automation-id='jobTitle'] a",
                    "job_location": "[data-automation-id='locations']"
                },
                javascript_required=True,
                wait_conditions=["load", "networkidle", "domcontentloaded"],
                rate_limit_delay=4.0,
                custom_headers={
                    "Accept": "application/json, text/plain, */*",
                    "Accept-Language": "en-US,en;q=0.9"
                }
            ),

            "careers.digitalrealty.com": SiteProfile(
                domain="careers.digitalrealty.com",
                name="Digital Realty",
                content_type=ContentType.STATIC_HTML,
                preferred_engine=ScrapingEngine.BEAUTIFULSOUP,
                selectors={
                    "job_listings": ".job-result",
                    "job_title": ".job-title a",
                    "job_location": ".job-location"
                },
                rate_limit_delay=2.0
            ),

            "globalswitch.wd103.myworkdayjobs.com": SiteProfile(
                domain="globalswitch.wd103.myworkdayjobs.com",
                name="Global Switch",
                content_type=ContentType.WORKDAY_ATS,
                preferred_engine=ScrapingEngine.PLAYWRIGHT,
                selectors={
                    "job_listings": "[data-automation-id='jobTitle']",
                    "job_title": "[data-automation-id='jobTitle'] a",
                    "job_location": "[data-automation-id='locations']"
                },
                javascript_required=True,
                wait_conditions=["load", "networkidle"],
                rate_limit_delay=4.0
            ),

            # React SPA with infinite scroll on search results.
            "amazon.jobs": SiteProfile(
                domain="amazon.jobs",
                name="AWS",
                content_type=ContentType.SPA_REACT,
                preferred_engine=ScrapingEngine.PLAYWRIGHT,
                selectors={
                    "job_listings": "[data-test='job-tile']",
                    "job_title": "[data-test='job-title']",
                    "job_location": "[data-test='job-location']"
                },
                javascript_required=True,
                dynamic_loading=True,
                infinite_scroll=True,
                spa_detection_keywords=["React", "react-router"],
                rate_limit_delay=3.0
            ),

            "careers.google.com": SiteProfile(
                domain="careers.google.com",
                name="Google",
                content_type=ContentType.SPA_ANGULAR,
                preferred_engine=ScrapingEngine.PLAYWRIGHT,
                selectors={
                    "job_listings": ".gc-card",
                    "job_title": ".gc-card__title",
                    "job_location": ".gc-card__location"
                },
                javascript_required=True,
                wait_conditions=["load", "networkidle"],
                spa_detection_keywords=["Angular", "ng-"],
                rate_limit_delay=3.5
            ),

            "jobs.careers.microsoft.com": SiteProfile(
                domain="jobs.careers.microsoft.com",
                name="Microsoft",
                content_type=ContentType.SPA_REACT,
                preferred_engine=ScrapingEngine.PLAYWRIGHT,
                selectors={
                    "job_listings": "[data-ph-at-id='job-search-result-item']",
                    "job_title": "[data-ph-at-id='job-title']",
                    "job_location": "[data-ph-at-id='job-location']"
                },
                javascript_required=True,
                dynamic_loading=True,
                rate_limit_delay=3.0
            ),

            "data4group.com": SiteProfile(
                domain="data4group.com",
                name="Data4",
                content_type=ContentType.STATIC_HTML,
                preferred_engine=ScrapingEngine.BEAUTIFULSOUP,
                selectors={
                    "job_listings": ".career-item",
                    "job_title": ".career-title",
                    "job_location": ".career-location"
                },
                rate_limit_delay=2.0
            ),

            # Static site parsed with the faster selectolax engine.
            "careers.ironmountain.com": SiteProfile(
                domain="careers.ironmountain.com",
                name="Iron Mountain",
                content_type=ContentType.STATIC_HTML,
                preferred_engine=ScrapingEngine.SELECTOLAX,
                selectors={
                    "job_listings": ".job-search-result",
                    "job_title": ".job-title-link",
                    "job_location": ".job-location"
                },
                rate_limit_delay=2.0
            ),

            "datacenters.ntt": SiteProfile(
                domain="datacenters.ntt",
                name="NTT Global Data Centers",
                content_type=ContentType.STATIC_HTML,
                preferred_engine=ScrapingEngine.BEAUTIFULSOUP,
                selectors={
                    "job_listings": ".career-listing",
                    "job_title": ".career-title",
                    "job_location": ".career-location"
                },
                rate_limit_delay=2.5
            ),

            "cyrusone.wd5.myworkdayjobs.com": SiteProfile(
                domain="cyrusone.wd5.myworkdayjobs.com",
                name="CyrusOne",
                content_type=ContentType.WORKDAY_ATS,
                preferred_engine=ScrapingEngine.PLAYWRIGHT,
                selectors={
                    "job_listings": "[data-automation-id='jobTitle']",
                    "job_title": "[data-automation-id='jobTitle'] a",
                    "job_location": "[data-automation-id='locations']"
                },
                javascript_required=True,
                wait_conditions=["load", "networkidle"],
                rate_limit_delay=4.0
            )
        }
297
-
298
    async def __aenter__(self):
        """Async context manager entry: start the HTTP client and browser."""
        await self._initialize_clients()
        return self
302
-
303
    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit: close all clients and the browser."""
        await self._cleanup_clients()
306
-
307
- async def _initialize_clients(self):
308
- """Initialize HTTP and browser clients"""
309
- # Initialize httpx client
310
- timeout = httpx.Timeout(timeout=30.0, connect=10.0)
311
- self.client = httpx.AsyncClient(
312
- timeout=timeout,
313
- headers={
314
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
315
- }
316
- )
317
-
318
- # Initialize Playwright browser
319
- self.playwright = await async_playwright().start()
320
- self.browser = await self.playwright.chromium.launch(
321
- headless=True,
322
- args=['--no-sandbox', '--disable-dev-shm-usage']
323
- )
324
- self.context = await self.browser.new_context(
325
- viewport={'width': 1920, 'height': 1080},
326
- user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
327
- )
328
-
329
- async def _cleanup_clients(self):
330
- """Clean up clients and resources"""
331
- if self.client:
332
- await self.client.aclose()
333
- if self.context:
334
- await self.context.close()
335
- if self.browser:
336
- await self.browser.close()
337
- if hasattr(self, 'playwright'):
338
- await self.playwright.stop()
339
-
340
- def _get_site_profile(self, url: str) -> Optional[SiteProfile]:
341
- """Get site profile for URL"""
342
- domain = urlparse(url).netloc.lower()
343
-
344
- # Direct domain match
345
- if domain in self.site_profiles:
346
- return self.site_profiles[domain]
347
-
348
- # Partial domain match
349
- for profile_domain, profile in self.site_profiles.items():
350
- if profile_domain in domain or domain in profile_domain:
351
- return profile
352
-
353
- return None
354
-
355
- async def _detect_content_type(self, url: str, html: str = None) -> ContentType:
356
- """Detect the type of content on a webpage"""
357
- if html:
358
- soup = BeautifulSoup(html, 'html.parser')
359
-
360
- # Check for SPA frameworks
361
- scripts = soup.find_all('script')
362
- script_content = ' '.join([s.get_text() for s in scripts if s.get_text()])
363
-
364
- if 'React' in script_content or 'react-dom' in script_content:
365
- return ContentType.SPA_REACT
366
- elif 'Angular' in script_content or 'ng-app' in html:
367
- return ContentType.SPA_ANGULAR
368
- elif 'Vue' in script_content or 'vue.js' in html:
369
- return ContentType.SPA_VUE
370
- elif 'workday' in url.lower():
371
- return ContentType.WORKDAY_ATS
372
- elif 'greenhouse' in url.lower():
373
- return ContentType.GREENHOUSE_ATS
374
- elif 'lever' in url.lower():
375
- return ContentType.LEVER_ATS
376
- elif soup.find_all(attrs={'data-testid': True}) or soup.find_all(attrs={'data-cy': True}):
377
- return ContentType.DYNAMIC_AJAX
378
-
379
- return ContentType.STATIC_HTML
380
-
381
- async def scrape_jobs(self, url: str, max_pages: int = 5) -> ScrapingResult:
382
- """
383
- Main scraping method with intelligent engine selection and fallback
384
- """
385
- start_time = time.time()
386
- site_profile = self._get_site_profile(url)
387
-
388
- logger.info(f"Scraping jobs from {url} with profile: {site_profile.name if site_profile else 'Unknown'}")
389
-
390
- # Try preferred engine first
391
- if site_profile:
392
- result = await self._scrape_with_engine(url, site_profile.preferred_engine, site_profile, max_pages)
393
- if result.success:
394
- result.performance['total_time'] = time.time() - start_time
395
- return result
396
-
397
- # Fallback strategy: try engines in order of complexity
398
- fallback_engines = [
399
- ScrapingEngine.PLAYWRIGHT,
400
- ScrapingEngine.BEAUTIFULSOUP,
401
- ScrapingEngine.SELECTOLAX
402
- ]
403
-
404
- for i, engine in enumerate(fallback_engines):
405
- if site_profile and engine == site_profile.preferred_engine:
406
- continue # Already tried
407
-
408
- logger.info(f"Trying fallback engine {i + 1}/{len(fallback_engines)}: {engine.value}")
409
- result = await self._scrape_with_engine(url, engine, site_profile, max_pages)
410
- result.fallback_attempts = i + 1
411
-
412
- if result.success:
413
- result.performance['total_time'] = time.time() - start_time
414
- return result
415
-
416
- # All engines failed
417
- return ScrapingResult(
418
- success=False,
419
- engine_used=ScrapingEngine.HYBRID,
420
- errors=["All scraping engines failed"],
421
- performance={'total_time': time.time() - start_time},
422
- fallback_attempts=len(fallback_engines)
423
- )
424
-
425
- async def _scrape_with_engine(self, url: str, engine: ScrapingEngine,
426
- site_profile: Optional[SiteProfile], max_pages: int) -> ScrapingResult:
427
- """Scrape using specific engine"""
428
- try:
429
- if engine == ScrapingEngine.PLAYWRIGHT:
430
- return await self._scrape_with_playwright(url, site_profile, max_pages)
431
- elif engine == ScrapingEngine.BEAUTIFULSOUP:
432
- return await self._scrape_with_beautifulsoup(url, site_profile, max_pages)
433
- elif engine == ScrapingEngine.SELECTOLAX:
434
- return await self._scrape_with_selectolax(url, site_profile, max_pages)
435
- elif engine == ScrapingEngine.HYBRID:
436
- return await self._scrape_with_hybrid(url, site_profile, max_pages)
437
- except Exception as e:
438
- logger.error(f"Engine {engine.value} failed: {str(e)}")
439
- return ScrapingResult(
440
- success=False,
441
- engine_used=engine,
442
- errors=[f"Engine {engine.value} error: {str(e)}"]
443
- )
444
-
445
    async def _scrape_with_playwright(self, url: str, profile: Optional[SiteProfile], max_pages: int) -> ScrapingResult:
        """Scrape using Playwright for JavaScript-heavy sites.

        Opens a fresh page in the shared browser context, waits for the
        profile's load conditions, optionally drives infinite scroll, then
        extracts jobs either with the profile's selectors or heuristically.
        The page is always closed, even on failure.
        """
        page = await self.context.new_page()
        jobs = []
        errors = []

        try:
            # Set custom headers if provided
            if profile and profile.custom_headers:
                await page.set_extra_http_headers(profile.custom_headers)

            # Navigate; 4xx/5xx responses are treated as hard failures.
            response = await page.goto(url, wait_until='networkidle', timeout=30000)
            if response.status >= 400:
                raise Exception(f"HTTP {response.status}")

            # Wait for each load state the profile asks for, in order.
            if profile and profile.wait_conditions:
                for condition in profile.wait_conditions:
                    if condition == 'networkidle':
                        await page.wait_for_load_state('networkidle')
                    elif condition == 'load':
                        await page.wait_for_load_state('load')
                    elif condition == 'domcontentloaded':
                        await page.wait_for_load_state('domcontentloaded')

            # Drive lazy-loading listings to the bottom when configured.
            if profile and profile.infinite_scroll:
                await self._handle_infinite_scroll(page)

            # Snapshot the rendered DOM and classify it.
            html_content = await page.content()
            content_type = await self._detect_content_type(url, html_content)

            # Prefer site-specific selectors; fall back to heuristics.
            if profile and profile.selectors:
                jobs = await self._extract_jobs_with_selectors(page, profile.selectors, content_type)
            else:
                jobs = await self._extract_jobs_intelligent(page, content_type)

            return ScrapingResult(
                success=True,
                engine_used=ScrapingEngine.PLAYWRIGHT,
                content=html_content,
                jobs=jobs,
                metadata={
                    'content_type': content_type.value,
                    'page_title': await page.title(),
                    # Final URL may differ from the request URL after redirects.
                    'final_url': page.url
                }
            )

        except Exception as e:
            errors.append(f"Playwright error: {str(e)}")
            return ScrapingResult(
                success=False,
                engine_used=ScrapingEngine.PLAYWRIGHT,
                errors=errors
            )
        finally:
            await page.close()
505
-
506
- async def _scrape_with_beautifulsoup(self, url: str, profile: Optional[SiteProfile],
507
- max_pages: int) -> ScrapingResult:
508
- """Scrape using BeautifulSoup for static HTML"""
509
- jobs = []
510
- errors = []
511
-
512
- try:
513
- # Apply rate limiting
514
- if profile:
515
- await asyncio.sleep(profile.rate_limit_delay)
516
-
517
- headers = {}
518
- if profile and profile.custom_headers:
519
- headers.update(profile.custom_headers)
520
-
521
- response = await self.client.get(url, headers=headers)
522
- response.raise_for_status()
523
-
524
- html_content = response.text
525
- soup = BeautifulSoup(html_content, 'html.parser')
526
-
527
- content_type = await self._detect_content_type(url, html_content)
528
-
529
- if profile and profile.selectors:
530
- jobs = self._extract_jobs_beautifulsoup(soup, profile.selectors)
531
- else:
532
- jobs = self._extract_jobs_beautifulsoup_intelligent(soup)
533
-
534
- return ScrapingResult(
535
- success=True,
536
- engine_used=ScrapingEngine.BEAUTIFULSOUP,
537
- content=html_content,
538
- jobs=jobs,
539
- metadata={
540
- 'content_type': content_type.value,
541
- 'page_title': soup.title.string if soup.title else '',
542
- 'final_url': url
543
- }
544
- )
545
-
546
- except Exception as e:
547
- traceback.print_exc()
548
- return ScrapingResult(
549
- success=False,
550
- engine_used=ScrapingEngine.BEAUTIFULSOUP,
551
- errors=errors
552
- )
553
-
554
- async def _scrape_with_selectolax(self, url: str, profile: Optional[SiteProfile], max_pages: int) -> ScrapingResult:
555
- """Scrape using Selectolax for fast HTML parsing"""
556
- jobs = []
557
- errors = []
558
-
559
- try:
560
- if profile:
561
- await asyncio.sleep(profile.rate_limit_delay)
562
-
563
- headers = {}
564
- if profile and profile.custom_headers:
565
- headers.update(profile.custom_headers)
566
-
567
- response = await self.client.get(url, headers=headers)
568
- response.raise_for_status()
569
-
570
- html_content = response.text
571
- tree = HTMLParser(html_content)
572
-
573
- content_type = await self._detect_content_type(url, html_content)
574
-
575
- if profile and profile.selectors:
576
- jobs = self._extract_jobs_selectolax(tree, profile.selectors)
577
- else:
578
- jobs = self._extract_jobs_selectolax_intelligent(tree)
579
-
580
- return ScrapingResult(
581
- success=True,
582
- engine_used=ScrapingEngine.SELECTOLAX,
583
- content=html_content,
584
- jobs=jobs,
585
- metadata={
586
- 'content_type': content_type.value,
587
- 'final_url': url
588
- }
589
- )
590
-
591
- except Exception as e:
592
- errors.append(f"Selectolax error: {str(e)}")
593
- return ScrapingResult(
594
- success=False,
595
- engine_used=ScrapingEngine.SELECTOLAX,
596
- errors=errors
597
- )
598
-
599
- async def _scrape_with_hybrid(self, url: str, profile: Optional[SiteProfile], max_pages: int) -> ScrapingResult:
600
- """Hybrid approach combining multiple engines"""
601
- # First, get initial content with BeautifulSoup
602
- bs_result = await self._scrape_with_beautifulsoup(url, profile, max_pages)
603
-
604
- # If JavaScript is likely required, try Playwright
605
- if bs_result.success and bs_result.content:
606
- content_type = await self._detect_content_type(url, bs_result.content)
607
- if content_type in [ContentType.SPA_REACT, ContentType.SPA_ANGULAR, ContentType.SPA_VUE,
608
- ContentType.DYNAMIC_AJAX]:
609
- pw_result = await self._scrape_with_playwright(url, profile, max_pages)
610
- if pw_result.success:
611
- return pw_result
612
-
613
- return bs_result
614
-
615
- async def _handle_infinite_scroll(self, page: Page, max_scrolls: int = 5):
616
- """Handle infinite scroll pages"""
617
- for _ in range(max_scrolls):
618
- # Scroll to bottom
619
- await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
620
-
621
- # Wait for new content
622
- await asyncio.sleep(2)
623
-
624
- # Check if more content loaded
625
- prev_height = await page.evaluate("document.body.scrollHeight")
626
- await asyncio.sleep(1)
627
- new_height = await page.evaluate("document.body.scrollHeight")
628
-
629
- if prev_height == new_height:
630
- break
631
-
632
    async def _extract_jobs_with_selectors(self, page: Page, selectors: Dict[str, str], content_type: ContentType) -> \
            List[Dict[str, Any]]:
        """Extract jobs from a live Playwright page using explicit selectors.

        Waits for the listing container to appear, then pulls title, location,
        description and URL per listing. Listings without a title are dropped.
        Extraction errors are logged and result in a (possibly partial) list.
        """
        jobs = []

        try:
            # Wait for job listings to load before querying them.
            if selectors.get('job_listings'):
                await page.wait_for_selector(selectors['job_listings'], timeout=10000)

            # Get all job elements (default to a generic '.job' selector).
            job_elements = await page.query_selector_all(selectors.get('job_listings', '.job'))

            for element in job_elements:
                job_data = {}

                # Extract title
                if selectors.get('job_title'):
                    title_element = await element.query_selector(selectors['job_title'])
                    if title_element:
                        job_data['title'] = await title_element.inner_text()

                # Extract location
                if selectors.get('job_location'):
                    location_element = await element.query_selector(selectors['job_location'])
                    if location_element:
                        job_data['location'] = await location_element.inner_text()

                # Extract description
                if selectors.get('job_description'):
                    desc_element = await element.query_selector(selectors['job_description'])
                    if desc_element:
                        job_data['description'] = await desc_element.inner_text()

                # Extract URL, resolving relative hrefs against the page URL.
                if selectors.get('job_url'):
                    url_element = await element.query_selector(selectors['job_url'])
                    if url_element:
                        href = await url_element.get_attribute('href')
                        if href:
                            job_data['url'] = urljoin(page.url, href)

                if job_data.get('title'):  # Only add if we have at least a title
                    jobs.append(job_data)

        except Exception as e:
            logger.error(f"Error extracting jobs with selectors: {str(e)}")

        return jobs
681
-
682
- async def _extract_jobs_intelligent(self, page: Page, content_type: ContentType) -> List[Dict[str, Any]]:
683
- """Intelligent job extraction without specific selectors"""
684
- jobs = []
685
-
686
- try:
687
- # Get page content
688
- content = await page.content()
689
- soup = BeautifulSoup(content, 'html.parser')
690
-
691
- # Use intelligent extraction
692
- jobs = self._extract_jobs_beautifulsoup_intelligent(soup)
693
-
694
- except Exception as e:
695
- logger.error(f"Error in intelligent job extraction: {str(e)}")
696
-
697
- return jobs
698
-
699
- def _extract_jobs_beautifulsoup(self, soup: BeautifulSoup, selectors: Dict[str, str]) -> List[Dict[str, Any]]:
700
- """Extract jobs using BeautifulSoup with specific selectors"""
701
- jobs = []
702
-
703
- try:
704
- job_elements = soup.select(selectors.get('job_listings', '.job'))
705
-
706
- for element in job_elements:
707
- job_data = {}
708
-
709
- # Extract title
710
- if selectors.get('job_title'):
711
- title_element = element.select_one(selectors['job_title'])
712
- if title_element:
713
- job_data['title'] = title_element.get_text(strip=True)
714
-
715
- # Extract location
716
- if selectors.get('job_location'):
717
- location_element = element.select_one(selectors['job_location'])
718
- if location_element:
719
- job_data['location'] = location_element.get_text(strip=True)
720
-
721
- # Extract description
722
- if selectors.get('job_description'):
723
- desc_element = element.select_one(selectors['job_description'])
724
- if desc_element:
725
- job_data['description'] = desc_element.get_text(strip=True)
726
-
727
- # Extract URL
728
- if selectors.get('job_url'):
729
- url_element = element.select_one(selectors['job_url'])
730
- if url_element and url_element.get('href'):
731
- job_data['url'] = url_element['href']
732
-
733
- if job_data.get('title'):
734
- jobs.append(job_data)
735
-
736
- except Exception as e:
737
- logger.error(f"Error extracting jobs with BeautifulSoup: {str(e)}")
738
-
739
- return jobs
740
-
741
- def _extract_jobs_beautifulsoup_intelligent(self, soup: BeautifulSoup) -> List[Dict[str, Any]]:
742
- """Intelligent job extraction using common patterns"""
743
- jobs = []
744
-
745
- # Common job-related selectors to try
746
- job_selectors = [
747
- '.job', '.job-item', '.job-listing', '.job-card', '.job-result',
748
- '.position', '.opening', '.career-item', '.vacancy',
749
- '[data-testid*="job"]', '[data-cy*="job"]', '[class*="job"]'
750
- ]
751
-
752
- for selector in job_selectors:
753
- elements = soup.select(selector)
754
- if elements and len(elements) > 0:
755
- for element in elements[:20]: # Limit to 20 jobs per selector
756
- job_data = self._extract_single_job_intelligent(element)
757
- if job_data and job_data.get('title'):
758
- jobs.append(job_data)
759
-
760
- if jobs: # If we found jobs, return
761
- break
762
-
763
- return jobs
764
-
765
def _extract_single_job_intelligent(self, element: Tag) -> Dict[str, Any]:
    """Pull title/location/url/description out of a single job element.

    All fields are best-effort; any of them may be absent from the returned
    dict.
    """
    job: Dict[str, Any] = {}

    # Title: first heading/link/title-ish child whose text is long enough to
    # plausibly be a job title.
    for css in ('h1', 'h2', 'h3', 'h4', 'a', '.title', '.job-title', '.position-title'):
        node = element.select_one(css)
        if node is None:
            continue
        text = node.get_text(strip=True)
        if text and len(text) > 5:  # basic validation against icons/labels
            job['title'] = text
            break

    # Location: look for "City, COUNTRY" or "City, State" shaped substrings.
    full_text = element.get_text()
    for pattern in (r'\b[A-Z][a-z]+,\s*[A-Z]{2,}\b', r'\b[A-Z][a-z]+,\s*[A-Z][a-z]+\b'):
        hits = re.findall(pattern, full_text)
        if hits:
            job['location'] = hits[0]
            break

    # URL: first anchor carrying an href, if any.
    anchor = element.find('a')
    if anchor and anchor.get('href'):
        job['url'] = anchor['href']

    # Description: the element's whole text, truncated to keep payloads small.
    stripped = element.get_text(strip=True)
    if stripped:
        job['description'] = stripped[:500]

    return job
803
-
804
def _extract_jobs_selectolax(self, tree: HTMLParser, selectors: Dict[str, str]) -> List[Dict[str, Any]]:
    """Extract jobs from a Selectolax tree using caller-supplied CSS selectors.

    ``selectors`` may provide ``job_listings``, ``job_title``,
    ``job_location`` and ``job_url``; only entries with a title are kept.
    """
    results: List[Dict[str, Any]] = []
    try:
        for node in tree.css(selectors.get('job_listings', '.job')):
            job: Dict[str, Any] = {}

            def first_match(key: str):
                # Return the first node matched by the configured selector,
                # or None when the selector is absent or matches nothing.
                css = selectors.get(key)
                if not css:
                    return None
                hits = node.css(css)
                return hits[0] if hits else None

            title_node = first_match('job_title')
            if title_node is not None:
                job['title'] = title_node.text(strip=True)

            location_node = first_match('job_location')
            if location_node is not None:
                job['location'] = location_node.text(strip=True)

            url_node = first_match('job_url')
            if url_node is not None and url_node.attributes.get('href'):
                job['url'] = url_node.attributes['href']

            if job.get('title'):
                results.append(job)
    except Exception as exc:
        logger.error(f"Error extracting jobs with Selectolax: {str(exc)}")
    return results
839
-
840
def _extract_jobs_selectolax_intelligent(self, tree: HTMLParser) -> List[Dict[str, Any]]:
    """Heuristic Selectolax extraction.

    Probes common job selectors and returns results from the first selector
    that produces any titled jobs.
    """
    for css in ('.job', '.job-item', '.job-listing', '.job-card', '.position'):
        nodes = tree.css(css)
        if not nodes:
            continue

        found: List[Dict[str, Any]] = []
        for node in nodes[:20]:  # bound the work per selector
            job: Dict[str, Any] = {}

            titles = node.css('h1, h2, h3, a, .title')
            if titles:
                job['title'] = titles[0].text(strip=True)

            # "City, COUNTRY" shaped substring anywhere in the node's text.
            match = re.search(r'\b[A-Z][a-z]+,\s*[A-Z]{2,}\b', node.text())
            if match:
                job['location'] = match.group()

            anchors = node.css('a')
            if anchors and anchors[0].attributes.get('href'):
                job['url'] = anchors[0].attributes['href']

            if job.get('title'):
                found.append(job)

        if found:
            return found
    return []
876
-
877
def generate_content_hash(self, content: str) -> str:
    """Return the hex-encoded SHA-256 digest of *content* (dedup key)."""
    digest = hashlib.sha256(content.encode('utf-8'))
    return digest.hexdigest()
880
-
881
async def validate_scraped_jobs(self, jobs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Validate and normalise raw scraped jobs.

    Drops entries without a plausible title (missing or shorter than five
    characters after stripping), trims whitespace, truncates long
    descriptions, and stamps each job with a scrape timestamp and a dedup
    hash. A job is kept only when it has a title plus at least a company or
    a location.
    """
    cleaned: List[Dict[str, Any]] = []
    for raw in jobs:
        title = (raw.get('title') or '').strip()
        if len(title) < 5:
            continue  # required field missing or too short to be real

        entry = {
            'title': title,
            'company': raw.get('company', '').strip(),
            'location': raw.get('location', '').strip(),
            'description': raw.get('description', '').strip()[:2000],  # cap payload size
            'url': raw.get('url', '').strip(),
            'scraped_at': datetime.now().isoformat(),
            # Hash of the *unstripped* title + company, mirroring how
            # duplicates are detected downstream.
            'content_hash': self.generate_content_hash(
                f"{raw['title']}{raw.get('company', '')}"
            ),
        }

        # Only keep entries with enough identifying information.
        if entry['title'] and (entry['company'] or entry['location']):
            cleaned.append(entry)

    return cleaned
906
-
907
-
908
- # Helper functions for coordination hooks
909
async def store_scraping_progress(memory_key: str, data: Dict[str, Any]):
    """Persist scraping progress under *memory_key*.

    Currently a stub: it only logs the intent. The actual memory-system
    integration is still to be wired in.
    """
    try:
        logger.info(f"Storing progress to memory key: {memory_key}")
    except Exception as exc:
        logger.error(f"Failed to store progress: {str(exc)}")
917
-
918
-
919
async def main():
    """Demo entry point: scrape one known careers page and print a sample."""
    async with Parser() as scraper:
        # Test with a real working URL.
        target = "https://career.hetzner.com/job-offers.html"
        result = await scraper.scrape_jobs(target)

        if not result.success:
            logger.error(f"Scraping failed: {result.errors}")
            return

        logger.info(f"Successfully scraped {len(result.jobs)} jobs using {result.engine_used.value}")
        for job in result.jobs[:5]:  # preview only the first few
            print(f"- {job.get('title', 'No title')} at {job.get('location', 'No location')}")


if __name__ == "__main__":
    asyncio.run(main())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/api/scraper/services/parser/parsers.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ from abc import abstractmethod, ABC
4
+
5
+ import httpx
6
+ import pydash
7
+ from linkedin_api import Linkedin
8
+ from requests.cookies import cookiejar_from_dict
9
+
10
+ from app.api.scraper.db_requests import save_job_obj
11
+ from app.api.scraper.dto import Salary
12
+ from app.api.scraper.models import JobModel
13
+ from app.api.scraper.utils import html_to_md
14
+ from app.core.config import settings
15
+
16
+
17
class BaseAPIScraper(ABC):
    """Template for API-based job scrapers.

    Subclasses implement listing collection, per-job detail fetching and
    persistence; :meth:`run` wires them together with bounded concurrency.
    """

    def __init__(self):
        # NOTE(review): these headers are Workday/Equinix specific even though
        # they live on the base class — confirm the other subclasses really
        # want to send this Origin/Referer pair.
        self.headers = {
            "Accept": "application/json",
            "Content-Type": "application/json",
            "Origin": "https://equinix.wd1.myworkdayjobs.com",
            "Referer": "https://equinix.wd1.myworkdayjobs.com/External",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15",
        }

    @abstractmethod
    async def _collect_all_jobs(self):
        """Return the raw list of job stubs to process."""

    @abstractmethod
    async def _get_job_details(self, job: dict):
        """Fetch the full details for one job stub."""

    @abstractmethod
    async def _save_job_details(self, job_details: dict):
        """Persist one job's details."""

    async def _process_job(self, semaphore, job):
        # The semaphore bounds how many detail fetches run at once.
        async with semaphore:
            details = await self._get_job_details(job)
            await self._save_job_details(details)

    async def run(self):
        """Collect all jobs, then fetch and save details 10 at a time."""
        jobs = await self._collect_all_jobs()
        gate = asyncio.Semaphore(10)
        await asyncio.gather(*(self._process_job(gate, job) for job in jobs))
49
+
50
class EquinixParser(BaseAPIScraper):
    """Scraper for Equinix's Workday-hosted job board."""

    # Workday location facet ids the search is restricted to.
    _LOCATION_FACETS = [
        "9bde94053fcb019e5aef9196bc6b72ec",
        "9bde94053fcb01c625460997bc6bd6ec",
        "9bde94053fcb015852d08896bc6b68ec",
        "9bde94053fcb0100f0725b97bc6b17ed",
        "9bde94053fcb01bd68750c96bc6bebeb",
        "9bde94053fcb01f9e9ff3d96bc6b1dec",
        "9bde94053fcb01c9340b3996bc6b18ec",
        "9bde94053fcb01c59f552696bc6b04ec",
        "9bde94053fcb01d86d2eeb97bc6ba8ed",
        "9ba19ba16c33103420c1c16fb36c32a5",
        "9bde94053fcb01aca28fc095bc6ba5eb",
        "9bde94053fcb01e99181b095bc6b96eb",
        "9bde94053fcb01c6b749a795bc6b8ceb",
        "9bde94053fcb0164431fe394bc6b0feb",
    ]

    def __init__(self):
        super().__init__()
        self.url = "https://equinix.wd1.myworkdayjobs.com/wday/cxs/equinix/External"

    async def _collect_all_jobs(self):
        """Page through the Workday search API and return up to 1000 postings.

        Bug fix: the offset previously started at 1000 and the request payload
        always sent ``"offset": 0``, so the same first page was fetched over
        and over until the cap was reached. The payload now advances with the
        loop, starting from 0.
        """
        all_jobs = []
        offset = 0
        limit = 20
        async with httpx.AsyncClient(timeout=httpx.Timeout(120)) as client:
            while True:
                payload = {
                    "appliedFacets": {"locations": self._LOCATION_FACETS},
                    "limit": limit,
                    "offset": offset,
                }
                response = await client.post(
                    f"{self.url}/jobs", headers=self.headers, json=payload
                )
                response.raise_for_status()
                jobs = response.json().get("jobPostings", [])
                if not jobs:
                    break
                all_jobs.extend(jobs)
                if len(all_jobs) >= 1000:  # hard cap to keep runs bounded
                    break
                offset += limit
                await asyncio.sleep(0.1)  # be polite to the API

        # Snapshot the raw listing for offline debugging / re-runs.
        with open('equinix.json', 'w', encoding='utf-8') as f:
            f.write(json.dumps({"jobs": all_jobs}, ensure_ascii=False))
        return all_jobs

    async def _get_job_details(self, job: dict):
        """Fetch the full posting via the stub's ``externalPath``."""
        async with httpx.AsyncClient(timeout=httpx.Timeout(120)) as client:
            response = await client.get(
                f"{self.url}{job['externalPath']}", headers=self.headers
            )
            response.raise_for_status()
            return response.json()

    async def _save_job_details(self, job_details: dict):
        """Map a Workday posting onto ``JobModel`` and persist it."""
        job = JobModel(
            title=pydash.get(job_details, "jobPostingInfo.title"),
            company=pydash.get(job_details, "hiringOrganization.name"),
            location=pydash.get(job_details, "jobPostingInfo.location"),
            description=html_to_md(
                pydash.get(job_details, "jobPostingInfo.jobDescription", "")
            ),
            # Workday postings do not expose salary data here.
            sourceUrl=pydash.get(job_details, "jobPostingInfo.externalUrl"),
        )
        await save_job_obj(job)
128
+
129
+
130
class AmazonParser(BaseAPIScraper):
    """Scraper for amazon.jobs (Europe-wide search)."""

    def __init__(self):
        super().__init__()
        self.url = "https://www.amazon.jobs/en"

    async def _collect_all_jobs(self):
        """Page through the amazon.jobs search endpoint until it runs dry."""
        collected = []
        offset, page_size = 0, 100
        async with httpx.AsyncClient(timeout=httpx.Timeout(120)) as client:
            while True:
                params = {
                    "offset": offset,
                    "result_limit": page_size,
                    "loc_query": "Europe",
                }
                response = await client.get(
                    f"{self.url}/search.json", params=params, headers=self.headers
                )
                response.raise_for_status()
                batch = response.json().get("jobs", [])
                if not batch:
                    break
                collected.extend(batch)
                offset += page_size
                await asyncio.sleep(0.5)  # throttle between pages
        return collected

    async def _get_job_details(self, job: dict):
        # assumes the listing entry carries a 'jobId' usable as a URL path and
        # that the detail URL serves JSON — TODO(review): confirm against the
        # search.json response shape.
        async with httpx.AsyncClient(timeout=httpx.Timeout(120)) as client:
            response = await client.get(f"{self.url}/{job['jobId']}")
            response.raise_for_status()
            return response.json()

    async def _save_job_details(self, job_details: dict):
        """Persist one job; expects a fully populated details dict."""
        record = JobModel(
            title=job_details["title"],
            company=job_details["company"],
            location=job_details["location"],
            description=job_details["description"],
            requirements=job_details["requirements"],
            salary=Salary(
                min=job_details["salary"]["min"], max=job_details["salary"]["max"]
            ),
            sourceUrl=job_details["url"],
            isTop5=job_details["isTop5"],
            datetimeInserted=job_details["datetimeInserted"],
        )
        await save_job_obj(record)

    async def run(self):
        """Sequential override of the base fan-out: one job at a time."""
        for job in await self._collect_all_jobs():
            details = await self._get_job_details(job)
            await self._save_job_details(details)
187
+
188
+
189
class LinkedinParser(BaseAPIScraper):
    """Scraper backed by the unofficial ``linkedin_api`` client.

    Authenticates with session cookies from settings instead of
    username/password (empty credentials are passed to satisfy the client's
    signature).
    """

    def __init__(self):
        super().__init__()
        self.client = Linkedin(
            "",
            "",
            cookies=cookiejar_from_dict(
                {
                    "liap": "true",
                    "li_at": settings.LINKEDIN_COOKIE_LI_AT,
                    "JSESSIONID": settings.LINKEDIN_COOKIE_JSESSIONID,
                }
            ),
        )

    async def _collect_all_jobs(self):
        """Page through LinkedIn job search results and return them all.

        Bug fix: the previous version overwrote ``jobs`` on every iteration
        and returned it only after the empty page that ended the loop, so the
        method always returned an empty list. Results are now accumulated
        across pages.
        """
        all_jobs = []
        offset = 0
        limit = 100
        while True:
            # NOTE(review): linkedin_api is synchronous — each call blocks the
            # event loop for the duration of the request.
            batch = self.client.search_jobs(
                location_name="European Union", limit=limit, offset=offset
            )
            if not batch:
                break
            all_jobs.extend(batch)
            offset += limit
            await asyncio.sleep(0.5)  # throttle between pages
        return all_jobs

    async def _get_job_details(self, job: dict):
        """Resolve the stub's entity URN to the full job details."""
        return self.client.get_job(job["entityUrn"].split(":")[-1])

    async def _save_job_details(self, job_details: dict):
        """Persist one job; expects a fully populated details dict.

        (Removed a leftover debug ``print`` of the whole payload.)
        """
        job = JobModel(
            title=job_details["title"],
            company=job_details["company"],
            location=job_details["location"],
            description=job_details["description"],
            requirements=job_details["requirements"],
            salary=Salary(
                min=job_details["salary"]["min"], max=job_details["salary"]["max"]
            ),
            sourceUrl=job_details["url"],
            isTop5=job_details["isTop5"],
            datetimeInserted=job_details["datetimeInserted"],
        )
        await save_job_obj(job)
237
+
238
+
239
class GoogleCareersParser(BaseAPIScraper):
    """Scraper for Google Careers search results."""

    def __init__(self):
        super().__init__()
        self.url = "https://www.google.com/about/careers/applications/jobs/results"

    async def _collect_all_jobs(self):
        """Walk the paginated results for a fixed set of EU locations."""
        collected = []
        page = 1
        async with httpx.AsyncClient(timeout=httpx.Timeout(120)) as client:
            while True:
                # NOTE(review): this URL looks like an HTML results page —
                # confirm it actually serves JSON, otherwise .json() will raise.
                response = await client.get(
                    f"{self.url}?location=Netherlands&location=France&location=England&location=Germany&page={page}",
                    headers=self.headers,
                )
                response.raise_for_status()
                batch = response.json().get("jobs", [])
                if not batch:
                    break
                collected.extend(batch)
                page += 1
                await asyncio.sleep(0.5)  # throttle between pages
        return collected

    async def _get_job_details(self, job: dict):
        # assumes the listing entry carries an 'id' usable as a URL path —
        # TODO(review): confirm against the listing response shape.
        async with httpx.AsyncClient(timeout=httpx.Timeout(120)) as client:
            response = await client.get(f"{self.url}/{job['id']}")
            response.raise_for_status()
            return response.json()

    async def _save_job_details(self, job_details: dict):
        """Persist one job; expects a fully populated details dict."""
        record = JobModel(
            title=job_details["title"],
            company=job_details["company"],
            location=job_details["location"],
            description=job_details["description"],
            requirements=job_details["requirements"],
            salary=Salary(
                min=job_details["salary"]["min"], max=job_details["salary"]["max"]
            ),
            sourceUrl=job_details["url"],
            isTop5=job_details["isTop5"],
            datetimeInserted=job_details["datetimeInserted"],
        )
        await save_job_obj(record)
284
+
285
+
286
async def main():
    """Ad-hoc entry point: run the Equinix scraper once.

    Bug fix: ``BaseAPIScraper.run`` returns ``None`` (results are persisted
    via ``_save_job_details``), so the previous
    ``jobs = await parser.run(); print(jobs)`` always printed ``None``.
    """
    parser = EquinixParser()
    await parser.run()


if __name__ == "__main__":
    asyncio.run(main())
app/api/scraper/utils.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import html2text
2
+
3
+
4
def html_to_md(html_string: str) -> str:
    """Convert an HTML fragment to Markdown, stripping links and images.

    Best-effort: if html2text fails for any reason, the original HTML is
    returned unchanged rather than raising.
    """
    try:
        converter = html2text.HTML2Text()
        converter.ignore_links = True
        converter.ignore_images = True
        return converter.handle(html_string)
    except Exception:
        # Deliberate broad catch: a conversion failure must not abort a whole
        # scraping run; fall back to the raw HTML.
        return html_string
13
+
14
+ if __name__ == "__main__":
15
+ t = {'canApply': True, 'country': {'descriptor': 'Nigeria', 'id': 'db69d536446c11de98360015c5e6daf6'}, 'externalUrl': 'https://equinix.wd1.myworkdayjobs.com/External/job/Lagos-Nigeria/IP-Projects-and-Implementation-Engineer_JR-155016-1', 'id': '4fbd87f4a0b41000b23edbb2fde00000', 'includeResumeParsing': True, 'jobDescription': '<p><b>Who are we?</b></p><div><p><span><span>Equinix is the world’s digital infrastructure company®, shortening the path to c...n, status as a victim or family member of a victim of crime or abuse, or any other status protected by applicable law.\xa0</p>', 'jobPostingId': 'IP-Projects-and-Implementation-Engineer_JR-155016-1', 'jobPostingSiteId': 'External', 'jobReqId': 'JR-155016', 'jobRequisitionLocation': {'country': {'alpha2Code': 'NG', 'descriptor': 'Nigeria', 'id': 'db69d536446c11de98360015c5e6daf6'}, 'descriptor': 'Lagos, Nigeria'}, 'location': 'Lagos, Nigeria', 'posted': True, 'postedOn': 'Posted Today', 'questionnaireId': '561a2ce86f57100202e03ac5c0880000', 'remoteType': 'Hybrid', 'startDate': '2025-09-15', 'timeType': 'Full time', 'title': 'IP Projects and Implementation Engineer'}
16
+ print(html_to_md(t["jobDescription"]))
app/api/scraper/views.py CHANGED
@@ -2,6 +2,7 @@
2
  Chat API views module.
3
  """
4
 
 
5
  from fastapi import Query
6
 
7
  from app.api.common.db_requests import (
@@ -12,11 +13,16 @@ from app.api.common.db_requests import (
12
  from app.api.common.dto import Paging
13
  from app.api.common.schemas import AllObjectsResponse, FilterRequest
14
  from app.api.scraper import scraper_router
15
- from app.api.scraper.db_requests import filter_jobs, search_field_options
16
- from app.api.scraper.schemas import SearchOptionRequest
 
 
 
 
17
  from app.api.scraper.dto import JobFilter
18
  from app.api.scraper.models import JobModel
19
  from app.core.wrappers import CbhResponseWrapper
 
20
 
21
 
22
  @scraper_router.get("/all")
@@ -58,7 +64,16 @@ async def scrape(
58
  )
59
  )
60
 
61
- @scraper_router.post('/option/{field}/search')
 
 
 
 
 
 
 
 
 
62
  async def scrape_option_search(
63
  field: str,
64
  request: SearchOptionRequest,
@@ -78,6 +93,7 @@ async def scrape_option_search(
78
  )
79
  )
80
 
 
81
  @scraper_router.get("/{jobId}")
82
  async def get_job(
83
  jobId: str, # pylint: disable=C0103
@@ -98,3 +114,13 @@ async def delete_job(
98
  """
99
  await delete_obj(JobModel, jobId)
100
  return CbhResponseWrapper()
 
 
 
 
 
 
 
 
 
 
 
2
  Chat API views module.
3
  """
4
 
5
+ import asyncio
6
  from fastapi import Query
7
 
8
  from app.api.common.db_requests import (
 
13
  from app.api.common.dto import Paging
14
  from app.api.common.schemas import AllObjectsResponse, FilterRequest
15
  from app.api.scraper import scraper_router
16
+ from app.api.scraper.db_requests import (
17
+ filter_jobs,
18
+ search_field_options,
19
+ get_statistics,
20
+ )
21
+ from app.api.scraper.schemas import SearchOptionRequest, StatisticsResponse
22
  from app.api.scraper.dto import JobFilter
23
  from app.api.scraper.models import JobModel
24
  from app.core.wrappers import CbhResponseWrapper
25
+ from app.api.scraper.services import run_update
26
 
27
 
28
  @scraper_router.get("/all")
 
64
  )
65
  )
66
 
67
+
68
@scraper_router.get("/statistics")
async def get_update_statistics() -> CbhResponseWrapper[StatisticsResponse]:
    """Return aggregate update statistics for the scraper."""
    return CbhResponseWrapper(data=await get_statistics())
75
+
76
+ @scraper_router.post("/option/{field}/search")
77
  async def scrape_option_search(
78
  field: str,
79
  request: SearchOptionRequest,
 
93
  )
94
  )
95
 
96
+
97
  @scraper_router.get("/{jobId}")
98
  async def get_job(
99
  jobId: str, # pylint: disable=C0103
 
114
  """
115
  await delete_obj(JobModel, jobId)
116
  return CbhResponseWrapper()
117
+
118
+
119
# Keep strong references to in-flight background tasks: asyncio's event loop
# only holds a weak reference to tasks, so a task whose handle is discarded
# can be garbage-collected before it finishes.
_background_tasks: set = set()


@scraper_router.post("/run")
async def run_scraper() -> CbhResponseWrapper:
    """
    Kick off a scraper update in the background and return immediately.
    """
    task = asyncio.create_task(run_update())
    _background_tasks.add(task)
    # Drop the reference once the task completes.
    task.add_done_callback(_background_tasks.discard)
    return CbhResponseWrapper()
126
+
app/core/config.py CHANGED
@@ -29,6 +29,8 @@ class BaseConfig:
29
  SECRET_KEY: str = os.getenv("SECRET", "")
30
 
31
  DB_CLIENT: AsyncIOMotorDatabase = AsyncIOMotorClient(os.getenv("MONGO_DB_URL")).euscrapper
 
 
32
 
33
  @staticmethod
34
  def get_headers(api_key: str) -> dict:
 
29
  SECRET_KEY: str = os.getenv("SECRET", "")
30
 
31
  DB_CLIENT: AsyncIOMotorDatabase = AsyncIOMotorClient(os.getenv("MONGO_DB_URL")).euscrapper
32
+ LINKEDIN_COOKIE_LI_AT = os.getenv("LINKEDIN_COOKIE_LI_AT")
33
+ LINKEDIN_COOKIE_JSESSIONID = os.getenv("LINKEDIN_COOKIE_JSESSIONID")
34
 
35
  @staticmethod
36
  def get_headers(api_key: str) -> dict:
main.py CHANGED
File without changes
requirements.txt ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Web framework
2
+ fastapi==0.104.1
3
+ uvicorn==0.24.0
4
+
5
+ # Database
6
+ motor==3.3.2
7
+ pymongo==4.6.0
8
+
9
+ # HTTP requests and web scraping
10
+ requests==2.31.0
11
+ httpx==0.25.2
12
+ beautifulsoup4==4.12.2
13
+ selenium==4.15.2
14
+
15
+ # HTML parsing (alternative/faster option)
16
+ selectolax==0.3.17
17
+ html2text==2020.1.16
18
+
19
+ # Web automation
20
+ playwright==1.40.0
21
+
22
+ # Async job scheduling
23
+ apscheduler==3.10.4
24
+
25
+ # AI/ML
26
+ langchain-core==0.1.0
27
+ langchain-openai==0.0.2
28
+
29
+ # Configuration and environment
30
+ python-dotenv==1.0.0
31
+ pydantic==2.5.0
32
+
33
+ # CORS middleware for FastAPI
34
+ starlette==0.27.0
35
+
36
+ # Additional utilities
37
+ python-multipart==0.0.6
38
+
39
+ # Additional dependencies for enhanced scraping
40
+ lxml==4.9.3
41
+ fake-useragent==1.4.0
42
+ aiofiles==23.2.0