Fix: Correct playwright-stealth v2.0.2 API usage and expand discovery to RSS/GitHub/Search
Browse files- app/core/scrapers.py +31 -9
app/core/scrapers.py
CHANGED
|
@@ -136,7 +136,7 @@ class DiscoverySurfer:
|
|
| 136 |
logger.info(f"Surfer exploring {start_url} (depth={depth})")
|
| 137 |
|
| 138 |
from playwright.async_api import async_playwright
|
| 139 |
-
from playwright_stealth import stealth_async
|
| 140 |
import random
|
| 141 |
import asyncio
|
| 142 |
|
|
@@ -151,7 +151,8 @@ class DiscoverySurfer:
|
|
| 151 |
)
|
| 152 |
|
| 153 |
page = await context.new_page()
|
| 154 |
-
|
|
|
|
| 155 |
logger.info("Stealth Sub-Agent active. Masking automation signatures.")
|
| 156 |
|
| 157 |
# Human-like navigation: Random delay before goto
|
|
@@ -225,22 +226,43 @@ class DiscoverySurfer:
|
|
| 225 |
return []
|
| 226 |
|
| 227 |
async def swoop(self) -> List[Dict]:
|
| 228 |
-
"""The main autonomous 'Swoop' - surfing discovery hubs."""
|
| 229 |
all_signals = []
|
|
|
|
| 230 |
|
| 231 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
hubs = [
|
| 233 |
"https://news.ycombinator.com",
|
| 234 |
"https://www.reddit.com/r/MachineLearning/",
|
| 235 |
-
"https://www.reddit.com/r/Programming/"
|
|
|
|
|
|
|
| 236 |
]
|
| 237 |
|
| 238 |
-
surfer = DiscoverySurfer()
|
| 239 |
for hub in hubs:
|
| 240 |
-
|
| 241 |
-
|
|
|
|
|
|
|
|
|
|
| 242 |
|
| 243 |
-
logger.info(f"Swoop completed. Found {len(all_signals)} signals.")
|
| 244 |
return all_signals
|
| 245 |
|
| 246 |
scraper = DiscoverySurfer()
|
|
|
|
| 136 |
logger.info(f"Surfer exploring {start_url} (depth={depth})")
|
| 137 |
|
| 138 |
from playwright.async_api import async_playwright
|
| 139 |
+
from playwright_stealth import Stealth
|
| 140 |
import random
|
| 141 |
import asyncio
|
| 142 |
|
|
|
|
| 151 |
)
|
| 152 |
|
| 153 |
page = await context.new_page()
|
| 154 |
+
stealth = Stealth()
|
| 155 |
+
await stealth.apply_stealth_async(page)
|
| 156 |
logger.info("Stealth Sub-Agent active. Masking automation signatures.")
|
| 157 |
|
| 158 |
# Human-like navigation: Random delay before goto
|
|
|
|
| 226 |
return []
|
| 227 |
|
| 228 |
async def swoop(self) -> List[Dict]:
    """The main autonomous 'Swoop' - combining surfing, RSS, and deep audits.

    Runs three discovery vectors in sequence and aggregates their signals:
      1. Deep GitHub & search audits via TechnicalScraper
      2. RSS forensic feeds from settings.RSS_FEEDS
      3. Human-like browser surfing of curated hub URLs via self.surf

    Returns:
        A combined list of signal dicts from all vectors. Each vector is
        isolated in its own try/except (matching the surf loop's style),
        so a single failing feed, query, or hub is logged and skipped
        instead of aborting the whole swoop and losing earlier signals.
    """
    all_signals: List[Dict] = []
    tech_scraper = TechnicalScraper()

    # 1. Deep GitHub & Search Audits
    logger.info("Discovery Orchestrator: initiating technical audits...")
    try:
        all_signals.extend(await tech_scraper.fetch_github_audit())
    except Exception as e:
        logger.error(f"Discovery Orchestrator: GitHub audit failure: {e}")

    for query in settings.TARGET_QUERIES:
        try:
            all_signals.extend(await tech_scraper.fetch_search(query))
        except Exception as e:
            logger.error(f"Discovery Orchestrator: Search failure on '{query}': {e}")

    # 2. RSS Forensic Feeds
    logger.info("Discovery Orchestrator: sifting through RSS hubs...")
    for feed in settings.RSS_FEEDS:
        try:
            all_signals.extend(await tech_scraper.fetch_rss(feed))
        except Exception as e:
            logger.error(f"Discovery Orchestrator: RSS failure on {feed}: {e}")

    # 3. Human-like Browser Discovery (Surfing)
    logger.info("Discovery Orchestrator: deploying browser-based surfs...")
    hubs = [
        "https://news.ycombinator.com",
        "https://www.reddit.com/r/MachineLearning/",
        "https://www.reddit.com/r/Programming/",
        "https://dev.to/t/architecture",
        "https://www.infoq.com/news/"
    ]

    for hub in hubs:
        try:
            results = await self.surf(hub)
            all_signals.extend(results)
        except Exception as e:
            logger.error(f"Discovery Orchestrator: Surf failure on {hub}: {e}")

    logger.info(f"Swoop completed. Found {len(all_signals)} total forensic signals across all vectors.")
    return all_signals
|
| 267 |
|
| 268 |
scraper = DiscoverySurfer()
|