Rudraaaa76 committed on
Commit
b92a1ee
·
verified ·
1 Parent(s): 02d234b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -9
app.py CHANGED
@@ -25,6 +25,10 @@ app = FastAPI(title="HackTrack Scraper", version="5.0.0")
25
  playwright = None
26
  browser = None
27
 
 
 
 
 
28
  app.add_middleware(
29
  CORSMiddleware,
30
  allow_origins=["*"],
@@ -648,7 +652,7 @@ async def scrape_with_playwright(url: str, platform: str) -> dict:
648
  if browser is None:
649
  return {"scrape_success": False, "error": "Browser not initialized"}
650
 
651
- # Unstop: try API first
652
  if platform == "Unstop":
653
  opp_id = extract_unstop_id(url)
654
  print(f"[Unstop] Extracted ID: {opp_id}")
@@ -661,12 +665,47 @@ async def scrape_with_playwright(url: str, platform: str) -> dict:
661
  return result
662
  print("[Unstop] API failed, falling back to Playwright")
663
 
 
 
 
 
 
 
 
 
 
 
 
664
  context = await browser.new_context(
665
  user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
666
- viewport={"width": 1920, "height": 1080},
 
 
 
 
667
  )
668
  try:
669
  page = await context.new_page()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
670
  print(f"[Scraper] => {url} platform={platform}")
671
  wait_until = "load" if platform in ("Devfolio", "MLH") else "domcontentloaded"
672
  try:
@@ -675,18 +714,20 @@ async def scrape_with_playwright(url: str, platform: str) -> dict:
675
  if "Timeout" not in str(e): raise
676
  print("[Scraper] goto timeout, proceeding anyway")
677
 
678
- wait_map = {"Unstop": 9, "DoraHacks": 8, "Devfolio": 8, "MLH": 4}
679
- wait_sec = wait_map.get(platform, 5)
 
680
  print(f"[Scraper] Waiting {wait_sec}s for JS...")
681
  await page.wait_for_timeout(wait_sec * 1000)
682
 
683
- for frac in [0.33, 0.66, 1.0, 0.0]:
 
684
  try:
685
  await page.evaluate(f"window.scrollTo(0, document.body.scrollHeight * {frac})")
686
  except Exception:
687
  pass
688
- await asyncio.sleep(0.6)
689
- await asyncio.sleep(1.0)
690
 
691
  if platform == "Devfolio":
692
  raw = await safe_evaluate(page, DEVFOLIO_SCRIPT, GENERIC_SCRIPT)
@@ -747,9 +788,44 @@ async def startup() -> None:
747
  playwright = await async_playwright().start()
748
  browser = await playwright.chromium.launch(
749
  headless=True,
750
- args=["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
751
  )
752
- print("[Scraper] v5.0 ready - structured extraction (no LLM)")
753
 
754
 
755
  @app.on_event("shutdown")
 
25
  playwright = None
26
  browser = None
27
 
28
+ # Semaphore: only 1 Playwright scrape at a time on HuggingFace free tier.
29
+ # Prevents two concurrent requests from doubling RAM usage (~1.2GB peak).
30
+ _scrape_sem = asyncio.Semaphore(1)
31
+
32
  app.add_middleware(
33
  CORSMiddleware,
34
  allow_origins=["*"],
 
652
  if browser is None:
653
  return {"scrape_success": False, "error": "Browser not initialized"}
654
 
655
+ # Unstop: try API first — no Playwright needed, saves all memory for this call
656
  if platform == "Unstop":
657
  opp_id = extract_unstop_id(url)
658
  print(f"[Unstop] Extracted ID: {opp_id}")
 
665
  return result
666
  print("[Unstop] API failed, falling back to Playwright")
667
 
668
+ async with _scrape_sem:
669
+ # Only one Playwright scrape runs at a time to stay within HuggingFace RAM limits.
670
+ # Concurrent requests queue here and are processed sequentially.
671
+ print(f"[Scraper] Semaphore acquired for {platform}")
672
+ return await _do_playwright_scrape(url, platform)
673
+
674
+
675
+ async def _do_playwright_scrape(url: str, platform: str) -> dict:
676
+ """Inner function — runs inside the semaphore."""
677
+ global browser
678
+
679
  context = await browser.new_context(
680
  user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
681
+ # Smaller viewport = less GPU memory for compositing
682
+ viewport={"width": 1280, "height": 800},
683
+ # Block credentials/service workers to reduce overhead
684
+ java_script_enabled=True,
685
+ bypass_csp=False,
686
  )
687
  try:
688
  page = await context.new_page()
689
+
690
+ # Block images, fonts, media, and tracking — saves 30-60% of page RAM
691
+ # We only need DOM text and __NEXT_DATA__, not rendered assets
692
+ async def block_resources(route, request):
693
+ BLOCK_TYPES = {"image", "media", "font", "stylesheet", "other",
694
+ "ping", "websocket"}
695
+ BLOCK_DOMAINS = {"google-analytics", "googletagmanager", "facebook",
696
+ "hotjar", "intercom", "amplitude", "segment",
697
+ "cloudflare.com/beacon", "sentry.io"}
698
+ if request.resource_type in BLOCK_TYPES:
699
+ await route.abort()
700
+ return
701
+ url_lower = request.url.lower()
702
+ if any(d in url_lower for d in BLOCK_DOMAINS):
703
+ await route.abort()
704
+ return
705
+ await route.continue_()
706
+
707
+ await page.route("**/*", block_resources)
708
+
709
  print(f"[Scraper] => {url} platform={platform}")
710
  wait_until = "load" if platform in ("Devfolio", "MLH") else "domcontentloaded"
711
  try:
 
714
  if "Timeout" not in str(e): raise
715
  print("[Scraper] goto timeout, proceeding anyway")
716
 
717
+ # Reduced wait times — blocking assets means pages settle faster
718
+ wait_map = {"Unstop": 6, "DoraHacks": 6, "Devfolio": 5, "MLH": 3}
719
+ wait_sec = wait_map.get(platform, 4)
720
  print(f"[Scraper] Waiting {wait_sec}s for JS...")
721
  await page.wait_for_timeout(wait_sec * 1000)
722
 
723
+ # Light scroll only — no heavy scroll since images are blocked anyway
724
+ for frac in [0.5, 1.0, 0.0]:
725
  try:
726
  await page.evaluate(f"window.scrollTo(0, document.body.scrollHeight * {frac})")
727
  except Exception:
728
  pass
729
+ await asyncio.sleep(0.4)
730
+ await asyncio.sleep(0.5)
731
 
732
  if platform == "Devfolio":
733
  raw = await safe_evaluate(page, DEVFOLIO_SCRIPT, GENERIC_SCRIPT)
 
788
  playwright = await async_playwright().start()
789
  browser = await playwright.chromium.launch(
790
  headless=True,
791
+ args=[
792
+ # ── Security (required for containers) ──────────────────────────
793
+ "--no-sandbox",
794
+ "--disable-setuid-sandbox",
795
+ # ── Memory reduction ─────────────────────────────────────────────
796
+ "--disable-dev-shm-usage", # use /tmp instead of /dev/shm
797
+ "--disable-gpu", # no GPU process (~50MB saved)
798
+ "--no-zygote", # skip zygote process fork
799
+ "--single-process", # single process mode (~150MB saved)
800
+ "--disable-extensions", # no extension processes
801
+ "--disable-background-networking",
802
+ "--disable-background-timer-throttling",
803
+ "--disable-backgrounding-occluded-windows",
804
+ "--disable-breakpad", # no crash reporter
805
+ "--disable-client-side-phishing-detection",
806
+ "--disable-component-update",
807
+ "--disable-default-apps",
808
+ "--disable-domain-reliability",
809
+ "--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process",
810
+ "--disable-hang-monitor",
811
+ "--disable-ipc-flooding-protection",
812
+ "--disable-popup-blocking",
813
+ "--disable-prompt-on-repost",
814
+ "--disable-renderer-backgrounding",
815
+ "--disable-sync",
816
+ "--disable-translate",
817
+ "--metrics-recording-only",
818
+ "--mute-audio",
819
+ "--no-first-run",
820
+ "--safebrowsing-disable-auto-update",
821
+ "--password-store=basic",
822
+ "--use-mock-keychain",
823
+ # ── Reduce per-page memory ────────────────────────────────────────
824
+ "--js-flags=--max-old-space-size=256", # cap JS heap to 256MB
825
+ "--renderer-process-limit=2",
826
+ ],
827
  )
828
+ print("[Scraper] v5.0 ready - memory-optimised Chromium on HuggingFace")
829
 
830
 
831
  @app.on_event("shutdown")