Spaces:

Rudraaaa76
/

hacktrack_scraper

Running

App Files Files Community

Rudraaaa76 committed on 15 days ago

Commit

f3449f2

verified ·

1 Parent(s): 6d14edc

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -18

app.py CHANGED Viewed

@@ -2,7 +2,6 @@
 HackTrack Scraper v4.0
 - Groq LLM (llama-3.3-70b-versatile) for intelligent extraction
 - Platforms: Devfolio, Unstop, Devpost, DoraHacks, MLH
-- Deployed on Railway/Render/Fly.io
 """
 from fastapi import FastAPI
@@ -585,6 +584,44 @@ def get_extract_script(platform: str) -> str:
 # PLAYWRIGHT SCRAPER
 # ══════════════════════════════════════════════════════════════════════════════
 async def scrape_with_playwright(url: str, platform: str) -> dict:
     global browser
     if browser is None:
@@ -603,27 +640,34 @@ async def scrape_with_playwright(url: str, platform: str) -> dict:
         page = await context.new_page()
         print(f"[Scraper] → {url}  (platform={platform})")
-        await page.goto(url, wait_until="domcontentloaded", timeout=25000)
-        # Platform-specific wait times
-        wait_map = {"Unstop": 9, "DoraHacks": 8, "Devfolio": 7, "MLH": 4}
         wait_sec = wait_map.get(platform, 5)
-        print(f"[Scraper] Waiting {wait_sec}s for JS...")
         await page.wait_for_timeout(wait_sec * 1000)
-        # Scroll to trigger lazy-loaded content
         for frac in [0.33, 0.66, 1.0, 0.0]:
-            await page.evaluate(f"window.scrollTo(0, document.body.scrollHeight * {frac})")
-            await asyncio.sleep(0.8)
-        # Run platform-specific extraction script
-        script = get_extract_script(platform)
-        # Devfolio script is async — evaluate handles both sync and async
-        try:
-            data = await page.evaluate(script)
-        except Exception:
-            # Fallback to generic if platform script errors
-            data = await page.evaluate(GENERIC_EXTRACT_SCRIPT)
         body_text = data.get("bodyText", "")
         print(f"[Scraper] bodyText={len(body_text)} chars, name='{data.get('name','')}'")
@@ -761,4 +805,4 @@ async def scrape(request: ScrapeRequest):
         return response
     except Exception as e:
         print(f"[Scraper] Endpoint error: {e}")
-        return ScrapeResponse(platform=platform, url=url, scrape_success=False)

 HackTrack Scraper v4.0
 - Groq LLM (llama-3.3-70b-versatile) for intelligent extraction
 - Platforms: Devfolio, Unstop, Devpost, DoraHacks, MLH
 """
 from fastapi import FastAPI
 # PLAYWRIGHT SCRAPER
 # ══════════════════════════════════════════════════════════════════════════════
# Shape of the extraction result handed back when no script could be evaluated.
# Treat this as a read-only template: safe_evaluate returns copies of it, never
# the object itself, because it contains mutable lists.
EMPTY_DATA = {
    "name": "", "description": "", "banner_url": "",
    "bodyText": "", "themes": [], "sidebarPrize": "", "resourceLinks": [],
}

# Substrings of the exception text that safe_evaluate treats as "the page
# navigated while the script was running" and therefore as retryable.
_NAVIGATION_ERRORS = ("Execution context was destroyed", "Frame was detached")


async def safe_evaluate(page, script: str, fallback_script: str = None) -> dict:
    """
    Evaluate JS on ``page`` with retries, falling back to a second script.

    Retries on 'Execution context was destroyed' / 'Frame was detached',
    caused by Devfolio /overview redirecting to / mid-scrape. Any other
    evaluation error switches to ``fallback_script`` (when one is given and
    it is not already the script in use); if there is nothing left to switch
    to, the loop stops.

    Args:
        page: Object exposing awaitable ``wait_for_load_state(state, timeout=...)``
            and ``evaluate(script)`` methods.
        script: JavaScript source to evaluate first.
        fallback_script: Optional JavaScript source to evaluate if ``script``
            fails for a reason other than a navigation.

    Returns:
        Whatever ``page.evaluate`` returns for the first script that succeeds,
        or a fresh copy of ``EMPTY_DATA`` when every attempt fails. At most
        three evaluations are attempted in total.
    """
    for attempt in range(3):
        try:
            try:
                # Best-effort settle; a page that never goes idle must not
                # prevent the extraction from being attempted.
                await page.wait_for_load_state("networkidle", timeout=8000)
            except Exception:
                pass
            return await page.evaluate(script)
        except Exception as e:
            err = str(e)
            print(f"[Scraper] evaluate attempt {attempt + 1} failed: {err[:150]}")
            if any(marker in err for marker in _NAVIGATION_ERRORS):
                print("[Scraper] Redirect detected — waiting for page to settle...")
                try:
                    await page.wait_for_load_state("domcontentloaded", timeout=12000)
                    await asyncio.sleep(2)
                except Exception:
                    await asyncio.sleep(3)
                continue
            # The script itself failed. Switch to the fallback on whichever
            # attempt this happens (not only the second one), but only once.
            if fallback_script and script != fallback_script:
                print("[Scraper] Switching to generic fallback script...")
                script = fallback_script
                continue
            break
    print("[Scraper] All evaluate attempts exhausted — returning empty data")
    # Hand out a copy (with fresh lists) so callers that mutate the result
    # cannot corrupt the shared EMPTY_DATA template.
    return {
        key: list(value) if isinstance(value, list) else value
        for key, value in EMPTY_DATA.items()
    }
 async def scrape_with_playwright(url: str, platform: str) -> dict:
     global browser
     if browser is None:
         page = await context.new_page()
         print(f"[Scraper] → {url}  (platform={platform})")
+        # Devfolio /overview redirects to / — wait for "load" so the redirect
+        # finishes before we evaluate JS.
+        wait_until = "load" if platform in ("Devfolio", "MLH") else "domcontentloaded"
+        try:
+            await page.goto(url, wait_until=wait_until, timeout=30000)
+        except Exception as e:
+            if "Timeout" in str(e):
+                print(f"[Scraper] goto timeout ({wait_until}) — proceeding anyway")
+            else:
+                raise
+        wait_map = {"Unstop": 9, "DoraHacks": 8, "Devfolio": 8, "MLH": 4}
         wait_sec = wait_map.get(platform, 5)
+        print(f"[Scraper] Waiting {wait_sec}s for JS rendering...")
         await page.wait_for_timeout(wait_sec * 1000)
         for frac in [0.33, 0.66, 1.0, 0.0]:
+            try:
+                await page.evaluate(f"window.scrollTo(0, document.body.scrollHeight * {frac})")
+            except Exception:
+                pass
+            await asyncio.sleep(0.6)
+        await asyncio.sleep(1.0)
+        primary_script  = get_extract_script(platform)
+        fallback_script = GENERIC_EXTRACT_SCRIPT if primary_script != GENERIC_EXTRACT_SCRIPT else None
+        data = await safe_evaluate(page, primary_script, fallback_script)
         body_text = data.get("bodyText", "")
         print(f"[Scraper] bodyText={len(body_text)} chars, name='{data.get('name','')}'")
         return response
     except Exception as e:
         print(f"[Scraper] Endpoint error: {e}")
+        return ScrapeResponse(platform=platform, url=url, scrape_success=False)