Rudraaaa76 committed on
Commit
f3449f2
·
verified ·
1 Parent(s): 6d14edc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -18
app.py CHANGED
@@ -2,7 +2,6 @@
2
  HackTrack Scraper v4.0
3
  - Groq LLM (llama-3.3-70b-versatile) for intelligent extraction
4
  - Platforms: Devfolio, Unstop, Devpost, DoraHacks, MLH
5
- - Deployed on Railway/Render/Fly.io
6
  """
7
 
8
  from fastapi import FastAPI
@@ -585,6 +584,44 @@ def get_extract_script(platform: str) -> str:
585
  # PLAYWRIGHT SCRAPER
586
  # ══════════════════════════════════════════════════════════════════════════════
587
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
588
  async def scrape_with_playwright(url: str, platform: str) -> dict:
589
  global browser
590
  if browser is None:
@@ -603,27 +640,34 @@ async def scrape_with_playwright(url: str, platform: str) -> dict:
603
  page = await context.new_page()
604
  print(f"[Scraper] β†’ {url} (platform={platform})")
605
 
606
- await page.goto(url, wait_until="domcontentloaded", timeout=25000)
607
-
608
- # Platform-specific wait times
609
- wait_map = {"Unstop": 9, "DoraHacks": 8, "Devfolio": 7, "MLH": 4}
 
 
 
 
 
 
 
 
610
  wait_sec = wait_map.get(platform, 5)
611
- print(f"[Scraper] Waiting {wait_sec}s for JS...")
612
  await page.wait_for_timeout(wait_sec * 1000)
613
 
614
- # Scroll to trigger lazy-loaded content
615
  for frac in [0.33, 0.66, 1.0, 0.0]:
616
- await page.evaluate(f"window.scrollTo(0, document.body.scrollHeight * {frac})")
617
- await asyncio.sleep(0.8)
 
 
 
618
 
619
- # Run platform-specific extraction script
620
- script = get_extract_script(platform)
621
- # Devfolio script is async — evaluate handles both sync and async
622
- try:
623
- data = await page.evaluate(script)
624
- except Exception:
625
- # Fallback to generic if platform script errors
626
- data = await page.evaluate(GENERIC_EXTRACT_SCRIPT)
627
 
628
  body_text = data.get("bodyText", "")
629
  print(f"[Scraper] bodyText={len(body_text)} chars, name='{data.get('name','')}'")
@@ -761,4 +805,4 @@ async def scrape(request: ScrapeRequest):
761
  return response
762
  except Exception as e:
763
  print(f"[Scraper] Endpoint error: {e}")
764
- return ScrapeResponse(platform=platform, url=url, scrape_success=False)
 
2
  HackTrack Scraper v4.0
3
  - Groq LLM (llama-3.3-70b-versatile) for intelligent extraction
4
  - Platforms: Devfolio, Unstop, Devpost, DoraHacks, MLH
 
5
  """
6
 
7
  from fastapi import FastAPI
 
584
  # PLAYWRIGHT SCRAPER
585
  # ══════════════════════════════════════════════════════════════════════════════
586
 
587
# Shape of the result returned when no extraction script could be evaluated.
# Treat this as a read-only template: safe_evaluate() hands out copies.
EMPTY_DATA = {
    "name": "", "description": "", "banner_url": "",
    "bodyText": "", "themes": [], "sidebarPrize": "", "resourceLinks": [],
}


async def safe_evaluate(page, script: str, fallback_script: str = None) -> dict:
    """Evaluate an extraction script on ``page``, tolerating redirects.

    Up to three attempts are made. Before each attempt the page is given a
    chance to reach ``networkidle`` (best effort). Failures are handled as
    follows:

    * Navigation errors ("Execution context was destroyed" / "Frame was
      detached", e.g. Devfolio ``/overview`` redirecting to ``/`` mid-scrape):
      wait for the page to settle and retry the same script.
    * Any other error: switch to ``fallback_script`` (if one was given and is
      not already in use) and retry; otherwise give up.

    Args:
        page: Object exposing awaitable ``wait_for_load_state`` and
            ``evaluate`` methods (presumably a Playwright ``Page``).
        script: JavaScript source to evaluate first.
        fallback_script: Optional generic script to try when ``script``
            fails for a reason other than a navigation/redirect.

    Returns:
        Whatever ``page.evaluate`` returns on success, or a fresh copy of
        ``EMPTY_DATA`` when every attempt fails.
    """
    import copy

    navigation_errors = ("Execution context was destroyed", "Frame was detached")
    using_fallback = False

    for attempt in range(3):
        # Best effort only: a busy page may never go network-idle, and that
        # must not prevent the evaluation from being attempted.
        try:
            await page.wait_for_load_state("networkidle", timeout=8000)
        except Exception:
            pass

        try:
            return await page.evaluate(script)
        except Exception as e:
            err = str(e)
            print(f"[Scraper] evaluate attempt {attempt + 1} failed: {err[:150]}")

        if any(marker in err for marker in navigation_errors):
            print("[Scraper] Redirect detected — waiting for page to settle...")
            try:
                await page.wait_for_load_state("domcontentloaded", timeout=12000)
                await asyncio.sleep(2)
            except Exception:
                await asyncio.sleep(3)
            continue

        # The script itself failed. Fall back to the generic script no matter
        # which attempt this is; previously the fallback was only considered
        # on the second attempt, so a first-attempt script error returned
        # empty data without ever trying it.
        if fallback_script and not using_fallback:
            print("[Scraper] Switching to generic fallback script...")
            script = fallback_script
            using_fallback = True
            continue

        break

    print("[Scraper] All evaluate attempts exhausted — returning empty data")
    # Return a copy so callers mutating the result (it contains lists) cannot
    # corrupt the shared template for later requests.
    return copy.deepcopy(EMPTY_DATA)
623
+
624
+
625
  async def scrape_with_playwright(url: str, platform: str) -> dict:
626
  global browser
627
  if browser is None:
 
640
  page = await context.new_page()
641
  print(f"[Scraper] β†’ {url} (platform={platform})")
642
 
643
+ # Devfolio /overview redirects to / — wait for "load" so the redirect
644
+ # finishes before we evaluate JS.
645
+ wait_until = "load" if platform in ("Devfolio", "MLH") else "domcontentloaded"
646
+ try:
647
+ await page.goto(url, wait_until=wait_until, timeout=30000)
648
+ except Exception as e:
649
+ if "Timeout" in str(e):
650
+ print(f"[Scraper] goto timeout ({wait_until}) β€” proceeding anyway")
651
+ else:
652
+ raise
653
+
654
+ wait_map = {"Unstop": 9, "DoraHacks": 8, "Devfolio": 8, "MLH": 4}
655
  wait_sec = wait_map.get(platform, 5)
656
+ print(f"[Scraper] Waiting {wait_sec}s for JS rendering...")
657
  await page.wait_for_timeout(wait_sec * 1000)
658
 
 
659
  for frac in [0.33, 0.66, 1.0, 0.0]:
660
+ try:
661
+ await page.evaluate(f"window.scrollTo(0, document.body.scrollHeight * {frac})")
662
+ except Exception:
663
+ pass
664
+ await asyncio.sleep(0.6)
665
 
666
+ await asyncio.sleep(1.0)
667
+
668
+ primary_script = get_extract_script(platform)
669
+ fallback_script = GENERIC_EXTRACT_SCRIPT if primary_script != GENERIC_EXTRACT_SCRIPT else None
670
+ data = await safe_evaluate(page, primary_script, fallback_script)
 
 
 
671
 
672
  body_text = data.get("bodyText", "")
673
  print(f"[Scraper] bodyText={len(body_text)} chars, name='{data.get('name','')}'")
 
805
  return response
806
  except Exception as e:
807
  print(f"[Scraper] Endpoint error: {e}")
808
+ return ScrapeResponse(platform=platform, url=url, scrape_success=False)