Rudraaaa76 committed on
Commit
b92a1ee
·
verified ·
1 Parent(s): 02d234b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -9
app.py CHANGED
@@ -25,6 +25,10 @@ app = FastAPI(title="HackTrack Scraper", version="5.0.0")
25
  playwright = None
26
  browser = None
27
 
 
 
 
 
28
  app.add_middleware(
29
  CORSMiddleware,
30
  allow_origins=["*"],
@@ -648,7 +652,7 @@ async def scrape_with_playwright(url: str, platform: str) -> dict:
648
  if browser is None:
649
  return {"scrape_success": False, "error": "Browser not initialized"}
650
 
651
- # Unstop: try API first
652
  if platform == "Unstop":
653
  opp_id = extract_unstop_id(url)
654
  print(f"[Unstop] Extracted ID: {opp_id}")
@@ -661,12 +665,47 @@ async def scrape_with_playwright(url: str, platform: str) -> dict:
661
  return result
662
  print("[Unstop] API failed, falling back to Playwright")
663
 
 
 
 
 
 
 
 
 
 
 
 
664
  context = await browser.new_context(
665
  user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
666
- viewport={"width": 1920, "height": 1080},
 
 
 
 
667
  )
668
  try:
669
  page = await context.new_page()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
670
  print(f"[Scraper] => {url} platform={platform}")
671
  wait_until = "load" if platform in ("Devfolio", "MLH") else "domcontentloaded"
672
  try:
@@ -675,18 +714,20 @@ async def scrape_with_playwright(url: str, platform: str) -> dict:
675
  if "Timeout" not in str(e): raise
676
  print("[Scraper] goto timeout, proceeding anyway")
677
 
678
- wait_map = {"Unstop": 9, "DoraHacks": 8, "Devfolio": 8, "MLH": 4}
679
- wait_sec = wait_map.get(platform, 5)
 
680
  print(f"[Scraper] Waiting {wait_sec}s for JS...")
681
  await page.wait_for_timeout(wait_sec * 1000)
682
 
683
- for frac in [0.33, 0.66, 1.0, 0.0]:
 
684
  try:
685
  await page.evaluate(f"window.scrollTo(0, document.body.scrollHeight * {frac})")
686
  except Exception:
687
  pass
688
- await asyncio.sleep(0.6)
689
- await asyncio.sleep(1.0)
690
 
691
  if platform == "Devfolio":
692
  raw = await safe_evaluate(page, DEVFOLIO_SCRIPT, GENERIC_SCRIPT)
@@ -747,9 +788,44 @@ async def startup() -> None:
747
  playwright = await async_playwright().start()
748
  browser = await playwright.chromium.launch(
749
  headless=True,
750
- args=["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
751
  )
752
- print("[Scraper] v5.0 ready - structured extraction (no LLM)")
753
 
754
 
755
  @app.on_event("shutdown")
 
25
  playwright = None
26
  browser = None
27
 
28
+ # Semaphore: only 1 Playwright scrape at a time on HuggingFace free tier.
29
+ # Prevents two concurrent requests from doubling RAM usage (~1.2GB peak).
30
+ _scrape_sem = asyncio.Semaphore(1)
31
+
32
  app.add_middleware(
33
  CORSMiddleware,
34
  allow_origins=["*"],
 
652
  if browser is None:
653
  return {"scrape_success": False, "error": "Browser not initialized"}
654
 
655
+ # Unstop: try API first — no Playwright needed, saves all memory for this call
656
  if platform == "Unstop":
657
  opp_id = extract_unstop_id(url)
658
  print(f"[Unstop] Extracted ID: {opp_id}")
 
665
  return result
666
  print("[Unstop] API failed, falling back to Playwright")
667
 
668
+ async with _scrape_sem:
669
+ # Only one Playwright scrape runs at a time to stay within HuggingFace RAM limits.
670
+ # Concurrent requests queue here and are processed sequentially.
671
+ print(f"[Scraper] Semaphore acquired for {platform}")
672
+ return await _do_playwright_scrape(url, platform)
673
+
674
+
675
+ async def _do_playwright_scrape(url: str, platform: str) -> dict:
676
+ """Inner function — runs inside the semaphore."""
677
+ global browser
678
+
679
  context = await browser.new_context(
680
  user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
681
+ # Smaller viewport = less GPU memory for compositing
682
+ viewport={"width": 1280, "height": 800},
683
+ # Block credentials/service workers to reduce overhead
684
+ java_script_enabled=True,
685
+ bypass_csp=False,
686
  )
687
  try:
688
  page = await context.new_page()
689
+
690
+ # Block images, fonts, media, and tracking — saves 30-60% of page RAM
691
+ # We only need DOM text and __NEXT_DATA__, not rendered assets
692
+ async def block_resources(route, request):
693
+ BLOCK_TYPES = {"image", "media", "font", "stylesheet", "other",
694
+ "ping", "websocket"}
695
+ BLOCK_DOMAINS = {"google-analytics", "googletagmanager", "facebook",
696
+ "hotjar", "intercom", "amplitude", "segment",
697
+ "cloudflare.com/beacon", "sentry.io"}
698
+ if request.resource_type in BLOCK_TYPES:
699
+ await route.abort()
700
+ return
701
+ url_lower = request.url.lower()
702
+ if any(d in url_lower for d in BLOCK_DOMAINS):
703
+ await route.abort()
704
+ return
705
+ await route.continue_()
706
+
707
+ await page.route("**/*", block_resources)
708
+
709
  print(f"[Scraper] => {url} platform={platform}")
710
  wait_until = "load" if platform in ("Devfolio", "MLH") else "domcontentloaded"
711
  try:
 
714
  if "Timeout" not in str(e): raise
715
  print("[Scraper] goto timeout, proceeding anyway")
716
 
717
+ # Reduced wait times — blocking assets means pages settle faster
718
+ wait_map = {"Unstop": 6, "DoraHacks": 6, "Devfolio": 5, "MLH": 3}
719
+ wait_sec = wait_map.get(platform, 4)
720
  print(f"[Scraper] Waiting {wait_sec}s for JS...")
721
  await page.wait_for_timeout(wait_sec * 1000)
722
 
723
+ # Light scroll only — no heavy scroll since images are blocked anyway
724
+ for frac in [0.5, 1.0, 0.0]:
725
  try:
726
  await page.evaluate(f"window.scrollTo(0, document.body.scrollHeight * {frac})")
727
  except Exception:
728
  pass
729
+ await asyncio.sleep(0.4)
730
+ await asyncio.sleep(0.5)
731
 
732
  if platform == "Devfolio":
733
  raw = await safe_evaluate(page, DEVFOLIO_SCRIPT, GENERIC_SCRIPT)
 
788
  playwright = await async_playwright().start()
789
  browser = await playwright.chromium.launch(
790
  headless=True,
791
+ args=[
792
+ # ── Security (required for containers) ──────────────────────────
793
+ "--no-sandbox",
794
+ "--disable-setuid-sandbox",
795
+ # ── Memory reduction ─────────────────────────────────────────────
796
+ "--disable-dev-shm-usage", # use /tmp instead of /dev/shm
797
+ "--disable-gpu", # no GPU process (~50MB saved)
798
+ "--no-zygote", # skip zygote process fork
799
+ "--single-process", # single process mode (~150MB saved)
800
+ "--disable-extensions", # no extension processes
801
+ "--disable-background-networking",
802
+ "--disable-background-timer-throttling",
803
+ "--disable-backgrounding-occluded-windows",
804
+ "--disable-breakpad", # no crash reporter
805
+ "--disable-client-side-phishing-detection",
806
+ "--disable-component-update",
807
+ "--disable-default-apps",
808
+ "--disable-domain-reliability",
809
+ "--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process",
810
+ "--disable-hang-monitor",
811
+ "--disable-ipc-flooding-protection",
812
+ "--disable-popup-blocking",
813
+ "--disable-prompt-on-repost",
814
+ "--disable-renderer-backgrounding",
815
+ "--disable-sync",
816
+ "--disable-translate",
817
+ "--metrics-recording-only",
818
+ "--mute-audio",
819
+ "--no-first-run",
820
+ "--safebrowsing-disable-auto-update",
821
+ "--password-store=basic",
822
+ "--use-mock-keychain",
823
+ # ── Reduce per-page memory ────────────────────────────────────────
824
+ "--js-flags=--max-old-space-size=256", # cap JS heap to 256MB
825
+ "--renderer-process-limit=2",
826
+ ],
827
  )
828
+ print("[Scraper] v5.0 ready - memory-optimised Chromium on HuggingFace")
829
 
830
 
831
  @app.on_event("shutdown")