Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -25,6 +25,10 @@ app = FastAPI(title="HackTrack Scraper", version="5.0.0")
|
|
| 25 |
playwright = None
|
| 26 |
browser = None
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
app.add_middleware(
|
| 29 |
CORSMiddleware,
|
| 30 |
allow_origins=["*"],
|
|
@@ -648,7 +652,7 @@ async def scrape_with_playwright(url: str, platform: str) -> dict:
|
|
| 648 |
if browser is None:
|
| 649 |
return {"scrape_success": False, "error": "Browser not initialized"}
|
| 650 |
|
| 651 |
-
# Unstop: try API first
|
| 652 |
if platform == "Unstop":
|
| 653 |
opp_id = extract_unstop_id(url)
|
| 654 |
print(f"[Unstop] Extracted ID: {opp_id}")
|
|
@@ -661,12 +665,47 @@ async def scrape_with_playwright(url: str, platform: str) -> dict:
|
|
| 661 |
return result
|
| 662 |
print("[Unstop] API failed, falling back to Playwright")
|
| 663 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 664 |
context = await browser.new_context(
|
| 665 |
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
|
| 666 |
-
viewport=
|
|
|
|
|
|
|
|
|
|
|
|
|
| 667 |
)
|
| 668 |
try:
|
| 669 |
page = await context.new_page()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 670 |
print(f"[Scraper] => {url} platform={platform}")
|
| 671 |
wait_until = "load" if platform in ("Devfolio", "MLH") else "domcontentloaded"
|
| 672 |
try:
|
|
@@ -675,18 +714,20 @@ async def scrape_with_playwright(url: str, platform: str) -> dict:
|
|
| 675 |
if "Timeout" not in str(e): raise
|
| 676 |
print("[Scraper] goto timeout, proceeding anyway")
|
| 677 |
|
| 678 |
-
|
| 679 |
-
|
|
|
|
| 680 |
print(f"[Scraper] Waiting {wait_sec}s for JS...")
|
| 681 |
await page.wait_for_timeout(wait_sec * 1000)
|
| 682 |
|
| 683 |
-
|
|
|
|
| 684 |
try:
|
| 685 |
await page.evaluate(f"window.scrollTo(0, document.body.scrollHeight * {frac})")
|
| 686 |
except Exception:
|
| 687 |
pass
|
| 688 |
-
await asyncio.sleep(0.
|
| 689 |
-
await asyncio.sleep(
|
| 690 |
|
| 691 |
if platform == "Devfolio":
|
| 692 |
raw = await safe_evaluate(page, DEVFOLIO_SCRIPT, GENERIC_SCRIPT)
|
|
@@ -747,9 +788,44 @@ async def startup() -> None:
|
|
| 747 |
playwright = await async_playwright().start()
|
| 748 |
browser = await playwright.chromium.launch(
|
| 749 |
headless=True,
|
| 750 |
-
args=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 751 |
)
|
| 752 |
-
print("[Scraper] v5.0 ready -
|
| 753 |
|
| 754 |
|
| 755 |
@app.on_event("shutdown")
|
|
|
|
| 25 |
playwright = None
|
| 26 |
browser = None
|
| 27 |
|
| 28 |
+
# Semaphore: only 1 Playwright scrape at a time on HuggingFace free tier.
|
| 29 |
+
# Prevents two concurrent requests from doubling RAM usage (~1.2GB peak).
|
| 30 |
+
_scrape_sem = asyncio.Semaphore(1)
|
| 31 |
+
|
| 32 |
app.add_middleware(
|
| 33 |
CORSMiddleware,
|
| 34 |
allow_origins=["*"],
|
|
|
|
| 652 |
if browser is None:
|
| 653 |
return {"scrape_success": False, "error": "Browser not initialized"}
|
| 654 |
|
| 655 |
+
# Unstop: try API first — no Playwright needed, saves all memory for this call
|
| 656 |
if platform == "Unstop":
|
| 657 |
opp_id = extract_unstop_id(url)
|
| 658 |
print(f"[Unstop] Extracted ID: {opp_id}")
|
|
|
|
| 665 |
return result
|
| 666 |
print("[Unstop] API failed, falling back to Playwright")
|
| 667 |
|
| 668 |
+
async with _scrape_sem:
|
| 669 |
+
# Only one Playwright scrape runs at a time to stay within HuggingFace RAM limits.
|
| 670 |
+
# Concurrent requests queue here and are processed sequentially.
|
| 671 |
+
print(f"[Scraper] Semaphore acquired for {platform}")
|
| 672 |
+
return await _do_playwright_scrape(url, platform)
|
| 673 |
+
|
| 674 |
+
|
| 675 |
+
async def _do_playwright_scrape(url: str, platform: str) -> dict:
|
| 676 |
+
"""Inner function β runs inside the semaphore."""
|
| 677 |
+
global browser
|
| 678 |
+
|
| 679 |
context = await browser.new_context(
|
| 680 |
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
|
| 681 |
+
# Smaller viewport = less GPU memory for compositing
|
| 682 |
+
viewport={"width": 1280, "height": 800},
|
| 683 |
+
# Explicit context settings: JavaScript stays enabled and CSP is not bypassed (note: these do not block credentials or service workers)
|
| 684 |
+
java_script_enabled=True,
|
| 685 |
+
bypass_csp=False,
|
| 686 |
)
|
| 687 |
try:
|
| 688 |
page = await context.new_page()
|
| 689 |
+
|
| 690 |
+
# Block images, fonts, media, and tracking — saves 30-60% of page RAM
|
| 691 |
+
# We only need DOM text and __NEXT_DATA__, not rendered assets
|
| 692 |
+
async def block_resources(route, request):
|
| 693 |
+
BLOCK_TYPES = {"image", "media", "font", "stylesheet", "other",
|
| 694 |
+
"ping", "websocket"}
|
| 695 |
+
BLOCK_DOMAINS = {"google-analytics", "googletagmanager", "facebook",
|
| 696 |
+
"hotjar", "intercom", "amplitude", "segment",
|
| 697 |
+
"cloudflare.com/beacon", "sentry.io"}
|
| 698 |
+
if request.resource_type in BLOCK_TYPES:
|
| 699 |
+
await route.abort()
|
| 700 |
+
return
|
| 701 |
+
url_lower = request.url.lower()
|
| 702 |
+
if any(d in url_lower for d in BLOCK_DOMAINS):
|
| 703 |
+
await route.abort()
|
| 704 |
+
return
|
| 705 |
+
await route.continue_()
|
| 706 |
+
|
| 707 |
+
await page.route("**/*", block_resources)
|
| 708 |
+
|
| 709 |
print(f"[Scraper] => {url} platform={platform}")
|
| 710 |
wait_until = "load" if platform in ("Devfolio", "MLH") else "domcontentloaded"
|
| 711 |
try:
|
|
|
|
| 714 |
if "Timeout" not in str(e): raise
|
| 715 |
print("[Scraper] goto timeout, proceeding anyway")
|
| 716 |
|
| 717 |
+
# Reduced wait times — blocking assets means pages settle faster
|
| 718 |
+
wait_map = {"Unstop": 6, "DoraHacks": 6, "Devfolio": 5, "MLH": 3}
|
| 719 |
+
wait_sec = wait_map.get(platform, 4)
|
| 720 |
print(f"[Scraper] Waiting {wait_sec}s for JS...")
|
| 721 |
await page.wait_for_timeout(wait_sec * 1000)
|
| 722 |
|
| 723 |
+
# Light scroll only — no heavy scroll since images are blocked anyway
|
| 724 |
+
for frac in [0.5, 1.0, 0.0]:
|
| 725 |
try:
|
| 726 |
await page.evaluate(f"window.scrollTo(0, document.body.scrollHeight * {frac})")
|
| 727 |
except Exception:
|
| 728 |
pass
|
| 729 |
+
await asyncio.sleep(0.4)
|
| 730 |
+
await asyncio.sleep(0.5)
|
| 731 |
|
| 732 |
if platform == "Devfolio":
|
| 733 |
raw = await safe_evaluate(page, DEVFOLIO_SCRIPT, GENERIC_SCRIPT)
|
|
|
|
| 788 |
playwright = await async_playwright().start()
|
| 789 |
browser = await playwright.chromium.launch(
|
| 790 |
headless=True,
|
| 791 |
+
args=[
|
| 792 |
+
# ── Security (required for containers) ──────────────────────────
|
| 793 |
+
"--no-sandbox",
|
| 794 |
+
"--disable-setuid-sandbox",
|
| 795 |
+
# ── Memory reduction ─────────────────────────────────────────────
|
| 796 |
+
"--disable-dev-shm-usage", # use /tmp instead of /dev/shm
|
| 797 |
+
"--disable-gpu", # no GPU process (~50MB saved)
|
| 798 |
+
"--no-zygote", # skip zygote process fork
|
| 799 |
+
"--single-process", # single process mode (~150MB saved)
|
| 800 |
+
"--disable-extensions", # no extension processes
|
| 801 |
+
"--disable-background-networking",
|
| 802 |
+
"--disable-background-timer-throttling",
|
| 803 |
+
"--disable-backgrounding-occluded-windows",
|
| 804 |
+
"--disable-breakpad", # no crash reporter
|
| 805 |
+
"--disable-client-side-phishing-detection",
|
| 806 |
+
"--disable-component-update",
|
| 807 |
+
"--disable-default-apps",
|
| 808 |
+
"--disable-domain-reliability",
|
| 809 |
+
"--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process",
|
| 810 |
+
"--disable-hang-monitor",
|
| 811 |
+
"--disable-ipc-flooding-protection",
|
| 812 |
+
"--disable-popup-blocking",
|
| 813 |
+
"--disable-prompt-on-repost",
|
| 814 |
+
"--disable-renderer-backgrounding",
|
| 815 |
+
"--disable-sync",
|
| 816 |
+
"--disable-translate",
|
| 817 |
+
"--metrics-recording-only",
|
| 818 |
+
"--mute-audio",
|
| 819 |
+
"--no-first-run",
|
| 820 |
+
"--safebrowsing-disable-auto-update",
|
| 821 |
+
"--password-store=basic",
|
| 822 |
+
"--use-mock-keychain",
|
| 823 |
+
# ── Reduce per-page memory ────────────────────────────────────────
|
| 824 |
+
"--js-flags=--max-old-space-size=256", # cap JS heap to 256MB
|
| 825 |
+
"--renderer-process-limit=2",
|
| 826 |
+
],
|
| 827 |
)
|
| 828 |
+
print("[Scraper] v5.0 ready - memory-optimised Chromium on HuggingFace")
|
| 829 |
|
| 830 |
|
| 831 |
@app.on_event("shutdown")
|