# booth-pic-api/backend/scraper/scrape_robust.py
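"""Robust crawler for Booth search pages sorted by popularity.

Phase 1 collects item URLs (with like counts) from search results via
Playwright; Phase 2 hands each URL to the inherited ``process_item``.
The parent ``BoothScraper`` is assumed to expose ``headers``, ``base_url``,
``processed_urls``, and ``process_item(url, page=None, likes=None)``.
"""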
import logging
import re
import time
from urllib.parse import urljoin

from playwright.sync_api import sync_playwright

from booth_scraper import BoothScraper
# Reuse BoothScraper logic for metadata saving and R2 upload
class RobustBoothScraper(BoothScraper):
def crawl_target_popular(self, target_url, min_likes=1000, max_pages=10):
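        """Crawl up to ``max_pages`` popularity-sorted search pages.

        Args:
            target_url: Booth search URL; ``page=N`` is appended per request.
            min_likes: Skip items whose parsed like count is below this value.
            max_pages: Maximum number of search-result pages to fetch.
        """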
logging.info(f"--- [RobustScraper] Starting crawl: {target_url} ---")
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
context = browser.new_context(user_agent=self.headers["User-Agent"])
all_target_urls = []
# --- Phase 1: Collect Item URLs from Search Pages ---
for page_num in range(1, max_pages + 1):
separator = "&" if "?" in target_url else "?"
url = f"{target_url}{separator}page={page_num}"
logging.info(f"Fetching search results page {page_num}: {url}")
page = context.new_page()
try:
page.goto(url, wait_until="networkidle", timeout=60000)
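                    # Brief settle pause for content that may render after networkidle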
page.wait_for_timeout(2000)
cards = page.locator(".item-card")
card_count = cards.count()
if card_count == 0:
logging.warning("No more items found.")
page.close()
break
found_on_page = 0
for i in range(card_count):
card = cards.nth(i)
                        # Parse the like count (best-effort; defaults to 0)
                        likes = 0
                        try:
                            # Known like-count selectors on Booth search cards
                            likes_loc = card.locator(
                                ".item-card__likes, .js-wishlist-count, .shop__text--link50"
                            ).first
                            if likes_loc.count() > 0:
                                nums = re.findall(r"[\d,]+", likes_loc.inner_text())
                                if nums:
                                    likes = int(nums[0].replace(",", ""))
                        except Exception:
                            pass
                        if likes < min_likes:
                            # Under popularity sorting, low counts suggest we're past
                            # the threshold, but skip rather than break in case the
                            # sort order isn't strict.
                            continue
# Try multiple common link selectors
link_loc = card.locator(".item-card__title-anchor--multiline, .item-card__title a, a.item-card__title").first
if link_loc.count() == 0:
# Fallback: any link inside the card that looks like an item link
link_loc = card.locator("a[href*='/items/']").first
if link_loc.count() > 0:
item_url = link_loc.get_attribute("href")
                            if item_url:
                                if not item_url.startswith("http"):
                                    item_url = urljoin(self.base_url, item_url)
                                # Queue every match, even previously processed URLs, so
                                # the targeted crawl refreshes likes metadata and images
                                all_target_urls.append((item_url, likes))
found_on_page += 1
logging.info(f" - Found: {item_url} ({likes} likes)")
else:
logging.warning(f" - Could not find link for card {i}")
logging.info(f"Page {page_num}: Found {found_on_page} items with >= {min_likes} likes.")
page.close()
                    if found_on_page == 0 and page_num > 1:
                        # An empty page under popularity sorting suggests we've run
                        # out of items meeting the threshold.
                        logging.info(f"Likely reached the end of items with >= {min_likes} likes (popularity sorted).")
                        # break  # Optional: uncomment if the sort order is trusted
                except Exception as e:
                    logging.error(f"Error on search page {page_num}: {e}")
                    if not page.is_closed():
                        page.close()
browser.close()
# --- Phase 2: Process Each Item Individually ---
logging.info(f"Total NEW items to process: {len(all_target_urls)}")
for i, (item_url, likes) in enumerate(all_target_urls):
logging.info(f"[{i+1}/{len(all_target_urls)}] Processing: {item_url} ({likes} likes)")
            try:
                # Process the detail page with requests (page=None): launching a
                # fresh browser per item avoids TargetClosed errors, and Booth
                # detail pages are usually static enough to parse with BeautifulSoup.
                self.process_item(item_url, page=None, likes=likes)
                self.processed_urls.add(item_url)
                time.sleep(1)  # Be polite to the server
except Exception as ex:
logging.error(f"Failed to process {item_url}: {ex}")
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
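    # Requires Playwright's Chromium to be installed (`python -m playwright install chromium`)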
scraper = RobustBoothScraper(data_dir="data")
target = "https://booth.pm/ja/browse/3D%E8%A1%A3%E8%A3%85?tags%5B%5D=VRChat&adult=include&sort=popular"
scraper.crawl_target_popular(target, min_likes=1000, max_pages=30) # Scrape up to 30 search pages