| |
| """ |
| AdultDVDMarketplace Studio Scraper Bot |
| Usage: |
| python main.py --studio-url "URL" --min-price 6 |
| python main.py --studio-name "Studio Name" --min-price 6 |
| python main.py --studio-url "URL" --min-price 6 --format excel --headless |
| python main.py --studio-url "URL" --min-price 6 --max-items 100 --format both |
| """ |
| import argparse |
| import logging |
| import sys |
| import time |
| from pathlib import Path |
|
|
| from config import BotConfig, DEFAULT_CREDENTIALS_FILE, load_credentials, save_credentials |
| from logger_setup import setup_logger |
| from browser_session import BrowserSession |
| from auth_handler import AuthHandler |
| from studio_navigator import StudioNavigator |
| from listing_scraper import ListingScraper |
| from product_scraper import ProductScraper |
| from export_manager import ExportManager |
|
|
|
|
| |
| |
| |
|
|
def _non_negative_int(text: str) -> int:
    """Convert a command-line token to an ``int`` that is zero or greater.

    Used as an argparse ``type=`` callable; raises
    ``argparse.ArgumentTypeError`` so argparse reports a normal usage error
    instead of letting a nonsensical negative value reach the scraper.
    """
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid integer value: {text!r}") from None
    if value < 0:
        raise argparse.ArgumentTypeError(f"must be zero or greater, got {value}")
    return value


def _non_negative_float(text: str) -> float:
    """Convert a command-line token to a finite ``float`` that is zero or greater.

    ``float()`` happily parses ``nan`` and ``inf``; both are rejected here
    because a price threshold must be an ordinary comparable number.
    """
    import math  # function-scope import: keeps the file's import block untouched

    try:
        value = float(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid number: {text!r}") from None
    if not math.isfinite(value) or value < 0:
        raise argparse.ArgumentTypeError(
            f"must be a finite number zero or greater, got {text!r}"
        )
    return value


def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the studio scraper.

    Exactly one of ``--studio-url`` / ``--studio-name`` is required (they are
    mutually exclusive). Numeric options are validated at parse time: negative
    values, and non-finite values for ``--min-price``, are rejected with a
    usage error. Zero is still accepted for every numeric option.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    p = argparse.ArgumentParser(
        description="Scrape studio products from AdultDVDMarketplace",
        # Raw formatter keeps the hand-laid-out examples in the epilog intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Examples:\n"
            ' python main.py --studio-url "https://www.adultdvdmarketplace.com/xcart/'
            'adult_dvd/dvd_search.php?type=studioid&search=1714&order_by=price" --min-price 6\n'
            ' python main.py --studio-name "20/20 Vision" --min-price 6\n'
            ' python main.py --studio-url "..." --min-price 6 --format excel --headless\n'
        ),
    )

    # Credentials: optional on the command line; main() falls back to the
    # credentials file when they are omitted.
    p.add_argument("--username", default=None, help="Website login username")
    p.add_argument("--password", default=None, help="Website login password")
    p.add_argument(
        "--credentials-file",
        default=DEFAULT_CREDENTIALS_FILE,
        # %(default)s keeps the help text in sync with the imported constant.
        help="Local credentials file path (default: %(default)s)",
    )

    # Studio selection: one of the two must be given, never both.
    studio_input = p.add_mutually_exclusive_group(required=True)
    studio_input.add_argument("--studio-url", help="Studio listing page URL")
    studio_input.add_argument(
        "--studio-name", help="Studio name to find from all studios directory"
    )

    p.add_argument(
        "--min-price",
        type=_non_negative_float,
        default=6.0,
        help="Minimum price threshold (default: 6.0)",
    )
    p.add_argument(
        "--format",
        choices=["csv", "excel", "both"],
        default="csv",
        help="Output file format (default: csv)",
    )
    p.add_argument("--output", default=None, help="Output file path (auto-named if omitted)")
    p.add_argument("--headless", action="store_true", help="Run browser headless (no window)")
    p.add_argument(
        "--max-items",
        type=_non_negative_int,
        default=None,
        help="Stop after N items (default: no limit)",
    )
    p.add_argument(
        "--timeout",
        type=_non_negative_int,
        default=30000,
        help="Browser timeout ms (default: 30000)",
    )
    p.add_argument(
        "--retries",
        type=_non_negative_int,
        default=3,
        help="Retries per product page (default: 3)",
    )
    return p
|
|
|
|
| |
| |
| |
|
|
# Pause between product-page requests so the site is not hit back-to-back.
_REQUEST_DELAY_SECONDS = 0.3
# Conventional shell exit status for a run ended by Ctrl-C (128 + SIGINT).
_EXIT_INTERRUPTED = 130


def _resolve_credentials(args: argparse.Namespace, logger: logging.Logger) -> tuple:
    """Return ``(username, password)`` from the CLI or the credentials loader.

    CLI values win when both are given; otherwise ``load_credentials`` is
    asked, which also reports where the values came from. Credentials whose
    source is ``"cli"`` or ``"env"`` are written to the credentials file.

    Exits the process with status 1 when only one of ``--username`` /
    ``--password`` is supplied, or when no complete pair can be found.
    """
    if bool(args.username) ^ bool(args.password):
        logger.error("Provide both --username and --password together")
        sys.exit(1)

    if args.username and args.password:
        username, password, source = args.username, args.password, "cli"
    else:
        username, password, source = load_credentials(args.credentials_file)

    if not username or not password:
        logger.error("Credentials not found. Use --username/--password or set ADULTDVD_USERNAME and ADULTDVD_PASSWORD")
        logger.error(f"Checked credential file: {args.credentials_file}")
        sys.exit(1)

    if source in ("cli", "env"):
        save_credentials(username, password, args.credentials_file)
        logger.info(f"Credentials source: {source} (saved to local credentials file)")
    else:
        logger.info(f"Credentials source: {source}")
    return username, password


def _export_records(config: "BotConfig", records: list, output_paths: list) -> None:
    """Write *records* in the configured format and append each path to *output_paths*."""
    if config.output_format == "both":
        # NOTE(review): the "both" path does not pass config.output_file on;
        # kept as in the original — confirm that save_both is meant to auto-name.
        for fmt, path in ExportManager().save_both(records).items():
            output_paths.append(path)
            print(f" {fmt.upper()} → {path}")
        return

    exporter = ExportManager(
        output_format=config.output_format,
        output_file=config.output_file,
    )
    out_path = exporter.save(records)
    if out_path:
        output_paths.append(out_path)
        print(f"\n Output → {out_path}")


def _save_fallback(
    records: list,
    filename: str,
    message: str,
    output_paths: list,
    logger: logging.Logger,
) -> None:
    """Best-effort CSV dump of whatever was scraped before a run was cut short.

    A failure while saving is logged rather than raised, so the caller's
    cleanup and final summary still run.
    """
    if not records:
        return
    try:
        path = ExportManager(output_format="csv", output_file=filename).save(records)
    except Exception:
        logger.exception(f"Could not write fallback file {filename}")
        return
    if path:
        output_paths.append(path)
    print(f"\n{message} → {path}")


def _print_summary(records: list, listing_scraper, output_paths: list, exit_code: int) -> None:
    """Print the end-of-run statistics block to stdout."""
    with_upc = sum(1 for r in records if r.get("upc"))
    with_error = sum(1 for r in records if r.get("error"))

    # Do not claim completion when the run was interrupted or crashed.
    if exit_code == 0:
        headline = " SCRAPING COMPLETE"
    elif exit_code == _EXIT_INTERRUPTED:
        headline = " SCRAPING INTERRUPTED"
    else:
        headline = " SCRAPING FAILED"

    print()
    print("=" * 50)
    print(headline)
    print("=" * 50)
    # listing_scraper stays None when the run stopped before the listing phase.
    if listing_scraper:
        print(f" Pages scanned : {listing_scraper.pages_scanned}")
        print(f" Price sort ok : {listing_scraper.sort_verified}")
        print(f" Products checked : {listing_scraper.checked}")
        print(f" Skipped below min : {listing_scraper.skipped_below_threshold}")
        print(f" Skipped no price : {listing_scraper.skipped_unknown_price}")
    print(f" Products scraped : {len(records)}")
    print(f" With UPC : {with_upc}")
    print(f" Errors : {with_error}")
    if output_paths:
        # str() so that path-like objects returned by the exporter also join.
        print(f" Output files : {', '.join(str(p) for p in output_paths)}")
    print(" Logs : logs/")
    print("=" * 50)


def main() -> None:
    """Run the scraper end to end from the command line.

    Phases: authenticate, open the studio listing, iterate qualifying
    products, scrape each product page, export the results, then print a
    summary. The browser session is always stopped, even on failure.

    Exit status: returns normally on success; exits 1 on credential,
    authentication or navigation failure and after an unexpected error;
    exits 130 when interrupted with Ctrl-C. On interrupt or error any
    records already scraped are saved to a fallback CSV first.
    """
    args = build_parser().parse_args()

    Path("logs").mkdir(exist_ok=True)
    logger = setup_logger()

    logger.info("=" * 60)
    logger.info("AdultDVDMarketplace Studio Scraper Bot")
    logger.info("=" * 60)
    logger.info(f"Studio URL : {args.studio_url or '-'}")
    logger.info(f"Studio Name : {args.studio_name or '-'}")
    logger.info(f"Min price : ${args.min_price:.2f}")
    logger.info(f"Format : {args.format}")
    logger.info(f"Headless : {args.headless}")

    username, password = _resolve_credentials(args, logger)

    config = BotConfig(
        username=username,
        password=password,
        studio_url=args.studio_url or "",
        min_price=args.min_price,
        output_format=args.format,
        output_file=args.output,
        headless=args.headless,
        max_items=args.max_items,
        timeout=args.timeout,
        retry_count=args.retries,
    )

    session = BrowserSession(
        headless=config.headless,
        state_dir=config.state_dir,
        timeout=config.timeout,
    )

    records = []
    output_paths = []
    listing_scraper = None
    exit_code = 0

    try:
        session.start()

        # ---- Phase 1: authentication --------------------------------------
        logger.info("")
        logger.info("Phase 1 ▶ Authentication")
        auth = AuthHandler(session, config.username, config.password)
        if not auth.ensure_authenticated():
            logger.error("Authentication failed — aborting")
            sys.exit(1)  # SystemExit still passes through ``finally`` below

        # ---- Phase 2: studio navigation -----------------------------------
        logger.info("")
        logger.info("Phase 2 ▶ Studio navigation")
        navigator = StudioNavigator(session)

        if args.studio_url:
            studio_url = config.get_studio_url_with_sort()
            if not navigator.navigate_to_studio_url(studio_url):
                logger.error("Could not open studio page — aborting")
                sys.exit(1)
        else:
            # NOTE(review): a URL found by name is used as returned — it is not
            # passed through get_studio_url_with_sort() and no explicit
            # navigation is issued. Presumably find_studio_by_name handles
            # both; confirm against StudioNavigator.
            studio_url = navigator.find_studio_by_name(args.studio_name)
            if not studio_url:
                logger.error(f"Could not find studio by name: {args.studio_name}")
                sys.exit(1)

        # ---- Phase 3: listing scan ----------------------------------------
        logger.info("")
        logger.info("Phase 3 ▶ Listing scan")
        listing_scraper = ListingScraper(
            session=session,
            min_price=config.min_price,
            max_items=config.max_items,
        )
        product_scraper = ProductScraper(session=session, retry_count=config.retry_count)

        # ---- Phase 4: product detail scraping -----------------------------
        logger.info("")
        logger.info("Phase 4 ▶ Product detail scraping")

        for idx, pinfo in enumerate(listing_scraper.iter_qualifying_products(studio_url), 1):
            title_hint = pinfo.get("title") or pinfo.get("url", "")
            logger.info(f" [{idx}] {title_hint[:80]}")

            record = product_scraper.scrape_product(pinfo["url"])

            # Fall back to the listing-page values when the product page gave
            # nothing; .get() so a record missing a key does not abort the run.
            if not record.get("title") and pinfo.get("title"):
                record["title"] = pinfo["title"]
            if not record.get("price") and pinfo.get("price") is not None:
                record["price"] = f"${pinfo['price']:.2f}"

            records.append(record)
            time.sleep(_REQUEST_DELAY_SECONDS)

        if not records:
            logger.warning(f"No products found at or above ${config.min_price:.2f}")
            print(f"\nNo qualifying products found (min price = ${config.min_price:.2f}).")
        else:
            logger.info(f"Qualifying products: {len(records)}")

        # ---- Phase 5: export ----------------------------------------------
        logger.info("")
        logger.info("Phase 5 ▶ Export")
        _export_records(config, records, output_paths)

    except KeyboardInterrupt:
        exit_code = _EXIT_INTERRUPTED
        logger.info("Interrupted by user")
        _save_fallback(records, "output_partial.csv", "Partial results saved", output_paths, logger)
    except Exception as exc:
        # Top-level boundary: log with traceback, salvage what was scraped.
        exit_code = 1
        logger.error(f"Unexpected error: {exc}", exc_info=True)
        _save_fallback(records, "output_recovery.csv", "Recovery file saved", output_paths, logger)
    finally:
        session.stop()

    _print_summary(records, listing_scraper, output_paths, exit_code)

    # Report failure to the calling shell instead of exiting 0 after an error.
    if exit_code:
        sys.exit(exit_code)
|
|
|
|
if __name__ == "__main__":
    # Run the CLI only when executed as a script, so that importing this
    # module (e.g. to reuse build_parser) has no side effects.
    main()
|
|