abdulsalam2121
Add automation bot with Flask and Playwright
5b29309
Raw
History Blame Contribute Delete
10.6 kB
#!/usr/bin/env python3
"""
AdultDVDMarketplace Studio Scraper Bot
Usage:
python main.py --studio-url "URL" --min-price 6
python main.py --studio-name "Studio Name" --min-price 6
python main.py --studio-url "URL" --min-price 6 --format excel --headless
python main.py --studio-url "URL" --min-price 6 --max-items 100 --format both
"""
import argparse
import logging
import sys
import time
from pathlib import Path
from config import BotConfig, DEFAULT_CREDENTIALS_FILE, load_credentials, save_credentials
from logger_setup import setup_logger
from browser_session import BrowserSession
from auth_handler import AuthHandler
from studio_navigator import StudioNavigator
from listing_scraper import ListingScraper
from product_scraper import ProductScraper
from export_manager import ExportManager
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def _non_negative_int(value: str) -> int:
    """Convert a command-line token to an ``int`` that is zero or greater.

    Intended for use as an argparse ``type=`` callable.

    Args:
        value: Raw argument text as received from the command line.

    Returns:
        The parsed integer.

    Raises:
        argparse.ArgumentTypeError: If *value* is not an integer or is negative.
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid integer value: {value!r}") from None
    if number < 0:
        raise argparse.ArgumentTypeError(f"must be zero or greater, got {number}")
    return number


def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the scraper bot.

    Exactly one of ``--studio-url`` / ``--studio-name`` is required. Default
    values shown in ``--help`` are interpolated from the real defaults via
    ``%(default)s`` so the help text cannot drift from the actual behavior.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    p = argparse.ArgumentParser(
        description="Scrape studio products from AdultDVDMarketplace",
        # Raw formatter keeps the line breaks of the examples in the epilog.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Examples:\n"
            ' python main.py --studio-url "https://www.adultdvdmarketplace.com/xcart/'
            'adult_dvd/dvd_search.php?type=studioid&search=1714&order_by=price" --min-price 6\n'
            ' python main.py --studio-name "20/20 Vision" --min-price 6\n'
            ' python main.py --studio-url "..." --min-price 6 --format excel --headless\n'
        ),
    )

    # --- Credentials -----------------------------------------------------
    p.add_argument("--username", default=None, help="Website login username")
    p.add_argument("--password", default=None, help="Website login password")
    p.add_argument(
        "--credentials-file",
        default=DEFAULT_CREDENTIALS_FILE,
        help="Local credentials file path (default: %(default)s)",
    )

    # --- Studio selection (exactly one is required) ------------------------
    studio_input = p.add_mutually_exclusive_group(required=True)
    studio_input.add_argument("--studio-url", help="Studio listing page URL")
    studio_input.add_argument(
        "--studio-name",
        help="Studio name to find from all studios directory",
    )

    # --- Filtering and output ----------------------------------------------
    p.add_argument(
        "--min-price",
        type=float,
        default=6.0,
        help="Minimum price threshold (default: %(default)s)",
    )
    p.add_argument(
        "--format",
        choices=["csv", "excel", "both"],
        default="csv",
        help="Output file format (default: %(default)s)",
    )
    p.add_argument("--output", default=None, help="Output file path (auto-named if omitted)")

    # --- Browser / run behavior --------------------------------------------
    p.add_argument("--headless", action="store_true", help="Run browser headless (no window)")
    # Negative values make no sense for a count, a timeout or a retry budget,
    # so they are rejected at parse time instead of failing later in the run.
    p.add_argument(
        "--max-items",
        type=_non_negative_int,
        default=None,
        help="Stop after N items (default: no limit)",
    )
    p.add_argument(
        "--timeout",
        type=_non_negative_int,
        default=30000,
        help="Browser timeout ms (default: %(default)s)",
    )
    p.add_argument(
        "--retries",
        type=_non_negative_int,
        default=3,
        help="Retries per product page (default: %(default)s)",
    )
    return p
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def _resolve_credentials(args: argparse.Namespace, logger) -> tuple:
    """Work out which username/password to log in with, or exit.

    ``--username``/``--password`` win when both are given; otherwise
    ``load_credentials`` is asked, using ``args.credentials_file``. When the
    reported source is ``"cli"`` or ``"env"`` the pair is written back with
    ``save_credentials``.

    Args:
        args: Parsed command-line arguments.
        logger: Logger used for status and error messages.

    Returns:
        A ``(username, password)`` tuple.

    Raises:
        SystemExit: With status 1 if only one of the two CLI flags was given,
            or if no complete username/password pair could be obtained.
    """
    # XOR: exactly one of the two flags present is a usage error.
    if bool(args.username) ^ bool(args.password):
        logger.error("Provide both --username and --password together")
        sys.exit(1)

    if args.username and args.password:
        username, password, credential_source = args.username, args.password, "cli"
    else:
        username, password, credential_source = load_credentials(args.credentials_file)

    if not username or not password:
        logger.error("Credentials not found. Use --username/--password or set ADULTDVD_USERNAME and ADULTDVD_PASSWORD")
        logger.error(f"Checked credential file: {args.credentials_file}")
        sys.exit(1)

    if credential_source in ("cli", "env"):
        save_credentials(username, password, args.credentials_file)
        logger.info(f"Credentials source: {credential_source} (saved to local credentials file)")
    else:
        logger.info(f"Credentials source: {credential_source}")
    return username, password


def _print_summary(records, output_paths, listing_scraper) -> None:
    """Print the end-of-run statistics block to stdout.

    Args:
        records: Scraped product records (mappings supporting ``.get``).
        output_paths: Paths of the files written by the export phase.
        listing_scraper: The listing scraper, or ``None`` if the run ended
            before one was created; its counters are printed when present.
    """
    with_upc = sum(1 for r in records if r.get("upc"))
    with_error = sum(1 for r in records if r.get("error"))

    print()
    print("=" * 50)
    print(" SCRAPING COMPLETE")
    print("=" * 50)
    if listing_scraper:
        print(f" Pages scanned : {listing_scraper.pages_scanned}")
        print(f" Price sort ok : {listing_scraper.sort_verified}")
        print(f" Products checked : {listing_scraper.checked}")
        print(f" Skipped below min : {listing_scraper.skipped_below_threshold}")
        print(f" Skipped no price : {listing_scraper.skipped_unknown_price}")
    print(f" Products scraped : {len(records)}")
    print(f" With UPC : {with_upc}")
    print(f" Errors : {with_error}")
    if output_paths:
        # str() so the join also works if the exporter hands back Path objects.
        print(f" Output files : {', '.join(str(p) for p in output_paths)}")
    print(" Logs : logs/")
    print("=" * 50)


def main() -> None:
    """Run the scraper end to end: authenticate, scan, scrape, export.

    Parses the command line, resolves credentials, starts a browser session
    and walks through the phases logged below. Records collected before a
    Ctrl-C or an unexpected error are written to a fallback CSV. The browser
    session is always stopped, and a summary is printed unless the run was
    aborted via ``sys.exit``.

    Raises:
        SystemExit: With status 1 on credential, authentication or studio
            navigation failures.
    """
    args = build_parser().parse_args()
    Path("logs").mkdir(exist_ok=True)
    logger = setup_logger()

    logger.info("=" * 60)
    logger.info("AdultDVDMarketplace Studio Scraper Bot")
    logger.info("=" * 60)
    logger.info(f"Studio URL : {args.studio_url or '-'}")
    logger.info(f"Studio Name : {args.studio_name or '-'}")
    logger.info(f"Min price : ${args.min_price:.2f}")
    logger.info(f"Format : {args.format}")
    logger.info(f"Headless : {args.headless}")

    username, password = _resolve_credentials(args, logger)

    config = BotConfig(
        username=username,
        password=password,
        studio_url=args.studio_url or "",
        min_price=args.min_price,
        output_format=args.format,
        output_file=args.output,
        headless=args.headless,
        max_items=args.max_items,
        timeout=args.timeout,
        retry_count=args.retries,
    )
    session = BrowserSession(
        headless=config.headless,
        state_dir=config.state_dir,
        timeout=config.timeout,
    )

    records = []
    output_paths = []
    listing_scraper = None

    try:
        session.start()

        # ── Phase 1: Authenticate ────────────────────────────────────────
        logger.info("")
        logger.info("Phase 1 ▶ Authentication")
        auth = AuthHandler(session, config.username, config.password)
        if not auth.ensure_authenticated():
            logger.error("Authentication failed — aborting")
            sys.exit(1)

        # ── Phase 2: Navigate to studio ─────────────────────────────────
        logger.info("")
        logger.info("Phase 2 ▶ Studio navigation")
        navigator = StudioNavigator(session)
        if args.studio_url:
            studio_url = config.get_studio_url_with_sort()
            if not navigator.navigate_to_studio_url(studio_url):
                logger.error("Could not open studio page — aborting")
                sys.exit(1)
        else:
            # NOTE(review): a URL found by name is passed straight to the
            # listing scraper without an explicit navigate_to_studio_url()
            # call — confirm this is intended.
            studio_url = navigator.find_studio_by_name(args.studio_name)
            if not studio_url:
                logger.error(f"Could not find studio by name: {args.studio_name}")
                sys.exit(1)

        # ── Phase 3: Scan listing pages ──────────────────────────────────
        logger.info("")
        logger.info("Phase 3 ▶ Listing scan")
        listing_scraper = ListingScraper(
            session=session,
            min_price=config.min_price,
            max_items=config.max_items,
        )
        product_scraper = ProductScraper(session=session, retry_count=config.retry_count)

        # ── Phase 4: Scrape each qualifying product ──────────────────────
        logger.info("")
        logger.info("Phase 4 ▶ Product detail scraping")
        for idx, pinfo in enumerate(listing_scraper.iter_qualifying_products(studio_url), start=1):
            title_hint = pinfo.get("title") or pinfo.get("url", "")
            logger.info(f" [{idx}] {title_hint[:80]}")
            record = product_scraper.scrape_product(pinfo["url"])

            # Supplement blanks from listing data. `.get` keeps a record that
            # lacks a field from aborting the whole run with a KeyError.
            if not record.get("title") and pinfo.get("title"):
                record["title"] = pinfo["title"]
            if not record.get("price") and pinfo.get("price") is not None:
                record["price"] = f"${pinfo['price']:.2f}"

            records.append(record)
            time.sleep(0.3)  # short pause between product pages

        if not records:
            logger.warning(f"No products found at or above ${config.min_price:.2f}")
            print(f"\nNo qualifying products found (min price = ${config.min_price:.2f}).")
        else:
            logger.info(f"Qualifying products: {len(records)}")

        # ── Phase 5: Export ───────────────────────────────────────────────
        logger.info("")
        logger.info("Phase 5 ▶ Export")
        if config.output_format == "both":
            paths = ExportManager().save_both(records)
            for fmt, path in paths.items():
                output_paths.append(path)
                print(f" {fmt.upper()} → {path}")
        else:
            mgr = ExportManager(
                output_format=config.output_format,
                output_file=config.output_file,
            )
            out_path = mgr.save(records)
            if out_path:
                output_paths.append(out_path)
                print(f"\n Output → {out_path}")

    except KeyboardInterrupt:
        logger.info("Interrupted by user")
        if records:
            path = ExportManager(output_format="csv", output_file="output_partial.csv").save(records)
            # Only claim success when the exporter actually returned a path.
            if path:
                print(f"\nPartial results saved → {path}")
            else:
                logger.warning("Partial results file was not written")
    except Exception as exc:
        # Top-level boundary: log with traceback, then salvage what we have.
        logger.error(f"Unexpected error: {exc}", exc_info=True)
        if records:
            path = ExportManager(output_format="csv", output_file="output_recovery.csv").save(records)
            if path:
                print(f"\nRecovery file saved → {path}")
            else:
                logger.warning("Recovery file was not written")
    finally:
        session.stop()

    # ── Summary ───────────────────────────────────────────────────────────
    _print_summary(records, output_paths, listing_scraper)
# Script entry point: run the bot only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()