from __future__ import annotations from io import StringIO import csv import json from datetime import datetime, timezone from pathlib import Path from typing import Optional import requests from bs4 import BeautifulSoup from fastapi import FastAPI, HTTPException from fastapi.responses import StreamingResponse from pydantic import BaseModel, Field from urllib.parse import urlencode app = FastAPI(title="Nike Scraper API", version="1.0.0") NIKE_BASE_SEARCH = "https://www.nike.com/w" NIKE_BASE_URL = "https://www.nike.com" CATEGORY_ALIASES = { "t-shirt": "t-shirt", "tee": "t-shirt", "shirt": "shirt", "hoodie": "hoodie", "sweatshirt": "sweatshirt", "jacket": "jacket", "gilet": "gilet", "top": "top", "tank": "tank top", "polo": "polo", "jersey": "jersey", "bra": "sports bra", "pant": "pants", "pants": "pants", "trousers": "trousers", "shorts": "shorts", "short": "shorts", "leggings": "leggings", "tights": "tights", "joggers": "joggers", "sweatpants": "sweatpants", "skirt": "skirt", "dress": "dress", "tracksuit": "tracksuit", "jumpsuit": "jumpsuit", "socks": "socks", "sock": "socks", "hat": "hat", "cap": "cap", "bag": "bag", "backpack": "backpack", } HEADERS = { "User-Agent": ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/123.0.0.0 Safari/537.36" ) } CATEGORIES = [ "sweaters", "hoodies", "t-shirts", "jackets", "shirts", "crews", "jerseys", "tops", "polos", "tanks", "compression", "baselayer", "jeans", "shorts", "skirts", "tights", "parkas", "gilets", "pants", "leggings", "trousers", "joggers", "sweatpants", "dresses", "rompers", "jumpsuits", "onesies", "overalls", "tracksuits", "sneakers", "slippers", "sunglasses", "bras", "socks", "hats", "bags", "backpacks", ] SCRAPE_OUTPUT_DIR = Path(__file__).resolve().parent / "scraped_json" class Recommendation(BaseModel): color: str = Field(..., min_length=1) category: str = Field(..., min_length=1) gender: Optional[str] = Field(default=None, description="men or women") class ScrapeRequest(BaseModel): recommendation: Recommendation max_products: int = Field(default=30, ge=1, le=300) def _ensure_full_url(href: str) -> str: if href.startswith("/"): return f"{NIKE_BASE_URL}{href}" return href def build_nike_search_url(color: str, category: str, gender: Optional[str] = None) -> str: category_normalized = CATEGORY_ALIASES.get(category.lower(), category.lower()) parts: list[str] = [] if gender: parts.append(gender.lower() + "s") parts.append(color.lower()) parts.append(category_normalized) query = " ".join(parts) params = urlencode({"q": query, "vst": query}) return f"{NIKE_BASE_SEARCH}?{params}" def build_nike_urls_from_recommendation(recommendation: Recommendation) -> list[str]: color = recommendation.color category = recommendation.category gender = recommendation.gender if gender: return [build_nike_search_url(color, category, gender)] return [ build_nike_search_url(color, category, "men"), build_nike_search_url(color, category, "women"), build_nike_search_url(color, category), ] def build_search_urls_from_recommendation(recommendation: Recommendation, store: str = "nike") -> list[str]: return build_nike_urls_from_recommendation(recommendation) def build_search_urls_from_query(query: str, store: str = "nike", gender: Optional[str] = None) -> list[str]: normalized_query = str(query or "").strip() if not normalized_query: return [] def _normalize_prefixed_query(prefix: str, value: str) -> str: lowered = value.strip().lower() p = prefix.strip().lower() if lowered.startswith(f"{p} "): return value.strip() return f"{prefix} {value}".strip() if gender: q = _normalize_prefixed_query(gender, normalized_query) return [f"{NIKE_BASE_SEARCH}?{urlencode({'q': q, 'vst': q})}"] return [ f"{NIKE_BASE_SEARCH}?{urlencode({'q': f'men {normalized_query}'.strip(), 'vst': f'men {normalized_query}'.strip()})}", f"{NIKE_BASE_SEARCH}?{urlencode({'q': f'women {normalized_query}'.strip(), 'vst': f'women {normalized_query}'.strip()})}", f"{NIKE_BASE_SEARCH}?{urlencode({'q': normalized_query, 'vst': normalized_query})}", ] def _get_soup(url: str) -> BeautifulSoup: response = requests.get(url, headers=HEADERS, timeout=20) response.raise_for_status() return BeautifulSoup(response.content, "lxml") def _ensure_store_url(href: str, base_url: str) -> str: if not href: return "" if href.startswith("//"): return f"https:{href}" if href.startswith("/"): return f"{base_url}{href}" return href def extract_product_urls(search_url: str) -> list[str]: soup = _get_soup(search_url) product_links: list[str] = [] anchors = soup.find_all("a", {"class": "product-card__link-overlay"}) for anchor in anchors: href = anchor.get("href") if href: full = _ensure_full_url(href) if full not in product_links: product_links.append(full) if not product_links: all_anchors = soup.find_all("a", href=True) for anchor in all_anchors: href = anchor.get("href") if href and "/t/" in href: full = _ensure_full_url(href) if full not in product_links: product_links.append(full) return product_links def _extract_image_from_container(container: BeautifulSoup) -> str: img = container.find("img") if not img: return "" return str(img.get("src") or img.get("data-src") or img.get("srcset") or "").strip() def extract_product_summaries(search_url: str, store: str = "nike") -> list[dict[str, str]]: soup = _get_soup(search_url) summaries: list[dict[str, str]] = [] seen_links: set[str] = set() containers = soup.find_all("div", {"class": "product-card__body"}) for container in containers: anchor = container.find("a", {"class": "product-card__link-overlay"}) if not anchor: continue href = anchor.get("href") if not href: continue item_link = _ensure_full_url(href) if item_link in seen_links: continue seen_links.add(item_link) title = get_title(container) current_price, _ = get_prices(container) image_url = _extract_image_from_container(container.parent if container.parent else container) summaries.append( { "item_link": item_link, "name": title, "price": current_price, "image_url": image_url, } ) if summaries: return summaries # Fallback path when Nike card markup changes. for item_link in extract_product_urls(search_url): if item_link in seen_links: continue seen_links.add(item_link) summaries.append( { "item_link": item_link, "name": "N/A", "price": "N/A", "image_url": "", } ) return summaries def get_title(container: BeautifulSoup) -> str: try: title = container.find_all("div", {"class": "product-card__title"})[0].text subtitle = container.find_all("div", {"class": "product-card__subtitle"})[0].text return f"{title} {subtitle}".strip() except (IndexError, AttributeError): return "N/A" def get_target_gender(title: str) -> str: if "Men's" in title: return "Men" if "Women's" in title: return "Women" return "Unisex" def get_subcategory(title: str) -> str: for word in title.split(" "): candidate = word.lower().strip(",.") if candidate in CATEGORIES or (candidate + "s") in CATEGORIES: return word return "" def get_prices(container: BeautifulSoup) -> tuple[str, str]: try: price_container = container.find_all("div", {"class": "product-price__wrapper"}) current_price = price_container[0].text old_price = "N/A" if current_price.count("$") == 2: prices = current_price.split("$") current_price = "$" + prices[1] if "." in prices[1] else "$" + prices[1] + ".00" old_price = "$" + prices[2] if "." in prices[2] else "$" + prices[2] + ".00" elif "." not in current_price: current_price = current_price + ".00" except (IndexError, AttributeError): current_price, old_price = "N/A", "N/A" return current_price, old_price def get_item_image_link(item_soup: BeautifulSoup) -> str: try: img = item_soup.find("img", {"class": "css-viwop1 u-full-width u-full-height css-m5dkrx"}) return img.get("src") if img else "Click on item link for pictures." except (IndexError, AttributeError): return "Click on item link for pictures." def get_colors(item_soup: BeautifulSoup) -> str: try: current = item_soup.find_all( "div", { "class": "colorway-product-overlay colorway-product-overlay--active " "colorway-product-overlay--selected css-sa2cc9" }, ) if current: colors = current[0].find_all("img", alt=True)[0].get("alt") for color in item_soup.find_all("div", {"class": "colorway-product-overlay css-sa2cc9"}): alt = color.find_all("img", alt=True)[0].get("alt") if alt != "Design your own Nike By You product": colors += " || " + alt else: color_li = item_soup.find_all("li", {"class": "description-preview__color-description ncss-li"}) colors = str(color_li).split(": ")[1].replace("]", "") except (IndexError, AttributeError): colors = "Click on item link for available colors." return colors def scrape_products(search_urls: list[str], max_products: int) -> list[dict[str, str]]: items: list[dict[str, str]] = [] seen_links: set[str] = set() for link in search_urls: soup = _get_soup(link) containers = soup.find_all("div", {"class": "product-card__body"}) for container in containers: if len(items) >= max_products: return items anchor = container.find("a", {"class": "product-card__link-overlay"}) if not anchor: continue href = anchor.get("href") if not href: continue item_link = _ensure_full_url(href) if item_link in seen_links: continue seen_links.add(item_link) title = get_title(container) gender = get_target_gender(title) current_price, old_price = get_prices(container) subcategory = get_subcategory(title) image_link = "Click on item link for pictures." colors = "Click on item link for available colors." try: item_soup = _get_soup(item_link) image_link = get_item_image_link(item_soup) colors = get_colors(item_soup) except requests.RequestException: pass items.append( { "name": title, "gender": gender, "price": current_price, "sale_price": old_price, "colors": colors, "item_link": item_link, "image_link": image_link, "subcategory": subcategory, "brand": "Nike", } ) return items def _build_csv(products: list[dict[str, str]]) -> str: output = StringIO() writer = csv.DictWriter( output, fieldnames=[ "name", "gender", "price", "sale_price", "colors", "item_link", "image_link", "subcategory", "brand", ], ) writer.writeheader() writer.writerows(products) return output.getvalue() def _save_json_payload(prefix: str, payload: dict[str, object]) -> str: SCRAPE_OUTPUT_DIR.mkdir(parents=True, exist_ok=True) ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") filename = f"{prefix}_{ts}.json" file_path = SCRAPE_OUTPUT_DIR / filename with file_path.open("w", encoding="utf-8") as f: json.dump(payload, f, ensure_ascii=True, indent=2) return str(file_path) @app.get("/health") def health() -> dict[str, str]: return {"status": "ok"} @app.get("/") def root() -> dict[str, str]: return { "message": "Nike Scraper API is running.", "docs": "/docs", "health": "/health", } @app.post("/search-urls") def search_urls(payload: Recommendation) -> dict[str, list[str]]: return {"search_urls": build_nike_urls_from_recommendation(payload)} @app.post("/product-urls") def product_urls(payload: Recommendation) -> dict[str, object]: try: urls = build_nike_urls_from_recommendation(payload) all_products: list[dict[str, str]] = [] seen_links: set[str] = set() for url in urls: for product in extract_product_summaries(url): link = product.get("item_link", "") if not link or link in seen_links: continue seen_links.add(link) all_products.append(product) response_payload: dict[str, object] = { "product_urls": [item["item_link"] for item in all_products], "products": all_products, } response_payload["saved_json_path"] = _save_json_payload("product_urls", response_payload) return response_payload except requests.RequestException as exc: raise HTTPException(status_code=502, detail=f"Failed to fetch Nike pages: {exc}") from exc @app.post("/scrape") def scrape(payload: ScrapeRequest) -> dict[str, object]: try: search_urls = build_nike_urls_from_recommendation(payload.recommendation) products = scrape_products(search_urls, max_products=payload.max_products) except requests.RequestException as exc: raise HTTPException(status_code=502, detail=f"Failed to fetch Nike pages: {exc}") from exc response_payload: dict[str, object] = { "search_urls": search_urls, "count": len(products), "products": products, } return response_payload @app.post("/scrape.csv") def scrape_csv(payload: ScrapeRequest) -> StreamingResponse: try: search_urls = build_nike_urls_from_recommendation(payload.recommendation) products = scrape_products(search_urls, max_products=payload.max_products) except requests.RequestException as exc: raise HTTPException(status_code=502, detail=f"Failed to fetch Nike pages: {exc}") from exc csv_content = _build_csv(products) filename = ( f"nike_{payload.recommendation.gender or 'unisex'}_" f"{payload.recommendation.color}_{payload.recommendation.category}.csv" ) return StreamingResponse( iter([csv_content]), media_type="text/csv", headers={"Content-Disposition": f"attachment; filename={filename}"}, )