# Nike scraper service: builds Nike search URLs from clothing recommendations
# and scrapes product data via requests + BeautifulSoup, exposed over FastAPI.
| from __future__ import annotations | |
| from io import StringIO | |
| import csv | |
| import json | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| from typing import Optional | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from fastapi import FastAPI, HTTPException | |
| from fastapi.responses import StreamingResponse | |
| from pydantic import BaseModel, Field | |
| from urllib.parse import urlencode | |
# FastAPI application instance.
# NOTE(review): no @app.get/@app.post decorators are visible on the handler
# functions below — confirm route registration wasn't lost in an export/reformat.
app = FastAPI(title="Nike Scraper API", version="1.0.0")

# Nike endpoints: /w is the search/browse page; the bare domain is used to
# resolve site-relative product links into absolute URLs.
NIKE_BASE_SEARCH = "https://www.nike.com/w"
NIKE_BASE_URL = "https://www.nike.com"

# Maps user-supplied category words to the search term used in the query
# (mostly identity; synonyms like "tee" -> "t-shirt" are folded together).
CATEGORY_ALIASES = {
    "t-shirt": "t-shirt",
    "tee": "t-shirt",
    "shirt": "shirt",
    "hoodie": "hoodie",
    "sweatshirt": "sweatshirt",
    "jacket": "jacket",
    "gilet": "gilet",
    "top": "top",
    "tank": "tank top",
    "polo": "polo",
    "jersey": "jersey",
    "bra": "sports bra",
    "pant": "pants",
    "pants": "pants",
    "trousers": "trousers",
    "shorts": "shorts",
    "short": "shorts",
    "leggings": "leggings",
    "tights": "tights",
    "joggers": "joggers",
    "sweatpants": "sweatpants",
    "skirt": "skirt",
    "dress": "dress",
    "tracksuit": "tracksuit",
    "jumpsuit": "jumpsuit",
    "socks": "socks",
    "sock": "socks",
    "hat": "hat",
    "cap": "cap",
    "bag": "bag",
    "backpack": "backpack",
}

# Browser-like User-Agent so nike.com serves the regular HTML markup
# instead of a bot response.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    )
}

# Known apparel/accessory words (plural form) used by get_subcategory() to
# tag a product from its title.
CATEGORIES = [
    "sweaters",
    "hoodies",
    "t-shirts",
    "jackets",
    "shirts",
    "crews",
    "jerseys",
    "tops",
    "polos",
    "tanks",
    "compression",
    "baselayer",
    "jeans",
    "shorts",
    "skirts",
    "tights",
    "parkas",
    "gilets",
    "pants",
    "leggings",
    "trousers",
    "joggers",
    "sweatpants",
    "dresses",
    "rompers",
    "jumpsuits",
    "onesies",
    "overalls",
    "tracksuits",
    "sneakers",
    "slippers",
    "sunglasses",
    "bras",
    "socks",
    "hats",
    "bags",
    "backpacks",
]

# Directory (next to this file) where scrape payloads are persisted as JSON.
SCRAPE_OUTPUT_DIR = Path(__file__).resolve().parent / "scraped_json"
class Recommendation(BaseModel):
    """A clothing recommendation to turn into Nike search queries."""

    color: str = Field(..., min_length=1)  # e.g. "black"
    category: str = Field(..., min_length=1)  # e.g. "hoodie"; normalized via CATEGORY_ALIASES
    gender: Optional[str] = Field(default=None, description="men or women")
class ScrapeRequest(BaseModel):
    """Request body for the scrape endpoints."""

    recommendation: Recommendation
    # Hard cap on products collected across all generated search URLs.
    max_products: int = Field(default=30, ge=1, le=300)
def _ensure_full_url(href: str) -> str:
    """Resolve a site-relative href against the Nike base URL.

    Absolute URLs pass through untouched.
    """
    return f"{NIKE_BASE_URL}{href}" if href.startswith("/") else href
def build_nike_search_url(color: str, category: str, gender: Optional[str] = None) -> str:
    """Build a Nike search URL for a color/category query, optionally gender-scoped.

    The category word is normalized through CATEGORY_ALIASES (e.g. "tee" ->
    "t-shirt"); unknown categories are used lowercased as-is.
    """
    normalized_category = CATEGORY_ALIASES.get(category.lower(), category.lower())
    terms = []
    if gender:
        # Nike search responds to plural gender terms ("mens"/"womens").
        terms.append(f"{gender.lower()}s")
    terms.extend([color.lower(), normalized_category])
    query = " ".join(terms)
    return f"{NIKE_BASE_SEARCH}?{urlencode({'q': query, 'vst': query})}"
def build_nike_urls_from_recommendation(recommendation: Recommendation) -> list[str]:
    """Return the search URL(s) to scrape for a recommendation.

    With an explicit gender a single scoped URL is returned; otherwise the
    men's, women's, and un-scoped variants are returned in that order.
    """
    if recommendation.gender:
        return [
            build_nike_search_url(
                recommendation.color, recommendation.category, recommendation.gender
            )
        ]
    return [
        build_nike_search_url(recommendation.color, recommendation.category, scope)
        for scope in ("men", "women", None)
    ]
def build_search_urls_from_recommendation(recommendation: Recommendation, store: str = "nike") -> list[str]:
    """Store-dispatch wrapper; only Nike is supported, so `store` is currently ignored."""
    return build_nike_urls_from_recommendation(recommendation)
def build_search_urls_from_query(query: str, store: str = "nike", gender: Optional[str] = None) -> list[str]:
    """Turn a free-text query into Nike search URL(s).

    With a gender, a single gender-prefixed URL is returned, avoiding a double
    prefix when the query already starts with the gender word. Without one,
    three URLs are returned: "men ..."- and "women ..."-prefixed (always
    prefixed, matching the original behavior) plus the raw query.
    `store` is currently ignored (only Nike is supported).
    """
    text = str(query or "").strip()
    if not text:
        return []

    def _url_for(q: str) -> str:
        return f"{NIKE_BASE_SEARCH}?{urlencode({'q': q, 'vst': q})}"

    def _with_prefix(prefix: str, value: str) -> str:
        # Avoid "men men shoes" when the caller already prefixed the query.
        if value.strip().lower().startswith(f"{prefix.strip().lower()} "):
            return value.strip()
        return f"{prefix} {value}".strip()

    if gender:
        return [_url_for(_with_prefix(gender, text))]
    return [
        _url_for(f"men {text}".strip()),
        _url_for(f"women {text}".strip()),
        _url_for(text),
    ]
def _get_soup(url: str) -> BeautifulSoup:
    """GET a page with browser-like headers and parse it with lxml.

    Raises requests exceptions (incl. HTTPError via raise_for_status) on
    network failure or non-2xx responses.
    """
    resp = requests.get(url, headers=HEADERS, timeout=20)
    resp.raise_for_status()
    return BeautifulSoup(resp.content, "lxml")
| def _ensure_store_url(href: str, base_url: str) -> str: | |
| if not href: | |
| return "" | |
| if href.startswith("//"): | |
| return f"https:{href}" | |
| if href.startswith("/"): | |
| return f"{base_url}{href}" | |
| return href | |
def extract_product_urls(search_url: str) -> list[str]:
    """Collect unique product-page URLs from a Nike search results page.

    Prefers the product-card overlay anchors; when the page yields none
    (markup change, bot wall), falls back to any anchor whose href contains
    "/t/" — Nike's product-page path segment.
    """
    soup = _get_soup(search_url)
    links: list[str] = []
    seen: set[str] = set()

    def _collect(anchors, keep) -> None:
        for anchor in anchors:
            href = anchor.get("href")
            if href and keep(href):
                full = _ensure_full_url(href)
                if full not in seen:
                    seen.add(full)
                    links.append(full)

    _collect(soup.find_all("a", {"class": "product-card__link-overlay"}), lambda _h: True)
    if not links:
        _collect(soup.find_all("a", href=True), lambda h: "/t/" in h)
    return links
| def _extract_image_from_container(container: BeautifulSoup) -> str: | |
| img = container.find("img") | |
| if not img: | |
| return "" | |
| return str(img.get("src") or img.get("data-src") or img.get("srcset") or "").strip() | |
def extract_product_summaries(search_url: str, store: str = "nike") -> list[dict[str, str]]:
    """Scrape lightweight product summaries (link, name, price, image) from a search page.

    Parses Nike product-card containers first; when none parse (markup drift),
    falls back to bare product URLs with "N/A" placeholder fields.
    `store` is currently ignored (only Nike is supported).
    """
    soup = _get_soup(search_url)
    results: list[dict[str, str]] = []
    seen: set[str] = set()

    for card in soup.find_all("div", {"class": "product-card__body"}):
        overlay = card.find("a", {"class": "product-card__link-overlay"})
        href = overlay.get("href") if overlay else None
        if not href:
            continue
        link = _ensure_full_url(href)
        if link in seen:
            continue
        seen.add(link)
        current_price, _old = get_prices(card)
        # The <img> usually sits outside the card body, so search its parent.
        image_source = card.parent if card.parent else card
        results.append(
            {
                "item_link": link,
                "name": get_title(card),
                "price": current_price,
                "image_url": _extract_image_from_container(image_source),
            }
        )

    if results:
        return results

    # Fallback path when Nike card markup changes: links with placeholders.
    for link in extract_product_urls(search_url):
        if link in seen:
            continue
        seen.add(link)
        results.append({"item_link": link, "name": "N/A", "price": "N/A", "image_url": ""})
    return results
def get_title(container: BeautifulSoup) -> str:
    """Join a card's title and subtitle; "N/A" when either element is missing."""
    try:
        parts = [
            container.find_all("div", {"class": css})[0].text
            for css in ("product-card__title", "product-card__subtitle")
        ]
        return " ".join(parts).strip()
    except (IndexError, AttributeError):
        return "N/A"
def get_target_gender(title: str) -> str:
    """Infer the target gender from a product title; defaults to "Unisex".

    "Men's" is checked first; the check is case-sensitive, so it does not
    accidentally match the tail of "Women's".
    """
    for marker, label in (("Men's", "Men"), ("Women's", "Women")):
        if marker in title:
            return label
    return "Unisex"
def get_subcategory(title: str) -> str:
    """Return the first title word whose singular or plural form is a known category."""
    for token in title.split(" "):
        normalized = token.lower().strip(",.")
        if normalized in CATEGORIES or f"{normalized}s" in CATEGORIES:
            # Return the original token so the title's capitalization survives.
            return token
    return ""
def get_prices(container: BeautifulSoup) -> tuple[str, str]:
    """Extract (current_price, old_price) from a product card.

    Wrapper text with two "$" signs indicates a sale (current price plus the
    struck-through original). Prices missing a decimal part get ".00"
    appended. Returns ("N/A", "N/A") when the wrapper is absent or malformed.
    """

    def _pad(raw: str) -> str:
        # Ensure a cents part: "50" -> "$50.00"; "50.5" stays "$50.5".
        return "$" + raw if "." in raw else "$" + raw + ".00"

    try:
        wrapper_text = container.find_all("div", {"class": "product-price__wrapper"})[0].text
        if wrapper_text.count("$") == 2:
            # split on "$" always yields exactly three parts here.
            _, current_raw, old_raw = wrapper_text.split("$")
            return _pad(current_raw), _pad(old_raw)
        if "." not in wrapper_text:
            wrapper_text = wrapper_text + ".00"
        return wrapper_text, "N/A"
    except (IndexError, AttributeError):
        return "N/A", "N/A"
def get_item_image_link(item_soup: BeautifulSoup) -> str:
    """Return the main product image URL, or a fallback message when absent."""
    fallback = "Click on item link for pictures."
    try:
        # The class string is Nike's generated CSS — brittle by nature.
        img = item_soup.find("img", {"class": "css-viwop1 u-full-width u-full-height css-m5dkrx"})
        if img:
            return img.get("src")
        return fallback
    except (IndexError, AttributeError):
        return fallback
def get_colors(item_soup: BeautifulSoup) -> str:
    """Collect the available colorway names from a product detail page.

    Returns a " || "-joined string of colorway alt-texts, or a fallback
    message when the expected markup is missing.
    """
    try:
        # The currently-selected colorway tile carries the extra
        # "--active --selected" classes; its <img> alt text is the color name.
        current = item_soup.find_all(
            "div",
            {
                "class": "colorway-product-overlay colorway-product-overlay--active "
                "colorway-product-overlay--selected css-sa2cc9"
            },
        )
        if current:
            colors = current[0].find_all("img", alt=True)[0].get("alt")
            # Append the remaining (unselected) colorways, skipping Nike's
            # "design your own" customizer tile.
            for color in item_soup.find_all("div", {"class": "colorway-product-overlay css-sa2cc9"}):
                alt = color.find_all("img", alt=True)[0].get("alt")
                if alt != "Design your own Nike By You product":
                    colors += " || " + alt
        else:
            # Single-colorway pages list the color in a description <li>.
            # NOTE(review): parsing str(list-of-tags) and stripping "</li>]" is
            # fragile — it assumes exactly one matching <li> whose text contains
            # ": "; any markup drift lands in the except branch below.
            color_li = item_soup.find_all("li", {"class": "description-preview__color-description ncss-li"})
            colors = str(color_li).split(": ")[1].replace("</li>]", "")
    except (IndexError, AttributeError):
        colors = "Click on item link for available colors."
    return colors
def scrape_products(search_urls: list[str], max_products: int) -> list[dict[str, str]]:
    """Scrape full product records from the given Nike search pages.

    Visits each search URL, parses the product cards, and additionally fetches
    every product's detail page for its image and colorways (best-effort: a
    failed detail request leaves user-facing fallback text). Duplicate product
    links are skipped; collection stops once `max_products` records exist.
    """
    items: list[dict[str, str]] = []
    seen_links: set[str] = set()
    for link in search_urls:
        soup = _get_soup(link)
        containers = soup.find_all("div", {"class": "product-card__body"})
        for container in containers:
            # Early exit as soon as the cap is reached.
            if len(items) >= max_products:
                return items
            anchor = container.find("a", {"class": "product-card__link-overlay"})
            if not anchor:
                continue
            href = anchor.get("href")
            if not href:
                continue
            item_link = _ensure_full_url(href)
            if item_link in seen_links:
                continue
            seen_links.add(item_link)
            title = get_title(container)
            gender = get_target_gender(title)
            current_price, old_price = get_prices(container)
            subcategory = get_subcategory(title)
            # Detail-page fields default to fallback text in case the fetch fails.
            image_link = "Click on item link for pictures."
            colors = "Click on item link for available colors."
            try:
                # Second request per product: the detail page carries the image
                # and full colorway list that the card markup lacks.
                item_soup = _get_soup(item_link)
                image_link = get_item_image_link(item_soup)
                colors = get_colors(item_soup)
            except requests.RequestException:
                # Best-effort: keep the fallbacks rather than failing the scrape.
                pass
            items.append(
                {
                    "name": title,
                    "gender": gender,
                    "price": current_price,
                    "sale_price": old_price,
                    "colors": colors,
                    "item_link": item_link,
                    "image_link": image_link,
                    "subcategory": subcategory,
                    "brand": "Nike",
                }
            )
    return items
| def _build_csv(products: list[dict[str, str]]) -> str: | |
| output = StringIO() | |
| writer = csv.DictWriter( | |
| output, | |
| fieldnames=[ | |
| "name", | |
| "gender", | |
| "price", | |
| "sale_price", | |
| "colors", | |
| "item_link", | |
| "image_link", | |
| "subcategory", | |
| "brand", | |
| ], | |
| ) | |
| writer.writeheader() | |
| writer.writerows(products) | |
| return output.getvalue() | |
def _save_json_payload(prefix: str, payload: dict[str, object]) -> str:
    """Persist `payload` as pretty-printed JSON under SCRAPE_OUTPUT_DIR.

    The filename is "<prefix>_<UTC timestamp>.json"; the directory is created
    if missing. Returns the path written, as a string.
    """
    SCRAPE_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    destination = SCRAPE_OUTPUT_DIR / f"{prefix}_{stamp}.json"
    with destination.open("w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=True, indent=2)
    return str(destination)
def health() -> dict[str, str]:
    """Liveness-probe payload.

    NOTE(review): no @app.get decorator is visible here — confirm the route
    registration wasn't lost when this file was exported.
    """
    return dict(status="ok")
def root() -> dict[str, str]:
    """Landing payload pointing clients at the docs and health endpoints.

    NOTE(review): no @app.get decorator is visible here — confirm the route
    registration wasn't lost when this file was exported.
    """
    return dict(
        message="Nike Scraper API is running.",
        docs="/docs",
        health="/health",
    )
def search_urls(payload: Recommendation) -> dict[str, list[str]]:
    """Return the Nike search URL(s) that would be scraped for `payload`."""
    return {"search_urls": build_nike_urls_from_recommendation(payload)}
def product_urls(payload: Recommendation) -> dict[str, object]:
    """Fetch deduplicated product summaries for a recommendation.

    Scrapes every candidate search URL, dedupes products by item link,
    persists the payload to a timestamped JSON file, and returns the links,
    the summaries, and the saved path. Network failures surface as HTTP 502.
    """
    try:
        deduped: list[dict[str, str]] = []
        seen: set[str] = set()
        for url in build_nike_urls_from_recommendation(payload):
            for summary in extract_product_summaries(url):
                link = summary.get("item_link", "")
                if link and link not in seen:
                    seen.add(link)
                    deduped.append(summary)
        result: dict[str, object] = {
            "product_urls": [entry["item_link"] for entry in deduped],
            "products": deduped,
        }
        result["saved_json_path"] = _save_json_payload("product_urls", result)
        return result
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail=f"Failed to fetch Nike pages: {exc}") from exc
def scrape(payload: ScrapeRequest) -> dict[str, object]:
    """Run a full scrape for the request's recommendation.

    Returns the search URLs used, the product count, and the product records.
    Network failures surface as HTTP 502.
    """
    # Local name `urls` avoids shadowing the module-level search_urls() helper.
    urls = build_nike_urls_from_recommendation(payload.recommendation)
    try:
        products = scrape_products(urls, max_products=payload.max_products)
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail=f"Failed to fetch Nike pages: {exc}") from exc
    return {
        "search_urls": urls,
        "count": len(products),
        "products": products,
    }
def scrape_csv(payload: ScrapeRequest) -> StreamingResponse:
    """Run a full scrape and stream the results back as a CSV attachment.

    The download filename encodes gender (or "unisex"), color, and category.
    Network failures while fetching Nike pages surface as HTTP 502.
    """
    try:
        urls = build_nike_urls_from_recommendation(payload.recommendation)
        products = scrape_products(urls, max_products=payload.max_products)
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail=f"Failed to fetch Nike pages: {exc}") from exc
    csv_content = _build_csv(products)
    filename = (
        f"nike_{payload.recommendation.gender or 'unisex'}_"
        f"{payload.recommendation.color}_{payload.recommendation.category}.csv"
    )
    # BUG FIX: the Content-Disposition header previously hard-coded a
    # "(unknown)" placeholder instead of interpolating the filename built
    # above, leaving `filename` unused and every download misnamed.
    return StreamingResponse(
        iter([csv_content]),
        media_type="text/csv",
        headers={"Content-Disposition": f"attachment; filename={filename}"},
    )