# StyleWellBackend / scraper.py
from __future__ import annotations
from io import StringIO
import csv
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import requests
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field
from urllib.parse import urlencode
# FastAPI application; the route handlers below register themselves on it.
app = FastAPI(title="Nike Scraper API", version="1.0.0")
# Nike search ("wall") page that query parameters are appended to.
NIKE_BASE_SEARCH = "https://www.nike.com/w"
# Site root used to absolutize relative product links.
NIKE_BASE_URL = "https://www.nike.com"
# Loose user-facing category words -> search term used in the Nike query
# (see build_nike_search_url); unknown words pass through lowercased.
CATEGORY_ALIASES = {
    "t-shirt": "t-shirt",
    "tee": "t-shirt",
    "shirt": "shirt",
    "hoodie": "hoodie",
    "sweatshirt": "sweatshirt",
    "jacket": "jacket",
    "gilet": "gilet",
    "top": "top",
    "tank": "tank top",
    "polo": "polo",
    "jersey": "jersey",
    "bra": "sports bra",
    "pant": "pants",
    "pants": "pants",
    "trousers": "trousers",
    "shorts": "shorts",
    "short": "shorts",
    "leggings": "leggings",
    "tights": "tights",
    "joggers": "joggers",
    "sweatpants": "sweatpants",
    "skirt": "skirt",
    "dress": "dress",
    "tracksuit": "tracksuit",
    "jumpsuit": "jumpsuit",
    "socks": "socks",
    "sock": "socks",
    "hat": "hat",
    "cap": "cap",
    "bag": "bag",
    "backpack": "backpack",
}
# Desktop Chrome User-Agent so Nike serves regular (non-bot) HTML.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    )
}
# Category words (plural form) recognized by get_subcategory() when tagging titles.
CATEGORIES = [
    "sweaters",
    "hoodies",
    "t-shirts",
    "jackets",
    "shirts",
    "crews",
    "jerseys",
    "tops",
    "polos",
    "tanks",
    "compression",
    "baselayer",
    "jeans",
    "shorts",
    "skirts",
    "tights",
    "parkas",
    "gilets",
    "pants",
    "leggings",
    "trousers",
    "joggers",
    "sweatpants",
    "dresses",
    "rompers",
    "jumpsuits",
    "onesies",
    "overalls",
    "tracksuits",
    "sneakers",
    "slippers",
    "sunglasses",
    "bras",
    "socks",
    "hats",
    "bags",
    "backpacks",
]
# Directory (next to this file) where /product-urls responses are saved as JSON.
SCRAPE_OUTPUT_DIR = Path(__file__).resolve().parent / "scraped_json"
class Recommendation(BaseModel):
    """A clothing recommendation used to build Nike search queries."""
    # Color word, e.g. "black"; lowercased when the query is built.
    color: str = Field(..., min_length=1)
    # Category word; normalized through CATEGORY_ALIASES before searching.
    category: str = Field(..., min_length=1)
    # Optional gender scope; when omitted, men/women/unscoped URLs are all built.
    gender: Optional[str] = Field(default=None, description="men or women")
class ScrapeRequest(BaseModel):
    """Request body for the /scrape and /scrape.csv endpoints."""
    # The recommendation to search for.
    recommendation: Recommendation
    # Upper bound on products returned (1-300, default 30).
    max_products: int = Field(default=30, ge=1, le=300)
def _ensure_full_url(href: str) -> str:
    """Return an absolute URL for a Nike *href*.

    Site-relative paths ("/t/...") are prefixed with NIKE_BASE_URL.
    Protocol-relative URLs ("//static.nike.com/...") get an https scheme;
    they also start with "/", so this check must come first or they would
    be mangled into "https://www.nike.com//host/...". (This matches the
    handling already present in _ensure_store_url.) Anything else — absolute
    URLs and empty strings — passes through unchanged.
    """
    if href.startswith("//"):
        return f"https:{href}"
    if href.startswith("/"):
        return f"{NIKE_BASE_URL}{href}"
    return href
def build_nike_search_url(color: str, category: str, gender: Optional[str] = None) -> str:
    """Compose a Nike search URL for *color* + *category*, optionally gender-scoped.

    The category is normalized through CATEGORY_ALIASES (unknown categories
    pass through lowercased) and the resulting query text is duplicated into
    both the ``q`` and ``vst`` parameters.
    """
    lowered = category.lower()
    normalized_category = CATEGORY_ALIASES.get(lowered, lowered)
    tokens: list[str] = []
    if gender:
        # "men" -> "mens", "women" -> "womens"
        tokens.append(f"{gender.lower()}s")
    tokens.extend([color.lower(), normalized_category])
    query = " ".join(tokens)
    return f"{NIKE_BASE_SEARCH}?{urlencode({'q': query, 'vst': query})}"
def build_nike_urls_from_recommendation(recommendation: Recommendation) -> list[str]:
    """Return the Nike search URLs to fetch for *recommendation*.

    With an explicit gender, a single gender-scoped URL is produced;
    otherwise men's, women's, and unscoped searches are all returned.
    """
    if recommendation.gender:
        return [
            build_nike_search_url(
                recommendation.color, recommendation.category, recommendation.gender
            )
        ]
    urls = [
        build_nike_search_url(recommendation.color, recommendation.category, scope)
        for scope in ("men", "women")
    ]
    urls.append(build_nike_search_url(recommendation.color, recommendation.category))
    return urls
def build_search_urls_from_recommendation(recommendation: Recommendation, store: str = "nike") -> list[str]:
    """Build search URLs for *recommendation*.

    The *store* parameter is accepted for forward compatibility but currently
    ignored — only Nike is supported, so this always delegates to
    build_nike_urls_from_recommendation().
    """
    return build_nike_urls_from_recommendation(recommendation)
def build_search_urls_from_query(query: str, store: str = "nike", gender: Optional[str] = None) -> list[str]:
    """Turn a free-text query into Nike search URLs.

    With *gender* set, a single gender-prefixed URL is returned (avoiding a
    doubled prefix when the query already starts with the gender word).
    Otherwise men-, women-, and unprefixed variants are all returned.
    Blank queries yield an empty list. *store* is currently ignored.
    """
    text = str(query or "").strip()
    if not text:
        return []

    def _search_url(q: str) -> str:
        return f"{NIKE_BASE_SEARCH}?{urlencode({'q': q, 'vst': q})}"

    def _with_prefix(prefix: str, value: str) -> str:
        # Don't double the prefix if the query already begins with it.
        if value.strip().lower().startswith(f"{prefix.strip().lower()} "):
            return value.strip()
        return f"{prefix} {value}".strip()

    if gender:
        return [_search_url(_with_prefix(gender, text))]
    return [_search_url(q) for q in (f"men {text}".strip(), f"women {text}".strip(), text)]
def _get_soup(url: str) -> BeautifulSoup:
    """GET *url* with a browser-like User-Agent and return an lxml-parsed soup.

    Raises requests.HTTPError for non-2xx responses (via raise_for_status)
    and other requests.RequestException subclasses on network failures.
    """
    response = requests.get(url, headers=HEADERS, timeout=20)
    response.raise_for_status()
    return BeautifulSoup(response.content, "lxml")
def _ensure_store_url(href: str, base_url: str) -> str:
    """Absolutize *href* against *base_url*; empty input yields an empty string.

    Protocol-relative URLs ("//host/...") just get an https scheme; absolute
    URLs pass through unchanged.
    """
    if not href:
        return ""
    if href.startswith("//"):
        return f"https:{href}"
    return f"{base_url}{href}" if href.startswith("/") else href
def extract_product_urls(search_url: str) -> list[str]:
    """Collect unique product-page URLs from a Nike search results page.

    The primary selector is the product-card overlay anchor; if the markup
    has changed and nothing matches, fall back to any anchor whose href
    contains "/t/" (Nike's product-page path segment).
    """
    soup = _get_soup(search_url)
    links: list[str] = []
    seen: set[str] = set()

    def _collect(anchors, keep) -> None:
        for anchor in anchors:
            href = anchor.get("href")
            if href and keep(href):
                full = _ensure_full_url(href)
                if full not in seen:
                    seen.add(full)
                    links.append(full)

    _collect(soup.find_all("a", {"class": "product-card__link-overlay"}), lambda h: True)
    if not links:
        _collect(soup.find_all("a", href=True), lambda h: "/t/" in h)
    return links
def _extract_image_from_container(container: BeautifulSoup) -> str:
    """Return the first usable image URL inside *container*, or "" if none.

    Checks src, then data-src, then srcset on the first <img> found.
    """
    img = container.find("img")
    if not img:
        return ""
    source = img.get("src") or img.get("data-src") or img.get("srcset") or ""
    return str(source).strip()
def extract_product_summaries(search_url: str, store: str = "nike") -> list[dict[str, str]]:
    """Scrape lightweight product summaries (link, name, price, image) from a search page.

    Deduplicates by product link. When Nike's card markup yields nothing,
    falls back to bare product links with "N/A" placeholder fields.
    *store* is currently unused (only Nike is supported).
    """
    soup = _get_soup(search_url)
    results: list[dict[str, str]] = []
    visited: set[str] = set()
    for card in soup.find_all("div", {"class": "product-card__body"}):
        overlay = card.find("a", {"class": "product-card__link-overlay"})
        href = overlay.get("href") if overlay else None
        if not href:
            continue
        link = _ensure_full_url(href)
        if link in visited:
            continue
        visited.add(link)
        # The image tag is typically outside the card body, so search the parent.
        image_holder = card.parent if card.parent else card
        results.append(
            {
                "item_link": link,
                "name": get_title(card),
                "price": get_prices(card)[0],
                "image_url": _extract_image_from_container(image_holder),
            }
        )
    if results:
        return results
    # Fallback path when Nike card markup changes: links only, placeholder fields.
    for link in extract_product_urls(search_url):
        if link in visited:
            continue
        visited.add(link)
        results.append({"item_link": link, "name": "N/A", "price": "N/A", "image_url": ""})
    return results
def get_title(container: BeautifulSoup) -> str:
    """Combine a product card's title and subtitle, or "N/A" if either is missing."""
    try:
        main = container.find_all("div", {"class": "product-card__title"})[0].text
        sub = container.find_all("div", {"class": "product-card__subtitle"})[0].text
    except (IndexError, AttributeError):
        return "N/A"
    return f"{main} {sub}".strip()
def get_target_gender(title: str) -> str:
    """Infer the target gender from a product title; defaults to "Unisex".

    "Men's" is checked before "Women's", matching the original precedence.
    """
    for marker, gender in (("Men's", "Men"), ("Women's", "Women")):
        if marker in title:
            return gender
    return "Unisex"
def get_subcategory(title: str) -> str:
    """Return the first word of *title* matching a known category, keeping its casing.

    A word matches if its lowercased, punctuation-stripped form — or that form
    pluralized with "s" — appears in CATEGORIES. Returns "" when nothing matches.
    """
    for token in title.split(" "):
        bare = token.lower().strip(",.")
        if bare in CATEGORIES or f"{bare}s" in CATEGORIES:
            return token
    return ""
def get_prices(container: BeautifulSoup) -> tuple[str, str]:
    """Extract (current_price, old_price) text from a product card.

    When the price wrapper's text contains two "$" signs, it appears to hold
    both the sale and the original price concatenated (e.g. "$45.97$70"); the
    text is split on "$" and each part padded with ".00" if it lacks cents.
    Returns ("N/A", "N/A") when no price wrapper is found.
    """
    try:
        price_container = container.find_all("div", {"class": "product-price__wrapper"})
        current_price = price_container[0].text
        old_price = "N/A"
        if current_price.count("$") == 2:
            # Two prices fused in one string: split and normalize each to cents.
            prices = current_price.split("$")
            current_price = "$" + prices[1] if "." in prices[1] else "$" + prices[1] + ".00"
            old_price = "$" + prices[2] if "." in prices[2] else "$" + prices[2] + ".00"
        elif "." not in current_price:
            # Single whole-dollar price: normalize to two decimal places.
            current_price = current_price + ".00"
    except (IndexError, AttributeError):
        current_price, old_price = "N/A", "N/A"
    return current_price, old_price
def get_item_image_link(item_soup: BeautifulSoup) -> str:
    """Return the hero image src from a product page, or a hint message if absent."""
    fallback = "Click on item link for pictures."
    try:
        img = item_soup.find("img", {"class": "css-viwop1 u-full-width u-full-height css-m5dkrx"})
        return img.get("src") if img else fallback
    except (IndexError, AttributeError):
        return fallback
def get_colors(item_soup: BeautifulSoup) -> str:
    """Collect the colorway names offered on a product page.

    Prefers the colorway swatch overlays: the active/selected one's alt text
    comes first, then every other swatch joined with " || ", skipping the
    "Nike By You" customizer tile. When no swatches are rendered, falls back
    to parsing the "<label>: <colors>" description-preview line. Returns a
    hint message when neither structure is present.
    """
    try:
        current = item_soup.find_all(
            "div",
            {
                "class": "colorway-product-overlay colorway-product-overlay--active "
                "colorway-product-overlay--selected css-sa2cc9"
            },
        )
        if current:
            # Start with the currently selected colorway's alt text...
            colors = current[0].find_all("img", alt=True)[0].get("alt")
            # ...then append the remaining swatches (customizer tile excluded).
            for color in item_soup.find_all("div", {"class": "colorway-product-overlay css-sa2cc9"}):
                alt = color.find_all("img", alt=True)[0].get("alt")
                if alt != "Design your own Nike By You product":
                    colors += " || " + alt
        else:
            # No swatches: take the value after ": " in the description line.
            color_li = item_soup.find_all("li", {"class": "description-preview__color-description ncss-li"})
            colors = str(color_li).split(": ")[1].replace("</li>]", "")
    except (IndexError, AttributeError):
        colors = "Click on item link for available colors."
    return colors
def scrape_products(search_urls: list[str], max_products: int) -> list[dict[str, str]]:
    """Scrape full product records from the given Nike search pages.

    Walks each search URL's product cards, dedupes by product link, and for
    each card additionally fetches the product detail page for image and
    colorway data (best-effort: a network error there leaves placeholder
    text). Stops as soon as *max_products* items have been collected.

    Network errors on the search pages themselves propagate as
    requests.RequestException; callers map those to HTTP 502.
    """
    items: list[dict[str, str]] = []
    seen_links: set[str] = set()
    for link in search_urls:
        soup = _get_soup(link)
        containers = soup.find_all("div", {"class": "product-card__body"})
        for container in containers:
            if len(items) >= max_products:
                return items
            anchor = container.find("a", {"class": "product-card__link-overlay"})
            if not anchor:
                continue
            href = anchor.get("href")
            if not href:
                continue
            item_link = _ensure_full_url(href)
            if item_link in seen_links:
                continue
            seen_links.add(item_link)
            title = get_title(container)
            gender = get_target_gender(title)
            current_price, old_price = get_prices(container)
            subcategory = get_subcategory(title)
            # Placeholder values used if the detail-page fetch below fails.
            image_link = "Click on item link for pictures."
            colors = "Click on item link for available colors."
            try:
                item_soup = _get_soup(item_link)
                image_link = get_item_image_link(item_soup)
                colors = get_colors(item_soup)
            except requests.RequestException:
                # Best-effort: keep card-level data with the placeholders.
                pass
            items.append(
                {
                    "name": title,
                    "gender": gender,
                    "price": current_price,
                    # NOTE(review): key says "sale_price" but holds the OLD
                    # (pre-discount) price from get_prices — confirm consumers.
                    "sale_price": old_price,
                    "colors": colors,
                    "item_link": item_link,
                    "image_link": image_link,
                    "subcategory": subcategory,
                    "brand": "Nike",
                }
            )
    return items
def _build_csv(products: list[dict[str, str]]) -> str:
    """Serialize product dicts into CSV text with a fixed column order."""
    columns = [
        "name",
        "gender",
        "price",
        "sale_price",
        "colors",
        "item_link",
        "image_link",
        "subcategory",
        "brand",
    ]
    buffer = StringIO()
    writer = csv.DictWriter(buffer, fieldnames=columns)
    writer.writeheader()
    writer.writerows(products)
    return buffer.getvalue()
def _save_json_payload(prefix: str, payload: dict[str, object]) -> str:
    """Write *payload* as a UTC-timestamped JSON file under SCRAPE_OUTPUT_DIR.

    Creates the directory if needed and returns the written file's path.
    """
    SCRAPE_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    target = SCRAPE_OUTPUT_DIR / f"{prefix}_{stamp}.json"
    target.write_text(json.dumps(payload, ensure_ascii=True, indent=2), encoding="utf-8")
    return str(target)
@app.get("/health")
def health() -> dict[str, str]:
    """Liveness probe endpoint."""
    return dict(status="ok")
@app.get("/")
def root() -> dict[str, str]:
    """Landing endpoint pointing at the docs and health routes."""
    payload = {"message": "Nike Scraper API is running."}
    payload["docs"] = "/docs"
    payload["health"] = "/health"
    return payload
@app.post("/search-urls")
def search_urls(payload: Recommendation) -> dict[str, list[str]]:
    """Return the Nike search URLs that would be scraped for this recommendation."""
    return {"search_urls": build_nike_urls_from_recommendation(payload)}
@app.post("/product-urls")
def product_urls(payload: Recommendation) -> dict[str, object]:
    """Scrape product summaries for a recommendation and persist them as JSON.

    Dedupes products across the (possibly multiple) search URLs, returns both
    the raw links and the summaries plus the saved file path, and maps any
    network failure onto an HTTP 502 response.
    """
    try:
        products: list[dict[str, str]] = []
        seen: set[str] = set()
        for search_url in build_nike_urls_from_recommendation(payload):
            for summary in extract_product_summaries(search_url):
                link = summary.get("item_link", "")
                if link and link not in seen:
                    seen.add(link)
                    products.append(summary)
        result: dict[str, object] = {
            "product_urls": [item["item_link"] for item in products],
            "products": products,
        }
        result["saved_json_path"] = _save_json_payload("product_urls", result)
        return result
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail=f"Failed to fetch Nike pages: {exc}") from exc
@app.post("/scrape")
def scrape(payload: ScrapeRequest) -> dict[str, object]:
    """Run a full scrape (detail pages included) and return the products inline.

    Network failures while fetching Nike pages surface as HTTP 502.
    """
    try:
        urls = build_nike_urls_from_recommendation(payload.recommendation)
        products = scrape_products(urls, max_products=payload.max_products)
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail=f"Failed to fetch Nike pages: {exc}") from exc
    return {
        "search_urls": urls,
        "count": len(products),
        "products": products,
    }
@app.post("/scrape.csv")
def scrape_csv(payload: ScrapeRequest) -> StreamingResponse:
    """Run a full scrape and stream the results back as a CSV attachment.

    Network failures while fetching Nike pages surface as HTTP 502.
    """
    try:
        search_urls = build_nike_urls_from_recommendation(payload.recommendation)
        products = scrape_products(search_urls, max_products=payload.max_products)
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail=f"Failed to fetch Nike pages: {exc}") from exc
    csv_content = _build_csv(products)
    # Descriptive download name, e.g. "nike_men_black_hoodie.csv".
    filename = (
        f"nike_{payload.recommendation.gender or 'unisex'}_"
        f"{payload.recommendation.color}_{payload.recommendation.category}.csv"
    )
    return StreamingResponse(
        iter([csv_content]),
        media_type="text/csv",
        # BUG FIX: the computed filename was previously discarded in favor of
        # the literal placeholder "(unknown)"; interpolate it into the header.
        headers={"Content-Disposition": f"attachment; filename={filename}"},
    )