Spaces:

HelloWorld0204
/

StyleWellBackend

Running

File size: 16,294 Bytes

e08551d

from __future__ import annotations

from io import StringIO
import csv
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import requests
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field
from urllib.parse import urlencode


app = FastAPI(title="Nike Scraper API", version="1.0.0")

NIKE_BASE_SEARCH = "https://www.nike.com/w"
NIKE_BASE_URL = "https://www.nike.com"

CATEGORY_ALIASES = {
    "t-shirt": "t-shirt",
    "tee": "t-shirt",
    "shirt": "shirt",
    "hoodie": "hoodie",
    "sweatshirt": "sweatshirt",
    "jacket": "jacket",
    "gilet": "gilet",
    "top": "top",
    "tank": "tank top",
    "polo": "polo",
    "jersey": "jersey",
    "bra": "sports bra",
    "pant": "pants",
    "pants": "pants",
    "trousers": "trousers",
    "shorts": "shorts",
    "short": "shorts",
    "leggings": "leggings",
    "tights": "tights",
    "joggers": "joggers",
    "sweatpants": "sweatpants",
    "skirt": "skirt",
    "dress": "dress",
    "tracksuit": "tracksuit",
    "jumpsuit": "jumpsuit",
    "socks": "socks",
    "sock": "socks",
    "hat": "hat",
    "cap": "cap",
    "bag": "bag",
    "backpack": "backpack",
}

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    )
}

CATEGORIES = [
    "sweaters",
    "hoodies",
    "t-shirts",
    "jackets",
    "shirts",
    "crews",
    "jerseys",
    "tops",
    "polos",
    "tanks",
    "compression",
    "baselayer",
    "jeans",
    "shorts",
    "skirts",
    "tights",
    "parkas",
    "gilets",
    "pants",
    "leggings",
    "trousers",
    "joggers",
    "sweatpants",
    "dresses",
    "rompers",
    "jumpsuits",
    "onesies",
    "overalls",
    "tracksuits",
    "sneakers",
    "slippers",
    "sunglasses",
    "bras",
    "socks",
    "hats",
    "bags",
    "backpacks",
]

SCRAPE_OUTPUT_DIR = Path(__file__).resolve().parent / "scraped_json"


class Recommendation(BaseModel):
    color: str = Field(..., min_length=1)
    category: str = Field(..., min_length=1)
    gender: Optional[str] = Field(default=None, description="men or women")


class ScrapeRequest(BaseModel):
    recommendation: Recommendation
    max_products: int = Field(default=30, ge=1, le=300)


def _ensure_full_url(href: str) -> str:
    if href.startswith("/"):
        return f"{NIKE_BASE_URL}{href}"
    return href


def build_nike_search_url(color: str, category: str, gender: Optional[str] = None) -> str:
    category_normalized = CATEGORY_ALIASES.get(category.lower(), category.lower())
    parts: list[str] = []
    if gender:
        parts.append(gender.lower() + "s")
    parts.append(color.lower())
    parts.append(category_normalized)
    query = " ".join(parts)
    params = urlencode({"q": query, "vst": query})
    return f"{NIKE_BASE_SEARCH}?{params}"


def build_nike_urls_from_recommendation(recommendation: Recommendation) -> list[str]:
    color = recommendation.color
    category = recommendation.category
    gender = recommendation.gender
    if gender:
        return [build_nike_search_url(color, category, gender)]
    return [
        build_nike_search_url(color, category, "men"),
        build_nike_search_url(color, category, "women"),
        build_nike_search_url(color, category),
    ]


def build_search_urls_from_recommendation(recommendation: Recommendation, store: str = "nike") -> list[str]:
    return build_nike_urls_from_recommendation(recommendation)


def build_search_urls_from_query(query: str, store: str = "nike", gender: Optional[str] = None) -> list[str]:
    normalized_query = str(query or "").strip()
    if not normalized_query:
        return []

    def _normalize_prefixed_query(prefix: str, value: str) -> str:
        lowered = value.strip().lower()
        p = prefix.strip().lower()
        if lowered.startswith(f"{p} "):
            return value.strip()
        return f"{prefix} {value}".strip()

    if gender:
        q = _normalize_prefixed_query(gender, normalized_query)
        return [f"{NIKE_BASE_SEARCH}?{urlencode({'q': q, 'vst': q})}"]

    return [
        f"{NIKE_BASE_SEARCH}?{urlencode({'q': f'men {normalized_query}'.strip(), 'vst': f'men {normalized_query}'.strip()})}",
        f"{NIKE_BASE_SEARCH}?{urlencode({'q': f'women {normalized_query}'.strip(), 'vst': f'women {normalized_query}'.strip()})}",
        f"{NIKE_BASE_SEARCH}?{urlencode({'q': normalized_query, 'vst': normalized_query})}",
    ]


def _get_soup(url: str) -> BeautifulSoup:
    response = requests.get(url, headers=HEADERS, timeout=20)
    response.raise_for_status()
    return BeautifulSoup(response.content, "lxml")


def _ensure_store_url(href: str, base_url: str) -> str:
    if not href:
        return ""
    if href.startswith("//"):
        return f"https:{href}"
    if href.startswith("/"):
        return f"{base_url}{href}"
    return href


def extract_product_urls(search_url: str) -> list[str]:
    soup = _get_soup(search_url)
    product_links: list[str] = []

    anchors = soup.find_all("a", {"class": "product-card__link-overlay"})
    for anchor in anchors:
        href = anchor.get("href")
        if href:
            full = _ensure_full_url(href)
            if full not in product_links:
                product_links.append(full)

    if not product_links:
        all_anchors = soup.find_all("a", href=True)
        for anchor in all_anchors:
            href = anchor.get("href")
            if href and "/t/" in href:
                full = _ensure_full_url(href)
                if full not in product_links:
                    product_links.append(full)

    return product_links


def _extract_image_from_container(container: BeautifulSoup) -> str:
    img = container.find("img")
    if not img:
        return ""
    return str(img.get("src") or img.get("data-src") or img.get("srcset") or "").strip()


def extract_product_summaries(search_url: str, store: str = "nike") -> list[dict[str, str]]:
    soup = _get_soup(search_url)
    summaries: list[dict[str, str]] = []
    seen_links: set[str] = set()

    containers = soup.find_all("div", {"class": "product-card__body"})
    for container in containers:
        anchor = container.find("a", {"class": "product-card__link-overlay"})
        if not anchor:
            continue

        href = anchor.get("href")
        if not href:
            continue

        item_link = _ensure_full_url(href)
        if item_link in seen_links:
            continue
        seen_links.add(item_link)

        title = get_title(container)
        current_price, _ = get_prices(container)
        image_url = _extract_image_from_container(container.parent if container.parent else container)

        summaries.append(
            {
                "item_link": item_link,
                "name": title,
                "price": current_price,
                "image_url": image_url,
            }
        )

    if summaries:
        return summaries

    # Fallback path when Nike card markup changes.
    for item_link in extract_product_urls(search_url):
        if item_link in seen_links:
            continue
        seen_links.add(item_link)
        summaries.append(
            {
                "item_link": item_link,
                "name": "N/A",
                "price": "N/A",
                "image_url": "",
            }
        )
    return summaries


def get_title(container: BeautifulSoup) -> str:
    try:
        title = container.find_all("div", {"class": "product-card__title"})[0].text
        subtitle = container.find_all("div", {"class": "product-card__subtitle"})[0].text
        return f"{title} {subtitle}".strip()
    except (IndexError, AttributeError):
        return "N/A"


def get_target_gender(title: str) -> str:
    if "Men's" in title:
        return "Men"
    if "Women's" in title:
        return "Women"
    return "Unisex"


def get_subcategory(title: str) -> str:
    for word in title.split(" "):
        candidate = word.lower().strip(",.")
        if candidate in CATEGORIES or (candidate + "s") in CATEGORIES:
            return word
    return ""


def get_prices(container: BeautifulSoup) -> tuple[str, str]:
    try:
        price_container = container.find_all("div", {"class": "product-price__wrapper"})
        current_price = price_container[0].text
        old_price = "N/A"

        if current_price.count("$") == 2:
            prices = current_price.split("$")
            current_price = "$" + prices[1] if "." in prices[1] else "$" + prices[1] + ".00"
            old_price = "$" + prices[2] if "." in prices[2] else "$" + prices[2] + ".00"
        elif "." not in current_price:
            current_price = current_price + ".00"
    except (IndexError, AttributeError):
        current_price, old_price = "N/A", "N/A"

    return current_price, old_price


def get_item_image_link(item_soup: BeautifulSoup) -> str:
    try:
        img = item_soup.find("img", {"class": "css-viwop1 u-full-width u-full-height css-m5dkrx"})
        return img.get("src") if img else "Click on item link for pictures."
    except (IndexError, AttributeError):
        return "Click on item link for pictures."


def get_colors(item_soup: BeautifulSoup) -> str:
    try:
        current = item_soup.find_all(
            "div",
            {
                "class": "colorway-product-overlay colorway-product-overlay--active "
                "colorway-product-overlay--selected css-sa2cc9"
            },
        )
        if current:
            colors = current[0].find_all("img", alt=True)[0].get("alt")
            for color in item_soup.find_all("div", {"class": "colorway-product-overlay css-sa2cc9"}):
                alt = color.find_all("img", alt=True)[0].get("alt")
                if alt != "Design your own Nike By You product":
                    colors += " || " + alt
        else:
            color_li = item_soup.find_all("li", {"class": "description-preview__color-description ncss-li"})
            colors = str(color_li).split(": ")[1].replace("</li>]", "")
    except (IndexError, AttributeError):
        colors = "Click on item link for available colors."
    return colors


def scrape_products(search_urls: list[str], max_products: int) -> list[dict[str, str]]:
    items: list[dict[str, str]] = []
    seen_links: set[str] = set()

    for link in search_urls:
        soup = _get_soup(link)
        containers = soup.find_all("div", {"class": "product-card__body"})

        for container in containers:
            if len(items) >= max_products:
                return items

            anchor = container.find("a", {"class": "product-card__link-overlay"})
            if not anchor:
                continue
            href = anchor.get("href")
            if not href:
                continue

            item_link = _ensure_full_url(href)
            if item_link in seen_links:
                continue
            seen_links.add(item_link)

            title = get_title(container)
            gender = get_target_gender(title)
            current_price, old_price = get_prices(container)
            subcategory = get_subcategory(title)

            image_link = "Click on item link for pictures."
            colors = "Click on item link for available colors."
            try:
                item_soup = _get_soup(item_link)
                image_link = get_item_image_link(item_soup)
                colors = get_colors(item_soup)
            except requests.RequestException:
                pass

            items.append(
                {
                    "name": title,
                    "gender": gender,
                    "price": current_price,
                    "sale_price": old_price,
                    "colors": colors,
                    "item_link": item_link,
                    "image_link": image_link,
                    "subcategory": subcategory,
                    "brand": "Nike",
                }
            )

    return items


def _build_csv(products: list[dict[str, str]]) -> str:
    output = StringIO()
    writer = csv.DictWriter(
        output,
        fieldnames=[
            "name",
            "gender",
            "price",
            "sale_price",
            "colors",
            "item_link",
            "image_link",
            "subcategory",
            "brand",
        ],
    )
    writer.writeheader()
    writer.writerows(products)
    return output.getvalue()


def _save_json_payload(prefix: str, payload: dict[str, object]) -> str:
    SCRAPE_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    filename = f"{prefix}_{ts}.json"
    file_path = SCRAPE_OUTPUT_DIR / filename
    with file_path.open("w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=True, indent=2)
    return str(file_path)


@app.get("/health")
def health() -> dict[str, str]:
    return {"status": "ok"}


@app.get("/")
def root() -> dict[str, str]:
    return {
        "message": "Nike Scraper API is running.",
        "docs": "/docs",
        "health": "/health",
    }


@app.post("/search-urls")
def search_urls(payload: Recommendation) -> dict[str, list[str]]:
    return {"search_urls": build_nike_urls_from_recommendation(payload)}


@app.post("/product-urls")
def product_urls(payload: Recommendation) -> dict[str, object]:
    try:
        urls = build_nike_urls_from_recommendation(payload)
        all_products: list[dict[str, str]] = []
        seen_links: set[str] = set()
        for url in urls:
            for product in extract_product_summaries(url):
                link = product.get("item_link", "")
                if not link or link in seen_links:
                    continue
                seen_links.add(link)
                all_products.append(product)

        response_payload: dict[str, object] = {
            "product_urls": [item["item_link"] for item in all_products],
            "products": all_products,
        }
        response_payload["saved_json_path"] = _save_json_payload("product_urls", response_payload)
        return response_payload
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail=f"Failed to fetch Nike pages: {exc}") from exc


@app.post("/scrape")
def scrape(payload: ScrapeRequest) -> dict[str, object]:
    try:
        search_urls = build_nike_urls_from_recommendation(payload.recommendation)
        products = scrape_products(search_urls, max_products=payload.max_products)
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail=f"Failed to fetch Nike pages: {exc}") from exc

    response_payload: dict[str, object] = {
        "search_urls": search_urls,
        "count": len(products),
        "products": products,
    }
    return response_payload


@app.post("/scrape.csv")
def scrape_csv(payload: ScrapeRequest) -> StreamingResponse:
    try:
        search_urls = build_nike_urls_from_recommendation(payload.recommendation)
        products = scrape_products(search_urls, max_products=payload.max_products)
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail=f"Failed to fetch Nike pages: {exc}") from exc

    csv_content = _build_csv(products)
    filename = (
        f"nike_{payload.recommendation.gender or 'unisex'}_"
        f"{payload.recommendation.color}_{payload.recommendation.category}.csv"
    )
    return StreamingResponse(
        iter([csv_content]),
        media_type="text/csv",
        headers={"Content-Disposition": f"attachment; filename={filename}"},
    )