Spaces:
Running
Upload 22 files
Browse files

- app.py +77 -29
- zalando_scraper.py +120 -47
app.py
CHANGED
@@ -899,18 +899,52 @@ def _resolve_target_category(requested_target: str, wardrobe_snapshot: dict[str,
     return "bottomwear"
 
 
-def _product_text_for_relevance(product: dict[str, Any]) -> str:
-    name = str(product.get("name") or "")
-    url = str(product.get("item_link") or "")
-    [… 9 removed lines not recovered from the page extract]
+def _product_text_for_relevance(product: dict[str, Any]) -> str:
+    name = str(product.get("name") or "")
+    url = str(product.get("item_link") or "")
+    color = str(product.get("color") or "")
+    brand = str(product.get("brand") or "")
+    return _norm(f"{name} {color} {brand} {url}")
+
+
+SCRAPER_COLOR_KEYWORDS: dict[str, set[str]] = {
+    "black": {"black", "jet black"},
+    "white": {"white", "bright white", "off white", "off-white"},
+    "navy": {"navy", "dark blue", "dk blue", "dress blues", "moonlit ocean", "midnight blue"},
+    "blue": {"blue", "navy", "dark blue", "dk blue", "dress blues", "ice blue", "light blue", "skyway", "moonlit ocean"},
+    "grey": {"grey", "gray", "dark grey", "dark gray", "steel grey", "steel gray", "charcoal"},
+    "gray": {"grey", "gray", "dark grey", "dark gray", "steel grey", "steel gray", "charcoal"},
+    "beige": {"beige", "sand", "tan", "stone", "morel"},
+    "brown": {"brown", "tan", "morel"},
+    "olive": {"olive", "khaki"},
+    "green": {"green", "olive", "khaki"},
+    "red": {"red", "brick red", "winetasting", "wine"},
+    "maroon": {"maroon", "burgundy", "wine", "winetasting"},
+}
+
+
+def _color_keywords_for_relevance(color: str) -> set[str]:
+    normalized = extract_base_color(color or "") or _norm(color)
+    if not normalized or normalized == "unknown":
+        return set()
+    return SCRAPER_COLOR_KEYWORDS.get(normalized, {normalized})
+
+
+def _matches_planned_color(product: dict[str, Any], planned_color: str) -> bool:
+    keywords = _color_keywords_for_relevance(planned_color)
+    if not keywords:
+        return True
+    text = _product_text_for_relevance(product)
+    return any(keyword in text for keyword in keywords)
+
+
+def _is_relevant_scraped_product(
+    product: dict[str, Any],
+    target_slot: str,
+    planned_category: str,
+    planned_color: str,
+    occasion_bucket: str,
+) -> bool:
     text = _product_text_for_relevance(product)
     if not text:
         return False
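The hunk above introduces a color gate that matches retailer shade names ("dark blue", "moonlit ocean") against a base color from the outfit plan. A minimal runnable sketch of that gate, with `extract_base_color` stubbed (it is defined elsewhere in app.py and not shown in this diff) and a one-entry stand-in for the keyword table:

```python
from typing import Any

# Trimmed stand-in for the full SCRAPER_COLOR_KEYWORDS table above.
SCRAPER_COLOR_KEYWORDS: dict[str, set[str]] = {
    "navy": {"navy", "dark blue", "dk blue", "dress blues", "moonlit ocean", "midnight blue"},
}


def _norm(value: Any) -> str:
    return str(value or "").strip().lower()


def extract_base_color(color: str) -> str:
    # Hypothetical stand-in: the real helper lives elsewhere in app.py.
    return _norm(color)


def _color_keywords_for_relevance(color: str) -> set[str]:
    normalized = extract_base_color(color or "") or _norm(color)
    if not normalized or normalized == "unknown":
        return set()
    return SCRAPER_COLOR_KEYWORDS.get(normalized, {normalized})


def _product_text_for_relevance(product: dict[str, Any]) -> str:
    name = str(product.get("name") or "")
    url = str(product.get("item_link") or "")
    color = str(product.get("color") or "")
    brand = str(product.get("brand") or "")
    return _norm(f"{name} {color} {brand} {url}")


def _matches_planned_color(product: dict[str, Any], planned_color: str) -> bool:
    keywords = _color_keywords_for_relevance(planned_color)
    if not keywords:
        return True  # no usable planned color, so the gate stays open
    text = _product_text_for_relevance(product)
    return any(keyword in text for keyword in keywords)


print(_matches_planned_color({"name": "Slim Chinos - Dark Blue"}, "navy"))  # True
print(_matches_planned_color({"name": "Slim Chinos - Beige"}, "navy"))      # False
print(_matches_planned_color({"name": "Slim Chinos - Beige"}, ""))          # True
```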
@@ -918,10 +952,13 @@ def _is_relevant_scraped_product(
     if any(token in text for token in SCRAPER_RELEVANCE_EXCLUDE_TOKENS):
         return False
 
-    planned = _norm(planned_category)
-    planned_keywords = SCRAPER_CATEGORY_KEYWORDS.get(planned, {planned} if planned else set())
-    if planned_keywords and not any(keyword in text for keyword in planned_keywords):
-        return False
+    planned = _norm(planned_category)
+    planned_keywords = SCRAPER_CATEGORY_KEYWORDS.get(planned, {planned} if planned else set())
+    if planned_keywords and not any(keyword in text for keyword in planned_keywords):
+        return False
+
+    if not _matches_planned_color(product, planned_color):
+        return False
 
     if target_slot == "topwear":
         topwear_terms = {"shirt", "polo", "blazer", "jacket", "coat", "t-shirt", "tee", "hoodie"}
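A sketch of the category-keyword fallback used in the gate above: a planned category missing from SCRAPER_CATEGORY_KEYWORDS falls back to its literal token, and an empty category disables the gate entirely. The two-entry table and the `planned_keywords_for` wrapper name are illustrative, not from the source:

```python
SCRAPER_CATEGORY_KEYWORDS: dict[str, set[str]] = {
    "trousers": {"trousers", "pants", "chinos"},
    "shirt": {"shirt", "formal shirt"},
}


def planned_keywords_for(planned: str) -> set[str]:
    planned = planned.strip().lower()
    return SCRAPER_CATEGORY_KEYWORDS.get(planned, {planned} if planned else set())


print(planned_keywords_for("trousers"))  # {'trousers', 'pants', 'chinos'}
print(planned_keywords_for("cardigan"))  # {'cardigan'}: literal fallback
print(planned_keywords_for(""))          # set(): no category gate
```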
@@ -1501,12 +1538,13 @@ def _generate_scraper_plan_with_nemotron(
             continue
         seen_links.add(item_link)
 
-        if _is_relevant_scraped_product(
-            product=product,
-            target_slot=resolved_target,
-            planned_category=category,
-        [… 2 removed lines not recovered from the page extract]
+        if _is_relevant_scraped_product(
+            product=product,
+            target_slot=resolved_target,
+            planned_category=category,
+            planned_color=color,
+            occasion_bucket=occasion_bucket,
+        ):
             scraped_products.append(product)
             new_products += 1
         else:
@@ -1525,17 +1563,27 @@ def _generate_scraper_plan_with_nemotron(
                 }
             )
 
-    if not scraped_products and fallback_products:
-        scraped_products = fallback_products[:scrape_limit]
-        intermediate_steps.append(
-            {
-                "step": "scrape_fallback",
-                "query": query,
-                "new_products": len(scraped_products),
-                "total_products": len(scraped_products),
-                "message": "Used non-filtered scrape fallback because strict relevance filtering returned no products.",
-            }
-        )
+    if not scraped_products and fallback_products and not _color_keywords_for_relevance(color):
+        scraped_products = fallback_products[:scrape_limit]
+        intermediate_steps.append(
+            {
+                "step": "scrape_fallback",
+                "query": query,
+                "new_products": len(scraped_products),
+                "total_products": len(scraped_products),
+                "message": "Used non-filtered scrape fallback because strict relevance filtering returned no products.",
+            }
+        )
+    elif not scraped_products and fallback_products:
+        intermediate_steps.append(
+            {
+                "step": "scrape_filter",
+                "query": query,
+                "rejected_products": len(fallback_products),
+                "total_products": 0,
+                "message": "Rejected scraped products because none matched the planned color and category.",
+            }
+        )
 
     query_plan_payload = {
         "color": color,
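The hunk above changes the fallback policy: the unfiltered scrape is only reused when the plan carries no usable color keywords; otherwise the rejection is recorded as a new "scrape_filter" step. A condensed sketch of the branch logic (the `resolve_fallback` name and simplified step dicts are illustrative):

```python
def resolve_fallback(scraped, fallback, color_keywords, scrape_limit=10):
    steps = []
    if not scraped and fallback and not color_keywords:
        # No color to enforce, so keep the raw scrape rather than nothing.
        scraped = fallback[:scrape_limit]
        steps.append({"step": "scrape_fallback", "new_products": len(scraped)})
    elif not scraped and fallback:
        # A color was planned and nothing matched: record the rejection.
        steps.append({"step": "scrape_filter", "rejected_products": len(fallback)})
    return scraped, steps


# With a planned color, off-color products stay rejected:
print(resolve_fallback([], [{"name": "Beige Chinos"}], {"navy"}))
# Without one, the unfiltered scrape is kept:
print(resolve_fallback([], [{"name": "Beige Chinos"}], set()))
```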
zalando_scraper.py
CHANGED
@@ -5,7 +5,7 @@ import json
 import os
 import re
 from typing import Any, Callable, Optional
-from urllib.parse import urlencode, urlparse
+from urllib.parse import parse_qs, urlencode, urlparse
 
 import requests
 from bs4 import BeautifulSoup
@@ -60,7 +60,7 @@ CATEGORY_PATH_MAP = {
     "sportswear": {"women": "womens-sports", "men": "mens-sports", "unisex": "sports"},
 }
 
-_COLOR_TERMS = [
+_COLOR_TERMS = [
     "black",
     "white",
     "navy",
@@ -81,7 +81,32 @@ _COLOR_TERMS = [
     "purple",
     "yellow",
     "orange",
-]
+]
+
+_COLOR_QUERY_KEYWORDS: dict[str, set[str]] = {
+    "black": {"black"},
+    "white": {"white", "bright white", "off white", "off-white"},
+    "navy": {"navy", "dark blue", "dk blue", "dress blues", "moonlit ocean", "midnight blue"},
+    "blue": {"blue", "navy", "dark blue", "dk blue", "dress blues", "ice blue", "light blue", "skyway", "moonlit ocean"},
+    "grey": {"grey", "gray", "dark grey", "dark gray", "steel grey", "steel gray", "charcoal"},
+    "gray": {"grey", "gray", "dark grey", "dark gray", "steel grey", "steel gray", "charcoal"},
+    "beige": {"beige", "sand", "tan", "stone", "morel"},
+    "brown": {"brown", "tan", "morel"},
+    "olive": {"olive", "khaki"},
+    "green": {"green", "olive", "khaki"},
+    "red": {"red", "brick red", "winetasting", "wine"},
+    "maroon": {"maroon", "burgundy", "wine", "winetasting"},
+}
+
+_CATEGORY_QUERY_KEYWORDS: dict[str, set[str]] = {
+    "shirt": {"shirt", "formal shirt"},
+    "polo": {"polo"},
+    "jacket": {"jacket", "blazer", "coat"},
+    "trousers": {"trousers", "pants", "chinos"},
+    "pants": {"pants", "trousers", "chinos"},
+    "shorts": {"shorts"},
+    "jeans": {"jeans"},
+}
 
 
 ScrapePostprocessFn = Callable[[list[dict[str, str]]], list[dict[str, str]]]
@@ -89,8 +114,61 @@ WardrobeSummary = dict[str, Any]
 TextCompletionFn = Callable[[str, int], str]
 
 
-def _norm(value: Any) -> str:
-    return str(value or "").strip().lower()
+def _norm(value: Any) -> str:
+    return str(value or "").strip().lower()
+
+
+def _query_from_search_url(search_url: str) -> str:
+    parsed = urlparse(str(search_url or ""))
+    values = parse_qs(parsed.query).get("q") or []
+    return str(values[0] if values else "").strip()
+
+
+def _query_color_keywords(query: str) -> set[str]:
+    normalized = _norm(query)
+    for color in _COLOR_TERMS:
+        if color in normalized:
+            return _COLOR_QUERY_KEYWORDS.get(color, {color})
+    return set()
+
+
+def _query_category_keywords(query: str) -> set[str]:
+    normalized = _norm(query)
+    for category, keywords in _CATEGORY_QUERY_KEYWORDS.items():
+        if category in normalized:
+            return keywords
+    return set()
+
+
+def _product_match_text(product: dict[str, str]) -> str:
+    return _norm(
+        " ".join(
+            [
+                str(product.get("name") or ""),
+                str(product.get("color") or ""),
+                str(product.get("brand") or ""),
+                str(product.get("item_link") or ""),
+            ]
+        )
+    )
+
+
+def _filter_products_for_search_query(products: list[dict[str, str]], search_url: str) -> list[dict[str, str]]:
+    query = _query_from_search_url(search_url)
+    color_keywords = _query_color_keywords(query)
+    category_keywords = _query_category_keywords(query)
+    if not color_keywords and not category_keywords:
+        return products
+
+    filtered: list[dict[str, str]] = []
+    for product in products:
+        text = _product_match_text(product)
+        if color_keywords and not any(keyword in text for keyword in color_keywords):
+            continue
+        if category_keywords and not any(keyword in text for keyword in category_keywords):
+            continue
+        filtered.append(product)
+    return filtered
 
 
 def _normalize_target_category(value: Any) -> str:
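With the helpers above, zalando_scraper mirrors app.py's filtering on its own side: the query is recovered from the URL's q parameter and products must match the query's color and category keywords. A usage sketch (the imports reach into module-private names, and the URL is a made-up example):

```python
from zalando_scraper import _filter_products_for_search_query, _query_from_search_url

url = "https://www.zalando.com/catalog/?q=navy+trousers"
print(_query_from_search_url(url))  # "navy trousers"

products = [
    {"name": "Slim Chinos - Dark Blue", "color": "", "brand": "", "item_link": ""},
    {"name": "Slim Chinos - Beige", "color": "", "brand": "", "item_link": ""},
]
# "navy" expands to its shade keywords and "trousers" to {"trousers", "pants",
# "chinos"}; only the dark-blue pair passes both gates.
print(_filter_products_for_search_query(products, url))
```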
@@ -557,17 +635,11 @@ def _apify_actor_id_from_endpoint(endpoint: str) -> str:
     return "vistics~zalando-scraper"
 
 
-def _build_apify_payload(search_url: str, max_results: int[… line truncated in the page extract]
-    [… 4 removed lines not recovered from the page extract]
-    start_urls = [search_url]
-
-    return {
-        "startUrls": start_urls,
-        "maxResults": int(max_results),
-    }
+def _build_apify_payload(search_url: str, max_results: int) -> dict[str, Any]:
+    return {
+        "startUrls": [str(search_url or "").strip()],
+        "maxResults": int(max_results),
+    }
 
 
 def _http_error_detail(exc: requests.RequestException, limit: int = 800) -> str:
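_build_apify_payload loses its start-URL variants and now emits one fixed shape. A quick check of the output (made-up URL):

```python
from zalando_scraper import _build_apify_payload

payload = _build_apify_payload("https://www.zalando.com/catalog/?q=navy+trousers", 25)
print(payload)
# {'startUrls': ['https://www.zalando.com/catalog/?q=navy+trousers'], 'maxResults': 25}
```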
@@ -632,13 +704,10 @@ def _scrape_with_apify_run_dataset_fallback(
         wait_for_finish,
     )
 
-    variants = [
-    [… 4 removed lines not recovered from the page extract]
-    for variant_name, use_object_start_urls in variants:
-        run_payload = _build_apify_payload(search_url, effective_limit, start_urls_as_objects=use_object_start_urls)
+    variants = ["string"]
+
+    for variant_name in variants:
+        run_payload = _build_apify_payload(search_url, effective_limit)
         run_id = ""
         run_status = ""
         dataset_id = ""
@@ -777,13 +846,18 @@ def _normalize_product(item: dict[str, Any]) -> dict[str, str]:
         )
     )
 
-    [… 7 removed lines not recovered from the page extract]
+    color = str(item.get("color") or item.get("colorName") or item.get("colour") or "").strip()
+    if not color and " - " in name:
+        color = name.rsplit(" - ", 1)[-1].strip()
+
+    return {
+        "name": name or "N/A",
+        "price": price or "N/A",
+        "brand": brand,
+        "color": color,
+        "currency_symbol": currency_symbol,
+        "promotional_price": promotional_price,
+        "original_price": original_price,
         "discount_percent": discount_percent,
         "image_url": image_url,
         "item_link": url_value,
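_normalize_product now recovers a color: explicit fields first, then the suffix of a "Name - Color" title. A standalone sketch; rsplit(" - ", 1) splits once from the right on the spaced separator, so hyphenated names like "T-Shirt" survive intact:

```python
item = {"name": "Slim Fit T-Shirt - Jet Black"}

name = str(item.get("name") or "")
color = str(item.get("color") or item.get("colorName") or item.get("colour") or "").strip()
if not color and " - " in name:
    color = name.rsplit(" - ", 1)[-1].strip()

print(color)  # "Jet Black"
```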
@@ -804,14 +878,11 @@ def _scrape_with_apify(search_url: str, max_products: int | None, timeout_second
         actor_id,
     )
 
-    variants = [
-    [… 4 removed lines not recovered from the page extract]
-    for variant_name, use_object_start_urls in variants:
-        try:
-            payload = _build_apify_payload(search_url, effective_limit, start_urls_as_objects=use_object_start_urls)
+    variants = ["string"]
+    variant_errors: list[str] = []
+    for variant_name in variants:
+        try:
+            payload = _build_apify_payload(search_url, effective_limit)
             response = requests.post(_apify_request_url(), json=payload, timeout=apify_timeout)
             response.raise_for_status()
 
@@ -987,15 +1058,17 @@ def extract_product_summaries(
         errors.append(f"html: {exc}")
         logger.warning("zalando crawl failed source=html search_url=%s error=%s", search_url, exc)
 
-    if postprocess and _requires_postprocess(products):
-        try:
-            products = postprocess(products)
-        except Exception:
-            # Never fail scraping because post-processing failed.
-            pass
-
-
-    [… 1 removed line not recovered from the page extract]
+    if postprocess and _requires_postprocess(products):
+        try:
+            products = postprocess(products)
+        except Exception:
+            # Never fail scraping because post-processing failed.
+            pass
+
+    products = _filter_products_for_search_query(products, search_url)
+
+    if not products and errors:
+        logger.warning("zalando crawl completed with no results search_url=%s errors=%s", search_url, "; ".join(errors))
         raise requests.RequestException("; ".join(errors))
 
     logger.info("zalando crawl completed search_url=%s crawled=%s items=%s", search_url, bool(products), len(products))