HelloWorld0204 committed
Commit 3116e09 · verified · 1 Parent(s): ca9045a

Upload 22 files

Files changed (2):
  1. app.py +77 -29
  2. zalando_scraper.py +120 -47
app.py CHANGED
@@ -899,18 +899,52 @@ def _resolve_target_category(requested_target: str, wardrobe_snapshot: dict[str,
     return "bottomwear"
 
 
-def _product_text_for_relevance(product: dict[str, Any]) -> str:
-    name = str(product.get("name") or "")
-    url = str(product.get("item_link") or "")
-    return _norm(f"{name} {url}")
-
-
-def _is_relevant_scraped_product(
-    product: dict[str, Any],
-    target_slot: str,
-    planned_category: str,
-    occasion_bucket: str,
-) -> bool:
+def _product_text_for_relevance(product: dict[str, Any]) -> str:
+    name = str(product.get("name") or "")
+    url = str(product.get("item_link") or "")
+    color = str(product.get("color") or "")
+    brand = str(product.get("brand") or "")
+    return _norm(f"{name} {color} {brand} {url}")
+
+
+SCRAPER_COLOR_KEYWORDS: dict[str, set[str]] = {
+    "black": {"black", "jet black"},
+    "white": {"white", "bright white", "off white", "off-white"},
+    "navy": {"navy", "dark blue", "dk blue", "dress blues", "moonlit ocean", "midnight blue"},
+    "blue": {"blue", "navy", "dark blue", "dk blue", "dress blues", "ice blue", "light blue", "skyway", "moonlit ocean"},
+    "grey": {"grey", "gray", "dark grey", "dark gray", "steel grey", "steel gray", "charcoal"},
+    "gray": {"grey", "gray", "dark grey", "dark gray", "steel grey", "steel gray", "charcoal"},
+    "beige": {"beige", "sand", "tan", "stone", "morel"},
+    "brown": {"brown", "tan", "morel"},
+    "olive": {"olive", "khaki"},
+    "green": {"green", "olive", "khaki"},
+    "red": {"red", "brick red", "winetasting", "wine"},
+    "maroon": {"maroon", "burgundy", "wine", "winetasting"},
+}
+
+
+def _color_keywords_for_relevance(color: str) -> set[str]:
+    normalized = extract_base_color(color or "") or _norm(color)
+    if not normalized or normalized == "unknown":
+        return set()
+    return SCRAPER_COLOR_KEYWORDS.get(normalized, {normalized})
+
+
+def _matches_planned_color(product: dict[str, Any], planned_color: str) -> bool:
+    keywords = _color_keywords_for_relevance(planned_color)
+    if not keywords:
+        return True
+    text = _product_text_for_relevance(product)
+    return any(keyword in text for keyword in keywords)
+
+
+def _is_relevant_scraped_product(
+    product: dict[str, Any],
+    target_slot: str,
+    planned_category: str,
+    planned_color: str,
+    occasion_bucket: str,
+) -> bool:
     text = _product_text_for_relevance(product)
     if not text:
         return False
@@ -918,10 +952,13 @@ def _is_relevant_scraped_product(
     if any(token in text for token in SCRAPER_RELEVANCE_EXCLUDE_TOKENS):
         return False
 
-    planned = _norm(planned_category)
-    planned_keywords = SCRAPER_CATEGORY_KEYWORDS.get(planned, {planned} if planned else set())
-    if planned_keywords and not any(keyword in text for keyword in planned_keywords):
-        return False
+    planned = _norm(planned_category)
+    planned_keywords = SCRAPER_CATEGORY_KEYWORDS.get(planned, {planned} if planned else set())
+    if planned_keywords and not any(keyword in text for keyword in planned_keywords):
+        return False
+
+    if not _matches_planned_color(product, planned_color):
+        return False
 
     if target_slot == "topwear":
         topwear_terms = {"shirt", "polo", "blazer", "jacket", "coat", "t-shirt", "tee", "hoodie"}
@@ -1501,12 +1538,13 @@ def _generate_scraper_plan_with_nemotron(
                 continue
             seen_links.add(item_link)
 
-            if _is_relevant_scraped_product(
-                product=product,
-                target_slot=resolved_target,
-                planned_category=category,
-                occasion_bucket=occasion_bucket,
-            ):
+            if _is_relevant_scraped_product(
+                product=product,
+                target_slot=resolved_target,
+                planned_category=category,
+                planned_color=color,
+                occasion_bucket=occasion_bucket,
+            ):
                 scraped_products.append(product)
                 new_products += 1
             else:
@@ -1525,17 +1563,27 @@ def _generate_scraper_plan_with_nemotron(
            }
        )
 
-    if not scraped_products and fallback_products:
-        scraped_products = fallback_products[:scrape_limit]
-        intermediate_steps.append(
-            {
+    if not scraped_products and fallback_products and not _color_keywords_for_relevance(color):
+        scraped_products = fallback_products[:scrape_limit]
+        intermediate_steps.append(
+            {
                "step": "scrape_fallback",
                "query": query,
                "new_products": len(scraped_products),
                "total_products": len(scraped_products),
-                "message": "Used non-filtered scrape fallback because strict relevance filtering returned no products.",
-            }
-        )
+                "message": "Used non-filtered scrape fallback because strict relevance filtering returned no products.",
+            }
+        )
+    elif not scraped_products and fallback_products:
+        intermediate_steps.append(
+            {
+                "step": "scrape_filter",
+                "query": query,
+                "rejected_products": len(fallback_products),
+                "total_products": 0,
+                "message": "Rejected scraped products because none matched the planned color and category.",
+            }
+        )
 
     query_plan_payload = {
        "color": color,
 
zalando_scraper.py CHANGED
@@ -5,7 +5,7 @@ import json
 import os
 import re
 from typing import Any, Callable, Optional
-from urllib.parse import urlencode, urlparse
+from urllib.parse import parse_qs, urlencode, urlparse
 
 import requests
 from bs4 import BeautifulSoup
@@ -60,7 +60,7 @@ CATEGORY_PATH_MAP = {
     "sportswear": {"women": "womens-sports", "men": "mens-sports", "unisex": "sports"},
 }
 
-_COLOR_TERMS = [
+_COLOR_TERMS = [
     "black",
     "white",
     "navy",
@@ -81,7 +81,32 @@ _COLOR_TERMS = [
     "purple",
     "yellow",
     "orange",
-]
+]
+
+_COLOR_QUERY_KEYWORDS: dict[str, set[str]] = {
+    "black": {"black"},
+    "white": {"white", "bright white", "off white", "off-white"},
+    "navy": {"navy", "dark blue", "dk blue", "dress blues", "moonlit ocean", "midnight blue"},
+    "blue": {"blue", "navy", "dark blue", "dk blue", "dress blues", "ice blue", "light blue", "skyway", "moonlit ocean"},
+    "grey": {"grey", "gray", "dark grey", "dark gray", "steel grey", "steel gray", "charcoal"},
+    "gray": {"grey", "gray", "dark grey", "dark gray", "steel grey", "steel gray", "charcoal"},
+    "beige": {"beige", "sand", "tan", "stone", "morel"},
+    "brown": {"brown", "tan", "morel"},
+    "olive": {"olive", "khaki"},
+    "green": {"green", "olive", "khaki"},
+    "red": {"red", "brick red", "winetasting", "wine"},
+    "maroon": {"maroon", "burgundy", "wine", "winetasting"},
+}
+
+_CATEGORY_QUERY_KEYWORDS: dict[str, set[str]] = {
+    "shirt": {"shirt", "formal shirt"},
+    "polo": {"polo"},
+    "jacket": {"jacket", "blazer", "coat"},
+    "trousers": {"trousers", "pants", "chinos"},
+    "pants": {"pants", "trousers", "chinos"},
+    "shorts": {"shorts"},
+    "jeans": {"jeans"},
+}
 
 
 ScrapePostprocessFn = Callable[[list[dict[str, str]]], list[dict[str, str]]]
@@ -89,8 +114,61 @@ WardrobeSummary = dict[str, Any]
 TextCompletionFn = Callable[[str, int], str]
 
 
-def _norm(value: Any) -> str:
-    return str(value or "").strip().lower()
+def _norm(value: Any) -> str:
+    return str(value or "").strip().lower()
+
+
+def _query_from_search_url(search_url: str) -> str:
+    parsed = urlparse(str(search_url or ""))
+    values = parse_qs(parsed.query).get("q") or []
+    return str(values[0] if values else "").strip()
+
+
+def _query_color_keywords(query: str) -> set[str]:
+    normalized = _norm(query)
+    for color in _COLOR_TERMS:
+        if color in normalized:
+            return _COLOR_QUERY_KEYWORDS.get(color, {color})
+    return set()
+
+
+def _query_category_keywords(query: str) -> set[str]:
+    normalized = _norm(query)
+    for category, keywords in _CATEGORY_QUERY_KEYWORDS.items():
+        if category in normalized:
+            return keywords
+    return set()
+
+
+def _product_match_text(product: dict[str, str]) -> str:
+    return _norm(
+        " ".join(
+            [
+                str(product.get("name") or ""),
+                str(product.get("color") or ""),
+                str(product.get("brand") or ""),
+                str(product.get("item_link") or ""),
+            ]
+        )
+    )
+
+
+def _filter_products_for_search_query(products: list[dict[str, str]], search_url: str) -> list[dict[str, str]]:
+    query = _query_from_search_url(search_url)
+    color_keywords = _query_color_keywords(query)
+    category_keywords = _query_category_keywords(query)
+    if not color_keywords and not category_keywords:
+        return products
+
+    filtered: list[dict[str, str]] = []
+    for product in products:
+        text = _product_match_text(product)
+        if color_keywords and not any(keyword in text for keyword in color_keywords):
+            continue
+        if category_keywords and not any(keyword in text for keyword in category_keywords):
+            continue
+        filtered.append(product)
+    return filtered
 
 
 def _normalize_target_category(value: Any) -> str:
@@ -557,17 +635,11 @@ def _apify_actor_id_from_endpoint(endpoint: str) -> str:
     return "vistics~zalando-scraper"
 
 
-def _build_apify_payload(search_url: str, max_results: int, start_urls_as_objects: bool) -> dict[str, Any]:
-    start_urls: list[Any]
-    if start_urls_as_objects:
-        start_urls = [{"url": search_url}]
-    else:
-        start_urls = [search_url]
-
-    return {
-        "startUrls": start_urls,
-        "maxResults": int(max_results),
-    }
+def _build_apify_payload(search_url: str, max_results: int) -> dict[str, Any]:
+    return {
+        "startUrls": [str(search_url or "").strip()],
+        "maxResults": int(max_results),
+    }
 
 
 def _http_error_detail(exc: requests.RequestException, limit: int = 800) -> str:
@@ -632,13 +704,10 @@ def _scrape_with_apify_run_dataset_fallback(
         wait_for_finish,
     )
 
-    variants = [
-        ("string", False),
-        ("object", True),
-    ]
-
-    for variant_name, use_object_start_urls in variants:
-        run_payload = _build_apify_payload(search_url, effective_limit, start_urls_as_objects=use_object_start_urls)
+    variants = ["string"]
+
+    for variant_name in variants:
+        run_payload = _build_apify_payload(search_url, effective_limit)
         run_id = ""
         run_status = ""
         dataset_id = ""
@@ -777,13 +846,18 @@ def _normalize_product(item: dict[str, Any]) -> dict[str, str]:
        )
    )
 
-    return {
-        "name": name or "N/A",
-        "price": price or "N/A",
-        "brand": brand,
-        "currency_symbol": currency_symbol,
-        "promotional_price": promotional_price,
-        "original_price": original_price,
+    color = str(item.get("color") or item.get("colorName") or item.get("colour") or "").strip()
+    if not color and " - " in name:
+        color = name.rsplit(" - ", 1)[-1].strip()
+
+    return {
+        "name": name or "N/A",
+        "price": price or "N/A",
+        "brand": brand,
+        "color": color,
+        "currency_symbol": currency_symbol,
+        "promotional_price": promotional_price,
+        "original_price": original_price,
        "discount_percent": discount_percent,
        "image_url": image_url,
        "item_link": url_value,
@@ -804,14 +878,11 @@ def _scrape_with_apify(search_url: str, max_products: int | None, timeout_second
        actor_id,
    )
 
-    variants = [
-        ("string", False),
-        ("object", True),
-    ]
-    variant_errors: list[str] = []
-    for variant_name, use_object_start_urls in variants:
-        try:
-            payload = _build_apify_payload(search_url, effective_limit, start_urls_as_objects=use_object_start_urls)
+    variants = ["string"]
+    variant_errors: list[str] = []
+    for variant_name in variants:
+        try:
+            payload = _build_apify_payload(search_url, effective_limit)
            response = requests.post(_apify_request_url(), json=payload, timeout=apify_timeout)
            response.raise_for_status()
 
@@ -987,15 +1058,17 @@ def extract_product_summaries(
        errors.append(f"html: {exc}")
        logger.warning("zalando crawl failed source=html search_url=%s error=%s", search_url, exc)
 
-    if postprocess and _requires_postprocess(products):
-        try:
-            products = postprocess(products)
-        except Exception:
-            # Never fail scraping because post-processing failed.
-            pass
-
-    if not products and errors:
-        logger.warning("zalando crawl completed with no results search_url=%s errors=%s", search_url, "; ".join(errors))
+    if postprocess and _requires_postprocess(products):
+        try:
+            products = postprocess(products)
+        except Exception:
+            # Never fail scraping because post-processing failed.
+            pass
+
+    products = _filter_products_for_search_query(products, search_url)
+
+    if not products and errors:
+        logger.warning("zalando crawl completed with no results search_url=%s errors=%s", search_url, "; ".join(errors))
        raise requests.RequestException("; ".join(errors))
 
    logger.info("zalando crawl completed search_url=%s crawled=%s items=%s", search_url, bool(products), len(products))
 
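The "color" field that both filters rely on is recovered in _normalize_product by preferring explicit color keys and then falling back to the " - <shade>" suffix that Zalando listings often carry in the product name. A standalone sketch of that recovery rule (the item payloads here are hypothetical):

from typing import Any

def recover_color(item: dict[str, Any]) -> str:
    name = str(item.get("name") or "")
    color = str(item.get("color") or item.get("colorName") or item.get("colour") or "").strip()
    if not color and " - " in name:
        # rsplit keeps earlier " - " separators inside the product name intact.
        color = name.rsplit(" - ", 1)[-1].strip()
    return color

assert recover_color({"name": "PACO - Chinos - dark blue"}) == "dark blue"
assert recover_color({"name": "Basic crew tee", "colorName": "White"}) == "White"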