Spaces:
Sleeping
Sleeping
| import re | |
| import os | |
| import requests | |
| from base64 import b64decode | |
| from bs4 import BeautifulSoup | |
| from typing import Dict, Optional | |
# Zyte API key read from the environment; None when ZYTE_KEY is unset.
Z_KEY = os.environ.get('ZYTE_KEY')
# Substring searched (case-insensitively) in the page <title> to detect 404 pages.
PAGE_NOT_FOUND_STR = 'page not found'
def zyte_call(url: str, timeout: float = 60.0) -> bytes:
    """Fetch *url* through the Zyte extraction API and return the raw page body.

    Args:
        url: Page URL to fetch.
        timeout: Seconds to wait for the API response. New keyword with a
            default, so existing callers are unaffected — but the request can
            no longer hang indefinitely (the original had no timeout at all).

    Returns:
        The base64-decoded ``httpResponseBody`` bytes of the fetched page.

    Raises:
        requests.HTTPError: If the Zyte API returns an error status. The
            original fell through to a confusing ``KeyError`` on
            ``"httpResponseBody"`` instead.
    """
    api_response = requests.post(
        "https://api.zyte.com/v1/extract",
        auth=(Z_KEY, ""),
        json={
            "url": url,
            "httpResponseBody": True,
        },
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx before touching the JSON payload.
    api_response.raise_for_status()
    http_response_body: bytes = b64decode(
        api_response.json()["httpResponseBody"])
    return http_response_body
def get_asin_pdp(soup: BeautifulSoup) -> Optional[Dict[str, str]]:
    """Parse an Amazon product-detail page into a flat result dict.

    Args:
        soup: Parsed HTML of the product page.

    Returns:
        None when the page is a 404, otherwise a dict with keys ``asin``,
        ``title``, ``feature_bullets`` and ``tech_data`` (individual values
        may be None/empty when the corresponding element is absent).
    """
    # Check if 404. A page with no <title> tag previously raised
    # AttributeError here; treat it as "not a 404" and keep parsing.
    title_tag = soup.find('title')
    if title_tag is not None and PAGE_NOT_FOUND_STR in title_tag.text.lower():
        return None
    # Get ASIN: the last path segment of the canonical URL.
    try:
        asin = soup.find('link', rel='canonical')['href'].split('/')[-1]
    except TypeError:
        # find() returned None (no canonical link) — subscripting None raises TypeError.
        asin = None
    # Get title
    search = soup.find('span', id="productTitle")
    title = search.text.strip() if search else None
    # Get feature-bullets. The original's "if len(bullet_search)" condition was
    # redundant: an empty result list already yields an empty comprehension.
    search = soup.find('div', id="feature-bullets")
    if search:
        bullet_search = search.find_all('span', class_='a-list-item')
        feature_bullets = [h.text.strip() for h in bullet_search]
        # Remove Amazon's boilerplate "fits" bullet.
        feature_bullets = [
            b for b in feature_bullets
            if b != 'Make sure this fits by entering your model number.'
        ]
    else:
        feature_bullets = None
    # Get KV, tech, A+ tables. Merge with override key hierarchy: A+ > tech > KV
    # (later dicts in the unpacking win on duplicate keys).
    kv_res = parse_kv_table(soup)
    tech_res = parse_tech_table(soup)
    ap_data = parse_ap_table(soup)
    tech_data = {**kv_res, **tech_res, **ap_data}
    res = {'asin': asin, 'title': title,
           'feature_bullets': feature_bullets, 'tech_data': tech_data}
    return res
def parse_kv_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Extract key/value rows from the product-overview table.

    Args:
        soup: Parsed HTML of the product page.

    Returns:
        Mapping of attribute name -> value; empty when the overview table is
        absent or malformed.
    """
    kv_res: Dict[str, str] = {}
    # Explicit guards replace the original's broad try/except AttributeError,
    # which both hid real bugs and did NOT cover the IndexError a row with
    # fewer than two <td> cells would raise.
    search = soup.find('div', id='productOverview_feature_div')
    if search is None:
        return kv_res
    table = search.find('table')
    if table is None:
        return kv_res
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 2:
            continue  # skip header/malformed rows
        kv_res[cells[0].text.strip()] = cells[1].text.strip()
    return kv_res
def parse_tech_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Extract rows from the ``productDetails_techSpec*`` tables.

    Args:
        soup: Parsed HTML of the product page.

    Returns:
        Mapping of spec name -> value; empty when no tech-spec table exists.
    """
    tech_res: Dict[str, str] = {}
    # Iterating directly is safe on an empty result list, so the original
    # "if tables:" guard is unnecessary.
    for tab in soup.find_all('table', id=re.compile('productDetails_techSpec.*')):
        for row in tab.find_all('tr'):
            th = row.find('th')
            td = row.find('td')
            # Rows missing a header or value cell raised AttributeError in
            # the original; skip them instead.
            if th is None or td is None:
                continue
            key = th.text.strip()
            # Strip the left-to-right mark (\u200e) Amazon embeds in values.
            value = td.text.strip('\n').replace('\u200e', '').strip()
            tech_res[key] = value
    return tech_res
def parse_ap_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Extract key/value pairs from A+ content ("tech" div) tables.

    Args:
        soup: Parsed HTML of the product page.

    Returns:
        Mapping of attribute name -> value; empty when no A+ tables exist.
    """
    ap_res: Dict[str, str] = {}
    for div in soup.find_all('div', id='tech'):
        for tab in div.find_all('table'):
            for row in tab.find_all('tr'):
                kv = row.find_all('td')
                # The original's "if kv:" only excluded empty rows; a row
                # with a single <td> still raised IndexError on kv[1].
                if len(kv) < 2:
                    continue
                # Strip the left-to-right mark (\u200e) Amazon embeds in text.
                key = kv[0].text.strip('\n').replace('\u200e', '').strip()
                value = kv[1].text.strip('\n').replace('\u200e', '').strip()
                ap_res[key] = value
    return ap_res