Spaces:
Sleeping
Sleeping
add error handling
Browse files
- app.py +4 -0
- src/scrape.py +7 -2
app.py
CHANGED
|
@@ -17,6 +17,10 @@ def asin_to_pdp(asin_or_url: str) -> dict:
|
|
| 17 |
|
| 18 |
html = scrape.zyte_call(asin_url)
|
| 19 |
asin_pdp = scrape.get_asin_pdp(BeautifulSoup(html, 'html.parser'))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
return asin_pdp
|
| 21 |
|
| 22 |
|
|
|
|
| 17 |
|
| 18 |
html = scrape.zyte_call(asin_url)
|
| 19 |
asin_pdp = scrape.get_asin_pdp(BeautifulSoup(html, 'html.parser'))
|
| 20 |
+
if not asin_pdp:
|
| 21 |
+
raise gr.Error('Input URL not found (404)')
|
| 22 |
+
elif not asin_pdp.get('title') or not asin_pdp.get('tech_data'):
|
| 23 |
+
raise gr.Error("Couldn't fetch title or technical details from input URL")
|
| 24 |
return asin_pdp
|
| 25 |
|
| 26 |
|
src/scrape.py
CHANGED
|
@@ -3,9 +3,10 @@ import os
|
|
| 3 |
import requests
|
| 4 |
from base64 import b64decode
|
| 5 |
from bs4 import BeautifulSoup
|
| 6 |
-
from typing import Dict
|
| 7 |
|
| 8 |
Z_KEY = os.environ.get('ZYTE_KEY')
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
def zyte_call(url: str) -> bytes:
|
|
@@ -22,7 +23,11 @@ def zyte_call(url: str) -> bytes:
|
|
| 22 |
return http_response_body
|
| 23 |
|
| 24 |
|
| 25 |
-
def get_asin_pdp(soup: BeautifulSoup) -> Dict[str, str]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
# Get ASIN
|
| 27 |
try:
|
| 28 |
asin = soup.find('link', rel='canonical')['href'].split('/')[-1]
|
|
|
|
| 3 |
import requests
|
| 4 |
from base64 import b64decode
|
| 5 |
from bs4 import BeautifulSoup
|
| 6 |
+
from typing import Dict, Optional
|
| 7 |
|
| 8 |
Z_KEY = os.environ.get('ZYTE_KEY')
|
| 9 |
+
PAGE_NOT_FOUND_STR = 'page not found'
|
| 10 |
|
| 11 |
|
| 12 |
def zyte_call(url: str) -> bytes:
|
|
|
|
| 23 |
return http_response_body
|
| 24 |
|
| 25 |
|
| 26 |
+
def get_asin_pdp(soup: BeautifulSoup) -> Optional[Dict[str, str]]:
|
| 27 |
+
# Check if 404
|
| 28 |
+
if PAGE_NOT_FOUND_STR in soup.find('title').text.lower():
|
| 29 |
+
return None
|
| 30 |
+
|
| 31 |
# Get ASIN
|
| 32 |
try:
|
| 33 |
asin = soup.find('link', rel='canonical')['href'].split('/')[-1]
|