Spaces:
Runtime error
Runtime error
Switched to simple check for domain extension.
Browse files
- functions/helper_functions.py +1 -4
- functions/tools.py +2 -2
functions/helper_functions.py
CHANGED
|
@@ -20,6 +20,7 @@ def get_url(company_name: str) -> str:
|
|
| 20 |
'''
|
| 21 |
|
| 22 |
logger = logging.getLogger(__name__ + '.get_url')
|
|
|
|
| 23 |
|
| 24 |
query = f'{company_name} official website'
|
| 25 |
|
|
@@ -45,10 +46,6 @@ def get_feed(website_url: str) -> str:
|
|
| 45 |
|
| 46 |
feeds = feed_search(website_url)
|
| 47 |
|
| 48 |
-
logger.info('Feeds search result is: %s', type(feeds))
|
| 49 |
-
logger.info('Feeds search results: %s', len(feeds))
|
| 50 |
-
logger.info('Feeds results: %s', list(feeds))
|
| 51 |
-
|
| 52 |
if len(feeds) > 0:
|
| 53 |
return str(feeds[0].url)
|
| 54 |
|
|
|
|
| 20 |
'''
|
| 21 |
|
| 22 |
logger = logging.getLogger(__name__ + '.get_url')
|
| 23 |
+
logger.info('Getting website URL for %s', company_name)
|
| 24 |
|
| 25 |
query = f'{company_name} official website'
|
| 26 |
|
|
|
|
| 46 |
|
| 47 |
feeds = feed_search(website_url)
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
if len(feeds) > 0:
|
| 50 |
return str(feeds[0].url)
|
| 51 |
|
functions/tools.py
CHANGED
|
@@ -2,11 +2,11 @@
|
|
| 2 |
|
| 3 |
import logging
|
| 4 |
from urllib.parse import urlparse
|
| 5 |
-
import validators
|
| 6 |
import functions.helper_functions as helper_funcs
|
| 7 |
|
| 8 |
FEED_URIS = {}
|
| 9 |
RSS_EXTENSIONS = ['xml', 'rss', 'atom']
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
def get_content(website: str) -> list:
|
|
@@ -39,7 +39,7 @@ def get_content(website: str) -> list:
|
|
| 39 |
# If neither of those get it - try feedparse if it looks like a url
|
| 40 |
# or else just google it
|
| 41 |
else:
|
| 42 |
-
if
|
| 43 |
website_url = website
|
| 44 |
logger.info('%s looks like a website URL', website)
|
| 45 |
|
|
|
|
| 2 |
|
| 3 |
import logging
|
| 4 |
from urllib.parse import urlparse
|
|
|
|
| 5 |
import functions.helper_functions as helper_funcs
|
| 6 |
|
| 7 |
FEED_URIS = {}
|
| 8 |
RSS_EXTENSIONS = ['xml', 'rss', 'atom']
|
| 9 |
+
COMMON_EXTENSIONS = ['com', 'net', 'org', 'edu', 'gov', 'co', 'us']
|
| 10 |
|
| 11 |
|
| 12 |
def get_content(website: str) -> list:
|
|
|
|
| 39 |
# If neither of those get it - try feedparse if it looks like a url
|
| 40 |
# or else just google it
|
| 41 |
else:
|
| 42 |
+
if website.split('.')[-1] in COMMON_EXTENSIONS:
|
| 43 |
website_url = website
|
| 44 |
logger.info('%s looks like a website URL', website)
|
| 45 |
|