Update main.py
Browse files
main.py
CHANGED
|
@@ -36,10 +36,11 @@ async def power_scrapper(url):
|
|
| 36 |
|
| 37 |
# Extract all links
|
| 38 |
links = await page.query_selector_all('a')
|
| 39 |
-
|
|
|
|
| 40 |
for link in links:
|
| 41 |
href = await link.get_attribute('href')
|
| 42 |
-
result.append(
|
| 43 |
|
| 44 |
# Extract all text content
|
| 45 |
elements = await page.query_selector_all('body *')
|
|
@@ -47,10 +48,10 @@ async def power_scrapper(url):
|
|
| 47 |
for element in elements:
|
| 48 |
text_content = await element.text_content()
|
| 49 |
if text_content and text_content.strip():
|
| 50 |
-
|
| 51 |
|
| 52 |
await browser.close()
|
| 53 |
-
return
|
| 54 |
|
| 55 |
|
| 56 |
def get_links(soup):
|
|
@@ -89,6 +90,6 @@ async def get_data(url: str):
|
|
| 89 |
|
| 90 |
if links==[]:
|
| 91 |
print("running alternative scrapper")
|
| 92 |
-
links = await power_scrapper(url)
|
| 93 |
|
| 94 |
-
return ({"title": title ,
|
|
|
|
| 36 |
|
| 37 |
# Extract all links
|
| 38 |
links = await page.query_selector_all('a')
|
| 39 |
+
page_url = []
|
| 40 |
+
page_content = []
|
| 41 |
for link in links:
|
| 42 |
href = await link.get_attribute('href')
|
| 43 |
+
result.append(href)
|
| 44 |
|
| 45 |
# Extract all text content
|
| 46 |
elements = await page.query_selector_all('body *')
|
|
|
|
| 48 |
for element in elements:
|
| 49 |
text_content = await element.text_content()
|
| 50 |
if text_content and text_content.strip():
|
| 51 |
+
page_content.append(text_content.strip())
|
| 52 |
|
| 53 |
await browser.close()
|
| 54 |
+
return page_url,page_content
|
| 55 |
|
| 56 |
|
| 57 |
def get_links(soup):
|
|
|
|
| 90 |
|
| 91 |
if links==[]:
|
| 92 |
print("running alternative scrapper")
|
| 93 |
+
links,text_content = await power_scrapper(url)
|
| 94 |
|
| 95 |
+
return ({"title": title ,"URL":links,"Content":text_content})
|