Update scraper.py
Browse files - scraper.py +13 -11
scraper.py
CHANGED
|
@@ -4,6 +4,8 @@ import asyncio
|
|
| 4 |
from playwright.async_api import async_playwright
|
| 5 |
from bs4 import BeautifulSoup
|
| 6 |
import requests
|
|
|
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
|
|
@@ -13,26 +15,26 @@ class Scraper:
|
|
| 13 |
async with async_playwright() as p:
|
| 14 |
browser = await p.chromium.launch(headless=True)
|
| 15 |
page = await browser.new_page()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
await page.goto(url)
|
| 17 |
-
|
| 18 |
# Get the title
|
| 19 |
#title = await page.title()
|
| 20 |
-
|
| 21 |
# Get all links
|
| 22 |
page_url = await page.evaluate("""() => {
|
| 23 |
return Array.from(document.querySelectorAll('a')).map(a => a.href);
|
| 24 |
}""")
|
| 25 |
-
|
| 26 |
-
# Get page content (paragraphs)
|
| 27 |
page_content = await page.evaluate("""() => {
|
| 28 |
-
|
| 29 |
-
|
| 30 |
}""")
|
| 31 |
-
|
| 32 |
-
# Print the results
|
| 33 |
-
# print(f"Title: {title}")
|
| 34 |
-
# print(f"Links: {links}")
|
| 35 |
-
# print(f"Content: {content}")
|
| 36 |
|
| 37 |
await browser.close()
|
| 38 |
return page_url, page_content
|
|
|
|
| 4 |
from playwright.async_api import async_playwright
|
| 5 |
from bs4 import BeautifulSoup
|
| 6 |
import requests
|
| 7 |
+
import time
|
| 8 |
+
|
| 9 |
|
| 10 |
|
| 11 |
|
|
|
|
| 15 |
async with async_playwright() as p:
|
| 16 |
browser = await p.chromium.launch(headless=True)
|
| 17 |
page = await browser.new_page()
|
| 18 |
+
|
| 19 |
+
# Abort image, media (audio/video), and stylesheet requests; let all other requests continue
|
| 20 |
+
await page.route("**/*", lambda route: route.abort() if route.request.resource_type in ["image", "media", "stylesheet"] else route.continue_())
|
| 21 |
+
|
| 22 |
await page.goto(url)
|
| 23 |
+
|
| 24 |
# Get the title
|
| 25 |
#title = await page.title()
|
| 26 |
+
|
| 27 |
# Get all links
|
| 28 |
page_url = await page.evaluate("""() => {
|
| 29 |
return Array.from(document.querySelectorAll('a')).map(a => a.href);
|
| 30 |
}""")
|
| 31 |
+
|
| 32 |
+
# Get page content (innerText of every element under <body>, joined with newlines)
|
| 33 |
page_content = await page.evaluate("""() => {
|
| 34 |
+
let elements = Array.from(document.querySelectorAll('body *'));
|
| 35 |
+
return elements.map(element => element.innerText).join('\\n');
|
| 36 |
}""")
|
| 37 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
await browser.close()
|
| 40 |
return page_url, page_content
|