ocr_api2

Sleeping

Arafath10 committed on Aug 1, 2024

Commit

706f4ae

verified ·

1 Parent(s): 599b9d7

Update scraper.py

Files changed (1) hide show

scraper.py CHANGED Viewed

@@ -4,6 +4,8 @@ import asyncio
 from playwright.async_api import async_playwright
 from bs4 import BeautifulSoup
 import requests
@@ -13,26 +15,26 @@ class Scraper:
         async with async_playwright() as p:
             browser = await p.chromium.launch(headless=True)
             page = await browser.new_page()
             await page.goto(url)
             # Get the title
             #title = await page.title()
             # Get all links
             page_url = await page.evaluate("""() => {
                 return Array.from(document.querySelectorAll('a')).map(a => a.href);
             }""")
-            # Get page content (paragraphs, headers)
             page_content = await page.evaluate("""() => {
-            let elements = Array.from(document.querySelectorAll('body *'));
-            return elements.map(element => element.innerText).join('\\n');
             }""")
-            # Print the results
-            # print(f"Title: {title}")
-            # print(f"Links: {links}")
-            # print(f"Content: {content}")
             await browser.close()
             return page_url, page_content

 from playwright.async_api import async_playwright
 from bs4 import BeautifulSoup
 import requests
+import time
         async with async_playwright() as p:
             browser = await p.chromium.launch(headless=True)
             page = await browser.new_page()
+            # Route to block images, videos, and CSS
+            await page.route("**/*", lambda route: route.abort() if route.request.resource_type in ["image", "media", "stylesheet"] else route.continue_())
             await page.goto(url)
             # Get the title
             #title = await page.title()
             # Get all links
             page_url = await page.evaluate("""() => {
                 return Array.from(document.querySelectorAll('a')).map(a => a.href);
             }""")
+            # Get page content (text from paragraphs and headers)
             page_content = await page.evaluate("""() => {
+                let elements = Array.from(document.querySelectorAll('body *'));
+                return elements.map(element => element.innerText).join('\\n');
             }""")
             await browser.close()
             return page_url, page_content