Update scraper.py
scraper.py (+33 -1)
@@ -8,6 +8,38 @@ import requests
 
 
 class Scraper:
+    @staticmethod
+    async def power_scrapper_2(url):
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=True)
+            page = await browser.new_page()
+            await page.goto(url)
+
+            # Get the title
+            #title = await page.title()
+
+            # Get all links
+            page_url = await page.evaluate("""() => {
+                return Array.from(document.querySelectorAll('a')).map(a => a.href);
+            }""")
+
+            # Get page content (paragraphs, headers)
+            page_content = await page.evaluate("""() => {
+                let elements = Array.from(document.querySelectorAll('p, h1, h2, h3, h4, h5, h6'));
+                return elements.map(element => ({
+                    tag: element.tagName,
+                    text: element.innerText
+                }));
+            }""")
+
+            # Print the results
+            # print(f"Title: {title}")
+            # print(f"Links: {links}")
+            # print(f"Content: {content}")
+
+            await browser.close()
+            return page_url, page_content
+
     @staticmethod
     async def power_scrapper(url):
         async with async_playwright() as p:
@@ -76,6 +108,6 @@ class Scraper:
 
         if not links:
             print("Running alternative scrapper")
-            links, text_content = await Scraper.
+            links, text_content = await Scraper.power_scrapper_2(url)
 
         return {"title": title, "URL": links, "Content": text_content}
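For reference, a minimal sketch of how the new fallback might be exercised in isolation. This assumes scraper.py is importable as a module with async_playwright imported from playwright.async_api (that import sits outside the visible hunks), and the main entry point here is hypothetical, not part of the commit:

import asyncio

from scraper import Scraper  # hypothetical import path; adjust to the project layout

async def main():
    # power_scrapper_2 drives headless Chromium and returns (links, content):
    # links is a list of href strings, content a list of {tag, text} dicts
    # for p/h1-h6 elements, as produced by the two page.evaluate() calls.
    links, content = await Scraper.power_scrapper_2("https://example.com")
    print(f"Found {len(links)} links")
    for item in content[:3]:
        print(item["tag"], item["text"][:60])

asyncio.run(main())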