Create scraper.py
scraper.py ADDED (+79 -0)
@@ -0,0 +1,79 @@
# scraper.py

from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import requests


class Scraper:
    @staticmethod
    async def power_scrapper(url):
        """Playwright-based fallback for pages that render their content with JavaScript."""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()

            # Block everything except documents and scripts to speed up loading
            await page.route(
                "**/*",
                lambda route: route.continue_()
                if route.request.resource_type in ["document", "script"]
                else route.abort(),
            )

            # Open the target website
            await page.goto(url, wait_until='domcontentloaded')

            # Wait briefly so dynamically injected content has a chance to load
            await page.wait_for_timeout(1000)

            # Extract all links
            links = await page.query_selector_all('a')
            page_url = []
            page_content = []
            for link in links:
                href = await link.get_attribute('href')
                page_url.append(href)

            # Extract all text content (note: nested elements repeat their text)
            elements = await page.query_selector_all('body *')
            for element in elements:
                text_content = await element.text_content()
                if text_content and text_content.strip():
                    page_content.append(text_content.strip())

            await browser.close()
            return page_url, page_content

    @staticmethod
    def get_links(soup):
        links = []
        for link in soup.find_all('a'):
            href = link.get('href')
            links.append(href)
        return links

    @staticmethod
    def get_text_content(soup):
        text_elements = []
        for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span']:
            for element in soup.find_all(tag):
                text_elements.append(element.get_text())
        return text_elements

    @staticmethod
    def get_title(soup):
        # Guard against pages without a <title> tag
        title_tag = soup.find('title')
        return title_tag.get_text() if title_tag else None

    @staticmethod
    async def scrape(url):
        headers = {'User-Agent': 'Mozilla/5.0'}
        # Note: requests is blocking, so this call stalls the event loop while it runs
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')

        title = Scraper.get_title(soup)
        links = Scraper.get_links(soup)
        text_content = Scraper.get_text_content(soup)

        # Fall back to the headless browser when the static fetch finds no links,
        # a common sign the page is rendered client-side
        if not links:
            print("Running alternative scraper")
            links, text_content = await Scraper.power_scrapper(url)

        return {"title": title, "URL": links, "Content": text_content}
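A minimal driver for the new module might look like the sketch below. The file name, the target URL, and the printed summary are placeholders, not part of this commit; it assumes the dependencies are installed (requests, beautifulsoup4, playwright) and that playwright's Chromium has been fetched with `playwright install chromium` so the fallback path can run.

# example_usage.py -- hypothetical driver, not part of this commit
import asyncio

from scraper import Scraper


async def main():
    # The URL is a placeholder; substitute any page you want to scrape
    result = await Scraper.scrape("https://example.com")
    print(result["title"])
    print(f"{len(result['URL'])} links, {len(result['Content'])} text blocks")


if __name__ == "__main__":
    asyncio.run(main())

Because Scraper.scrape is an async static method, it has to be driven from an event loop (asyncio.run here) even when the fast requests-based path is the one that ends up serving the result.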