webscrapper

Sleeping

App Files Files Community

Arafath10 committed on Oct 2, 2024

Commit

1d74e32

verified ·

1 Parent(s): c0bee13

Update main.py

Browse files

Files changed (1) hide show

main.py +33 -21

main.py CHANGED Viewed

@@ -39,54 +39,65 @@ async def get_data(url: str):
 # FastAPI route to scrape the website
 @app.get("/scrape")
-async def scrape_website(url):
     async with async_playwright() as p:
-        # Try using WebKit or Firefox if Chromium fails
-        browser = await p.webkit.launch(headless=True)  # Switch to WebKit
-        # Create a new browser context with a realistic user-agent
         context = await browser.new_context(
             user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
         )
-        # Set additional headers to force HTTP/1.1 and avoid detection
         await context.set_extra_http_headers({
             "Accept-Language": "en-US,en;q=0.9",
             "Upgrade-Insecure-Requests": "1",
-            "Connection": "keep-alive"  # Force HTTP/1.1 instead of HTTP/2
         })
-        # Open a new page
         page = await context.new_page()
-        # Route to block images, videos, and CSS to speed up page load
         await page.route("**/*", lambda route: route.abort() if route.request.resource_type in ["image", "media", "stylesheet", "font", "xhr"] else route.continue_())
         try:
-            # Introduce a slight delay to mimic human behavior
-            await asyncio.sleep(random.uniform(1, 3))
-            # Navigate to the page with an extended timeout
             await page.goto(url, wait_until='domcontentloaded', timeout=60000)
-            # Simulate human behavior by scrolling and moving the mouse
             await page.mouse.move(random.uniform(0, 100), random.uniform(0, 100))
             await page.mouse.wheel(0, random.uniform(200, 400))
-            await asyncio.sleep(random.uniform(1, 3))  # Random delay
             # Get the title of the page
             title = await page.title()
-            # Introduce a slight delay before fetching the links
-            await asyncio.sleep(random.uniform(1, 2))
             # Get all links on the page
             links = await page.evaluate("""() => {
                 return Array.from(document.querySelectorAll('a')).map(a => a.href);
             }""")
-            # Introduce another slight delay before fetching the content
-            await asyncio.sleep(random.uniform(1, 2))
             # Get page content (text from paragraphs and headers)
             content = await page.evaluate("""() => {
                 let elements = Array.from(document.querySelectorAll('body *'));
@@ -108,3 +119,4 @@ async def scrape_website(url):
         except Exception as e:
             return {"error": str(e)}

 # FastAPI route to scrape the website
 @app.get("/scrape")
+async def scrape_website(url: str):
     async with async_playwright() as p:
+        # Launch browser in headless mode with custom args to bypass detection
+        browser = await p.chromium.launch(
+            headless=True,
+            args=[
+                "--disable-blink-features=AutomationControlled",  # Disable automation features
+                "--no-sandbox",
+                "--disable-dev-shm-usage",
+                "--disable-web-security",
+                "--disable-setuid-sandbox",
+                "--disable-features=IsolateOrigins,site-per-process"
+            ]
+        )
+        # Create a new browser context
         context = await browser.new_context(
             user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
         )
+        # Set additional headers to mimic real browsing
         await context.set_extra_http_headers({
             "Accept-Language": "en-US,en;q=0.9",
             "Upgrade-Insecure-Requests": "1",
+            "Referer": "https://www.nasdaq.com"
         })
+        # Create a new page
         page = await context.new_page()
+        # Hide WebDriver and other automation-related properties
+        await page.add_init_script("""
+            Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
+            window.navigator.chrome = { runtime: {} };
+            Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
+            Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']});
+        """)
+        # Block unnecessary resources (images, media, etc.)
         await page.route("**/*", lambda route: route.abort() if route.request.resource_type in ["image", "media", "stylesheet", "font", "xhr"] else route.continue_())
         try:
+            # Navigate to the page with random delays
+            await asyncio.sleep(random.uniform(1, 5))  # Random delay
             await page.goto(url, wait_until='domcontentloaded', timeout=60000)
+            # Randomized mouse movement and scrolling to mimic human interaction
             await page.mouse.move(random.uniform(0, 100), random.uniform(0, 100))
             await page.mouse.wheel(0, random.uniform(200, 400))
+            await asyncio.sleep(random.uniform(1, 5))  # Another random delay
             # Get the title of the page
             title = await page.title()
             # Get all links on the page
             links = await page.evaluate("""() => {
                 return Array.from(document.querySelectorAll('a')).map(a => a.href);
             }""")
             # Get page content (text from paragraphs and headers)
             content = await page.evaluate("""() => {
                 let elements = Array.from(document.querySelectorAll('body *'));
         except Exception as e:
             return {"error": str(e)}