Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files
- main.py +4 -7
- requirements.txt +1 -0
main.py
CHANGED
|
@@ -4,6 +4,7 @@ nest_asyncio.apply()
|
|
| 4 |
from fastapi import FastAPI, HTTPException
|
| 5 |
from pydantic import BaseModel, HttpUrl
|
| 6 |
from playwright.async_api import async_playwright
|
|
|
|
| 7 |
from bs4 import BeautifulSoup, Comment
|
| 8 |
import re
|
| 9 |
import asyncio
|
|
@@ -65,15 +66,11 @@ async def scrape_with_playwright(url: str):
|
|
| 65 |
timezone_id="America/New_York"
|
| 66 |
)
|
| 67 |
|
| 68 |
-
# Add init script to further hide webdriver property
|
| 69 |
-
await context.add_init_script("""
|
| 70 |
-
Object.defineProperty(navigator, 'webdriver', {
|
| 71 |
-
get: () => undefined
|
| 72 |
-
});
|
| 73 |
-
""")
|
| 74 |
-
|
| 75 |
page = await context.new_page()
|
| 76 |
|
|
|
|
|
|
|
|
|
|
| 77 |
try:
|
| 78 |
# Go to URL and wait for network to be idle (load complete)
|
| 79 |
await page.goto(url, wait_until="networkidle", timeout=30000)
|
|
|
|
| 4 |
from fastapi import FastAPI, HTTPException
|
| 5 |
from pydantic import BaseModel, HttpUrl
|
| 6 |
from playwright.async_api import async_playwright
|
| 7 |
+
from playwright_stealth import stealth_async
|
| 8 |
from bs4 import BeautifulSoup, Comment
|
| 9 |
import re
|
| 10 |
import asyncio
|
|
|
|
| 66 |
timezone_id="America/New_York"
|
| 67 |
)
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
page = await context.new_page()
|
| 70 |
|
| 71 |
+
# Apply stealth to the page
|
| 72 |
+
await stealth_async(page)
|
| 73 |
+
|
| 74 |
try:
|
| 75 |
# Go to URL and wait for network to be idle (load complete)
|
| 76 |
await page.goto(url, wait_until="networkidle", timeout=30000)
|
requirements.txt
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
fastapi
|
| 2 |
uvicorn
|
| 3 |
playwright
|
|
|
|
| 4 |
nest_asyncio
|
| 5 |
beautifulsoup4
|
| 6 |
lxml
|
|
|
|
| 1 |
fastapi
|
| 2 |
uvicorn
|
| 3 |
playwright
|
| 4 |
+
playwright-stealth
|
| 5 |
nest_asyncio
|
| 6 |
beautifulsoup4
|
| 7 |
lxml
|