Spaces:
Sleeping
Sleeping
this is supposed to be better
Browse files- Dockerfile +6 -16
- app.py +15 -41
- requirements.txt +2 -1
Dockerfile
CHANGED
|
@@ -1,26 +1,16 @@
|
|
| 1 |
FROM python:3.10-slim
|
| 2 |
|
| 3 |
-
# Install
|
| 4 |
-
RUN
|
| 5 |
-
&& apt-get update && apt-get install -y \
|
| 6 |
wget \
|
| 7 |
gnupg \
|
| 8 |
unzip \
|
| 9 |
curl \
|
| 10 |
-
&& dpkg -i google-chrome-stable_current_amd64.deb || true \
|
| 11 |
-
&& apt-get install -f -y \
|
| 12 |
&& apt-get clean \
|
| 13 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
# Install ChromeDriver to a writable location
|
| 19 |
-
RUN wget -q https://chromedriver.storage.googleapis.com/114.0.5735.90/chromedriver_linux64.zip \
|
| 20 |
-
&& unzip chromedriver_linux64.zip -d /tmp/ \
|
| 21 |
-
&& rm chromedriver_linux64.zip \
|
| 22 |
-
&& chmod +x /tmp/chromedriver \
|
| 23 |
-
&& /tmp/chromedriver --version
|
| 24 |
|
| 25 |
WORKDIR /app
|
| 26 |
|
|
|
|
| 1 |
FROM python:3.10-slim
|
| 2 |
|
| 3 |
+
# Install basic dependencies
|
| 4 |
+
RUN apt-get update && apt-get install -y \
|
|
|
|
| 5 |
wget \
|
| 6 |
gnupg \
|
| 7 |
unzip \
|
| 8 |
curl \
|
|
|
|
|
|
|
| 9 |
&& apt-get clean \
|
| 10 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
+
|
| 12 |
+
# Install Playwright browsers
|
| 13 |
+
RUN playwright install chromium
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
WORKDIR /app
|
| 16 |
|
app.py
CHANGED
|
@@ -5,15 +5,9 @@ import requests
|
|
| 5 |
from bs4 import BeautifulSoup
|
| 6 |
import tempfile
|
| 7 |
import os
|
| 8 |
-
from selenium import webdriver
|
| 9 |
-
from selenium.webdriver.chrome.options import Options
|
| 10 |
-
from selenium.webdriver.chrome.service import Service
|
| 11 |
-
from selenium.webdriver.common.by import By
|
| 12 |
-
from selenium.webdriver.support.ui import WebDriverWait
|
| 13 |
-
from selenium.webdriver.support import expected_conditions as EC
|
| 14 |
from datetime import datetime
|
| 15 |
import pytz
|
| 16 |
-
from webdriver_manager.chrome import ChromeDriverManager
|
| 17 |
|
| 18 |
os.environ["SE_CACHE_PATH"] = "/tmp/selenium"
|
| 19 |
|
|
@@ -258,40 +252,20 @@ def seek():
|
|
| 258 |
'Connection': 'keep-alive',
|
| 259 |
'Upgrade-Insecure-Requests': '1'
|
| 260 |
}
|
| 261 |
-
# Use
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
options.add_argument("--memory-pressure-off")
|
| 276 |
-
options.add_argument("--max_old_space_size=4096")
|
| 277 |
-
|
| 278 |
-
# Use ChromeDriver from /tmp location and specify Chrome binary path
|
| 279 |
-
service = Service("/tmp/chromedriver")
|
| 280 |
-
options.binary_location = "/tmp/google-chrome"
|
| 281 |
-
driver = webdriver.Chrome(service=service, options=options)
|
| 282 |
-
|
| 283 |
-
driver.get(url)
|
| 284 |
-
|
| 285 |
-
# Wait for the page to load and look for divp elements
|
| 286 |
-
try:
|
| 287 |
-
WebDriverWait(driver, 15).until(
|
| 288 |
-
EC.presence_of_element_located((By.CLASS_NAME, "divp"))
|
| 289 |
-
)
|
| 290 |
-
except:
|
| 291 |
-
pass
|
| 292 |
-
|
| 293 |
-
html = driver.page_source
|
| 294 |
-
driver.quit()
|
| 295 |
|
| 296 |
soup = BeautifulSoup(html, 'html.parser')
|
| 297 |
|
|
|
|
| 5 |
from bs4 import BeautifulSoup
|
| 6 |
import tempfile
|
| 7 |
import os
|
| 8 |
+
from playwright.sync_api import sync_playwright
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
from datetime import datetime
|
| 10 |
import pytz
|
|
|
|
| 11 |
|
| 12 |
os.environ["SE_CACHE_PATH"] = "/tmp/selenium"
|
| 13 |
|
|
|
|
| 252 |
'Connection': 'keep-alive',
|
| 253 |
'Upgrade-Insecure-Requests': '1'
|
| 254 |
}
|
| 255 |
+
# Use Playwright to handle redirects and wait for page to load
|
| 256 |
+
with sync_playwright() as p:
|
| 257 |
+
browser = p.chromium.launch(headless=True)
|
| 258 |
+
page = browser.new_page()
|
| 259 |
+
page.goto(url)
|
| 260 |
+
|
| 261 |
+
# Wait for divp elements to load
|
| 262 |
+
try:
|
| 263 |
+
page.wait_for_selector("div.divp", timeout=15000)
|
| 264 |
+
except:
|
| 265 |
+
pass
|
| 266 |
+
|
| 267 |
+
html = page.content()
|
| 268 |
+
browser.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
|
| 270 |
soup = BeautifulSoup(html, 'html.parser')
|
| 271 |
|
requirements.txt
CHANGED
|
@@ -39,4 +39,5 @@ urllib3==2.4.0
|
|
| 39 |
Werkzeug==3.1.3
|
| 40 |
selenium
|
| 41 |
pytz
|
| 42 |
-
webdriver-manager
|
|
|
|
|
|
| 39 |
Werkzeug==3.1.3
|
| 40 |
selenium
|
| 41 |
pytz
|
| 42 |
+
webdriver-manager
|
| 43 |
+
playwright
|