42Cummer committed on
Commit
be20e75
·
verified ·
1 Parent(s): bb874a9

this is supposed to be better

Browse files
Files changed (3) hide show
  1. Dockerfile +6 -16
  2. app.py +15 -41
  3. requirements.txt +2 -1
Dockerfile CHANGED
@@ -1,26 +1,16 @@
1
  FROM python:3.10-slim
2
 
3
- # Install Chrome to /tmp (writable location)
4
- RUN wget -q https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
5
- && apt-get update && apt-get install -y \
6
  wget \
7
  gnupg \
8
  unzip \
9
  curl \
10
- && dpkg -i google-chrome-stable_current_amd64.deb || true \
11
- && apt-get install -f -y \
12
  && apt-get clean \
13
- && rm -rf /var/lib/apt/lists/* \
14
- && rm google-chrome-stable_current_amd64.deb \
15
- && mv /usr/bin/google-chrome /tmp/google-chrome \
16
- && ln -s /tmp/google-chrome /usr/bin/google-chrome
17
-
18
- # Install ChromeDriver to a writable location
19
- RUN wget -q https://chromedriver.storage.googleapis.com/114.0.5735.90/chromedriver_linux64.zip \
20
- && unzip chromedriver_linux64.zip -d /tmp/ \
21
- && rm chromedriver_linux64.zip \
22
- && chmod +x /tmp/chromedriver \
23
- && /tmp/chromedriver --version
24
 
25
  WORKDIR /app
26
 
 
1
  FROM python:3.10-slim
2
 
3
+ # Install basic dependencies
4
+ RUN apt-get update && apt-get install -y \
 
5
  wget \
6
  gnupg \
7
  unzip \
8
  curl \
 
 
9
  && apt-get clean \
10
+ && rm -rf /var/lib/apt/lists/*
11
+
12
+ # Install Playwright browsers
13
+ RUN playwright install chromium
 
 
 
 
 
 
 
14
 
15
  WORKDIR /app
16
 
app.py CHANGED
@@ -5,15 +5,9 @@ import requests
5
  from bs4 import BeautifulSoup
6
  import tempfile
7
  import os
8
- from selenium import webdriver
9
- from selenium.webdriver.chrome.options import Options
10
- from selenium.webdriver.chrome.service import Service
11
- from selenium.webdriver.common.by import By
12
- from selenium.webdriver.support.ui import WebDriverWait
13
- from selenium.webdriver.support import expected_conditions as EC
14
  from datetime import datetime
15
  import pytz
16
- from webdriver_manager.chrome import ChromeDriverManager
17
 
18
  os.environ["SE_CACHE_PATH"] = "/tmp/selenium"
19
 
@@ -258,40 +252,20 @@ def seek():
258
  'Connection': 'keep-alive',
259
  'Upgrade-Insecure-Requests': '1'
260
  }
261
- # Use Selenium to handle redirects and wait for page to load
262
- options = Options()
263
- options.add_argument("--headless=new")
264
- options.add_argument("--disable-gpu")
265
- options.add_argument("--no-sandbox")
266
- options.add_argument("--disable-dev-shm-usage")
267
- options.add_argument("--disable-extensions")
268
- options.add_argument("--disable-plugins")
269
- options.add_argument("--disable-images")
270
-
271
- options.add_argument("--disable-web-security")
272
- options.add_argument("--disable-features=VizDisplayCompositor")
273
- options.add_argument("--single-process")
274
- options.add_argument("--no-zygote")
275
- options.add_argument("--memory-pressure-off")
276
- options.add_argument("--max_old_space_size=4096")
277
-
278
- # Use ChromeDriver from /tmp location and specify Chrome binary path
279
- service = Service("/tmp/chromedriver")
280
- options.binary_location = "/tmp/google-chrome"
281
- driver = webdriver.Chrome(service=service, options=options)
282
-
283
- driver.get(url)
284
-
285
- # Wait for the page to load and look for divp elements
286
- try:
287
- WebDriverWait(driver, 15).until(
288
- EC.presence_of_element_located((By.CLASS_NAME, "divp"))
289
- )
290
- except:
291
- pass
292
-
293
- html = driver.page_source
294
- driver.quit()
295
 
296
  soup = BeautifulSoup(html, 'html.parser')
297
 
 
5
  from bs4 import BeautifulSoup
6
  import tempfile
7
  import os
8
+ from playwright.sync_api import sync_playwright
 
 
 
 
 
9
  from datetime import datetime
10
  import pytz
 
11
 
12
  os.environ["SE_CACHE_PATH"] = "/tmp/selenium"
13
 
 
252
  'Connection': 'keep-alive',
253
  'Upgrade-Insecure-Requests': '1'
254
  }
255
+ # Use Playwright to handle redirects and wait for page to load
256
+ with sync_playwright() as p:
257
+ browser = p.chromium.launch(headless=True)
258
+ page = browser.new_page()
259
+ page.goto(url)
260
+
261
+ # Wait for divp elements to load
262
+ try:
263
+ page.wait_for_selector("div.divp", timeout=15000)
264
+ except:
265
+ pass
266
+
267
+ html = page.content()
268
+ browser.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
 
270
  soup = BeautifulSoup(html, 'html.parser')
271
 
requirements.txt CHANGED
@@ -39,4 +39,5 @@ urllib3==2.4.0
39
  Werkzeug==3.1.3
40
  selenium
41
  pytz
42
- webdriver-manager
 
 
39
  Werkzeug==3.1.3
40
  selenium
41
  pytz
42
+ webdriver-manager
43
+ playwright