Spaces:
Runtime error
Runtime error
xbvx
Browse files
- Dockerfile +34 -84
- app.py +14 -34
Dockerfile
CHANGED
|
@@ -1,26 +1,26 @@
|
|
| 1 |
-
#
|
| 2 |
-
|
| 3 |
|
| 4 |
-
#
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
|
| 25 |
# RUN apt-get update && apt-get install -y wget unzip && \
|
| 26 |
# wget https://dl.google.com/Linux/direct/google-chrome-stable_current_amd64.deb && \
|
|
@@ -28,74 +28,24 @@
|
|
| 28 |
# rm google-chrome-stable_current_amd64.deb && \
|
| 29 |
# apt-get clean
|
| 30 |
|
| 31 |
-
#
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
# # && apt-get clean
|
| 37 |
-
# RUN useradd -m -u 1000 user
|
| 38 |
-
# USER user
|
| 39 |
-
# ENV HOME=/home/user \
|
| 40 |
-
# PATH=/home/user/.local/bin:$PATH
|
| 41 |
-
|
| 42 |
-
# WORKDIR $HOME/app
|
| 43 |
-
# # WORKDIR /app
|
| 44 |
-
|
| 45 |
-
# COPY --chown=user . $HOME/app
|
| 46 |
-
# # COPY . /app
|
| 47 |
-
# # Install Python dependencies
|
| 48 |
-
# RUN pip install --no-cache-dir -r requirements.txt
|
| 49 |
-
|
| 50 |
-
# # Run the Selenium script
|
| 51 |
-
# # CMD ["gunicorn", "-b", "0.0.0.0:7860","app:app"]
|
| 52 |
-
# CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
| 53 |
-
|
| 54 |
-
FROM python:3.10-slim
|
| 55 |
-
|
| 56 |
-
# install google chrome
|
| 57 |
-
|
| 58 |
-
# RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -
|
| 59 |
-
|
| 60 |
-
# RUN sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list'
|
| 61 |
-
|
| 62 |
-
# RUN apt-get -y update
|
| 63 |
-
|
| 64 |
-
# RUN apt-get install -y google-chrome-stable
|
| 65 |
-
|
| 66 |
-
RUN apt-get update && apt-get install -y wget unzip && \
|
| 67 |
-
wget https://dl.google.com/Linux/direct/google-chrome-stable_current_amd64.deb && \
|
| 68 |
-
apt install -y ./google-chrome-stable_current_amd64.deb && \
|
| 69 |
-
rm google-chrome-stable_current_amd64.deb && \
|
| 70 |
-
apt-get clean
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
# install chromedriver
|
| 74 |
-
|
| 75 |
-
# RUN apt-get install -yqq unzip
|
| 76 |
-
|
| 77 |
-
# RUN wget -O chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip
|
| 78 |
-
|
| 79 |
-
# RUN unzip chromedriver.zip chromedriver -d /usr/bin/
|
| 80 |
-
|
| 81 |
-
# set display port to avoid crash
|
| 82 |
-
|
| 83 |
-
ENV DISPLAY=:99
|
| 84 |
-
|
| 85 |
-
# WORKDIR /code
|
| 86 |
RUN useradd -m -u 1000 user
|
| 87 |
USER user
|
| 88 |
ENV HOME=/home/user \
|
| 89 |
PATH=/home/user/.local/bin:$PATH
|
| 90 |
|
| 91 |
WORKDIR $HOME/app
|
| 92 |
-
#
|
| 93 |
|
| 94 |
-
COPY ./requirements.txt /code/requirements.txt
|
| 95 |
-
|
| 96 |
-
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
|
| 97 |
COPY --chown=user . $HOME/app
|
| 98 |
-
# COPY .
|
| 99 |
-
|
|
|
|
| 100 |
|
| 101 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use the official Python image as the base image
|
| 2 |
+
FROM python:3.10-slim
|
| 3 |
|
| 4 |
+
# Install dependencies for Selenium and Chrome
|
| 5 |
+
RUN apt-get update && apt-get install -y \
|
| 6 |
+
wget \
|
| 7 |
+
unzip \
|
| 8 |
+
curl \
|
| 9 |
+
gnupg \
|
| 10 |
+
libnss3 \
|
| 11 |
+
libgconf-2-4 \
|
| 12 |
+
libxi6 \
|
| 13 |
+
libxcursor1 \
|
| 14 |
+
libxrandr2 \
|
| 15 |
+
libxss1 \
|
| 16 |
+
libxtst6 \
|
| 17 |
+
fonts-liberation \
|
| 18 |
+
xdg-utils \
|
| 19 |
+
libatk-bridge2.0-0 \
|
| 20 |
+
libgtk-3-0 \
|
| 21 |
+
--no-install-recommends && \
|
| 22 |
+
apt-get clean && \
|
| 23 |
+
rm -rf /var/lib/apt/lists/*
|
| 24 |
|
| 25 |
# RUN apt-get update && apt-get install -y wget unzip && \
|
| 26 |
# wget https://dl.google.com/Linux/direct/google-chrome-stable_current_amd64.deb && \
|
|
|
|
| 28 |
# rm google-chrome-stable_current_amd64.deb && \
|
| 29 |
# apt-get clean
|
| 30 |
|
| 31 |
+
# Update the package list and install wget, unzip, and Firefox
|
| 32 |
+
RUN apt-get update && apt-get install -y wget unzip \
|
| 33 |
+
&& apt-get install -y firefox-esr \
|
| 34 |
+
&& apt-get clean
|
| 35 |
+
RUN which firefox
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
RUN useradd -m -u 1000 user
|
| 37 |
USER user
|
| 38 |
ENV HOME=/home/user \
|
| 39 |
PATH=/home/user/.local/bin:$PATH
|
| 40 |
|
| 41 |
WORKDIR $HOME/app
|
| 42 |
+
# WORKDIR /app
|
| 43 |
|
|
|
|
|
|
|
|
|
|
| 44 |
COPY --chown=user . $HOME/app
|
| 45 |
+
# COPY . /app
|
| 46 |
+
# Install Python dependencies
|
| 47 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 48 |
|
| 49 |
+
# Run the Selenium script
|
| 50 |
+
# CMD ["gunicorn", "-b", "0.0.0.0:7860","app:app"]
|
| 51 |
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
app.py
CHANGED
|
@@ -4,29 +4,18 @@ import time
|
|
| 4 |
import pandas as pd
|
| 5 |
from fastapi import FastAPI, HTTPException
|
| 6 |
from selenium import webdriver
|
| 7 |
-
from selenium.webdriver.
|
| 8 |
from selenium.webdriver.common.action_chains import ActionChains
|
| 9 |
-
from selenium.webdriver.
|
| 10 |
-
from webdriver_manager.
|
| 11 |
from webdriver_manager.core.driver_cache import DriverCacheManager
|
| 12 |
from selenium.webdriver.common.by import By
|
| 13 |
from fake_headers import Headers
|
| 14 |
from fastapi.middleware.cors import CORSMiddleware
|
| 15 |
import logging
|
| 16 |
-
|
| 17 |
|
| 18 |
-
# proxy_username="ockzoweb"
|
| 19 |
-
# proxy_password="23wxmulibzuq"
|
| 20 |
-
# proxy_address="198.23.239.134"
|
| 21 |
-
# proxy_port="6540"
|
| 22 |
|
| 23 |
-
# proxy_url=f"http://{proxy_username}:{proxy_password}@{proxy_address}:{proxy_port}"
|
| 24 |
-
# seleniumwire_options = {
|
| 25 |
-
# "proxy": {
|
| 26 |
-
# "http": proxy_url,
|
| 27 |
-
# "https": proxy_url,
|
| 28 |
-
# }
|
| 29 |
-
# }
|
| 30 |
# Initialize FastAPI
|
| 31 |
app = FastAPI(
|
| 32 |
debug=True,
|
|
@@ -65,26 +54,24 @@ def setup_chromedriver():
|
|
| 65 |
|
| 66 |
# Setup headless Chrome options
|
| 67 |
# Define a custom user agent
|
| 68 |
-
|
| 69 |
|
| 70 |
# my_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
|
| 71 |
-
|
| 72 |
-
|
| 73 |
# proxy = None
|
| 74 |
browser_option = Options()
|
| 75 |
browser_option.add_argument("--headless") # Running in headless mode (no GUI)
|
| 76 |
browser_option.add_argument("--no-sandbox")
|
| 77 |
browser_option.add_argument("--disable-dev-shm-usage")
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
# browser_option.binary_location = r'C:\Users\HP\.cache\selenium\firefox\win64\133.0\firefox.exe'
|
| 81 |
# browser_option.add_argument("--disable-gpu")
|
| 82 |
# browser_option.add_argument("--log-level=3")
|
| 83 |
# browser_option.add_argument("--disable-notifications")
|
| 84 |
# browser_option.add_argument("--disable-popup-blocking")
|
| 85 |
-
browser_option.add_argument("--user-agent={}"
|
| 86 |
-
|
| 87 |
-
logging.info(f"header: {header}")
|
| 88 |
# if proxy:
|
| 89 |
# browser_option.add_argument(f"--proxy-server={proxy}")
|
| 90 |
|
|
@@ -92,12 +79,10 @@ logging.info(f"header: {header}")
|
|
| 92 |
# Setup WebDriver
|
| 93 |
driver_path = setup_chromedriver()
|
| 94 |
service = Service(executable_path=driver_path)
|
| 95 |
-
|
| 96 |
# actions = ActionChains(driver)
|
| 97 |
|
| 98 |
def getSearchPostData(search_keyword, index, name="", forCompetitorAnalysis=False):
|
| 99 |
-
driver = webdriver.Chrome( options=browser_option,)
|
| 100 |
-
|
| 101 |
# Navigate to the search results page
|
| 102 |
url = f'https://www.reddit.com/search/?q={search_keyword}'
|
| 103 |
driver.get(url)
|
|
@@ -161,16 +146,11 @@ def getSearchPostData(search_keyword, index, name="", forCompetitorAnalysis=Fals
|
|
| 161 |
|
| 162 |
def get_webpage_title(url: str) -> str:
|
| 163 |
try:
|
| 164 |
-
|
| 165 |
-
# driver = webdriver.Chrome( options=browser_option,)
|
| 166 |
-
driver = webdriver.Chrome(service=service, options=browser_option,)
|
| 167 |
-
|
| 168 |
url="https://www.reddit.com"
|
| 169 |
driver.get(url)
|
| 170 |
-
time.sleep(3)
|
| 171 |
title = driver.title
|
| 172 |
logging.info(f"Page title: {title}")
|
| 173 |
-
driver.quit()
|
| 174 |
return title
|
| 175 |
except Exception as e:
|
| 176 |
logging.error(f"Error fetching webpage title: {e}")
|
|
@@ -234,7 +214,7 @@ async def fetch_title(url: str):
|
|
| 234 |
# from selenium.webdriver.common.proxy import Proxy, ProxyType
|
| 235 |
# app = Flask(__name__)
|
| 236 |
|
| 237 |
-
|
| 238 |
# def download_selenium():
|
| 239 |
# prox = Proxy()
|
| 240 |
# prox.proxy_type = ProxyType.MANUAL
|
|
|
|
| 4 |
import pandas as pd
|
| 5 |
from fastapi import FastAPI, HTTPException
|
| 6 |
from selenium import webdriver
|
| 7 |
+
from selenium.webdriver.firefox.service import Service
|
| 8 |
from selenium.webdriver.common.action_chains import ActionChains
|
| 9 |
+
from selenium.webdriver.firefox.options import Options
|
| 10 |
+
from webdriver_manager.firefox import GeckoDriverManager
|
| 11 |
from webdriver_manager.core.driver_cache import DriverCacheManager
|
| 12 |
from selenium.webdriver.common.by import By
|
| 13 |
from fake_headers import Headers
|
| 14 |
from fastapi.middleware.cors import CORSMiddleware
|
| 15 |
import logging
|
| 16 |
+
from selenium_driverless import webdriver as webdriverless
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
# Initialize FastAPI
|
| 20 |
app = FastAPI(
|
| 21 |
debug=True,
|
|
|
|
| 54 |
|
| 55 |
# Setup headless Chrome options
|
| 56 |
# Define a custom user agent
|
| 57 |
+
my_user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0"
|
| 58 |
|
| 59 |
# my_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
|
| 60 |
+
|
| 61 |
+
capabilities = webdriver.DesiredCapabilities.FIREFOX
|
| 62 |
# proxy = None
|
| 63 |
browser_option = Options()
|
| 64 |
browser_option.add_argument("--headless") # Running in headless mode (no GUI)
|
| 65 |
browser_option.add_argument("--no-sandbox")
|
| 66 |
browser_option.add_argument("--disable-dev-shm-usage")
|
| 67 |
+
browser_option.add_argument("--ignore-certificate-errors")
|
| 68 |
+
browser_option.binary_location = '/usr/bin/firefox'
|
|
|
|
| 69 |
# browser_option.add_argument("--disable-gpu")
|
| 70 |
# browser_option.add_argument("--log-level=3")
|
| 71 |
# browser_option.add_argument("--disable-notifications")
|
| 72 |
# browser_option.add_argument("--disable-popup-blocking")
|
| 73 |
+
browser_option.add_argument(f"--user-agent={my_user_agent}")
|
| 74 |
+
|
|
|
|
| 75 |
# if proxy:
|
| 76 |
# browser_option.add_argument(f"--proxy-server={proxy}")
|
| 77 |
|
|
|
|
| 79 |
# Setup WebDriver
|
| 80 |
driver_path = setup_chromedriver()
|
| 81 |
service = Service(executable_path=driver_path)
|
| 82 |
+
driver = webdriver.Firefox(service=service, options=browser_option)
|
| 83 |
# actions = ActionChains(driver)
|
| 84 |
|
| 85 |
def getSearchPostData(search_keyword, index, name="", forCompetitorAnalysis=False):
|
|
|
|
|
|
|
| 86 |
# Navigate to the search results page
|
| 87 |
url = f'https://www.reddit.com/search/?q={search_keyword}'
|
| 88 |
driver.get(url)
|
|
|
|
| 146 |
|
| 147 |
def get_webpage_title(url: str) -> str:
|
| 148 |
try:
|
| 149 |
+
getSearchPostData(search_keyword="migraine", index=0)
|
|
|
|
|
|
|
|
|
|
| 150 |
url="https://www.reddit.com"
|
| 151 |
driver.get(url)
|
|
|
|
| 152 |
title = driver.title
|
| 153 |
logging.info(f"Page title: {title}")
|
|
|
|
| 154 |
return title
|
| 155 |
except Exception as e:
|
| 156 |
logging.error(f"Error fetching webpage title: {e}")
|
|
|
|
| 214 |
# from selenium.webdriver.common.proxy import Proxy, ProxyType
|
| 215 |
# app = Flask(__name__)
|
| 216 |
|
| 217 |
+
|
| 218 |
# def download_selenium():
|
| 219 |
# prox = Proxy()
|
| 220 |
# prox.proxy_type = ProxyType.MANUAL
|