Spaces:

honeybansal23
/

nextAnalyticsScraping

Runtime error

App Files Files Community

honeybansal23 committed on Nov 30, 2024

Commit

0eacdd5

1 Parent(s): 34ae0fe

sd

Browse files

Files changed (7) hide show

Dockerfile +61 -45
__pycache__/app.cpython-311.pyc +0 -0
app.py +237 -201
requirements.txt +21 -14
selenium_webapp-main/Dockerfile +16 -0
selenium_webapp-main/requirements.txt +6 -0
selenium_webapp-main/selenium_webapp.py +32 -0

Dockerfile CHANGED Viewed

@@ -1,45 +1,61 @@
-# Use the official Python image as the base image
-FROM python:3.10-slim
-# Install dependencies for Selenium and Chrome
-RUN apt-get update && apt-get install -y \
-    wget \
-    unzip \
-    curl \
-    gnupg \
-    libnss3 \
-    libgconf-2-4 \
-    libxi6 \
-    libxcursor1 \
-    libxrandr2 \
-    libxss1 \
-    libxtst6 \
-    fonts-liberation \
-    xdg-utils \
-    libatk-bridge2.0-0 \
-    libgtk-3-0 \
-    --no-install-recommends && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-RUN apt-get update && apt-get install -y wget unzip && \
-    wget https://dl.google.com/Linux/direct/google-chrome-stable_current_amd64.deb && \
-    apt install -y ./google-chrome-stable_current_amd64.deb && \
-    rm google-chrome-stable_current_amd64.deb && \
-    apt-get clean
-RUN useradd -m -u 1000 user
-USER user
-ENV HOME=/home/user \
-	PATH=/home/user/.local/bin:$PATH
-WORKDIR $HOME/app
-# WORKDIR /app
-COPY --chown=user . $HOME/app
-# COPY . /app
-# Install Python dependencies
-RUN pip install --no-cache-dir -r requirements.txt
-# Run the Selenium script
-# CMD ["gunicorn", "-b", "0.0.0.0:7860","app:app"]
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

+# # Use the official Python image as the base image
+# FROM python:3.10-slim
+# # Install dependencies for Selenium and Chrome
+# RUN apt-get update && apt-get install -y \
+#     wget \
+#     unzip \
+#     curl \
+#     gnupg \
+#     libnss3 \
+#     libgconf-2-4 \
+#     libxi6 \
+#     libxcursor1 \
+#     libxrandr2 \
+#     libxss1 \
+#     libxtst6 \
+#     fonts-liberation \
+#     xdg-utils \
+#     libatk-bridge2.0-0 \
+#     libgtk-3-0 \
+#     --no-install-recommends && \
+#     apt-get clean && \
+#     rm -rf /var/lib/apt/lists/*
+# RUN apt-get update && apt-get install -y wget unzip && \
+#     wget https://dl.google.com/Linux/direct/google-chrome-stable_current_amd64.deb && \
+#     apt install -y ./google-chrome-stable_current_amd64.deb && \
+#     rm google-chrome-stable_current_amd64.deb && \
+#     apt-get clean
+# RUN useradd -m -u 1000 user
+# USER user
+# ENV HOME=/home/user \
+# 	PATH=/home/user/.local/bin:$PATH
+# WORKDIR $HOME/app
+# # WORKDIR /app
+# COPY --chown=user . $HOME/app
+# # COPY . /app
+# # Install Python dependencies
+# RUN pip install --no-cache-dir -r requirements.txt
+# # Run the Selenium script
+# # CMD ["gunicorn", "-b", "0.0.0.0:7860","app:app"]
+# CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+FROM cypress/browsers:latest
+# An ARG declared before FROM is only visible to FROM itself, and ARG values
+# are not kept at runtime. Declare it inside the stage and export it as an
+# environment variable so the shell-form CMD below can expand $PORT.
+ARG PORT=443
+ENV PORT=${PORT}
+# Refresh the package index before installing; Python and pip share one layer.
+RUN apt-get update && apt-get install -y python3 python3-pip
+RUN echo $(python3 -m site --user-base)
+COPY requirements.txt  .
+ENV PATH /home/root/.local/bin:${PATH}
+RUN pip install -r requirements.txt
+COPY . .
+# app.py exposes a Flask (WSGI) object named `app`. gunicorn is pinned in
+# requirements.txt, whereas uvicorn is not installed there and no `main`
+# module is part of this commit.
+CMD gunicorn --bind 0.0.0.0:$PORT app:app

__pycache__/app.cpython-311.pyc CHANGED Viewed

Binary files a/__pycache__/app.cpython-311.pyc and b/__pycache__/app.cpython-311.pyc differ

app.py CHANGED Viewed

@@ -1,205 +1,241 @@
-import os
-import random
-import time
-import pandas as pd
-from fastapi import FastAPI, HTTPException
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.common.action_chains import ActionChains
-from selenium.webdriver.chrome.options import Options
 from webdriver_manager.chrome import ChromeDriverManager
-from webdriver_manager.core.driver_cache import DriverCacheManager
 from selenium.webdriver.common.by import By
-from fake_headers import Headers
-from fastapi.middleware.cors import CORSMiddleware
-import logging
-from selenium_driverless import webdriver as webdriverless
-# Initialize FastAPI
-app = FastAPI(
-    debug=True,
-    title="NextAnalytics Server",
-    consumes=["application/x-www-form-urlencoded", "multipart/form-data"],
-    docs_url='/swagger'
-)
-# Configure CORS
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-# Setup ChromeDriver and Selenium
-# custom_wdm_cache = os.path.join(os.getcwd(), 'custom_wdm_cache')
-# os.environ['WDM_LOCAL'] = custom_wdm_cache
-# Setup logging
-logging.basicConfig(level=logging.INFO)
-def setup_chromedriver():
-    logging.info("Setting up ChromeDriver...")
-    # custom_wdm_cache = os.path.join(os.getcwd(), 'custom_wdm_cache')
-    # os.environ['WDM_LOCAL'] = custom_wdm_cache
-    # cache_manager = DriverCacheManager(custom_wdm_cache)
-    # os.chmod(custom_wdm_cache, 0o755)  # Ensure proper permissions
-    # path = ChromeDriverManager(cache_manager=cache_manager).install()
-    path = ChromeDriverManager().install()
-    os.chmod(path, 0o755)  # Ensure ChromeDriver is executable
-    logging.info(f"ChromeDriver path: {path}")
-    return path
-# Setup headless Chrome options
-# Define a custom user agent
-my_user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
-# my_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
-# proxy = None
-browser_option = Options()
-browser_option.add_argument("--headless")  # Running in headless mode (no GUI)
-browser_option.add_argument("--no-sandbox")
-browser_option.add_argument("--disable-dev-shm-usage")
-browser_option.add_argument("--ignore-certificate-errors")
-# browser_option.add_argument("--disable-gpu")
-# browser_option.add_argument("--log-level=3")
-# browser_option.add_argument("--disable-notifications")
-# browser_option.add_argument("--disable-popup-blocking")
-browser_option.add_argument(f"--user-agent={my_user_agent}")
-# if proxy:
-#     browser_option.add_argument(f"--proxy-server={proxy}")
-# Setup WebDriver
-driver_path = setup_chromedriver()
-service = Service(executable_path=driver_path)
-driver = webdriver.Chrome(service=service, options=browser_option)
-# actions = ActionChains(driver)
-def getSearchPostData(search_keyword, index, name="", forCompetitorAnalysis=False):
-    # Navigate to the search results page
-    url = f'https://www.reddit.com/search/?q={search_keyword}'
-    driver.get(url)
-    time.sleep(3)  # Consider using WebDriverWait instead of sleep for better reliability
-    logging.info("Navigated to search page.")
-    posts_data = []
-    list_length = 0  # posts count
-    try:
-        if forCompetitorAnalysis:
-            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
-            time.sleep(5)
-        post_cards = driver.find_elements(By.CSS_SELECTOR, 'a[data-testid="post-title-text"]')
-        post_cards_1 = driver.find_elements(By.CSS_SELECTOR, 'div[data-testid="search-counter-row"]')
-        post_cards_2 = driver.find_elements(By.CSS_SELECTOR, 'faceplate-timeago')
-        logging.info(f"Found {len(post_cards)} post cards.")
-        idx = list_length
-        for card in post_cards_1:
-            try:
-                votes_count = card.find_element(By.XPATH, './/faceplate-number').text
-                comments_count = card.find_element(By.XPATH,
-                    './/span[contains(text(), "comment") or contains(text(), "comments")]/preceding-sibling::faceplate-number'
-                ).text
-                posts_data.append({
-                    "index": idx,
-                    "comment_count": comments_count,
-                    "votes_count": votes_count
-                })
-                idx += 1
-            except Exception as e:
-                logging.error(f"Error processing post_card_1: {e}")
-        idx = list_length
-        for card in post_cards:
-            try:
-                url = card.get_attribute("href")
-                title = card.text
-                posts_data[idx]["title"] = title
-                posts_data[idx]["url"] = url
-                idx += 1
-            except Exception as e:
-                logging.error(f"Error processing post_cards: {e}")
-        idx = list_length
-        for card in post_cards_2:
-            try:
-                time_element = card.find_element(By.XPATH, './time')
-                post_time = time_element.get_attribute('datetime')
-                posts_data[idx]["time"] = post_time
-                idx += 1
-            except Exception as e:
-                logging.error(f"Error processing post_cards_2: {e}")
-    except Exception as e:
-        logging.error(f"Error in scrolling or extracting data: {e}")
-    df = pd.DataFrame(posts_data)
-    df.to_csv(f'posts_data_{index}.csv', index=False)
-    logging.info(f"Data saved to posts_data_{index}.csv")
-    return df
-def get_webpage_title(url: str) -> str:
-    try:
-        getSearchPostData(search_keyword="migraine", index=0)
-        url="https://www.reddit.com"
-        driver.get(url)
-        title = driver.title
-        logging.info(f"Page title: {title}")
-        return title
-    except Exception as e:
-        logging.error(f"Error fetching webpage title: {e}")
-        return str(e)
-@app.get("/")
-async def home():
-    return {"message": "Hello"}
-@app.get("/get-title/")
-async def fetch_title(url: str):
-    """
-    Fetch the title of a webpage by URL.
-    Example: /get-title/?url=https://www.reddit.com
-    """
-    try:
-        title = get_webpage_title(url)
-        return {"url": url, "title": title}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-@app.get("/get-reddit/")
-async def getReddit(url: str):
-    """
-    Fetch the title of a webpage by URL.
-    Example: /get-title/?url=https://www.reddit.com
-    """
-    try:
-        options = webdriverless.ChromeOptions()
-        driver_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
-        options.add_argument("--headless")  # Running in headless mode (no GUI)
-        options.add_argument("--no-sandbox")
-        options.add_argument("--disable-dev-shm-usage")
-        options.add_argument("--ignore-certificate-errors")
-        options.add_argument(f"--user-agent={driver_agent}")
-        title="Notitle"
-        async with webdriverless.Chrome(options=options) as driver:
-            await driver.get('https://www.reddit.com')
-            time.sleep(3)
-            title = await driver.title
-            url = await driver.current_url
-            source = await driver.page_source
-            print(title)
-            return {"url": url, "title": title}
-        return {"url": url, "title": title}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-# Run the app
-# if __name__ == "__main__":
-#     import uvicorn
-#     uvicorn.run(app, host="127.0.0.1", port=7860)

+# import os
+# import random
+# import time
+# import pandas as pd
+# from fastapi import FastAPI, HTTPException
+# from selenium import webdriver
+# from selenium.webdriver.chrome.service import Service
+# from selenium.webdriver.common.action_chains import ActionChains
+# from selenium.webdriver.chrome.options import Options
+# from webdriver_manager.chrome import ChromeDriverManager
+# from webdriver_manager.core.driver_cache import DriverCacheManager
+# from selenium.webdriver.common.by import By
+# from fake_headers import Headers
+# from fastapi.middleware.cors import CORSMiddleware
+# import logging
+# from selenium_driverless import webdriver as webdriverless
+# # Initialize FastAPI
+# app = FastAPI(
+#     debug=True,
+#     title="NextAnalytics Server",
+#     consumes=["application/x-www-form-urlencoded", "multipart/form-data"],
+#     docs_url='/swagger'
+# )
+# # Configure CORS
+# app.add_middleware(
+#     CORSMiddleware,
+#     allow_origins=["*"],
+#     allow_credentials=True,
+#     allow_methods=["*"],
+#     allow_headers=["*"],
+# )
+# # Setup ChromeDriver and Selenium
+# # custom_wdm_cache = os.path.join(os.getcwd(), 'custom_wdm_cache')
+# # os.environ['WDM_LOCAL'] = custom_wdm_cache
+# # Setup logging
+# logging.basicConfig(level=logging.INFO)
+# def setup_chromedriver():
+#     logging.info("Setting up ChromeDriver...")
+#     # custom_wdm_cache = os.path.join(os.getcwd(), 'custom_wdm_cache')
+#     # os.environ['WDM_LOCAL'] = custom_wdm_cache
+#     # cache_manager = DriverCacheManager(custom_wdm_cache)
+#     # os.chmod(custom_wdm_cache, 0o755)  # Ensure proper permissions
+#     # path = ChromeDriverManager(cache_manager=cache_manager).install()
+#     path = ChromeDriverManager().install()
+#     os.chmod(path, 0o755)  # Ensure ChromeDriver is executable
+#     logging.info(f"ChromeDriver path: {path}")
+#     return path
+# # Setup headless Chrome options
+# # Define a custom user agent
+# my_user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
+# # my_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
+# # proxy = None
+# browser_option = Options()
+# browser_option.add_argument("--headless")  # Running in headless mode (no GUI)
+# browser_option.add_argument("--no-sandbox")
+# browser_option.add_argument("--disable-dev-shm-usage")
+# browser_option.add_argument("--ignore-certificate-errors")
+# # browser_option.add_argument("--disable-gpu")
+# # browser_option.add_argument("--log-level=3")
+# # browser_option.add_argument("--disable-notifications")
+# # browser_option.add_argument("--disable-popup-blocking")
+# browser_option.add_argument(f"--user-agent={my_user_agent}")
+# # if proxy:
+# #     browser_option.add_argument(f"--proxy-server={proxy}")
+# # Setup WebDriver
+# driver_path = setup_chromedriver()
+# service = Service(executable_path=driver_path)
+# driver = webdriver.Chrome(service=service, options=browser_option)
+# # actions = ActionChains(driver)
+# def getSearchPostData(search_keyword, index, name="", forCompetitorAnalysis=False):
+#     # Navigate to the search results page
+#     url = f'https://www.reddit.com/search/?q={search_keyword}'
+#     driver.get(url)
+#     time.sleep(3)  # Consider using WebDriverWait instead of sleep for better reliability
+#     logging.info("Navigated to search page.")
+#     posts_data = []
+#     list_length = 0  # posts count
+#     try:
+#         if forCompetitorAnalysis:
+#             driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+#             time.sleep(5)
+#         post_cards = driver.find_elements(By.CSS_SELECTOR, 'a[data-testid="post-title-text"]')
+#         post_cards_1 = driver.find_elements(By.CSS_SELECTOR, 'div[data-testid="search-counter-row"]')
+#         post_cards_2 = driver.find_elements(By.CSS_SELECTOR, 'faceplate-timeago')
+#         logging.info(f"Found {len(post_cards)} post cards.")
+#         idx = list_length
+#         for card in post_cards_1:
+#             try:
+#                 votes_count = card.find_element(By.XPATH, './/faceplate-number').text
+#                 comments_count = card.find_element(By.XPATH,
+#                     './/span[contains(text(), "comment") or contains(text(), "comments")]/preceding-sibling::faceplate-number'
+#                 ).text
+#                 posts_data.append({
+#                     "index": idx,
+#                     "comment_count": comments_count,
+#                     "votes_count": votes_count
+#                 })
+#                 idx += 1
+#             except Exception as e:
+#                 logging.error(f"Error processing post_card_1: {e}")
+#         idx = list_length
+#         for card in post_cards:
+#             try:
+#                 url = card.get_attribute("href")
+#                 title = card.text
+#                 posts_data[idx]["title"] = title
+#                 posts_data[idx]["url"] = url
+#                 idx += 1
+#             except Exception as e:
+#                 logging.error(f"Error processing post_cards: {e}")
+#         idx = list_length
+#         for card in post_cards_2:
+#             try:
+#                 time_element = card.find_element(By.XPATH, './time')
+#                 post_time = time_element.get_attribute('datetime')
+#                 posts_data[idx]["time"] = post_time
+#                 idx += 1
+#             except Exception as e:
+#                 logging.error(f"Error processing post_cards_2: {e}")
+#     except Exception as e:
+#         logging.error(f"Error in scrolling or extracting data: {e}")
+#     df = pd.DataFrame(posts_data)
+#     df.to_csv(f'posts_data_{index}.csv', index=False)
+#     logging.info(f"Data saved to posts_data_{index}.csv")
+#     return df
+# def get_webpage_title(url: str) -> str:
+#     try:
+#         getSearchPostData(search_keyword="migraine", index=0)
+#         url="https://www.reddit.com"
+#         driver.get(url)
+#         title = driver.title
+#         logging.info(f"Page title: {title}")
+#         return title
+#     except Exception as e:
+#         logging.error(f"Error fetching webpage title: {e}")
+#         return str(e)
+# @app.get("/")
+# async def home():
+#     return {"message": "Hello"}
+# @app.get("/get-title/")
+# async def fetch_title(url: str):
+#     """
+#     Fetch the title of a webpage by URL.
+#     Example: /get-title/?url=https://www.reddit.com
+#     """
+#     try:
+#         title = get_webpage_title(url)
+#         return {"url": url, "title": title}
+#     except Exception as e:
+#         raise HTTPException(status_code=500, detail=str(e))
+# @app.get("/get-reddit/")
+# async def getReddit(url: str):
+#     """
+#     Fetch the title of a webpage by URL.
+#     Example: /get-title/?url=https://www.reddit.com
+#     """
+#     try:
+#         options = webdriverless.ChromeOptions()
+#         driver_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
+#         options.add_argument("--headless")  # Running in headless mode (no GUI)
+#         options.add_argument("--no-sandbox")
+#         options.add_argument("--disable-dev-shm-usage")
+#         options.add_argument("--ignore-certificate-errors")
+#         options.add_argument(f"--user-agent={driver_agent}")
+#         title="Notitle"
+#         async with webdriverless.Chrome(options=options) as driver:
+#             await driver.get('https://www.reddit.com')
+#             time.sleep(3)
+#             title = await driver.title
+#             url = await driver.current_url
+#             source = await driver.page_source
+#             print(title)
+#             return {"url": url, "title": title}
+#         return {"url": url, "title": title}
+#     except Exception as e:
+#         raise HTTPException(status_code=500, detail=str(e))
+# # Run the app
+# # if __name__ == "__main__":
+# #     import uvicorn
+# #     uvicorn.run(app, host="127.0.0.1", port=7860)
 from selenium import webdriver
+from flask import Flask, request
 from selenium.webdriver.chrome.service import Service
 from webdriver_manager.chrome import ChromeDriverManager
 from selenium.webdriver.common.by import By
+app = Flask(__name__)
+def download_selenium():
+    chrome_options = webdriver.ChromeOptions()
+    driver_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
+    chrome_options.add_argument("--headless")
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+    prefs = {"profile.managed_default_content_settings.images": 2}
+    chrome_options.add_experimental_option("prefs", prefs)
+    chrome_options.add_argument(f"--user-agent={driver_agent}")
+    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
+    driver.get("https://reddit.com")
+    title = driver.title
+    # language = driver.find_element(By.XPATH, "//div[@id='SIvCob']").text
+    data = {'Page Title': title}
+    return data
+@app.route('/', methods = ['GET','POST'])
+def home():
+    if (request.method == 'GET'):
+        return download_selenium()
+# Runs only when this file is executed directly as a script: starts Flask's
+# built-in server with debug mode enabled on port 3000.
+if __name__ == "__main__":
+    app.run(debug=True, port=3000)

requirements.txt CHANGED Viewed

@@ -1,14 +1,21 @@
-flask==3.0.3
-gunicorn
-flask_cors
-selenium==4.26.1
-uvicorn
-scalar_fastapi==1.0.3
-requests
-pandas
-numpy
-webdriver_manager==4.0.2
-fake_headers
-fastapi
-selenium_driverless
-asyncio

+# flask==3.0.3
+# gunicorn
+# flask_cors
+# selenium==4.26.1
+# uvicorn
+# scalar_fastapi==1.0.3
+# requests
+# pandas
+# numpy
+# webdriver_manager==4.0.2
+# fake_headers
+# fastapi
+# selenium_driverless
+# asyncio
+selenium==4.6.0
+requests==2.28.1
+webdriver_manager==3.8.4
+packaging==21.3
+flask-restful==0.3.9
+gunicorn==20.1.0

selenium_webapp-main/Dockerfile ADDED Viewed

	@@ -0,0 +1,16 @@

+FROM cypress/browsers:latest
+# An ARG declared before FROM is only visible to FROM itself, and ARG values
+# are not kept at runtime. Declare it inside the stage and export it as an
+# environment variable so the shell-form CMD below can expand $PORT.
+ARG PORT=443
+ENV PORT=${PORT}
+# Refresh the package index before installing; Python and pip share one layer.
+RUN apt-get update && apt-get install -y python3 python3-pip
+RUN echo $(python3 -m site --user-base)
+COPY requirements.txt  .
+ENV PATH /home/root/.local/bin:${PATH}
+RUN pip install -r requirements.txt
+COPY . .
+# selenium_webapp.py exposes a Flask (WSGI) object named `app`. gunicorn is
+# pinned in this directory's requirements.txt, whereas uvicorn is not listed
+# there and no `main` module is added alongside this Dockerfile.
+CMD gunicorn --bind 0.0.0.0:$PORT selenium_webapp:app

selenium_webapp-main/requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+selenium==4.6.0
+requests==2.28.1
+webdriver_manager==3.8.4
+packaging==21.3
+flask-restful==0.3.9
+gunicorn==20.1.0

selenium_webapp-main/selenium_webapp.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from selenium import webdriver
+from flask import Flask, request
+from selenium.webdriver.chrome.service import Service
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.common.by import By
+app = Flask(__name__)
+def download_selenium():
+    chrome_options = webdriver.ChromeOptions()
+    chrome_options.add_argument("--headless")
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+    prefs = {"profile.managed_default_content_settings.images": 2}
+    chrome_options.add_experimental_option("prefs", prefs)
+    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
+    driver.get("https://google.com")
+    title = driver.title
+    language = driver.find_element(By.XPATH, "//div[@id='SIvCob']").text
+    data = {'Page Title': title, 'Language': language}
+    return data
+@app.route('/', methods = ['GET','POST'])
+def home():
+    if (request.method == 'GET'):
+        return download_selenium()
+# Runs only when this file is executed directly as a script: starts Flask's
+# built-in server with debug mode enabled on port 3000.
+if __name__ == "__main__":
+    app.run(debug=True, port=3000)