Spaces:

honey234
/

nextanalyticsScraper1.0

Sleeping

App Files Files Community

honey234 committed on Nov 30, 2024

Commit

2b3e1c6

1 Parent(s): 705801c

sdfsd

Browse files

Files changed (11) hide show

.gitignore +54 -1
Dockerfile +52 -0
README.md +6 -4
app.py +267 -0
custom_wdm_cache/.gitkeep +0 -0
posts_data_0.csv +26 -0
requirements.txt +23 -0
selenium_webapp-main/Dockerfile +16 -0
selenium_webapp-main/requirements.txt +6 -0
selenium_webapp-main/selenium_webapp.py +32 -0
test.py +18 -0

.gitignore CHANGED Viewed

	@@ -1 +1,54 @@
1	- *.env

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# Virtual environment
+venv/
+ENV/
+env/
+.venv/
+.ENV/
+# Jupyter Notebook checkpoints
+.ipynb_checkpoints
+# VS Code files
+.vscode/
+# PyCharm files
+.idea/
+# Distribution / Packaging
+build/
+dist/
+cache/
+.wdm/
+Lib/
+Include/
+Scripts/
+*.egg-info/
+*.egg
+*.whl
+# Logs and debug files
+*.log
+# Test results
+*.out
+*.coverage
+.coverage.*
+# Environment variables and settings
+.env
+*.env
+pip-log.txt
+pip-delete-this-directory.txt
+# macOS files
+.DS_Store
+# Windows files
+Thumbs.db
+/databases/service-account.json

Dockerfile ADDED Viewed

	@@ -0,0 +1,52 @@

+# Use the official Python image as the base image
+FROM python:3.10-slim
+# Shared libraries and CLI tools required by headless Chrome and Selenium
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    wget \
+    unzip \
+    curl \
+    gnupg \
+    libnss3 \
+    libgconf-2-4 \
+    libxi6 \
+    libxcursor1 \
+    libxrandr2 \
+    libxss1 \
+    libxtst6 \
+    fonts-liberation \
+    xdg-utils \
+    libatk-bridge2.0-0 \
+    libgtk-3-0 && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+# Install Google Chrome from the official .deb. wget and unzip are already
+# installed by the layer above, so only the package index is refreshed here;
+# the index is removed again so it does not stay in the image layer.
+RUN apt-get update && \
+    wget https://dl.google.com/Linux/direct/google-chrome-stable_current_amd64.deb && \
+    apt-get install -y ./google-chrome-stable_current_amd64.deb && \
+    rm google-chrome-stable_current_amd64.deb && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+# Fail the build early if the Chrome binary is not on PATH
+RUN which google-chrome
+# Run the application as a non-root user with UID 1000
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+WORKDIR $HOME/app
+# Install Python dependencies before copying the sources so this layer is
+# reused when only application code changes
+COPY --chown=user requirements.txt $HOME/app/requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+COPY --chown=user . $HOME/app
+# Serve the ASGI app object `app` from app.py on port 7860
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,10 +1,12 @@
 ---
-title: NextanalyticsScraper1.0
-emoji: 👁
-colorFrom: gray
-colorTo: green
 sdk: docker
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: NextAnalyticsScraping
+emoji: 📚
+colorFrom: pink
+colorTo: yellow
 sdk: docker
 pinned: false
+license: apache-2.0
+short_description: scraping test hosting
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,267 @@

+import os
+import random
+import time
+import pandas as pd
+from fastapi import FastAPI, HTTPException
+from seleniumwire import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.chrome.options import Options
+from webdriver_manager.chrome import ChromeDriverManager
+from webdriver_manager.core.driver_cache import DriverCacheManager
+from selenium.webdriver.common.by import By
+from fake_headers import Headers
+from fastapi.middleware.cors import CORSMiddleware
+import logging
+from selenium_driverless import webdriver as webdriverless
+proxy_username="ockzoweb"
+proxy_password="23wxmulibzuq"
+proxy_address="198.23.239.134"
+proxy_port="6540"
+proxy_url=f"http://{proxy_username}:{proxy_password}@{proxy_address}:{proxy_port}"
+seleniumwire_options = {
+    "proxy": {
+        "http": proxy_url,
+        "https": proxy_url,
+    }
+}
+# Initialize FastAPI
+# The interactive API docs are served at /swagger (docs_url).
+# NOTE(review): debug=True is hard-coded — confirm it is intended for the
+# deployed app and not only for local development.
+# NOTE(review): `consumes` does not look like a documented FastAPI() argument —
+# confirm it has any effect.
+app = FastAPI(
+    debug=True,
+    title="NextAnalytics Server",
+    consumes=["application/x-www-form-urlencoded", "multipart/form-data"],
+    docs_url='/swagger'
+)
+# Configure CORS
+# Every origin, method and header is allowed, with credentials enabled.
+# NOTE(review): a wildcard origin combined with allow_credentials=True is very
+# permissive — confirm this is acceptable for this service.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Setup ChromeDriver and Selenium
+# custom_wdm_cache = os.path.join(os.getcwd(), 'custom_wdm_cache')
+# os.environ['WDM_LOCAL'] = custom_wdm_cache
+# Setup logging
+# INFO level on the root logger; the module logs through logging.* directly.
+logging.basicConfig(level=logging.INFO)
+def setup_chromedriver():
+    logging.info("Setting up ChromeDriver...")
+    # custom_wdm_cache = os.path.join(os.getcwd(), 'custom_wdm_cache')
+    # os.environ['WDM_LOCAL'] = custom_wdm_cache
+    # cache_manager = DriverCacheManager(custom_wdm_cache)
+    # os.chmod(custom_wdm_cache, 0o755)  # Ensure proper permissions
+    # path = ChromeDriverManager(cache_manager=cache_manager).install()
+    path = ChromeDriverManager().install()
+    os.chmod(path, 0o755)  # Ensure ChromeDriver is executable
+    logging.info(f"ChromeDriver path: {path}")
+    return path
+# Setup headless Chrome options
+# Define a custom user agent
+# my_user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0"
+my_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
+# capabilities = webdriver.DesiredCapabilities.FIREFOX
+# proxy = None
+browser_option = Options()
+browser_option.add_argument("--headless")  # Running in headless mode (no GUI)
+browser_option.add_argument("--no-sandbox")
+browser_option.add_argument("--disable-dev-shm-usage")
+browser_option.add_argument("--ignore-certificate-errors")
+# browser_option.binary_location = '/usr/bin/firefox'
+# browser_option.binary_location = r'C:\Users\HP\.cache\selenium\firefox\win64\133.0\firefox.exe'
+# browser_option.add_argument("--disable-gpu")
+# browser_option.add_argument("--log-level=3")
+# browser_option.add_argument("--disable-notifications")
+# browser_option.add_argument("--disable-popup-blocking")
+browser_option.add_argument(f"--user-agent={my_user_agent}")
+# if proxy:
+#     browser_option.add_argument(f"--proxy-server={proxy}")
+# Setup WebDriver
+driver_path = setup_chromedriver()
+service = Service(executable_path=driver_path)
+driver = webdriver.Chrome(service=service, options=browser_option,)
+# actions = ActionChains(driver)
+def getSearchPostData(search_keyword, index, name="", forCompetitorAnalysis=False):
+    # Navigate to the search results page
+    url = f'https://www.reddit.com/search/?q={search_keyword}'
+    driver.get(url)
+    time.sleep(3)  # Consider using WebDriverWait instead of sleep for better reliability
+    logging.info("Navigated to search page.")
+    posts_data = []
+    list_length = 0  # posts count
+    try:
+        if forCompetitorAnalysis:
+            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+            time.sleep(5)
+        post_cards = driver.find_elements(By.CSS_SELECTOR, 'a[data-testid="post-title-text"]')
+        post_cards_1 = driver.find_elements(By.CSS_SELECTOR, 'div[data-testid="search-counter-row"]')
+        post_cards_2 = driver.find_elements(By.CSS_SELECTOR, 'faceplate-timeago')
+        logging.info(f"Found {len(post_cards)} post cards.")
+        idx = list_length
+        for card in post_cards_1:
+            try:
+                votes_count = card.find_element(By.XPATH, './/faceplate-number').text
+                comments_count = card.find_element(By.XPATH,
+                    './/span[contains(text(), "comment") or contains(text(), "comments")]/preceding-sibling::faceplate-number'
+                ).text
+                posts_data.append({
+                    "index": idx,
+                    "comment_count": comments_count,
+                    "votes_count": votes_count
+                })
+                idx += 1
+            except Exception as e:
+                logging.error(f"Error processing post_card_1: {e}")
+        idx = list_length
+        for card in post_cards:
+            try:
+                url = card.get_attribute("href")
+                title = card.text
+                posts_data[idx]["title"] = title
+                posts_data[idx]["url"] = url
+                idx += 1
+            except Exception as e:
+                logging.error(f"Error processing post_cards: {e}")
+        idx = list_length
+        for card in post_cards_2:
+            try:
+                time_element = card.find_element(By.XPATH, './time')
+                post_time = time_element.get_attribute('datetime')
+                posts_data[idx]["time"] = post_time
+                idx += 1
+            except Exception as e:
+                logging.error(f"Error processing post_cards_2: {e}")
+    except Exception as e:
+        logging.error(f"Error in scrolling or extracting data: {e}")
+    df = pd.DataFrame(posts_data)
+    df.to_csv(f'posts_data_{index}.csv', index=False)
+    logging.info(f"Data saved to posts_data_{index}.csv")
+    return df
+def get_webpage_title(url: str) -> str:
+    try:
+        getSearchPostData(search_keyword="migraine", index=0)
+        url="https://www.reddit.com"
+        driver.get(url)
+        title = driver.title
+        logging.info(f"Page title: {title}")
+        return title
+    except Exception as e:
+        logging.error(f"Error fetching webpage title: {e}")
+        return str(e)
+@app.get("/")
+async def home():
+    return {"message": "Hello"}
+@app.get("/get-title/")
+async def fetch_title(url: str):
+    """
+    Fetch the title of a webpage by URL.
+    Example: /get-title/?url=https://www.reddit.com
+    """
+    try:
+        title = get_webpage_title(url)
+        return {"url": url, "title": title}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+# @app.get("/get-reddit/")
+# async def getReddit(url: str):
+#     """
+#     Fetch the title of a webpage by URL.
+#     Example: /get-title/?url=https://www.reddit.com
+#     """
+#     try:
+#         options = webdriverless.ChromeOptions()
+#         driver_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
+#         options.add_argument("--headless")  # Running in headless mode (no GUI)
+#         options.add_argument("--no-sandbox")
+#         options.add_argument("--disable-dev-shm-usage")
+#         options.add_argument("--ignore-certificate-errors")
+#         options.add_argument(f"--user-agent={driver_agent}")
+#         title="Notitle"
+#         async with webdriverless.Chrome(options=options) as driver:
+#             await driver.get('https://www.reddit.com')
+#             time.sleep(3)
+#             title = await driver.title
+#             url = await driver.current_url
+#             source = await driver.page_source
+#             print(title)
+#             return {"url": url, "title": title}
+#         return {"url": url, "title": title}
+#     except Exception as e:
+#         raise HTTPException(status_code=500, detail=str(e))
+# Run the app
+# if __name__ == "__main__":
+#     import uvicorn
+#     uvicorn.run(app, host="127.0.0.1", port=7860)
+# from selenium import webdriver
+# from flask import Flask, request
+# from selenium.webdriver.chrome.service import Service
+# from webdriver_manager.chrome import ChromeDriverManager
+# from selenium.webdriver.common.by import By
+# from selenium.webdriver.common.proxy import Proxy, ProxyType
+# app = Flask(__name__)
+# def download_selenium():
+#     prox = Proxy()
+#     prox.proxy_type = ProxyType.MANUAL
+#     prox.http_proxy = "ip_addr:port"
+#     prox.socks_proxy = "ip_addr:port"
+#     prox.ssl_proxy = "ip_addr:port"
+#     chrome_options = webdriver.ChromeOptions()
+#     driver_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
+#     capabilities = webdriver.DesiredCapabilities.CHROME
+#     prox.to_capabilities(capabilities)
+#     chrome_options.add_argument("--headless=new")
+#     # chrome_options.add_argument(f"--proxy-server={proxy}")
+#     chrome_options.add_argument("--no-sandbox")
+#     chrome_options.add_argument("--disable-dev-shm-usage")
+#     # chrome_options.add_argument("--ignore-certificate-errors")
+#     # chrome_options.add_argument("--disable-gpu")
+#     # chrome_options.add_argument("--log-level=3")
+#     # chrome_options.add_argument("--disable-notifications")
+#     # chrome_options.add_argument("--disable-popup-blocking")
+#     prefs = {"profile.managed_default_content_settings.images": 2}
+#     # chrome_options.add_experimental_option("prefs", prefs)
+#     chrome_options.add_argument(f"--user-agent={driver_agent}")
+#     driver = webdriver.Chrome(service=Service(ChromeDriverManager().install(),desired_capabilities=capabilities), options=chrome_options)
+#     driver.get("https://reddit.com")
+#     title = driver.title
+#     # language = driver.find_element(By.XPATH, "//div[@id='SIvCob']").text
+#     data = {'Page Title': title}
+#     return data
+# @app.route('/', methods = ['GET','POST'])
+# def home():
+#     if (request.method == 'GET'):
+#         return download_selenium()
+# if __name__ == "__main__":
+#     app.run(debug=True, port=3000)

custom_wdm_cache/.gitkeep ADDED Viewed

File without changes

posts_data_0.csv ADDED Viewed

	@@ -0,0 +1,26 @@

+index,comment_count,votes_count,title,url,time
+0,26,6,Migraine DBQ,https://www.reddit.com/r/VeteransBenefits/comments/1cogdv4/migraine_dbq/,2024-05-10T03:51:24.726Z
+1,84,651,A cool guide to migraine symptoms!,https://www.reddit.com/r/coolguides/comments/1f7wymg/a_cool_guide_to_migraine_symptoms/,2024-09-03T10:56:10.230Z
+2,321,141,What makes you sure it's a migraine?,https://www.reddit.com/r/migraine/comments/1cth4kw/what_makes_you_sure_its_a_migraine/,2024-05-16T16:33:05.274Z
+3,217,100,Biggest migraine life hacks?,https://www.reddit.com/r/migraine/comments/1902t0p/biggest_migraine_life_hacks/,2024-01-06T15:49:12.132Z
+4,112,40,What finally stopped your migraine?,https://www.reddit.com/r/migraine/comments/1e7wvsc/what_finally_stopped_your_migraine/,2024-07-20T14:27:22.529Z
+5,228,222,ELI5: What causes Migraines?,https://www.reddit.com/r/explainlikeimfive/comments/1e84z59/eli5_what_causes_migraines/,2024-07-20T20:35:12.699Z
+6,1K,2.9K,Most people use the word 'migraine' wrong.,https://www.reddit.com/r/unpopularopinion/comments/1884d1g/most_people_use_the_word_migraine_wrong/,2023-12-01T05:45:16.842Z
+7,236,1.9K,Wife’s migraines reduced by 90% and I feel like a jackass,https://www.reddit.com/r/migraine/comments/1g1dxw5/wifes_migraines_reduced_by_90_and_i_feel_like_a/,2024-10-11T16:31:11.670Z
+8,257,1.1K,It's just a migraine,https://www.reddit.com/r/Radiology/comments/1cisw7o/its_just_a_migraine/,2024-05-02T22:33:13.773Z
+9,239,660,"The cure for migraines! I've got it now, you can all stop searching! /s",https://www.reddit.com/r/migraine/comments/1ebkkff/the_cure_for_migraines_ive_got_it_now_you_can_all/,2024-07-25T03:09:38.426Z
+10,40,10,What are your migraine relief tricks?,https://www.reddit.com/r/AskReddit/comments/15oald2/what_are_your_migraine_relief_tricks/,2023-08-11T14:34:43.326Z
+11,400,1.7K,Some migraine symptoms you might not expect,https://www.reddit.com/r/migraine/comments/1e8zlkv/some_migraine_symptoms_you_might_not_expect/,2024-07-21T23:20:45.826Z
+12,158,3.1K,I painted this based on the visuals I get from migraines.,https://www.reddit.com/r/AbstractArt/comments/1gojgxz/i_painted_this_based_on_the_visuals_i_get_from/,2024-11-11T04:02:54.421Z
+13,185,698,migraine suffers are born with defective nervous systems,https://www.reddit.com/r/migraine/comments/1bs1g6c/migraine_suffers_are_born_with_defective_nervous/,2024-03-31T04:09:32.745Z
+14,911,25K,I’ve had constant migraines for the past week thanks to Matlab.,https://www.reddit.com/r/ProgrammerHumor/comments/xbmg98/ive_had_constant_migraines_for_the_past_week/,2022-09-11T16:10:36.852Z
+15,65,41,ELI5: what is the difference between a Headache vs a migraine?,https://www.reddit.com/r/explainlikeimfive/comments/1ccqoaj/eli5_what_is_the_difference_between_a_headache_vs/,2024-04-25T12:28:12.099Z
+16,243,6.5K,Need to share this. I got a migraine from laughing so hard.,https://www.reddit.com/r/BaldursGate3/comments/1butvvc/need_to_share_this_i_got_a_migraine_from_laughing/,2024-04-03T14:23:48.741Z
+17,54,6,What type of Migraine Logs does VA accept?,https://www.reddit.com/r/VeteransBenefits/comments/1fmdy2v/what_type_of_migraine_logs_does_va_accept/,2024-09-21T21:54:44.854Z
+18,231,1.9K,Raise your hand if you’re an American with a stress induced migraine today,https://www.reddit.com/r/migraine/comments/1gk64ce/raise_your_hand_if_youre_an_american_with_a/,2024-11-05T13:01:07.737Z
+19,240,316,Sign a migraine is coming,https://www.reddit.com/r/migraine/comments/1b90est/sign_a_migraine_is_coming/,2024-03-07T17:33:14.935Z
+20,1.1K,24K,TIL: Migraines are 3 times more common in women than in men.,https://www.reddit.com/r/todayilearned/comments/t0jzlf/til_migraines_are_3_times_more_common_in_women/,2022-02-24T20:03:31.456Z
+21,16,6,Migraine or MS?,https://www.reddit.com/r/MultipleSclerosis/comments/1fu5xnr/migraine_or_ms/,2024-10-02T02:31:44.292Z
+22,644,3.4K,This is the first time I’ve had someone tell me what I have isn’t a migraine. I mentioned having to grocery shop during a migraine and got this as a response.,https://www.reddit.com/r/migraine/comments/1780gdm/this_is_the_first_time_ive_had_someone_tell_me/,2023-10-14T22:12:03.962Z
+23,91,46,How do you manage your migraines?,https://www.reddit.com/r/POTS/comments/1duim0f/how_do_you_manage_your_migraines/,2024-07-03T16:27:50.806Z
+24,928,10K,10 minutes video call healthcare for $300 ...and just for migraine. Wow.,https://www.reddit.com/r/facepalm/comments/w6t020/10_minutes_video_call_healthcare_for_300_and_just/,2022-07-24T10:59:04.264Z

requirements.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+flask==3.0.3
+gunicorn
+flask_cors
+selenium==4.26.1
+uvicorn
+scalar_fastapi==1.0.3
+requests
+pandas
+numpy
+webdriver_manager==4.0.2
+fake_headers
+fastapi
+selenium_driverless
+asyncio
+selenium-wire
+blinker==1.7.0
+# selenium==4.6.0
+# requests==2.28.1
+# webdriver_manager==3.8.4
+# packaging==21.3
+# flask-restful==0.3.9
+# gunicorn==20.1.0

selenium_webapp-main/Dockerfile ADDED Viewed

	@@ -0,0 +1,16 @@

+FROM cypress/browsers:latest
+# Declared after FROM so it is visible inside this build stage, and exported
+# as ENV because CMD is evaluated at run time, where build ARGs no longer exist.
+# A PORT variable supplied by the hosting platform still overrides the default.
+ARG PORT=443
+ENV PORT=${PORT}
+# Refresh the package index before installing anything
+RUN apt-get update && apt-get install -y python3 python3-pip
+RUN echo $(python3 -m site --user-base)
+COPY requirements.txt  .
+ENV PATH=/home/root/.local/bin:${PATH}
+RUN pip install -r requirements.txt
+COPY . .
+# The application in this directory is the Flask object `app` in
+# selenium_webapp.py, and gunicorn is the server listed in requirements.txt
+CMD gunicorn selenium_webapp:app --bind 0.0.0.0:$PORT

selenium_webapp-main/requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+selenium==4.6.0
+requests==2.28.1
+webdriver_manager==3.8.4
+packaging==21.3
+flask-restful==0.3.9
+gunicorn==20.1.0

selenium_webapp-main/selenium_webapp.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from selenium import webdriver
+from flask import Flask, request
+from selenium.webdriver.chrome.service import Service
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.common.by import By
+app = Flask(__name__)
+def download_selenium():
+    chrome_options = webdriver.ChromeOptions()
+    chrome_options.add_argument("--headless")
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+    prefs = {"profile.managed_default_content_settings.images": 2}
+    chrome_options.add_experimental_option("prefs", prefs)
+    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
+    driver.get("https://google.com")
+    title = driver.title
+    language = driver.find_element(By.XPATH, "//div[@id='SIvCob']").text
+    data = {'Page Title': title, 'Language': language}
+    return data
+@app.route('/', methods = ['GET','POST'])
+def home():
+    if (request.method == 'GET'):
+        return download_selenium()
+# Start Flask's built-in server on port 3000 when the file is run directly.
+# NOTE(review): debug=True is hard-coded — confirm this entry point is only
+# used for local development.
+if __name__ == "__main__":
+    app.run(debug=True, port=3000)

test.py ADDED Viewed

	@@ -0,0 +1,18 @@

+import time
+from selenium_driverless import webdriver
+import asyncio
+async def main():
+    options = webdriver.ChromeOptions()
+    async with webdriver.Chrome(options=options) as driver:
+        await driver.get('https://www.reddit.com')
+        time.sleep(3)
+        title = await driver.title
+        url = await driver.current_url
+        source = await driver.page_source
+        print(title)
+asyncio.run(main())