Spaces:
Sleeping
Sleeping
Commit ·
7742d11
1
Parent(s): b51e0d5
Initial commit
Browse files- .dockerignore +4 -0
- .env +1 -0
- .gitignore +1 -0
- Dockerfile +49 -0
- app/__pycache__/clickloom_llm.cpython-39.pyc +0 -0
- app/__pycache__/clickloom_scrape.cpython-39.pyc +0 -0
- app/clickloom_scrape.py +32 -0
- app/example.py +17 -0
- app/main.py +9 -0
- requirements.txt +5 -0
.dockerignore
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__
|
| 2 |
+
app/__pycache__
|
| 3 |
+
venv
|
| 4 |
+
.env
|
.env
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Placeholder only: never commit a real credential. Supply the actual key
# through the deployment's secret settings or an untracked local .env file.
groq_api_key = "<your-groq-api-key>"
|
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
venv
# Keep local secrets and interpreter bytecode caches out of version control.
.env
__pycache__/
|
Dockerfile
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
ENV DEBIAN_FRONTEND=noninteractive
|
| 4 |
+
|
| 5 |
+
# Install system dependencies
|
| 6 |
+
RUN apt-get update && apt-get install -y \
|
| 7 |
+
chromium-driver \
|
| 8 |
+
chromium \
|
| 9 |
+
libglib2.0-0 \
|
| 10 |
+
libnss3 \
|
| 11 |
+
libgconf-2-4 \
|
| 12 |
+
libxss1 \
|
| 13 |
+
libappindicator3-1 \
|
| 14 |
+
libasound2 \
|
| 15 |
+
libatk-bridge2.0-0 \
|
| 16 |
+
libx11-xcb1 \
|
| 17 |
+
libxcomposite1 \
|
| 18 |
+
libxdamage1 \
|
| 19 |
+
libxrandr2 \
|
| 20 |
+
libgbm1 \
|
| 21 |
+
libgtk-3-0 \
|
| 22 |
+
xdg-utils \
|
| 23 |
+
fonts-liberation \
|
| 24 |
+
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
| 25 |
+
|
| 26 |
+
# Set environment variable so Selenium uses Chromium
|
| 27 |
+
ENV CHROME_BIN=/usr/bin/chromium
|
| 28 |
+
ENV PATH="/usr/lib/chromium/:${PATH}"
|
| 29 |
+
|
| 30 |
+
# Install Python dependencies
|
| 31 |
+
COPY requirements.txt .
|
| 32 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 33 |
+
|
| 34 |
+
# Install Selenium and chromedriver-autoinstaller
|
| 35 |
+
RUN pip install --no-cache-dir selenium chromedriver-autoinstaller
|
| 36 |
+
|
| 37 |
+
# Copy your FastAPI app
|
| 38 |
+
COPY app /app
|
| 39 |
+
WORKDIR /app
|
| 40 |
+
|
| 41 |
+
# Expose FastAPI port
|
| 42 |
+
EXPOSE 7860
|
| 43 |
+
|
| 44 |
+
# Start FastAPI
|
| 45 |
+
# WORKDIR is /app and main.py sits directly inside it, so the module path is
# "main" (there is no /app/app package for "app.main" to resolve to).
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
|
app/__pycache__/clickloom_llm.cpython-39.pyc
ADDED
|
Binary file (2.95 kB). View file
|
|
|
app/__pycache__/clickloom_scrape.cpython-39.pyc
ADDED
|
Binary file (1.37 kB). View file
|
|
|
app/clickloom_scrape.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import chromedriver_autoinstaller
|
| 2 |
+
from selenium import webdriver
|
| 3 |
+
from selenium.webdriver.chrome.options import Options
|
| 4 |
+
from selenium.webdriver.common.by import By
|
| 5 |
+
from selenium.common.exceptions import StaleElementReferenceException
|
| 6 |
+
import json
|
| 7 |
+
|
| 8 |
+
# Automatically install a compatible chromedriver at import time.
chromedriver_autoinstaller.install()

# Chrome options shared by scraper() through its default argument.
options = Options()
# options.binary_location = "/usr/bin/chromium-browser"
for _chrome_flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    options.add_argument(_chrome_flag)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def scraper(link: str, options=options):
    """Load *link* in Chrome and collect its visible text and asset URLs.

    Args:
        link: URL passed to ``driver.get``.
        options: Selenium Chrome options; defaults to the module-level
            ``options`` object.

    Returns:
        A dict with three keys: ``page_text`` (text of the ``<body>``
        element), ``script_sources`` (non-empty ``src`` attributes of
        ``<script>`` elements) and ``link_sources`` (non-empty ``href``
        attributes of ``<link>`` elements).
    """
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(link)
        page_text = driver.find_element(By.TAG_NAME, "body").text

        # Read each attribute once; every get_attribute call is a round-trip
        # to the browser.
        script_attrs = (
            element.get_attribute("src")
            for element in driver.find_elements(By.TAG_NAME, "script")
        )
        script_sources = [src for src in script_attrs if src]

        link_attrs = (
            element.get_attribute("href")
            for element in driver.find_elements(By.TAG_NAME, "link")
        )
        link_sources = [href for href in link_attrs if href]
    finally:
        # Always shut the browser down: without this, any failure while
        # loading or querying the page would leave a Chrome process running.
        driver.quit()

    return {
        "page_text": page_text,
        "script_sources": script_sources,
        "link_sources": link_sources,
    }
|
app/example.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
|
| 3 |
+
# URL of the running FastAPI scraper server
base_url = "https://apexherbert200-selenium-scraper2.hf.space/scrape"

# Target page to scrape
params = {"link": "https://jobright.ai/jobs/info/681ab6e27e673b00b9024e36"}

# Send GET request. requests waits indefinitely unless a timeout is given,
# so set a generous one: the server drives a real browser before answering.
response = requests.get(base_url, params=params, timeout=120)

# Print the JSON response
if response.status_code == 200:
    data = response.json()
    print(data)
else:
    print("Error:", response.status_code, response.text)
|
app/main.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, Query
|
| 2 |
+
from clickloom_scrape import scraper
|
| 3 |
+
|
| 4 |
+
app = FastAPI()


@app.get("/scrape")
def scrape(link: str):
    """Scrape *link* and return the data collected by ``scraper``.

    Args:
        link: Absolute http(s) URL of the page to scrape, taken from the
            ``link`` query parameter.

    Returns:
        Whatever ``scraper(link)`` returns.

    Raises:
        HTTPException: With status 400 when *link* is not an absolute
            ``http://`` or ``https://`` URL.
    """
    # Imported here so this handler does not depend on edits to the
    # module-level import block.
    from urllib.parse import urlsplit

    from fastapi import HTTPException

    # The URL comes straight from the caller and is handed to a real browser;
    # reject anything that is not plain web content (e.g. file:// or chrome://).
    try:
        parts = urlsplit(link)
    except ValueError:
        parts = None
    if parts is None or parts.scheme not in ("http", "https") or not parts.netloc:
        raise HTTPException(
            status_code=400,
            detail="link must be an absolute http(s) URL",
        )

    return scraper(link)
|
| 9 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
selenium==4.14.0
|
| 2 |
+
fastapi
|
| 3 |
+
uvicorn
|
| 4 |
+
pydantic
|
| 5 |
+
chromedriver-autoinstaller
|