apexherbert200 committed on
Commit
7742d11
·
1 Parent(s): b51e0d5

Initial commit

Browse files
.dockerignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ __pycache__
2
+ app/__pycache__
3
+ venv
4
+ .env
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ groq_api_key = "<REDACTED: rotate this key and supply it via an environment variable or secret store; never commit it>"
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ venv
Dockerfile ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

ENV DEBIAN_FRONTEND=noninteractive

# Install Chromium, its driver, and the shared libraries headless Chromium needs.
# NOTE(review): some of these package names (e.g. libgconf-2-4,
# libappindicator3-1) may not exist on newer Debian releases behind the
# python:3.10-slim tag -- confirm the image still builds, or pin the base image.
RUN apt-get update && apt-get install -y \
    chromium-driver \
    chromium \
    libglib2.0-0 \
    libnss3 \
    libgconf-2-4 \
    libxss1 \
    libappindicator3-1 \
    libasound2 \
    libatk-bridge2.0-0 \
    libx11-xcb1 \
    libxcomposite1 \
    libxdamage1 \
    libxrandr2 \
    libgbm1 \
    libgtk-3-0 \
    xdg-utils \
    fonts-liberation \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

# Set environment variables so Selenium uses Chromium
ENV CHROME_BIN=/usr/bin/chromium
ENV PATH="/usr/lib/chromium/:${PATH}"

# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install Selenium and chromedriver-autoinstaller
RUN pip install --no-cache-dir selenium chromedriver-autoinstaller

# Copy the FastAPI app; the *contents* of ./app land directly in /app
COPY app /app
WORKDIR /app

# Expose FastAPI port
EXPOSE 7860

# Start FastAPI.
# The working directory is /app and main.py sits directly inside it (and
# imports clickloom_scrape as a top-level module), so the import string is
# "main:app". "app.main:app" would look for /app/app/main.py, which does not exist.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
46
+
47
+
48
+
49
+
app/__pycache__/clickloom_llm.cpython-39.pyc ADDED
Binary file (2.95 kB). View file
 
app/__pycache__/clickloom_scrape.cpython-39.pyc ADDED
Binary file (1.37 kB). View file
 
app/clickloom_scrape.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import chromedriver_autoinstaller
2
+ from selenium import webdriver
3
+ from selenium.webdriver.chrome.options import Options
4
+ from selenium.webdriver.common.by import By
5
+ from selenium.common.exceptions import StaleElementReferenceException
6
+ import json
7
+
8
# Make sure a chromedriver compatible with the installed browser is available
# before any driver is created.
chromedriver_autoinstaller.install()

# Command-line switches applied to every Chrome instance started by this module.
_CHROME_ARGUMENTS = (
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
)

options = Options()
for _argument in _CHROME_ARGUMENTS:
    options.add_argument(_argument)
15
+
16
+
17
def _collect_attribute(driver, tag_name, attribute):
    """Return the non-empty *attribute* values of every <tag_name> element."""
    values = []
    for element in driver.find_elements(By.TAG_NAME, tag_name):
        # Read the attribute once per element; each call is a WebDriver round-trip.
        value = element.get_attribute(attribute)
        if value:
            values.append(value)
    return values


def scraper(link: str, options=options):
    """Load *link* in Chrome and extract its text and referenced resources.

    Args:
        link: URL passed to ``driver.get``.
        options: Chrome options used to start the browser; defaults to the
            module-level ``options`` object.

    Returns:
        A dict with three keys:
        ``"page_text"`` -- the ``.text`` of the page's ``<body>`` element,
        ``"script_sources"`` -- non-empty ``src`` values of ``<script>`` elements,
        ``"link_sources"`` -- non-empty ``href`` values of ``<link>`` elements.

    Raises:
        Any Selenium exception raised while starting the browser, loading the
        page, or querying elements is propagated to the caller.
    """
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(link)
        page_text = driver.find_element(By.TAG_NAME, "body").text
        script_sources = _collect_attribute(driver, "script", "src")
        link_sources = _collect_attribute(driver, "link", "href")
    finally:
        # Always shut the browser down, even when navigation or a lookup fails;
        # otherwise every failed request leaves a Chrome process behind.
        driver.quit()

    return {
        "page_text": page_text,
        "script_sources": script_sources,
        "link_sources": link_sources,
    }
app/example.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+
3
# URL of the running FastAPI scraper server
base_url = "https://apexherbert200-selenium-scraper2.hf.space/scrape"

# Target page to scrape
params = {"link": "https://jobright.ai/jobs/info/681ab6e27e673b00b9024e36"}

# Upper bound, in seconds, on how long to wait for the server. requests applies
# no timeout by default, so without this a stalled server would block forever.
REQUEST_TIMEOUT_SECONDS = 120


def main():
    """Call the scraper endpoint once and print the JSON result or the error."""
    # Send GET request
    response = requests.get(base_url, params=params, timeout=REQUEST_TIMEOUT_SECONDS)

    # Print the JSON response
    if response.status_code == 200:
        data = response.json()
        print(data)
    else:
        print("Error:", response.status_code, response.text)


# Only hit the network when executed as a script, not when imported.
if __name__ == "__main__":
    main()
app/main.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, Query
2
+ from clickloom_scrape import scraper
3
+
4
+ app = FastAPI()
5
+
6
@app.get("/scrape")
def scrape(link: str):
    """Scrape the page at *link* and return the scraper's result.

    Args:
        link: Absolute ``http`` or ``https`` URL, supplied as a query parameter.

    Returns:
        Whatever ``scraper(link)`` returns.

    Raises:
        HTTPException: status 400 when *link* is not an absolute http(s) URL.
    """
    # Local imports keep this block self-contained; fastapi is already a
    # dependency of this module and urllib is standard library.
    from urllib.parse import urlsplit

    from fastapi import HTTPException

    try:
        parts = urlsplit(link)
    except ValueError:
        # urlsplit rejects some malformed inputs (e.g. a broken IPv6 host).
        parts = None

    # The link is untrusted input handed straight to a real browser. Without
    # this check, schemes such as file:// or chrome:// would let a caller read
    # local files or browser-internal pages through the scraper.
    if parts is None or parts.scheme.lower() not in ("http", "https") or not parts.netloc:
        raise HTTPException(
            status_code=400,
            detail="link must be an absolute http(s) URL",
        )

    return scraper(link)
9
+
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ selenium==4.14.0
2
+ fastapi
3
+ uvicorn
4
+ pydantic
5
+ chromedriver-autoinstaller