Initial commit

Files changed (10) hide show

.dockerignore ADDED Viewed

+# Python bytecode caches (the directory name uses double underscores)
+__pycache__
+app/__pycache__
+# Local virtualenv and secret-bearing env file stay out of the build context
+venv
+.env

.env ADDED Viewed

	@@ -0,0 +1 @@


1	+ groq_api_key = "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"

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ venv

Dockerfile ADDED Viewed

+# Base image pinned to a specific Debian release so the apt package set below stays stable
+FROM python:3.10-slim-bookworm
+# Build-time only: keeps apt non-interactive without persisting into the runtime environment
+ARG DEBIAN_FRONTEND=noninteractive
+# Install Chromium, its driver, and the shared libraries/fonts used for headless rendering.
+# libgconf-2-4 and libappindicator3-1 were dropped from the list: they are legacy libraries
+# that are believed to be no longer packaged for this Debian release (verify if re-adding),
+# and the chromium package pulls in its own hard dependencies.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    chromium \
+    chromium-driver \
+    fonts-liberation \
+    libasound2 \
+    libatk-bridge2.0-0 \
+    libgbm1 \
+    libglib2.0-0 \
+    libgtk-3-0 \
+    libnss3 \
+    libx11-xcb1 \
+    libxcomposite1 \
+    libxdamage1 \
+    libxrandr2 \
+    libxss1 \
+    xdg-utils \
+    && rm -rf /var/lib/apt/lists/*
+# Set environment variables so Selenium uses Chromium
+ENV CHROME_BIN=/usr/bin/chromium \
+    PATH="/usr/lib/chromium/:${PATH}"
+WORKDIR /app
+# Install Python dependencies first so this layer is reused when only app code changes.
+# requirements.txt already lists selenium and chromedriver-autoinstaller, so no extra
+# pip install step is needed.
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the FastAPI app; main.py ends up directly in /app
+COPY app ./
+# NOTE(review): the container still runs as root. clickloom_scrape.py calls
+# chromedriver_autoinstaller.install() at import time, which presumably needs a writable
+# install location - confirm that before adding a USER directive.
+# Expose FastAPI port
+EXPOSE 7860
+# Start FastAPI. The module path is "main", not "app.main": the working directory is /app
+# and main.py itself imports its sibling module with a flat `from clickloom_scrape import ...`.
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

app/__pycache__/clickloom_llm.cpython-39.pyc ADDED Viewed

Binary file (2.95 kB). View file

app/__pycache__/clickloom_scrape.cpython-39.pyc ADDED Viewed

Binary file (1.37 kB). View file

app/clickloom_scrape.py ADDED Viewed

+import chromedriver_autoinstaller
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.common.exceptions import StaleElementReferenceException
+import json
+chromedriver_autoinstaller.install()  # Automatically installs compatible driver
+options = Options()
+# options.binary_location = "/usr/bin/chromium-browser"
+options.add_argument("--headless")
+options.add_argument("--no-sandbox")
+options.add_argument("--disable-dev-shm-usage")
+def scraper(link:str,options = options):
+    driver = webdriver.Chrome(options=options)
+    driver.get(link)
+    page_text = driver.find_element(By.TAG_NAME, "body").text
+    scripts = driver.find_elements(By.TAG_NAME, "script")
+    script_sources = [s.get_attribute("src") for s in scripts if s.get_attribute("src")]
+    links = driver.find_elements(By.TAG_NAME, "link")
+    link_sources = [l.get_attribute("href") for l in links if l.get_attribute("href")]
+    driver.quit()
+    data =  {
+        "page_text": page_text,
+        "script_sources": script_sources,
+        "link_sources": link_sources,
+    }
+    return data

app/example.py ADDED Viewed

+import requests
+# URL of the running FastAPI scraper server
+base_url = "https://apexherbert200-selenium-scraper2.hf.space/scrape"
+# Target page to scrape
+params = {"link": "https://jobright.ai/jobs/info/681ab6e27e673b00b9024e36"}
+# Send GET request
+response = requests.get(base_url, params=params)
+# Print the JSON response
+if response.status_code == 200:
+    data = response.json()
+    print(data)
+else:
+    print("Error:", response.status_code, response.text)

app/main.py ADDED Viewed

+from fastapi import FastAPI, Query
+from clickloom_scrape import scraper
+app = FastAPI()
+@app.get("/scrape")
+def scrape(link: str):
+    return scraper(link)

requirements.txt ADDED Viewed

+selenium==4.14.0
+fastapi
+uvicorn
+pydantic
+chromedriver-autoinstaller