File size: 3,846 Bytes
a53470d
6e9b7bf
a53470d
 
 
6e9b7bf
a53470d
d0202f5
 
 
 
 
 
 
a53470d
 
d0202f5
 
 
a53470d
 
 
 
 
 
 
 
 
 
d0202f5
 
 
 
 
 
 
 
 
 
a53470d
d0202f5
 
a53470d
6e9b7bf
a53470d
 
6e9b7bf
a53470d
 
d0202f5
 
 
 
 
 
a53470d
d0202f5
 
a53470d
d0202f5
a53470d
6e9b7bf
d0202f5
 
 
a53470d
 
d0202f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a53470d
 
6e9b7bf
 
 
d0202f5
 
 
 
6e9b7bf
 
d0202f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a53470d
d0202f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e9b7bf
d0202f5
6e9b7bf
 
 
d0202f5
a53470d
6e9b7bf
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
from fastapi import FastAPI, Form
from fastapi.responses import JSONResponse, HTMLResponse
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import os

# Stealth engine
from stealth_browser import (
    launch_stealth_browser,
    stealth_goto,
    close_browser
)

app = FastAPI(title="Ultra Powerful Scraper")

# --------------------------------
# Utils
# --------------------------------
def get_headers():
    """Build browser-like request headers with a randomized User-Agent."""
    user_agent = UserAgent()
    headers = {
        "User-Agent": user_agent.random,
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml",
        "Connection": "keep-alive",
        "DNT": "1",
    }
    return headers

def is_cloudflare_page(html: str) -> bool:
    """Heuristically detect whether *html* is a Cloudflare challenge page.

    Performs a case-insensitive substring scan for known challenge markers.
    """
    lowered = html.lower()
    for marker in (
        "cf-browser-verification",
        "cloudflare",
        "attention required!",
        "checking your browser",
        "/cdn-cgi/",
    ):
        if marker in lowered:
            return True
    return False

def parse_html(html: str):
    """Extract structured data from an HTML document.

    Returns a dict with keys ``title`` (str), ``headings`` (h1-h3 texts),
    ``paragraphs`` (p texts) and ``links`` (de-duplicated href values).
    """
    soup = BeautifulSoup(html, "lxml")

    # soup.title.string is None for an empty <title></title> (and for a
    # title containing nested tags) — guard before strip() to avoid an
    # AttributeError on such pages.
    title = ""
    if soup.title and soup.title.string:
        title = soup.title.string.strip()

    return {
        "title": title,
        "headings": [h.get_text(strip=True) for h in soup.find_all(["h1", "h2", "h3"])],
        "paragraphs": [p.get_text(strip=True) for p in soup.find_all("p")],
        "links": list(set(a["href"] for a in soup.find_all("a", href=True))),
    }

# --------------------------------
# Static scrape
# --------------------------------
def static_scrape(url: str):
    """Fetch *url* with plain ``requests`` and parse the response.

    Raises:
        RuntimeError: when the response looks like a Cloudflare challenge
            page (signals the caller to fall back to the stealth engine).
        requests.HTTPError: for other non-2xx responses.
    """
    r = requests.get(url, headers=get_headers(), timeout=20)

    # Check for Cloudflare BEFORE raise_for_status(): challenge pages are
    # usually served with a 403/503 status, so checking afterwards would
    # surface a bare HTTP error instead of "Cloudflare detected".
    if is_cloudflare_page(r.text):
        raise RuntimeError("Cloudflare detected")

    r.raise_for_status()

    data = parse_html(r.text)
    return {
        "success": True,
        "engine": "requests",
        "bypass": "not_needed",
        **data
    }

# --------------------------------
# Stealth scrape (Cloudflare-aware)
# --------------------------------
def stealth_scrape(url: str):
    """Scrape *url* with the stealth Playwright browser (Cloudflare-aware)."""
    p, browser, context, page = launch_stealth_browser(headless=True)

    try:
        page_html = stealth_goto(page, url)
        parsed = parse_html(page_html)

        result = {
            "success": True,
            "engine": "playwright-stealth",
            "bypass": "attempted",
        }
        result.update(parsed)
        return result
    finally:
        # Always release the browser, even when navigation or parsing fails.
        close_browser(p, browser)

# --------------------------------
# API
# --------------------------------
@app.post("/scrape")
def scrape(url: str = Form(...)):
    """Scrape *url*: try cheap static requests first, then fall back to the
    stealth browser for Cloudflare-protected pages.

    Returns a JSON payload with the parsed page data plus a ``logs`` trail
    describing which engines were attempted; 400 for an invalid URL, 500
    when every engine fails.
    """
    # startswith("http") alone would accept junk such as "httpfoo";
    # require an actual scheme separator, matching the error message.
    if not url.startswith(("http://", "https://")):
        return JSONResponse(
            status_code=400,
            content={
                "success": False,
                "error": "Invalid URL. Must start with http or https."
            }
        )

    logs = []

    # 1️⃣ Try static scraping first — fast and cheap when it works.
    try:
        logs.append("Trying static scraping (requests)")
        result = static_scrape(url)
        logs.append("Static scrape successful")

        result["logs"] = logs
        return JSONResponse(result)

    except Exception as e:
        logs.append(f"Static failed: {str(e)}")

    # 2️⃣ Fall back to the stealth browser (Cloudflare bypass).
    try:
        logs.append("Switching to stealth browser (Cloudflare bypass)")
        result = stealth_scrape(url)
        logs.append("Stealth scrape completed")

        result["logs"] = logs
        return JSONResponse(result)

    except Exception as e:
        logs.append(f"Stealth failed: {str(e)}")

    # Both engines failed — report the full log trail to the client.
    return JSONResponse(
        status_code=500,
        content={
            "success": False,
            "error": "All scraping methods failed",
            "logs": logs
        }
    )

# --------------------------------
# Serve UI
# --------------------------------
@app.get("/", response_class=HTMLResponse)
def serve_ui():
    """Serve the front-end page, or a fallback message when it is missing."""
    if os.path.exists("index.html"):
        with open("index.html", "r", encoding="utf-8") as fh:
            return fh.read()

    return "<h1>index.html not found</h1>"