# Scraping-tool / app.py
# Sinketji's picture
# Update app.py
# d0202f5 verified
from fastapi import FastAPI, Form
from fastapi.responses import JSONResponse, HTMLResponse
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import os
# Stealth engine
from stealth_browser import (
launch_stealth_browser,
stealth_goto,
close_browser
)
# FastAPI application object; the route handlers in this file register
# themselves on it through the @app.post / @app.get decorators.
app = FastAPI(title="Ultra Powerful Scraper")
# --------------------------------
# Utils
# --------------------------------
def get_headers():
    """Build the HTTP request headers used for static scraping.

    A new ``UserAgent`` object is created on every call and its ``random``
    attribute supplies the ``User-Agent`` value; all other headers are fixed.

    Returns:
        dict: Header names mapped to their string values.
    """
    headers = {"User-Agent": UserAgent().random}
    headers.update(
        {
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml",
            "Connection": "keep-alive",
            "DNT": "1",
        }
    )
    return headers
def is_cloudflare_page(html: str) -> bool:
    """Report whether *html* contains any known Cloudflare marker string.

    The comparison is a case-insensitive substring search over the whole
    document.

    NOTE(review): because the search covers the full markup, generic markers
    such as "cloudflare" or "/cdn-cgi/" also match ordinary pages that merely
    mention or load something from Cloudflare -- confirm this is acceptable.

    Args:
        html: Page markup to inspect.

    Returns:
        True if at least one marker occurs in *html*, otherwise False.
    """
    haystack = html.lower()
    for needle in (
        "cf-browser-verification",
        "cloudflare",
        "Attention Required!",
        "Checking your browser",
        "/cdn-cgi/",
    ):
        if needle.lower() in haystack:
            return True
    return False
def parse_html(html: str):
    """Extract the title, headings, paragraphs and links from HTML markup.

    Args:
        html: Raw HTML, parsed with BeautifulSoup using the ``lxml`` parser.

    Returns:
        dict with the keys:
            "title": stripped text of the ``<title>`` element, or ``""`` when
                the document has no title or the title is empty.
            "headings": text of every ``h1``/``h2``/``h3`` element.
            "paragraphs": text of every ``p`` element.
            "links": unique ``href`` values of ``a`` elements, in the order
                they first appear in the document.
    """
    soup = BeautifulSoup(html, "lxml")

    title = ""
    title_tag = soup.title
    if title_tag is not None:
        # ``.string`` is None for an empty <title> or one that contains
        # nested nodes; calling .strip() on it would raise AttributeError,
        # so fall back to the concatenated text in that case.
        title_text = title_tag.string
        if title_text is not None:
            title = title_text.strip()
        else:
            title = title_tag.get_text(strip=True)

    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # result is deterministic (a set would yield an arbitrary order).
    links = list(
        dict.fromkeys(a["href"] for a in soup.find_all("a", href=True))
    )

    return {
        "title": title,
        "headings": [
            h.get_text(strip=True) for h in soup.find_all(["h1", "h2", "h3"])
        ],
        "paragraphs": [p.get_text(strip=True) for p in soup.find_all("p")],
        "links": links,
    }
# --------------------------------
# Static scrape
# --------------------------------
def static_scrape(url: str):
    """Fetch *url* with ``requests`` and parse the returned HTML.

    Args:
        url: Address of the page to download.

    Returns:
        dict: ``success``/``engine``/``bypass`` status fields merged with the
        fields produced by ``parse_html``.

    Raises:
        RuntimeError: If the response body looks like a Cloudflare page
            according to ``is_cloudflare_page``.
        Exception: Anything raised by ``requests.get`` or
            ``raise_for_status`` propagates unchanged.
    """
    response = requests.get(url, headers=get_headers(), timeout=20)
    response.raise_for_status()

    body = response.text
    if is_cloudflare_page(body):
        raise RuntimeError("Cloudflare detected")

    payload = {
        "success": True,
        "engine": "requests",
        "bypass": "not_needed",
    }
    payload.update(parse_html(body))
    return payload
# --------------------------------
# Stealth scrape (Cloudflare-aware)
# --------------------------------
def stealth_scrape(url: str):
    """Load *url* in the stealth browser and parse the resulting HTML.

    The browser is launched headless and is always closed afterwards, whether
    navigation and parsing succeed or raise.

    Args:
        url: Address of the page to load.

    Returns:
        dict: ``success``/``engine``/``bypass`` status fields merged with the
        fields produced by ``parse_html``.
    """
    # launch_stealth_browser returns a 4-tuple; only the first, second and
    # fourth items are used here (the third is kept solely for unpacking).
    runtime, browser, _context, page = launch_stealth_browser(headless=True)
    try:
        parsed = parse_html(stealth_goto(page, url))
        outcome = {
            "success": True,
            "engine": "playwright-stealth",
            "bypass": "attempted",
        }
        outcome.update(parsed)
        return outcome
    finally:
        close_browser(runtime, browser)
# --------------------------------
# API
# --------------------------------
@app.post("/scrape")
def scrape(url: str = Form(...)):
    """Scrape *url*, trying the static engine first and the stealth browser second.

    Args:
        url: Form field holding the page address; it must begin with
            ``http://`` or ``https://``.

    Returns:
        JSONResponse:
            * 400 with an error payload when *url* has no http(s) scheme.
            * 200 with the scrape result plus a ``logs`` list when either
              engine succeeds.
            * 500 with an error payload and ``logs`` when both engines fail.
    """
    # Checking only for the bare prefix "http" would accept values such as
    # "httpfoo" or "http-not-a-url"; require the full scheme separator so the
    # check matches what the error message promises.
    if not url.startswith(("http://", "https://")):
        return JSONResponse(
            status_code=400,
            content={
                "success": False,
                "error": "Invalid URL. Must start with http or https.",
            },
        )

    logs = []

    # 1️⃣ Try static
    try:
        logs.append("Trying static scraping (requests)")
        result = static_scrape(url)
        logs.append("Static scrape successful")
        result["logs"] = logs
        return JSONResponse(result)
    except Exception as exc:
        # Deliberately broad: any static failure triggers the fallback below.
        logs.append(f"Static failed: {exc}")

    # 2️⃣ Fallback to stealth
    try:
        logs.append("Switching to stealth browser (Cloudflare bypass)")
        result = stealth_scrape(url)
        logs.append("Stealth scrape completed")
        result["logs"] = logs
        return JSONResponse(result)
    except Exception as exc:
        # Both engines failed; report the accumulated log to the caller.
        logs.append(f"Stealth failed: {exc}")
        return JSONResponse(
            status_code=500,
            content={
                "success": False,
                "error": "All scraping methods failed",
                "logs": logs,
            },
        )
# --------------------------------
# Serve UI
# --------------------------------
@app.get("/", response_class=HTMLResponse)
def serve_ui():
    """Return the contents of ``index.html`` for the root route.

    The path is relative, so it is resolved against the process's current
    working directory.

    Returns:
        str: The file's text decoded as UTF-8, or a short HTML notice when
        ``index.html`` does not exist.
    """
    ui_path = "index.html"
    if os.path.exists(ui_path):
        with open(ui_path, "r", encoding="utf-8") as handle:
            return handle.read()
    return "<h1>index.html not found</h1>"