# Hosting-platform status banner (not code): Spaces: Paused
| from fastapi import FastAPI, Form | |
| from fastapi.responses import JSONResponse, HTMLResponse | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from fake_useragent import UserAgent | |
| import os | |
| # Stealth engine | |
| from stealth_browser import ( | |
| launch_stealth_browser, | |
| stealth_goto, | |
| close_browser | |
| ) | |
# FastAPI application instance; route handlers below are meant to register
# against this object.
app = FastAPI(title="Ultra Powerful Scraper")
| # -------------------------------- | |
| # Utils | |
| # -------------------------------- | |
def get_headers():
    """Build browser-like request headers with a randomized User-Agent.

    Returns:
        dict: HTTP headers for a requests GET call.
    """
    agent = UserAgent()
    headers = {"User-Agent": agent.random}
    headers.update({
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml",
        "Connection": "keep-alive",
        "DNT": "1",
    })
    return headers
def is_cloudflare_page(html: str) -> bool:
    """Heuristically detect a Cloudflare challenge / interstitial page.

    Args:
        html: Raw HTML text of a response body.

    Returns:
        True if any known Cloudflare marker appears (case-insensitive).
    """
    lowered = html.lower()
    # Markers are pre-lowered; membership test is case-insensitive.
    for marker in (
        "cf-browser-verification",
        "cloudflare",
        "attention required!",
        "checking your browser",
        "/cdn-cgi/",
    ):
        if marker in lowered:
            return True
    return False
def parse_html(html: str):
    """Extract title, headings, paragraphs, and unique links from HTML.

    Args:
        html: Raw HTML document text.

    Returns:
        dict with keys "title" (str), "headings" (list[str]),
        "paragraphs" (list[str]) and "links" (sorted list of unique hrefs).
    """
    soup = BeautifulSoup(html, "lxml")
    # BUG FIX: soup.title.string is None for an empty <title></title> or a
    # <title> containing nested tags, so .strip() raised AttributeError.
    # get_text(strip=True) is safe in all those cases.
    title = soup.title.get_text(strip=True) if soup.title else ""
    return {
        "title": title,
        "headings": [h.get_text(strip=True) for h in soup.find_all(["h1", "h2", "h3"])],
        "paragraphs": [p.get_text(strip=True) for p in soup.find_all("p")],
        # sorted() over a set dedupes (as before) and makes output deterministic.
        "links": sorted({a["href"] for a in soup.find_all("a", href=True)}),
    }
| # -------------------------------- | |
| # Static scrape | |
| # -------------------------------- | |
def static_scrape(url: str):
    """Fetch *url* with plain requests and parse the response HTML.

    Args:
        url: Absolute URL to fetch.

    Returns:
        dict: parsed page data plus engine/bypass metadata.

    Raises:
        RuntimeError: when the response looks like a Cloudflare challenge.
        requests.HTTPError: on non-2xx status codes.
    """
    response = requests.get(url, headers=get_headers(), timeout=20)
    response.raise_for_status()
    body = response.text
    if is_cloudflare_page(body):
        raise RuntimeError("Cloudflare detected")
    result = {"success": True, "engine": "requests", "bypass": "not_needed"}
    result.update(parse_html(body))
    return result
| # -------------------------------- | |
| # Stealth scrape (Cloudflare-aware) | |
| # -------------------------------- | |
def stealth_scrape(url: str):
    """Scrape *url* through the stealth Playwright browser.

    Args:
        url: Absolute URL to load in the stealth browser.

    Returns:
        dict: parsed page data plus engine/bypass metadata.

    The browser is always closed, even when navigation or parsing fails.
    """
    p, browser, context, page = launch_stealth_browser(headless=True)
    try:
        page_html = stealth_goto(page, url)
        payload = {"success": True, "engine": "playwright-stealth", "bypass": "attempted"}
        payload.update(parse_html(page_html))
        return payload
    finally:
        close_browser(p, browser)
| # -------------------------------- | |
| # API | |
| # -------------------------------- | |
def scrape(url: str = Form(...)):
    """Scrape *url*: try a fast static fetch first, fall back to stealth.

    NOTE(review): this looks like it should be registered as a FastAPI
    route (e.g. ``@app.post("/scrape")``) — the decorator may have been
    lost from this copy; confirm against the original file.

    Args:
        url: Target URL submitted as form data.

    Returns:
        JSONResponse: scrape results plus a "logs" trail, or an error
        payload (400 for a bad URL, 500 when every engine fails).
    """
    # BUG FIX: startswith("http") accepted junk like "httpfoo" or
    # "httpx://...". The prefix tuple enforces exactly what the error
    # message below promises: http:// or https://.
    if not url.startswith(("http://", "https://")):
        return JSONResponse(
            status_code=400,
            content={
                "success": False,
                "error": "Invalid URL. Must start with http or https."
            }
        )
    logs = []
    # 1) Fast path: plain requests (no browser overhead).
    try:
        logs.append("Trying static scraping (requests)")
        result = static_scrape(url)
        logs.append("Static scrape successful")
        result["logs"] = logs
        return JSONResponse(result)
    except Exception as e:
        # Deliberately broad: any static failure triggers the fallback.
        logs.append(f"Static failed: {str(e)}")
    # 2) Fallback: stealth browser for Cloudflare-protected pages.
    try:
        logs.append("Switching to stealth browser (Cloudflare bypass)")
        result = stealth_scrape(url)
        logs.append("Stealth scrape completed")
        result["logs"] = logs
        return JSONResponse(result)
    except Exception as e:
        logs.append(f"Stealth failed: {str(e)}")
        return JSONResponse(
            status_code=500,
            content={
                "success": False,
                "error": "All scraping methods failed",
                "logs": logs
            }
        )
| # -------------------------------- | |
| # Serve UI | |
| # -------------------------------- | |
def serve_ui():
    """Serve the UI page from index.html in the working directory.

    Returns:
        str: the file's HTML contents, or "<h1>index.html not found</h1>"
        when the file is missing.
    """
    if os.path.exists("index.html"):
        with open("index.html", "r", encoding="utf-8") as page:
            return page.read()
    return "<h1>index.html not found</h1>"