Update app.py
app.py CHANGED
@@ -3,14 +3,20 @@ from fastapi.responses import JSONResponse, HTMLResponse
 import requests
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
-from playwright.sync_api import sync_playwright
 import os

+# Stealth engine
+from stealth_browser import (
+    launch_stealth_browser,
+    stealth_goto,
+    close_browser
+)
+
 app = FastAPI(title="Ultra Powerful Scraper")

-# …
-# …
-# …
+# --------------------------------
+# Utils
+# --------------------------------
 def get_headers():
     ua = UserAgent()
     return {
@@ -21,76 +27,119 @@ def get_headers():
         "DNT": "1",
     }

-…
-…
-…
-…
-…
-…
-…
-…
+def is_cloudflare_page(html: str) -> bool:
+    markers = [
+        "cf-browser-verification",
+        "cloudflare",
+        "Attention Required!",
+        "Checking your browser",
+        "/cdn-cgi/"
+    ]
+    html_lower = html.lower()
+    return any(m.lower() in html_lower for m in markers)

+def parse_html(html: str):
+    soup = BeautifulSoup(html, "lxml")
     return {
-        "success": True,
-        "mode": "static",
         "title": soup.title.string.strip() if soup.title else "",
         "headings": [h.get_text(strip=True) for h in soup.find_all(["h1", "h2", "h3"])],
         "paragraphs": [p.get_text(strip=True) for p in soup.find_all("p")],
         "links": list(set(a["href"] for a in soup.find_all("a", href=True))),
     }

-# …
-# …
-# …
-def …
-…
-…
-        page = browser.new_page()
-        page.goto(url, timeout=30000)
-        page.wait_for_timeout(3000)
-        html = page.content()
-        browser.close()
+# --------------------------------
+# Static scrape
+# --------------------------------
+def static_scrape(url: str):
+    r = requests.get(url, headers=get_headers(), timeout=20)
+    r.raise_for_status()

-…
+    if is_cloudflare_page(r.text):
+        raise RuntimeError("Cloudflare detected")

+    data = parse_html(r.text)
     return {
         "success": True,
-        "…
-        "…
-…
-        "paragraphs": [p.get_text(strip=True) for p in soup.find_all("p")],
-        "links": list(set(a["href"] for a in soup.find_all("a", href=True))),
+        "engine": "requests",
+        "bypass": "not_needed",
+        **data
     }

-# …
-# …
-# …
+# --------------------------------
+# Stealth scrape (Cloudflare-aware)
+# --------------------------------
+def stealth_scrape(url: str):
+    p, browser, context, page = launch_stealth_browser(headless=True)
+
+    try:
+        html = stealth_goto(page, url)
+        data = parse_html(html)
+
+        return {
+            "success": True,
+            "engine": "playwright-stealth",
+            "bypass": "attempted",
+            **data
+        }
+    finally:
+        close_browser(p, browser)
+
+# --------------------------------
+# API
+# --------------------------------
 @app.post("/scrape")
 def scrape(url: str = Form(...)):
     if not url.startswith("http"):
         return JSONResponse(
             status_code=400,
-            content={…
+            content={
+                "success": False,
+                "error": "Invalid URL. Must start with http or https."
+            }
         )

+    logs = []
+
+    # 1️⃣ Try static
+    try:
+        logs.append("Trying static scraping (requests)")
+        result = static_scrape(url)
+        logs.append("Static scrape successful")
+
+        result["logs"] = logs
+        return JSONResponse(result)
+
+    except Exception as e:
+        logs.append(f"Static failed: {str(e)}")
+
+    # 2️⃣ Fallback to stealth
     try:
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
+        logs.append("Switching to stealth browser (Cloudflare bypass)")
+        result = stealth_scrape(url)
+        logs.append("Stealth scrape completed")
+
+        result["logs"] = logs
+        return JSONResponse(result)
+
+    except Exception as e:
+        logs.append(f"Stealth failed: {str(e)}")
+
+        return JSONResponse(
+            status_code=500,
+            content={
+                "success": False,
+                "error": "All scraping methods failed",
+                "logs": logs
+            }
+        )
+
+# --------------------------------
 # Serve UI
-# …
+# --------------------------------
 @app.get("/", response_class=HTMLResponse)
 def serve_ui():
     if not os.path.exists("index.html"):
-        return "<h1>…
+        return "<h1>index.html not found</h1>"

     with open("index.html", "r", encoding="utf-8") as f:
         return f.read()
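
The new code imports a local stealth_browser module that is not included in this diff. Below is a minimal sketch of what it might contain, assuming Playwright's sync API: the three function names and the (p, browser, context, page) return shape come from the import and call sites above, while the launch flags, user agent, and init script are illustrative guesses.

# stealth_browser.py: hypothetical sketch, not part of this commit.
from playwright.sync_api import sync_playwright

def launch_stealth_browser(headless: bool = True):
    """Start Playwright and return (playwright, browser, context, page)."""
    p = sync_playwright().start()
    browser = p.chromium.launch(
        headless=headless,
        args=["--disable-blink-features=AutomationControlled"],  # hide the automation hint
    )
    context = browser.new_context(
        user_agent=(
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        ),
        viewport={"width": 1366, "height": 768},
        locale="en-US",
    )
    page = context.new_page()
    # Mask navigator.webdriver before any page script runs.
    page.add_init_script(
        "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
    )
    return p, browser, context, page

def stealth_goto(page, url: str, timeout_ms: int = 30000) -> str:
    """Navigate, give a client-side challenge a moment to settle, return the HTML."""
    page.goto(url, timeout=timeout_ms, wait_until="domcontentloaded")
    page.wait_for_timeout(5000)  # crude fixed wait for a JS challenge to resolve
    return page.content()

def close_browser(p, browser):
    """Close the browser, then stop the Playwright driver."""
    browser.close()
    p.stop()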
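
To exercise the /scrape endpoint locally, run the app (for example with uvicorn app:app; FastAPI's Form(...) parameters also require the python-multipart package) and post a form-encoded URL. The host, port, and target URL here are illustrative:

# smoke_test.py: assumes the app is running on port 8000.
import requests

resp = requests.post(
    "http://127.0.0.1:8000/scrape",
    data={"url": "https://example.com"},  # sent as form data, matching Form(...)
)
body = resp.json()
print(resp.status_code, body.get("engine"), body.get("bypass"))
print("\n".join(body.get("logs", [])))  # shows which engine ran and why

On success the JSON carries "engine" ("requests" or "playwright-stealth"), the "bypass" flag, the parsed title/headings/paragraphs/links, and the accumulated "logs" from the static-then-stealth fallback.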