Spaces:
No application file
No application file
Upload 3 files
Browse files- app.py +398 -0
- browser_automation_ui.html +270 -0
- requirements.txt +39 -0
app.py
ADDED
|
@@ -0,0 +1,398 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import base64
|
| 3 |
+
import json
|
| 4 |
+
import logging
|
| 5 |
+
import os
|
| 6 |
+
import time
|
| 7 |
+
import uuid
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
from io import BytesIO
|
| 10 |
+
from typing import Dict, List, Optional, Any
|
| 11 |
+
from contextlib import asynccontextmanager
|
| 12 |
+
|
| 13 |
+
import gradio as gr
|
| 14 |
+
import uvicorn
|
| 15 |
+
from fastapi import FastAPI, HTTPException, BackgroundTasks
|
| 16 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 17 |
+
from fastapi.responses import JSONResponse
|
| 18 |
+
from pydantic import BaseModel
|
| 19 |
+
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
| 20 |
+
from selenium import webdriver
|
| 21 |
+
from selenium.webdriver.chrome.options import Options
|
| 22 |
+
from selenium.webdriver.chrome.service import Service
|
| 23 |
+
from selenium.webdriver.common.by import By
|
| 24 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 25 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 26 |
+
from selenium.common.exceptions import TimeoutException, WebDriverException
|
| 27 |
+
from bs4 import BeautifulSoup
|
| 28 |
+
from PIL import Image
|
| 29 |
+
|
| 30 |
+
# Configure logging: INFO level on the root logger, module-scoped logger below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global browser instances
# session_id -> per-session record (Playwright context/page, Selenium driver,
# creation timestamp and launch config), filled in by the launch endpoint.
browser_instances: Dict[str, Dict] = {}
# Playwright driver handle and the shared Chromium browser; both are assigned
# during application startup and stay None until then.
playwright_instance = None
browser_pool = None
|
| 38 |
+
|
| 39 |
+
# Pydantic models
|
| 40 |
+
class BrowserLaunchRequest(BaseModel):
    # Body of POST /api/browser/launch.
    headless: bool = True
    # Viewport (Playwright) and window (Selenium) size in pixels.
    width: int = 1920
    height: int = 1080
    # Optional user-agent override; None leaves the browser default in place.
    user_agent: Optional[str] = None
|
| 45 |
+
|
| 46 |
+
class NavigateRequest(BaseModel):
    # Body of POST /api/browser/navigate.
    session_id: str
    url: str
    # Passed through to Playwright's page.goto(wait_until=...).
    wait_until: str = "networkidle"
|
| 50 |
+
|
| 51 |
+
class ScreenshotRequest(BaseModel):
    # Body of POST /api/browser/screenshot.
    session_id: str
    # Only consulted when no selector is given.
    full_page: bool = False
    # When set, only the first element matching this selector is captured.
    selector: Optional[str] = None
|
| 55 |
+
|
| 56 |
+
class ElementActionRequest(BaseModel):
    # Body of POST /api/elements/action.
    session_id: str
    selector: str
    action: str  # click, type, scroll, hover
    # Text to enter for the "type" action; unused by the other actions.
    value: Optional[str] = None
|
| 61 |
+
|
| 62 |
+
class ScrapeRequest(BaseModel):
    # Body of POST /api/scrape/content.
    session_id: str
    # CSS selectors whose text should be extracted (used when extract_all is False).
    selectors: Optional[List[str]] = None
    # When True, a fixed set of common page parts is returned instead.
    extract_all: bool = False
|
| 66 |
+
|
| 67 |
+
class AIExtractionRequest(BaseModel):
    # NOTE(review): no endpoint in the visible part of this file consumes this
    # model; field meanings below are taken from their names only.
    session_id: str
    prompt: str
    target_elements: Optional[List[str]] = None
|
| 71 |
+
|
| 72 |
+
# Global lifespan manager
|
| 73 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Own the shared browser resources for the lifetime of the application.

    Startup: start Playwright, launch the shared headless Chromium and start
    the periodic expired-session sweep.
    Shutdown: stop the sweep, close every session that is still open, then
    close the shared browser and stop Playwright.

    The sweep is started here because handlers registered with
    ``@app.on_event("startup")`` are not run by FastAPI once a ``lifespan``
    callable is supplied to the application.
    """
    global playwright_instance, browser_pool

    async def _periodic_cleanup():
        # Sweep immediately, then every 5 minutes.
        while True:
            cleanup_old_sessions()
            await asyncio.sleep(300)

    cleanup_job = None

    # Startup
    playwright_instance = await async_playwright().start()
    try:
        browser_pool = await playwright_instance.chromium.launch(
            headless=True,
            args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu']
        )
        logger.info("Browser pool initialized")

        # Keep the task in a local so it is strongly referenced while the app runs.
        cleanup_job = asyncio.create_task(_periodic_cleanup())

        yield

    finally:
        # Shutdown (also reached when the Chromium launch above fails, so the
        # Playwright driver is never left running).
        if cleanup_job is not None:
            cleanup_job.cancel()
            try:
                await cleanup_job
            except asyncio.CancelledError:
                pass
            except Exception as e:
                logger.error(f"Session cleanup task failed: {e}")

        # Close sessions that clients left open; otherwise their Selenium
        # drivers would outlive the server process.
        for session_id in list(browser_instances):
            await close_browser_session(session_id)

        if browser_pool:
            await browser_pool.close()
        if playwright_instance:
            await playwright_instance.stop()
        logger.info("Browser instances cleaned up")
|
| 93 |
+
|
| 94 |
+
# Initialize FastAPI app
|
| 95 |
+
# Allowed CORS origins: comma-separated list in CORS_ALLOW_ORIGINS, defaulting
# to the wildcard so existing deployments keep accepting any origin.
_cors_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]

app = FastAPI(
    title="Web Scraping API Service",
    description="Headless browser automation with Playwright and Selenium",
    version="1.0.0",
    lifespan=lifespan
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    # Credentialed cross-origin requests are only enabled for an explicit
    # origin list; combining them with the "*" wildcard would let any site
    # send the user's cookies to this API.
    allow_credentials="*" not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 109 |
+
|
| 110 |
+
# Utility functions
|
| 111 |
+
def get_chrome_options():
    """Build the Selenium Chrome option set shared by every session."""
    chrome_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--window-size=1920,1080',
    )
    options = Options()
    # Flags are applied in the listed order.
    for flag in chrome_flags:
        options.add_argument(flag)
    return options
|
| 120 |
+
|
| 121 |
+
# Strong references to in-flight close tasks. asyncio only keeps weak
# references to tasks, so an unreferenced task may be garbage-collected
# before it finishes.
_pending_session_closes = set()


def cleanup_old_sessions(max_age_seconds: float = 3600):
    """Schedule closure of every session older than ``max_age_seconds``.

    Must be called from code running on the event loop, because the closes
    are scheduled with ``asyncio.create_task``. The function itself returns
    immediately; the sessions are closed asynchronously.

    Args:
        max_age_seconds: Maximum session age in seconds (default: 1 hour).
            Sessions without a ``created_at`` entry are treated as expired.
    """
    current_time = time.time()

    # Collect first: the close tasks mutate browser_instances, which must not
    # happen while it is being iterated.
    expired_sessions = [
        session_id
        for session_id, session_data in browser_instances.items()
        if current_time - session_data.get('created_at', 0) > max_age_seconds
    ]

    for session_id in expired_sessions:
        close_task = asyncio.create_task(close_browser_session(session_id))
        _pending_session_closes.add(close_task)
        close_task.add_done_callback(_pending_session_closes.discard)
|
| 132 |
+
|
| 133 |
+
async def close_browser_session(session_id: str):
    """Close a specific browser session and release all of its resources.

    Unknown session ids are ignored. Errors raised while closing an
    individual resource are logged and do not prevent the remaining
    resources from being closed.

    Args:
        session_id: Key of the session in ``browser_instances``.
    """
    # Remove the record first so a concurrent caller (periodic sweep vs. the
    # DELETE endpoint) cannot close the same session twice.
    session = browser_instances.pop(session_id, None)
    if session is None:
        return

    # Close Playwright session. Page and context are closed independently so
    # a failing page close cannot leave the context open.
    for key in ('playwright_page', 'playwright_context'):
        resource = session.get(key)
        if resource is None:
            continue
        try:
            await resource.close()
        except Exception as e:
            logger.error(f"Error closing Playwright session {session_id}: {e}")

    # Close Selenium session. quit() is a blocking call, so it runs in the
    # default executor instead of stalling the event loop.
    selenium_driver = session.get('selenium_driver')
    if selenium_driver is not None:
        try:
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, selenium_driver.quit)
        except Exception as e:
            logger.error(f"Error closing Selenium session {session_id}: {e}")

    logger.info(f"Closed browser session: {session_id}")
|
| 155 |
+
|
| 156 |
+
# API Endpoints
|
| 157 |
+
|
| 158 |
+
@app.get("/health")
async def health_check():
    """Health check endpoint"""
    # Liveness probe: fixed status plus the server's local time in ISO-8601.
    checked_at = datetime.now()
    payload = {"status": "healthy", "timestamp": checked_at.isoformat()}
    return payload
|
| 162 |
+
|
| 163 |
+
@app.post("/api/browser/launch")
async def launch_browser(request: BrowserLaunchRequest):
    """Launch a new browser instance

    Creates a Playwright context and page on the shared Chromium browser plus
    a dedicated Selenium Chrome driver, and registers both under a new
    session id. Responds with 503 if the shared browser has not been started
    and with 500 if either engine fails to start.
    """
    if browser_pool is None:
        raise HTTPException(status_code=503, detail="Browser pool is not initialized")

    session_id = str(uuid.uuid4())
    context = None

    # NOTE: request.headless is recorded in the session config but not
    # applied: the shared Playwright browser is launched headless and
    # get_chrome_options() always passes --headless.

    def _start_selenium():
        # Runs in a worker thread: starting Chrome is a blocking operation.
        chrome_options = get_chrome_options()
        if request.user_agent:
            chrome_options.add_argument(f'--user-agent={request.user_agent}')

        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.set_window_size(request.width, request.height)
        except Exception:
            # Do not leave a half-configured Chrome process behind.
            driver.quit()
            raise
        return driver

    try:
        # Launch Playwright browser
        context = await browser_pool.new_context(
            viewport={'width': request.width, 'height': request.height},
            user_agent=request.user_agent
        )
        page = await context.new_page()

        # Launch Selenium browser
        loop = asyncio.get_running_loop()
        selenium_driver = await loop.run_in_executor(None, _start_selenium)

        # pydantic v2 renamed .dict() to .model_dump(); support both.
        if hasattr(request, "model_dump"):
            config = request.model_dump()
        else:
            config = request.dict()

        # Store session
        browser_instances[session_id] = {
            'playwright_context': context,
            'playwright_page': page,
            'selenium_driver': selenium_driver,
            'created_at': time.time(),
            'config': config
        }

        logger.info(f"Launched browser session: {session_id}")
        return {"session_id": session_id, "status": "launched"}

    except Exception as e:
        logger.error(f"Error launching browser: {e}")
        # The session was never registered, so nothing else would ever close
        # the Playwright context created above.
        if context is not None:
            try:
                await context.close()
            except Exception as close_error:
                logger.error(f"Error closing Playwright context after failed launch: {close_error}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 199 |
+
|
| 200 |
+
@app.post("/api/browser/navigate")
async def navigate_to_url(request: NavigateRequest):
    """Navigate to a URL

    Loads the URL in the session's Playwright page and then in its Selenium
    driver. Responds with 404 for an unknown session, 400 for an unsupported
    ``wait_until`` value and 500 if either navigation fails.
    """
    if request.session_id not in browser_instances:
        raise HTTPException(status_code=404, detail="Session not found")

    # Reject bad client input up front instead of reporting it as a 500.
    if request.wait_until not in ("commit", "domcontentloaded", "load", "networkidle"):
        raise HTTPException(status_code=400, detail="Invalid wait_until value")

    session = browser_instances[request.session_id]
    # The Selenium call below runs in a worker thread; serialize access per
    # session so concurrent requests never drive the same driver in parallel.
    selenium_lock = session.setdefault('selenium_lock', asyncio.Lock())

    try:
        # Navigate with Playwright
        await session['playwright_page'].goto(request.url, wait_until=request.wait_until)

        # Navigate with Selenium (blocking call, kept off the event loop)
        async with selenium_lock:
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, session['selenium_driver'].get, request.url)

        return {"status": "navigated", "url": request.url}

    except Exception as e:
        logger.error(f"Error navigating to {request.url}: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 220 |
+
|
| 221 |
+
@app.post("/api/browser/screenshot")
async def take_screenshot(request: ScreenshotRequest):
    """Take a screenshot

    Captures the first element matching ``selector`` when one is given,
    otherwise the page (the whole scrollable page if ``full_page`` is set).
    The image is returned base64-encoded. Responds with 404 for an unknown
    session and 500 if the capture fails.
    """
    if request.session_id not in browser_instances:
        raise HTTPException(status_code=404, detail="Session not found")

    session = browser_instances[request.session_id]

    try:
        page = session['playwright_page']

        if request.selector:
            # Screenshot specific element with Playwright.
            # Locator.first is a plain property returning another Locator;
            # it is not awaitable, only the screenshot() call is.
            element = page.locator(request.selector).first
            screenshot_bytes = await element.screenshot()
        else:
            # Page screenshot with Playwright
            screenshot_bytes = await page.screenshot(
                full_page=request.full_page
            )

        # Convert to base64
        screenshot_b64 = base64.b64encode(screenshot_bytes).decode()

        return {
            "screenshot": screenshot_b64,
            "format": "png",
            "timestamp": datetime.now().isoformat()
        }

    except Exception as e:
        logger.error(f"Error taking screenshot: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 252 |
+
|
| 253 |
+
@app.post("/api/elements/action")
async def perform_element_action(request: ElementActionRequest):
    """Perform action on an element

    Supported actions are ``click``, ``type``, ``scroll`` and ``hover``; they
    are applied to the first element matching ``selector``. Responds with 404
    for an unknown session, 400 for an unsupported action and 500 if the
    browser fails to carry the action out.
    """
    if request.session_id not in browser_instances:
        raise HTTPException(status_code=404, detail="Session not found")

    # Validate before entering the try block: raised inside it, the 400 would
    # be caught by the generic handler below and reported as a 500.
    if request.action not in ("click", "type", "scroll", "hover"):
        raise HTTPException(status_code=400, detail="Invalid action")

    session = browser_instances[request.session_id]

    try:
        page = session['playwright_page']
        element = page.locator(request.selector).first

        if request.action == "click":
            await element.click()
        elif request.action == "type":
            # A missing value clears the field.
            await element.fill(request.value or "")
        elif request.action == "scroll":
            await element.scroll_into_view_if_needed()
        else:  # "hover"
            await element.hover()

        return {"status": "completed", "action": request.action}

    except Exception as e:
        logger.error(f"Error performing action {request.action}: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 281 |
+
|
| 282 |
+
def _css_path(elem) -> str:
    """Return a CSS selector addressing exactly ``elem`` within its document.

    The path runs from the root element down to ``elem``; every step is
    ``tag:nth-of-type(k)`` where k is the element's 1-based position among
    the siblings that share its tag name.
    """
    steps = []
    node = elem
    # Stop at the document object, which is the only node without a parent.
    while node is not None and node.parent is not None:
        position = 1
        for sibling in node.previous_siblings:
            # Text nodes have no tag name and are skipped by this comparison.
            if getattr(sibling, "name", None) == node.name:
                position += 1
        steps.append(f"{node.name}:nth-of-type({position})")
        node = node.parent
    return " > ".join(reversed(steps))


@app.get("/api/elements/inspect/{session_id}")
async def inspect_page_elements(session_id: str):
    """Get all interactive elements on the page

    Every element is reported once, in document order, with a ``selector``
    that addresses that specific element. Responds with 404 for an unknown
    session and 500 if the page cannot be read.
    """
    if session_id not in browser_instances:
        raise HTTPException(status_code=404, detail="Session not found")

    session = browser_instances[session_id]

    try:
        page = session['playwright_page']

        # Get page content
        content = await page.content()
        soup = BeautifulSoup(content, 'html.parser')

        # Find interactive elements
        interactive_selectors = [
            'a', 'button', 'input', 'select', 'textarea',
            '[onclick]', '[href]', '[role="button"]'
        ]

        # One query over the whole selector list, so an element matching
        # several selectors (e.g. <a href>) is not listed more than once.
        found_elements = soup.select(", ".join(interactive_selectors))

        elements = [
            {
                'tag': elem.name,
                'selector': _css_path(elem),
                'text': elem.get_text(strip=True)[:100],
                'attributes': dict(elem.attrs),
                'type': elem.get('type', 'N/A')
            }
            for elem in found_elements
        ]

        return {"elements": elements, "total_count": len(elements)}

    except Exception as e:
        logger.error(f"Error inspecting elements: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 321 |
+
|
| 322 |
+
@app.post("/api/scrape/content")
async def scrape_content(request: ScrapeRequest):
    """Scrape content from the page"""
    if request.session_id not in browser_instances:
        raise HTTPException(status_code=404, detail="Session not found")

    session = browser_instances[request.session_id]

    try:
        html = await session['playwright_page'].content()
        soup = BeautifulSoup(html, 'html.parser')

        if request.extract_all:
            # Fixed overview of the page: title, headings, paragraphs, links,
            # images and forms.
            title_tag = soup.title
            heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
            scraped_data = {
                'title': title_tag.string if title_tag else None,
                'headings': [node.get_text(strip=True) for node in soup.find_all(heading_tags)],
                'paragraphs': [node.get_text(strip=True) for node in soup.find_all('p')],
                'links': [
                    {'text': link.get_text(strip=True), 'href': link.get('href')}
                    for link in soup.find_all('a', href=True)
                ],
                'images': [
                    {'src': image.get('src'), 'alt': image.get('alt')}
                    for image in soup.find_all('img')
                ],
                'forms': [
                    {'action': form.get('action'), 'method': form.get('method')}
                    for form in soup.find_all('form')
                ]
            }
        elif request.selectors:
            # Text of every match, keyed by the selector that produced it.
            scraped_data = {
                selector: [node.get_text(strip=True) for node in soup.select(selector)]
                for selector in request.selectors
            }
        else:
            # Nothing was requested.
            scraped_data = {}

        return {"data": scraped_data, "timestamp": datetime.now().isoformat()}

    except Exception as e:
        logger.error(f"Error scraping content: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 358 |
+
|
| 359 |
+
@app.delete("/api/browser/close/{session_id}")
async def close_browser(session_id: str):
    """Close a browser session"""
    # Known session: tear it down and confirm; anything else is a 404.
    if session_id in browser_instances:
        await close_browser_session(session_id)
        return {"status": "closed", "session_id": session_id}

    raise HTTPException(status_code=404, detail="Session not found")
|
| 367 |
+
|
| 368 |
+
@app.get("/api/sessions")
async def list_sessions():
    """List all active browser sessions"""
    # Only the serializable parts of each record are exposed: the id, the
    # creation time as an ISO-8601 string and the launch configuration.
    sessions = [
        {
            'session_id': sid,
            'created_at': datetime.fromtimestamp(record['created_at']).isoformat(),
            'config': record['config']
        }
        for sid, record in browser_instances.items()
    ]
    return {"sessions": sessions, "total_count": len(sessions)}
|
| 380 |
+
|
| 381 |
+
# Background task to cleanup old sessions
|
| 382 |
+
@app.on_event("startup")
async def startup_event():
    """Start the background loop that sweeps expired sessions.

    NOTE(review): the application object is created with ``lifespan=``.
    FastAPI documents that ``on_event`` startup handlers are not run in that
    configuration, so this handler is presumably never invoked - confirm and
    move the scheduling into the lifespan handler if so.
    """
    sweep_interval = 300  # Clean up every 5 minutes

    async def sweep_forever():
        # Sweep once right away, then again after every pause.
        while True:
            cleanup_old_sessions()
            await asyncio.sleep(sweep_interval)

    asyncio.create_task(sweep_forever())
|
| 390 |
+
|
| 391 |
+
if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces, port 7860, with a
    # single worker and auto-reload disabled.
    server_settings = {
        "host": "0.0.0.0",
        "port": 7860,
        "reload": False,
        "workers": 1,
    }
    uvicorn.run("app:app", **server_settings)
|
browser_automation_ui.html
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>🌐 Web Scraping Server UI</title>
|
| 7 |
+
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=Source+Code+Pro:wght@400;500&display=swap" rel="stylesheet">
|
| 8 |
+
<link href="https://cdnjs.cloudflare.com/ajax/libs/lucide/0.263.1/lucide.min.css" rel="stylesheet">
|
| 9 |
+
<style>
|
| 10 |
+
* {
|
| 11 |
+
margin: 0;
|
| 12 |
+
padding: 0;
|
| 13 |
+
box-sizing: border-box;
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
body {
|
| 17 |
+
font-family: 'Inter', sans-serif;
|
| 18 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 19 |
+
min-height: 100vh;
|
| 20 |
+
color: #2d3748;
|
| 21 |
+
overflow-x: hidden;
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
.container {
|
| 25 |
+
max-width: 1400px;
|
| 26 |
+
margin: 0 auto;
|
| 27 |
+
padding: 20px;
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
.header {
|
| 31 |
+
text-align: center;
|
| 32 |
+
margin-bottom: 30px;
|
| 33 |
+
color: white;
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
.header h1 {
|
| 37 |
+
font-size: 2.5rem;
|
| 38 |
+
font-weight: 700;
|
| 39 |
+
margin-bottom: 10px;
|
| 40 |
+
text-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
.header p {
|
| 44 |
+
font-size: 1.1rem;
|
| 45 |
+
opacity: 0.9;
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
.status-bar {
|
| 49 |
+
background: rgba(255, 255, 255, 0.95);
|
| 50 |
+
backdrop-filter: blur(10px);
|
| 51 |
+
border-radius: 12px;
|
| 52 |
+
padding: 16px 24px;
|
| 53 |
+
margin-bottom: 24px;
|
| 54 |
+
display: flex;
|
| 55 |
+
justify-content: space-between;
|
| 56 |
+
align-items: center;
|
| 57 |
+
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1);
|
| 58 |
+
border: 1px solid rgba(255, 255, 255, 0.2);
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
.status-item {
|
| 62 |
+
display: flex;
|
| 63 |
+
align-items: center;
|
| 64 |
+
gap: 8px;
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
.status-indicator {
|
| 68 |
+
width: 12px;
|
| 69 |
+
height: 12px;
|
| 70 |
+
border-radius: 50%;
|
| 71 |
+
animation: pulse 2s infinite;
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
.status-online {
|
| 75 |
+
background: #10b981;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
.status-offline {
|
| 79 |
+
background: #ef4444;
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
@keyframes pulse {
|
| 83 |
+
0%, 100% { opacity: 1; }
|
| 84 |
+
50% { opacity: 0.5; }
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
.main-content {
|
| 88 |
+
background: rgba(255, 255, 255, 0.95);
|
| 89 |
+
backdrop-filter: blur(10px);
|
| 90 |
+
border-radius: 16px;
|
| 91 |
+
box-shadow: 0 20px 40px rgba(0, 0, 0, 0.1);
|
| 92 |
+
border: 1px solid rgba(255, 255, 255, 0.2);
|
| 93 |
+
overflow: hidden;
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
.tab-nav {
|
| 97 |
+
display: flex;
|
| 98 |
+
background: rgba(248, 250, 252, 0.8);
|
| 99 |
+
border-bottom: 1px solid #e2e8f0;
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
.tab-button {
|
| 103 |
+
flex: 1;
|
| 104 |
+
padding: 16px 24px;
|
| 105 |
+
background: none;
|
| 106 |
+
border: none;
|
| 107 |
+
cursor: pointer;
|
| 108 |
+
font-family: 'Inter', sans-serif;
|
| 109 |
+
font-size: 1rem;
|
| 110 |
+
font-weight: 500;
|
| 111 |
+
color: #64748b;
|
| 112 |
+
transition: all 0.3s ease;
|
| 113 |
+
position: relative;
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
.tab-button.active {
|
| 117 |
+
color: #4f46e5;
|
| 118 |
+
background: rgba(79, 70, 229, 0.05);
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
.tab-button.active::after {
|
| 122 |
+
content: '';
|
| 123 |
+
position: absolute;
|
| 124 |
+
bottom: 0;
|
| 125 |
+
left: 0;
|
| 126 |
+
right: 0;
|
| 127 |
+
height: 3px;
|
| 128 |
+
background: #4f46e5;
|
| 129 |
+
border-radius: 3px 3px 0 0;
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
.tab-button:hover {
|
| 133 |
+
background: rgba(79, 70, 229, 0.05);
|
| 134 |
+
color: #4f46e5;
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
.tab-content {
|
| 138 |
+
padding: 32px;
|
| 139 |
+
min-height: 600px;
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
.tab-pane {
|
| 143 |
+
display: none;
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
.tab-pane.active {
|
| 147 |
+
display: block;
|
| 148 |
+
animation: fadeIn 0.3s ease;
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
@keyframes fadeIn {
|
| 152 |
+
from { opacity: 0; transform: translateY(10px); }
|
| 153 |
+
to { opacity: 1; transform: translateY(0); }
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
.api-section {
|
| 157 |
+
margin-bottom: 40px;
|
| 158 |
+
background: #f8fafc;
|
| 159 |
+
border-radius: 12px;
|
| 160 |
+
border: 1px solid #e2e8f0;
|
| 161 |
+
overflow: hidden;
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
.api-header {
|
| 165 |
+
background: linear-gradient(135deg, #4f46e5, #7c3aed);
|
| 166 |
+
color: white;
|
| 167 |
+
padding: 20px 24px;
|
| 168 |
+
display: flex;
|
| 169 |
+
align-items: center;
|
| 170 |
+
gap: 12px;
|
| 171 |
+
}
|
| 172 |
+
|
| 173 |
+
.api-header i {
|
| 174 |
+
font-size: 1.2rem;
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
.api-header h3 {
|
| 178 |
+
font-size: 1.25rem;
|
| 179 |
+
font-weight: 600;
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
.api-body {
|
| 183 |
+
padding: 24px;
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
.form-group {
|
| 187 |
+
margin-bottom: 20px;
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
.form-label {
|
| 191 |
+
display: block;
|
| 192 |
+
margin-bottom: 8px;
|
| 193 |
+
font-weight: 500;
|
| 194 |
+
color: #374151;
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
.form-input, .form-textarea, .form-select {
|
| 198 |
+
width: 100%;
|
| 199 |
+
padding: 12px 16px;
|
| 200 |
+
border: 2px solid #e5e7eb;
|
| 201 |
+
border-radius: 8px;
|
| 202 |
+
font-family: 'Inter', sans-serif;
|
| 203 |
+
font-size: 0.95rem;
|
| 204 |
+
transition: all 0.3s ease;
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
.form-input:focus, .form-textarea:focus, .form-select:focus {
|
| 208 |
+
outline: none;
|
| 209 |
+
border-color: #4f46e5;
|
| 210 |
+
box-shadow: 0 0 0 3px rgba(79, 70, 229, 0.1);
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
.form-textarea {
|
| 214 |
+
min-height: 120px;
|
| 215 |
+
resize: vertical;
|
| 216 |
+
font-family: 'Source Code Pro', monospace;
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
.btn {
|
| 220 |
+
padding: 12px 24px;
|
| 221 |
+
border: none;
|
| 222 |
+
border-radius: 8px;
|
| 223 |
+
cursor: pointer;
|
| 224 |
+
font-family: 'Inter', sans-serif;
|
| 225 |
+
font-size: 0.95rem;
|
| 226 |
+
font-weight: 500;
|
| 227 |
+
transition: all 0.3s ease;
|
| 228 |
+
display: inline-flex;
|
| 229 |
+
align-items: center;
|
| 230 |
+
gap: 8px;
|
| 231 |
+
}
|
| 232 |
+
|
| 233 |
+
.btn-primary {
|
| 234 |
+
background: linear-gradient(135deg, #4f46e5, #7c3aed);
|
| 235 |
+
color: white;
|
| 236 |
+
box-shadow: 0 4px 12px rgba(79, 70, 229, 0.3);
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
+
.btn-primary:hover {
|
| 240 |
+
transform: translateY(-2px);
|
| 241 |
+
box-shadow: 0 6px 20px rgba(79, 70, 229, 0.4);
|
| 242 |
+
}
|
| 243 |
+
|
| 244 |
+
.btn-secondary {
|
| 245 |
+
background: #f1f5f9;
|
| 246 |
+
color: #475569;
|
| 247 |
+
border: 1px solid #e2e8f0;
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
.btn-secondary:hover {
|
| 251 |
+
background: #e2e8f0;
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
.response-area {
|
| 255 |
+
margin-top: 24px;
|
| 256 |
+
background: #1e293b;
|
| 257 |
+
border-radius: 8px;
|
| 258 |
+
padding: 20px;
|
| 259 |
+
font-family: 'Source Code Pro', monospace;
|
| 260 |
+
color: #e2e8f0;
|
| 261 |
+
white-space: pre-wrap;
|
| 262 |
+
max-height: 400px;
|
| 263 |
+
overflow-y: auto;
|
| 264 |
+
border: 1px solid #334155;
|
| 265 |
+
}
|
| 266 |
+
|
| 267 |
+
.element-inspector {
|
| 268 |
+
background: #f8fafc;
|
| 269 |
+
border-radius: 12px;
|
| 270 |
+
border
|
requirements.txt
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Web framework
|
| 2 |
+
fastapi==0.104.1
|
| 3 |
+
uvicorn[standard]==0.24.0
|
| 4 |
+
gradio==4.8.0
|
| 5 |
+
|
| 6 |
+
# Browser automation
|
| 7 |
+
playwright==1.40.0
|
| 8 |
+
selenium==4.15.2
|
| 9 |
+
webdriver-manager==4.0.1
|
| 10 |
+
|
| 11 |
+
# Web scraping and parsing
|
| 12 |
+
beautifulsoup4==4.12.2
|
| 13 |
+
lxml==4.9.3
|
| 14 |
+
requests==2.31.0
|
| 15 |
+
aiohttp==3.9.1
|
| 16 |
+
|
| 17 |
+
# Image processing
|
| 18 |
+
Pillow==10.1.0
|
| 19 |
+
|
| 20 |
+
# Data processing
|
| 21 |
+
pandas==2.1.4
|
| 22 |
+
numpy==1.25.2
|
| 23 |
+
|
| 24 |
+
# Async support
|
| 25 |
+
asyncio-throttle==1.0.2
|
| 26 |
+
aiofiles==23.2.1
|
| 27 |
+
|
| 28 |
+
# Utilities
|
| 29 |
+
python-multipart==0.0.6
|
| 30 |
+
python-dotenv==1.0.0
|
| 31 |
+
pydantic==2.5.0
|
| 32 |
+
|
| 33 |
+
# AI/ML for content understanding (optional)
|
| 34 |
+
openai==1.3.7
|
| 35 |
+
anthropic==0.7.8
|
| 36 |
+
|
| 37 |
+
# Development
|
| 38 |
+
pytest==7.4.3
|
| 39 |
+
pytest-asyncio==0.21.1
|