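"""CyberScraper 2077 API: advanced web scraping with session management and
AI-powered content extraction. Full endpoint documentation is served at /api-docs."""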
import os
import asyncio
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional, Dict
from uuid import uuid4
from src.web_extractor import WebExtractor
from src.scrapers.playwright_scraper import ScraperConfig

app = FastAPI()

# Store active sessions
sessions: Dict[str, WebExtractor] = {}
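# Note: this is a plain in-process dict, so sessions are lost on restart and
# are not shared across multiple workers.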


class ScrapeRequest(BaseModel):
    url: str
    query: str
    model_name: Optional[str] = "alias-fast"


class SessionCreateRequest(BaseModel):
    model_name: Optional[str] = "alias-fast"
@app.get("/health")
async def health():
return {"status": "ok", "message": "CyberScraper 2077 API is running"}
@app.get("/api-docs")
async def api_docs():
"""Comprehensive API documentation with examples"""
return {
"title": "CyberScraper 2077 API Documentation",
"version": "1.0.0",
"description": "Advanced web scraping API with session management and AI-powered content extraction",
"base_url": "https://grazieprego-scrapling.hf.space",
"endpoints": {
"health": {
"method": "GET",
"path": "/health",
"description": "Check if the API is running",
"response": {
"status": "ok",
"message": "CyberScraper 2077 API is running"
},
"example": "curl https://grazieprego-scrapling.hf.space/health"
},
"scrape": {
"method": "POST",
"path": "/api/scrape",
"description": "Stateless scrape request - creates a new extractor for each request",
"request_body": {
"url": "string - The URL to scrape",
"query": "string - The extraction query/instruction",
"model_name": "string (optional) - AI model to use (default: 'alias-fast')"
},
"response": {
"url": "string - The scraped URL",
"query": "string - The query used",
"response": "any - The extracted content"
},
"example": {
"curl": "curl -X POST https://grazieprego-scrapling.hf.space/api/scrape -H 'Content-Type: application/json' -d '{\"url\": \"https://example.com\", \"query\": \"Extract all product prices\"}'",
"python": "import requests\nresponse = requests.post('https://grazieprego-scrapling.hf.space/api/scrape', json={'url': 'https://example.com', 'query': 'Extract prices'})\nprint(response.json())"
}
},
"create_session": {
"method": "POST",
"path": "/api/session",
"description": "Create a persistent scraping session for multiple requests",
"request_body": {
"model_name": "string (optional) - AI model to use (default: 'alias-fast')"
},
"response": {
"session_id": "string - UUID of the created session",
"message": "string - Confirmation message",
"model": "string - Model used"
},
"example": {
"curl": "curl -X POST https://grazieprego-scrapling.hf.space/api/session -H 'Content-Type: application/json' -d '{\"model_name\": \"alias-fast\"}'",
"python": "import requests\nsession = requests.post('https://grazieprego-scrapling.hf.space/api/session', json={'model_name': 'alias-fast'})\nsession_id = session.json()['session_id']"
}
},
"session_scrape": {
"method": "POST",
"path": "/api/session/{session_id}/scrape",
"description": "Scrape using an existing session context (more efficient for multiple requests)",
"path_parameters": {
"session_id": "string - UUID of the session"
},
"request_body": {
"url": "string - The URL to scrape",
"query": "string - The extraction query",
"model_name": "string (optional)"
},
"response": {
"session_id": "string - The session ID",
"url": "string - The scraped URL",
"query": "string - The query used",
"response": "any - The extracted content"
},
"example": {
"curl": "curl -X POST https://grazieprego-scrapling.hf.space/api/session/uuid-here/scrape -H 'Content-Type: application/json' -d '{\"url\": \"https://example.com/page1\", \"query\": \"Extract titles\"}'",
"python": "import requests\nresponse = requests.post(f'https://grazieprego-scrapling.hf.space/api/session/{session_id}/scrape', json={'url': 'https://example.com', 'query': 'Extract data'})\nprint(response.json())"
}
},
"close_session": {
"method": "DELETE",
"path": "/api/session/{session_id}",
"description": "Close a session and release resources",
"path_parameters": {
"session_id": "string - UUID of the session to close"
},
"response": {
"message": "string - Confirmation message",
"session_id": "string - The closed session ID"
},
"example": {
"curl": "curl -X DELETE https://grazieprego-scrapling.hf.space/api/session/uuid-here",
"python": "import requests\nresponse = requests.delete(f'https://grazieprego-scrapling.hf.space/api/session/{session_id}')\nprint(response.json())"
}
}
},
"usage_guide": {
"quick_start": [
"1. Make a simple scrape request to /api/scrape",
"2. For multiple requests, create a session first",
"3. Use the session ID for subsequent requests",
"4. Close sessions when done to free resources"
],
"best_practices": [
"Use stateless /api/scrape for one-off requests",
"Use sessions for batch processing multiple URLs",
"Always close sessions when finished",
"Handle errors gracefully (500 errors may occur on complex sites)",
"Set appropriate timeouts for slow-loading pages"
],
"error_handling": {
"404": "Session not found (for session endpoints)",
"500": "Internal server error - check the detail message",
"Common issues": [
"URL unreachable or timeout",
"JavaScript-heavy sites may require different approaches",
"Bot protection may block requests"
]
}
},
"integration_examples": {
"python_script": """
import requests
# Stateless scrape
response = requests.post(
'https://grazieprego-scrapling.hf.space/api/scrape',
json={
'url': 'https://example.com',
'query': 'Extract all headings and prices'
}
)
print("Result:", response.json())
# Session-based workflow
session_response = requests.post(
'https://grazieprego-scrapling.hf.space/api/session',
json={'model_name': 'alias-fast'}
)
session_id = session_response.json()['session_id']
try:
# Multiple requests using the same session
for url in ['https://example.com/page1', 'https://example.com/page2']:
result = requests.post(
f'https://grazieprego-scrapling.hf.space/api/session/{session_id}/scrape',
json={'url': url, 'query': 'Extract product data'}
)
print(f"Scraped {url}:", result.json())
finally:
# Always close the session
requests.delete(f'https://grazieprego-scrapling.hf.space/api/session/{session_id}')
""",
"javascript": """
// Fetch API example
async function scrapeUrl(url, query) {
const response = await fetch('https://grazieprego-scrapling.hf.space/api/scrape', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ url, query })
});
return await response.json();
}
// Usage
scrapeUrl('https://example.com', 'Extract all links').then(console.log);
"""
},
"rate_limits": {
"note": "Rate limits may apply. Please use responsibly.",
"recommendation": "For high-volume scraping, use session-based approach and implement retry logic"
}
}
@app.post("/api/scrape")
async def scrape(request: ScrapeRequest):
"""Stateless scrape request (creates a new extractor for each request)"""
scraper_config = ScraperConfig(
headless=True,
max_retries=3,
delay_after_load=5
)
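    # Note: each stateless call builds a brand-new WebExtractor (and, presumably,
    # a fresh browser context), which keeps requests isolated but is slower than
    # reusing a session for batches.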
    extractor = WebExtractor(model_name=request.model_name, scraper_config=scraper_config)
    try:
        # Construct the query by combining the URL and the specific request
        full_query = f"{request.url} {request.query}"
        response = await extractor.process_query(full_query)
        # If the response is a tuple (csv/excel), keep only the content part
        if isinstance(response, tuple):
            response = response[0]
        return {
            "url": request.url,
            "query": request.query,
            "response": response
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Clean up browser resources whether or not the request succeeded
        if hasattr(extractor, 'playwright_scraper') and hasattr(extractor.playwright_scraper, 'close'):
            await extractor.playwright_scraper.close()
@app.post("/api/session")
async def create_session(request: SessionCreateRequest):
"""Create a persistent scraping session"""
session_id = str(uuid4())
try:
scraper_config = ScraperConfig(
headless=True,
max_retries=3,
delay_after_load=5
)
extractor = WebExtractor(model_name=request.model_name, scraper_config=scraper_config)
sessions[session_id] = extractor
return {"session_id": session_id, "message": "Session created", "model": request.model_name}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to create session: {str(e)}")
@app.post("/api/session/{session_id}/scrape")
async def session_scrape(session_id: str, request: ScrapeRequest):
"""Scrape using an existing session context"""
if session_id not in sessions:
raise HTTPException(status_code=404, detail="Session not found")
extractor = sessions[session_id]
try:
full_query = f"{request.url} {request.query}"
response = await extractor.process_query(full_query)
if isinstance(response, tuple):
response = response[0]
return {
"session_id": session_id,
"url": request.url,
"query": request.query,
"response": response
}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))


@app.delete("/api/session/{session_id}")
async def close_session(session_id: str):
    """Close a session and release resources"""
    if session_id in sessions:
        extractor = sessions[session_id]
        if hasattr(extractor, 'playwright_scraper') and hasattr(extractor.playwright_scraper, 'close'):
            await extractor.playwright_scraper.close()
        del sessions[session_id]
        return {"message": "Session closed", "session_id": session_id}
    raise HTTPException(status_code=404, detail="Session not found")
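

# Optional sketch (not part of the original file): close any sessions that were
# never explicitly deleted when the app shuts down, reusing the same close()
# pattern as close_session above.
@app.on_event("shutdown")
async def cleanup_sessions():
    for extractor in sessions.values():
        if hasattr(extractor, 'playwright_scraper') and hasattr(extractor.playwright_scraper, 'close'):
            await extractor.playwright_scraper.close()
    sessions.clear()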


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)