from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional, Dict
from uuid import uuid4
from src.web_extractor import WebExtractor
from src.scrapers.playwright_scraper import ScraperConfig

app = FastAPI()

# Store active extractor sessions, keyed by session id
sessions: Dict[str, WebExtractor] = {}
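# Note: this registry lives in process memory, so sessions are lost on restart
# and are not shared across multiple uvicorn workers.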


class ScrapeRequest(BaseModel):
    url: str
    query: str
    model_name: Optional[str] = "alias-fast"


class SessionCreateRequest(BaseModel):
    model_name: Optional[str] = "alias-fast"
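
# An illustrative ScrapeRequest payload (example values only):
#   {"url": "https://example.com", "query": "Extract all headlines", "model_name": "alias-fast"}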
@app.get("/health")
async def health():
return {"status": "ok", "message": "CyberScraper 2077 API is running"}
@app.post("/api/scrape")
async def scrape(request: ScrapeRequest):
"""Stateless scrape request (creates a new extractor for each request)"""
scraper_config = ScraperConfig(
headless=True,
max_retries=3,
delay_after_load=5
)
extractor = WebExtractor(model_name=request.model_name, scraper_config=scraper_config)
try:
# Construct the query by combining URL and the specific request
full_query = f"{request.url} {request.query}"
response = await extractor.process_query(full_query)
# If response is a tuple (csv/excel), extract the first part
if isinstance(response, tuple):
response = response[0]
# Clean up
if hasattr(extractor.playwright_scraper, 'close'):
await extractor.playwright_scraper.close()
return {
"url": request.url,
"query": request.query,
"response": response
}
except Exception as e:
# Try to clean up on error
if hasattr(extractor, 'playwright_scraper') and hasattr(extractor.playwright_scraper, 'close'):
await extractor.playwright_scraper.close()
raise HTTPException(status_code=500, detail=str(e))
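
# Example call (hypothetical values):
#   curl -X POST http://localhost:8000/api/scrape \
#        -H "Content-Type: application/json" \
#        -d '{"url": "https://example.com", "query": "Extract all product names as JSON"}'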
@app.post("/api/session")
async def create_session(request: SessionCreateRequest):
"""Create a persistent scraping session"""
session_id = str(uuid4())
try:
scraper_config = ScraperConfig(
headless=True,
max_retries=3,
delay_after_load=5
)
extractor = WebExtractor(model_name=request.model_name, scraper_config=scraper_config)
sessions[session_id] = extractor
return {"session_id": session_id, "message": "Session created", "model": request.model_name}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to create session: {str(e)}")
@app.post("/api/session/{session_id}/scrape")
async def session_scrape(session_id: str, request: ScrapeRequest):
"""Scrape using an existing session context"""
if session_id not in sessions:
raise HTTPException(status_code=404, detail="Session not found")
extractor = sessions[session_id]
try:
full_query = f"{request.url} {request.query}"
response = await extractor.process_query(full_query)
if isinstance(response, tuple):
response = response[0]
return {
"session_id": session_id,
"url": request.url,
"query": request.query,
"response": response
}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.delete("/api/session/{session_id}")
async def close_session(session_id: str):
"""Close a session and release resources"""
if session_id in sessions:
extractor = sessions[session_id]
if hasattr(extractor.playwright_scraper, 'close'):
await extractor.playwright_scraper.close()
del sessions[session_id]
return {"message": "Session closed", "session_id": session_id}
raise HTTPException(status_code=404, detail="Session not found")
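
# Typical session lifecycle (illustrative):
#   1. POST   /api/session                      -> {"session_id": "..."}
#   2. POST   /api/session/<session_id>/scrape  -> scrape reusing the session's extractor
#   3. DELETE /api/session/<session_id>         -> close the session and free resources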


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
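
# To run without the __main__ block (module name "main" assumed here):
#   uvicorn main:app --host 0.0.0.0 --port 8000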