from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional, Dict
from uuid import uuid4
from src.web_extractor import WebExtractor
from src.scrapers.playwright_scraper import ScraperConfig

app = FastAPI()

# Store active sessions
sessions: Dict[str, WebExtractor] = {}
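# NOTE: this is a plain in-process dict, so sessions are lost on restart and
# are not shared across multiple uvicorn workers; each session also keeps its
# extractor (and any underlying browser) alive until the client calls
# DELETE /api/session/{session_id}.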

class ScrapeRequest(BaseModel):
    url: str
    query: str
    model_name: Optional[str] = "alias-fast"

class SessionCreateRequest(BaseModel):
    model_name: Optional[str] = "alias-fast"

@app.get("/health")
async def health():
    return {"status": "ok", "message": "CyberScraper 2077 API is running"}

@app.post("/api/scrape")
async def scrape(request: ScrapeRequest):
    """Stateless scrape request (creates a new extractor for each request)"""
    scraper_config = ScraperConfig(
        headless=True,
        max_retries=3,
        delay_after_load=5
    )

    extractor = WebExtractor(model_name=request.model_name, scraper_config=scraper_config)
    try:
        # Combine the URL and the specific request into a single query
        full_query = f"{request.url} {request.query}"
        response = await extractor.process_query(full_query)

        # CSV/Excel responses come back as a tuple; keep only the data part
        if isinstance(response, tuple):
            response = response[0]

        return {
            "url": request.url,
            "query": request.query,
            "response": response
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Release the browser on both the success and error paths
        if hasattr(extractor, 'playwright_scraper') and hasattr(extractor.playwright_scraper, 'close'):
            await extractor.playwright_scraper.close()
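
# Example stateless scrape (a sketch; assumes the server is running locally
# on port 8000, as in the __main__ block below):
#   curl -X POST http://localhost:8000/api/scrape \
#        -H "Content-Type: application/json" \
#        -d '{"url": "https://example.com", "query": "Extract all product names"}'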

@app.post("/api/session")
async def create_session(request: SessionCreateRequest):
    """Create a persistent scraping session"""
    session_id = str(uuid4())
    try:
        scraper_config = ScraperConfig(
            headless=True,
            max_retries=3,
            delay_after_load=5
        )
        extractor = WebExtractor(model_name=request.model_name, scraper_config=scraper_config)
        sessions[session_id] = extractor
        return {"session_id": session_id, "message": "Session created", "model": request.model_name}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to create session: {str(e)}")

@app.post("/api/session/{session_id}/scrape")
async def session_scrape(session_id: str, request: ScrapeRequest):
    """Scrape using an existing session context"""
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    
    extractor = sessions[session_id]
    try:
        full_query = f"{request.url} {request.query}"
        response = await extractor.process_query(full_query)
        
        if isinstance(response, tuple):
            response = response[0]
            
        return {
            "session_id": session_id,
            "url": request.url,
            "query": request.query,
            "response": response
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.delete("/api/session/{session_id}")
async def close_session(session_id: str):
    """Close a session and release resources"""
    if session_id in sessions:
        extractor = sessions[session_id]
        if hasattr(extractor.playwright_scraper, 'close'):
            await extractor.playwright_scraper.close()
        del sessions[session_id]
        return {"message": "Session closed", "session_id": session_id}
    raise HTTPException(status_code=404, detail="Session not found")
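
# Example session lifecycle (a sketch; the empty JSON body falls back to the
# default model, and <session_id> comes from the create response):
#   curl -X POST http://localhost:8000/api/session -H "Content-Type: application/json" -d '{}'
#   curl -X POST http://localhost:8000/api/session/<session_id>/scrape \
#        -H "Content-Type: application/json" \
#        -d '{"url": "https://example.com", "query": "Extract all links"}'
#   curl -X DELETE http://localhost:8000/api/session/<session_id>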

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
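
# Alternatively, run via the uvicorn CLI (module name assumed; substitute this
# file's actual name):
#   uvicorn main:app --host 0.0.0.0 --port 8000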