File size: 7,215 Bytes
feea636
 
 
 
 
191e833
 
 
 
 
 
 
 
feea636
 
 
 
191e833
 
feea636
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
import asyncio
import urllib.parse
from contextlib import asynccontextmanager
from typing import Dict, List, Optional

from fastapi import BackgroundTasks, FastAPI, HTTPException
from pydantic import BaseModel, HttpUrl

from main import WebScrapingOrchestrator

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan manager.

    No startup work is performed; on shutdown, the module-level
    orchestrator's connections are closed so DB/driver resources
    are released cleanly.
    """
    # Startup code (if any) goes here
    yield
    # Shutdown code goes here
    await orchestrator.close_connections()

# FastAPI application; shutdown cleanup is handled by the lifespan manager above.
app = FastAPI(
    title="Advanced Web Scraper for LLM",
    description="Scrape, analyze, and store web content optimized for LLM consumption",
    version="1.0.0",
    lifespan=lifespan,
)

# Single shared orchestrator instance used by every endpoint below.
orchestrator = WebScrapingOrchestrator()

# Pydantic models
class URLRequest(BaseModel):
    """Request body for scraping a single page; `url` is validated as an HTTP(S) URL."""
    url: HttpUrl
    
class SearchRequest(BaseModel):
    """Request body for searching stored content."""
    query: str
    # Maximum number of results to return; defaults to 5.
    limit: int = 5

class BatchURLRequest(BaseModel):
    """Request body for batch scraping; each entry is validated as an HTTP(S) URL."""
    urls: List[HttpUrl]

# Response models
class ScrapingResponse(BaseModel):
    """Response for a single-URL scrape; optional fields are absent on failure."""
    success: bool
    url: str
    title: Optional[str] = None
    summary: Optional[Dict] = None
    llm_ready_data: Optional[Dict] = None
    error: Optional[str] = None

class SearchResponse(BaseModel):
    """Response for a content search: matching documents plus their count."""
    results: List[Dict]
    total_found: int

@app.post("/scrape", response_model=ScrapingResponse)
async def scrape_url(request: URLRequest):
    """Scrape a single URL and store data optimized for LLM consumption.

    Returns a ScrapingResponse built from the orchestrator's result dict.
    Raises 400 when the orchestrator reports an error for the URL, and
    500 on any unexpected processing failure.
    """
    try:
        result = await orchestrator.process_url(str(request.url))

        if "error" in result:
            raise HTTPException(status_code=400, detail=result["error"])

        return ScrapingResponse(**result)

    except HTTPException:
        # BUG FIX: re-raise intentional HTTP errors (the 400 above) unchanged;
        # previously the blanket handler below converted them into 500s.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")

@app.post("/scrape-batch")
async def scrape_batch_urls(request: BatchURLRequest, background_tasks: BackgroundTasks):
    """Scrape multiple URLs in the background"""

    async def process_batch():
        # Best-effort sequential processing: a failure on one URL is recorded
        # as an error entry instead of aborting the remaining URLs.
        outcomes = []
        for target in request.urls:
            target_str = str(target)
            try:
                outcomes.append(await orchestrator.process_url(target_str))
            except Exception as exc:
                outcomes.append({"error": str(exc), "url": target_str})
        return outcomes

    # Scheduled to run after the response has been sent.
    background_tasks.add_task(process_batch)

    return {
        "message": f"Started processing {len(request.urls)} URLs in background",
        "urls": [str(target) for target in request.urls],
    }

@app.get("/page/{url:path}")
async def get_page_data(url: str):
    """Get processed page data optimized for LLM consumption.

    The path parameter is percent-decoded before lookup.
    Raises 404 when no stored page matches the URL, 500 on unexpected failure.
    """
    try:
        # Decode URL (uses the module-level urllib.parse import)
        decoded_url = urllib.parse.unquote(url)

        page_data = orchestrator.get_page_for_llm(decoded_url)

        if not page_data:
            raise HTTPException(status_code=404, detail="Page not found")

        return page_data

    except HTTPException:
        # BUG FIX: re-raise the intentional 404 unchanged; previously the
        # blanket handler below masked it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Retrieval failed: {str(e)}")

@app.post("/search", response_model=SearchResponse)
async def search_content(request: SearchRequest):
    """Search stored content for LLM context"""
    try:
        hits = orchestrator.search_for_llm(request.query, request.limit)
        return SearchResponse(results=hits, total_found=len(hits))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}")

@app.get("/llm-ready/{url:path}")
async def get_llm_ready_content(url: str):
    """Get content specifically formatted for LLM consumption.

    The path parameter is percent-decoded, the stored page is fetched, and
    its fields are repackaged with study suggestions for an LLM prompt.
    Raises 404 when the page is unknown, 500 on unexpected failure.
    """
    try:
        # Decode URL (uses the module-level urllib.parse import)
        decoded_url = urllib.parse.unquote(url)

        page_data = orchestrator.get_page_for_llm(decoded_url)

        if not page_data:
            raise HTTPException(status_code=404, detail="Page not found")

        # Format for LLM
        llm_content = {
            "instruction": "Use this content for generating summaries, notes, or mind maps",
            "content": {
                "title": page_data["title"],
                "main_content": page_data["content"],
                "structure": {
                    "headings": page_data["headings"],
                    "content_type": page_data["study_metadata"]["content_type"],
                    "complexity": page_data["study_metadata"]["complexity_score"],
                    "reading_time": page_data["study_metadata"]["reading_time"]
                },
                "context": {
                    "related_pages": page_data["relationships"]["related_pages"],
                    "key_topics": page_data["study_metadata"]["key_topics"]
                }
            },
            "suggestions": {
                "study_approach": _get_study_approach(page_data["study_metadata"]),
                "focus_areas": page_data["headings"][:3],
                "difficulty_level": _assess_difficulty(page_data["study_metadata"])
            }
        }

        return llm_content

    except HTTPException:
        # BUG FIX: re-raise the intentional 404 unchanged; previously the
        # blanket handler below masked it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"LLM formatting failed: {str(e)}")

@app.get("/health")
async def health_check():
    """Liveness probe: confirms the API process is up and responding."""
    return {
        "status": "healthy",
        "message": "Web scraper API is running",
    }

@app.get("/stats")
async def get_statistics():
    """Get scraping statistics"""
    try:
        # Collection-level metadata count; cheap, but may be approximate.
        page_count = orchestrator.mongo_storage.collection.estimated_document_count()
    except Exception as e:
        # Stats are best-effort: report the failure instead of erroring out.
        return {"error": f"Stats retrieval failed: {str(e)}"}

    return {
        "total_pages_scraped": page_count,
        "database_status": "connected",
        "features": [
            "Dynamic content scraping with Playwright",
            "DOM structure analysis",
            "MongoDB storage for content",
            "Neo4j for relationships",
            "LLM-optimized data extraction"
        ]
    }

def _get_study_approach(metadata: Dict) -> str:
    """Suggest study approach based on content analysis"""
    content_type = metadata.get("content_type", "general")
    complexity = metadata.get("complexity_score", 0)
    
    if content_type == "tutorial":
        return "hands-on practice with step-by-step approach"
    elif content_type == "documentation":
        return "reference-based learning with examples"
    elif content_type == "research":
        return "analytical reading with note-taking"
    elif complexity > 5:
        return "detailed study with concept mapping"
    else:
        return "general reading with summary creation"

def _assess_difficulty(metadata: Dict) -> str:
    """Assess content difficulty for LLM processing hints"""
    complexity = metadata.get("complexity_score", 0)
    reading_time = metadata.get("reading_time", 0)
    
    if complexity < 2 and reading_time < 5:
        return "beginner"
    elif complexity < 5 and reading_time < 15:
        return "intermediate"
    else:
        return "advanced"

# Run the API
if __name__ == "__main__":
    import uvicorn
    # Development server: "api:app" implies this module is api.py;
    # reload=True restarts the worker on source changes.
    uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=True)