Spaces:
Sleeping
Sleeping
File size: 7,215 Bytes
feea636 191e833 feea636 191e833 feea636 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 |
from fastapi import FastAPI, HTTPException, BackgroundTasks
from pydantic import BaseModel, HttpUrl
from typing import List, Dict, Optional
import asyncio
from main import WebScrapingOrchestrator
from contextlib import asynccontextmanager
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook.

    No startup work is performed; on shutdown the module-level
    orchestrator's connections are closed.
    """
    # Application serves requests while suspended at this yield.
    yield
    # Shutdown: release DB/driver connections held by the orchestrator.
    await orchestrator.close_connections()
# Application instance, wired to the lifespan handler for clean shutdown.
app = FastAPI(
    title="Advanced Web Scraper for LLM",
    description="Scrape, analyze, and store web content optimized for LLM consumption",
    version="1.0.0",
    lifespan=lifespan,
)

# Single orchestrator shared by every endpoint in this module.
orchestrator = WebScrapingOrchestrator()
# Pydantic models
class URLRequest(BaseModel):
    """Request body for single-URL endpoints; pydantic validates the URL."""
    url: HttpUrl  # must be a syntactically valid http(s) URL
class SearchRequest(BaseModel):
    """Request body for /search: free-text query plus result cap."""
    query: str  # search terms passed straight to the orchestrator
    limit: int = 5  # maximum number of results to return
class BatchURLRequest(BaseModel):
    """Request body for /scrape-batch: a list of URLs to process."""
    urls: List[HttpUrl]  # each entry validated as an http(s) URL
# Response models
class ScrapingResponse(BaseModel):
    """Response model for /scrape; optional fields are filled from the
    orchestrator result dict (keys not shown here are ignored)."""
    success: bool  # whether processing completed
    url: str  # the URL that was processed
    title: Optional[str] = None  # page title, if extracted
    summary: Optional[Dict] = None  # content summary, if produced
    llm_ready_data: Optional[Dict] = None  # LLM-formatted payload, if produced
    error: Optional[str] = None  # error detail on failure
class SearchResponse(BaseModel):
    """Response model for /search."""
    results: List[Dict]  # matching documents from storage
    total_found: int  # len(results); no pagination beyond the request limit
@app.post("/scrape", response_model=ScrapingResponse)
async def scrape_url(request: URLRequest):
    """Scrape a single URL and store data optimized for LLM consumption.

    Raises:
        HTTPException 400: the orchestrator reported an error for this URL.
        HTTPException 500: any unexpected processing failure.
    """
    try:
        result = await orchestrator.process_url(str(request.url))
        if "error" in result:
            raise HTTPException(status_code=400, detail=result["error"])
        return ScrapingResponse(**result)
    except HTTPException:
        # Bug fix: HTTPException subclasses Exception, so the generic handler
        # below was re-wrapping the intended 400 into a 500. Re-raise as-is.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
@app.post("/scrape-batch")
async def scrape_batch_urls(request: BatchURLRequest, background_tasks: BackgroundTasks):
    """Scrape multiple URLs in the background"""

    async def process_batch():
        # Collect per-URL outcomes; a failure on one URL does not stop the rest.
        outcomes = []
        for target in request.urls:
            try:
                outcomes.append(await orchestrator.process_url(str(target)))
            except Exception as exc:
                outcomes.append({"error": str(exc), "url": str(target)})
        return outcomes

    # Schedule the batch after the response is sent; results are not
    # returned to the caller, only stored by the orchestrator.
    background_tasks.add_task(process_batch)

    return {
        "message": f"Started processing {len(request.urls)} URLs in background",
        "urls": [str(u) for u in request.urls],
    }
@app.get("/page/{url:path}")
async def get_page_data(url: str):
    """Get processed page data optimized for LLM consumption.

    The path segment is URL-decoded before lookup.

    Raises:
        HTTPException 404: no stored page matches the decoded URL.
        HTTPException 500: any unexpected retrieval failure.
    """
    try:
        # Decode URL (clients percent-encode the target URL in the path)
        import urllib.parse
        decoded_url = urllib.parse.unquote(url)
        page_data = orchestrator.get_page_for_llm(decoded_url)
        if not page_data:
            raise HTTPException(status_code=404, detail="Page not found")
        return page_data
    except HTTPException:
        # Bug fix: without this clause the 404 above was caught by the
        # generic handler and surfaced as a 500 "Retrieval failed: ...".
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Retrieval failed: {str(e)}")
@app.post("/search", response_model=SearchResponse)
async def search_content(request: SearchRequest):
    """Search stored content for LLM context"""
    try:
        matches = orchestrator.search_for_llm(request.query, request.limit)
        return SearchResponse(results=matches, total_found=len(matches))
    except Exception as exc:
        # Any storage/search failure surfaces as a 500 with the reason.
        raise HTTPException(status_code=500, detail=f"Search failed: {str(exc)}")
@app.get("/llm-ready/{url:path}")
async def get_llm_ready_content(url: str):
    """Get content specifically formatted for LLM consumption.

    Decodes the percent-encoded URL path, fetches the stored page, and
    packages it with structure metadata and study suggestions.

    Raises:
        HTTPException 404: no stored page matches the decoded URL.
        HTTPException 500: any unexpected formatting failure.
    """
    try:
        import urllib.parse
        decoded_url = urllib.parse.unquote(url)
        page_data = orchestrator.get_page_for_llm(decoded_url)
        if not page_data:
            raise HTTPException(status_code=404, detail="Page not found")
        # Format for LLM: instruction + structured content + study hints.
        llm_content = {
            "instruction": "Use this content for generating summaries, notes, or mind maps",
            "content": {
                "title": page_data["title"],
                "main_content": page_data["content"],
                "structure": {
                    "headings": page_data["headings"],
                    "content_type": page_data["study_metadata"]["content_type"],
                    "complexity": page_data["study_metadata"]["complexity_score"],
                    "reading_time": page_data["study_metadata"]["reading_time"]
                },
                "context": {
                    "related_pages": page_data["relationships"]["related_pages"],
                    "key_topics": page_data["study_metadata"]["key_topics"]
                }
            },
            "suggestions": {
                "study_approach": _get_study_approach(page_data["study_metadata"]),
                "focus_areas": page_data["headings"][:3],
                "difficulty_level": _assess_difficulty(page_data["study_metadata"])
            }
        }
        return llm_content
    except HTTPException:
        # Bug fix: re-raise the intended 404 instead of letting the generic
        # handler below rewrap it as a 500 "LLM formatting failed: ...".
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"LLM formatting failed: {str(e)}")
@app.get("/health")
async def health_check():
    """Health check endpoint"""
    # Liveness only — does not touch storage backends.
    return dict(status="healthy", message="Web scraper API is running")
@app.get("/stats")
async def get_statistics():
    """Get scraping statistics"""
    # Static capability list advertised alongside the live counts.
    feature_list = [
        "Dynamic content scraping with Playwright",
        "DOM structure analysis",
        "MongoDB storage for content",
        "Neo4j for relationships",
        "LLM-optimized data extraction",
    ]
    try:
        # Estimated count avoids a full collection scan in MongoDB.
        page_count = orchestrator.mongo_storage.collection.estimated_document_count()
    except Exception as exc:
        # Deliberately best-effort: report the failure instead of a 500.
        return {"error": f"Stats retrieval failed: {str(exc)}"}
    return {
        "total_pages_scraped": page_count,
        "database_status": "connected",
        "features": feature_list,
    }
def _get_study_approach(metadata: Dict) -> str:
"""Suggest study approach based on content analysis"""
content_type = metadata.get("content_type", "general")
complexity = metadata.get("complexity_score", 0)
if content_type == "tutorial":
return "hands-on practice with step-by-step approach"
elif content_type == "documentation":
return "reference-based learning with examples"
elif content_type == "research":
return "analytical reading with note-taking"
elif complexity > 5:
return "detailed study with concept mapping"
else:
return "general reading with summary creation"
def _assess_difficulty(metadata: Dict) -> str:
"""Assess content difficulty for LLM processing hints"""
complexity = metadata.get("complexity_score", 0)
reading_time = metadata.get("reading_time", 0)
if complexity < 2 and reading_time < 5:
return "beginner"
elif complexity < 5 and reading_time < 15:
return "intermediate"
else:
return "advanced"
# Run the API directly (development mode: reload=True watches for file
# changes). The "api:app" import string assumes this module is saved as
# api.py — TODO confirm filename. Binds all interfaces on port 8000.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=True)