File size: 7,215 Bytes
feea636
 
 
 
 
191e833
 
 
 
 
 
 
 
feea636
 
 
 
191e833
 
feea636
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
import asyncio
import urllib.parse
from contextlib import asynccontextmanager
from typing import Dict, List, Optional

from fastapi import BackgroundTasks, FastAPI, HTTPException
from pydantic import BaseModel, HttpUrl

from main import WebScrapingOrchestrator

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan manager.

    No startup work is performed; on shutdown, the module-level
    orchestrator's connections are closed so DB/driver resources
    are released cleanly.
    """
    # Startup code (if any) goes here
    yield
    # Shutdown code goes here
    await orchestrator.close_connections()

# FastAPI application; shutdown cleanup is handled by the lifespan manager above.
app = FastAPI(
    title="Advanced Web Scraper for LLM",
    description="Scrape, analyze, and store web content optimized for LLM consumption",
    version="1.0.0",
    lifespan=lifespan,
)

# Single shared orchestrator instance used by every endpoint below.
orchestrator = WebScrapingOrchestrator()

# Pydantic models
class URLRequest(BaseModel):
    """Request body for scraping a single page; `url` is validated as an HTTP(S) URL."""
    url: HttpUrl
    
class SearchRequest(BaseModel):
    """Request body for searching stored content."""
    query: str
    # Maximum number of results to return; defaults to 5.
    limit: int = 5

class BatchURLRequest(BaseModel):
    """Request body for batch scraping; each entry is validated as an HTTP(S) URL."""
    urls: List[HttpUrl]

# Response models
class ScrapingResponse(BaseModel):
    """Response for a single-URL scrape; optional fields are absent on failure."""
    success: bool
    url: str
    title: Optional[str] = None
    summary: Optional[Dict] = None
    llm_ready_data: Optional[Dict] = None
    error: Optional[str] = None

class SearchResponse(BaseModel):
    """Response for a content search: matching documents plus their count."""
    results: List[Dict]
    total_found: int

@app.post("/scrape", response_model=ScrapingResponse)
async def scrape_url(request: URLRequest):
    """Scrape a single URL and store data optimized for LLM consumption.

    Returns a ScrapingResponse built from the orchestrator's result dict.
    Raises 400 when the orchestrator reports an error for the URL, and
    500 on any unexpected processing failure.
    """
    try:
        result = await orchestrator.process_url(str(request.url))

        if "error" in result:
            raise HTTPException(status_code=400, detail=result["error"])

        return ScrapingResponse(**result)

    except HTTPException:
        # BUG FIX: re-raise intentional HTTP errors (the 400 above) unchanged;
        # previously the blanket handler below converted them into 500s.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")

@app.post("/scrape-batch")
async def scrape_batch_urls(request: BatchURLRequest, background_tasks: BackgroundTasks):
    """Scrape multiple URLs in the background"""

    async def process_batch():
        # Best-effort sequential processing: a failure on one URL is recorded
        # as an error entry instead of aborting the remaining URLs.
        outcomes = []
        for target in request.urls:
            target_str = str(target)
            try:
                outcomes.append(await orchestrator.process_url(target_str))
            except Exception as exc:
                outcomes.append({"error": str(exc), "url": target_str})
        return outcomes

    # Scheduled to run after the response has been sent.
    background_tasks.add_task(process_batch)

    return {
        "message": f"Started processing {len(request.urls)} URLs in background",
        "urls": [str(target) for target in request.urls],
    }

@app.get("/page/{url:path}")
async def get_page_data(url: str):
    """Get processed page data optimized for LLM consumption.

    The path parameter is percent-decoded before lookup.
    Raises 404 when no stored page matches the URL, 500 on unexpected failure.
    """
    try:
        # Decode URL (uses the module-level urllib.parse import)
        decoded_url = urllib.parse.unquote(url)

        page_data = orchestrator.get_page_for_llm(decoded_url)

        if not page_data:
            raise HTTPException(status_code=404, detail="Page not found")

        return page_data

    except HTTPException:
        # BUG FIX: re-raise the intentional 404 unchanged; previously the
        # blanket handler below masked it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Retrieval failed: {str(e)}")

@app.post("/search", response_model=SearchResponse)
async def search_content(request: SearchRequest):
    """Search stored content for LLM context"""
    try:
        hits = orchestrator.search_for_llm(request.query, request.limit)
        return SearchResponse(results=hits, total_found=len(hits))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}")

@app.get("/llm-ready/{url:path}")
async def get_llm_ready_content(url: str):
    """Get content specifically formatted for LLM consumption.

    The path parameter is percent-decoded, the stored page is fetched, and
    its fields are repackaged with study suggestions for an LLM prompt.
    Raises 404 when the page is unknown, 500 on unexpected failure.
    """
    try:
        # Decode URL (uses the module-level urllib.parse import)
        decoded_url = urllib.parse.unquote(url)

        page_data = orchestrator.get_page_for_llm(decoded_url)

        if not page_data:
            raise HTTPException(status_code=404, detail="Page not found")

        # Format for LLM
        llm_content = {
            "instruction": "Use this content for generating summaries, notes, or mind maps",
            "content": {
                "title": page_data["title"],
                "main_content": page_data["content"],
                "structure": {
                    "headings": page_data["headings"],
                    "content_type": page_data["study_metadata"]["content_type"],
                    "complexity": page_data["study_metadata"]["complexity_score"],
                    "reading_time": page_data["study_metadata"]["reading_time"]
                },
                "context": {
                    "related_pages": page_data["relationships"]["related_pages"],
                    "key_topics": page_data["study_metadata"]["key_topics"]
                }
            },
            "suggestions": {
                "study_approach": _get_study_approach(page_data["study_metadata"]),
                "focus_areas": page_data["headings"][:3],
                "difficulty_level": _assess_difficulty(page_data["study_metadata"])
            }
        }

        return llm_content

    except HTTPException:
        # BUG FIX: re-raise the intentional 404 unchanged; previously the
        # blanket handler below masked it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"LLM formatting failed: {str(e)}")

@app.get("/health")
async def health_check():
    """Liveness probe: confirms the API process is up and responding."""
    return {
        "status": "healthy",
        "message": "Web scraper API is running",
    }

@app.get("/stats")
async def get_statistics():
    """Get scraping statistics"""
    try:
        # Collection-level metadata count; cheap, but may be approximate.
        page_count = orchestrator.mongo_storage.collection.estimated_document_count()
    except Exception as e:
        # Stats are best-effort: report the failure instead of erroring out.
        return {"error": f"Stats retrieval failed: {str(e)}"}

    return {
        "total_pages_scraped": page_count,
        "database_status": "connected",
        "features": [
            "Dynamic content scraping with Playwright",
            "DOM structure analysis",
            "MongoDB storage for content",
            "Neo4j for relationships",
            "LLM-optimized data extraction"
        ]
    }

def _get_study_approach(metadata: Dict) -> str:
    """Suggest study approach based on content analysis"""
    content_type = metadata.get("content_type", "general")
    complexity = metadata.get("complexity_score", 0)
    
    if content_type == "tutorial":
        return "hands-on practice with step-by-step approach"
    elif content_type == "documentation":
        return "reference-based learning with examples"
    elif content_type == "research":
        return "analytical reading with note-taking"
    elif complexity > 5:
        return "detailed study with concept mapping"
    else:
        return "general reading with summary creation"

def _assess_difficulty(metadata: Dict) -> str:
    """Assess content difficulty for LLM processing hints"""
    complexity = metadata.get("complexity_score", 0)
    reading_time = metadata.get("reading_time", 0)
    
    if complexity < 2 and reading_time < 5:
        return "beginner"
    elif complexity < 5 and reading_time < 15:
        return "intermediate"
    else:
        return "advanced"

# Run the API
if __name__ == "__main__":
    import uvicorn
    # Development server: "api:app" implies this module is api.py;
    # reload=True restarts the worker on source changes.
    uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=True)