Spaces:
Sleeping
Sleeping
Commit
·
feea636
1
Parent(s):
80a0aa2
add files
Browse files
- .gitignore +49 -0
- Dockerfile +34 -0
- app.py +211 -0
- main.py +184 -0
- requirements.txt +16 -0
- scraper/__init__.py +0 -0
- scraper/data_extractor.py +176 -0
- scraper/dom_analyzer.py +162 -0
- scraper/html_loader.py +68 -0
- server.py +33 -0
- storage/__init__.py +0 -0
- storage/mongo_storage.py +143 -0
- storage/neo4j_storage.py +216 -0
.gitignore
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Sensitive files
|
| 2 |
+
certification.pem
|
| 3 |
+
*.pem
|
| 4 |
+
*.key
|
| 5 |
+
*.crt
|
| 6 |
+
*.cert
|
| 7 |
+
|
| 8 |
+
# Python temporary files
|
| 9 |
+
__pycache__/
|
| 10 |
+
*.pyc
|
| 11 |
+
*.pyo
|
| 12 |
+
*.pyd
|
| 13 |
+
.Python
|
| 14 |
+
pip-log.txt
|
| 15 |
+
pip-delete-this-directory.txt
|
| 16 |
+
|
| 17 |
+
# Virtual environments
|
| 18 |
+
venv/
|
| 19 |
+
env/
|
| 20 |
+
.venv/
|
| 21 |
+
.env
|
| 22 |
+
|
| 23 |
+
# Docker artifacts
|
| 24 |
+
*.dockerignore
|
| 25 |
+
Dockerfile.bak
|
| 26 |
+
*.log
|
| 27 |
+
|
| 28 |
+
# Editor and IDE files
|
| 29 |
+
.vscode/
|
| 30 |
+
.idea/
|
| 31 |
+
*.sublime-project
|
| 32 |
+
*.sublime-workspace
|
| 33 |
+
|
| 34 |
+
# System files
|
| 35 |
+
.DS_Store
|
| 36 |
+
Thumbs.db
|
| 37 |
+
|
| 38 |
+
# Local development and testing
|
| 39 |
+
*.swp
|
| 40 |
+
*.swo
|
| 41 |
+
*.tmp
|
| 42 |
+
*.bak
|
| 43 |
+
*.backup
|
| 44 |
+
|
| 45 |
+
# Cache and temporary directories
|
| 46 |
+
.cache/
|
| 47 |
+
*.cache
|
| 48 |
+
*.egg-info/
|
| 49 |
+
dist
|
Dockerfile
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9

# System libraries that Playwright's bundled browsers link against
# (kept alphabetical; apt-get install is order-independent).
RUN apt-get update && apt-get install -y \
    libasound2 \
    libatk-bridge2.0-0 \
    libatk1.0-0 \
    libcairo2 \
    libgbm1 \
    libnss3 \
    libpango-1.0-0 \
    libxcomposite1 \
    libxdamage1 \
    libxrandr2 \
    libxshmfence1 \
    && rm -rf /var/lib/apt/lists/*

# Run as an unprivileged user with uid 1000 (Hugging Face Spaces convention)
# and make user-level pip installs resolvable.
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Install Python dependencies first so this layer stays cached across
# source-only edits.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Download Playwright's browser binaries into the user's home directory.
RUN playwright install

COPY --chown=user . /app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from fastapi import FastAPI, HTTPException, BackgroundTasks
from pydantic import BaseModel, HttpUrl
from typing import List, Dict, Optional
import asyncio
from main import WebScrapingOrchestrator

app = FastAPI(
    title="Advanced Web Scraper for LLM",
    description="Scrape, analyze, and store web content optimized for LLM consumption",
    version="1.0.0"
)

# Global orchestrator instance shared by all request handlers.
orchestrator = WebScrapingOrchestrator()

# ---------------------------------------------------------------- request models

class URLRequest(BaseModel):
    url: HttpUrl

class SearchRequest(BaseModel):
    query: str
    limit: int = 5

class BatchURLRequest(BaseModel):
    urls: List[HttpUrl]

# --------------------------------------------------------------- response models

class ScrapingResponse(BaseModel):
    success: bool
    url: str
    title: Optional[str] = None
    summary: Optional[Dict] = None
    llm_ready_data: Optional[Dict] = None
    error: Optional[str] = None

class SearchResponse(BaseModel):
    results: List[Dict]
    total_found: int

@app.post("/scrape", response_model=ScrapingResponse)
async def scrape_url(request: URLRequest):
    """Scrape a single URL and store data optimized for LLM consumption."""
    try:
        result = await orchestrator.process_url(str(request.url))

        if "error" in result:
            raise HTTPException(status_code=400, detail=result["error"])

        return ScrapingResponse(**result)

    except HTTPException:
        # BUG FIX: re-raise HTTP errors unchanged — the generic handler below
        # previously caught the 400 above and re-wrapped it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")

@app.post("/scrape-batch")
async def scrape_batch_urls(request: BatchURLRequest, background_tasks: BackgroundTasks):
    """Scrape multiple URLs in the background."""
    async def process_batch():
        # Best-effort: a failure on one URL is recorded and does not stop the rest.
        results = []
        for url in request.urls:
            try:
                result = await orchestrator.process_url(str(url))
                results.append(result)
            except Exception as e:
                results.append({"error": str(e), "url": str(url)})
        return results

    # Fire-and-forget; the caller only gets an acknowledgement.
    background_tasks.add_task(process_batch)

    return {
        "message": f"Started processing {len(request.urls)} URLs in background",
        "urls": [str(url) for url in request.urls]
    }

@app.get("/page/{url:path}")
async def get_page_data(url: str):
    """Get processed page data optimized for LLM consumption."""
    try:
        # The path parameter arrives percent-encoded; decode it first.
        import urllib.parse
        decoded_url = urllib.parse.unquote(url)

        page_data = orchestrator.get_page_for_llm(decoded_url)

        if not page_data:
            raise HTTPException(status_code=404, detail="Page not found")

        return page_data

    except HTTPException:
        # BUG FIX: keep the 404 as a 404 instead of re-wrapping it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Retrieval failed: {str(e)}")

@app.post("/search", response_model=SearchResponse)
async def search_content(request: SearchRequest):
    """Search stored content for LLM context."""
    try:
        results = orchestrator.search_for_llm(request.query, request.limit)

        return SearchResponse(
            results=results,
            total_found=len(results)
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}")

@app.get("/llm-ready/{url:path}")
async def get_llm_ready_content(url: str):
    """Get content specifically formatted for LLM consumption."""
    try:
        import urllib.parse
        decoded_url = urllib.parse.unquote(url)

        page_data = orchestrator.get_page_for_llm(decoded_url)

        if not page_data:
            raise HTTPException(status_code=404, detail="Page not found")

        # Re-shape the stored record into an LLM-oriented envelope.
        llm_content = {
            "instruction": "Use this content for generating summaries, notes, or mind maps",
            "content": {
                "title": page_data["title"],
                "main_content": page_data["content"],
                "structure": {
                    "headings": page_data["headings"],
                    "content_type": page_data["study_metadata"]["content_type"],
                    "complexity": page_data["study_metadata"]["complexity_score"],
                    "reading_time": page_data["study_metadata"]["reading_time"]
                },
                "context": {
                    "related_pages": page_data["relationships"]["related_pages"],
                    "key_topics": page_data["study_metadata"]["key_topics"]
                }
            },
            "suggestions": {
                "study_approach": _get_study_approach(page_data["study_metadata"]),
                "focus_areas": page_data["headings"][:3],
                "difficulty_level": _assess_difficulty(page_data["study_metadata"])
            }
        }

        return llm_content

    except HTTPException:
        # BUG FIX: keep the 404 as a 404 instead of re-wrapping it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"LLM formatting failed: {str(e)}")

@app.get("/health")
async def health_check():
    """Health check endpoint."""
    return {"status": "healthy", "message": "Web scraper API is running"}

@app.get("/stats")
async def get_statistics():
    """Get scraping statistics."""
    try:
        # Cheap approximate count straight from MongoDB.
        mongo_stats = orchestrator.mongo_storage.collection.estimated_document_count()

        return {
            "total_pages_scraped": mongo_stats,
            "database_status": "connected",
            "features": [
                "Dynamic content scraping with Playwright",
                "DOM structure analysis",
                "MongoDB storage for content",
                "Neo4j for relationships",
                "LLM-optimized data extraction"
            ]
        }

    except Exception as e:
        # Deliberately best-effort: stats failures are reported, not raised.
        return {"error": f"Stats retrieval failed: {str(e)}"}
| 175 |
+
def _get_study_approach(metadata: Dict) -> str:
|
| 176 |
+
"""Suggest study approach based on content analysis"""
|
| 177 |
+
content_type = metadata.get("content_type", "general")
|
| 178 |
+
complexity = metadata.get("complexity_score", 0)
|
| 179 |
+
|
| 180 |
+
if content_type == "tutorial":
|
| 181 |
+
return "hands-on practice with step-by-step approach"
|
| 182 |
+
elif content_type == "documentation":
|
| 183 |
+
return "reference-based learning with examples"
|
| 184 |
+
elif content_type == "research":
|
| 185 |
+
return "analytical reading with note-taking"
|
| 186 |
+
elif complexity > 5:
|
| 187 |
+
return "detailed study with concept mapping"
|
| 188 |
+
else:
|
| 189 |
+
return "general reading with summary creation"
|
| 190 |
+
|
| 191 |
+
def _assess_difficulty(metadata: Dict) -> str:
|
| 192 |
+
"""Assess content difficulty for LLM processing hints"""
|
| 193 |
+
complexity = metadata.get("complexity_score", 0)
|
| 194 |
+
reading_time = metadata.get("reading_time", 0)
|
| 195 |
+
|
| 196 |
+
if complexity < 2 and reading_time < 5:
|
| 197 |
+
return "beginner"
|
| 198 |
+
elif complexity < 5 and reading_time < 15:
|
| 199 |
+
return "intermediate"
|
| 200 |
+
else:
|
| 201 |
+
return "advanced"
|
| 202 |
+
|
@app.on_event("shutdown")
async def shutdown_event():
    """Close database connections when the server stops."""
    orchestrator.close_connections()

# Run the API directly (the Dockerfile serves the same "app:app" target).
if __name__ == "__main__":
    import uvicorn
    # BUG FIX: this module is app.py, so the import string must be "app:app".
    # The previous "api:app" made `python app.py` fail with
    # "Could not import module 'api'".
    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)
main.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
from typing import Dict, Optional,List
|
| 3 |
+
from scraper.html_loader import HTMLLoader
|
| 4 |
+
from scraper.data_extractor import DataExtractor
|
| 5 |
+
from scraper.dom_analyzer import DOMAnalyzer
|
| 6 |
+
from storage.mongo_storage import MongoStorage
|
| 7 |
+
# from storage.neo4j_storage import Neo4jStorage
|
| 8 |
+
|
class WebScrapingOrchestrator:
    """Coordinates the scrape -> extract -> analyze -> store pipeline.

    Neo4j support is currently disabled (its import and attribute are
    commented out), so every Neo4j access below is guarded with getattr;
    re-enabling it in __init__ restores relationship data automatically.
    """

    def __init__(self):
        self.data_extractor = DataExtractor()
        self.dom_analyzer = DOMAnalyzer()
        self.mongo_storage = MongoStorage()
        # self.neo4j_storage = Neo4jStorage()

    async def process_url(self, url: str) -> Dict:
        """Complete pipeline to process a URL for LLM consumption.

        Returns an LLM-ready summary dict on success, or
        {"error": ..., "url": ...} on failure (never raises).
        """
        try:
            print(f"Processing URL: {url}")

            # Step 1: Load HTML content
            async with HTMLLoader() as loader:
                html_data = await loader.load_page(url)

            if not html_data:
                return {"error": "Failed to load page"}

            print("✓ HTML loaded successfully")

            # Step 2: Extract structured data
            extracted_data = self.data_extractor.extract_structured_data(
                html_data["html"],
                html_data["url"]
            )

            print("✓ Data extracted successfully")

            # Step 3: Analyze DOM structure
            dom_structure = self.dom_analyzer.analyze_structure(html_data["html"])

            print("✓ DOM structure analyzed")

            # Step 4: Store in MongoDB
            mongo_id = self.mongo_storage.store_page_data(
                html_data["url"],
                extracted_data,
                dom_structure
            )

            print("✓ Data stored in MongoDB")

            # Step 5: Neo4j relationship storage — currently disabled.
            # BUG FIX: the old code still printed "✓ Relationships stored in
            # Neo4j" even though this step is commented out; that false status
            # line has been removed.
            # self.neo4j_storage.store_relationships(
            #     html_data["url"],
            #     extracted_data,
            #     dom_structure
            # )

            # Return LLM-ready summary
            return {
                "success": True,
                "url": html_data["url"],
                "title": html_data["title"],
                "mongo_id": mongo_id,
                "summary": {
                    "content_blocks": len(extracted_data["content"]),
                    "text_length": len(extracted_data["text_summary"]),
                    "links_found": len(extracted_data["links"]),
                    "images_found": len(extracted_data["images"]),
                    "dom_depth": dom_structure["statistics"]["max_depth"],
                    "content_type": self._identify_content_type(extracted_data)
                },
                "llm_ready_data": {
                    "text_summary": extracted_data["text_summary"],
                    "key_headings": [h["text"] for h in extracted_data["metadata"]["headings"][:5]],
                    "main_topics": self._extract_main_topics(extracted_data),
                    "study_hints": self._generate_study_hints(extracted_data, dom_structure)
                }
            }

        except Exception as e:
            print(f"✗ Error processing {url}: {str(e)}")
            return {"error": str(e), "url": url}

    def get_page_for_llm(self, url: str) -> Optional[Dict]:
        """Retrieve page data optimized for LLM consumption.

        Returns None when the page has not been scraped yet.
        """
        # Get from MongoDB
        mongo_data = self.mongo_storage.get_page_data(url)
        if not mongo_data:
            return None

        # BUG FIX: self.neo4j_storage is never assigned while Neo4j is
        # disabled, so the unguarded attribute access used to raise
        # AttributeError on every call. Fall back to empty relationships.
        neo4j = getattr(self, "neo4j_storage", None)
        neo4j_data = neo4j.get_page_relationships(url) if neo4j else {}

        # Combine for LLM
        return {
            "content": mongo_data["content"]["text_summary"],
            "title": mongo_data["title"],
            "headings": [h["text"] for h in mongo_data["content"]["headings"]],
            "structure": mongo_data["study_metadata"],
            "relationships": {
                "related_pages": neo4j_data.get("internal_links", [])[:5],
                "external_references": neo4j_data.get("external_links", [])[:3]
            },
            "study_metadata": mongo_data["study_metadata"]
        }

    def search_for_llm(self, query: str, limit: int = 5) -> List[Dict]:
        """Search stored content and trim each hit down to LLM-sized context."""
        results = self.mongo_storage.search_pages(query, limit)

        llm_ready_results = []
        for result in results:
            llm_ready_results.append({
                "url": result["url"],
                "title": result["title"],
                "summary": result["content"]["text_summary"][:500],
                "content_type": result["study_metadata"]["content_type"],
                "complexity": result["study_metadata"]["complexity_score"],
                "key_topics": result["study_metadata"]["key_topics"][:5]
            })

        return llm_ready_results

    def _identify_content_type(self, data: Dict) -> str:
        """Classify the page (tutorial/documentation/article/research/general)
        from keywords in the title, then the body text."""
        title = data["metadata"]["title"].lower()
        text = data["text_summary"].lower()

        if any(word in title for word in ["tutorial", "guide", "how to"]):
            return "tutorial"
        elif any(word in title for word in ["documentation", "docs", "api"]):
            return "documentation"
        elif any(word in title for word in ["blog", "article", "news"]):
            return "article"
        elif any(word in text for word in ["research", "study", "analysis"]):
            return "research"
        return "general"

    def _extract_main_topics(self, data: Dict) -> List[str]:
        """Extract up to 5 topic words (length > 3) from title and headings.

        NOTE: set ordering is hash-dependent, so the order of the result is
        not stable across processes — only membership is.
        """
        topics = set()

        # From title
        title_words = [word for word in data["metadata"]["title"].split() if len(word) > 3]
        topics.update(title_words[:3])

        # From headings
        for heading in data["metadata"]["headings"][:3]:
            heading_words = [word for word in heading["text"].split() if len(word) > 3]
            topics.update(heading_words[:2])

        return list(topics)[:5]

    def _generate_study_hints(self, extracted_data: Dict, dom_structure: Dict) -> Dict:
        """Generate study hints for LLM processing (difficulty, time, structure)."""
        return {
            "difficulty_level": "beginner" if len(extracted_data["text_summary"]) < 2000 else "intermediate",
            # Rough estimate at ~250 words per minute.
            "estimated_study_time": f"{len(extracted_data['text_summary'].split()) // 250} minutes",
            "content_structure": "well_structured" if len(extracted_data["metadata"]["headings"]) > 3 else "basic",
            "has_examples": "code" in extracted_data["text_summary"].lower(),
            "interactive_elements": dom_structure["statistics"]["tag_distribution"].get("form", 0) > 0
        }

    def close_connections(self):
        """Close all database connections.

        BUG FIX: previously called self.neo4j_storage.close() unconditionally,
        which raised AttributeError because that attribute is commented out.
        """
        neo4j = getattr(self, "neo4j_storage", None)
        if neo4j:
            neo4j.close()
# Main execution function
async def main():
    """Run the full pipeline once against a sample page, then clean up."""
    orchestrator = WebScrapingOrchestrator()

    # Example usage: one well-known Wikipedia article.
    sample_url = "https://en.wikipedia.org/wiki/Virat_Kohli"
    result = await orchestrator.process_url(sample_url)
    print(f"Processing result: {result}")

    # Release database connections before the interpreter exits.
    orchestrator.close_connections()

if __name__ == "__main__":
    asyncio.run(main())
requirements.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.104.1
|
| 2 |
+
uvicorn==0.24.0
|
| 3 |
+
playwright==1.40.0
|
| 4 |
+
beautifulsoup4==4.12.2
|
| 5 |
+
pymongo==4.6.0
|
| 6 |
+
neo4j==5.15.0
|
| 7 |
+
pydantic==2.5.2
|
| 8 |
+
python-multipart==0.0.6
|
| 9 |
+
aiofiles==23.2.1
|
| 10 |
+
requests==2.31.0
|
| 11 |
+
lxml==4.9.3
|
| 12 |
+
newspaper3k==0.2.8
|
| 13 |
+
readability-lxml==0.8.1
|
| 14 |
+
python-dotenv==1.0.0
|
| 15 |
+
nltk==3.8.1
|
| 16 |
+
spacy==3.7.2
|
scraper/__init__.py
ADDED
|
File without changes
|
scraper/data_extractor.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bs4 import BeautifulSoup, Comment
|
| 2 |
+
from typing import Dict, List, Optional
|
| 3 |
+
import re
|
| 4 |
+
from urllib.parse import urljoin, urlparse
|
| 5 |
+
|
class DataExtractor:
    """Extracts LLM-oriented structured data (content, metadata, links,
    images, plain-text summary) from raw HTML with BeautifulSoup."""

    def __init__(self):
        # Elements stripped before extraction: chrome, ads, scripts, forms.
        self.ignore_selectors = [
            '.advertisement',
            '.ad',
            '.banner',
            '.popup',
            '#footer',
            '.footer',
            '.sidebar',
            'nav',
            '.navbar',
            '.menu',
            'header',
            '#header',
            'script',
            'style',
            'noscript',
            'iframe',
            'meta',
            'link',
            '[class*="ad-"]',
            '[id*="ad-"]',
            '.cookie-notice',
            '.modal',
            'form',
            'input',
            'button',
            '.social-media',
            '.comments-section',
            '.widget'
        ]
        # Candidate containers scanned for substantial text blocks.
        # NOTE(review): these overlap (e.g. 'article' and its inner 'p'), so
        # the same text can appear in several content blocks — apparently
        # accepted for recall; confirm before deduplicating.
        self.content_selectors = [
            '.main-content',
            'article',
            'p',
            'h1',
            'h2',
            'h3',
            'h4',
            'h5',
            'h6',
            'div.content',
            '.post',
            '.article-body',
            '.content-body',
            'section',
            'main',
            'ul',
            'ol',
            'li',
            'table',
            'td',
            'th',
            'blockquote',
            'pre',
            '.text',
            '[class*="content"]',
            '[class*="post"]',
            '[class*="article"]',
            'div:not([class*="ad"]):not([class*="banner"]):not([class*="sidebar"])'
        ]
        # Minimum characters for an element to count as a content block.
        self.min_text_length = 200

    def extract_structured_data(self, html: str, url: str) -> Dict:
        """Extract structured data from HTML for LLM consumption."""
        soup = BeautifulSoup(html, 'lxml')

        # Remove unwanted elements
        self._clean_html(soup)

        return {
            "content": self._extract_content(soup),
            "metadata": self._extract_metadata(soup, url),
            "structure": self._extract_structure(soup),
            "links": self._extract_links(soup, url),
            "images": self._extract_images(soup, url),
            "text_summary": self._extract_text_summary(soup)
        }

    def _clean_html(self, soup: BeautifulSoup):
        """Remove unwanted elements for cleaner extraction (in place)."""
        for selector in self.ignore_selectors:
            for element in soup.select(selector):
                element.decompose()

        # Remove HTML comments.
        # BUG FIX: the `text=` keyword has been deprecated in favor of
        # `string=` since BeautifulSoup 4.4; find_all(string=...) is the
        # supported spelling and behaves identically here.
        for element in soup.find_all(string=lambda text: isinstance(text, Comment)):
            element.extract()

    def _extract_content(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract main content blocks (text runs >= min_text_length)."""
        content_blocks = []

        for selector in self.content_selectors:
            elements = soup.select(selector)
            for elem in elements:
                text = elem.get_text(strip=True)
                if len(text) >= self.min_text_length:
                    content_blocks.append({
                        "tag": elem.name,
                        "text": text,
                        "html": str(elem),
                        "attributes": dict(elem.attrs) if elem.attrs else {}
                    })

        return content_blocks

    def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict:
        """Extract page metadata (title, description, domain, headings)."""
        title = soup.find('title')
        meta_desc = soup.find('meta', attrs={'name': 'description'})

        return {
            "title": title.get_text().strip() if title else "",
            "description": meta_desc.get('content', '') if meta_desc else "",
            "url": url,
            "domain": urlparse(url).netloc,
            "headings": self._extract_headings(soup)
        }

    def _extract_headings(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract heading hierarchy (h1..h6) for structure."""
        headings = []
        for i in range(1, 7):
            for heading in soup.find_all(f'h{i}'):
                headings.append({
                    "level": i,
                    "text": heading.get_text().strip(),
                    "id": heading.get('id', '')
                })
        return headings

    def _extract_structure(self, soup: BeautifulSoup) -> Dict:
        """Extract coarse DOM element counts for relationship analysis."""
        return {
            "sections": len(soup.find_all(['section', 'article', 'div'])),
            "paragraphs": len(soup.find_all('p')),
            "lists": len(soup.find_all(['ul', 'ol'])),
            "tables": len(soup.find_all('table')),
            "forms": len(soup.find_all('form'))
        }

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract absolute links, flagging same-domain ones as internal."""
        links = []
        for link in soup.find_all('a', href=True):
            href = urljoin(base_url, link['href'])
            links.append({
                "url": href,
                "text": link.get_text().strip(),
                "internal": urlparse(href).netloc == urlparse(base_url).netloc
            })
        return links[:50]  # Limit for performance

    def _extract_images(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract images with alt/caption context."""
        images = []
        for img in soup.find_all('img', src=True):
            images.append({
                "src": urljoin(base_url, img['src']),
                "alt": img.get('alt', ''),
                "caption": img.get('title', '')
            })
        return images[:20]  # Limit for performance

    def _extract_text_summary(self, soup: BeautifulSoup) -> str:
        """Extract whitespace-normalized plain text, capped at 5000 chars."""
        text = soup.get_text()
        # Clean whitespace and normalize
        text = re.sub(r'\s+', ' ', text).strip()
        return text[:5000]  # Limit for token efficiency
scraper/dom_analyzer.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bs4 import BeautifulSoup
|
| 2 |
+
from typing import Dict, List
|
| 3 |
+
import hashlib
|
| 4 |
+
|
| 5 |
+
class DOMAnalyzer:
    """Analyze an HTML document's DOM and summarize it for LLM consumption.

    Produces a depth/width-bounded DOM tree, tag statistics, semantic
    structure information, and a ranked list of likely main-content blocks.
    """

    def __init__(self):
        pass

    def analyze_structure(self, html: str) -> Dict:
        """Analyze DOM structure and create tree representation.

        Returns a dict with keys: tree, statistics, semantic_structure,
        content_blocks.
        """
        soup = BeautifulSoup(html, 'lxml')

        return {
            # Prefer <body> as the tree root; fall back to the whole document.
            "tree": self._build_dom_tree(soup.body if soup.body else soup),
            "statistics": self._get_dom_statistics(soup),
            "semantic_structure": self._analyze_semantic_structure(soup),
            "content_blocks": self._identify_content_blocks(soup)
        }

    def _build_dom_tree(self, element, depth=0, max_depth=5) -> Dict:
        """Build a hierarchical DOM tree, bounded in depth and fan-out.

        Returns {} for non-tag nodes or when *max_depth* is exceeded.
        """
        if depth > max_depth or not element or not hasattr(element, 'name'):
            return {}

        # Hoist the (potentially expensive) text extraction so it runs once
        # per node instead of twice.
        text = element.get_text()
        node = {
            "tag": element.name if element.name else "text",
            "id": element.get('id', ''),
            "classes": element.get('class', []),
            "text_content": text[:100] if text else "",
            "children": [],
            "attributes": dict(element.attrs) if hasattr(element, 'attrs') else {},
            "depth": depth,
            # Short fingerprint of the node's serialized prefix for reference.
            "node_id": hashlib.md5(str(element)[:500].encode()).hexdigest()[:8]
        }

        # Add children (limit fan-out to prevent huge trees).
        if hasattr(element, 'children') and depth < max_depth:
            child_count = 0
            for child in element.children:
                if child_count >= 10:  # Limit children per node
                    break
                if hasattr(child, 'name') and child.name:
                    child_node = self._build_dom_tree(child, depth + 1, max_depth)
                    if child_node:
                        node["children"].append(child_node)
                        child_count += 1

        return node

    def _get_dom_statistics(self, soup: BeautifulSoup) -> Dict:
        """Get DOM statistics (element count, tag distribution, depth, text ratio)."""
        all_tags = soup.find_all()
        tag_counts = {}

        for tag in all_tags:
            tag_counts[tag.name] = tag_counts.get(tag.name, 0) + 1

        return {
            "total_elements": len(all_tags),
            "tag_distribution": tag_counts,
            "max_depth": self._calculate_max_depth(soup),
            "text_content_ratio": self._calculate_text_ratio(soup)
        }

    def _analyze_semantic_structure(self, soup: BeautifulSoup) -> Dict:
        """Analyze semantic HTML structure (header/nav/main/... usage)."""
        semantic_tags = ['header', 'nav', 'main', 'article', 'section', 'aside', 'footer']
        semantic_elements = {tag: len(soup.find_all(tag)) for tag in semantic_tags}

        return {
            "semantic_elements": semantic_elements,
            "has_semantic_structure": sum(semantic_elements.values()) > 0,
            "content_hierarchy": self._analyze_heading_hierarchy(soup)
        }

    def _identify_content_blocks(self, soup: BeautifulSoup) -> List[Dict]:
        """Identify main content blocks; returns top five, highest priority first."""
        content_blocks = []

        # Common containers that usually hold the primary page content.
        selectors = ['article', 'main', '.content', '#content', '.post', '.entry']

        for selector in selectors:
            for elem in soup.select(selector):
                if elem.get_text(strip=True):
                    content_blocks.append({
                        "selector": selector,
                        "tag": elem.name,
                        "text_length": len(elem.get_text()),
                        "element_id": elem.get('id', ''),
                        "classes": elem.get('class', []),
                        "priority": self._calculate_content_priority(elem)
                    })

        return sorted(content_blocks, key=lambda x: x['priority'], reverse=True)[:5]

    def _calculate_max_depth(self, soup: BeautifulSoup) -> int:
        """Calculate maximum DOM depth by walking tag children recursively."""
        def get_depth(element, current_depth=0):
            if not hasattr(element, 'children'):
                return current_depth

            max_child_depth = current_depth
            for child in element.children:
                if hasattr(child, 'name') and child.name:
                    max_child_depth = max(max_child_depth,
                                          get_depth(child, current_depth + 1))

            return max_child_depth

        return get_depth(soup)

    def _calculate_text_ratio(self, soup: BeautifulSoup) -> float:
        """Ratio of visible text length to total serialized HTML length."""
        text_length = len(soup.get_text())
        html_length = len(str(soup))
        # Guard against empty markup; return a float to match the annotation.
        return text_length / html_length if html_length > 0 else 0.0

    def _analyze_heading_hierarchy(self, soup: BeautifulSoup) -> List[Dict]:
        """List headings h1..h6.

        NOTE(review): headings are grouped by level, so "position" is the
        index within this level-ordered list, not document order.
        """
        headings = []
        for level in range(1, 7):
            for heading in soup.find_all(f'h{level}'):
                headings.append({
                    "level": level,
                    "text": heading.get_text().strip(),
                    "position": len(headings)
                })
        return headings

    def _calculate_content_priority(self, element) -> int:
        """Heuristic priority score for a content block.

        Longer text, semantic tags and content-ish class/id names score higher.
        """
        score = 0
        text_length = len(element.get_text())

        # Text length scoring: +1 per 100 chars, capped at 10.
        score += min(text_length // 100, 10)

        # Semantic tag bonus.
        if element.name in ['article', 'main']:
            score += 5
        elif element.name in ['section', 'div']:
            score += 2

        # Class/ID based scoring.
        classes = element.get('class', [])
        element_id = element.get('id', '')

        content_indicators = ['content', 'article', 'post', 'main', 'body']
        for indicator in content_indicators:
            if any(indicator in str(c).lower() for c in classes):
                score += 3
            if indicator in element_id.lower():
                score += 3

        return score
|
scraper/html_loader.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
from playwright.async_api import async_playwright
|
| 3 |
+
from typing import Dict, Optional
|
| 4 |
+
import time
|
| 5 |
+
|
| 6 |
+
class HTMLLoader:
    """Async context manager that fetches fully rendered HTML via Playwright.

    Usage::

        async with HTMLLoader() as loader:
            page_data = await loader.load_page(url)
    """

    def __init__(self):
        # Initialize all handles so __aexit__ is safe even if __aenter__
        # fails partway through startup.
        self.playwright = None
        self.browser = None
        self.context = None
        self.max_retries = 3            # attempts per URL before giving up
        self.timeout = 30000            # page.goto timeout, milliseconds
        self.wait_for_selector = "body" # selector that signals the page loaded
        self.delay_between_requests = 1.0  # back-off between retries, seconds

    async def __aenter__(self):
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=True
        )
        self.context = await self.browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        # Tear down in reverse order of creation; each handle may be None
        # if startup failed before it was assigned.
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()

    async def load_page(self, url: str) -> Dict[str, str]:
        """Load HTML content from *url*, handling static and dynamic sites.

        Retries up to ``max_retries`` times, then raises with the last error.
        Returns a dict with keys: html, title, url (final, post-redirect),
        timestamp (unix seconds).
        """
        for attempt in range(self.max_retries):
            page = None
            try:
                page = await self.context.new_page()
                await page.goto(url, timeout=self.timeout)

                # Wait for the body to load.
                await page.wait_for_selector(
                    self.wait_for_selector,
                    timeout=10000
                )

                # Additional wait for dynamic content rendered by JS.
                await page.wait_for_timeout(2000)

                return {
                    "html": await page.content(),
                    "title": await page.title(),
                    "url": page.url,
                    "timestamp": int(time.time())
                }

            except Exception as e:
                if attempt == self.max_retries - 1:
                    raise Exception(f"Failed to load {url}: {str(e)}") from e
                await asyncio.sleep(self.delay_between_requests)
            finally:
                # Always close the page, including on failure — otherwise
                # each failed attempt leaks an open tab in the browser.
                if page is not None:
                    await page.close()

        return None  # unreachable: the loop returns or raises
|
server.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import asyncio
|
| 3 |
+
from main import WebScrapingOrchestrator
|
| 4 |
+
|
| 5 |
+
orchestrator = WebScrapingOrchestrator()
|
| 6 |
+
|
| 7 |
+
async def scrape_async(url):
    """Scrape *url* through the shared orchestrator and shape the result
    for the Gradio JSON output.

    Returns an error string when the orchestrator reports a failure,
    otherwise a display dict.
    """
    result = await orchestrator.process_url(url)
    if "error" in result:
        return f"❌ Error: {result['error']}"

    summary = result["llm_ready_data"]["text_summary"]
    # Only append an ellipsis when the summary was actually truncated;
    # the original appended "..." even to summaries shorter than 800 chars.
    short_summary = summary[:800] + "..." if len(summary) > 800 else summary

    return {
        "URL": result.get("url"),
        "Title": result.get("title"),
        "Text Length": result["summary"]["text_length"],
        "Headings": result["llm_ready_data"]["key_headings"],
        "Main Topics": result["llm_ready_data"]["main_topics"],
        "Summary (Short)": short_summary
    }
|
| 19 |
+
|
| 20 |
+
def scrape(url):
    # Synchronous wrapper so Gradio can call the async scraper; asyncio.run
    # spins up a fresh event loop for each request.
    return asyncio.run(scrape_async(url))
|
| 22 |
+
|
| 23 |
+
# Build the Gradio UI: a URL textbox, a scrape button, and a JSON viewer
# for the LLM-ready result.
with gr.Blocks(title="MCP Web Scraper") as demo:
    gr.Markdown("### 🔍 MCP LLM Web Scraper")
    url_input = gr.Textbox(label="Enter a webpage URL", placeholder="https://...")
    output = gr.JSON(label="Scraped & LLM-ready Content")

    scrape_button = gr.Button("Scrape Page")
    # Wire the button to the synchronous wrapper around the async scraper.
    scrape_button.click(scrape, inputs=url_input, outputs=output)

if __name__ == "__main__":
    #demo.launch(server_name="0.0.0.0", server_port=7860)
    demo.launch()
|
storage/__init__.py
ADDED
|
File without changes
|
storage/mongo_storage.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pymongo import MongoClient
|
| 2 |
+
from typing import Dict, List, Optional
|
| 3 |
+
import datetime
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
class MongoStorage:
    """Persist scraped-page documents to MongoDB, shaped for LLM consumption.

    Connection settings come from the ``mongo_db_uri`` and ``mongo_db_name``
    environment variables.
    """

    def __init__(self):
        self.client = MongoClient(os.environ.get("mongo_db_uri"))
        self.db = self.client[os.environ.get("mongo_db_name")]
        self.collection = self.db.scraped_pages
        self._create_indexes()

    def _create_indexes(self):
        """Create indexes for better query performance."""
        self.collection.create_index("url", unique=True)
        self.collection.create_index("domain")
        self.collection.create_index("timestamp")
        self.collection.create_index("content.metadata.title")

    def store_page_data(self, url: str, extracted_data: Dict, dom_structure: Dict) -> str:
        """Upsert the complete page document for *url*.

        Returns the upserted id as a string, or the matched count when an
        existing document was replaced.
        """
        document = {
            "url": url,
            "domain": extracted_data["metadata"]["domain"],
            # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
            # (Python 3.12+) and produced naive datetimes.
            "timestamp": datetime.datetime.now(datetime.timezone.utc),
            "title": extracted_data["metadata"]["title"],
            "description": extracted_data["metadata"]["description"],

            # LLM-optimized content structure
            "content": {
                "text_summary": extracted_data["text_summary"],
                "content_blocks": extracted_data["content"],
                "headings": extracted_data["metadata"]["headings"],
                "structure_info": extracted_data["structure"]
            },

            # Relationship data
            "relationships": {
                "internal_links": [link for link in extracted_data["links"] if link["internal"]],
                "external_links": [link for link in extracted_data["links"] if not link["internal"]],
                "images": extracted_data["images"]
            },

            # DOM analysis for advanced processing
            "dom_analysis": {
                "tree_structure": dom_structure["tree"],
                "statistics": dom_structure["statistics"],
                "semantic_structure": dom_structure["semantic_structure"],
                "content_blocks": dom_structure["content_blocks"]
            },

            # Study-friendly metadata
            "study_metadata": {
                "reading_time": self._estimate_reading_time(extracted_data["text_summary"]),
                "complexity_score": self._calculate_complexity_score(extracted_data),
                "content_type": self._identify_content_type(extracted_data),
                "key_topics": self._extract_key_topics(extracted_data)
            }
        }

        # Upsert: one document per URL (backed by the unique "url" index).
        result = self.collection.replace_one(
            {"url": url},
            document,
            upsert=True
        )

        return str(result.upserted_id or result.matched_count)

    def get_page_data(self, url: str) -> Optional[Dict]:
        """Retrieve page data by URL, or None if not stored."""
        return self.collection.find_one({"url": url})

    def get_pages_by_domain(self, domain: str) -> List[Dict]:
        """Get all pages from a specific domain."""
        return list(self.collection.find({"domain": domain}))

    def search_pages(self, query: str, limit: int = 10) -> List[Dict]:
        """Case-insensitive substring search over title/description/summary.

        NOTE(review): *query* is used as a raw regex — metacharacters in user
        input alter matching; consider re.escape if queries are untrusted.
        """
        search_filter = {
            "$or": [
                {"title": {"$regex": query, "$options": "i"}},
                {"description": {"$regex": query, "$options": "i"}},
                {"content.text_summary": {"$regex": query, "$options": "i"}}
            ]
        }

        return list(self.collection.find(search_filter).limit(limit))

    def _estimate_reading_time(self, text: str) -> int:
        """Estimate reading time in minutes (250 words per minute, min 1)."""
        word_count = len(text.split())
        return max(1, word_count // 250)

    def _calculate_complexity_score(self, data: Dict) -> float:
        """Calculate content complexity for LLM processing hints.

        Sums capped contributions from text length, block count, and link
        density; rounded to two decimals.
        """
        score = 0.0

        # Text length factor (max 5.0).
        text_length = len(data["text_summary"])
        score += min(text_length / 1000, 5.0)

        # Structure complexity (max 3.0).
        content_blocks = len(data["content"])
        score += min(content_blocks / 10, 3.0)

        # Link density (max 2.0).
        total_links = len(data["links"])
        score += min(total_links / 20, 2.0)

        return round(score, 2)

    def _identify_content_type(self, data: Dict) -> str:
        """Classify content by keyword matches in title/summary."""
        title = data["metadata"]["title"].lower()
        text = data["text_summary"].lower()

        if any(word in title or word in text for word in ["tutorial", "guide", "how to"]):
            return "tutorial"
        elif any(word in title or word in text for word in ["news", "article", "report"]):
            return "article"
        elif any(word in title or word in text for word in ["documentation", "docs", "reference"]):
            return "documentation"
        elif any(word in title or word in text for word in ["blog", "post", "opinion"]):
            return "blog_post"
        else:
            return "general"

    def _extract_key_topics(self, data: Dict) -> List[str]:
        """Extract up to 10 key topics (words longer than 3 chars) from
        the title and headings for study organization."""
        topics = set()

        # From title
        title_words = data["metadata"]["title"].split()
        topics.update([word.lower() for word in title_words if len(word) > 3])

        # From headings
        for heading in data["metadata"]["headings"]:
            heading_words = heading["text"].split()
            topics.update([word.lower() for word in heading_words if len(word) > 3])

        return list(topics)[:10]  # Limit to top 10 topics
|
storage/neo4j_storage.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from neo4j import GraphDatabase
|
| 2 |
+
from typing import Dict, List
|
| 3 |
+
from urllib.parse import urlparse
|
| 4 |
+
from config.settings import settings
|
| 5 |
+
|
| 6 |
+
class Neo4jStorage:
    """Store page/link/content relationships in Neo4j for graph queries.

    Connection parameters come from ``config.settings.settings.database``.
    """

    def __init__(self):
        self.driver = GraphDatabase.driver(
            settings.database.neo4j_uri,
            auth=(settings.database.neo4j_user, settings.database.neo4j_password)
        )
        self._create_constraints()

    def _create_constraints(self):
        """Create constraints and indexes for better performance."""
        with self.driver.session() as session:
            try:
                session.run("CREATE CONSTRAINT page_url IF NOT EXISTS FOR (p:Page) REQUIRE p.url IS UNIQUE")
                session.run("CREATE CONSTRAINT domain_name IF NOT EXISTS FOR (d:Domain) REQUIRE d.name IS UNIQUE")
                session.run("CREATE INDEX page_title IF NOT EXISTS FOR (p:Page) ON (p.title)")
            except Exception:
                # Best-effort setup: constraints/indexes may already exist or
                # the server may not support this syntax; proceed regardless.
                pass

    def store_relationships(self, url: str, extracted_data: Dict, dom_structure: Dict):
        """Store page relationships and structure in Neo4j."""
        with self.driver.session() as session:
            # Create main page node
            self._create_page_node(session, url, extracted_data)

            # Create domain relationships
            self._create_domain_relationships(session, url, extracted_data)

            # Create content relationships
            self._create_content_relationships(session, url, extracted_data)

            # Create link relationships
            self._create_link_relationships(session, url, extracted_data["links"])

            # Create DOM structure relationships
            self._create_dom_relationships(session, url, dom_structure)

    def _create_page_node(self, session, url: str, data: Dict):
        """Create or update the page node with LLM-friendly properties."""
        query = """
        MERGE (p:Page {url: $url})
        SET p.title = $title,
            p.description = $description,
            p.domain = $domain,
            p.content_type = $content_type,
            p.complexity_score = $complexity_score,
            p.reading_time = $reading_time,
            p.word_count = $word_count,
            p.last_scraped = datetime()
        """

        session.run(query, {
            "url": url,
            "title": data["metadata"]["title"],
            "description": data["metadata"]["description"],
            "domain": data["metadata"]["domain"],
            "content_type": self._identify_content_type(data),
            "complexity_score": self._calculate_complexity_score(data),
            # Reading time assumes 250 words per minute.
            "reading_time": len(data["text_summary"].split()) // 250,
            "word_count": len(data["text_summary"].split())
        })

    def _create_domain_relationships(self, session, url: str, data: Dict):
        """Create the domain node and the page's BELONGS_TO relationship."""
        domain = data["metadata"]["domain"]

        # Create domain node
        session.run("""
            MERGE (d:Domain {name: $domain})
            SET d.last_updated = datetime()
        """, {"domain": domain})

        # Link page to domain
        session.run("""
            MATCH (p:Page {url: $url})
            MATCH (d:Domain {name: $domain})
            MERGE (p)-[:BELONGS_TO]->(d)
        """, {"url": url, "domain": domain})

    def _create_content_relationships(self, session, url: str, data: Dict):
        """Create heading and content-block nodes linked to the page."""
        # Create topic nodes from headings
        for i, heading in enumerate(data["metadata"]["headings"]):
            session.run("""
                MATCH (p:Page {url: $url})
                MERGE (h:Heading {text: $text, level: $level, page_url: $url})
                SET h.position = $position
                MERGE (p)-[:HAS_HEADING]->(h)
            """, {
                "url": url,
                "text": heading["text"],
                "level": heading["level"],
                "position": i
            })

        # Create content block relationships
        for i, block in enumerate(data["content"][:10]):  # Limit for performance
            session.run("""
                MATCH (p:Page {url: $url})
                MERGE (c:ContentBlock {text: $text, page_url: $url, position: $position})
                SET c.tag = $tag,
                    c.length = $length
                MERGE (p)-[:HAS_CONTENT]->(c)
            """, {
                "url": url,
                "text": block["text"][:500],  # Truncate for storage
                "tag": block["tag"],
                "length": len(block["text"]),
                "position": i
            })

    def _create_link_relationships(self, session, url: str, links: List[Dict]):
        """Create link relationships for navigation understanding."""
        for link in links[:20]:  # Limit for performance
            target_url = link["url"]
            link_text = link["text"]
            is_internal = link["internal"]

            # Create target page node (minimal)
            session.run("""
                MERGE (target:Page {url: $target_url})
                SET target.discovered_via = $source_url
            """, {"target_url": target_url, "source_url": url})

            # Relationship type is chosen from two fixed literals, so the
            # f-string interpolation below cannot inject arbitrary Cypher.
            relationship_type = "LINKS_TO_INTERNAL" if is_internal else "LINKS_TO_EXTERNAL"
            session.run(f"""
                MATCH (source:Page {{url: $source_url}})
                MATCH (target:Page {{url: $target_url}})
                MERGE (source)-[r:{relationship_type}]->(target)
                SET r.link_text = $link_text,
                    r.is_internal = $is_internal
            """, {
                "source_url": url,
                "target_url": target_url,
                "link_text": link_text,
                "is_internal": is_internal
            })

    def _create_dom_relationships(self, session, url: str, dom_structure: Dict):
        """Create DOM structure relationships for content hierarchy."""
        # Create semantic structure nodes
        semantic_elements = dom_structure["semantic_structure"]["semantic_elements"]
        for tag, count in semantic_elements.items():
            if count > 0:
                session.run("""
                    MATCH (p:Page {url: $url})
                    MERGE (s:SemanticElement {tag: $tag, page_url: $url})
                    SET s.count = $count
                    MERGE (p)-[:HAS_SEMANTIC_ELEMENT]->(s)
                """, {"url": url, "tag": tag, "count": count})

    def get_page_relationships(self, url: str) -> Dict:
        """Get all relationships for a page for LLM context.

        Returns {} when the page is not in the graph.
        """
        with self.driver.session() as session:
            result = session.run("""
                MATCH (p:Page {url: $url})
                OPTIONAL MATCH (p)-[:LINKS_TO_INTERNAL]->(internal:Page)
                OPTIONAL MATCH (p)-[:LINKS_TO_EXTERNAL]->(external:Page)
                OPTIONAL MATCH (p)-[:HAS_HEADING]->(h:Heading)
                RETURN p, collect(DISTINCT internal.url) as internal_links,
                       collect(DISTINCT external.url) as external_links,
                       collect(DISTINCT {text: h.text, level: h.level}) as headings
            """, {"url": url})

            record = result.single()
            if record:
                return {
                    "page": dict(record["p"]),
                    "internal_links": record["internal_links"],
                    "external_links": record["external_links"],
                    "headings": record["headings"]
                }
            return {}

    def get_related_pages(self, url: str, limit: int = 5) -> List[Dict]:
        """Find same-domain pages for LLM context and study suggestions,
        ordered by descending complexity score."""
        with self.driver.session() as session:
            result = session.run("""
                MATCH (p:Page {url: $url})
                MATCH (p)-[:BELONGS_TO]->(d:Domain)
                MATCH (related:Page)-[:BELONGS_TO]->(d)
                WHERE related.url <> $url
                RETURN related.url as url, related.title as title,
                       related.content_type as content_type,
                       related.complexity_score as complexity_score
                ORDER BY related.complexity_score DESC
                LIMIT $limit
            """, {"url": url, "limit": limit})

            return [dict(record) for record in result]

    def _identify_content_type(self, data: Dict) -> str:
        """Classify content type from keywords in the title only."""
        title = data["metadata"]["title"].lower()
        if "tutorial" in title or "guide" in title:
            return "tutorial"
        elif "documentation" in title or "docs" in title:
            return "documentation"
        elif "blog" in title or "article" in title:
            return "article"
        return "general"

    def _calculate_complexity_score(self, data: Dict) -> float:
        """Complexity score for relationship weighting, capped at 10.0."""
        text_length = len(data["text_summary"])
        content_blocks = len(data["content"])
        return min(text_length / 1000 + content_blocks / 10, 10.0)

    def close(self):
        """Close database connection."""
        self.driver.close()
|