etukurudinesh committed on
Commit
feea636
·
1 Parent(s): 80a0aa2
.gitignore ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Sensitive files
2
+ certification.pem
3
+ *.pem
4
+ *.key
5
+ *.crt
6
+ *.cert
7
+
8
+ # Python temporary files
9
+ __pycache__/
10
+ *.pyc
11
+ *.pyo
12
+ *.pyd
13
+ .Python
14
+ pip-log.txt
15
+ pip-delete-this-directory.txt
16
+
17
+ # Virtual environments
18
+ venv/
19
+ env/
20
+ .venv/
21
+ .env
22
+
23
+ # Docker artifacts
24
+ *.dockerignore
25
+ Dockerfile.bak
26
+ *.log
27
+
28
+ # Editor and IDE files
29
+ .vscode/
30
+ .idea/
31
+ *.sublime-project
32
+ *.sublime-workspace
33
+
34
+ # System files
35
+ .DS_Store
36
+ Thumbs.db
37
+
38
+ # Local development and testing
39
+ *.swp
40
+ *.swo
41
+ *.tmp
42
+ *.bak
43
+ *.backup
44
+
45
+ # Cache and temporary directories
46
+ .cache/
47
+ *.cache
48
+ *.egg-info/
49
+ dist
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9

# Install system dependencies for Playwright
# (shared libraries headless Chromium needs at runtime)
RUN apt-get update && apt-get install -y \
    libnss3 \
    libatk1.0-0 \
    libatk-bridge2.0-0 \
    libxcomposite1 \
    libxdamage1 \
    libxrandr2 \
    libgbm1 \
    libpango-1.0-0 \
    libcairo2 \
    libasound2 \
    libxshmfence1 \
    && rm -rf /var/lib/apt/lists/*

# Run as a non-root user (required by Hugging Face Spaces);
# user-level pip installs land under ~/.local/bin, hence the PATH update.
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Install Python dependencies before copying the app so this layer is
# cached independently of application-code changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Install Playwright browsers (into the user's home cache)
RUN playwright install

COPY --chown=user . /app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from fastapi import FastAPI, HTTPException, BackgroundTasks
from pydantic import BaseModel, HttpUrl
from typing import List, Dict, Optional
import asyncio
from main import WebScrapingOrchestrator

app = FastAPI(
    title="Advanced Web Scraper for LLM",
    description="Scrape, analyze, and store web content optimized for LLM consumption",
    version="1.0.0"
)

# Global orchestrator instance
# NOTE(review): constructed at import time, so MongoDB must be reachable
# when the module loads — confirm this is acceptable for deployment.
orchestrator = WebScrapingOrchestrator()

# Pydantic models
class URLRequest(BaseModel):
    # Single URL to scrape; HttpUrl validates the scheme/host.
    url: HttpUrl

class SearchRequest(BaseModel):
    # Free-text query matched against stored content.
    query: str
    # Maximum number of results to return.
    limit: int = 5

class BatchURLRequest(BaseModel):
    # URLs processed sequentially by the background batch endpoint.
    urls: List[HttpUrl]

# Response models
class ScrapingResponse(BaseModel):
    success: bool
    url: str
    title: Optional[str] = None
    summary: Optional[Dict] = None
    llm_ready_data: Optional[Dict] = None
    error: Optional[str] = None

class SearchResponse(BaseModel):
    results: List[Dict]
    total_found: int
+
@app.post("/scrape", response_model=ScrapingResponse)
async def scrape_url(request: URLRequest):
    """Scrape a single URL and store data optimized for LLM consumption.

    Returns a ScrapingResponse on success; responds 400 when the pipeline
    reports an error and 500 on unexpected failures.
    """
    try:
        result = await orchestrator.process_url(str(request.url))

        if "error" in result:
            raise HTTPException(status_code=400, detail=result["error"])

        return ScrapingResponse(**result)

    except HTTPException:
        # Bug fix: the broad handler below used to swallow the intended
        # 400 response and re-raise it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
+
@app.post("/scrape-batch")
async def scrape_batch_urls(request: BatchURLRequest, background_tasks: BackgroundTasks):
    """Scrape multiple URLs in the background."""
    url_strings = [str(u) for u in request.urls]

    async def process_batch():
        # Process sequentially; a failure on one URL is recorded and does
        # not stop the rest of the batch.
        outcomes = []
        for target in request.urls:
            try:
                outcomes.append(await orchestrator.process_url(str(target)))
            except Exception as exc:
                outcomes.append({"error": str(exc), "url": str(target)})
        return outcomes

    # Schedule the batch so the response returns immediately.
    background_tasks.add_task(process_batch)

    return {
        "message": f"Started processing {len(request.urls)} URLs in background",
        "urls": url_strings,
    }
74
+
@app.get("/page/{url:path}")
async def get_page_data(url: str):
    """Get processed page data optimized for LLM consumption.

    The path parameter is URL-encoded; responds 404 when the page has
    never been scraped and 500 on unexpected failures.
    """
    try:
        # Decode URL
        import urllib.parse
        decoded_url = urllib.parse.unquote(url)

        page_data = orchestrator.get_page_for_llm(decoded_url)

        if not page_data:
            raise HTTPException(status_code=404, detail="Page not found")

        return page_data

    except HTTPException:
        # Bug fix: re-raise so the 404 above is not masked as a 500 by
        # the generic handler below.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Retrieval failed: {str(e)}")
+
@app.post("/search", response_model=SearchResponse)
async def search_content(request: SearchRequest):
    """Search stored content for LLM context."""
    try:
        matches = orchestrator.search_for_llm(request.query, request.limit)
        response = SearchResponse(results=matches, total_found=len(matches))
        return response
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Search failed: {str(exc)}")
106
+
@app.get("/llm-ready/{url:path}")
async def get_llm_ready_content(url: str):
    """Get content specifically formatted for LLM consumption.

    Responds 404 when the page has never been scraped and 500 on
    unexpected failures.
    """
    try:
        import urllib.parse
        decoded_url = urllib.parse.unquote(url)

        page_data = orchestrator.get_page_for_llm(decoded_url)

        if not page_data:
            raise HTTPException(status_code=404, detail="Page not found")

        # Format for LLM
        llm_content = {
            "instruction": "Use this content for generating summaries, notes, or mind maps",
            "content": {
                "title": page_data["title"],
                "main_content": page_data["content"],
                "structure": {
                    "headings": page_data["headings"],
                    "content_type": page_data["study_metadata"]["content_type"],
                    "complexity": page_data["study_metadata"]["complexity_score"],
                    "reading_time": page_data["study_metadata"]["reading_time"]
                },
                "context": {
                    "related_pages": page_data["relationships"]["related_pages"],
                    "key_topics": page_data["study_metadata"]["key_topics"]
                }
            },
            "suggestions": {
                "study_approach": _get_study_approach(page_data["study_metadata"]),
                "focus_areas": page_data["headings"][:3],
                "difficulty_level": _assess_difficulty(page_data["study_metadata"])
            }
        }

        return llm_content

    except HTTPException:
        # Bug fix: re-raise so the 404 above is not converted into a 500
        # by the generic handler below.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"LLM formatting failed: {str(e)}")
147
+
@app.get("/health")
async def health_check():
    """Health check endpoint."""
    payload = {"status": "healthy", "message": "Web scraper API is running"}
    return payload
152
+
@app.get("/stats")
async def get_statistics():
    """Get scraping statistics"""
    try:
        # Get basic stats from MongoDB.
        # estimated_document_count() reads collection metadata, so the
        # figure may lag slightly behind very recent writes.
        mongo_stats = orchestrator.mongo_storage.collection.estimated_document_count()

        return {
            "total_pages_scraped": mongo_stats,
            "database_status": "connected",
            "features": [
                "Dynamic content scraping with Playwright",
                "DOM structure analysis",
                "MongoDB storage for content",
                "Neo4j for relationships",
                "LLM-optimized data extraction"
            ]
        }

    # Deliberate best-effort: stats failures return an error payload
    # instead of a 500 so the endpoint never breaks monitoring.
    except Exception as e:
        return {"error": f"Stats retrieval failed: {str(e)}"}
174
+
def _get_study_approach(metadata: Dict) -> str:
    """Suggest a study approach based on the page's content analysis."""
    kind = metadata.get("content_type", "general")
    score = metadata.get("complexity_score", 0)

    # Fixed approaches for recognized content types.
    by_type = {
        "tutorial": "hands-on practice with step-by-step approach",
        "documentation": "reference-based learning with examples",
        "research": "analytical reading with note-taking",
    }
    if kind in by_type:
        return by_type[kind]
    # Complexity only matters for content types without a fixed approach.
    if score > 5:
        return "detailed study with concept mapping"
    return "general reading with summary creation"
190
+
def _assess_difficulty(metadata: Dict) -> str:
    """Assess content difficulty for LLM processing hints."""
    score = metadata.get("complexity_score", 0)
    minutes = metadata.get("reading_time", 0)

    # Guard clauses from easiest to hardest.
    if score < 2 and minutes < 5:
        return "beginner"
    if score < 5 and minutes < 15:
        return "intermediate"
    return "advanced"
202
+
@app.on_event("shutdown")
async def shutdown_event():
    """Clean up on shutdown"""
    # Close the database connections held by the global orchestrator.
    orchestrator.close_connections()
207
+
# Run the API
if __name__ == "__main__":
    import uvicorn

    # Bug fix: this module is app.py, so the import string must be
    # "app:app" — the previous "api:app" made uvicorn fail at startup
    # with "Could not import module 'api'".
    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)
main.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from typing import Dict, Optional,List
3
+ from scraper.html_loader import HTMLLoader
4
+ from scraper.data_extractor import DataExtractor
5
+ from scraper.dom_analyzer import DOMAnalyzer
6
+ from storage.mongo_storage import MongoStorage
7
+ # from storage.neo4j_storage import Neo4jStorage
8
+
class WebScrapingOrchestrator:
    """Coordinates the full pipeline: load HTML, extract structured data,
    analyze the DOM, and persist the results for LLM consumption.

    Neo4j support is currently disabled (its import and initialization are
    commented out elsewhere in this module), so every Neo4j access is
    guarded to keep the pipeline functional without it.
    """

    def __init__(self):
        self.data_extractor = DataExtractor()
        self.dom_analyzer = DOMAnalyzer()
        self.mongo_storage = MongoStorage()
        # Bug fix: get_page_for_llm() and close_connections() dereference
        # self.neo4j_storage, but the assignment was commented out, which
        # raised AttributeError at runtime. Keep an explicit None until
        # Neo4j support is re-enabled.
        self.neo4j_storage = None

    async def process_url(self, url: str) -> Dict:
        """Complete pipeline to process a URL for LLM consumption.

        Returns an LLM-ready summary dict on success, or
        {"error": ..., "url": ...} on failure (never raises).
        """
        try:
            print(f"Processing URL: {url}")

            # Step 1: Load HTML content
            async with HTMLLoader() as loader:
                html_data = await loader.load_page(url)

            if not html_data:
                return {"error": "Failed to load page"}

            print("✓ HTML loaded successfully")

            # Step 2: Extract structured data
            extracted_data = self.data_extractor.extract_structured_data(
                html_data["html"],
                html_data["url"]
            )

            print("✓ Data extracted successfully")

            # Step 3: Analyze DOM structure
            dom_structure = self.dom_analyzer.analyze_structure(html_data["html"])

            print("✓ DOM structure analyzed")

            # Step 4: Store in MongoDB
            mongo_id = self.mongo_storage.store_page_data(
                html_data["url"],
                extracted_data,
                dom_structure
            )

            print("✓ Data stored in MongoDB")

            # Step 5: Store relationships in Neo4j — only when enabled.
            # (The previous unconditional "stored in Neo4j" log was
            # misleading while the backend is disabled.)
            if self.neo4j_storage is not None:
                self.neo4j_storage.store_relationships(
                    html_data["url"],
                    extracted_data,
                    dom_structure
                )
                print("✓ Relationships stored in Neo4j")

            # Return LLM-ready summary
            return {
                "success": True,
                "url": html_data["url"],
                "title": html_data["title"],
                "mongo_id": mongo_id,
                "summary": {
                    "content_blocks": len(extracted_data["content"]),
                    "text_length": len(extracted_data["text_summary"]),
                    "links_found": len(extracted_data["links"]),
                    "images_found": len(extracted_data["images"]),
                    "dom_depth": dom_structure["statistics"]["max_depth"],
                    "content_type": self._identify_content_type(extracted_data)
                },
                "llm_ready_data": {
                    "text_summary": extracted_data["text_summary"],
                    "key_headings": [h["text"] for h in extracted_data["metadata"]["headings"][:5]],
                    "main_topics": self._extract_main_topics(extracted_data),
                    "study_hints": self._generate_study_hints(extracted_data, dom_structure)
                }
            }

        except Exception as e:
            print(f"✗ Error processing {url}: {str(e)}")
            return {"error": str(e), "url": url}

    def get_page_for_llm(self, url: str) -> Optional[Dict]:
        """Retrieve page data optimized for LLM consumption.

        Returns None when the URL was never scraped.
        """
        # Get from MongoDB
        mongo_data = self.mongo_storage.get_page_data(url)
        if not mongo_data:
            return None

        # Bug fix: Neo4j may be disabled — fall back to empty relationship
        # data instead of crashing on a missing attribute.
        neo4j_data = {}
        if self.neo4j_storage is not None:
            neo4j_data = self.neo4j_storage.get_page_relationships(url)

        # Combine for LLM
        return {
            "content": mongo_data["content"]["text_summary"],
            "title": mongo_data["title"],
            "headings": [h["text"] for h in mongo_data["content"]["headings"]],
            "structure": mongo_data["study_metadata"],
            "relationships": {
                "related_pages": neo4j_data.get("internal_links", [])[:5],
                "external_references": neo4j_data.get("external_links", [])[:3]
            },
            "study_metadata": mongo_data["study_metadata"]
        }

    def search_for_llm(self, query: str, limit: int = 5) -> List[Dict]:
        """Search stored content and shape the hits for LLM context."""
        results = self.mongo_storage.search_pages(query, limit)

        llm_ready_results = []
        for result in results:
            llm_ready_results.append({
                "url": result["url"],
                "title": result["title"],
                "summary": result["content"]["text_summary"][:500],
                "content_type": result["study_metadata"]["content_type"],
                "complexity": result["study_metadata"]["complexity_score"],
                "key_topics": result["study_metadata"]["key_topics"][:5]
            })

        return llm_ready_results

    def _identify_content_type(self, data: Dict) -> str:
        """Identify content type (tutorial/documentation/article/research)
        from keywords in the title and extracted text."""
        title = data["metadata"]["title"].lower()
        text = data["text_summary"].lower()

        if any(word in title for word in ["tutorial", "guide", "how to"]):
            return "tutorial"
        elif any(word in title for word in ["documentation", "docs", "api"]):
            return "documentation"
        elif any(word in title for word in ["blog", "article", "news"]):
            return "article"
        elif any(word in text for word in ["research", "study", "analysis"]):
            return "research"
        return "general"

    def _extract_main_topics(self, data: Dict) -> List[str]:
        """Extract up to five topic words from the title and top headings
        (words longer than three characters; order not guaranteed)."""
        topics = set()

        # From title
        title_words = [word for word in data["metadata"]["title"].split() if len(word) > 3]
        topics.update(title_words[:3])

        # From headings
        for heading in data["metadata"]["headings"][:3]:
            heading_words = [word for word in heading["text"].split() if len(word) > 3]
            topics.update(heading_words[:2])

        return list(topics)[:5]

    def _generate_study_hints(self, extracted_data: Dict, dom_structure: Dict) -> Dict:
        """Generate heuristic study hints for LLM processing."""
        return {
            # Length-based heuristic; thresholds are arbitrary cutoffs.
            "difficulty_level": "beginner" if len(extracted_data["text_summary"]) < 2000 else "intermediate",
            # ~250 words/minute reading speed.
            "estimated_study_time": f"{len(extracted_data['text_summary'].split()) // 250} minutes",
            "content_structure": "well_structured" if len(extracted_data["metadata"]["headings"]) > 3 else "basic",
            "has_examples": "code" in extracted_data["text_summary"].lower(),
            "interactive_elements": dom_structure["statistics"]["tag_distribution"].get("form", 0) > 0
        }

    def close_connections(self):
        """Close all database connections (no-op for disabled backends)."""
        # Bug fix: guarded — self.neo4j_storage is None while Neo4j is
        # disabled, so the unconditional .close() raised AttributeError.
        if self.neo4j_storage is not None:
            self.neo4j_storage.close()
170
+
171
+ # Main execution function
172
+ async def main():
173
+ orchestrator = WebScrapingOrchestrator()
174
+
175
+ # Example usage
176
+ test_url = "https://en.wikipedia.org/wiki/Virat_Kohli"
177
+ result = await orchestrator.process_url(test_url)
178
+ print(f"Processing result: {result}")
179
+
180
+ # Clean up
181
+ orchestrator.close_connections()
182
+
183
+ if __name__ == "__main__":
184
+ asyncio.run(main())
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.104.1
2
+ uvicorn==0.24.0
3
+ playwright==1.40.0
4
+ beautifulsoup4==4.12.2
5
+ pymongo==4.6.0
6
+ neo4j==5.15.0
7
+ pydantic==2.5.2
8
+ python-multipart==0.0.6
9
+ aiofiles==23.2.1
10
+ requests==2.31.0
11
+ lxml==4.9.3
12
+ newspaper3k==0.2.8
13
+ readability-lxml==0.8.1
14
+ python-dotenv==1.0.0
15
+ nltk==3.8.1
16
+ spacy==3.7.2
scraper/__init__.py ADDED
File without changes
scraper/data_extractor.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup, Comment
2
+ from typing import Dict, List, Optional
3
+ import re
4
+ from urllib.parse import urljoin, urlparse
5
+
class DataExtractor:
    """Turns raw HTML into a structured, LLM-friendly dict: content
    blocks, page metadata, link/image inventories, and a plain-text
    summary."""

    def __init__(self):
        # CSS selectors for boilerplate stripped before any extraction.
        self.ignore_selectors = [
            '.advertisement',
            '.ad',
            '.banner',
            '.popup',
            '#footer',
            '.footer',
            '.sidebar',
            'nav',
            '.navbar',
            '.menu',
            'header',
            '#header',
            'script',
            'style',
            'noscript',
            'iframe',
            'meta',
            'link',
            '[class*="ad-"]',
            '[id*="ad-"]',
            '.cookie-notice',
            '.modal',
            'form',
            'input',
            'button',
            '.social-media',
            '.comments-section',
            '.widget'
        ]
        # Selectors probed (in this order) when collecting content blocks;
        # the same element can match several selectors and be emitted twice.
        self.content_selectors = [
            '.main-content',
            'article',
            'p',
            'h1',
            'h2',
            'h3',
            'h4',
            'h5',
            'h6',
            'div.content',
            '.post',
            '.article-body',
            '.content-body',
            'section',
            'main',
            'ul',
            'ol',
            'li',
            'table',
            'td',
            'th',
            'blockquote',
            'pre',
            '.text',
            '[class*="content"]',
            '[class*="post"]',
            '[class*="article"]',
            'div:not([class*="ad"]):not([class*="banner"]):not([class*="sidebar"])'
        ]
        # Minimum character count for a block to count as real content.
        self.min_text_length = 200

    def extract_structured_data(self, html: str, url: str) -> Dict:
        """Extract structured data from HTML for LLM consumption"""
        soup = BeautifulSoup(html, 'lxml')

        # Remove unwanted elements (mutates `soup` in place, so every
        # extractor below sees the cleaned tree).
        self._clean_html(soup)

        return {
            "content": self._extract_content(soup),
            "metadata": self._extract_metadata(soup, url),
            "structure": self._extract_structure(soup),
            "links": self._extract_links(soup, url),
            "images": self._extract_images(soup, url),
            "text_summary": self._extract_text_summary(soup)
        }

    def _clean_html(self, soup: BeautifulSoup):
        """Remove unwanted elements for cleaner extraction.

        NOTE(review): this runs before _extract_structure(), so tags
        removed here ('form', 'header', 'nav', ...) can never be counted
        there — e.g. the "forms" statistic is always 0. Confirm intended.
        """
        for selector in self.ignore_selectors:
            for element in soup.select(selector):
                element.decompose()

        # Remove comments and scripts
        # NOTE(review): the `text=` argument is deprecated in recent
        # BeautifulSoup releases in favor of `string=` — verify against
        # the pinned bs4 version.
        for element in soup(text=lambda text: isinstance(text, Comment)):
            element.extract()

    def _extract_content(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract main content blocks (tag, text, raw HTML, attributes)
        for every element matching a content selector whose text is at
        least min_text_length characters."""
        content_blocks = []

        for selector in self.content_selectors:
            elements = soup.select(selector)
            for elem in elements:
                text = elem.get_text(strip=True)
                if len(text) >= self.min_text_length:
                    content_blocks.append({
                        "tag": elem.name,
                        "text": text,
                        "html": str(elem),
                        "attributes": dict(elem.attrs) if elem.attrs else {}
                    })

        return content_blocks

    def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict:
        """Extract page metadata (title, meta description, domain,
        heading hierarchy)."""
        title = soup.find('title')
        meta_desc = soup.find('meta', attrs={'name': 'description'})

        return {
            "title": title.get_text().strip() if title else "",
            "description": meta_desc.get('content', '') if meta_desc else "",
            "url": url,
            "domain": urlparse(url).netloc,
            "headings": self._extract_headings(soup)
        }

    def _extract_headings(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract heading hierarchy for structure (grouped by level
        h1..h6, not by document order)."""
        headings = []
        for i in range(1, 7):
            for heading in soup.find_all(f'h{i}'):
                headings.append({
                    "level": i,
                    "text": heading.get_text().strip(),
                    "id": heading.get('id', '')
                })
        return headings

    def _extract_structure(self, soup: BeautifulSoup) -> Dict:
        """Count structural element types on the (already cleaned) tree."""
        return {
            "sections": len(soup.find_all(['section', 'article', 'div'])),
            "paragraphs": len(soup.find_all('p')),
            "lists": len(soup.find_all(['ul', 'ol'])),
            "tables": len(soup.find_all('table')),
            # Always 0 after _clean_html removes 'form' — see NOTE there.
            "forms": len(soup.find_all('form'))
        }

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract all links for relationship mapping; relative hrefs are
        resolved against base_url."""
        links = []
        for link in soup.find_all('a', href=True):
            href = urljoin(base_url, link['href'])
            links.append({
                "url": href,
                "text": link.get_text().strip(),
                # Same-host links are considered internal.
                "internal": urlparse(href).netloc == urlparse(base_url).netloc
            })
        return links[:50]  # Limit for performance

    def _extract_images(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract images with alt/title context; src resolved against
        base_url."""
        images = []
        for img in soup.find_all('img', src=True):
            images.append({
                "src": urljoin(base_url, img['src']),
                "alt": img.get('alt', ''),
                "caption": img.get('title', '')
            })
        return images[:20]  # Limit for performance

    def _extract_text_summary(self, soup: BeautifulSoup) -> str:
        """Extract clean text for LLM processing (whitespace-normalized,
        capped at 5000 characters)."""
        text = soup.get_text()
        # Clean whitespace and normalize
        text = re.sub(r'\s+', ' ', text).strip()
        return text[:5000]  # Limit for token efficiency
scraper/dom_analyzer.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+ from typing import Dict, List
3
+ import hashlib
4
+
class DOMAnalyzer:
    """Analyzes an HTML document's DOM: builds a bounded tree
    representation, gathers tag statistics, and scores candidate content
    blocks for LLM processing."""

    def __init__(self):
        pass

    def analyze_structure(self, html: str) -> Dict:
        """Analyze DOM structure and create tree representation"""
        soup = BeautifulSoup(html, 'lxml')

        return {
            # Root at <body> when present, otherwise the whole document.
            "tree": self._build_dom_tree(soup.body if soup.body else soup),
            "statistics": self._get_dom_statistics(soup),
            "semantic_structure": self._analyze_semantic_structure(soup),
            "content_blocks": self._identify_content_blocks(soup)
        }

    def _build_dom_tree(self, element, depth=0, max_depth=5) -> Dict:
        """Build a hierarchical DOM tree, capped at max_depth levels and
        10 children per node to keep the output bounded."""
        if depth > max_depth or not element or not hasattr(element, 'name'):
            return {}

        node = {
            "tag": element.name if element.name else "text",
            "id": element.get('id', ''),
            "classes": element.get('class', []),
            # Only the first 100 chars of text — identification, not content.
            "text_content": element.get_text()[:100] if element.get_text() else "",
            "children": [],
            "attributes": dict(element.attrs) if hasattr(element, 'attrs') else {},
            "depth": depth,
            # MD5 of the first 500 chars of the serialized element, used
            # purely as a short identity tag (not security-sensitive).
            "node_id": hashlib.md5(str(element)[:500].encode()).hexdigest()[:8]
        }

        # Add children (limit to prevent huge trees)
        if hasattr(element, 'children') and depth < max_depth:
            child_count = 0
            for child in element.children:
                if child_count >= 10:  # Limit children per node
                    break
                # Skip text nodes; only tag elements become children.
                if hasattr(child, 'name') and child.name:
                    child_node = self._build_dom_tree(child, depth + 1, max_depth)
                    if child_node:
                        node["children"].append(child_node)
                        child_count += 1

        return node

    def _get_dom_statistics(self, soup: BeautifulSoup) -> Dict:
        """Get DOM statistics: element count, per-tag distribution,
        maximum nesting depth, and text-to-markup ratio."""
        all_tags = soup.find_all()
        tag_counts = {}

        for tag in all_tags:
            tag_name = tag.name
            tag_counts[tag_name] = tag_counts.get(tag_name, 0) + 1

        return {
            "total_elements": len(all_tags),
            "tag_distribution": tag_counts,
            "max_depth": self._calculate_max_depth(soup),
            "text_content_ratio": self._calculate_text_ratio(soup)
        }

    def _analyze_semantic_structure(self, soup: BeautifulSoup) -> Dict:
        """Count HTML5 semantic elements and summarize heading usage."""
        semantic_tags = ['header', 'nav', 'main', 'article', 'section', 'aside', 'footer']
        semantic_elements = {}

        for tag in semantic_tags:
            elements = soup.find_all(tag)
            semantic_elements[tag] = len(elements)

        return {
            "semantic_elements": semantic_elements,
            "has_semantic_structure": sum(semantic_elements.values()) > 0,
            "content_hierarchy": self._analyze_heading_hierarchy(soup)
        }

    def _identify_content_blocks(self, soup: BeautifulSoup) -> List[Dict]:
        """Identify the top five main-content candidates, ranked by the
        heuristic priority score below."""
        content_blocks = []

        # Look for common content containers
        selectors = ['article', 'main', '.content', '#content', '.post', '.entry']

        for selector in selectors:
            elements = soup.select(selector)
            for elem in elements:
                if elem.get_text(strip=True):
                    content_blocks.append({
                        "selector": selector,
                        "tag": elem.name,
                        "text_length": len(elem.get_text()),
                        "element_id": elem.get('id', ''),
                        "classes": elem.get('class', []),
                        "priority": self._calculate_content_priority(elem)
                    })

        return sorted(content_blocks, key=lambda x: x['priority'], reverse=True)[:5]

    def _calculate_max_depth(self, soup: BeautifulSoup) -> int:
        """Calculate maximum DOM nesting depth (tag elements only)."""
        def get_depth(element, current_depth=0):
            if not hasattr(element, 'children'):
                return current_depth

            max_child_depth = current_depth
            for child in element.children:
                if hasattr(child, 'name') and child.name:
                    depth = get_depth(child, current_depth + 1)
                    max_child_depth = max(max_child_depth, depth)

            return max_child_depth

        return get_depth(soup)

    def _calculate_text_ratio(self, soup: BeautifulSoup) -> float:
        """Ratio of extracted text length to serialized HTML length
        (0.0 for an empty document)."""
        text_length = len(soup.get_text())
        html_length = len(str(soup))
        return text_length / html_length if html_length > 0 else 0

    def _analyze_heading_hierarchy(self, soup: BeautifulSoup) -> List[Dict]:
        """List headings grouped by level h1..h6; `position` is the index
        in this grouped listing, not document order."""
        headings = []
        for i in range(1, 7):
            for heading in soup.find_all(f'h{i}'):
                headings.append({
                    "level": i,
                    "text": heading.get_text().strip(),
                    "position": len(headings)
                })
        return headings

    def _calculate_content_priority(self, element) -> int:
        """Heuristic priority score for a content block: text length
        (capped), semantic-tag bonus, and content-ish class/id names."""
        score = 0
        text_length = len(element.get_text())

        # Text length scoring (1 point per 100 chars, max 10)
        score += min(text_length // 100, 10)

        # Semantic tag bonus
        if element.name in ['article', 'main']:
            score += 5
        elif element.name in ['section', 'div']:
            score += 2

        # Class/ID based scoring
        classes = element.get('class', [])
        element_id = element.get('id', '')

        content_indicators = ['content', 'article', 'post', 'main', 'body']
        for indicator in content_indicators:
            if any(indicator in str(c).lower() for c in classes):
                score += 3
            if indicator in element_id.lower():
                score += 3

        return score
scraper/html_loader.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from playwright.async_api import async_playwright
3
+ from typing import Dict, Optional
4
+ import time
5
+
class HTMLLoader:
    """Async context manager that loads pages with headless Chromium via
    Playwright, with simple retry logic.

    Usage:
        async with HTMLLoader() as loader:
            data = await loader.load_page(url)
    """

    def __init__(self):
        # Playwright handles are created in __aenter__.
        self.playwright = None  # bug fix: previously unset until __aenter__,
                                # so __aexit__ could raise AttributeError.
        self.browser = None
        self.context = None
        self.timeout = 30000            # page.goto timeout, ms
        self.wait_for_selector = "body"
        self.max_retries = 3            # bug fix: was assigned twice
        self.delay_between_requests = 1.0  # seconds between retry attempts

    async def __aenter__(self):
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=True
        )
        self.context = await self.browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()

    async def load_page(self, url: str) -> Dict[str, str]:
        """Load HTML content from URL handling both static and dynamic sites.

        Retries up to max_retries times; raises on the final failure.
        Returns a dict with html, title, final url and a Unix timestamp.
        """
        for attempt in range(self.max_retries):
            page = None
            try:
                page = await self.context.new_page()
                await page.goto(url, timeout=self.timeout)

                # Wait for body to load
                await page.wait_for_selector(
                    self.wait_for_selector,
                    timeout=10000
                )

                # Additional wait for dynamic content
                await page.wait_for_timeout(2000)

                html_content = await page.content()
                title = await page.title()
                url_final = page.url

                return {
                    "html": html_content,
                    "title": title,
                    "url": url_final,
                    "timestamp": int(time.time())
                }

            except Exception as e:
                if attempt == self.max_retries - 1:
                    # Chain the original exception for debuggability.
                    raise Exception(f"Failed to load {url}: {str(e)}") from e
                await asyncio.sleep(self.delay_between_requests)
            finally:
                # Bug fix: always close the page — previously a failure
                # between new_page() and close() leaked a page per attempt.
                if page is not None:
                    await page.close()

        return None
server.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gradio as gr
import asyncio
from main import WebScrapingOrchestrator

# Single orchestrator instance shared by all Gradio requests.
orchestrator = WebScrapingOrchestrator()

async def scrape_async(url):
    """Run the scraping pipeline and shape the result for display.

    Returns an error string on failure, otherwise a dict rendered by the
    gr.JSON output component.
    """
    result = await orchestrator.process_url(url)
    if "error" in result:
        return f"❌ Error: {result['error']}"
    return {
        "URL": result.get("url"),
        "Title": result.get("title"),
        "Text Length": result["summary"]["text_length"],
        "Headings": result["llm_ready_data"]["key_headings"],
        "Main Topics": result["llm_ready_data"]["main_topics"],
        "Summary (Short)": result["llm_ready_data"]["text_summary"][:800] + "..."
    }

def scrape(url):
    """Synchronous bridge for the Gradio click handler."""
    return asyncio.run(scrape_async(url))

with gr.Blocks(title="MCP Web Scraper") as demo:
    gr.Markdown("### 🔍 MCP LLM Web Scraper")
    url_input = gr.Textbox(label="Enter a webpage URL", placeholder="https://...")
    output = gr.JSON(label="Scraped & LLM-ready Content")

    scrape_button = gr.Button("Scrape Page")
    scrape_button.click(scrape, inputs=url_input, outputs=output)

if __name__ == "__main__":
    #demo.launch(server_name="0.0.0.0", server_port=7860)
    demo.launch()
storage/__init__.py ADDED
File without changes
storage/mongo_storage.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pymongo import MongoClient
2
+ from typing import Dict, List, Optional
3
+ import datetime
4
+ import os
5
+
6
class MongoStorage:
    """MongoDB persistence layer for scraped pages.

    Stores one document per URL, shaped for downstream LLM consumption,
    and offers simple lookup/search helpers plus study-oriented metadata.
    """

    def __init__(self):
        # NOTE(review): if either env var is unset, .get() returns None —
        # MongoClient(None) silently falls back to localhost and db[None]
        # raises. Confirm deployment always sets both variables.
        self.client = MongoClient(os.environ.get("mongo_db_uri"))
        self.db = self.client[os.environ.get("mongo_db_name")]
        self.collection = self.db.scraped_pages
        self._create_indexes()

    def _create_indexes(self):
        """Create indexes for better query performance (idempotent in MongoDB)."""
        self.collection.create_index("url", unique=True)
        self.collection.create_index("domain")
        self.collection.create_index("timestamp")
        self.collection.create_index("content.metadata.title")

    def store_page_data(self, url: str, extracted_data: Dict, dom_structure: Dict) -> str:
        """Upsert the complete page record for *url*.

        Returns the new document's id as a string, or the matched count
        when an existing document was replaced.
        """
        document = {
            "url": url,
            "domain": extracted_data["metadata"]["domain"],
            # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
            # and returns a naive datetime.
            "timestamp": datetime.datetime.now(datetime.timezone.utc),
            "title": extracted_data["metadata"]["title"],
            "description": extracted_data["metadata"]["description"],

            # LLM-optimized content structure
            "content": {
                "text_summary": extracted_data["text_summary"],
                "content_blocks": extracted_data["content"],
                "headings": extracted_data["metadata"]["headings"],
                "structure_info": extracted_data["structure"]
            },

            # Relationship data
            "relationships": {
                "internal_links": [link for link in extracted_data["links"] if link["internal"]],
                "external_links": [link for link in extracted_data["links"] if not link["internal"]],
                "images": extracted_data["images"]
            },

            # DOM analysis for advanced processing
            "dom_analysis": {
                "tree_structure": dom_structure["tree"],
                "statistics": dom_structure["statistics"],
                "semantic_structure": dom_structure["semantic_structure"],
                "content_blocks": dom_structure["content_blocks"]
            },

            # Study-friendly metadata
            "study_metadata": {
                "reading_time": self._estimate_reading_time(extracted_data["text_summary"]),
                "complexity_score": self._calculate_complexity_score(extracted_data),
                "content_type": self._identify_content_type(extracted_data),
                "key_topics": self._extract_key_topics(extracted_data)
            }
        }

        # Upsert: one document per URL (url has a unique index).
        result = self.collection.replace_one(
            {"url": url},
            document,
            upsert=True
        )

        return str(result.upserted_id or result.matched_count)

    def get_page_data(self, url: str) -> Optional[Dict]:
        """Retrieve a single page document by URL, or None if not stored."""
        return self.collection.find_one({"url": url})

    def get_pages_by_domain(self, domain: str) -> List[Dict]:
        """Get all stored pages for a specific domain."""
        return list(self.collection.find({"domain": domain}))

    def search_pages(self, query: str, limit: int = 10) -> List[Dict]:
        """Case-insensitive substring search over title/description/summary.

        The user-supplied query is escaped so regex metacharacters
        (e.g. '(', '*', '?') are matched literally instead of being
        interpreted — previously such input crashed or misfired the query.
        """
        import re  # local import: only needed here, keeps module deps unchanged
        safe_query = re.escape(query)
        search_filter = {
            "$or": [
                {"title": {"$regex": safe_query, "$options": "i"}},
                {"description": {"$regex": safe_query, "$options": "i"}},
                {"content.text_summary": {"$regex": safe_query, "$options": "i"}}
            ]
        }

        return list(self.collection.find(search_filter).limit(limit))

    def _estimate_reading_time(self, text: str) -> int:
        """Estimate reading time in whole minutes (250 words/minute, min 1)."""
        word_count = len(text.split())
        return max(1, word_count // 250)

    def _calculate_complexity_score(self, data: Dict) -> float:
        """Heuristic complexity score (0–10) used as an LLM processing hint."""
        score = 0.0

        # Text length factor (caps at 5.0 for >= 5000 chars)
        text_length = len(data["text_summary"])
        score += min(text_length / 1000, 5.0)

        # Structure complexity (caps at 3.0 for >= 30 blocks)
        content_blocks = len(data["content"])
        score += min(content_blocks / 10, 3.0)

        # Link density (caps at 2.0 for >= 40 links)
        total_links = len(data["links"])
        score += min(total_links / 20, 2.0)

        return round(score, 2)

    def _identify_content_type(self, data: Dict) -> str:
        """Classify the page by keyword sniffing of title/summary text."""
        title = data["metadata"]["title"].lower()
        text = data["text_summary"].lower()

        if any(word in title or word in text for word in ["tutorial", "guide", "how to"]):
            return "tutorial"
        elif any(word in title or word in text for word in ["news", "article", "report"]):
            return "article"
        elif any(word in title or word in text for word in ["documentation", "docs", "reference"]):
            return "documentation"
        elif any(word in title or word in text for word in ["blog", "post", "opinion"]):
            return "blog_post"
        else:
            return "general"

    def _extract_key_topics(self, data: Dict) -> List[str]:
        """Extract up to 10 keyword topics from the title and headings.

        Simple heuristic: any lowercase word longer than 3 characters,
        deduplicated via a set (so ordering is arbitrary).
        """
        topics = set()

        # From title
        title_words = data["metadata"]["title"].split()
        topics.update([word.lower() for word in title_words if len(word) > 3])

        # From headings
        for heading in data["metadata"]["headings"]:
            heading_words = heading["text"].split()
            topics.update([word.lower() for word in heading_words if len(word) > 3])

        return list(topics)[:10]  # Limit to top 10 topics
storage/neo4j_storage.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from neo4j import GraphDatabase
2
+ from typing import Dict, List
3
+ from urllib.parse import urlparse
4
+ from config.settings import settings
5
+
6
class Neo4jStorage:
    """Neo4j graph layer: stores pages, domains, headings, content blocks,
    links and semantic-structure nodes for relationship queries."""

    def __init__(self):
        self.driver = GraphDatabase.driver(
            settings.database.neo4j_uri,
            auth=(settings.database.neo4j_user, settings.database.neo4j_password)
        )
        self._create_constraints()

    def _create_constraints(self):
        """Create constraints and indexes for better performance.

        Each DDL statement is attempted independently: previously one
        failure (e.g. unsupported syntax on an older server) silently
        skipped all remaining statements.
        """
        statements = [
            "CREATE CONSTRAINT page_url IF NOT EXISTS FOR (p:Page) REQUIRE p.url IS UNIQUE",
            "CREATE CONSTRAINT domain_name IF NOT EXISTS FOR (d:Domain) REQUIRE d.name IS UNIQUE",
            "CREATE INDEX page_title IF NOT EXISTS FOR (p:Page) ON (p.title)",
        ]
        with self.driver.session() as session:
            for statement in statements:
                try:
                    session.run(statement)
                except Exception:
                    # Best-effort setup: constraints/indexes may already exist.
                    pass

    def store_relationships(self, url: str, extracted_data: Dict, dom_structure: Dict):
        """Store page relationships and structure in Neo4j."""
        with self.driver.session() as session:
            # Create main page node
            self._create_page_node(session, url, extracted_data)

            # Create domain relationships
            self._create_domain_relationships(session, url, extracted_data)

            # Create content relationships
            self._create_content_relationships(session, url, extracted_data)

            # Create link relationships
            self._create_link_relationships(session, url, extracted_data["links"])

            # Create DOM structure relationships
            self._create_dom_relationships(session, url, dom_structure)

    def _create_page_node(self, session, url: str, data: Dict):
        """Create or update the page node with LLM-friendly properties."""
        query = """
        MERGE (p:Page {url: $url})
        SET p.title = $title,
            p.description = $description,
            p.domain = $domain,
            p.content_type = $content_type,
            p.complexity_score = $complexity_score,
            p.reading_time = $reading_time,
            p.word_count = $word_count,
            p.last_scraped = datetime()
        """

        session.run(query, {
            "url": url,
            "title": data["metadata"]["title"],
            "description": data["metadata"]["description"],
            "domain": data["metadata"]["domain"],
            "content_type": self._identify_content_type(data),
            "complexity_score": self._calculate_complexity_score(data),
            # 250 words/minute reading-speed heuristic
            "reading_time": len(data["text_summary"].split()) // 250,
            "word_count": len(data["text_summary"].split())
        })

    def _create_domain_relationships(self, session, url: str, data: Dict):
        """Create the domain node and link the page to it."""
        domain = data["metadata"]["domain"]

        # Create domain node
        session.run("""
            MERGE (d:Domain {name: $domain})
            SET d.last_updated = datetime()
        """, {"domain": domain})

        # Link page to domain
        session.run("""
            MATCH (p:Page {url: $url})
            MATCH (d:Domain {name: $domain})
            MERGE (p)-[:BELONGS_TO]->(d)
        """, {"url": url, "domain": domain})

    def _create_content_relationships(self, session, url: str, data: Dict):
        """Create heading and content-block nodes for LLM understanding."""
        # Create topic nodes from headings
        for i, heading in enumerate(data["metadata"]["headings"]):
            session.run("""
                MATCH (p:Page {url: $url})
                MERGE (h:Heading {text: $text, level: $level, page_url: $url})
                SET h.position = $position
                MERGE (p)-[:HAS_HEADING]->(h)
            """, {
                "url": url,
                "text": heading["text"],
                "level": heading["level"],
                "position": i
            })

        # Create content block relationships
        for i, block in enumerate(data["content"][:10]):  # Limit for performance
            session.run("""
                MATCH (p:Page {url: $url})
                MERGE (c:ContentBlock {text: $text, page_url: $url, position: $position})
                SET c.tag = $tag,
                    c.length = $length
                MERGE (p)-[:HAS_CONTENT]->(c)
            """, {
                "url": url,
                "text": block["text"][:500],  # Truncate for storage
                "tag": block["tag"],
                "length": len(block["text"]),
                "position": i
            })

    def _create_link_relationships(self, session, url: str, links: List[Dict]):
        """Create link relationships for navigation understanding."""
        for link in links[:20]:  # Limit for performance
            target_url = link["url"]
            link_text = link["text"]
            is_internal = link["internal"]

            # Create target page node (minimal)
            session.run("""
                MERGE (target:Page {url: $target_url})
                SET target.discovered_via = $source_url
            """, {"target_url": target_url, "source_url": url})

            # Relationship type comes from a fixed two-value internal set, so
            # interpolating it into the Cypher string is safe (types cannot be
            # parameterized in Cypher).
            relationship_type = "LINKS_TO_INTERNAL" if is_internal else "LINKS_TO_EXTERNAL"
            session.run(f"""
                MATCH (source:Page {{url: $source_url}})
                MATCH (target:Page {{url: $target_url}})
                MERGE (source)-[r:{relationship_type}]->(target)
                SET r.link_text = $link_text,
                    r.is_internal = $is_internal
            """, {
                "source_url": url,
                "target_url": target_url,
                "link_text": link_text,
                "is_internal": is_internal
            })

    def _create_dom_relationships(self, session, url: str, dom_structure: Dict):
        """Create semantic-element nodes describing the DOM hierarchy."""
        # Create semantic structure nodes
        semantic_elements = dom_structure["semantic_structure"]["semantic_elements"]
        for tag, count in semantic_elements.items():
            if count > 0:
                session.run("""
                    MATCH (p:Page {url: $url})
                    MERGE (s:SemanticElement {tag: $tag, page_url: $url})
                    SET s.count = $count
                    MERGE (p)-[:HAS_SEMANTIC_ELEMENT]->(s)
                """, {"url": url, "tag": tag, "count": count})

    def get_page_relationships(self, url: str) -> Dict:
        """Get all relationships for a page for LLM context.

        Returns {} when the page is not stored.
        """
        with self.driver.session() as session:
            result = session.run("""
                MATCH (p:Page {url: $url})
                OPTIONAL MATCH (p)-[:LINKS_TO_INTERNAL]->(internal:Page)
                OPTIONAL MATCH (p)-[:LINKS_TO_EXTERNAL]->(external:Page)
                OPTIONAL MATCH (p)-[:HAS_HEADING]->(h:Heading)
                RETURN p, collect(DISTINCT internal.url) as internal_links,
                       collect(DISTINCT external.url) as external_links,
                       collect(DISTINCT {text: h.text, level: h.level}) as headings
            """, {"url": url})

            record = result.single()
            if record:
                return {
                    "page": dict(record["p"]),
                    "internal_links": record["internal_links"],
                    "external_links": record["external_links"],
                    "headings": record["headings"]
                }
            return {}

    def get_related_pages(self, url: str, limit: int = 5) -> List[Dict]:
        """Find same-domain pages for LLM context and study suggestions,
        ordered by descending complexity score."""
        with self.driver.session() as session:
            result = session.run("""
                MATCH (p:Page {url: $url})
                MATCH (p)-[:BELONGS_TO]->(d:Domain)
                MATCH (related:Page)-[:BELONGS_TO]->(d)
                WHERE related.url <> $url
                RETURN related.url as url, related.title as title,
                       related.content_type as content_type,
                       related.complexity_score as complexity_score
                ORDER BY related.complexity_score DESC
                LIMIT $limit
            """, {"url": url, "limit": limit})

            return [dict(record) for record in result]

    def _identify_content_type(self, data: Dict) -> str:
        """Classify content type from title keywords for graph relationships."""
        title = data["metadata"]["title"].lower()
        if "tutorial" in title or "guide" in title:
            return "tutorial"
        elif "documentation" in title or "docs" in title:
            return "documentation"
        elif "blog" in title or "article" in title:
            return "article"
        return "general"

    def _calculate_complexity_score(self, data: Dict) -> float:
        """Complexity score (capped at 10.0) used for relationship weighting."""
        text_length = len(data["text_summary"])
        content_blocks = len(data["content"])
        return min(text_length / 1000 + content_blocks / 10, 10.0)

    def close(self):
        """Close the database connection."""
        self.driver.close()
+ self.driver.close()