Spaces:
Sleeping
Sleeping
Commit
·
feea636
1
Parent(s):
80a0aa2
add files
Browse files
- .gitignore +49 -0
- Dockerfile +34 -0
- app.py +211 -0
- main.py +184 -0
- requirements.txt +16 -0
- scraper/__init__.py +0 -0
- scraper/data_extractor.py +176 -0
- scraper/dom_analyzer.py +162 -0
- scraper/html_loader.py +68 -0
- server.py +33 -0
- storage/__init__.py +0 -0
- storage/mongo_storage.py +143 -0
- storage/neo4j_storage.py +216 -0
.gitignore
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Sensitive files
|
| 2 |
+
certification.pem
|
| 3 |
+
*.pem
|
| 4 |
+
*.key
|
| 5 |
+
*.crt
|
| 6 |
+
*.cert
|
| 7 |
+
|
| 8 |
+
# Python temporary files
|
| 9 |
+
__pycache__/
|
| 10 |
+
*.pyc
|
| 11 |
+
*.pyo
|
| 12 |
+
*.pyd
|
| 13 |
+
.Python
|
| 14 |
+
pip-log.txt
|
| 15 |
+
pip-delete-this-directory.txt
|
| 16 |
+
|
| 17 |
+
# Virtual environments
|
| 18 |
+
venv/
|
| 19 |
+
env/
|
| 20 |
+
.venv/
|
| 21 |
+
.env
|
| 22 |
+
|
| 23 |
+
# Docker artifacts
|
| 24 |
+
*.dockerignore
|
| 25 |
+
Dockerfile.bak
|
| 26 |
+
*.log
|
| 27 |
+
|
| 28 |
+
# Editor and IDE files
|
| 29 |
+
.vscode/
|
| 30 |
+
.idea/
|
| 31 |
+
*.sublime-project
|
| 32 |
+
*.sublime-workspace
|
| 33 |
+
|
| 34 |
+
# System files
|
| 35 |
+
.DS_Store
|
| 36 |
+
Thumbs.db
|
| 37 |
+
|
| 38 |
+
# Local development and testing
|
| 39 |
+
*.swp
|
| 40 |
+
*.swo
|
| 41 |
+
*.tmp
|
| 42 |
+
*.bak
|
| 43 |
+
*.backup
|
| 44 |
+
|
| 45 |
+
# Cache and temporary directories
|
| 46 |
+
.cache/
|
| 47 |
+
*.cache
|
| 48 |
+
*.egg-info/
|
| 49 |
+
dist
|
Dockerfile
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9

# System libraries that Playwright's bundled browsers link against
# (kept alphabetical; apt-get install is order-independent).
RUN apt-get update && apt-get install -y \
    libasound2 \
    libatk-bridge2.0-0 \
    libatk1.0-0 \
    libcairo2 \
    libgbm1 \
    libnss3 \
    libpango-1.0-0 \
    libxcomposite1 \
    libxdamage1 \
    libxrandr2 \
    libxshmfence1 \
    && rm -rf /var/lib/apt/lists/*

# Run as an unprivileged user with uid 1000 (Hugging Face Spaces convention)
# and make user-level pip installs resolvable.
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Install Python dependencies first so this layer stays cached across
# source-only edits.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Download Playwright's browser binaries into the user's home directory.
RUN playwright install

COPY --chown=user . /app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from fastapi import FastAPI, HTTPException, BackgroundTasks
from pydantic import BaseModel, HttpUrl
from typing import List, Dict, Optional
import asyncio
from main import WebScrapingOrchestrator

app = FastAPI(
    title="Advanced Web Scraper for LLM",
    description="Scrape, analyze, and store web content optimized for LLM consumption",
    version="1.0.0"
)

# Global orchestrator instance shared by all request handlers.
orchestrator = WebScrapingOrchestrator()

# ---------------------------------------------------------------- request models

class URLRequest(BaseModel):
    url: HttpUrl

class SearchRequest(BaseModel):
    query: str
    limit: int = 5

class BatchURLRequest(BaseModel):
    urls: List[HttpUrl]

# --------------------------------------------------------------- response models

class ScrapingResponse(BaseModel):
    success: bool
    url: str
    title: Optional[str] = None
    summary: Optional[Dict] = None
    llm_ready_data: Optional[Dict] = None
    error: Optional[str] = None

class SearchResponse(BaseModel):
    results: List[Dict]
    total_found: int

@app.post("/scrape", response_model=ScrapingResponse)
async def scrape_url(request: URLRequest):
    """Scrape a single URL and store data optimized for LLM consumption."""
    try:
        result = await orchestrator.process_url(str(request.url))

        if "error" in result:
            raise HTTPException(status_code=400, detail=result["error"])

        return ScrapingResponse(**result)

    except HTTPException:
        # BUG FIX: re-raise HTTP errors unchanged — the generic handler below
        # previously caught the 400 above and re-wrapped it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")

@app.post("/scrape-batch")
async def scrape_batch_urls(request: BatchURLRequest, background_tasks: BackgroundTasks):
    """Scrape multiple URLs in the background."""
    async def process_batch():
        # Best-effort: a failure on one URL is recorded and does not stop the rest.
        results = []
        for url in request.urls:
            try:
                result = await orchestrator.process_url(str(url))
                results.append(result)
            except Exception as e:
                results.append({"error": str(e), "url": str(url)})
        return results

    # Fire-and-forget; the caller only gets an acknowledgement.
    background_tasks.add_task(process_batch)

    return {
        "message": f"Started processing {len(request.urls)} URLs in background",
        "urls": [str(url) for url in request.urls]
    }

@app.get("/page/{url:path}")
async def get_page_data(url: str):
    """Get processed page data optimized for LLM consumption."""
    try:
        # The path parameter arrives percent-encoded; decode it first.
        import urllib.parse
        decoded_url = urllib.parse.unquote(url)

        page_data = orchestrator.get_page_for_llm(decoded_url)

        if not page_data:
            raise HTTPException(status_code=404, detail="Page not found")

        return page_data

    except HTTPException:
        # BUG FIX: keep the 404 as a 404 instead of re-wrapping it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Retrieval failed: {str(e)}")

@app.post("/search", response_model=SearchResponse)
async def search_content(request: SearchRequest):
    """Search stored content for LLM context."""
    try:
        results = orchestrator.search_for_llm(request.query, request.limit)

        return SearchResponse(
            results=results,
            total_found=len(results)
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}")

@app.get("/llm-ready/{url:path}")
async def get_llm_ready_content(url: str):
    """Get content specifically formatted for LLM consumption."""
    try:
        import urllib.parse
        decoded_url = urllib.parse.unquote(url)

        page_data = orchestrator.get_page_for_llm(decoded_url)

        if not page_data:
            raise HTTPException(status_code=404, detail="Page not found")

        # Re-shape the stored record into an LLM-oriented envelope.
        llm_content = {
            "instruction": "Use this content for generating summaries, notes, or mind maps",
            "content": {
                "title": page_data["title"],
                "main_content": page_data["content"],
                "structure": {
                    "headings": page_data["headings"],
                    "content_type": page_data["study_metadata"]["content_type"],
                    "complexity": page_data["study_metadata"]["complexity_score"],
                    "reading_time": page_data["study_metadata"]["reading_time"]
                },
                "context": {
                    "related_pages": page_data["relationships"]["related_pages"],
                    "key_topics": page_data["study_metadata"]["key_topics"]
                }
            },
            "suggestions": {
                "study_approach": _get_study_approach(page_data["study_metadata"]),
                "focus_areas": page_data["headings"][:3],
                "difficulty_level": _assess_difficulty(page_data["study_metadata"])
            }
        }

        return llm_content

    except HTTPException:
        # BUG FIX: keep the 404 as a 404 instead of re-wrapping it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"LLM formatting failed: {str(e)}")

@app.get("/health")
async def health_check():
    """Health check endpoint."""
    return {"status": "healthy", "message": "Web scraper API is running"}

@app.get("/stats")
async def get_statistics():
    """Get scraping statistics."""
    try:
        # Cheap approximate count straight from MongoDB.
        mongo_stats = orchestrator.mongo_storage.collection.estimated_document_count()

        return {
            "total_pages_scraped": mongo_stats,
            "database_status": "connected",
            "features": [
                "Dynamic content scraping with Playwright",
                "DOM structure analysis",
                "MongoDB storage for content",
                "Neo4j for relationships",
                "LLM-optimized data extraction"
            ]
        }

    except Exception as e:
        # Deliberately best-effort: stats failures are reported, not raised.
        return {"error": f"Stats retrieval failed: {str(e)}"}
| 175 |
+
def _get_study_approach(metadata: Dict) -> str:
|
| 176 |
+
"""Suggest study approach based on content analysis"""
|
| 177 |
+
content_type = metadata.get("content_type", "general")
|
| 178 |
+
complexity = metadata.get("complexity_score", 0)
|
| 179 |
+
|
| 180 |
+
if content_type == "tutorial":
|
| 181 |
+
return "hands-on practice with step-by-step approach"
|
| 182 |
+
elif content_type == "documentation":
|
| 183 |
+
return "reference-based learning with examples"
|
| 184 |
+
elif content_type == "research":
|
| 185 |
+
return "analytical reading with note-taking"
|
| 186 |
+
elif complexity > 5:
|
| 187 |
+
return "detailed study with concept mapping"
|
| 188 |
+
else:
|
| 189 |
+
return "general reading with summary creation"
|
| 190 |
+
|
| 191 |
+
def _assess_difficulty(metadata: Dict) -> str:
|
| 192 |
+
"""Assess content difficulty for LLM processing hints"""
|
| 193 |
+
complexity = metadata.get("complexity_score", 0)
|
| 194 |
+
reading_time = metadata.get("reading_time", 0)
|
| 195 |
+
|
| 196 |
+
if complexity < 2 and reading_time < 5:
|
| 197 |
+
return "beginner"
|
| 198 |
+
elif complexity < 5 and reading_time < 15:
|
| 199 |
+
return "intermediate"
|
| 200 |
+
else:
|
| 201 |
+
return "advanced"
|
| 202 |
+
|
@app.on_event("shutdown")
async def shutdown_event():
    """Close database connections when the server stops."""
    orchestrator.close_connections()

# Run the API directly (the Dockerfile serves the same "app:app" target).
if __name__ == "__main__":
    import uvicorn
    # BUG FIX: this module is app.py, so the import string must be "app:app".
    # The previous "api:app" made `python app.py` fail with
    # "Could not import module 'api'".
    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)
main.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
from typing import Dict, Optional,List
|
| 3 |
+
from scraper.html_loader import HTMLLoader
|
| 4 |
+
from scraper.data_extractor import DataExtractor
|
| 5 |
+
from scraper.dom_analyzer import DOMAnalyzer
|
| 6 |
+
from storage.mongo_storage import MongoStorage
|
| 7 |
+
# from storage.neo4j_storage import Neo4jStorage
|
| 8 |
+
|
class WebScrapingOrchestrator:
    """Coordinates the scrape -> extract -> analyze -> store pipeline.

    Neo4j support is currently disabled (its import and attribute are
    commented out), so every Neo4j access below is guarded with getattr;
    re-enabling it in __init__ restores relationship data automatically.
    """

    def __init__(self):
        self.data_extractor = DataExtractor()
        self.dom_analyzer = DOMAnalyzer()
        self.mongo_storage = MongoStorage()
        # self.neo4j_storage = Neo4jStorage()

    async def process_url(self, url: str) -> Dict:
        """Complete pipeline to process a URL for LLM consumption.

        Returns an LLM-ready summary dict on success, or
        {"error": ..., "url": ...} on failure (never raises).
        """
        try:
            print(f"Processing URL: {url}")

            # Step 1: Load HTML content
            async with HTMLLoader() as loader:
                html_data = await loader.load_page(url)

            if not html_data:
                return {"error": "Failed to load page"}

            print("✓ HTML loaded successfully")

            # Step 2: Extract structured data
            extracted_data = self.data_extractor.extract_structured_data(
                html_data["html"],
                html_data["url"]
            )

            print("✓ Data extracted successfully")

            # Step 3: Analyze DOM structure
            dom_structure = self.dom_analyzer.analyze_structure(html_data["html"])

            print("✓ DOM structure analyzed")

            # Step 4: Store in MongoDB
            mongo_id = self.mongo_storage.store_page_data(
                html_data["url"],
                extracted_data,
                dom_structure
            )

            print("✓ Data stored in MongoDB")

            # Step 5: Neo4j relationship storage — currently disabled.
            # BUG FIX: the old code still printed "✓ Relationships stored in
            # Neo4j" even though this step is commented out; that false status
            # line has been removed.
            # self.neo4j_storage.store_relationships(
            #     html_data["url"],
            #     extracted_data,
            #     dom_structure
            # )

            # Return LLM-ready summary
            return {
                "success": True,
                "url": html_data["url"],
                "title": html_data["title"],
                "mongo_id": mongo_id,
                "summary": {
                    "content_blocks": len(extracted_data["content"]),
                    "text_length": len(extracted_data["text_summary"]),
                    "links_found": len(extracted_data["links"]),
                    "images_found": len(extracted_data["images"]),
                    "dom_depth": dom_structure["statistics"]["max_depth"],
                    "content_type": self._identify_content_type(extracted_data)
                },
                "llm_ready_data": {
                    "text_summary": extracted_data["text_summary"],
                    "key_headings": [h["text"] for h in extracted_data["metadata"]["headings"][:5]],
                    "main_topics": self._extract_main_topics(extracted_data),
                    "study_hints": self._generate_study_hints(extracted_data, dom_structure)
                }
            }

        except Exception as e:
            print(f"✗ Error processing {url}: {str(e)}")
            return {"error": str(e), "url": url}

    def get_page_for_llm(self, url: str) -> Optional[Dict]:
        """Retrieve page data optimized for LLM consumption.

        Returns None when the page has not been scraped yet.
        """
        # Get from MongoDB
        mongo_data = self.mongo_storage.get_page_data(url)
        if not mongo_data:
            return None

        # BUG FIX: self.neo4j_storage is never assigned while Neo4j is
        # disabled, so the unguarded attribute access used to raise
        # AttributeError on every call. Fall back to empty relationships.
        neo4j = getattr(self, "neo4j_storage", None)
        neo4j_data = neo4j.get_page_relationships(url) if neo4j else {}

        # Combine for LLM
        return {
            "content": mongo_data["content"]["text_summary"],
            "title": mongo_data["title"],
            "headings": [h["text"] for h in mongo_data["content"]["headings"]],
            "structure": mongo_data["study_metadata"],
            "relationships": {
                "related_pages": neo4j_data.get("internal_links", [])[:5],
                "external_references": neo4j_data.get("external_links", [])[:3]
            },
            "study_metadata": mongo_data["study_metadata"]
        }

    def search_for_llm(self, query: str, limit: int = 5) -> List[Dict]:
        """Search stored content and trim each hit down to LLM-sized context."""
        results = self.mongo_storage.search_pages(query, limit)

        llm_ready_results = []
        for result in results:
            llm_ready_results.append({
                "url": result["url"],
                "title": result["title"],
                "summary": result["content"]["text_summary"][:500],
                "content_type": result["study_metadata"]["content_type"],
                "complexity": result["study_metadata"]["complexity_score"],
                "key_topics": result["study_metadata"]["key_topics"][:5]
            })

        return llm_ready_results

    def _identify_content_type(self, data: Dict) -> str:
        """Classify the page (tutorial/documentation/article/research/general)
        from keywords in the title, then the body text."""
        title = data["metadata"]["title"].lower()
        text = data["text_summary"].lower()

        if any(word in title for word in ["tutorial", "guide", "how to"]):
            return "tutorial"
        elif any(word in title for word in ["documentation", "docs", "api"]):
            return "documentation"
        elif any(word in title for word in ["blog", "article", "news"]):
            return "article"
        elif any(word in text for word in ["research", "study", "analysis"]):
            return "research"
        return "general"

    def _extract_main_topics(self, data: Dict) -> List[str]:
        """Extract up to 5 topic words (length > 3) from title and headings.

        NOTE: set ordering is hash-dependent, so the order of the result is
        not stable across processes — only membership is.
        """
        topics = set()

        # From title
        title_words = [word for word in data["metadata"]["title"].split() if len(word) > 3]
        topics.update(title_words[:3])

        # From headings
        for heading in data["metadata"]["headings"][:3]:
            heading_words = [word for word in heading["text"].split() if len(word) > 3]
            topics.update(heading_words[:2])

        return list(topics)[:5]

    def _generate_study_hints(self, extracted_data: Dict, dom_structure: Dict) -> Dict:
        """Generate study hints for LLM processing (difficulty, time, structure)."""
        return {
            "difficulty_level": "beginner" if len(extracted_data["text_summary"]) < 2000 else "intermediate",
            # Rough estimate at ~250 words per minute.
            "estimated_study_time": f"{len(extracted_data['text_summary'].split()) // 250} minutes",
            "content_structure": "well_structured" if len(extracted_data["metadata"]["headings"]) > 3 else "basic",
            "has_examples": "code" in extracted_data["text_summary"].lower(),
            "interactive_elements": dom_structure["statistics"]["tag_distribution"].get("form", 0) > 0
        }

    def close_connections(self):
        """Close all database connections.

        BUG FIX: previously called self.neo4j_storage.close() unconditionally,
        which raised AttributeError because that attribute is commented out.
        """
        neo4j = getattr(self, "neo4j_storage", None)
        if neo4j:
            neo4j.close()
# Main execution function
async def main():
    """Run the full pipeline once against a sample page, then clean up."""
    orchestrator = WebScrapingOrchestrator()

    # Example usage: one well-known Wikipedia article.
    sample_url = "https://en.wikipedia.org/wiki/Virat_Kohli"
    result = await orchestrator.process_url(sample_url)
    print(f"Processing result: {result}")

    # Release database connections before the interpreter exits.
    orchestrator.close_connections()

if __name__ == "__main__":
    asyncio.run(main())
requirements.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.104.1
|
| 2 |
+
uvicorn==0.24.0
|
| 3 |
+
playwright==1.40.0
|
| 4 |
+
beautifulsoup4==4.12.2
|
| 5 |
+
pymongo==4.6.0
|
| 6 |
+
neo4j==5.15.0
|
| 7 |
+
pydantic==2.5.2
|
| 8 |
+
python-multipart==0.0.6
|
| 9 |
+
aiofiles==23.2.1
|
| 10 |
+
requests==2.31.0
|
| 11 |
+
lxml==4.9.3
|
| 12 |
+
newspaper3k==0.2.8
|
| 13 |
+
readability-lxml==0.8.1
|
| 14 |
+
python-dotenv==1.0.0
|
| 15 |
+
nltk==3.8.1
|
| 16 |
+
spacy==3.7.2
|
scraper/__init__.py
ADDED
|
File without changes
|
scraper/data_extractor.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bs4 import BeautifulSoup, Comment
|
| 2 |
+
from typing import Dict, List, Optional
|
| 3 |
+
import re
|
| 4 |
+
from urllib.parse import urljoin, urlparse
|
| 5 |
+
|
class DataExtractor:
    """Extracts LLM-oriented structured data (content, metadata, links,
    images, plain-text summary) from raw HTML with BeautifulSoup."""

    def __init__(self):
        # Elements stripped before extraction: chrome, ads, scripts, forms.
        self.ignore_selectors = [
            '.advertisement',
            '.ad',
            '.banner',
            '.popup',
            '#footer',
            '.footer',
            '.sidebar',
            'nav',
            '.navbar',
            '.menu',
            'header',
            '#header',
            'script',
            'style',
            'noscript',
            'iframe',
            'meta',
            'link',
            '[class*="ad-"]',
            '[id*="ad-"]',
            '.cookie-notice',
            '.modal',
            'form',
            'input',
            'button',
            '.social-media',
            '.comments-section',
            '.widget'
        ]
        # Candidate containers scanned for substantial text blocks.
        # NOTE(review): these overlap (e.g. 'article' and its inner 'p'), so
        # the same text can appear in several content blocks — apparently
        # accepted for recall; confirm before deduplicating.
        self.content_selectors = [
            '.main-content',
            'article',
            'p',
            'h1',
            'h2',
            'h3',
            'h4',
            'h5',
            'h6',
            'div.content',
            '.post',
            '.article-body',
            '.content-body',
            'section',
            'main',
            'ul',
            'ol',
            'li',
            'table',
            'td',
            'th',
            'blockquote',
            'pre',
            '.text',
            '[class*="content"]',
            '[class*="post"]',
            '[class*="article"]',
            'div:not([class*="ad"]):not([class*="banner"]):not([class*="sidebar"])'
        ]
        # Minimum characters for an element to count as a content block.
        self.min_text_length = 200

    def extract_structured_data(self, html: str, url: str) -> Dict:
        """Extract structured data from HTML for LLM consumption."""
        soup = BeautifulSoup(html, 'lxml')

        # Remove unwanted elements
        self._clean_html(soup)

        return {
            "content": self._extract_content(soup),
            "metadata": self._extract_metadata(soup, url),
            "structure": self._extract_structure(soup),
            "links": self._extract_links(soup, url),
            "images": self._extract_images(soup, url),
            "text_summary": self._extract_text_summary(soup)
        }

    def _clean_html(self, soup: BeautifulSoup):
        """Remove unwanted elements for cleaner extraction (in place)."""
        for selector in self.ignore_selectors:
            for element in soup.select(selector):
                element.decompose()

        # Remove HTML comments.
        # BUG FIX: the `text=` keyword has been deprecated in favor of
        # `string=` since BeautifulSoup 4.4; find_all(string=...) is the
        # supported spelling and behaves identically here.
        for element in soup.find_all(string=lambda text: isinstance(text, Comment)):
            element.extract()

    def _extract_content(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract main content blocks (text runs >= min_text_length)."""
        content_blocks = []

        for selector in self.content_selectors:
            elements = soup.select(selector)
            for elem in elements:
                text = elem.get_text(strip=True)
                if len(text) >= self.min_text_length:
                    content_blocks.append({
                        "tag": elem.name,
                        "text": text,
                        "html": str(elem),
                        "attributes": dict(elem.attrs) if elem.attrs else {}
                    })

        return content_blocks

    def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict:
        """Extract page metadata (title, description, domain, headings)."""
        title = soup.find('title')
        meta_desc = soup.find('meta', attrs={'name': 'description'})

        return {
            "title": title.get_text().strip() if title else "",
            "description": meta_desc.get('content', '') if meta_desc else "",
            "url": url,
            "domain": urlparse(url).netloc,
            "headings": self._extract_headings(soup)
        }

    def _extract_headings(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract heading hierarchy (h1..h6) for structure."""
        headings = []
        for i in range(1, 7):
            for heading in soup.find_all(f'h{i}'):
                headings.append({
                    "level": i,
                    "text": heading.get_text().strip(),
                    "id": heading.get('id', '')
                })
        return headings

    def _extract_structure(self, soup: BeautifulSoup) -> Dict:
        """Extract coarse DOM element counts for relationship analysis."""
        return {
            "sections": len(soup.find_all(['section', 'article', 'div'])),
            "paragraphs": len(soup.find_all('p')),
            "lists": len(soup.find_all(['ul', 'ol'])),
            "tables": len(soup.find_all('table')),
            "forms": len(soup.find_all('form'))
        }

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract absolute links, flagging same-domain ones as internal."""
        links = []
        for link in soup.find_all('a', href=True):
            href = urljoin(base_url, link['href'])
            links.append({
                "url": href,
                "text": link.get_text().strip(),
                "internal": urlparse(href).netloc == urlparse(base_url).netloc
            })
        return links[:50]  # Limit for performance

    def _extract_images(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract images with alt/caption context."""
        images = []
        for img in soup.find_all('img', src=True):
            images.append({
                "src": urljoin(base_url, img['src']),
                "alt": img.get('alt', ''),
                "caption": img.get('title', '')
            })
        return images[:20]  # Limit for performance

    def _extract_text_summary(self, soup: BeautifulSoup) -> str:
        """Extract whitespace-normalized plain text, capped at 5000 chars."""
        text = soup.get_text()
        # Clean whitespace and normalize
        text = re.sub(r'\s+', ' ', text).strip()
        return text[:5000]  # Limit for token efficiency
scraper/dom_analyzer.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bs4 import BeautifulSoup
|
| 2 |
+
from typing import Dict, List
|
| 3 |
+
import hashlib
|
| 4 |
+
|
| 5 |
+
class DOMAnalyzer:
    """Analyze an HTML document's DOM and summarize it for LLM consumption.

    Produces a depth/width-bounded DOM tree, tag statistics, semantic
    structure information, and a ranked list of likely main-content blocks.
    """

    def __init__(self):
        pass

    def analyze_structure(self, html: str) -> Dict:
        """Analyze DOM structure and create tree representation.

        Returns a dict with keys: tree, statistics, semantic_structure,
        content_blocks.
        """
        soup = BeautifulSoup(html, 'lxml')

        return {
            # Prefer <body> as the tree root; fall back to the whole document.
            "tree": self._build_dom_tree(soup.body if soup.body else soup),
            "statistics": self._get_dom_statistics(soup),
            "semantic_structure": self._analyze_semantic_structure(soup),
            "content_blocks": self._identify_content_blocks(soup)
        }

    def _build_dom_tree(self, element, depth=0, max_depth=5) -> Dict:
        """Build a hierarchical DOM tree, bounded in depth and fan-out.

        Returns {} for non-tag nodes or when *max_depth* is exceeded.
        """
        if depth > max_depth or not element or not hasattr(element, 'name'):
            return {}

        # Hoist the (potentially expensive) text extraction so it runs once
        # per node instead of twice.
        text = element.get_text()
        node = {
            "tag": element.name if element.name else "text",
            "id": element.get('id', ''),
            "classes": element.get('class', []),
            "text_content": text[:100] if text else "",
            "children": [],
            "attributes": dict(element.attrs) if hasattr(element, 'attrs') else {},
            "depth": depth,
            # Short fingerprint of the node's serialized prefix for reference.
            "node_id": hashlib.md5(str(element)[:500].encode()).hexdigest()[:8]
        }

        # Add children (limit fan-out to prevent huge trees).
        if hasattr(element, 'children') and depth < max_depth:
            child_count = 0
            for child in element.children:
                if child_count >= 10:  # Limit children per node
                    break
                if hasattr(child, 'name') and child.name:
                    child_node = self._build_dom_tree(child, depth + 1, max_depth)
                    if child_node:
                        node["children"].append(child_node)
                        child_count += 1

        return node

    def _get_dom_statistics(self, soup: BeautifulSoup) -> Dict:
        """Get DOM statistics (element count, tag distribution, depth, text ratio)."""
        all_tags = soup.find_all()
        tag_counts = {}

        for tag in all_tags:
            tag_counts[tag.name] = tag_counts.get(tag.name, 0) + 1

        return {
            "total_elements": len(all_tags),
            "tag_distribution": tag_counts,
            "max_depth": self._calculate_max_depth(soup),
            "text_content_ratio": self._calculate_text_ratio(soup)
        }

    def _analyze_semantic_structure(self, soup: BeautifulSoup) -> Dict:
        """Analyze semantic HTML structure (header/nav/main/... usage)."""
        semantic_tags = ['header', 'nav', 'main', 'article', 'section', 'aside', 'footer']
        semantic_elements = {tag: len(soup.find_all(tag)) for tag in semantic_tags}

        return {
            "semantic_elements": semantic_elements,
            "has_semantic_structure": sum(semantic_elements.values()) > 0,
            "content_hierarchy": self._analyze_heading_hierarchy(soup)
        }

    def _identify_content_blocks(self, soup: BeautifulSoup) -> List[Dict]:
        """Identify main content blocks; returns top five, highest priority first."""
        content_blocks = []

        # Common containers that usually hold the primary page content.
        selectors = ['article', 'main', '.content', '#content', '.post', '.entry']

        for selector in selectors:
            for elem in soup.select(selector):
                if elem.get_text(strip=True):
                    content_blocks.append({
                        "selector": selector,
                        "tag": elem.name,
                        "text_length": len(elem.get_text()),
                        "element_id": elem.get('id', ''),
                        "classes": elem.get('class', []),
                        "priority": self._calculate_content_priority(elem)
                    })

        return sorted(content_blocks, key=lambda x: x['priority'], reverse=True)[:5]

    def _calculate_max_depth(self, soup: BeautifulSoup) -> int:
        """Calculate maximum DOM depth by walking tag children recursively."""
        def get_depth(element, current_depth=0):
            if not hasattr(element, 'children'):
                return current_depth

            max_child_depth = current_depth
            for child in element.children:
                if hasattr(child, 'name') and child.name:
                    max_child_depth = max(max_child_depth,
                                          get_depth(child, current_depth + 1))

            return max_child_depth

        return get_depth(soup)

    def _calculate_text_ratio(self, soup: BeautifulSoup) -> float:
        """Ratio of visible text length to total serialized HTML length."""
        text_length = len(soup.get_text())
        html_length = len(str(soup))
        # Guard against empty markup; return a float to match the annotation.
        return text_length / html_length if html_length > 0 else 0.0

    def _analyze_heading_hierarchy(self, soup: BeautifulSoup) -> List[Dict]:
        """List headings h1..h6.

        NOTE(review): headings are grouped by level, so "position" is the
        index within this level-ordered list, not document order.
        """
        headings = []
        for level in range(1, 7):
            for heading in soup.find_all(f'h{level}'):
                headings.append({
                    "level": level,
                    "text": heading.get_text().strip(),
                    "position": len(headings)
                })
        return headings

    def _calculate_content_priority(self, element) -> int:
        """Heuristic priority score for a content block.

        Longer text, semantic tags and content-ish class/id names score higher.
        """
        score = 0
        text_length = len(element.get_text())

        # Text length scoring: +1 per 100 chars, capped at 10.
        score += min(text_length // 100, 10)

        # Semantic tag bonus.
        if element.name in ['article', 'main']:
            score += 5
        elif element.name in ['section', 'div']:
            score += 2

        # Class/ID based scoring.
        classes = element.get('class', [])
        element_id = element.get('id', '')

        content_indicators = ['content', 'article', 'post', 'main', 'body']
        for indicator in content_indicators:
            if any(indicator in str(c).lower() for c in classes):
                score += 3
            if indicator in element_id.lower():
                score += 3

        return score
|
scraper/html_loader.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
from playwright.async_api import async_playwright
|
| 3 |
+
from typing import Dict, Optional
|
| 4 |
+
import time
|
| 5 |
+
|
| 6 |
+
class HTMLLoader:
    """Async context manager that fetches fully rendered HTML via Playwright.

    Usage::

        async with HTMLLoader() as loader:
            page_data = await loader.load_page(url)
    """

    def __init__(self):
        # Initialize all handles so __aexit__ is safe even if __aenter__
        # fails partway through startup.
        self.playwright = None
        self.browser = None
        self.context = None
        self.max_retries = 3            # attempts per URL before giving up
        self.timeout = 30000            # page.goto timeout, milliseconds
        self.wait_for_selector = "body" # selector that signals the page loaded
        self.delay_between_requests = 1.0  # back-off between retries, seconds

    async def __aenter__(self):
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=True
        )
        self.context = await self.browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        # Tear down in reverse order of creation; each handle may be None
        # if startup failed before it was assigned.
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()

    async def load_page(self, url: str) -> Dict[str, str]:
        """Load HTML content from *url*, handling static and dynamic sites.

        Retries up to ``max_retries`` times, then raises with the last error.
        Returns a dict with keys: html, title, url (final, post-redirect),
        timestamp (unix seconds).
        """
        for attempt in range(self.max_retries):
            page = None
            try:
                page = await self.context.new_page()
                await page.goto(url, timeout=self.timeout)

                # Wait for the body to load.
                await page.wait_for_selector(
                    self.wait_for_selector,
                    timeout=10000
                )

                # Additional wait for dynamic content rendered by JS.
                await page.wait_for_timeout(2000)

                return {
                    "html": await page.content(),
                    "title": await page.title(),
                    "url": page.url,
                    "timestamp": int(time.time())
                }

            except Exception as e:
                if attempt == self.max_retries - 1:
                    raise Exception(f"Failed to load {url}: {str(e)}") from e
                await asyncio.sleep(self.delay_between_requests)
            finally:
                # Always close the page, including on failure — otherwise
                # each failed attempt leaks an open tab in the browser.
                if page is not None:
                    await page.close()

        return None  # unreachable: the loop returns or raises
|
server.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import asyncio
|
| 3 |
+
from main import WebScrapingOrchestrator
|
| 4 |
+
|
| 5 |
+
orchestrator = WebScrapingOrchestrator()
|
| 6 |
+
|
| 7 |
+
async def scrape_async(url):
    """Scrape *url* through the shared orchestrator and shape the result
    for the Gradio JSON output.

    Returns an error string when the orchestrator reports a failure,
    otherwise a display dict.
    """
    result = await orchestrator.process_url(url)
    if "error" in result:
        return f"❌ Error: {result['error']}"

    summary = result["llm_ready_data"]["text_summary"]
    # Only append an ellipsis when the summary was actually truncated;
    # the original appended "..." even to summaries shorter than 800 chars.
    short_summary = summary[:800] + "..." if len(summary) > 800 else summary

    return {
        "URL": result.get("url"),
        "Title": result.get("title"),
        "Text Length": result["summary"]["text_length"],
        "Headings": result["llm_ready_data"]["key_headings"],
        "Main Topics": result["llm_ready_data"]["main_topics"],
        "Summary (Short)": short_summary
    }
|
| 19 |
+
|
| 20 |
+
def scrape(url):
    # Synchronous wrapper so Gradio can call the async scraper; asyncio.run
    # spins up a fresh event loop for each request.
    return asyncio.run(scrape_async(url))
|
| 22 |
+
|
| 23 |
+
# Build the Gradio UI: a URL textbox, a scrape button, and a JSON viewer
# for the LLM-ready result.
with gr.Blocks(title="MCP Web Scraper") as demo:
    gr.Markdown("### 🔍 MCP LLM Web Scraper")
    url_input = gr.Textbox(label="Enter a webpage URL", placeholder="https://...")
    output = gr.JSON(label="Scraped & LLM-ready Content")

    scrape_button = gr.Button("Scrape Page")
    # Wire the button to the synchronous wrapper around the async scraper.
    scrape_button.click(scrape, inputs=url_input, outputs=output)

if __name__ == "__main__":
    #demo.launch(server_name="0.0.0.0", server_port=7860)
    demo.launch()
|
storage/__init__.py
ADDED
|
File without changes
|
storage/mongo_storage.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pymongo import MongoClient
|
| 2 |
+
from typing import Dict, List, Optional
|
| 3 |
+
import datetime
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
class MongoStorage:
    """Persist scraped-page documents to MongoDB, shaped for LLM consumption.

    Connection settings come from the ``mongo_db_uri`` and ``mongo_db_name``
    environment variables.
    """

    def __init__(self):
        self.client = MongoClient(os.environ.get("mongo_db_uri"))
        self.db = self.client[os.environ.get("mongo_db_name")]
        self.collection = self.db.scraped_pages
        self._create_indexes()

    def _create_indexes(self):
        """Create indexes for better query performance."""
        self.collection.create_index("url", unique=True)
        self.collection.create_index("domain")
        self.collection.create_index("timestamp")
        self.collection.create_index("content.metadata.title")

    def store_page_data(self, url: str, extracted_data: Dict, dom_structure: Dict) -> str:
        """Upsert the complete page document for *url*.

        Returns the upserted id as a string, or the matched count when an
        existing document was replaced.
        """
        document = {
            "url": url,
            "domain": extracted_data["metadata"]["domain"],
            # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
            # (Python 3.12+) and produced naive datetimes.
            "timestamp": datetime.datetime.now(datetime.timezone.utc),
            "title": extracted_data["metadata"]["title"],
            "description": extracted_data["metadata"]["description"],

            # LLM-optimized content structure
            "content": {
                "text_summary": extracted_data["text_summary"],
                "content_blocks": extracted_data["content"],
                "headings": extracted_data["metadata"]["headings"],
                "structure_info": extracted_data["structure"]
            },

            # Relationship data
            "relationships": {
                "internal_links": [link for link in extracted_data["links"] if link["internal"]],
                "external_links": [link for link in extracted_data["links"] if not link["internal"]],
                "images": extracted_data["images"]
            },

            # DOM analysis for advanced processing
            "dom_analysis": {
                "tree_structure": dom_structure["tree"],
                "statistics": dom_structure["statistics"],
                "semantic_structure": dom_structure["semantic_structure"],
                "content_blocks": dom_structure["content_blocks"]
            },

            # Study-friendly metadata
            "study_metadata": {
                "reading_time": self._estimate_reading_time(extracted_data["text_summary"]),
                "complexity_score": self._calculate_complexity_score(extracted_data),
                "content_type": self._identify_content_type(extracted_data),
                "key_topics": self._extract_key_topics(extracted_data)
            }
        }

        # Upsert: one document per URL (backed by the unique "url" index).
        result = self.collection.replace_one(
            {"url": url},
            document,
            upsert=True
        )

        return str(result.upserted_id or result.matched_count)

    def get_page_data(self, url: str) -> Optional[Dict]:
        """Retrieve page data by URL, or None if not stored."""
        return self.collection.find_one({"url": url})

    def get_pages_by_domain(self, domain: str) -> List[Dict]:
        """Get all pages from a specific domain."""
        return list(self.collection.find({"domain": domain}))

    def search_pages(self, query: str, limit: int = 10) -> List[Dict]:
        """Case-insensitive substring search over title/description/summary.

        NOTE(review): *query* is used as a raw regex — metacharacters in user
        input alter matching; consider re.escape if queries are untrusted.
        """
        search_filter = {
            "$or": [
                {"title": {"$regex": query, "$options": "i"}},
                {"description": {"$regex": query, "$options": "i"}},
                {"content.text_summary": {"$regex": query, "$options": "i"}}
            ]
        }

        return list(self.collection.find(search_filter).limit(limit))

    def _estimate_reading_time(self, text: str) -> int:
        """Estimate reading time in minutes (250 words per minute, min 1)."""
        word_count = len(text.split())
        return max(1, word_count // 250)

    def _calculate_complexity_score(self, data: Dict) -> float:
        """Calculate content complexity for LLM processing hints.

        Sums capped contributions from text length, block count, and link
        density; rounded to two decimals.
        """
        score = 0.0

        # Text length factor (max 5.0).
        text_length = len(data["text_summary"])
        score += min(text_length / 1000, 5.0)

        # Structure complexity (max 3.0).
        content_blocks = len(data["content"])
        score += min(content_blocks / 10, 3.0)

        # Link density (max 2.0).
        total_links = len(data["links"])
        score += min(total_links / 20, 2.0)

        return round(score, 2)

    def _identify_content_type(self, data: Dict) -> str:
        """Classify content by keyword matches in title/summary."""
        title = data["metadata"]["title"].lower()
        text = data["text_summary"].lower()

        if any(word in title or word in text for word in ["tutorial", "guide", "how to"]):
            return "tutorial"
        elif any(word in title or word in text for word in ["news", "article", "report"]):
            return "article"
        elif any(word in title or word in text for word in ["documentation", "docs", "reference"]):
            return "documentation"
        elif any(word in title or word in text for word in ["blog", "post", "opinion"]):
            return "blog_post"
        else:
            return "general"

    def _extract_key_topics(self, data: Dict) -> List[str]:
        """Extract up to 10 key topics (words longer than 3 chars) from
        the title and headings for study organization."""
        topics = set()

        # From title
        title_words = data["metadata"]["title"].split()
        topics.update([word.lower() for word in title_words if len(word) > 3])

        # From headings
        for heading in data["metadata"]["headings"]:
            heading_words = heading["text"].split()
            topics.update([word.lower() for word in heading_words if len(word) > 3])

        return list(topics)[:10]  # Limit to top 10 topics
|
storage/neo4j_storage.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from neo4j import GraphDatabase
|
| 2 |
+
from typing import Dict, List
|
| 3 |
+
from urllib.parse import urlparse
|
| 4 |
+
from config.settings import settings
|
| 5 |
+
|
| 6 |
+
class Neo4jStorage:
    """Store page/link/content relationships in Neo4j for graph queries.

    Connection parameters come from ``config.settings.settings.database``.
    """

    def __init__(self):
        self.driver = GraphDatabase.driver(
            settings.database.neo4j_uri,
            auth=(settings.database.neo4j_user, settings.database.neo4j_password)
        )
        self._create_constraints()

    def _create_constraints(self):
        """Create constraints and indexes for better performance."""
        with self.driver.session() as session:
            try:
                session.run("CREATE CONSTRAINT page_url IF NOT EXISTS FOR (p:Page) REQUIRE p.url IS UNIQUE")
                session.run("CREATE CONSTRAINT domain_name IF NOT EXISTS FOR (d:Domain) REQUIRE d.name IS UNIQUE")
                session.run("CREATE INDEX page_title IF NOT EXISTS FOR (p:Page) ON (p.title)")
            except Exception:
                # Best-effort setup: constraints/indexes may already exist or
                # the server may not support this syntax; proceed regardless.
                pass

    def store_relationships(self, url: str, extracted_data: Dict, dom_structure: Dict):
        """Store page relationships and structure in Neo4j."""
        with self.driver.session() as session:
            # Create main page node
            self._create_page_node(session, url, extracted_data)

            # Create domain relationships
            self._create_domain_relationships(session, url, extracted_data)

            # Create content relationships
            self._create_content_relationships(session, url, extracted_data)

            # Create link relationships
            self._create_link_relationships(session, url, extracted_data["links"])

            # Create DOM structure relationships
            self._create_dom_relationships(session, url, dom_structure)

    def _create_page_node(self, session, url: str, data: Dict):
        """Create or update the page node with LLM-friendly properties."""
        query = """
        MERGE (p:Page {url: $url})
        SET p.title = $title,
            p.description = $description,
            p.domain = $domain,
            p.content_type = $content_type,
            p.complexity_score = $complexity_score,
            p.reading_time = $reading_time,
            p.word_count = $word_count,
            p.last_scraped = datetime()
        """

        session.run(query, {
            "url": url,
            "title": data["metadata"]["title"],
            "description": data["metadata"]["description"],
            "domain": data["metadata"]["domain"],
            "content_type": self._identify_content_type(data),
            "complexity_score": self._calculate_complexity_score(data),
            # Reading time assumes 250 words per minute.
            "reading_time": len(data["text_summary"].split()) // 250,
            "word_count": len(data["text_summary"].split())
        })

    def _create_domain_relationships(self, session, url: str, data: Dict):
        """Create the domain node and the page's BELONGS_TO relationship."""
        domain = data["metadata"]["domain"]

        # Create domain node
        session.run("""
            MERGE (d:Domain {name: $domain})
            SET d.last_updated = datetime()
        """, {"domain": domain})

        # Link page to domain
        session.run("""
            MATCH (p:Page {url: $url})
            MATCH (d:Domain {name: $domain})
            MERGE (p)-[:BELONGS_TO]->(d)
        """, {"url": url, "domain": domain})

    def _create_content_relationships(self, session, url: str, data: Dict):
        """Create heading and content-block nodes linked to the page."""
        # Create topic nodes from headings
        for i, heading in enumerate(data["metadata"]["headings"]):
            session.run("""
                MATCH (p:Page {url: $url})
                MERGE (h:Heading {text: $text, level: $level, page_url: $url})
                SET h.position = $position
                MERGE (p)-[:HAS_HEADING]->(h)
            """, {
                "url": url,
                "text": heading["text"],
                "level": heading["level"],
                "position": i
            })

        # Create content block relationships
        for i, block in enumerate(data["content"][:10]):  # Limit for performance
            session.run("""
                MATCH (p:Page {url: $url})
                MERGE (c:ContentBlock {text: $text, page_url: $url, position: $position})
                SET c.tag = $tag,
                    c.length = $length
                MERGE (p)-[:HAS_CONTENT]->(c)
            """, {
                "url": url,
                "text": block["text"][:500],  # Truncate for storage
                "tag": block["tag"],
                "length": len(block["text"]),
                "position": i
            })

    def _create_link_relationships(self, session, url: str, links: List[Dict]):
        """Create link relationships for navigation understanding."""
        for link in links[:20]:  # Limit for performance
            target_url = link["url"]
            link_text = link["text"]
            is_internal = link["internal"]

            # Create target page node (minimal)
            session.run("""
                MERGE (target:Page {url: $target_url})
                SET target.discovered_via = $source_url
            """, {"target_url": target_url, "source_url": url})

            # Relationship type is chosen from two fixed literals, so the
            # f-string interpolation below cannot inject arbitrary Cypher.
            relationship_type = "LINKS_TO_INTERNAL" if is_internal else "LINKS_TO_EXTERNAL"
            session.run(f"""
                MATCH (source:Page {{url: $source_url}})
                MATCH (target:Page {{url: $target_url}})
                MERGE (source)-[r:{relationship_type}]->(target)
                SET r.link_text = $link_text,
                    r.is_internal = $is_internal
            """, {
                "source_url": url,
                "target_url": target_url,
                "link_text": link_text,
                "is_internal": is_internal
            })

    def _create_dom_relationships(self, session, url: str, dom_structure: Dict):
        """Create DOM structure relationships for content hierarchy."""
        # Create semantic structure nodes
        semantic_elements = dom_structure["semantic_structure"]["semantic_elements"]
        for tag, count in semantic_elements.items():
            if count > 0:
                session.run("""
                    MATCH (p:Page {url: $url})
                    MERGE (s:SemanticElement {tag: $tag, page_url: $url})
                    SET s.count = $count
                    MERGE (p)-[:HAS_SEMANTIC_ELEMENT]->(s)
                """, {"url": url, "tag": tag, "count": count})

    def get_page_relationships(self, url: str) -> Dict:
        """Get all relationships for a page for LLM context.

        Returns {} when the page is not in the graph.
        """
        with self.driver.session() as session:
            result = session.run("""
                MATCH (p:Page {url: $url})
                OPTIONAL MATCH (p)-[:LINKS_TO_INTERNAL]->(internal:Page)
                OPTIONAL MATCH (p)-[:LINKS_TO_EXTERNAL]->(external:Page)
                OPTIONAL MATCH (p)-[:HAS_HEADING]->(h:Heading)
                RETURN p, collect(DISTINCT internal.url) as internal_links,
                       collect(DISTINCT external.url) as external_links,
                       collect(DISTINCT {text: h.text, level: h.level}) as headings
            """, {"url": url})

            record = result.single()
            if record:
                return {
                    "page": dict(record["p"]),
                    "internal_links": record["internal_links"],
                    "external_links": record["external_links"],
                    "headings": record["headings"]
                }
            return {}

    def get_related_pages(self, url: str, limit: int = 5) -> List[Dict]:
        """Find same-domain pages for LLM context and study suggestions,
        ordered by descending complexity score."""
        with self.driver.session() as session:
            result = session.run("""
                MATCH (p:Page {url: $url})
                MATCH (p)-[:BELONGS_TO]->(d:Domain)
                MATCH (related:Page)-[:BELONGS_TO]->(d)
                WHERE related.url <> $url
                RETURN related.url as url, related.title as title,
                       related.content_type as content_type,
                       related.complexity_score as complexity_score
                ORDER BY related.complexity_score DESC
                LIMIT $limit
            """, {"url": url, "limit": limit})

            return [dict(record) for record in result]

    def _identify_content_type(self, data: Dict) -> str:
        """Classify content type from keywords in the title only."""
        title = data["metadata"]["title"].lower()
        if "tutorial" in title or "guide" in title:
            return "tutorial"
        elif "documentation" in title or "docs" in title:
            return "documentation"
        elif "blog" in title or "article" in title:
            return "article"
        return "general"

    def _calculate_complexity_score(self, data: Dict) -> float:
        """Complexity score for relationship weighting, capped at 10.0."""
        text_length = len(data["text_summary"])
        content_blocks = len(data["content"])
        return min(text_length / 1000 + content_blocks / 10, 10.0)

    def close(self):
        """Close database connection."""
        self.driver.close()
|