Spaces: Build error

Upload 4 files

- Dockerfile +46 -0
- README.md +75 -10
- app.py +457 -0
- requirements.txt +9 -0
Dockerfile
ADDED
@@ -0,0 +1,46 @@
```dockerfile
FROM python:3.10-slim

# Install system dependencies
RUN apt-get update && apt-get install -y \
    wget \
    gnupg \
    && rm -rf /var/lib/apt/lists/*

# Install latest Chrome and its dependencies
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
    && echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list \
    && apt-get update \
    && apt-get install -y \
    google-chrome-stable \
    fonts-ipafont-gothic \
    fonts-wqy-zenhei \
    fonts-thai-tlwg \
    fonts-kacst \
    fonts-freefont-ttf \
    libxss1 \
    && rm -rf /var/lib/apt/lists/*

# Set up working directory
WORKDIR /app

# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install Playwright browsers
RUN playwright install chromium
RUN playwright install-deps

# Copy application code
COPY . .

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV GRADIO_SERVER_NAME=0.0.0.0
ENV GRADIO_SERVER_PORT=7860

# Expose port
EXPOSE 7860

# Start the application
CMD ["python", "app.py"]
```
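Headless Chrome in a container is sensitive to the size of `/dev/shm`, which is why the application passes `--disable-dev-shm-usage` to the browser. When debugging the image locally, an alternative workaround (an assumption for local use, not part of this Space's configuration) is to enlarge the shared-memory segment at run time:

```bash
# Run the image with a 2 GB /dev/shm instead of Docker's 64 MB default
docker run --shm-size=2g -p 7860:7860 crawl4ai-demo
```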
README.md
CHANGED
@@ -1,10 +1,75 @@
# Crawl4AI Demo - Docker Deployment

This is a Docker-ready version of the Crawl4AI demo application, specifically designed for deployment on Hugging Face Spaces.

## Features

- Web interface built with Gradio
- Support for multiple crawler types (Basic, LLM, Cosine, JSON/CSS)
- Configurable word count threshold
- Markdown output with metadata
- Sub-page crawling capabilities
- Lazy loading support
- Docker-optimized configuration

## Deployment Instructions

1. Create a new Space on Hugging Face:
   - Go to huggingface.co/spaces
   - Click "Create new Space"
   - Choose "Docker" as the SDK
   - Set the hardware requirements (recommended: CPU + 16GB RAM)

2. Upload the files:
   - Upload all files from this directory to your Space
   - Make sure to include:
     - `Dockerfile`
     - `app.py`
     - `requirements.txt`
     - `README.md`

3. The Space will automatically build and deploy the application.

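Hugging Face Docker Spaces read their runtime settings from a YAML metadata block at the very top of `README.md`; this is how the platform knows to use the Docker SDK and which port to proxy. A minimal sketch of that block, where the `title`, `emoji`, and color values are placeholders and only `sdk: docker` and `app_port: 7860` matter for this app:

```yaml
---
title: Crawl4AI Demo
emoji: 🕷️
colorFrom: blue
colorTo: green
sdk: docker
app_port: 7860
pinned: false
---
```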
## Environment Variables

No environment variables are required for basic functionality. The application is configured to run out of the box.

## Hardware Requirements

- CPU: 2+ cores recommended
- RAM: 16GB recommended
- Disk: 5GB minimum

## Browser Support

The application uses Chrome in headless mode for web crawling. The Dockerfile includes all necessary dependencies.

## Limitations

- Memory usage increases with the number of pages crawled
- Some websites may block automated crawling
- JavaScript-heavy sites may require additional configuration

## Troubleshooting

If you encounter issues:

1. Check the Space logs for error messages
2. Ensure the Chrome browser is running correctly
3. Verify network connectivity
4. Check memory usage

## Development

To run locally with Docker:

```bash
docker build -t crawl4ai-demo .
docker run -p 7860:7860 crawl4ai-demo
```

Visit http://localhost:7860 to access the application.

## License

This project is licensed under the MIT License - see the LICENSE file for details.
app.py
ADDED
@@ -0,0 +1,457 @@
```python
"""
Crawl4AI Demo Application (Docker Version)
==========================================

This is a modified version of the Crawl4AI demo application specifically designed
for deployment in a Docker container on Hugging Face Spaces.

Features:
---------
- Web interface built with Gradio for interactive use
- Support for multiple crawler types (Basic, LLM, Cosine, JSON/CSS)
- Configurable word count threshold
- Markdown output with metadata
- Sub-page crawling capabilities
- Lazy loading support
- Docker-optimized configuration
"""

import gradio as gr
import asyncio
from typing import Optional, Dict, Any, List, Set
from enum import Enum
from pydantic import BaseModel
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
import urllib.parse
import os

# Configure browser settings for the Docker environment
CHROME_PATH = "/usr/bin/google-chrome-stable"
os.environ["CHROME_PATH"] = CHROME_PATH


class CrawlerType(str, Enum):
    """Enumeration of supported crawler types."""
    BASIC = "basic"
    LLM = "llm"
    COSINE = "cosine"
    JSON_CSS = "json_css"


class ExtractionType(str, Enum):
    """Enumeration of supported extraction strategies."""
    DEFAULT = "default"
    CSS = "css"
    XPATH = "xpath"
    LLM = "llm"
    COMBINED = "combined"


class CrawlRequest(BaseModel):
    """Request model for crawling operations."""
    url: str
    crawler_type: CrawlerType = CrawlerType.BASIC
    extraction_type: ExtractionType = ExtractionType.DEFAULT
    word_count_threshold: int = 100
    css_selector: Optional[str] = None
    xpath_query: Optional[str] = None
    excluded_tags: Optional[list] = None
    scan_full_page: bool = False
    scroll_delay: float = 0.5
    crawl_subpages: bool = False
    max_depth: int = 1
    exclude_external_links: bool = True
    max_pages: int = 10


def create_extraction_strategy(extraction_type: ExtractionType, css_selector: Optional[str] = None, xpath_query: Optional[str] = None) -> Any:
    """Create an extraction strategy based on the specified type."""
    if extraction_type == ExtractionType.CSS and css_selector:
        schema = {
            "name": "Content",
            "baseSelector": css_selector,
            "fields": [
                {"name": "title", "selector": "h1,h2", "type": "text"},
                {"name": "text", "selector": "p", "type": "text"},
                {"name": "links", "selector": "a", "type": "attribute", "attribute": "href"}
            ]
        }
        return JsonCssExtractionStrategy(schema)
    return None


async def crawl_with_subpages(request: CrawlRequest, base_url: str, current_depth: int = 1, visited: Set[str] = None) -> Dict:
    """Recursively crawl pages including sub-pages up to the specified depth."""
    if visited is None:
        visited = set()

    if current_depth > request.max_depth or len(visited) >= request.max_pages:
        return None

    normalized_url = urllib.parse.urljoin(request.url, '/')
    if normalized_url in visited:
        return None

    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        verbose=True,
        word_count_threshold=request.word_count_threshold,
        css_selector=request.css_selector,
        excluded_tags=request.excluded_tags or ["nav", "footer", "header"],
        exclude_external_links=request.exclude_external_links,
        wait_for=f"css:{request.css_selector}" if request.css_selector else None,
        wait_for_images=True,
        page_timeout=30000,
        scan_full_page=request.scan_full_page,
        scroll_delay=request.scroll_delay,
        extraction_strategy=create_extraction_strategy(
            request.extraction_type,
            request.css_selector,
            request.xpath_query
        )
    )

    # Docker-optimized browser configuration
    browser_config = BrowserConfig(
        headless=True,
        viewport_width=1920,
        viewport_height=1080,
        chrome_path=CHROME_PATH,
        args=[
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu"
        ]
    )

    results = {
        "pages": [],
        "total_links": 0,
        "visited_pages": len(visited)
    }

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(url=request.url, config=run_config)

            if not result.success:
                print(f"Failed to crawl {request.url}: {result.error_message}")
                return None

            page_result = {
                "url": request.url,
                "markdown": result.markdown_v2 if hasattr(result, 'markdown_v2') else "",
                "extracted_content": result.extracted_content if hasattr(result, 'extracted_content') else None,
                "depth": current_depth
            }
            results["pages"].append(page_result)
            visited.add(normalized_url)

            if request.crawl_subpages and hasattr(result, 'links'):
                internal_links = result.links.get("internal", [])
                if internal_links:
                    results["total_links"] += len(internal_links)

                for link in internal_links:
                    if len(visited) >= request.max_pages:
                        break

                    try:
                        normalized_link = urllib.parse.urljoin(request.url, link)
                        link_domain = urllib.parse.urlparse(normalized_link).netloc

                        if normalized_link in visited or (request.exclude_external_links and link_domain != base_url):
                            continue

                        sub_request = CrawlRequest(
                            **{**request.dict(), "url": normalized_link}
                        )

                        sub_result = await crawl_with_subpages(
                            sub_request,
                            base_url,
                            current_depth + 1,
                            visited
                        )

                        if sub_result:
                            results["pages"].extend(sub_result["pages"])
                            results["total_links"] += sub_result["total_links"]
                            results["visited_pages"] = len(visited)
                    except Exception as e:
                        print(f"Error processing link {link}: {str(e)}")
                        continue

        return results
    except Exception as e:
        print(f"Error crawling {request.url}: {str(e)}")
        return None


async def crawl_url(request: CrawlRequest) -> Dict:
    """Crawl a URL and return the extracted content."""
    try:
        base_url = urllib.parse.urlparse(request.url).netloc

        if request.crawl_subpages:
            results = await crawl_with_subpages(request, base_url)
            if not results or not results["pages"]:
                raise Exception(f"Failed to crawl pages starting from {request.url}")

            combined_markdown = "\n\n---\n\n".join(
                f"## Page: {page['url']}\n{page['markdown']}"
                for page in results["pages"]
            )

            return {
                "markdown": combined_markdown,
                "metadata": {
                    "url": request.url,
                    "crawler_type": request.crawler_type.value,
                    "extraction_type": request.extraction_type.value,
                    "word_count_threshold": request.word_count_threshold,
                    "css_selector": request.css_selector,
                    "xpath_query": request.xpath_query,
                    "scan_full_page": request.scan_full_page,
                    "scroll_delay": request.scroll_delay,
                    "total_pages_crawled": results["visited_pages"],
                    "total_links_found": results["total_links"],
                    "max_depth_reached": min(request.max_depth, max(page["depth"] for page in results["pages"]))
                },
                "pages": results["pages"]
            }
        else:
            wait_condition = f"css:{request.css_selector}" if request.css_selector else None

            run_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                word_count_threshold=request.word_count_threshold,
                css_selector=request.css_selector,
                excluded_tags=request.excluded_tags or ["nav", "footer", "header"],
                wait_for=wait_condition,
                wait_for_images=True,
                page_timeout=30000,
                scan_full_page=request.scan_full_page,
                scroll_delay=request.scroll_delay,
                extraction_strategy=create_extraction_strategy(
                    request.extraction_type,
                    request.css_selector,
                    request.xpath_query
                )
            )

            # Docker-optimized browser configuration
            browser_config = BrowserConfig(
                headless=True,
                viewport_width=1920,
                viewport_height=1080,
                chrome_path=CHROME_PATH,
                args=[
                    "--no-sandbox",
                    "--disable-dev-shm-usage",
                    "--disable-gpu"
                ]
            )

            async with AsyncWebCrawler(config=browser_config) as crawler:
                result = await crawler.arun(url=request.url, config=run_config)

                if not result.success:
                    raise Exception(result.error_message)

                images = result.media.get("images", []) if hasattr(result, 'media') else []
                image_info = "\n### Images Found\n" if images else ""
                for i, img in enumerate(images[:5]):
                    image_info += f"- Image {i+1}: {img.get('src', 'N/A')}\n"
                    if img.get('alt'):
                        image_info += f"  Alt: {img['alt']}\n"
                    if img.get('score'):
                        image_info += f"  Score: {img['score']}\n"

                return {
                    "markdown": result.markdown_v2 if hasattr(result, 'markdown_v2') else "",
                    "metadata": {
                        "url": request.url,
                        "crawler_type": request.crawler_type.value,
                        "extraction_type": request.extraction_type.value,
                        "word_count_threshold": request.word_count_threshold,
                        "css_selector": request.css_selector,
                        "xpath_query": request.xpath_query,
                        "scan_full_page": request.scan_full_page,
                        "scroll_delay": request.scroll_delay,
                        "wait_condition": wait_condition
                    },
                    "extracted_content": result.extracted_content if hasattr(result, 'extracted_content') else None,
                    "image_info": image_info
                }
    except Exception as e:
        raise Exception(str(e))


async def gradio_crawl(
    url: str,
    crawler_type: str,
    extraction_type: str,
    word_count_threshold: int,
    css_selector: str,
    xpath_query: str,
    scan_full_page: bool,
    scroll_delay: float,
    crawl_subpages: bool,
    max_depth: int,
    max_pages: int,
    exclude_external_links: bool
) -> tuple[str, str]:
    """Handle crawling requests from the Gradio interface."""
    try:
        request = CrawlRequest(
            url=url,
            crawler_type=CrawlerType(crawler_type.lower()),
            extraction_type=ExtractionType(extraction_type.lower()),
            word_count_threshold=word_count_threshold,
            css_selector=css_selector if css_selector else None,
            xpath_query=xpath_query if xpath_query else None,
            scan_full_page=scan_full_page,
            scroll_delay=scroll_delay,
            crawl_subpages=crawl_subpages,
            max_depth=max_depth,
            max_pages=max_pages,
            exclude_external_links=exclude_external_links
        )

        result = await crawl_url(request)

        markdown_content = str(result["markdown"]) if result.get("markdown") else ""

        metadata_str = f"""### Metadata
- URL: {result['metadata']['url']}
- Crawler Type: {result['metadata']['crawler_type']}
- Extraction Type: {result['metadata']['extraction_type']}
- Word Count Threshold: {result['metadata']['word_count_threshold']}
- CSS Selector: {result['metadata']['css_selector'] or 'None'}
- XPath Query: {result['metadata']['xpath_query'] or 'None'}
- Full Page Scan: {result['metadata']['scan_full_page']}
- Scroll Delay: {result['metadata']['scroll_delay']}s"""

        if crawl_subpages:
            metadata_str += f"""
- Total Pages Crawled: {result['metadata'].get('total_pages_crawled', 0)}
- Total Links Found: {result['metadata'].get('total_links_found', 0)}
- Max Depth Reached: {result['metadata'].get('max_depth_reached', 1)}"""

        if result.get('image_info'):
            metadata_str += f"\n\n{result['image_info']}"

        if result.get("extracted_content"):
            metadata_str += f"\n\n### Extracted Content\n```json\n{result['extracted_content']}\n```"

        return markdown_content, metadata_str
    except Exception as e:
        error_msg = f"Error: {str(e)}"
        return error_msg, "Error occurred while crawling"


# Create Gradio interface with Docker-optimized settings
demo = gr.Interface(
    fn=gradio_crawl,
    inputs=[
        gr.Textbox(
            label="URL",
            placeholder="Enter URL to crawl",
            info="The webpage URL to extract content from"
        ),
        gr.Dropdown(
            choices=["Basic", "LLM", "Cosine", "JSON/CSS"],
            label="Crawler Type",
            value="Basic",
            info="Select the content extraction strategy"
        ),
        gr.Dropdown(
            choices=["Default", "CSS", "XPath", "LLM", "Combined"],
            label="Extraction Type",
            value="Default",
            info="Choose how to extract content from the page"
        ),
        gr.Slider(
            minimum=50,
            maximum=500,
            value=100,
            step=50,
            label="Word Count Threshold",
            info="Minimum number of words required for content extraction"
        ),
        gr.Textbox(
            label="CSS Selector",
            placeholder="e.g., article.content, main.post",
            info="CSS selector to target specific content (used with CSS extraction type)"
        ),
        gr.Textbox(
            label="XPath Query",
            placeholder="e.g., //article[@class='content']",
            info="XPath query to target specific content (used with XPath extraction type)"
        ),
        gr.Checkbox(
            label="Scan Full Page",
            value=False,
            info="Enable to scroll through the entire page to load lazy content"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.5,
            step=0.1,
            label="Scroll Delay",
            info="Delay between scroll steps in seconds when scanning full page"
        ),
        gr.Checkbox(
            label="Crawl Sub-pages",
            value=False,
            info="Enable to crawl links found on the page"
        ),
        gr.Slider(
            minimum=1,
            maximum=5,
            value=1,
            step=1,
            label="Max Crawl Depth",
            info="Maximum depth for recursive crawling (1 = only direct links)"
        ),
        gr.Slider(
            minimum=1,
            maximum=50,
            value=10,
            step=5,
            label="Max Pages",
            info="Maximum number of pages to crawl"
        ),
        gr.Checkbox(
            label="Exclude External Links",
            value=True,
            info="Only crawl links within the same domain"
        )
    ],
    outputs=[
        gr.Markdown(label="Generated Markdown"),
        gr.Markdown(label="Metadata & Extraction Results")
    ],
    title="Crawl4AI Demo",
    description="""
    This demo allows you to extract content from web pages using different crawling and extraction strategies.

    1. Enter a URL to crawl
    2. Select a crawler type (Basic, LLM, Cosine, JSON/CSS)
    3. Choose an extraction strategy (Default, CSS, XPath, LLM, Combined)
    4. Configure additional options:
       - Word count threshold for content filtering
       - CSS selectors for targeting specific content
       - XPath queries for precise extraction
       - Full page scanning for lazy-loaded content
       - Scroll delay for controlling page scanning speed
       - Sub-page crawling with depth control
       - Maximum number of pages to crawl
       - External link filtering

    The extracted content will be displayed in markdown format along with metadata and extraction results.
    When sub-page crawling is enabled, content from all crawled pages will be combined in the output.
    """,
    examples=[
        ["https://example.com", "Basic", "Default", 100, "", "", False, 0.5, False, 1, 10, True],
        ["https://example.com/blog", "Basic", "CSS", 100, "article.post", "", True, 0.5, True, 2, 5, True],
    ]
)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
```
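The crawling logic in `app.py` is usable without the Gradio UI: `crawl_url` accepts a `CrawlRequest` and returns a dictionary with the generated markdown and metadata. A small sketch of calling it programmatically (assuming `app.py` is importable from the current working directory):

```python
# Hypothetical programmatic use of the crawler defined in app.py
import asyncio

from app import CrawlRequest, crawl_url


async def main() -> None:
    request = CrawlRequest(url="https://example.com", word_count_threshold=50)
    result = await crawl_url(request)
    print(result["metadata"])              # crawl settings and statistics
    print(str(result["markdown"])[:500])   # start of the generated markdown


if __name__ == "__main__":
    asyncio.run(main())
```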
requirements.txt
ADDED
@@ -0,0 +1,9 @@
```text
crawl4ai
gradio
python-dotenv
pydantic
playwright
aiofiles
python-multipart
typing-extensions
uvicorn
```
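To reproduce the Docker environment without a container (an assumption for local development, not something the Space itself needs), the dependency and browser setup steps from the Dockerfile can be run directly:

```bash
pip install --no-cache-dir -r requirements.txt
playwright install chromium
playwright install-deps   # installs system packages; may require sudo
python app.py             # serves the Gradio UI on port 7860
```

Note that `app.py` points the browser at `/usr/bin/google-chrome-stable`, so a system Chrome install is still expected alongside the Playwright Chromium.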