Update app.py
app.py
CHANGED
@@ -1,89 +1,50 @@
 """
 Crawl4AI Demo Application
-========================
 
-This
-
 
 Features:
 ---------
 - Web interface built with Gradio for interactive use
-- RESTful API endpoint for programmatic access
 - Support for multiple crawler types (Basic, LLM, Cosine, JSON/CSS)
 - Configurable word count threshold
 - Markdown output with metadata
 
 Usage:
 ------
-
-
-python app.py
-```
-2. Access the web interface at http://localhost:8000
-3. Use the API endpoint at http://localhost:8000/api/crawl
-
-API Example:
------------
-```python
-import requests
-
-response = requests.post(
-    "http://localhost:8000/api/crawl",
-    json={
-        "url": "https://example.com",
-        "crawler_type": "basic",
-        "word_count_threshold": 100
-    }
-)
-result = response.json()
-```
 
 Dependencies:
 ------------
 - gradio
--
--
--
 """
 
 import gradio as gr
 import asyncio
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
-from enum import Enum
 from typing import Optional, Dict, Any, List, Set
-from
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
-from playwright.async_api import async_playwright
 import urllib.parse
 
 class CrawlerType(str, Enum):
-    """
-    Enumeration of supported crawler types.
-
-    Attributes:
-        BASIC (str): Simple HTML parsing and content extraction
-        LLM (str): Language model-based content extraction
-        COSINE (str): Cosine similarity-based content extraction
-        JSON_CSS (str): JSON/CSS selector-based content extraction
-    """
     BASIC = "basic"
     LLM = "llm"
     COSINE = "cosine"
     JSON_CSS = "json_css"
 
 class ExtractionType(str, Enum):
-    """
-    Enumeration of supported extraction strategies.
-
-    Attributes:
-        DEFAULT (str): Default extraction without specific strategy
-        CSS (str): CSS selector-based extraction
-        XPATH (str): XPath-based extraction
-        LLM (str): Language model-based extraction
-        COMBINED (str): Combined strategy using multiple approaches
-    """
    DEFAULT = "default"
    CSS = "css"
    XPATH = "xpath"
@@ -91,24 +52,7 @@ class ExtractionType(str, Enum):
     COMBINED = "combined"
 
 class CrawlRequest(BaseModel):
-    """
-    Request model for crawling operations.
-
-    Attributes:
-        url (str): The URL to crawl
-        crawler_type (CrawlerType): The type of crawler to use
-        extraction_type (ExtractionType): The extraction strategy to use
-        word_count_threshold (int): Minimum word count for extracted content
-        css_selector (Optional[str]): CSS selector for content extraction
-        xpath_query (Optional[str]): XPath query for content extraction
-        excluded_tags (Optional[list]): HTML tags to exclude from extraction
-        scan_full_page (bool): Whether to scan the entire page for lazy-loaded content
-        scroll_delay (float): Delay between scroll steps in seconds
-        crawl_subpages (bool): Whether to crawl sub-pages found in links
-        max_depth (int): Maximum depth for recursive crawling (1 = only direct links)
-        exclude_external_links (bool): Whether to exclude links to external domains
-        max_pages (int): Maximum number of pages to crawl
-    """
     url: str
     crawler_type: CrawlerType = CrawlerType.BASIC
     extraction_type: ExtractionType = ExtractionType.DEFAULT
@@ -123,72 +67,8 @@ class CrawlRequest(BaseModel):
     exclude_external_links: bool = True
     max_pages: int = 10
 
-# Global crawler variable
-crawler = None
-
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    """
-    Lifespan context manager for FastAPI application.
-    Handles crawler initialization and cleanup.
-    """
-    global crawler
-
-    # Initialize browser configuration
-    browser_config = BrowserConfig(
-        headless=True,
-        viewport_width=1920,
-        viewport_height=1080
-    )
-
-    # Create and initialize crawler
-    try:
-        crawler = AsyncWebCrawler(config=browser_config)
-        print("Crawler initialized successfully")
-        yield
-    finally:
-        if crawler:
-            await crawler.close()
-            print("Crawler resources cleaned up")
-
-# Create FastAPI app with lifespan handler
-app = FastAPI(
-    title="Crawl4AI Demo",
-    description="A web interface and API for extracting content from web pages using Crawl4AI",
-    version="1.0.0",
-    lifespan=lifespan
-)
-
-@app.on_event("startup")
-async def startup_event():
-    """Initialize the browser on startup"""
-    try:
-        async with async_playwright() as playwright:
-            await crawler.initialize(playwright)
-    except Exception as e:
-        print(f"Error initializing browser: {e}")
-        raise
-
-@app.on_event("shutdown")
-async def shutdown_event():
-    """Clean up browser resources on shutdown"""
-    try:
-        await crawler.cleanup()
-    except Exception as e:
-        print(f"Error during cleanup: {e}")
-
 def create_extraction_strategy(extraction_type: ExtractionType, css_selector: Optional[str] = None, xpath_query: Optional[str] = None) -> Any:
-    """
-    Create an extraction strategy based on the specified type.
-
-    Args:
-        extraction_type (ExtractionType): The type of extraction strategy
-        css_selector (Optional[str]): CSS selector for content extraction
-        xpath_query (Optional[str]): XPath query for content extraction
-
-    Returns:
-        Any: The configured extraction strategy
-    """
     if extraction_type == ExtractionType.CSS and css_selector:
         schema = {
             "name": "Content",
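
The block removed above tied the crawler's lifetime to FastAPI's startup and shutdown, first through an `@asynccontextmanager` lifespan and then again through the older `on_event` hooks. For reference, a minimal, hedged sketch of that lifespan pattern on its own, assuming the truncated `from` import at old line 55 was `from contextlib import asynccontextmanager`; the crawler object is stubbed out:

```python
from contextlib import asynccontextmanager

from fastapi import FastAPI

crawler = None  # stand-in for the module-level AsyncWebCrawler the old app kept


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Initialize the crawler before serving and clean it up afterwards."""
    global crawler
    crawler = object()  # placeholder for AsyncWebCrawler(config=BrowserConfig(...))
    print("Crawler initialized successfully")
    try:
        yield  # the application handles requests while suspended here
    finally:
        crawler = None
        print("Crawler resources cleaned up")


app = FastAPI(title="Crawl4AI Demo", lifespan=lifespan)
```

The removed `@app.on_event` handlers duplicated this work; FastAPI treats the `lifespan` argument as the preferred replacement for those hooks.
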
@@ -203,9 +83,7 @@ def create_extraction_strategy(extraction_type: ExtractionType, css_selector: Op
     return None
 
 async def crawl_with_subpages(request: CrawlRequest, base_url: str, current_depth: int = 1, visited: Set[str] = None) -> Dict:
-    """
-    Recursively crawl pages including sub-pages up to the specified depth.
-    """
     if visited is None:
         visited = set()
 
@@ -219,26 +97,17 @@ async def crawl_with_subpages(request: CrawlRequest, base_url: str, current_dept
 
         # Create run configuration for current page
         run_config = CrawlerRunConfig(
-            # Core settings
             cache_mode=CacheMode.BYPASS,
-            verbose=True,
-
-            # Content settings
             word_count_threshold=request.word_count_threshold,
             css_selector=request.css_selector,
             excluded_tags=request.excluded_tags or ["nav", "footer", "header"],
             exclude_external_links=request.exclude_external_links,
-
-            # Page & JS settings
             wait_for=f"css:{request.css_selector}" if request.css_selector else None,
             wait_for_images=True,
             page_timeout=30000,
-
-            # Lazy loading settings
             scan_full_page=request.scan_full_page,
             scroll_delay=request.scroll_delay,
-
-            # Extraction settings
             extraction_strategy=create_extraction_strategy(
                 request.extraction_type,
                 request.css_selector,
@@ -286,21 +155,17 @@ async def crawl_with_subpages(request: CrawlRequest, base_url: str, current_dept
             if len(visited) >= request.max_pages:
                 break
 
-            # Normalize and validate the link
             try:
                 normalized_link = urllib.parse.urljoin(request.url, link)
                 link_domain = urllib.parse.urlparse(normalized_link).netloc
 
-                # Skip if already visited or external link
                 if normalized_link in visited or (request.exclude_external_links and link_domain != base_url):
                     continue
 
-                # Create new request for sub-page
                 sub_request = CrawlRequest(
                     **{**request.dict(), "url": normalized_link}
                 )
 
-                # Recursively crawl sub-page
                 sub_result = await crawl_with_subpages(
                     sub_request,
                     base_url,
@@ -321,20 +186,16 @@ async def crawl_with_subpages(request: CrawlRequest, base_url: str, current_dept
         print(f"Error crawling {request.url}: {str(e)}")
         return None
 
-
-
-    """
-    API endpoint to crawl a URL and return the extracted content.
-    """
     try:
         base_url = urllib.parse.urlparse(request.url).netloc
 
         if request.crawl_subpages:
             results = await crawl_with_subpages(request, base_url)
             if not results or not results["pages"]:
-                raise
 
-            # Combine results from all pages
             combined_markdown = "\\n\\n---\\n\\n".join(
                 f"## Page: {page['url']}\\n{page['markdown']}"
                 for page in results["pages"]
@@ -358,29 +219,18 @@ async def crawl_url(request: CrawlRequest):
                 "pages": results["pages"]
             }
         else:
-            # Format wait_for condition properly if CSS selector is provided
             wait_condition = f"css:{request.css_selector}" if request.css_selector else None
 
-            # Create run configuration
             run_config = CrawlerRunConfig(
-                # Core settings
                 cache_mode=CacheMode.BYPASS,
-
-                # Content settings
                 word_count_threshold=request.word_count_threshold,
                 css_selector=request.css_selector,
                 excluded_tags=request.excluded_tags or ["nav", "footer", "header"],
-
-
-
-                wait_for_images=True,  # Always wait for images to load
-                page_timeout=30000,  # 30 seconds timeout for page operations
-
-                # Lazy loading settings
                 scan_full_page=request.scan_full_page,
                 scroll_delay=request.scroll_delay,
-
-                # Extraction settings
                 extraction_strategy=create_extraction_strategy(
                     request.extraction_type,
                     request.css_selector,
@@ -388,59 +238,45 @@ async def crawl_url(request: CrawlRequest):
                 )
             )
 
-            # Create browser config with optimized settings
             browser_config = BrowserConfig(
                 headless=True,
                 viewport_width=1920,
                 viewport_height=1080
             )
 
-            async with AsyncWebCrawler(config=browser_config) as
-            [old lines 399-428: removed code not preserved in this view]
-                "wait_condition": wait_condition
-            },
-            "extracted_content": result.extracted_content if hasattr(result, 'extracted_content') else None,
-            "image_info": image_info
-        }
-    except Exception as e:
-        # More specific error handling
-        error_msg = str(e)
-        if "Wait condition failed" in error_msg:
-            error_msg = f"Failed to find element matching selector '{request.css_selector}'. Please check if the selector is correct."
-        elif "TimeoutError" in error_msg:
-            error_msg = "Page took too long to load. Please try again or check the URL."
-        raise HTTPException(status_code=500, detail=error_msg)
     except Exception as e:
-        raise
 
 async def gradio_crawl(
     url: str,
@@ -456,48 +292,27 @@ async def gradio_crawl(
     max_pages: int,
     exclude_external_links: bool
 ) -> tuple[str, str]:
-    """
-    Gradio interface function to handle crawling requests from the web UI.
-
-    Args:
-        url (str): The webpage URL to crawl
-        crawler_type (str): Type of crawler to use
-        extraction_type (str): Type of extraction strategy
-        word_count_threshold (int): Minimum word count threshold
-        css_selector (str): CSS selector for content targeting
-        xpath_query (str): XPath query for content targeting
-        scan_full_page (bool): Whether to scan the full page
-        scroll_delay (float): Delay between scroll steps
-        crawl_subpages (bool): Whether to crawl sub-pages
-        max_depth (int): Maximum crawl depth
-        max_pages (int): Maximum number of pages to crawl
-        exclude_external_links (bool): Whether to exclude external links
-
-    Returns:
-        tuple[str, str]: Tuple containing (markdown_content, metadata_string)
-    """
-    request = CrawlRequest(
-        url=url,
-        crawler_type=CrawlerType(crawler_type.lower()),
-        extraction_type=ExtractionType(extraction_type.lower()),
-        word_count_threshold=word_count_threshold,
-        css_selector=css_selector if css_selector else None,
-        xpath_query=xpath_query if xpath_query else None,
-        scan_full_page=scan_full_page,
-        scroll_delay=scroll_delay,
-        crawl_subpages=crawl_subpages,
-        max_depth=max_depth,
-        max_pages=max_pages,
-        exclude_external_links=exclude_external_links
-    )
-
     try:
         result = await crawl_url(request)
 
-        # Convert markdown result to string if it exists
         markdown_content = str(result["markdown"]) if result.get("markdown") else ""
 
-        # Format the metadata and results
         metadata_str = f"""### Metadata
 - URL: {result['metadata']['url']}
 - Crawler Type: {result['metadata']['crawler_type']}
@@ -508,18 +323,15 @@ async def gradio_crawl(
 - Full Page Scan: {result['metadata']['scan_full_page']}
 - Scroll Delay: {result['metadata']['scroll_delay']}s"""
 
-        # Add sub-page crawling information if enabled
         if crawl_subpages:
             metadata_str += f"""
 - Total Pages Crawled: {result['metadata'].get('total_pages_crawled', 0)}
 - Total Links Found: {result['metadata'].get('total_links_found', 0)}
 - Max Depth Reached: {result['metadata'].get('max_depth_reached', 1)}"""
 
-        # Add image information if available
         if result.get('image_info'):
             metadata_str += f"\n\n{result['image_info']}"
 
-        # Add extracted content if available
         if result.get("extracted_content"):
             metadata_str += f"\n\n### Extracted Content\n```json\n{result['extracted_content']}\n```"
 
@@ -528,7 +340,7 @@ async def gradio_crawl(
         error_msg = f"Error: {str(e)}"
         return error_msg, "Error occurred while crawling"
 
-# Create Gradio interface
 demo = gr.Interface(
     fn=gradio_crawl,
     inputs=[
@@ -630,12 +442,13 @@ demo = gr.Interface(
 
 The extracted content will be displayed in markdown format along with metadata and extraction results.
 When sub-page crawling is enabled, content from all crawled pages will be combined in the output.
-    """
 )
 
-#
-app = gr.mount_gradio_app(app, demo, path="/")
-
 if __name__ == "__main__":
-
-    uvicorn.run(app, host="0.0.0.0", port=8000)
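
The deleted tail shows the old deployment model: the Gradio UI was mounted onto the FastAPI app and served by uvicorn on port 8000. A minimal sketch of that removed pattern, with a trivial stand-in interface (everything beyond the mount-and-run lines shown in the diff is an assumption):

```python
import gradio as gr
import uvicorn
from fastapi import FastAPI

app = FastAPI(title="Crawl4AI Demo")

# Trivial stand-in for the real crawling interface defined in app.py.
demo = gr.Interface(fn=lambda url: f"Would crawl {url}", inputs="text", outputs="text")

# Serve the Gradio UI at the root of the FastAPI app, as the old file did.
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
```

The new version drops this layer and ends with a plain `demo.launch()`, which is all a Gradio Space needs.
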
app.py after the change (line numbers are from the new file; added lines marked with +):

  1   """
  2   Crawl4AI Demo Application
  3 + ====================================================
  4   
  5 + This is a modified version of the Crawl4AI demo application specifically designed
  6 + for deployment on Hugging Face Spaces.
  7   
  8   Features:
  9   ---------
 10   - Web interface built with Gradio for interactive use
 11   - Support for multiple crawler types (Basic, LLM, Cosine, JSON/CSS)
 12   - Configurable word count threshold
 13   - Markdown output with metadata
 14 + - Sub-page crawling capabilities
 15 + - Lazy loading support
 16   
 17   Usage:
 18   ------
 19 + This version is specifically designed for Hugging Face Spaces deployment.
 20 + Simply upload this file to your Space and it will automatically run.
 21   
 22   Dependencies:
 23   ------------
 24   - gradio
 25 + - crawl4ai>=0.4.3b0
 26 + - python-dotenv>=1.0.0
 27 + - pydantic>=2.5.0
 28   """
 29   
 30   import gradio as gr
 31   import asyncio
 32   from typing import Optional, Dict, Any, List, Set
 33 + from enum import Enum
 34 + from pydantic import BaseModel
 35   from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig
 36   from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
 37   import urllib.parse
 38   
 39   class CrawlerType(str, Enum):
 40 +     """Enumeration of supported crawler types."""
 41       BASIC = "basic"
 42       LLM = "llm"
 43       COSINE = "cosine"
 44       JSON_CSS = "json_css"
 45   
 46   class ExtractionType(str, Enum):
 47 +     """Enumeration of supported extraction strategies."""
 48       DEFAULT = "default"
 49       CSS = "css"
 50       XPATH = "xpath"
 52       COMBINED = "combined"
 53   
 54   class CrawlRequest(BaseModel):
 55 +     """Request model for crawling operations."""
 56       url: str
 57       crawler_type: CrawlerType = CrawlerType.BASIC
 58       extraction_type: ExtractionType = ExtractionType.DEFAULT
 67       exclude_external_links: bool = True
 68       max_pages: int = 10
 69   
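
Every crawl option is a typed field on CrawlRequest with a default, so callers only pass what differs and Pydantic validates the rest. A small illustrative sketch of driving the model directly (the import assumes this file is saved as app.py; the URL and values are made up):

```python
# Hypothetical usage of the request model defined above.
from app import CrawlRequest, CrawlerType, ExtractionType  # assumes the file is app.py

request = CrawlRequest(
    url="https://example.com/blog",
    crawler_type=CrawlerType.BASIC,
    extraction_type=ExtractionType.CSS,
    css_selector="article.post",
    crawl_subpages=True,
    max_depth=2,
    max_pages=5,
)

# Enum members are validated and unspecified fields keep their defaults.
print(request.dict())
```
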
 70   def create_extraction_strategy(extraction_type: ExtractionType, css_selector: Optional[str] = None, xpath_query: Optional[str] = None) -> Any:
 71 +     """Create an extraction strategy based on the specified type."""
 72       if extraction_type == ExtractionType.CSS and css_selector:
 73           schema = {
 74               "name": "Content",
 83       return None
 84   
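
Only the first key of the schema ("name": "Content") is visible in this view. A JsonCssExtractionStrategy schema is normally a name, a base selector, and a list of field selectors; the sketch below shows one plausible shape for the CSS branch, with an illustrative baseSelector and field list rather than the file's actual values:

```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy


def make_css_strategy(css_selector: str) -> JsonCssExtractionStrategy:
    # Illustrative schema: one base element, one text field taken from the
    # user-supplied selector. The real file may define different fields.
    schema = {
        "name": "Content",
        "baseSelector": "body",
        "fields": [
            {"name": "text", "selector": css_selector, "type": "text"},
        ],
    }
    return JsonCssExtractionStrategy(schema, verbose=True)
```
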
 85   async def crawl_with_subpages(request: CrawlRequest, base_url: str, current_depth: int = 1, visited: Set[str] = None) -> Dict:
 86 +     """Recursively crawl pages including sub-pages up to the specified depth."""
 87       if visited is None:
 88           visited = set()
 89   
 97   
 98       # Create run configuration for current page
 99       run_config = CrawlerRunConfig(
100           cache_mode=CacheMode.BYPASS,
101 +         verbose=True,
102           word_count_threshold=request.word_count_threshold,
103           css_selector=request.css_selector,
104           excluded_tags=request.excluded_tags or ["nav", "footer", "header"],
105           exclude_external_links=request.exclude_external_links,
106           wait_for=f"css:{request.css_selector}" if request.css_selector else None,
107           wait_for_images=True,
108           page_timeout=30000,
109           scan_full_page=request.scan_full_page,
110           scroll_delay=request.scroll_delay,
111           extraction_strategy=create_extraction_strategy(
112               request.extraction_type,
113               request.css_selector,
155           if len(visited) >= request.max_pages:
156               break
157   
158           try:
159               normalized_link = urllib.parse.urljoin(request.url, link)
160               link_domain = urllib.parse.urlparse(normalized_link).netloc
161   
162               if normalized_link in visited or (request.exclude_external_links and link_domain != base_url):
163                   continue
164   
165               sub_request = CrawlRequest(
166                   **{**request.dict(), "url": normalized_link}
167               )
168   
169               sub_result = await crawl_with_subpages(
170                   sub_request,
171                   base_url,
186       print(f"Error crawling {request.url}: {str(e)}")
187       return None
188   
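
The sub-page loop bounds the recursion with three checks: a page budget (max_pages), a visited set, and an optional same-domain filter built with urllib.parse. The same filtering logic, pulled out into a standalone sketch (the function name and sample inputs are illustrative):

```python
import urllib.parse
from typing import Iterable, List, Set


def select_subpage_links(page_url: str, links: Iterable[str], visited: Set[str],
                         base_domain: str, max_pages: int,
                         exclude_external: bool = True) -> List[str]:
    """Mirror the checks applied to each link in crawl_with_subpages."""
    selected: List[str] = []
    for link in links:
        if len(visited) + len(selected) >= max_pages:
            break
        # Resolve relative links against the page that contained them.
        normalized = urllib.parse.urljoin(page_url, link)
        domain = urllib.parse.urlparse(normalized).netloc
        if normalized in visited or (exclude_external and domain != base_domain):
            continue
        selected.append(normalized)
    return selected


# ['https://example.com/docs/intro.html', 'https://example.com/pricing']
print(select_subpage_links(
    "https://example.com/docs/",
    ["intro.html", "/pricing", "https://other.org/page"],
    visited=set(),
    base_domain="example.com",
    max_pages=10,
))
```
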
189 + async def crawl_url(request: CrawlRequest) -> Dict:
190 +     """Crawl a URL and return the extracted content."""
191       try:
192           base_url = urllib.parse.urlparse(request.url).netloc
193   
194           if request.crawl_subpages:
195               results = await crawl_with_subpages(request, base_url)
196               if not results or not results["pages"]:
197 +                 raise Exception(f"Failed to crawl pages starting from {request.url}")
198   
199               combined_markdown = "\\n\\n---\\n\\n".join(
200                   f"## Page: {page['url']}\\n{page['markdown']}"
201                   for page in results["pages"]
219                   "pages": results["pages"]
220               }
221           else:
222               wait_condition = f"css:{request.css_selector}" if request.css_selector else None
223   
224               run_config = CrawlerRunConfig(
225                   cache_mode=CacheMode.BYPASS,
226                   word_count_threshold=request.word_count_threshold,
227                   css_selector=request.css_selector,
228                   excluded_tags=request.excluded_tags or ["nav", "footer", "header"],
229 +                 wait_for=wait_condition,
230 +                 wait_for_images=True,
231 +                 page_timeout=30000,
232                   scan_full_page=request.scan_full_page,
233                   scroll_delay=request.scroll_delay,
234                   extraction_strategy=create_extraction_strategy(
235                       request.extraction_type,
236                       request.css_selector,
238                   )
239               )
240   
241               browser_config = BrowserConfig(
242                   headless=True,
243                   viewport_width=1920,
244                   viewport_height=1080
245               )
246   
247 +             async with AsyncWebCrawler(config=browser_config) as crawler:
248 +                 result = await crawler.arun(url=request.url, config=run_config)
249 + 
250 +                 if not result.success:
251 +                     raise Exception(result.error_message)
252 + 
253 +                 images = result.media.get("images", []) if hasattr(result, 'media') else []
254 +                 image_info = "\n### Images Found\n" if images else ""
255 +                 for i, img in enumerate(images[:5]):
256 +                     image_info += f"- Image {i+1}: {img.get('src', 'N/A')}\n"
257 +                     if img.get('alt'):
258 +                         image_info += f"  Alt: {img['alt']}\n"
259 +                     if img.get('score'):
260 +                         image_info += f"  Score: {img['score']}\n"
261 + 
262 +                 return {
263 +                     "markdown": result.markdown_v2 if hasattr(result, 'markdown_v2') else "",
264 +                     "metadata": {
265 +                         "url": request.url,
266 +                         "crawler_type": request.crawler_type.value,
267 +                         "extraction_type": request.extraction_type.value,
268 +                         "word_count_threshold": request.word_count_threshold,
269 +                         "css_selector": request.css_selector,
270 +                         "xpath_query": request.xpath_query,
271 +                         "scan_full_page": request.scan_full_page,
272 +                         "scroll_delay": request.scroll_delay,
273 +                         "wait_condition": wait_condition
274 +                     },
275 +                     "extracted_content": result.extracted_content if hasattr(result, 'extracted_content') else None,
276 +                     "image_info": image_info
277 +                 }
278       except Exception as e:
279 +         raise Exception(str(e))
280   
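
The else branch above is the core single-page flow: build a CrawlerRunConfig, open AsyncWebCrawler as an async context manager, call arun(), and read markdown off the result. A stripped-down sketch of the same flow outside the app (the URL and option values are illustrative; markdown_v2 is probed defensively because some crawl4ai releases expose markdown instead):

```python
import asyncio

from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig


async def fetch_markdown(url: str) -> str:
    # Same shape as crawl_url's single-page branch, reduced to the essentials.
    browser_config = BrowserConfig(headless=True, viewport_width=1920, viewport_height=1080)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=100,
        excluded_tags=["nav", "footer", "header"],
        page_timeout=30000,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=run_config)
        if not result.success:
            raise RuntimeError(result.error_message)
        # Prefer markdown_v2 when present, as crawl_url does; fall back to markdown.
        return str(getattr(result, "markdown_v2", None) or result.markdown)


if __name__ == "__main__":
    print(asyncio.run(fetch_markdown("https://example.com")))
```
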
281   async def gradio_crawl(
282       url: str,
292       max_pages: int,
293       exclude_external_links: bool
294   ) -> tuple[str, str]:
295 +     """Handle crawling requests from the Gradio interface."""
296       try:
297 +         request = CrawlRequest(
298 +             url=url,
299 +             crawler_type=CrawlerType(crawler_type.lower()),
300 +             extraction_type=ExtractionType(extraction_type.lower()),
301 +             word_count_threshold=word_count_threshold,
302 +             css_selector=css_selector if css_selector else None,
303 +             xpath_query=xpath_query if xpath_query else None,
304 +             scan_full_page=scan_full_page,
305 +             scroll_delay=scroll_delay,
306 +             crawl_subpages=crawl_subpages,
307 +             max_depth=max_depth,
308 +             max_pages=max_pages,
309 +             exclude_external_links=exclude_external_links
310 +         )
311 + 
312           result = await crawl_url(request)
313   
314           markdown_content = str(result["markdown"]) if result.get("markdown") else ""
315   
316           metadata_str = f"""### Metadata
317   - URL: {result['metadata']['url']}
318   - Crawler Type: {result['metadata']['crawler_type']}
323   - Full Page Scan: {result['metadata']['scan_full_page']}
324   - Scroll Delay: {result['metadata']['scroll_delay']}s"""
325   
326           if crawl_subpages:
327               metadata_str += f"""
328   - Total Pages Crawled: {result['metadata'].get('total_pages_crawled', 0)}
329   - Total Links Found: {result['metadata'].get('total_links_found', 0)}
330   - Max Depth Reached: {result['metadata'].get('max_depth_reached', 1)}"""
331   
332           if result.get('image_info'):
333               metadata_str += f"\n\n{result['image_info']}"
334   
335           if result.get("extracted_content"):
336               metadata_str += f"\n\n### Extracted Content\n```json\n{result['extracted_content']}\n```"
337   
340           error_msg = f"Error: {str(e)}"
341           return error_msg, "Error occurred while crawling"
342   
343 + # Create Gradio interface
344   demo = gr.Interface(
345       fn=gradio_crawl,
346       inputs=[
442   
443   The extracted content will be displayed in markdown format along with metadata and extraction results.
444   When sub-page crawling is enabled, content from all crawled pages will be combined in the output.
445 +     """,
446 +     examples=[
447 +         ["https://example.com", "Basic", "Default", 100, "", "", False, 0.5, False, 1, 10, True],
448 +         ["https://example.com/blog", "Basic", "CSS", 100, "article.post", "", True, 0.5, True, 2, 5, True],
449 +     ]
450   )
451   
452 + # For Hugging Face Spaces, we launch just the Gradio interface
453   if __name__ == "__main__":
454 +     demo.launch()
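
With the FastAPI layer gone there is no /api/crawl endpoint anymore; programmatic access would go through the Gradio app itself. A hedged sketch using gradio_client, assuming the Space is public, the Interface exposes the default /predict endpoint, and it returns the (markdown, metadata) pair produced by gradio_crawl; the Space id is a placeholder:

```python
from gradio_client import Client

client = Client("your-username/crawl4ai-demo")  # placeholder Space id

# Positional inputs follow gradio_crawl's signature: url, crawler_type,
# extraction_type, word_count_threshold, css_selector, xpath_query,
# scan_full_page, scroll_delay, crawl_subpages, max_depth, max_pages,
# exclude_external_links.
markdown, metadata = client.predict(
    "https://example.com",
    "Basic",
    "Default",
    100,
    "",
    "",
    False,
    0.5,
    False,
    1,
    10,
    True,
    api_name="/predict",
)
print(metadata)
```
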