Spaces:

hellorahulk
/

crawlitall

Build error

App Files Community

hellorahulk committed on Feb 5, 2025

Commit

fede52d

verified ·

1 Parent(s): 0c96379

Update app.py

Browse files

Files changed (1) hide show

app.py +115 -106

app.py CHANGED Viewed

@@ -348,114 +348,123 @@ async def gradio_crawl(
         error_msg = f"Error: {str(e)}"
         return error_msg, "Error occurred while crawling"
-# Create Gradio interface with Docker-optimized settings
-demo = gr.Interface(
-    fn=gradio_crawl,
-    inputs=[
-        gr.Textbox(
-            label="URL",
-            placeholder="Enter URL to crawl",
-            info="The webpage URL to extract content from"
-        ),
-        gr.Dropdown(
-            choices=["Basic", "LLM", "Cosine", "JSON/CSS"],
-            label="Crawler Type",
-            value="Basic",
-            info="Select the content extraction strategy"
-        ),
-        gr.Dropdown(
-            choices=["Default", "CSS", "XPath", "LLM", "Combined"],
-            label="Extraction Type",
-            value="Default",
-            info="Choose how to extract content from the page"
-        ),
-        gr.Slider(
-            minimum=50,
-            maximum=500,
-            value=100,
-            step=50,
-            label="Word Count Threshold",
-            info="Minimum number of words required for content extraction"
-        ),
-        gr.Textbox(
-            label="CSS Selector",
-            placeholder="e.g., article.content, main.post",
-            info="CSS selector to target specific content (used with CSS extraction type)"
-        ),
-        gr.Textbox(
-            label="XPath Query",
-            placeholder="e.g., //article[@class='content']",
-            info="XPath query to target specific content (used with XPath extraction type)"
-        ),
-        gr.Checkbox(
-            label="Scan Full Page",
-            value=False,
-            info="Enable to scroll through the entire page to load lazy content"
-        ),
-        gr.Slider(
-            minimum=0.1,
-            maximum=2.0,
-            value=0.5,
-            step=0.1,
-            label="Scroll Delay",
-            info="Delay between scroll steps in seconds when scanning full page"
-        ),
-        gr.Checkbox(
-            label="Crawl Sub-pages",
-            value=False,
-            info="Enable to crawl links found on the page"
-        ),
-        gr.Slider(
-            minimum=1,
-            maximum=5,
-            value=1,
-            step=1,
-            label="Max Crawl Depth",
-            info="Maximum depth for recursive crawling (1 = only direct links)"
-        ),
-        gr.Slider(
-            minimum=1,
-            maximum=50,
-            value=10,
-            step=5,
-            label="Max Pages",
-            info="Maximum number of pages to crawl"
-        ),
-        gr.Checkbox(
-            label="Exclude External Links",
-            value=True,
-            info="Only crawl links within the same domain"
-        )
-    ],
-    outputs=[
-        gr.Markdown(label="Generated Markdown"),
-        gr.Markdown(label="Metadata & Extraction Results")
-    ],
-    title="Crawl4AI Demo",
-    description="""
-    This demo allows you to extract content from web pages using different crawling and extraction strategies.
-    1. Enter a URL to crawl
-    2. Select a crawler type (Basic, LLM, Cosine, JSON/CSS)
-    3. Choose an extraction strategy (Default, CSS, XPath, LLM, Combined)
-    4. Configure additional options:
-       - Word count threshold for content filtering
-       - CSS selectors for targeting specific content
-       - XPath queries for precise extraction
-       - Full page scanning for lazy-loaded content
-       - Scroll delay for controlling page scanning speed
-       - Sub-page crawling with depth control
-       - Maximum number of pages to crawl
-       - External link filtering
-    The extracted content will be displayed in markdown format along with metadata and extraction results.
-    When sub-page crawling is enabled, content from all crawled pages will be combined in the output.
-    """,
-    examples=[
-        ["https://example.com", "Basic", "Default", 100, "", "", False, 0.5, False, 1, 10, True],
-        ["https://example.com/blog", "Basic", "CSS", 100, "article.post", "", True, 0.5, True, 2, 5, True],
-    ]
-)
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)

         error_msg = f"Error: {str(e)}"
         return error_msg, "Error occurred while crawling"
+# Create Gradio interface with simplified configuration
+with gr.Blocks(title="Crawl4AI Demo") as demo:
+    gr.Markdown("""
+    # Crawl4AI Web Content Extractor
+    Extract content from web pages using different crawling and extraction strategies.
+    """)
+    with gr.Row():
+        with gr.Column():
+            url_input = gr.Textbox(
+                label="URL",
+                placeholder="Enter URL to crawl",
+                info="The webpage URL to extract content from"
+            )
+            crawler_type = gr.Dropdown(
+                choices=["Basic", "LLM", "Cosine", "JSON/CSS"],
+                label="Crawler Type",
+                value="Basic",
+                info="Select the content extraction strategy"
+            )
+            extraction_type = gr.Dropdown(
+                choices=["Default", "CSS", "XPath", "LLM", "Combined"],
+                label="Extraction Type",
+                value="Default",
+                info="Choose how to extract content from the page"
+            )
+            word_count = gr.Slider(
+                minimum=50,
+                maximum=500,
+                value=100,
+                step=50,
+                label="Word Count Threshold",
+                info="Minimum number of words required for content extraction"
+            )
+            css_selector = gr.Textbox(
+                label="CSS Selector",
+                placeholder="e.g., article.content, main.post",
+                info="CSS selector to target specific content"
+            )
+            xpath_query = gr.Textbox(
+                label="XPath Query",
+                placeholder="e.g., //article[@class='content']",
+                info="XPath query to target specific content"
+            )
+        with gr.Column():
+            scan_full_page = gr.Checkbox(
+                label="Scan Full Page",
+                value=False,
+                info="Enable to scroll through the entire page"
+            )
+            scroll_delay = gr.Slider(
+                minimum=0.1,
+                maximum=2.0,
+                value=0.5,
+                step=0.1,
+                label="Scroll Delay",
+                info="Delay between scroll steps in seconds"
+            )
+            crawl_subpages = gr.Checkbox(
+                label="Crawl Sub-pages",
+                value=False,
+                info="Enable to crawl links found on the page"
+            )
+            max_depth = gr.Slider(
+                minimum=1,
+                maximum=5,
+                value=1,
+                step=1,
+                label="Max Crawl Depth",
+                info="Maximum depth for recursive crawling"
+            )
+            max_pages = gr.Slider(
+                minimum=1,
+                maximum=50,
+                value=10,
+                step=5,
+                label="Max Pages",
+                info="Maximum number of pages to crawl"
+            )
+            exclude_external = gr.Checkbox(
+                label="Exclude External Links",
+                value=True,
+                info="Only crawl links within the same domain"
+            )
+    with gr.Row():
+        crawl_button = gr.Button("Start Crawling")
+    with gr.Row():
+        output_markdown = gr.Markdown(label="Generated Markdown")
+        output_metadata = gr.Markdown(label="Metadata & Results")
+    crawl_button.click(
+        fn=gradio_crawl,
+        inputs=[
+            url_input, crawler_type, extraction_type,
+            word_count, css_selector, xpath_query,
+            scan_full_page, scroll_delay, crawl_subpages,
+            max_depth, max_pages, exclude_external
+        ],
+        outputs=[output_markdown, output_metadata]
+    )
+    gr.Examples(
+        examples=[
+            ["https://example.com", "Basic", "Default", 100, "", "", False, 0.5, False, 1, 10, True],
+            ["https://example.com/blog", "Basic", "CSS", 100, "article.post", "", True, 0.5, True, 2, 5, True],
+        ],
+        inputs=[
+            url_input, crawler_type, extraction_type,
+            word_count, css_selector, xpath_query,
+            scan_full_page, scroll_delay, crawl_subpages,
+            max_depth, max_pages, exclude_external
+        ]
+    )
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)