Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -348,114 +348,123 @@ async def gradio_crawl(
|
|
| 348 |
error_msg = f"Error: {str(e)}"
|
| 349 |
return error_msg, "Error occurred while crawling"
|
| 350 |
|
| 351 |
-
# Create Gradio interface with
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
gr.Textbox(
|
| 356 |
-
label="URL",
|
| 357 |
-
placeholder="Enter URL to crawl",
|
| 358 |
-
info="The webpage URL to extract content from"
|
| 359 |
-
),
|
| 360 |
-
gr.Dropdown(
|
| 361 |
-
choices=["Basic", "LLM", "Cosine", "JSON/CSS"],
|
| 362 |
-
label="Crawler Type",
|
| 363 |
-
value="Basic",
|
| 364 |
-
info="Select the content extraction strategy"
|
| 365 |
-
),
|
| 366 |
-
gr.Dropdown(
|
| 367 |
-
choices=["Default", "CSS", "XPath", "LLM", "Combined"],
|
| 368 |
-
label="Extraction Type",
|
| 369 |
-
value="Default",
|
| 370 |
-
info="Choose how to extract content from the page"
|
| 371 |
-
),
|
| 372 |
-
gr.Slider(
|
| 373 |
-
minimum=50,
|
| 374 |
-
maximum=500,
|
| 375 |
-
value=100,
|
| 376 |
-
step=50,
|
| 377 |
-
label="Word Count Threshold",
|
| 378 |
-
info="Minimum number of words required for content extraction"
|
| 379 |
-
),
|
| 380 |
-
gr.Textbox(
|
| 381 |
-
label="CSS Selector",
|
| 382 |
-
placeholder="e.g., article.content, main.post",
|
| 383 |
-
info="CSS selector to target specific content (used with CSS extraction type)"
|
| 384 |
-
),
|
| 385 |
-
gr.Textbox(
|
| 386 |
-
label="XPath Query",
|
| 387 |
-
placeholder="e.g., //article[@class='content']",
|
| 388 |
-
info="XPath query to target specific content (used with XPath extraction type)"
|
| 389 |
-
),
|
| 390 |
-
gr.Checkbox(
|
| 391 |
-
label="Scan Full Page",
|
| 392 |
-
value=False,
|
| 393 |
-
info="Enable to scroll through the entire page to load lazy content"
|
| 394 |
-
),
|
| 395 |
-
gr.Slider(
|
| 396 |
-
minimum=0.1,
|
| 397 |
-
maximum=2.0,
|
| 398 |
-
value=0.5,
|
| 399 |
-
step=0.1,
|
| 400 |
-
label="Scroll Delay",
|
| 401 |
-
info="Delay between scroll steps in seconds when scanning full page"
|
| 402 |
-
),
|
| 403 |
-
gr.Checkbox(
|
| 404 |
-
label="Crawl Sub-pages",
|
| 405 |
-
value=False,
|
| 406 |
-
info="Enable to crawl links found on the page"
|
| 407 |
-
),
|
| 408 |
-
gr.Slider(
|
| 409 |
-
minimum=1,
|
| 410 |
-
maximum=5,
|
| 411 |
-
value=1,
|
| 412 |
-
step=1,
|
| 413 |
-
label="Max Crawl Depth",
|
| 414 |
-
info="Maximum depth for recursive crawling (1 = only direct links)"
|
| 415 |
-
),
|
| 416 |
-
gr.Slider(
|
| 417 |
-
minimum=1,
|
| 418 |
-
maximum=50,
|
| 419 |
-
value=10,
|
| 420 |
-
step=5,
|
| 421 |
-
label="Max Pages",
|
| 422 |
-
info="Maximum number of pages to crawl"
|
| 423 |
-
),
|
| 424 |
-
gr.Checkbox(
|
| 425 |
-
label="Exclude External Links",
|
| 426 |
-
value=True,
|
| 427 |
-
info="Only crawl links within the same domain"
|
| 428 |
-
)
|
| 429 |
-
],
|
| 430 |
-
outputs=[
|
| 431 |
-
gr.Markdown(label="Generated Markdown"),
|
| 432 |
-
gr.Markdown(label="Metadata & Extraction Results")
|
| 433 |
-
],
|
| 434 |
-
title="Crawl4AI Demo",
|
| 435 |
-
description="""
|
| 436 |
-
This demo allows you to extract content from web pages using different crawling and extraction strategies.
|
| 437 |
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
3. Choose an extraction strategy (Default, CSS, XPath, LLM, Combined)
|
| 441 |
-
4. Configure additional options:
|
| 442 |
-
- Word count threshold for content filtering
|
| 443 |
-
- CSS selectors for targeting specific content
|
| 444 |
-
- XPath queries for precise extraction
|
| 445 |
-
- Full page scanning for lazy-loaded content
|
| 446 |
-
- Scroll delay for controlling page scanning speed
|
| 447 |
-
- Sub-page crawling with depth control
|
| 448 |
-
- Maximum number of pages to crawl
|
| 449 |
-
- External link filtering
|
| 450 |
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
|
| 460 |
if __name__ == "__main__":
|
| 461 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|
| 348 |
error_msg = f"Error: {str(e)}"
|
| 349 |
return error_msg, "Error occurred while crawling"
|
| 350 |
|
| 351 |
+
# Create Gradio interface with simplified configuration
|
| 352 |
+
with gr.Blocks(title="Crawl4AI Demo") as demo:
|
| 353 |
+
gr.Markdown("""
|
| 354 |
+
# Crawl4AI Web Content Extractor
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
|
| 356 |
+
Extract content from web pages using different crawling and extraction strategies.
|
| 357 |
+
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
|
| 359 |
+
with gr.Row():
|
| 360 |
+
with gr.Column():
|
| 361 |
+
url_input = gr.Textbox(
|
| 362 |
+
label="URL",
|
| 363 |
+
placeholder="Enter URL to crawl",
|
| 364 |
+
info="The webpage URL to extract content from"
|
| 365 |
+
)
|
| 366 |
+
crawler_type = gr.Dropdown(
|
| 367 |
+
choices=["Basic", "LLM", "Cosine", "JSON/CSS"],
|
| 368 |
+
label="Crawler Type",
|
| 369 |
+
value="Basic",
|
| 370 |
+
info="Select the content extraction strategy"
|
| 371 |
+
)
|
| 372 |
+
extraction_type = gr.Dropdown(
|
| 373 |
+
choices=["Default", "CSS", "XPath", "LLM", "Combined"],
|
| 374 |
+
label="Extraction Type",
|
| 375 |
+
value="Default",
|
| 376 |
+
info="Choose how to extract content from the page"
|
| 377 |
+
)
|
| 378 |
+
word_count = gr.Slider(
|
| 379 |
+
minimum=50,
|
| 380 |
+
maximum=500,
|
| 381 |
+
value=100,
|
| 382 |
+
step=50,
|
| 383 |
+
label="Word Count Threshold",
|
| 384 |
+
info="Minimum number of words required for content extraction"
|
| 385 |
+
)
|
| 386 |
+
css_selector = gr.Textbox(
|
| 387 |
+
label="CSS Selector",
|
| 388 |
+
placeholder="e.g., article.content, main.post",
|
| 389 |
+
info="CSS selector to target specific content"
|
| 390 |
+
)
|
| 391 |
+
xpath_query = gr.Textbox(
|
| 392 |
+
label="XPath Query",
|
| 393 |
+
placeholder="e.g., //article[@class='content']",
|
| 394 |
+
info="XPath query to target specific content"
|
| 395 |
+
)
|
| 396 |
+
|
| 397 |
+
with gr.Column():
|
| 398 |
+
scan_full_page = gr.Checkbox(
|
| 399 |
+
label="Scan Full Page",
|
| 400 |
+
value=False,
|
| 401 |
+
info="Enable to scroll through the entire page"
|
| 402 |
+
)
|
| 403 |
+
scroll_delay = gr.Slider(
|
| 404 |
+
minimum=0.1,
|
| 405 |
+
maximum=2.0,
|
| 406 |
+
value=0.5,
|
| 407 |
+
step=0.1,
|
| 408 |
+
label="Scroll Delay",
|
| 409 |
+
info="Delay between scroll steps in seconds"
|
| 410 |
+
)
|
| 411 |
+
crawl_subpages = gr.Checkbox(
|
| 412 |
+
label="Crawl Sub-pages",
|
| 413 |
+
value=False,
|
| 414 |
+
info="Enable to crawl links found on the page"
|
| 415 |
+
)
|
| 416 |
+
max_depth = gr.Slider(
|
| 417 |
+
minimum=1,
|
| 418 |
+
maximum=5,
|
| 419 |
+
value=1,
|
| 420 |
+
step=1,
|
| 421 |
+
label="Max Crawl Depth",
|
| 422 |
+
info="Maximum depth for recursive crawling"
|
| 423 |
+
)
|
| 424 |
+
max_pages = gr.Slider(
|
| 425 |
+
minimum=1,
|
| 426 |
+
maximum=50,
|
| 427 |
+
value=10,
|
| 428 |
+
step=5,
|
| 429 |
+
label="Max Pages",
|
| 430 |
+
info="Maximum number of pages to crawl"
|
| 431 |
+
)
|
| 432 |
+
exclude_external = gr.Checkbox(
|
| 433 |
+
label="Exclude External Links",
|
| 434 |
+
value=True,
|
| 435 |
+
info="Only crawl links within the same domain"
|
| 436 |
+
)
|
| 437 |
+
|
| 438 |
+
with gr.Row():
|
| 439 |
+
crawl_button = gr.Button("Start Crawling")
|
| 440 |
+
|
| 441 |
+
with gr.Row():
|
| 442 |
+
output_markdown = gr.Markdown(label="Generated Markdown")
|
| 443 |
+
output_metadata = gr.Markdown(label="Metadata & Results")
|
| 444 |
+
|
| 445 |
+
crawl_button.click(
|
| 446 |
+
fn=gradio_crawl,
|
| 447 |
+
inputs=[
|
| 448 |
+
url_input, crawler_type, extraction_type,
|
| 449 |
+
word_count, css_selector, xpath_query,
|
| 450 |
+
scan_full_page, scroll_delay, crawl_subpages,
|
| 451 |
+
max_depth, max_pages, exclude_external
|
| 452 |
+
],
|
| 453 |
+
outputs=[output_markdown, output_metadata]
|
| 454 |
+
)
|
| 455 |
+
|
| 456 |
+
gr.Examples(
|
| 457 |
+
examples=[
|
| 458 |
+
["https://example.com", "Basic", "Default", 100, "", "", False, 0.5, False, 1, 10, True],
|
| 459 |
+
["https://example.com/blog", "Basic", "CSS", 100, "article.post", "", True, 0.5, True, 2, 5, True],
|
| 460 |
+
],
|
| 461 |
+
inputs=[
|
| 462 |
+
url_input, crawler_type, extraction_type,
|
| 463 |
+
word_count, css_selector, xpath_query,
|
| 464 |
+
scan_full_page, scroll_delay, crawl_subpages,
|
| 465 |
+
max_depth, max_pages, exclude_external
|
| 466 |
+
]
|
| 467 |
+
)
|
| 468 |
|
| 469 |
if __name__ == "__main__":
|
| 470 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|