hellorahulk committed
Commit 41844a4 · verified · 1 Parent(s): d1777c6

Update app.py

Files changed (1): app.py +80 -267

app.py CHANGED
@@ -1,89 +1,50 @@
 """
 Crawl4AI Demo Application
-========================
+====================================================
 
-This application provides a web interface and API for the Crawl4AI library, allowing users to extract
-content from web pages using different crawling strategies.
+This is a modified version of the Crawl4AI demo application specifically designed
+for deployment on Hugging Face Spaces.
 
 Features:
 ---------
 - Web interface built with Gradio for interactive use
-- RESTful API endpoint for programmatic access
 - Support for multiple crawler types (Basic, LLM, Cosine, JSON/CSS)
 - Configurable word count threshold
 - Markdown output with metadata
+- Sub-page crawling capabilities
+- Lazy loading support
 
 Usage:
 ------
-1. Start the server:
-   ```
-   python app.py
-   ```
-2. Access the web interface at http://localhost:8000
-3. Use the API endpoint at http://localhost:8000/api/crawl
-
-API Example:
------------
-```python
-import requests
-
-response = requests.post(
-    "http://localhost:8000/api/crawl",
-    json={
-        "url": "https://example.com",
-        "crawler_type": "basic",
-        "word_count_threshold": 100
-    }
-)
-result = response.json()
-```
+This version is specifically designed for Hugging Face Spaces deployment.
+Simply upload this file to your Space and it will automatically run.
 
 Dependencies:
 ------------
 - gradio
-- fastapi
-- crawl4ai
-- uvicorn
+- crawl4ai>=0.4.3b0
+- python-dotenv>=1.0.0
+- pydantic>=2.5.0
 """
 
 import gradio as gr
 import asyncio
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
-from enum import Enum
 from typing import Optional, Dict, Any, List, Set
-from contextlib import asynccontextmanager
+from enum import Enum
+from pydantic import BaseModel
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
-from playwright.async_api import async_playwright
 import urllib.parse
 
 class CrawlerType(str, Enum):
-    """
-    Enumeration of supported crawler types.
-
-    Attributes:
-        BASIC (str): Simple HTML parsing and content extraction
-        LLM (str): Language model-based content extraction
-        COSINE (str): Cosine similarity-based content extraction
-        JSON_CSS (str): JSON/CSS selector-based content extraction
-    """
+    """Enumeration of supported crawler types."""
     BASIC = "basic"
     LLM = "llm"
     COSINE = "cosine"
     JSON_CSS = "json_css"
 
 class ExtractionType(str, Enum):
-    """
-    Enumeration of supported extraction strategies.
-
-    Attributes:
-        DEFAULT (str): Default extraction without specific strategy
-        CSS (str): CSS selector-based extraction
-        XPATH (str): XPath-based extraction
-        LLM (str): Language model-based extraction
-        COMBINED (str): Combined strategy using multiple approaches
-    """
+    """Enumeration of supported extraction strategies."""
    DEFAULT = "default"
    CSS = "css"
    XPATH = "xpath"
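Both enums subclass `str` alongside `Enum`, so members compare equal to their plain string values; this is what lets `gradio_crawl` pass the lowercased dropdown strings straight to the enum constructors. A minimal sketch of that behavior:

```python
from enum import Enum

class CrawlerType(str, Enum):
    BASIC = "basic"
    LLM = "llm"

# str subclassing makes members interchangeable with their values
assert CrawlerType.BASIC == "basic"
assert CrawlerType("basic") is CrawlerType.BASIC  # lookup by value, as in gradio_crawl
```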
@@ -91,24 +52,7 @@ class ExtractionType(str, Enum):
     COMBINED = "combined"
 
 class CrawlRequest(BaseModel):
-    """
-    Request model for crawling operations.
-
-    Attributes:
-        url (str): The URL to crawl
-        crawler_type (CrawlerType): The type of crawler to use
-        extraction_type (ExtractionType): The extraction strategy to use
-        word_count_threshold (int): Minimum word count for extracted content
-        css_selector (Optional[str]): CSS selector for content extraction
-        xpath_query (Optional[str]): XPath query for content extraction
-        excluded_tags (Optional[list]): HTML tags to exclude from extraction
-        scan_full_page (bool): Whether to scan the entire page for lazy-loaded content
-        scroll_delay (float): Delay between scroll steps in seconds
-        crawl_subpages (bool): Whether to crawl sub-pages found in links
-        max_depth (int): Maximum depth for recursive crawling (1 = only direct links)
-        exclude_external_links (bool): Whether to exclude links to external domains
-        max_pages (int): Maximum number of pages to crawl
-    """
+    """Request model for crawling operations."""
     url: str
     crawler_type: CrawlerType = CrawlerType.BASIC
     extraction_type: ExtractionType = ExtractionType.DEFAULT
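Every `CrawlRequest` field except `url` carries a default, so a request can be built from a URL alone, and the sub-page crawler clones requests by splatting `.dict()` with an overridden URL. A rough illustration against the model as defined above (`.dict()` is the Pydantic v1 spelling; under the pinned `pydantic>=2.5.0` it still works, though `model_dump()` is the preferred name):

```python
req = CrawlRequest(url="https://example.com")
assert req.crawler_type == CrawlerType.BASIC   # defaults apply
assert req.max_pages == 10

# Clone with one field overridden, as crawl_with_subpages does:
sub = CrawlRequest(**{**req.dict(), "url": "https://example.com/about"})
```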
@@ -123,72 +67,8 @@ class CrawlRequest(BaseModel):
     exclude_external_links: bool = True
     max_pages: int = 10
 
-# Global crawler variable
-crawler = None
-
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    """
-    Lifespan context manager for FastAPI application.
-    Handles crawler initialization and cleanup.
-    """
-    global crawler
-
-    # Initialize browser configuration
-    browser_config = BrowserConfig(
-        headless=True,
-        viewport_width=1920,
-        viewport_height=1080
-    )
-
-    # Create and initialize crawler
-    try:
-        crawler = AsyncWebCrawler(config=browser_config)
-        print("Crawler initialized successfully")
-        yield
-    finally:
-        if crawler:
-            await crawler.close()
-            print("Crawler resources cleaned up")
-
-# Create FastAPI app with lifespan handler
-app = FastAPI(
-    title="Crawl4AI Demo",
-    description="A web interface and API for extracting content from web pages using Crawl4AI",
-    version="1.0.0",
-    lifespan=lifespan
-)
-
-@app.on_event("startup")
-async def startup_event():
-    """Initialize the browser on startup"""
-    try:
-        async with async_playwright() as playwright:
-            await crawler.initialize(playwright)
-    except Exception as e:
-        print(f"Error initializing browser: {e}")
-        raise
-
-@app.on_event("shutdown")
-async def shutdown_event():
-    """Clean up browser resources on shutdown"""
-    try:
-        await crawler.cleanup()
-    except Exception as e:
-        print(f"Error during cleanup: {e}")
-
 def create_extraction_strategy(extraction_type: ExtractionType, css_selector: Optional[str] = None, xpath_query: Optional[str] = None) -> Any:
-    """
-    Create an extraction strategy based on the specified type.
-
-    Args:
-        extraction_type (ExtractionType): The type of extraction strategy
-        css_selector (Optional[str]): CSS selector for content extraction
-        xpath_query (Optional[str]): XPath query for content extraction
-
-    Returns:
-        Any: The configured extraction strategy
-    """
+    """Create an extraction strategy based on the specified type."""
     if extraction_type == ExtractionType.CSS and css_selector:
         schema = {
             "name": "Content",
@@ -203,9 +83,7 @@ def create_extraction_strategy(extraction_type: ExtractionType, css_selector: Optional[str] = None, xpath_query: Optional[str] = None) -> Any:
     return None
 
 async def crawl_with_subpages(request: CrawlRequest, base_url: str, current_depth: int = 1, visited: Set[str] = None) -> Dict:
-    """
-    Recursively crawl pages including sub-pages up to the specified depth.
-    """
+    """Recursively crawl pages including sub-pages up to the specified depth."""
     if visited is None:
         visited = set()
 
@@ -219,26 +97,17 @@ async def crawl_with_subpages(request: CrawlRequest, base_url: str, current_depth: int = 1, visited: Set[str] = None) -> Dict:
 
     # Create run configuration for current page
     run_config = CrawlerRunConfig(
-        # Core settings
         cache_mode=CacheMode.BYPASS,
-        verbose=True,  # Enable verbose logging
-
-        # Content settings
+        verbose=True,
         word_count_threshold=request.word_count_threshold,
         css_selector=request.css_selector,
         excluded_tags=request.excluded_tags or ["nav", "footer", "header"],
         exclude_external_links=request.exclude_external_links,
-
-        # Page & JS settings
         wait_for=f"css:{request.css_selector}" if request.css_selector else None,
         wait_for_images=True,
         page_timeout=30000,
-
-        # Lazy loading settings
         scan_full_page=request.scan_full_page,
         scroll_delay=request.scroll_delay,
-
-        # Extraction settings
         extraction_strategy=create_extraction_strategy(
             request.extraction_type,
             request.css_selector,
@@ -286,21 +155,17 @@ async def crawl_with_subpages(request: CrawlRequest, base_url: str, current_depth: int = 1, visited: Set[str] = None) -> Dict:
         if len(visited) >= request.max_pages:
             break
 
-        # Normalize and validate the link
         try:
             normalized_link = urllib.parse.urljoin(request.url, link)
             link_domain = urllib.parse.urlparse(normalized_link).netloc
 
-            # Skip if already visited or external link
             if normalized_link in visited or (request.exclude_external_links and link_domain != base_url):
                 continue
 
-            # Create new request for sub-page
             sub_request = CrawlRequest(
                 **{**request.dict(), "url": normalized_link}
             )
 
-            # Recursively crawl sub-page
             sub_result = await crawl_with_subpages(
                 sub_request,
                 base_url,
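The link handling here is pure standard library: `urljoin` resolves relative hrefs against the current page URL, and `urlparse(...).netloc` yields the domain that is compared against `base_url` for the external-link filter. For instance:

```python
import urllib.parse

base = "https://example.com/blog/post-1"
print(urllib.parse.urljoin(base, "/about"))   # https://example.com/about
print(urllib.parse.urljoin(base, "next"))     # https://example.com/blog/next
print(urllib.parse.urlparse("https://other.org/x").netloc)  # other.org
```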
@@ -321,20 +186,16 @@ async def crawl_with_subpages(request: CrawlRequest, base_url: str, current_depth: int = 1, visited: Set[str] = None) -> Dict:
         print(f"Error crawling {request.url}: {str(e)}")
         return None
 
-@app.post("/api/crawl")
-async def crawl_url(request: CrawlRequest):
-    """
-    API endpoint to crawl a URL and return the extracted content.
-    """
+async def crawl_url(request: CrawlRequest) -> Dict:
+    """Crawl a URL and return the extracted content."""
     try:
         base_url = urllib.parse.urlparse(request.url).netloc
 
         if request.crawl_subpages:
             results = await crawl_with_subpages(request, base_url)
             if not results or not results["pages"]:
-                raise HTTPException(status_code=500, detail=f"Failed to crawl pages starting from {request.url}")
+                raise Exception(f"Failed to crawl pages starting from {request.url}")
 
-            # Combine results from all pages
             combined_markdown = "\n\n---\n\n".join(
                 f"## Page: {page['url']}\n{page['markdown']}"
                 for page in results["pages"]
@@ -358,29 +219,18 @@ async def crawl_url(request: CrawlRequest):
             "pages": results["pages"]
             }
         else:
-            # Format wait_for condition properly if CSS selector is provided
             wait_condition = f"css:{request.css_selector}" if request.css_selector else None
 
-            # Create run configuration
             run_config = CrawlerRunConfig(
-                # Core settings
                 cache_mode=CacheMode.BYPASS,
-
-                # Content settings
                 word_count_threshold=request.word_count_threshold,
                 css_selector=request.css_selector,
                 excluded_tags=request.excluded_tags or ["nav", "footer", "header"],
-
-                # Page & JS settings
-                wait_for=wait_condition,  # Using properly formatted wait condition
-                wait_for_images=True,  # Always wait for images to load
-                page_timeout=30000,  # 30 seconds timeout for page operations
-
-                # Lazy loading settings
+                wait_for=wait_condition,
+                wait_for_images=True,
+                page_timeout=30000,
                 scan_full_page=request.scan_full_page,
                 scroll_delay=request.scroll_delay,
-
-                # Extraction settings
                 extraction_strategy=create_extraction_strategy(
                     request.extraction_type,
                     request.css_selector,
@@ -388,59 +238,45 @@ async def crawl_url(request: CrawlRequest):
                 )
             )
 
-            # Create browser config with optimized settings
             browser_config = BrowserConfig(
                 headless=True,
                 viewport_width=1920,
                 viewport_height=1080
             )
 
-            async with AsyncWebCrawler(config=browser_config) as temp_crawler:
-                try:
-                    result = await temp_crawler.arun(
-                        url=request.url,
-                        config=run_config
-                    )
-
-                    if not result.success:
-                        raise HTTPException(status_code=500, detail=result.error_message)
-
-                    # Get image information
-                    images = result.media.get("images", []) if hasattr(result, 'media') else []
-                    image_info = "\n### Images Found\n" if images else ""
-                    for i, img in enumerate(images[:5]):  # Show first 5 images
-                        image_info += f"- Image {i+1}: {img.get('src', 'N/A')}\n"
-                        if img.get('alt'):
-                            image_info += f"  Alt: {img['alt']}\n"
-                        if img.get('score'):
-                            image_info += f"  Score: {img['score']}\n"
-
-                    return {
-                        "markdown": result.markdown_v2 if hasattr(result, 'markdown_v2') else "",
-                        "metadata": {
-                            "url": request.url,
-                            "crawler_type": request.crawler_type.value,
-                            "extraction_type": request.extraction_type.value,
-                            "word_count_threshold": request.word_count_threshold,
-                            "css_selector": request.css_selector,
-                            "xpath_query": request.xpath_query,
-                            "scan_full_page": request.scan_full_page,
-                            "scroll_delay": request.scroll_delay,
-                            "wait_condition": wait_condition
-                        },
-                        "extracted_content": result.extracted_content if hasattr(result, 'extracted_content') else None,
-                        "image_info": image_info
-                    }
-                except Exception as e:
-                    # More specific error handling
-                    error_msg = str(e)
-                    if "Wait condition failed" in error_msg:
-                        error_msg = f"Failed to find element matching selector '{request.css_selector}'. Please check if the selector is correct."
-                    elif "TimeoutError" in error_msg:
-                        error_msg = "Page took too long to load. Please try again or check the URL."
-                    raise HTTPException(status_code=500, detail=error_msg)
+            async with AsyncWebCrawler(config=browser_config) as crawler:
+                result = await crawler.arun(url=request.url, config=run_config)
+
+                if not result.success:
+                    raise Exception(result.error_message)
+
+                images = result.media.get("images", []) if hasattr(result, 'media') else []
+                image_info = "\n### Images Found\n" if images else ""
+                for i, img in enumerate(images[:5]):
+                    image_info += f"- Image {i+1}: {img.get('src', 'N/A')}\n"
+                    if img.get('alt'):
+                        image_info += f"  Alt: {img['alt']}\n"
+                    if img.get('score'):
+                        image_info += f"  Score: {img['score']}\n"
+
+                return {
+                    "markdown": result.markdown_v2 if hasattr(result, 'markdown_v2') else "",
+                    "metadata": {
+                        "url": request.url,
+                        "crawler_type": request.crawler_type.value,
+                        "extraction_type": request.extraction_type.value,
+                        "word_count_threshold": request.word_count_threshold,
+                        "css_selector": request.css_selector,
+                        "xpath_query": request.xpath_query,
+                        "scan_full_page": request.scan_full_page,
+                        "scroll_delay": request.scroll_delay,
+                        "wait_condition": wait_condition
+                    },
+                    "extracted_content": result.extracted_content if hasattr(result, 'extracted_content') else None,
+                    "image_info": image_info
+                }
     except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
+        raise Exception(str(e))
 
 async def gradio_crawl(
     url: str,
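With the FastAPI layer removed, `crawl_url` drives the crawler directly through its async context manager, which handles browser startup and teardown. The same pattern works as a standalone script; a minimal sketch using only calls that already appear in this diff:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

async def main():
    browser_config = BrowserConfig(headless=True, viewport_width=1920, viewport_height=1080)
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, word_count_threshold=100)
    # The context manager starts and cleans up the browser.
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url="https://example.com", config=run_config)
        if result.success:
            print(str(result.markdown_v2)[:500])

asyncio.run(main())
```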
@@ -456,48 +292,27 @@ async def gradio_crawl(
     max_pages: int,
     exclude_external_links: bool
 ) -> tuple[str, str]:
-    """
-    Gradio interface function to handle crawling requests from the web UI.
-
-    Args:
-        url (str): The webpage URL to crawl
-        crawler_type (str): Type of crawler to use
-        extraction_type (str): Type of extraction strategy
-        word_count_threshold (int): Minimum word count threshold
-        css_selector (str): CSS selector for content targeting
-        xpath_query (str): XPath query for content targeting
-        scan_full_page (bool): Whether to scan the full page
-        scroll_delay (float): Delay between scroll steps
-        crawl_subpages (bool): Whether to crawl sub-pages
-        max_depth (int): Maximum crawl depth
-        max_pages (int): Maximum number of pages to crawl
-        exclude_external_links (bool): Whether to exclude external links
-
-    Returns:
-        tuple[str, str]: Tuple containing (markdown_content, metadata_string)
-    """
-    request = CrawlRequest(
-        url=url,
-        crawler_type=CrawlerType(crawler_type.lower()),
-        extraction_type=ExtractionType(extraction_type.lower()),
-        word_count_threshold=word_count_threshold,
-        css_selector=css_selector if css_selector else None,
-        xpath_query=xpath_query if xpath_query else None,
-        scan_full_page=scan_full_page,
-        scroll_delay=scroll_delay,
-        crawl_subpages=crawl_subpages,
-        max_depth=max_depth,
-        max_pages=max_pages,
-        exclude_external_links=exclude_external_links
-    )
-
+    """Handle crawling requests from the Gradio interface."""
     try:
+        request = CrawlRequest(
+            url=url,
+            crawler_type=CrawlerType(crawler_type.lower()),
+            extraction_type=ExtractionType(extraction_type.lower()),
+            word_count_threshold=word_count_threshold,
+            css_selector=css_selector if css_selector else None,
+            xpath_query=xpath_query if xpath_query else None,
+            scan_full_page=scan_full_page,
+            scroll_delay=scroll_delay,
+            crawl_subpages=crawl_subpages,
+            max_depth=max_depth,
+            max_pages=max_pages,
+            exclude_external_links=exclude_external_links
+        )
+
         result = await crawl_url(request)
 
-        # Convert markdown result to string if it exists
         markdown_content = str(result["markdown"]) if result.get("markdown") else ""
 
-        # Format the metadata and results
         metadata_str = f"""### Metadata
 - URL: {result['metadata']['url']}
 - Crawler Type: {result['metadata']['crawler_type']}
@@ -508,18 +323,15 @@ async def gradio_crawl(
 - Full Page Scan: {result['metadata']['scan_full_page']}
 - Scroll Delay: {result['metadata']['scroll_delay']}s"""
 
-        # Add sub-page crawling information if enabled
         if crawl_subpages:
             metadata_str += f"""
 - Total Pages Crawled: {result['metadata'].get('total_pages_crawled', 0)}
 - Total Links Found: {result['metadata'].get('total_links_found', 0)}
 - Max Depth Reached: {result['metadata'].get('max_depth_reached', 1)}"""
 
-        # Add image information if available
         if result.get('image_info'):
             metadata_str += f"\n\n{result['image_info']}"
 
-        # Add extracted content if available
         if result.get("extracted_content"):
             metadata_str += f"\n\n### Extracted Content\n```json\n{result['extracted_content']}\n```"
 
@@ -528,7 +340,7 @@ async def gradio_crawl(
         error_msg = f"Error: {str(e)}"
         return error_msg, "Error occurred while crawling"
 
-# Create Gradio interface with enhanced documentation
+# Create Gradio interface
 demo = gr.Interface(
     fn=gradio_crawl,
     inputs=[
@@ -630,12 +442,13 @@ demo = gr.Interface(
 
     The extracted content will be displayed in markdown format along with metadata and extraction results.
     When sub-page crawling is enabled, content from all crawled pages will be combined in the output.
-    """
+    """,
+    examples=[
+        ["https://example.com", "Basic", "Default", 100, "", "", False, 0.5, False, 1, 10, True],
+        ["https://example.com/blog", "Basic", "CSS", 100, "article.post", "", True, 0.5, True, 2, 5, True],
+    ]
 )
 
-# Mount Gradio app to FastAPI
-app = gr.mount_gradio_app(app, demo, path="/")
-
+# For Hugging Face Spaces, we launch just the Gradio interface
 if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)
+    demo.launch()
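On Spaces the bare `demo.launch()` suffices, since the platform supplies the host and port for Gradio apps. To self-host the same file, `launch()` accepts explicit bindings; a hedged stand-in for the removed uvicorn line (the port here is an assumption, not something this commit sets):

```python
if __name__ == "__main__":
    # Self-hosted alternative; on Spaces, plain demo.launch() is all that's needed.
    demo.launch(server_name="0.0.0.0", server_port=7860)
```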