dwfwfwfwf committed on
Commit 142c0ff · verified · 1 Parent(s): c959c9d

Update app.py

Files changed (1):
  1. app.py +296 -144
app.py CHANGED
@@ -1,41 +1,46 @@
  from fastapi import FastAPI, HTTPException
- from pydantic import BaseModel, HttpUrl
- from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher, CrawlerMonitor, DisplayMode
  from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
  from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
- from googlesearch import search
  import uvicorn
  import asyncio
- import nest_asyncio
  import re
- from typing import Optional, List, Dict
  from bs4 import BeautifulSoup
  from datetime import datetime

- # Apply nest_asyncio to allow nested event loops
- nest_asyncio.apply()

  app = FastAPI(
-     title="Crawl4AI API",
-     description="A web API for Crawl4AI web scraping service",
-     version="1.0.0"
  )

- # Request model for the new search and crawl endpoint
- class SearchCrawlRequest(BaseModel):
-     query: str = "Latest trends in India Gen Z"  # Default query as per your request
-     num_results: int = 10  # Default to 10 results

- # Existing request model for single URL crawling
  class CrawlRequest(BaseModel):
      url: HttpUrl
      cache_mode: str = "DISABLED"
      excluded_tags: list[str] = ["nav", "footer", "aside", "header", "script", "style"]
      remove_overlay_elements: bool = True
      ignore_links: bool = True
-     subject: Optional[str] = None

- # Response models (unchanged from template)
  class Article(BaseModel):
      title: str
      url: str
@@ -43,7 +48,7 @@ class Article(BaseModel):
      image_url: Optional[str] = None
      timestamp: Optional[str] = None
      category: Optional[str] = None
-     source_url: Optional[str] = None

  class CrawlResponse(BaseModel):
      url: str
@@ -54,206 +59,353 @@ class CrawlResponse(BaseModel):
      raw_markdown: Optional[str] = None
      stats: Dict = {}

- # Utility functions from the template (unchanged)
  def clean_url(url: str) -> str:
      url = url.replace('<', '').replace('>', '').strip()
      if url.startswith('https://'):
-         domain = url[8:].split('/')[0]
-         cleaned_url = url.replace(f'https://{domain}/{domain}', domain)
-         cleaned_url = cleaned_url.replace(f'https://{domain}/https:/', '')
-         cleaned_url = cleaned_url.replace(f'https://{domain}/https://{domain}', domain)
          if not cleaned_url.startswith('https://'):
-             cleaned_url = f'https://{cleaned_url}'
      else:
          cleaned_url = url
-     cleaned_url = cleaned_url.split(' ')[0].split(')')[0].rstrip('/')
      return cleaned_url

  def is_valid_title(title: str) -> bool:
      invalid_patterns = ['**_access_time_', 'existing code', '...', 'navigation', 'menu', 'logo']
-     if any(pattern in title.lower() for pattern in invalid_patterns):
-         return False
-     if title.count('-') > 3 or title.count('_') > 2 or len(title.strip()) < 5:
-         return False
      return True

  def clean_description(description: str) -> Optional[str]:
-     if not description or '_access_time_' in description:
-         return None
      description = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', description)
-     description = re.sub(r'https?://\S+', '', description)
-     description = description.replace('(', '').replace(')', '').replace('<', '').replace('>', '')
      description = ' '.join(description.split())
-     return description if len(description) > 10 else None

- def extract_articles(markdown: str) -> List[Article]:
      articles = []
      seen_urls = set()
-     article_pattern = r'(?:!\[([^\]]*)\])?\[([^\]]+)\]\(([^)]+)\)(?:\s*\(([^)]+)\))?\s*(?:\[(.*?)\])?\s*([^[\n]*)'
-     matches = re.finditer(article_pattern, markdown, re.DOTALL)
-     for match in matches:
-         title = match.group(2)
-         url = match.group(3)
-         description = match.group(6)
-         if not is_valid_title(title):
-             continue
          url = clean_url(url)
-         if url in seen_urls or url.lower().endswith(('.jpg', '.png', '.gif', '.jpeg')):
              continue
          seen_urls.add(url)
          clean_desc = clean_description(description)
          image_url = None
-         image_match = re.search(r'!\[([^\]]*)\]\(([^)]+)\)', description) if description else None
-         if image_match:
-             image_url = clean_url(image_match.group(2))
          article = Article(
-             title=title.strip(),
              url=url,
              description=clean_desc,
              image_url=image_url,
              timestamp=None,
              category=None,
-             source_url=None
          )
          articles.append(article)
      return articles

- def extract_metadata(markdown: str, html: str) -> Dict:
      metadata = {
          "timestamp": datetime.now().isoformat(),
          "categories": [],
-         "total_articles": 0
      }
-     category_pattern = r'##\s+\[(.*?)\]'
-     categories = re.findall(category_pattern, markdown)
-     if categories:
-         metadata["categories"] = [cat.strip() for cat in categories]
      return metadata

- # New endpoint for search and multi-crawl
- @app.post("/search_and_crawl", response_model=List[CrawlResponse])
- async def search_and_crawl(request: SearchCrawlRequest):
      try:
-         # Perform Google search in a separate thread to avoid blocking
-         loop = asyncio.get_running_loop()
-         search_results = await loop.run_in_executor(None, lambda: search(request.query, num_results=request.num_results))
-         urls = list(search_results)

-         # Configure content filter based on the search query
-         content_filter = BM25ContentFilter(user_query=request.query, bm25_threshold=1.2)
          md_generator = DefaultMarkdownGenerator(
              content_filter=content_filter,
-             options={"ignore_images": True, "ignore_links": True}
          )

-         # Set up multi-crawler dispatcher for efficiency
-         dispatcher = MemoryAdaptiveDispatcher(
-             memory_threshold_percent=80.0,  # Pause if memory usage exceeds 80%
-             check_interval=1.0,  # Check memory every second
-             max_session_permit=5,  # Limit to 5 concurrent tasks
-             monitor=CrawlerMonitor(display_mode=DisplayMode.AGGREGATED)
-         )

-         # Crawl multiple URLs
-         async with AsyncWebCrawler() as crawler:
              config = CrawlerRunConfig(
-                 cache_mode=CacheMode.DISABLED,
-                 excluded_tags=["nav", "footer", "aside", "header", "script", "style"],
-                 remove_overlay_elements=True,
                  markdown_generator=md_generator,
                  exclude_external_links=True,
                  exclude_social_media_links=True,
                  exclude_external_images=True,
                  exclude_domains=["facebook.com", "twitter.com", "instagram.com", "youtube.com", "tiktok.com", "pinterest.com"]
              )
-             results = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)

-             # Process crawl results
-             crawl_responses = []
              for result in results:
-                 if result.success:
                      markdown = result.markdown_v2.raw_markdown
-                     html = result.html
-                     articles = extract_articles(markdown)
-                     metadata = extract_metadata(markdown, html)
-                     for article in articles:
-                         article.source_url = result.url
                      crawl_response = CrawlResponse(
                          url=result.url,
                          success=True,
                          metadata=metadata,
                          articles=articles,
                          raw_markdown=markdown,
                          stats={
                              "total_links": len(result.links) if result.links else 0,
-                             "processing_time": result.processing_time if hasattr(result, 'processing_time') else None
                          }
                      )
                  else:
-                     crawl_response = CrawlResponse(
                          url=result.url,
                          success=False,
-                         error=result.error_message,
-                         metadata={},
                          articles=[],
                          raw_markdown=None,
-                         stats={}
-                     )
-                 crawl_responses.append(crawl_response)
-             return crawl_responses

-     except Exception as e:
-         raise HTTPException(status_code=500, detail=str(e))

- # Existing single URL crawl endpoint (unchanged from template)
- @app.post("/crawl", response_model=CrawlResponse)
- async def crawl_url(request: CrawlRequest):
-     try:
-         cache_mode = CacheMode.DISABLED
-         if request.subject:
-             content_filter = BM25ContentFilter(user_query=request.subject, bm25_threshold=1.2)
-         else:
-             content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=50)
-         options = {"ignore_images": True}
-         if request.ignore_links:
-             options["ignore_links"] = True
-         md_generator = DefaultMarkdownGenerator(content_filter=content_filter, options=options)
-         async with AsyncWebCrawler() as crawler:
-             config = CrawlerRunConfig(
-                 cache_mode=cache_mode,
-                 excluded_tags=request.excluded_tags,
-                 remove_overlay_elements=request.remove_overlay_elements,
-                 markdown_generator=md_generator,
-                 exclude_external_links=True,
-                 exclude_social_media_links=True,
-                 exclude_external_images=True,
-                 exclude_domains=["facebook.com", "twitter.com", "instagram.com", "youtube.com", "tiktok.com", "pinterest.com"]
-             )
-             result = await crawler.arun(url=str(request.url), config=config)
-             markdown = result.markdown_v2.raw_markdown
-             html = result.html
-             articles = extract_articles(markdown)
-             metadata = extract_metadata(markdown, html)
-             metadata["subject"] = request.subject
-             for article in articles:
-                 article.source_url = str(request.url)
-             return CrawlResponse(
-                 url=str(request.url),
-                 success=result.success,
-                 metadata=metadata,
-                 articles=articles,
-                 raw_markdown=markdown if result.success else None,
-                 stats={
-                     "total_links": len(result.links) if result.links else 0,
-                     "processing_time": result.processing_time if hasattr(result, 'processing_time') else None
-                 }
-             )
      except Exception as e:
-         raise HTTPException(status_code=500, detail=str(e))

- @app.get("/")
- def read_root():
-     return {"message": "Welcome to Crawl4AI API", "docs": "/docs", "redoc": "/redoc"}

  if __name__ == "__main__":
-     uvicorn.run(app, host="0.0.0.0", port=7860)

+ # app.py
  from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel, HttpUrl, Field
+ from crawl4ai import (
+     AsyncWebCrawler,
+     CrawlerRunConfig,
+     CacheMode,
+     BrowserConfig,
+     RateLimiter,
+     CrawlerMonitor,  # Keep this import
+     DisplayMode  # Keep this import
+ )
+ from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher, SemaphoreDispatcher  # Import dispatchers
  from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
  from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
+ from googlesearch import search as google_search_sync  # Rename to avoid conflict
+
  import uvicorn
  import asyncio
  import re
+ from typing import Optional, List, Dict, Tuple
  from bs4 import BeautifulSoup
  from datetime import datetime
+ import traceback  # For detailed error logging

+ # nest_asyncio removed - no longer needed

  app = FastAPI(
+     title="Search & Crawl API",
+     description="An API to perform Google Search and crawl results using Crawl4AI",
+     version="1.1.0"
  )

+ # --- Pydantic Models ---

  class CrawlRequest(BaseModel):
      url: HttpUrl
      cache_mode: str = "DISABLED"
      excluded_tags: list[str] = ["nav", "footer", "aside", "header", "script", "style"]
      remove_overlay_elements: bool = True
      ignore_links: bool = True
+     subject: Optional[str] = None  # Optional subject for content filtering

  class Article(BaseModel):
      title: str
      url: str

      image_url: Optional[str] = None
      timestamp: Optional[str] = None
      category: Optional[str] = None
+     source_url: Optional[str] = None  # Added to track original source

  class CrawlResponse(BaseModel):
      url: str

      raw_markdown: Optional[str] = None
      stats: Dict = {}

+ class SearchCrawlRequest(BaseModel):
+     query: str = Field(..., description="The query string for Google Search")
+     num_results: int = Field(default=10, ge=1, le=30, description="Number of Google Search results to crawl")
+     subject: Optional[str] = Field(default=None, description="Optional subject for BM25 content filtering during crawl")
+     use_semaphore_dispatcher: bool = Field(default=False, description="Use SemaphoreDispatcher instead of MemoryAdaptiveDispatcher")
+     max_concurrent_tasks: int = Field(default=10, ge=1, description="Max concurrent crawls (used by dispatcher)")
+     cache_mode: str = Field(default="DISABLED", description="Crawl4AI cache mode (ENABLED, DISABLED, BYPASS)")
+     base_delay_secs: Tuple[float, float] = Field(default=(1.0, 3.0), description="Base delay range (min, max) in seconds for rate limiter")
+     max_delay_secs: float = Field(default=60.0, description="Max backoff delay in seconds for rate limiter")
+     max_retries: int = Field(default=3, description="Max retries on rate limit errors for rate limiter")
+
+
+ # --- Helper Functions ---
+
  def clean_url(url: str) -> str:
+     """Clean and normalize URLs"""
      url = url.replace('<', '').replace('>', '').strip()
      if url.startswith('https://'):
+         try:
+             domain_part = url[8:].split('/')[0]
+             if domain_part:
+                 cleaned_url = url.replace(f'https://{domain_part}/{domain_part}', f'https://{domain_part}')
+                 cleaned_url = re.sub(rf'https://{re.escape(domain_part)}/https:/*', f'https://{domain_part}/', cleaned_url)
+             else:
+                 cleaned_url = url
+         except IndexError:
+             cleaned_url = url
          if not cleaned_url.startswith('https://'):
+             # Attempt reconstruction only if domain_part was found
+             if 'domain_part' in locals() and domain_part:
+                 cleaned_url = f'https://{domain_part}'
+             else:  # Fallback if domain extraction failed entirely
+                 cleaned_url = url  # Keep original if parsing was problematic
      else:
          cleaned_url = url
+
+     cleaned_url = cleaned_url.split(' ')[0].split(')')[0]
+     cleaned_url = cleaned_url.rstrip('/')
      return cleaned_url

+
  def is_valid_title(title: str) -> bool:
+     """Check if the title is valid"""
+     if not title: return False
      invalid_patterns = ['**_access_time_', 'existing code', '...', 'navigation', 'menu', 'logo']
+     title_lower = title.lower()
+     if any(pattern in title_lower for pattern in invalid_patterns): return False
+     if title.count('-') > 4 or title.count('_') > 3 or '/' in title: return False
+     if len(title.strip()) < 5: return False
      return True

  def clean_description(description: str) -> Optional[str]:
+     """Clean and normalize description text"""
+     if not description: return None
+     if '_access_time_' in description or description.strip().startswith("!"): return None
      description = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', description)
+     description = re.sub(r'\bhttps?://\S+', '', description)
+     description = description.replace('*', '').replace('_', '').replace('`', '')
+     description = description.strip().strip('()[]{}<>')
      description = ' '.join(description.split())
+     return description if len(description) > 15 else None

+ def extract_articles(markdown: str, source_url: str) -> List[Article]:
+     """Extracts articles from markdown, assigning the source_url"""
      articles = []
      seen_urls = set()
+     article_pattern = re.compile(
+         r'(?:!\[[^\]]*\]\((?P<image_url>[^)]+)\)\s*)?'
+         r'\[(?P<title>[^\]]+)\]'
+         r'\((?P<url>[^)]+)\)'
+         r'(?:\s*(?P<description>[^\n\[]*))?'
+         , re.MULTILINE)
+
+     for match in article_pattern.finditer(markdown):
+         title = match.group('title').strip()
+         url = match.group('url').strip()
+         description = match.group('description').strip() if match.group('description') else None
+         image_url_extracted = match.group('image_url').strip() if match.group('image_url') else None
+
+         if not url or not title: continue
+         if not is_valid_title(title): continue
+
          url = clean_url(url)
+
+         if not url.startswith(('http://', 'https://')) or url.lower().endswith(('.pdf', '.jpg', '.png', '.gif', '.jpeg', '.webp', '.svg', '.zip', '.docx')):
              continue
+
+         if url in seen_urls: continue
          seen_urls.add(url)
+
          clean_desc = clean_description(description)
+
          image_url = None
+         if image_url_extracted:
+             cleaned_img_url = clean_url(image_url_extracted)
+             if cleaned_img_url.lower().endswith(('.jpg', '.png', '.gif', '.jpeg', '.webp')):
+                 image_url = cleaned_img_url
+
          article = Article(
+             title=title,
              url=url,
              description=clean_desc,
              image_url=image_url,
              timestamp=None,
              category=None,
+             source_url=source_url
          )
          articles.append(article)
+
      return articles

+
+ def extract_metadata(markdown: str) -> Dict:
+     """Basic metadata extraction from markdown"""
      metadata = {
          "timestamp": datetime.now().isoformat(),
          "categories": [],
      }
+     category_pattern = r'^##\s+(.*)'
+     matches = re.findall(category_pattern, markdown, re.MULTILINE)
+     if matches:
+         cleaned_categories = []
+         for cat in matches:
+             cat_text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', cat)  # Remove links
+             cat_text = cat_text.replace('*', '').replace('_', '').strip()
+             if cat_text and len(cat_text) > 2:
+                 cleaned_categories.append(cat_text)
+         metadata["categories"] = cleaned_categories
      return metadata

+ # --- FastAPI Endpoints ---
+
+ @app.get("/")
+ def read_root():
+     return {
+         "message": "Welcome to Search & Crawl API",
+         "docs_url": "/docs",
+         "redoc_url": "/redoc"
+     }
+
+ @app.post("/crawl", response_model=CrawlResponse, summary="Crawl a single URL")
+ async def crawl_url(request: CrawlRequest):
+     """Crawls a single URL using Crawl4AI."""
      try:
+         # Determine Cache Mode
+         try:
+             cache_mode_enum = CacheMode[request.cache_mode.upper()]
+         except KeyError:
+             raise HTTPException(status_code=400, detail=f"Invalid cache_mode. Use one of: {', '.join([m.name for m in CacheMode])}")
+
+         # Configure content filter based on subject
+         if request.subject:
+             content_filter = BM25ContentFilter(user_query=request.subject, bm25_threshold=1.2)
+         else:
+             content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=50)

          md_generator = DefaultMarkdownGenerator(
              content_filter=content_filter,
+             options={"ignore_images": True, "ignore_links": request.ignore_links}
          )

+         # Browser Config
+         browser_config = BrowserConfig(headless=True, verbose=False)

+         async with AsyncWebCrawler(config=browser_config) as crawler:
              config = CrawlerRunConfig(
+                 cache_mode=cache_mode_enum,
+                 excluded_tags=request.excluded_tags,
+                 remove_overlay_elements=request.remove_overlay_elements,
                  markdown_generator=md_generator,
                  exclude_external_links=True,
                  exclude_social_media_links=True,
                  exclude_external_images=True,
                  exclude_domains=["facebook.com", "twitter.com", "instagram.com", "youtube.com", "tiktok.com", "pinterest.com"]
              )

+             result = await crawler.arun(url=str(request.url), config=config)
+
+             markdown = result.markdown_v2.raw_markdown if result.success and result.markdown_v2 else None
+             articles = extract_articles(markdown, str(request.url)) if markdown else []
+             metadata = extract_metadata(markdown) if markdown else {"timestamp": datetime.now().isoformat(), "categories": []}
+             metadata["subject"] = request.subject
+             metadata["total_articles"] = len(articles)
+
+             return CrawlResponse(
+                 url=str(request.url),
+                 success=result.success,
+                 error=result.error_message if not result.success else None,
+                 metadata=metadata,
+                 articles=articles,
+                 raw_markdown=markdown,
+                 stats={
+                     "total_links": len(result.links) if result.links else 0,
+                     "processing_time": result.processing_time if hasattr(result, 'processing_time') else None,
+                     "status_code": result.status_code if hasattr(result, 'status_code') else None,
+                     "dispatch_info": result.dispatch_result.dict() if result.dispatch_result else None
+                 }
+             )
+
+     except Exception as e:
+         print(f"Error during single crawl for {request.url}: {traceback.format_exc()}")
+         raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {str(e)}")
+
+
+ @app.post("/search-and-crawl", response_model=List[CrawlResponse], summary="Search Google and crawl results")
+ async def search_and_crawl(request: SearchCrawlRequest):
+     """
+     Performs a Google Search for the given query, retrieves the top URLs,
+     and crawls each URL using Crawl4AI's multi-URL dispatcher.
+     """
+     urls_to_crawl = []
+     try:
+         # --- 1. Perform Google Search (Synchronous, run in thread pool) ---
+         loop = asyncio.get_running_loop()
+         search_iterator = await loop.run_in_executor(
+             None,
+             lambda: google_search_sync(request.query, num_results=request.num_results, lang='en')
+         )
+         urls_to_crawl = [clean_url(url) for url in search_iterator if url]
+
+         if not urls_to_crawl:
+             return []
+
+     except Exception as e:
+         print(f"Error during Google Search for '{request.query}': {traceback.format_exc()}")
+         raise HTTPException(status_code=500, detail=f"Google Search failed: {str(e)}")
+
+     # --- 2. Configure Crawl4AI ---
+     try:
+         # Determine Cache Mode
+         try:
+             cache_mode_enum = CacheMode[request.cache_mode.upper()]
+         except KeyError:
+             raise HTTPException(status_code=400, detail=f"Invalid cache_mode. Use one of: {', '.join([m.name for m in CacheMode])}")
+
+         # Configure content filter
+         if request.subject:
+             content_filter = BM25ContentFilter(user_query=request.subject, bm25_threshold=1.2)
+         else:
+             content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=50)
+
+         md_generator = DefaultMarkdownGenerator(
+             content_filter=content_filter,
+             options={"ignore_images": True, "ignore_links": True}
+         )
+
+         # General CrawlerRunConfig
+         run_config = CrawlerRunConfig(
+             cache_mode=cache_mode_enum,
+             stream=False,
+             excluded_tags=["nav", "footer", "aside", "header", "script", "style", "noscript", "figure"],
+             remove_overlay_elements=True,
+             markdown_generator=md_generator,
+             exclude_external_links=True,
+             exclude_social_media_links=True,
+             exclude_external_images=True,
+             exclude_domains=["facebook.com", "twitter.com", "instagram.com", "youtube.com", "tiktok.com", "pinterest.com", "linkedin.com"],
+         )
+
+         # Browser Config
+         browser_config = BrowserConfig(headless=True, verbose=False)
+
+         # Rate Limiter Config
+         rate_limiter = RateLimiter(
+             base_delay=request.base_delay_secs,
+             max_delay=request.max_delay_secs,
+             max_retries=request.max_retries,
+             rate_limit_codes=[429, 503]
+         )
+
+         # Optional Monitor (Corrected initialization)
+         monitor = CrawlerMonitor(display_mode=DisplayMode.AGGREGATED)  # <--- CORRECTED LINE
+
+         # --- 3. Select and Configure Dispatcher ---
+         if request.use_semaphore_dispatcher:
+             dispatcher = SemaphoreDispatcher(
+                 max_session_permit=request.max_concurrent_tasks,
+                 rate_limiter=rate_limiter,
+                 monitor=monitor  # Pass the correctly initialized monitor
+             )
+         else:
+             dispatcher = MemoryAdaptiveDispatcher(
+                 max_session_permit=request.max_concurrent_tasks,
+                 memory_threshold_percent=90.0,
+                 check_interval=1.0,
+                 rate_limiter=rate_limiter,
+                 monitor=monitor  # Pass the correctly initialized monitor
+             )
+
+         # --- 4. Run Multi-URL Crawl ---
+         crawl_results = []
+         async with AsyncWebCrawler(config=browser_config) as crawler:
+             results = await crawler.arun_many(
+                 urls=urls_to_crawl,
+                 config=run_config,
+                 dispatcher=dispatcher
+             )
+
+             # --- 5. Process Results ---
              for result in results:
+                 if result.success and result.markdown_v2 and result.markdown_v2.raw_markdown:
                      markdown = result.markdown_v2.raw_markdown
+                     articles = extract_articles(markdown, result.url)
+                     metadata = extract_metadata(markdown)
+                     metadata["subject"] = request.subject
+                     metadata["total_articles"] = len(articles)
+
                      crawl_response = CrawlResponse(
                          url=result.url,
                          success=True,
+                         error=None,
                          metadata=metadata,
                          articles=articles,
                          raw_markdown=markdown,
                          stats={
                              "total_links": len(result.links) if result.links else 0,
+                             "processing_time": result.processing_time if hasattr(result, 'processing_time') else None,
+                             "status_code": result.status_code if hasattr(result, 'status_code') else None,
+                             "dispatch_info": result.dispatch_result.dict() if result.dispatch_result else None
                          }
                      )
                  else:
+                     crawl_response = CrawlResponse(
                          url=result.url,
                          success=False,
+                         error=result.error_message or "Crawling failed or produced no markdown",
+                         metadata={"timestamp": datetime.now().isoformat()},
                          articles=[],
                          raw_markdown=None,
+                         stats={
+                             "status_code": result.status_code if hasattr(result, 'status_code') else None,
+                             "dispatch_info": result.dispatch_result.dict() if result.dispatch_result else None
+                         }
+                     )

+                 crawl_results.append(crawl_response)
+
+         return crawl_results

      except Exception as e:
+         # Log the full traceback for internal debugging
+         print(f"Error during multi-crawl process for query '{request.query}': {traceback.format_exc()}")
+         # Raise HTTPException with a user-friendly message (without exposing internal details like specific arguments)
+         raise HTTPException(status_code=500, detail=f"Multi-crawl process failed: An internal error occurred during crawling setup or execution. Original error type: {type(e).__name__}")

+ # --- Run Application ---
  if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=7860)  # Removed --workers here, let Docker/deployment handle scaling if needed.
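
Example usage: a minimal client sketch, not part of the commit, for exercising the two endpoints defined above. It assumes the updated app is running locally on port 7860 (per the `__main__` block) and that the `requests` package is installed; the URL, subject, and query values are illustrative. Note that the multi-crawl route is now the hyphenated `/search-and-crawl`, replacing the old `/search_and_crawl`.

# client_example.py -- hypothetical smoke-test client, not part of the commit.
# Assumes the API from app.py is running locally on port 7860 (see __main__).
import requests

BASE = "http://localhost:7860"

# Single-URL crawl: the JSON body mirrors the CrawlRequest model.
single = requests.post(f"{BASE}/crawl", json={
    "url": "https://example.com/news",  # illustrative URL
    "cache_mode": "DISABLED",
    "ignore_links": True,
    "subject": "technology",            # optional: selects BM25ContentFilter
}).json()
print(single["success"], len(single["articles"]))

# Search-and-crawl: the JSON body mirrors the SearchCrawlRequest model.
multi = requests.post(f"{BASE}/search-and-crawl", json={
    "query": "Latest trends in India Gen Z",  # the old endpoint's default query
    "num_results": 5,
    "use_semaphore_dispatcher": False,        # default: MemoryAdaptiveDispatcher
    "max_concurrent_tasks": 5,
    "cache_mode": "BYPASS",
}).json()
for item in multi:
    print(item["url"], item["success"], len(item["articles"]))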