Karim shoair committed on
Commit
01127e9
·
1 Parent(s): 168160d

feat(spiders): Change items hook to make it for processing items + add a stat for this

Browse files
scrapling/spiders/engine.py CHANGED
@@ -102,12 +102,17 @@ class CrawlerEngine:
102
  self.stats.offsite_requests_count += 1
103
  log.debug(f"Filtered offsite request to: {result.url}")
104
  elif isinstance(result, dict):
105
- self.stats.items_scraped += 1
106
- self._items.append(result)
107
- if self._item_stream:
108
- await self._item_stream.send(result)
109
- await self.spider.on_scraped_item(result)
110
- log.debug(f"Scraped from {str(response)}\n{result}")
 
 
 
 
 
111
  elif result is not None:
112
  log.error(f"Spider must return Request, dict or None, got '{type(result)}' in {request}")
113
  except Exception as e:
 
102
  self.stats.offsite_requests_count += 1
103
  log.debug(f"Filtered offsite request to: {result.url}")
104
  elif isinstance(result, dict):
105
+ processed_result = await self.spider.on_scraped_item(result)
106
+ if processed_result:
107
+ self.stats.items_scraped += 1
108
+ log.debug(f"Scraped from {str(response)}\n{processed_result}")
109
+ if self._item_stream:
110
+ await self._item_stream.send(processed_result)
111
+ else:
112
+ self._items.append(processed_result)
113
+ else:
114
+ self.stats.items_dropped += 1
115
+ log.warning(f"Dropped from {str(response)}\n{processed_result}")
116
  elif result is not None:
117
  log.error(f"Spider must return Request, dict or None, got '{type(result)}' in {request}")
118
  except Exception as e:
scrapling/spiders/result.py CHANGED
@@ -50,6 +50,7 @@ class CrawlStats:
50
  offsite_requests_count: int = 0
51
  response_bytes: int = 0
52
  items_scraped: int = 0
 
53
  start_time: float = 0.0
54
  end_time: float = 0.0
55
  download_delay: float = 0.0
@@ -85,6 +86,7 @@ class CrawlStats:
85
  def to_dict(self) -> dict[str, Any]:
86
  return {
87
  "items_scraped": self.items_scraped,
 
88
  "elapsed_seconds": round(self.elapsed_seconds, 2),
89
  "download_delay": round(self.download_delay, 2),
90
  "concurrent_requests": self.concurrent_requests,
 
50
  offsite_requests_count: int = 0
51
  response_bytes: int = 0
52
  items_scraped: int = 0
53
+ items_dropped: int = 0
54
  start_time: float = 0.0
55
  end_time: float = 0.0
56
  download_delay: float = 0.0
 
86
  def to_dict(self) -> dict[str, Any]:
87
  return {
88
  "items_scraped": self.items_scraped,
89
+ "items_dropped": self.items_dropped,
90
  "elapsed_seconds": round(self.elapsed_seconds, 2),
91
  "download_delay": round(self.download_delay, 2),
92
  "concurrent_requests": self.concurrent_requests,
scrapling/spiders/spider.py CHANGED
@@ -160,9 +160,9 @@ class Spider(ABC):
160
  """
161
  self.logger.error(error, exc_info=error)
162
 
163
- async def on_scraped_item(self, item: dict[str, Any]) -> None:
164
- """Handle a scraped item. Override or extend for item pipelines."""
165
- pass
166
 
167
  async def is_blocked(self, response: "Response") -> bool:
168
  """Check if the response is blocked. Users should override this for custom detection logic."""
 
160
  """
161
  self.logger.error(error, exc_info=error)
162
 
163
+ async def on_scraped_item(self, item: Dict[str, Any]) -> Dict[str, Any] | None:
164
+ """A hook to be overridden by users to do some processing on scraped items, return `None` to drop the item silently."""
165
+ return item
166
 
167
  async def is_blocked(self, response: "Response") -> bool:
168
  """Check if the response is blocked. Users should override this for custom detection logic."""