Karim shoair committed on
Commit ·
01127e9
1
Parent(s): 168160d
feat(spiders): Change items hook so it can process items + add a stat for this
Browse files- scrapling/spiders/engine.py +11 -6
- scrapling/spiders/result.py +2 -0
- scrapling/spiders/spider.py +3 -3
scrapling/spiders/engine.py
CHANGED
|
@@ -102,12 +102,17 @@ class CrawlerEngine:
|
|
| 102 |
self.stats.offsite_requests_count += 1
|
| 103 |
log.debug(f"Filtered offsite request to: {result.url}")
|
| 104 |
elif isinstance(result, dict):
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
elif result is not None:
|
| 112 |
log.error(f"Spider must return Request, dict or None, got '{type(result)}' in {request}")
|
| 113 |
except Exception as e:
|
|
|
|
| 102 |
self.stats.offsite_requests_count += 1
|
| 103 |
log.debug(f"Filtered offsite request to: {result.url}")
|
| 104 |
elif isinstance(result, dict):
|
| 105 |
+
processed_result = await self.spider.on_scraped_item(result)
|
| 106 |
+
if processed_result:
|
| 107 |
+
self.stats.items_scraped += 1
|
| 108 |
+
log.debug(f"Scraped from {str(response)}\n{processed_result}")
|
| 109 |
+
if self._item_stream:
|
| 110 |
+
await self._item_stream.send(processed_result)
|
| 111 |
+
else:
|
| 112 |
+
self._items.append(processed_result)
|
| 113 |
+
else:
|
| 114 |
+
self.stats.items_dropped += 1
|
| 115 |
+
log.warning(f"Dropped from {str(response)}\n{processed_result}")
|
| 116 |
elif result is not None:
|
| 117 |
log.error(f"Spider must return Request, dict or None, got '{type(result)}' in {request}")
|
| 118 |
except Exception as e:
|
scrapling/spiders/result.py
CHANGED
|
@@ -50,6 +50,7 @@ class CrawlStats:
|
|
| 50 |
offsite_requests_count: int = 0
|
| 51 |
response_bytes: int = 0
|
| 52 |
items_scraped: int = 0
|
|
|
|
| 53 |
start_time: float = 0.0
|
| 54 |
end_time: float = 0.0
|
| 55 |
download_delay: float = 0.0
|
|
@@ -85,6 +86,7 @@ class CrawlStats:
|
|
| 85 |
def to_dict(self) -> dict[str, Any]:
|
| 86 |
return {
|
| 87 |
"items_scraped": self.items_scraped,
|
|
|
|
| 88 |
"elapsed_seconds": round(self.elapsed_seconds, 2),
|
| 89 |
"download_delay": round(self.download_delay, 2),
|
| 90 |
"concurrent_requests": self.concurrent_requests,
|
|
|
|
| 50 |
offsite_requests_count: int = 0
|
| 51 |
response_bytes: int = 0
|
| 52 |
items_scraped: int = 0
|
| 53 |
+
items_dropped: int = 0
|
| 54 |
start_time: float = 0.0
|
| 55 |
end_time: float = 0.0
|
| 56 |
download_delay: float = 0.0
|
|
|
|
| 86 |
def to_dict(self) -> dict[str, Any]:
|
| 87 |
return {
|
| 88 |
"items_scraped": self.items_scraped,
|
| 89 |
+
"items_dropped": self.items_dropped,
|
| 90 |
"elapsed_seconds": round(self.elapsed_seconds, 2),
|
| 91 |
"download_delay": round(self.download_delay, 2),
|
| 92 |
"concurrent_requests": self.concurrent_requests,
|
scrapling/spiders/spider.py
CHANGED
|
@@ -160,9 +160,9 @@ class Spider(ABC):
|
|
| 160 |
"""
|
| 161 |
self.logger.error(error, exc_info=error)
|
| 162 |
|
| 163 |
-
async def on_scraped_item(self, item:
|
| 164 |
-
"""
|
| 165 |
-
|
| 166 |
|
| 167 |
async def is_blocked(self, response: "Response") -> bool:
|
| 168 |
"""Check if the response is blocked. Users should override this for custom detection logic."""
|
|
|
|
| 160 |
"""
|
| 161 |
self.logger.error(error, exc_info=error)
|
| 162 |
|
| 163 |
+
async def on_scraped_item(self, item: Dict[str, Any]) -> Dict[str, Any] | None:
|
| 164 |
+
"""A hook to be overridden by users to do some processing on scraped items, return `None` to drop the item silently."""
|
| 165 |
+
return item
|
| 166 |
|
| 167 |
async def is_blocked(self, response: "Response") -> bool:
|
| 168 |
"""Check if the response is blocked. Users should override this for custom detection logic."""
|