Karim shoair committed on
Commit ·
32daccc
1
Parent(s): c08de96
feat(proxy control): Force a proxy at request level at any given point
Browse files- scrapling/engines/_browsers/_base.py +2 -2
- scrapling/engines/_browsers/_controllers.py +16 -4
- scrapling/engines/_browsers/_stealth.py +16 -4
- scrapling/engines/_browsers/_types.py +1 -0
- scrapling/engines/static.py +2 -2
- scrapling/engines/toolbelt/convertor.py +7 -1
- scrapling/engines/toolbelt/custom.py +5 -1
- scrapling/spiders/engine.py +3 -0
- scrapling/spiders/session.py +2 -1
scrapling/engines/_browsers/_base.py
CHANGED
|
@@ -175,7 +175,7 @@ class SyncSession:
|
|
| 175 |
proxy: Optional[ProxyType] = None,
|
| 176 |
) -> Generator["PageInfo[Page]", None, None]:
|
| 177 |
"""Acquire a page - either from persistent context or fresh context with proxy."""
|
| 178 |
-
if
|
| 179 |
# Rotation mode: create fresh context with the provided proxy
|
| 180 |
if not self.browser: # pragma: no cover
|
| 181 |
raise RuntimeError("Browser not initialized for proxy rotation mode")
|
|
@@ -344,7 +344,7 @@ class AsyncSession:
|
|
| 344 |
proxy: Optional[ProxyType] = None,
|
| 345 |
) -> AsyncGenerator["PageInfo[AsyncPage]", None]:
|
| 346 |
"""Acquire a page - either from persistent context or fresh context with proxy."""
|
| 347 |
-
if
|
| 348 |
# Rotation mode: create fresh context with the provided proxy
|
| 349 |
if not self.browser: # pragma: no cover
|
| 350 |
raise RuntimeError("Browser not initialized for proxy rotation mode")
|
|
|
|
| 175 |
proxy: Optional[ProxyType] = None,
|
| 176 |
) -> Generator["PageInfo[Page]", None, None]:
|
| 177 |
"""Acquire a page - either from persistent context or fresh context with proxy."""
|
| 178 |
+
if proxy:
|
| 179 |
# Rotation mode: create fresh context with the provided proxy
|
| 180 |
if not self.browser: # pragma: no cover
|
| 181 |
raise RuntimeError("Browser not initialized for proxy rotation mode")
|
|
|
|
| 344 |
proxy: Optional[ProxyType] = None,
|
| 345 |
) -> AsyncGenerator["PageInfo[AsyncPage]", None]:
|
| 346 |
"""Acquire a page - either from persistent context or fresh context with proxy."""
|
| 347 |
+
if proxy:
|
| 348 |
# Rotation mode: create fresh context with the provided proxy
|
| 349 |
if not self.browser: # pragma: no cover
|
| 350 |
raise RuntimeError("Browser not initialized for proxy rotation mode")
|
scrapling/engines/_browsers/_controllers.py
CHANGED
|
@@ -115,8 +115,11 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
|
|
| 115 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
| 116 |
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 117 |
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
|
|
|
| 118 |
:return: A `Response` object.
|
| 119 |
"""
|
|
|
|
|
|
|
| 120 |
params = _validate(kwargs, self, PlaywrightConfig)
|
| 121 |
if not self._is_alive: # pragma: no cover
|
| 122 |
raise RuntimeError("Context manager has been closed")
|
|
@@ -129,7 +132,10 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
|
|
| 129 |
)
|
| 130 |
|
| 131 |
for attempt in range(self._config.retries):
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
| 133 |
|
| 134 |
with self._page_generator(
|
| 135 |
params.timeout, params.extra_headers, params.disable_resources, proxy
|
|
@@ -162,7 +168,7 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
|
|
| 162 |
page.wait_for_timeout(params.wait)
|
| 163 |
|
| 164 |
response = ResponseFactory.from_playwright_response(
|
| 165 |
-
page, first_response, final_response[0], params.selector_config
|
| 166 |
)
|
| 167 |
return response
|
| 168 |
|
|
@@ -276,8 +282,11 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
|
|
| 276 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
| 277 |
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 278 |
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
|
|
|
| 279 |
:return: A `Response` object.
|
| 280 |
"""
|
|
|
|
|
|
|
| 281 |
params = _validate(kwargs, self, PlaywrightConfig)
|
| 282 |
|
| 283 |
if not self._is_alive: # pragma: no cover
|
|
@@ -291,7 +300,10 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
|
|
| 291 |
)
|
| 292 |
|
| 293 |
for attempt in range(self._config.retries):
|
| 294 |
-
|
|
|
|
|
|
|
|
|
|
| 295 |
|
| 296 |
async with self._page_generator(
|
| 297 |
params.timeout, params.extra_headers, params.disable_resources, proxy
|
|
@@ -324,7 +336,7 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
|
|
| 324 |
await page.wait_for_timeout(params.wait)
|
| 325 |
|
| 326 |
response = await ResponseFactory.from_async_playwright_response(
|
| 327 |
-
page, first_response, final_response[0], params.selector_config
|
| 328 |
)
|
| 329 |
return response
|
| 330 |
|
|
|
|
| 115 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
| 116 |
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 117 |
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 118 |
+
:param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
|
| 119 |
:return: A `Response` object.
|
| 120 |
"""
|
| 121 |
+
static_proxy = kwargs.pop("proxy", None)
|
| 122 |
+
|
| 123 |
params = _validate(kwargs, self, PlaywrightConfig)
|
| 124 |
if not self._is_alive: # pragma: no cover
|
| 125 |
raise RuntimeError("Context manager has been closed")
|
|
|
|
| 132 |
)
|
| 133 |
|
| 134 |
for attempt in range(self._config.retries):
|
| 135 |
+
if self._config.proxy_rotator and static_proxy is None:
|
| 136 |
+
proxy = self._config.proxy_rotator.get_proxy()
|
| 137 |
+
else:
|
| 138 |
+
proxy = static_proxy
|
| 139 |
|
| 140 |
with self._page_generator(
|
| 141 |
params.timeout, params.extra_headers, params.disable_resources, proxy
|
|
|
|
| 168 |
page.wait_for_timeout(params.wait)
|
| 169 |
|
| 170 |
response = ResponseFactory.from_playwright_response(
|
| 171 |
+
page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
|
| 172 |
)
|
| 173 |
return response
|
| 174 |
|
|
|
|
| 282 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
| 283 |
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 284 |
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 285 |
+
:param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
|
| 286 |
:return: A `Response` object.
|
| 287 |
"""
|
| 288 |
+
static_proxy = kwargs.pop("proxy", None)
|
| 289 |
+
|
| 290 |
params = _validate(kwargs, self, PlaywrightConfig)
|
| 291 |
|
| 292 |
if not self._is_alive: # pragma: no cover
|
|
|
|
| 300 |
)
|
| 301 |
|
| 302 |
for attempt in range(self._config.retries):
|
| 303 |
+
if self._config.proxy_rotator and static_proxy is None:
|
| 304 |
+
proxy = self._config.proxy_rotator.get_proxy()
|
| 305 |
+
else:
|
| 306 |
+
proxy = static_proxy
|
| 307 |
|
| 308 |
async with self._page_generator(
|
| 309 |
params.timeout, params.extra_headers, params.disable_resources, proxy
|
|
|
|
| 336 |
await page.wait_for_timeout(params.wait)
|
| 337 |
|
| 338 |
response = await ResponseFactory.from_async_playwright_response(
|
| 339 |
+
page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
|
| 340 |
)
|
| 341 |
return response
|
| 342 |
|
scrapling/engines/_browsers/_stealth.py
CHANGED
|
@@ -204,8 +204,11 @@ class StealthySession(SyncSession, StealthySessionMixin):
|
|
| 204 |
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 205 |
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
| 206 |
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
|
|
|
| 207 |
:return: A `Response` object.
|
| 208 |
"""
|
|
|
|
|
|
|
| 209 |
params = _validate(kwargs, self, StealthConfig)
|
| 210 |
if not self._is_alive: # pragma: no cover
|
| 211 |
raise RuntimeError("Context manager has been closed")
|
|
@@ -218,7 +221,10 @@ class StealthySession(SyncSession, StealthySessionMixin):
|
|
| 218 |
)
|
| 219 |
|
| 220 |
for attempt in range(self._config.retries):
|
| 221 |
-
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
with self._page_generator(
|
| 224 |
params.timeout, params.extra_headers, params.disable_resources, proxy
|
|
@@ -256,7 +262,7 @@ class StealthySession(SyncSession, StealthySessionMixin):
|
|
| 256 |
page.wait_for_timeout(params.wait)
|
| 257 |
|
| 258 |
response = ResponseFactory.from_playwright_response(
|
| 259 |
-
page, first_response, final_response[0], params.selector_config
|
| 260 |
)
|
| 261 |
return response
|
| 262 |
|
|
@@ -454,8 +460,11 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
|
|
| 454 |
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 455 |
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
| 456 |
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
|
|
|
| 457 |
:return: A `Response` object.
|
| 458 |
"""
|
|
|
|
|
|
|
| 459 |
params = _validate(kwargs, self, StealthConfig)
|
| 460 |
|
| 461 |
if not self._is_alive: # pragma: no cover
|
|
@@ -469,7 +478,10 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
|
|
| 469 |
)
|
| 470 |
|
| 471 |
for attempt in range(self._config.retries):
|
| 472 |
-
|
|
|
|
|
|
|
|
|
|
| 473 |
|
| 474 |
async with self._page_generator(
|
| 475 |
params.timeout, params.extra_headers, params.disable_resources, proxy
|
|
@@ -507,7 +519,7 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
|
|
| 507 |
await page.wait_for_timeout(params.wait)
|
| 508 |
|
| 509 |
response = await ResponseFactory.from_async_playwright_response(
|
| 510 |
-
page, first_response, final_response[0], params.selector_config
|
| 511 |
)
|
| 512 |
return response
|
| 513 |
|
|
|
|
| 204 |
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 205 |
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
| 206 |
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 207 |
+
:param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
|
| 208 |
:return: A `Response` object.
|
| 209 |
"""
|
| 210 |
+
static_proxy = kwargs.pop("proxy", None)
|
| 211 |
+
|
| 212 |
params = _validate(kwargs, self, StealthConfig)
|
| 213 |
if not self._is_alive: # pragma: no cover
|
| 214 |
raise RuntimeError("Context manager has been closed")
|
|
|
|
| 221 |
)
|
| 222 |
|
| 223 |
for attempt in range(self._config.retries):
|
| 224 |
+
if self._config.proxy_rotator and static_proxy is None:
|
| 225 |
+
proxy = self._config.proxy_rotator.get_proxy()
|
| 226 |
+
else:
|
| 227 |
+
proxy = static_proxy
|
| 228 |
|
| 229 |
with self._page_generator(
|
| 230 |
params.timeout, params.extra_headers, params.disable_resources, proxy
|
|
|
|
| 262 |
page.wait_for_timeout(params.wait)
|
| 263 |
|
| 264 |
response = ResponseFactory.from_playwright_response(
|
| 265 |
+
page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
|
| 266 |
)
|
| 267 |
return response
|
| 268 |
|
|
|
|
| 460 |
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 461 |
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
| 462 |
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 463 |
+
:param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
|
| 464 |
:return: A `Response` object.
|
| 465 |
"""
|
| 466 |
+
static_proxy = kwargs.pop("proxy", None)
|
| 467 |
+
|
| 468 |
params = _validate(kwargs, self, StealthConfig)
|
| 469 |
|
| 470 |
if not self._is_alive: # pragma: no cover
|
|
|
|
| 478 |
)
|
| 479 |
|
| 480 |
for attempt in range(self._config.retries):
|
| 481 |
+
if self._config.proxy_rotator and static_proxy is None:
|
| 482 |
+
proxy = self._config.proxy_rotator.get_proxy()
|
| 483 |
+
else:
|
| 484 |
+
proxy = static_proxy
|
| 485 |
|
| 486 |
async with self._page_generator(
|
| 487 |
params.timeout, params.extra_headers, params.disable_resources, proxy
|
|
|
|
| 519 |
await page.wait_for_timeout(params.wait)
|
| 520 |
|
| 521 |
response = await ResponseFactory.from_async_playwright_response(
|
| 522 |
+
page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
|
| 523 |
)
|
| 524 |
return response
|
| 525 |
|
scrapling/engines/_browsers/_types.py
CHANGED
|
@@ -99,6 +99,7 @@ if TYPE_CHECKING: # pragma: no cover
|
|
| 99 |
selector_config: Optional[Dict]
|
| 100 |
extra_headers: Optional[Dict[str, str]]
|
| 101 |
wait_selector_state: SelectorWaitStates
|
|
|
|
| 102 |
|
| 103 |
class StealthSession(PlaywrightSession, total=False):
|
| 104 |
allow_webgl: bool
|
|
|
|
| 99 |
selector_config: Optional[Dict]
|
| 100 |
extra_headers: Optional[Dict[str, str]]
|
| 101 |
wait_selector_state: SelectorWaitStates
|
| 102 |
+
proxy: Optional[str | Dict[str, str]]
|
| 103 |
|
| 104 |
class StealthSession(PlaywrightSession, total=False):
|
| 105 |
allow_webgl: bool
|
scrapling/engines/static.py
CHANGED
|
@@ -250,7 +250,7 @@ class _SyncSessionLogic(_ConfigurationLogic):
|
|
| 250 |
request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
|
| 251 |
try:
|
| 252 |
response = session.request(method, **request_args)
|
| 253 |
-
result = ResponseFactory.from_http_request(response, selector_config)
|
| 254 |
return result
|
| 255 |
except CurlError as e: # pragma: no cover
|
| 256 |
if attempt < max_retries - 1:
|
|
@@ -466,7 +466,7 @@ class _ASyncSessionLogic(_ConfigurationLogic):
|
|
| 466 |
request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
|
| 467 |
try:
|
| 468 |
response = await session.request(method, **request_args)
|
| 469 |
-
result = ResponseFactory.from_http_request(response, selector_config)
|
| 470 |
return result
|
| 471 |
except CurlError as e: # pragma: no cover
|
| 472 |
if attempt < max_retries - 1:
|
|
|
|
| 250 |
request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
|
| 251 |
try:
|
| 252 |
response = session.request(method, **request_args)
|
| 253 |
+
result = ResponseFactory.from_http_request(response, selector_config, meta={"proxy": proxy})
|
| 254 |
return result
|
| 255 |
except CurlError as e: # pragma: no cover
|
| 256 |
if attempt < max_retries - 1:
|
|
|
|
| 466 |
request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
|
| 467 |
try:
|
| 468 |
response = await session.request(method, **request_args)
|
| 469 |
+
result = ResponseFactory.from_http_request(response, selector_config, meta={"proxy": proxy})
|
| 470 |
return result
|
| 471 |
except CurlError as e: # pragma: no cover
|
| 472 |
if attempt < max_retries - 1:
|
scrapling/engines/toolbelt/convertor.py
CHANGED
|
@@ -85,6 +85,7 @@ class ResponseFactory:
|
|
| 85 |
first_response: SyncResponse,
|
| 86 |
final_response: Optional[SyncResponse],
|
| 87 |
parser_arguments: Dict,
|
|
|
|
| 88 |
) -> Response:
|
| 89 |
"""
|
| 90 |
Transforms a Playwright response into an internal `Response` object, encapsulating
|
|
@@ -134,6 +135,7 @@ class ResponseFactory:
|
|
| 134 |
"headers": first_response.all_headers(),
|
| 135 |
"request_headers": first_response.request.all_headers(),
|
| 136 |
"history": history,
|
|
|
|
| 137 |
**parser_arguments,
|
| 138 |
}
|
| 139 |
)
|
|
@@ -220,6 +222,7 @@ class ResponseFactory:
|
|
| 220 |
first_response: AsyncResponse,
|
| 221 |
final_response: Optional[AsyncResponse],
|
| 222 |
parser_arguments: Dict,
|
|
|
|
| 223 |
) -> Response:
|
| 224 |
"""
|
| 225 |
Transforms a Playwright response into an internal `Response` object, encapsulating
|
|
@@ -269,16 +272,18 @@ class ResponseFactory:
|
|
| 269 |
"headers": await first_response.all_headers(),
|
| 270 |
"request_headers": await first_response.request.all_headers(),
|
| 271 |
"history": history,
|
|
|
|
| 272 |
**parser_arguments,
|
| 273 |
}
|
| 274 |
)
|
| 275 |
|
| 276 |
@staticmethod
|
| 277 |
-
def from_http_request(response: CurlResponse, parser_arguments: Dict) -> Response:
|
| 278 |
"""Takes `curl_cffi` response and generates `Response` object from it.
|
| 279 |
|
| 280 |
:param response: `curl_cffi` response object
|
| 281 |
:param parser_arguments: Additional arguments to be passed to the `Response` object constructor.
|
|
|
|
| 282 |
:return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 283 |
"""
|
| 284 |
return Response(
|
|
@@ -293,6 +298,7 @@ class ResponseFactory:
|
|
| 293 |
"request_headers": dict(response.request.headers) if response.request else {},
|
| 294 |
"method": response.request.method if response.request else "GET",
|
| 295 |
"history": response.history, # https://github.com/lexiforest/curl_cffi/issues/82
|
|
|
|
| 296 |
**parser_arguments,
|
| 297 |
}
|
| 298 |
)
|
|
|
|
| 85 |
first_response: SyncResponse,
|
| 86 |
final_response: Optional[SyncResponse],
|
| 87 |
parser_arguments: Dict,
|
| 88 |
+
meta: Optional[Dict] = None,
|
| 89 |
) -> Response:
|
| 90 |
"""
|
| 91 |
Transforms a Playwright response into an internal `Response` object, encapsulating
|
|
|
|
| 135 |
"headers": first_response.all_headers(),
|
| 136 |
"request_headers": first_response.request.all_headers(),
|
| 137 |
"history": history,
|
| 138 |
+
"meta": meta,
|
| 139 |
**parser_arguments,
|
| 140 |
}
|
| 141 |
)
|
|
|
|
| 222 |
first_response: AsyncResponse,
|
| 223 |
final_response: Optional[AsyncResponse],
|
| 224 |
parser_arguments: Dict,
|
| 225 |
+
meta: Optional[Dict] = None,
|
| 226 |
) -> Response:
|
| 227 |
"""
|
| 228 |
Transforms a Playwright response into an internal `Response` object, encapsulating
|
|
|
|
| 272 |
"headers": await first_response.all_headers(),
|
| 273 |
"request_headers": await first_response.request.all_headers(),
|
| 274 |
"history": history,
|
| 275 |
+
"meta": meta,
|
| 276 |
**parser_arguments,
|
| 277 |
}
|
| 278 |
)
|
| 279 |
|
| 280 |
@staticmethod
|
| 281 |
+
def from_http_request(response: CurlResponse, parser_arguments: Dict, meta: Optional[Dict] = None) -> Response:
|
| 282 |
"""Takes `curl_cffi` response and generates `Response` object from it.
|
| 283 |
|
| 284 |
:param response: `curl_cffi` response object
|
| 285 |
:param parser_arguments: Additional arguments to be passed to the `Response` object constructor.
|
| 286 |
+
:param meta: Optional metadata dictionary to attach to the Response.
|
| 287 |
:return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 288 |
"""
|
| 289 |
return Response(
|
|
|
|
| 298 |
"request_headers": dict(response.request.headers) if response.request else {},
|
| 299 |
"method": response.request.method if response.request else "GET",
|
| 300 |
"history": response.history, # https://github.com/lexiforest/curl_cffi/issues/82
|
| 301 |
+
"meta": meta,
|
| 302 |
**parser_arguments,
|
| 303 |
}
|
| 304 |
)
|
scrapling/engines/toolbelt/custom.py
CHANGED
|
@@ -39,6 +39,7 @@ class Response(Selector):
|
|
| 39 |
encoding: str = "utf-8",
|
| 40 |
method: str = "GET",
|
| 41 |
history: List | None = None,
|
|
|
|
| 42 |
**selector_config: Any,
|
| 43 |
):
|
| 44 |
adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
|
|
@@ -57,7 +58,10 @@ class Response(Selector):
|
|
| 57 |
# For easier debugging while working from a Python shell
|
| 58 |
log.info(f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})")
|
| 59 |
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
| 61 |
self.request: Optional["Request"] = None # Will be set by crawler
|
| 62 |
|
| 63 |
def follow(
|
|
|
|
| 39 |
encoding: str = "utf-8",
|
| 40 |
method: str = "GET",
|
| 41 |
history: List | None = None,
|
| 42 |
+
meta: Dict[str, Any] | None = None,
|
| 43 |
**selector_config: Any,
|
| 44 |
):
|
| 45 |
adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
|
|
|
|
| 58 |
# For easier debugging while working from a Python shell
|
| 59 |
log.info(f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})")
|
| 60 |
|
| 61 |
+
if meta and not isinstance(meta, dict):
|
| 62 |
+
raise TypeError(f"Response meta should be dictionary but got {type(meta).__name__} instead!")
|
| 63 |
+
|
| 64 |
+
self.meta: Dict[str, Any] = meta or {}
|
| 65 |
self.request: Optional["Request"] = None # Will be set by crawler
|
| 66 |
|
| 67 |
def follow(
|
scrapling/spiders/engine.py
CHANGED
|
@@ -113,6 +113,9 @@ class CrawlerEngine:
|
|
| 113 |
retry_request._retry_count += 1
|
| 114 |
retry_request.priority -= 1 # Don't retry immediately
|
| 115 |
retry_request.dont_filter = True
|
|
|
|
|
|
|
|
|
|
| 116 |
new_request = await self.spider.retry_blocked_request(retry_request, response)
|
| 117 |
self._normalize_request(new_request)
|
| 118 |
await self.scheduler.enqueue(new_request)
|
|
|
|
| 113 |
retry_request._retry_count += 1
|
| 114 |
retry_request.priority -= 1 # Don't retry immediately
|
| 115 |
retry_request.dont_filter = True
|
| 116 |
+
retry_request._session_kwargs.pop("proxy", None)
|
| 117 |
+
retry_request._session_kwargs.pop("proxies", None)
|
| 118 |
+
|
| 119 |
new_request = await self.spider.retry_blocked_request(retry_request, response)
|
| 120 |
self._normalize_request(new_request)
|
| 121 |
await self.scheduler.enqueue(new_request)
|
scrapling/spiders/session.py
CHANGED
|
@@ -124,7 +124,8 @@ class SessionManager:
|
|
| 124 |
response = await session.fetch(url=request.url, **request._session_kwargs)
|
| 125 |
|
| 126 |
response.request = request
|
| 127 |
-
response
|
|
|
|
| 128 |
return response
|
| 129 |
raise RuntimeError("No session found with the request session id")
|
| 130 |
|
|
|
|
| 124 |
response = await session.fetch(url=request.url, **request._session_kwargs)
|
| 125 |
|
| 126 |
response.request = request
|
| 127 |
+
# Merge request meta into response meta (response meta takes priority)
|
| 128 |
+
response.meta = {**request.meta, **response.meta}
|
| 129 |
return response
|
| 130 |
raise RuntimeError("No session found with the request session id")
|
| 131 |
|