Karim shoair committed on
Commit
32daccc
·
1 Parent(s): c08de96

feat(proxy control): Force a proxy at request level at any given point

Browse files
scrapling/engines/_browsers/_base.py CHANGED
@@ -175,7 +175,7 @@ class SyncSession:
175
  proxy: Optional[ProxyType] = None,
176
  ) -> Generator["PageInfo[Page]", None, None]:
177
  """Acquire a page - either from persistent context or fresh context with proxy."""
178
- if self._config.proxy_rotator:
179
  # Rotation mode: create fresh context with the provided proxy
180
  if not self.browser: # pragma: no cover
181
  raise RuntimeError("Browser not initialized for proxy rotation mode")
@@ -344,7 +344,7 @@ class AsyncSession:
344
  proxy: Optional[ProxyType] = None,
345
  ) -> AsyncGenerator["PageInfo[AsyncPage]", None]:
346
  """Acquire a page - either from persistent context or fresh context with proxy."""
347
- if self._config.proxy_rotator:
348
  # Rotation mode: create fresh context with the provided proxy
349
  if not self.browser: # pragma: no cover
350
  raise RuntimeError("Browser not initialized for proxy rotation mode")
 
175
  proxy: Optional[ProxyType] = None,
176
  ) -> Generator["PageInfo[Page]", None, None]:
177
  """Acquire a page - either from persistent context or fresh context with proxy."""
178
+ if proxy:
179
  # Rotation mode: create fresh context with the provided proxy
180
  if not self.browser: # pragma: no cover
181
  raise RuntimeError("Browser not initialized for proxy rotation mode")
 
344
  proxy: Optional[ProxyType] = None,
345
  ) -> AsyncGenerator["PageInfo[AsyncPage]", None]:
346
  """Acquire a page - either from persistent context or fresh context with proxy."""
347
+ if proxy:
348
  # Rotation mode: create fresh context with the provided proxy
349
  if not self.browser: # pragma: no cover
350
  raise RuntimeError("Browser not initialized for proxy rotation mode")
scrapling/engines/_browsers/_controllers.py CHANGED
@@ -115,8 +115,11 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
115
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
116
  :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
117
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
 
118
  :return: A `Response` object.
119
  """
 
 
120
  params = _validate(kwargs, self, PlaywrightConfig)
121
  if not self._is_alive: # pragma: no cover
122
  raise RuntimeError("Context manager has been closed")
@@ -129,7 +132,10 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
129
  )
130
 
131
  for attempt in range(self._config.retries):
132
- proxy = self._config.proxy_rotator.get_proxy() if self._config.proxy_rotator else None
 
 
 
133
 
134
  with self._page_generator(
135
  params.timeout, params.extra_headers, params.disable_resources, proxy
@@ -162,7 +168,7 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
162
  page.wait_for_timeout(params.wait)
163
 
164
  response = ResponseFactory.from_playwright_response(
165
- page, first_response, final_response[0], params.selector_config
166
  )
167
  return response
168
 
@@ -276,8 +282,11 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
276
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
277
  :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
278
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
 
279
  :return: A `Response` object.
280
  """
 
 
281
  params = _validate(kwargs, self, PlaywrightConfig)
282
 
283
  if not self._is_alive: # pragma: no cover
@@ -291,7 +300,10 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
291
  )
292
 
293
  for attempt in range(self._config.retries):
294
- proxy = self._config.proxy_rotator.get_proxy() if self._config.proxy_rotator else None
 
 
 
295
 
296
  async with self._page_generator(
297
  params.timeout, params.extra_headers, params.disable_resources, proxy
@@ -324,7 +336,7 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
324
  await page.wait_for_timeout(params.wait)
325
 
326
  response = await ResponseFactory.from_async_playwright_response(
327
- page, first_response, final_response[0], params.selector_config
328
  )
329
  return response
330
 
 
115
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
116
  :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
117
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
118
+ :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
119
  :return: A `Response` object.
120
  """
121
+ static_proxy = kwargs.pop("proxy", None)
122
+
123
  params = _validate(kwargs, self, PlaywrightConfig)
124
  if not self._is_alive: # pragma: no cover
125
  raise RuntimeError("Context manager has been closed")
 
132
  )
133
 
134
  for attempt in range(self._config.retries):
135
+ if self._config.proxy_rotator and static_proxy is None:
136
+ proxy = self._config.proxy_rotator.get_proxy()
137
+ else:
138
+ proxy = static_proxy
139
 
140
  with self._page_generator(
141
  params.timeout, params.extra_headers, params.disable_resources, proxy
 
168
  page.wait_for_timeout(params.wait)
169
 
170
  response = ResponseFactory.from_playwright_response(
171
+ page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
172
  )
173
  return response
174
 
 
282
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
283
  :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
284
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
285
+ :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
286
  :return: A `Response` object.
287
  """
288
+ static_proxy = kwargs.pop("proxy", None)
289
+
290
  params = _validate(kwargs, self, PlaywrightConfig)
291
 
292
  if not self._is_alive: # pragma: no cover
 
300
  )
301
 
302
  for attempt in range(self._config.retries):
303
+ if self._config.proxy_rotator and static_proxy is None:
304
+ proxy = self._config.proxy_rotator.get_proxy()
305
+ else:
306
+ proxy = static_proxy
307
 
308
  async with self._page_generator(
309
  params.timeout, params.extra_headers, params.disable_resources, proxy
 
336
  await page.wait_for_timeout(params.wait)
337
 
338
  response = await ResponseFactory.from_async_playwright_response(
339
+ page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
340
  )
341
  return response
342
 
scrapling/engines/_browsers/_stealth.py CHANGED
@@ -204,8 +204,11 @@ class StealthySession(SyncSession, StealthySessionMixin):
204
  :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
205
  :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
206
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
 
207
  :return: A `Response` object.
208
  """
 
 
209
  params = _validate(kwargs, self, StealthConfig)
210
  if not self._is_alive: # pragma: no cover
211
  raise RuntimeError("Context manager has been closed")
@@ -218,7 +221,10 @@ class StealthySession(SyncSession, StealthySessionMixin):
218
  )
219
 
220
  for attempt in range(self._config.retries):
221
- proxy = self._config.proxy_rotator.get_proxy() if self._config.proxy_rotator else None
 
 
 
222
 
223
  with self._page_generator(
224
  params.timeout, params.extra_headers, params.disable_resources, proxy
@@ -256,7 +262,7 @@ class StealthySession(SyncSession, StealthySessionMixin):
256
  page.wait_for_timeout(params.wait)
257
 
258
  response = ResponseFactory.from_playwright_response(
259
- page, first_response, final_response[0], params.selector_config
260
  )
261
  return response
262
 
@@ -454,8 +460,11 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
454
  :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
455
  :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
456
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
 
457
  :return: A `Response` object.
458
  """
 
 
459
  params = _validate(kwargs, self, StealthConfig)
460
 
461
  if not self._is_alive: # pragma: no cover
@@ -469,7 +478,10 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
469
  )
470
 
471
  for attempt in range(self._config.retries):
472
- proxy = self._config.proxy_rotator.get_proxy() if self._config.proxy_rotator else None
 
 
 
473
 
474
  async with self._page_generator(
475
  params.timeout, params.extra_headers, params.disable_resources, proxy
@@ -507,7 +519,7 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
507
  await page.wait_for_timeout(params.wait)
508
 
509
  response = await ResponseFactory.from_async_playwright_response(
510
- page, first_response, final_response[0], params.selector_config
511
  )
512
  return response
513
 
 
204
  :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
205
  :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
206
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
207
+ :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
208
  :return: A `Response` object.
209
  """
210
+ static_proxy = kwargs.pop("proxy", None)
211
+
212
  params = _validate(kwargs, self, StealthConfig)
213
  if not self._is_alive: # pragma: no cover
214
  raise RuntimeError("Context manager has been closed")
 
221
  )
222
 
223
  for attempt in range(self._config.retries):
224
+ if self._config.proxy_rotator and static_proxy is None:
225
+ proxy = self._config.proxy_rotator.get_proxy()
226
+ else:
227
+ proxy = static_proxy
228
 
229
  with self._page_generator(
230
  params.timeout, params.extra_headers, params.disable_resources, proxy
 
262
  page.wait_for_timeout(params.wait)
263
 
264
  response = ResponseFactory.from_playwright_response(
265
+ page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
266
  )
267
  return response
268
 
 
460
  :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
461
  :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
462
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
463
+ :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
464
  :return: A `Response` object.
465
  """
466
+ static_proxy = kwargs.pop("proxy", None)
467
+
468
  params = _validate(kwargs, self, StealthConfig)
469
 
470
  if not self._is_alive: # pragma: no cover
 
478
  )
479
 
480
  for attempt in range(self._config.retries):
481
+ if self._config.proxy_rotator and static_proxy is None:
482
+ proxy = self._config.proxy_rotator.get_proxy()
483
+ else:
484
+ proxy = static_proxy
485
 
486
  async with self._page_generator(
487
  params.timeout, params.extra_headers, params.disable_resources, proxy
 
519
  await page.wait_for_timeout(params.wait)
520
 
521
  response = await ResponseFactory.from_async_playwright_response(
522
+ page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
523
  )
524
  return response
525
 
scrapling/engines/_browsers/_types.py CHANGED
@@ -99,6 +99,7 @@ if TYPE_CHECKING: # pragma: no cover
99
  selector_config: Optional[Dict]
100
  extra_headers: Optional[Dict[str, str]]
101
  wait_selector_state: SelectorWaitStates
 
102
 
103
  class StealthSession(PlaywrightSession, total=False):
104
  allow_webgl: bool
 
99
  selector_config: Optional[Dict]
100
  extra_headers: Optional[Dict[str, str]]
101
  wait_selector_state: SelectorWaitStates
102
+ proxy: Optional[str | Dict[str, str]]
103
 
104
  class StealthSession(PlaywrightSession, total=False):
105
  allow_webgl: bool
scrapling/engines/static.py CHANGED
@@ -250,7 +250,7 @@ class _SyncSessionLogic(_ConfigurationLogic):
250
  request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
251
  try:
252
  response = session.request(method, **request_args)
253
- result = ResponseFactory.from_http_request(response, selector_config)
254
  return result
255
  except CurlError as e: # pragma: no cover
256
  if attempt < max_retries - 1:
@@ -466,7 +466,7 @@ class _ASyncSessionLogic(_ConfigurationLogic):
466
  request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
467
  try:
468
  response = await session.request(method, **request_args)
469
- result = ResponseFactory.from_http_request(response, selector_config)
470
  return result
471
  except CurlError as e: # pragma: no cover
472
  if attempt < max_retries - 1:
 
250
  request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
251
  try:
252
  response = session.request(method, **request_args)
253
+ result = ResponseFactory.from_http_request(response, selector_config, meta={"proxy": proxy})
254
  return result
255
  except CurlError as e: # pragma: no cover
256
  if attempt < max_retries - 1:
 
466
  request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
467
  try:
468
  response = await session.request(method, **request_args)
469
+ result = ResponseFactory.from_http_request(response, selector_config, meta={"proxy": proxy})
470
  return result
471
  except CurlError as e: # pragma: no cover
472
  if attempt < max_retries - 1:
scrapling/engines/toolbelt/convertor.py CHANGED
@@ -85,6 +85,7 @@ class ResponseFactory:
85
  first_response: SyncResponse,
86
  final_response: Optional[SyncResponse],
87
  parser_arguments: Dict,
 
88
  ) -> Response:
89
  """
90
  Transforms a Playwright response into an internal `Response` object, encapsulating
@@ -134,6 +135,7 @@ class ResponseFactory:
134
  "headers": first_response.all_headers(),
135
  "request_headers": first_response.request.all_headers(),
136
  "history": history,
 
137
  **parser_arguments,
138
  }
139
  )
@@ -220,6 +222,7 @@ class ResponseFactory:
220
  first_response: AsyncResponse,
221
  final_response: Optional[AsyncResponse],
222
  parser_arguments: Dict,
 
223
  ) -> Response:
224
  """
225
  Transforms a Playwright response into an internal `Response` object, encapsulating
@@ -269,16 +272,18 @@ class ResponseFactory:
269
  "headers": await first_response.all_headers(),
270
  "request_headers": await first_response.request.all_headers(),
271
  "history": history,
 
272
  **parser_arguments,
273
  }
274
  )
275
 
276
  @staticmethod
277
- def from_http_request(response: CurlResponse, parser_arguments: Dict) -> Response:
278
  """Takes `curl_cffi` response and generates `Response` object from it.
279
 
280
  :param response: `curl_cffi` response object
281
  :param parser_arguments: Additional arguments to be passed to the `Response` object constructor.
 
282
  :return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
283
  """
284
  return Response(
@@ -293,6 +298,7 @@ class ResponseFactory:
293
  "request_headers": dict(response.request.headers) if response.request else {},
294
  "method": response.request.method if response.request else "GET",
295
  "history": response.history, # https://github.com/lexiforest/curl_cffi/issues/82
 
296
  **parser_arguments,
297
  }
298
  )
 
85
  first_response: SyncResponse,
86
  final_response: Optional[SyncResponse],
87
  parser_arguments: Dict,
88
+ meta: Optional[Dict] = None,
89
  ) -> Response:
90
  """
91
  Transforms a Playwright response into an internal `Response` object, encapsulating
 
135
  "headers": first_response.all_headers(),
136
  "request_headers": first_response.request.all_headers(),
137
  "history": history,
138
+ "meta": meta,
139
  **parser_arguments,
140
  }
141
  )
 
222
  first_response: AsyncResponse,
223
  final_response: Optional[AsyncResponse],
224
  parser_arguments: Dict,
225
+ meta: Optional[Dict] = None,
226
  ) -> Response:
227
  """
228
  Transforms a Playwright response into an internal `Response` object, encapsulating
 
272
  "headers": await first_response.all_headers(),
273
  "request_headers": await first_response.request.all_headers(),
274
  "history": history,
275
+ "meta": meta,
276
  **parser_arguments,
277
  }
278
  )
279
 
280
  @staticmethod
281
+ def from_http_request(response: CurlResponse, parser_arguments: Dict, meta: Optional[Dict] = None) -> Response:
282
  """Takes `curl_cffi` response and generates `Response` object from it.
283
 
284
  :param response: `curl_cffi` response object
285
  :param parser_arguments: Additional arguments to be passed to the `Response` object constructor.
286
+ :param meta: Optional metadata dictionary to attach to the Response.
287
  :return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
288
  """
289
  return Response(
 
298
  "request_headers": dict(response.request.headers) if response.request else {},
299
  "method": response.request.method if response.request else "GET",
300
  "history": response.history, # https://github.com/lexiforest/curl_cffi/issues/82
301
+ "meta": meta,
302
  **parser_arguments,
303
  }
304
  )
scrapling/engines/toolbelt/custom.py CHANGED
@@ -39,6 +39,7 @@ class Response(Selector):
39
  encoding: str = "utf-8",
40
  method: str = "GET",
41
  history: List | None = None,
 
42
  **selector_config: Any,
43
  ):
44
  adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
@@ -57,7 +58,10 @@ class Response(Selector):
57
  # For easier debugging while working from a Python shell
58
  log.info(f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})")
59
 
60
- self.meta: Dict[str, Any] = {}
 
 
 
61
  self.request: Optional["Request"] = None # Will be set by crawler
62
 
63
  def follow(
 
39
  encoding: str = "utf-8",
40
  method: str = "GET",
41
  history: List | None = None,
42
+ meta: Dict[str, Any] | None = None,
43
  **selector_config: Any,
44
  ):
45
  adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
 
58
  # For easier debugging while working from a Python shell
59
  log.info(f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})")
60
 
61
+ if meta and not isinstance(meta, dict):
62
+ raise TypeError(f"Response meta should be dictionary but got {type(meta).__name__} instead!")
63
+
64
+ self.meta: Dict[str, Any] = meta or {}
65
  self.request: Optional["Request"] = None # Will be set by crawler
66
 
67
  def follow(
scrapling/spiders/engine.py CHANGED
@@ -113,6 +113,9 @@ class CrawlerEngine:
113
  retry_request._retry_count += 1
114
  retry_request.priority -= 1 # Don't retry immediately
115
  retry_request.dont_filter = True
 
 
 
116
  new_request = await self.spider.retry_blocked_request(retry_request, response)
117
  self._normalize_request(new_request)
118
  await self.scheduler.enqueue(new_request)
 
113
  retry_request._retry_count += 1
114
  retry_request.priority -= 1 # Don't retry immediately
115
  retry_request.dont_filter = True
116
+ retry_request._session_kwargs.pop("proxy", None)
117
+ retry_request._session_kwargs.pop("proxies", None)
118
+
119
  new_request = await self.spider.retry_blocked_request(retry_request, response)
120
  self._normalize_request(new_request)
121
  await self.scheduler.enqueue(new_request)
scrapling/spiders/session.py CHANGED
@@ -124,7 +124,8 @@ class SessionManager:
124
  response = await session.fetch(url=request.url, **request._session_kwargs)
125
 
126
  response.request = request
127
- response.meta = request.meta
 
128
  return response
129
  raise RuntimeError("No session found with the request session id")
130
 
 
124
  response = await session.fetch(url=request.url, **request._session_kwargs)
125
 
126
  response.request = request
127
+ # Merge request meta into response meta (response meta takes priority)
128
+ response.meta = {**request.meta, **response.meta}
129
  return response
130
  raise RuntimeError("No session found with the request session id")
131