Karim shoair committed on
Commit
76e6484
·
1 Parent(s): 313b59a

refactor(Fetcher): Fix the issue of caching impersonation state + Less code duplication

Browse files

It now creates a new session for each request, yet it is still 20% faster than the fetcher in v0.2.99, even with more features enabled.

Files changed (2) hide show
  1. scrapling/engines/static.py +31 -316
  2. scrapling/fetchers.py +5 -4
scrapling/engines/static.py CHANGED
@@ -265,10 +265,18 @@ class FetcherSession:
265
  :param adaptor_arguments: Arguments passed when creating the final Adaptor class.
266
  :return: A `Response` object for synchronous requests or an awaitable for asynchronous.
267
  """
268
- if self._curl_session:
 
 
 
 
 
 
 
 
269
  for attempt in range(max_retries):
270
  try:
271
- response = self._curl_session.request(method, **request_args)
272
  # response.raise_for_status() # Retry responses with a status code between 200-400
273
  return ResponseFactory.from_http_request(
274
  response, adaptor_arguments
@@ -304,12 +312,20 @@ class FetcherSession:
304
  :param adaptor_arguments: Arguments passed when creating the final Adaptor class.
305
  :return: A `Response` object for synchronous requests or an awaitable for asynchronous.
306
  """
307
- if self._async_curl_session:
 
 
 
 
 
 
 
 
 
 
308
  for attempt in range(max_retries):
309
  try:
310
- response = await self._async_curl_session.request(
311
- method, **request_args
312
- )
313
  # response.raise_for_status() # Retry responses with a status code between 200-400
314
  return ResponseFactory.from_http_request(
315
  response, adaptor_arguments
@@ -677,319 +693,18 @@ class FetcherSession:
677
  class FetcherClient(FetcherSession):
678
  def __init__(self, *args, **kwargs):
679
  super().__init__(*args, **kwargs)
680
- # Using one session for all requests is faster than using stateless `curl_cffi.get`
681
  self.__enter__ = None
682
  self.__exit__ = None
683
  self.__aenter__ = None
684
  self.__aexit__ = None
685
- self._curl_session = CurlSession()
686
-
687
-
688
- class AsyncFetcherClient:
689
- # Since curl_cffi doesn't support making async requests without sessions
690
- # And using a single session for many requests at the same time in async doesn't sit well with curl_cffi.
691
- # We do this
692
-
693
- @staticmethod
694
- async def get(
695
- url: str,
696
- params: Optional[Union[Dict, List, Tuple]] = None,
697
- headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
698
- cookies: Optional[CookieTypes] = None,
699
- timeout: Optional[Union[int, float]] = _UNSET,
700
- follow_redirects: Optional[bool] = _UNSET,
701
- max_redirects: Optional[int] = _UNSET,
702
- retries: Optional[int] = _UNSET,
703
- retry_delay: Optional[int] = _UNSET,
704
- proxies: Optional[ProxySpec] = _UNSET,
705
- proxy: Optional[str] = _UNSET,
706
- proxy_auth: Optional[Tuple[str, str]] = _UNSET,
707
- auth: Optional[Tuple[str, str]] = None,
708
- verify: Optional[bool] = _UNSET,
709
- cert: Optional[Union[str, Tuple[str, str]]] = _UNSET,
710
- impersonate: Optional[BrowserTypeLiteral] = _UNSET,
711
- http3: Optional[bool] = _UNSET,
712
- stealthy_headers: Optional[bool] = _UNSET,
713
- **kwargs,
714
- ) -> Response:
715
- """
716
- Perform a GET request.
717
-
718
- :param url: Target URL for the request.
719
- :param params: Query string parameters for the request.
720
- :param headers: Headers to include in the request.
721
- :param cookies: Cookies to use in the request.
722
- :param timeout: Number of seconds to wait before timing out.
723
- :param follow_redirects: Whether to follow redirects. Defaults to True.
724
- :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
725
- :param retries: Number of retry attempts. Defaults to 3.
726
- :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
727
- :param proxies: Dict of proxies to use.
728
- :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
729
- Cannot be used together with the `proxies` parameter.
730
- :param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
731
- :param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
732
- :param verify: Whether to verify HTTPS certificates.
733
- :param cert: Tuple of (cert, key) filenames for the client certificate.
734
- :param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
735
- :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
736
- :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
737
- :param kwargs: Additional keyword arguments to pass to the `curl_cffi.requests.AsyncSession().request()` method.
738
- :return: An awaitable `Response` object.
739
- """
740
- request_args = {
741
- "url": url,
742
- "params": params,
743
- "headers": headers,
744
- "cookies": cookies,
745
- "timeout": timeout,
746
- "retry_delay": retry_delay,
747
- "allow_redirects": follow_redirects,
748
- "max_redirects": max_redirects,
749
- "retries": retries,
750
- "proxies": proxies,
751
- "proxy": proxy,
752
- "proxy_auth": proxy_auth,
753
- "auth": auth,
754
- "verify": verify,
755
- "cert": cert,
756
- "impersonate": impersonate,
757
- "http3": http3,
758
- "stealthy_headers": stealthy_headers,
759
- **kwargs,
760
- }
761
- async with FetcherSession() as client:
762
- return await client.get(**request_args)
763
-
764
- @staticmethod
765
- async def post(
766
- url: str,
767
- data: Optional[Union[Dict, str]] = None,
768
- json: Optional[Union[Dict, List]] = None,
769
- headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
770
- params: Optional[Union[Dict, List, Tuple]] = None,
771
- cookies: Optional[CookieTypes] = None,
772
- timeout: Optional[Union[int, float]] = _UNSET,
773
- follow_redirects: Optional[bool] = _UNSET,
774
- max_redirects: Optional[int] = _UNSET,
775
- retries: Optional[int] = _UNSET,
776
- retry_delay: Optional[int] = _UNSET,
777
- proxies: Optional[ProxySpec] = _UNSET,
778
- proxy: Optional[str] = _UNSET,
779
- proxy_auth: Optional[Tuple[str, str]] = _UNSET,
780
- auth: Optional[Tuple[str, str]] = None,
781
- verify: Optional[bool] = _UNSET,
782
- cert: Optional[Union[str, Tuple[str, str]]] = _UNSET,
783
- impersonate: Optional[BrowserTypeLiteral] = _UNSET,
784
- http3: Optional[bool] = _UNSET,
785
- stealthy_headers: Optional[bool] = _UNSET,
786
- **kwargs,
787
- ) -> Response:
788
- """
789
- Perform a POST request.
790
-
791
- :param url: Target URL for the request.
792
- :param data: Form data to include in the request body.
793
- :param json: A JSON serializable object to include in the body of the request.
794
- :param headers: Headers to include in the request.
795
- :param params: Query string parameters for the request.
796
- :param cookies: Cookies to use in the request.
797
- :param timeout: Number of seconds to wait before timing out.
798
- :param follow_redirects: Whether to follow redirects. Defaults to True.
799
- :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
800
- :param retries: Number of retry attempts. Defaults to 3.
801
- :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
802
- :param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
803
- :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
804
- Cannot be used together with the `proxies` parameter.
805
- :param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
806
- :param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
807
- :param verify: Whether to verify HTTPS certificates. Defaults to True.
808
- :param cert: Tuple of (cert, key) filenames for the client certificate.
809
- :param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
810
- :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
811
- :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
812
- :param kwargs: Additional keyword arguments to pass to the `curl_cffi.requests.AsyncSession().request()` method.
813
- :return: An awaitable `Response` object.
814
- """
815
- request_args = {
816
- "url": url,
817
- "data": data,
818
- "json": json,
819
- "headers": headers,
820
- "params": params,
821
- "cookies": cookies,
822
- "timeout": timeout,
823
- "retry_delay": retry_delay,
824
- "proxy": proxy,
825
- "impersonate": impersonate,
826
- "allow_redirects": follow_redirects,
827
- "max_redirects": max_redirects,
828
- "retries": retries,
829
- "proxies": proxies,
830
- "proxy_auth": proxy_auth,
831
- "auth": auth,
832
- "verify": verify,
833
- "cert": cert,
834
- "http3": http3,
835
- "stealthy_headers": stealthy_headers,
836
- **kwargs,
837
- }
838
- async with FetcherSession() as client:
839
- return await client.post(**request_args)
840
-
841
- @staticmethod
842
- async def put(
843
- url: str,
844
- data: Optional[Union[Dict, str]] = None,
845
- json: Optional[Union[Dict, List]] = None,
846
- headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
847
- params: Optional[Union[Dict, List, Tuple]] = None,
848
- cookies: Optional[CookieTypes] = None,
849
- timeout: Optional[Union[int, float]] = _UNSET,
850
- follow_redirects: Optional[bool] = _UNSET,
851
- max_redirects: Optional[int] = _UNSET,
852
- retries: Optional[int] = _UNSET,
853
- retry_delay: Optional[int] = _UNSET,
854
- proxies: Optional[ProxySpec] = _UNSET,
855
- proxy: Optional[str] = _UNSET,
856
- proxy_auth: Optional[Tuple[str, str]] = _UNSET,
857
- auth: Optional[Tuple[str, str]] = None,
858
- verify: Optional[bool] = _UNSET,
859
- cert: Optional[Union[str, Tuple[str, str]]] = _UNSET,
860
- impersonate: Optional[BrowserTypeLiteral] = _UNSET,
861
- http3: Optional[bool] = _UNSET,
862
- stealthy_headers: Optional[bool] = _UNSET,
863
- **kwargs,
864
- ) -> Response:
865
- """
866
- Perform a PUT request.
867
 
868
- :param url: Target URL for the request.
869
- :param data: Form data to include in the request body.
870
- :param json: A JSON serializable object to include in the body of the request.
871
- :param headers: Headers to include in the request.
872
- :param params: Query string parameters for the request.
873
- :param cookies: Cookies to use in the request.
874
- :param timeout: Number of seconds to wait before timing out.
875
- :param follow_redirects: Whether to follow redirects. Defaults to True.
876
- :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
877
- :param retries: Number of retry attempts. Defaults to 3.
878
- :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
879
- :param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
880
- :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
881
- Cannot be used together with the `proxies` parameter.
882
- :param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
883
- :param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
884
- :param verify: Whether to verify HTTPS certificates. Defaults to True.
885
- :param cert: Tuple of (cert, key) filenames for the client certificate.
886
- :param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
887
- :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
888
- :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
889
- :param kwargs: Additional keyword arguments to pass to the `curl_cffi.requests.AsyncSession().request()` method.
890
- :return: An awaitable `Response` object.
891
- """
892
- request_args = {
893
- "url": url,
894
- "data": data,
895
- "json": json,
896
- "headers": headers,
897
- "params": params,
898
- "cookies": cookies,
899
- "timeout": timeout,
900
- "retry_delay": retry_delay,
901
- "proxy": proxy,
902
- "impersonate": impersonate,
903
- "allow_redirects": follow_redirects,
904
- "max_redirects": max_redirects,
905
- "retries": retries,
906
- "proxies": proxies,
907
- "proxy_auth": proxy_auth,
908
- "auth": auth,
909
- "verify": verify,
910
- "cert": cert,
911
- "http3": http3,
912
- "stealthy_headers": stealthy_headers,
913
- **kwargs,
914
- }
915
- async with FetcherSession() as client:
916
- return await client.put(**request_args)
917
 
918
- @staticmethod
919
- async def delete(
920
- url: str,
921
- data: Optional[Union[Dict, str]] = None,
922
- json: Optional[Union[Dict, List]] = None,
923
- headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
924
- params: Optional[Union[Dict, List, Tuple]] = None,
925
- cookies: Optional[CookieTypes] = None,
926
- timeout: Optional[Union[int, float]] = _UNSET,
927
- follow_redirects: Optional[bool] = _UNSET,
928
- max_redirects: Optional[int] = _UNSET,
929
- retries: Optional[int] = _UNSET,
930
- retry_delay: Optional[int] = _UNSET,
931
- proxies: Optional[ProxySpec] = _UNSET,
932
- proxy: Optional[str] = _UNSET,
933
- proxy_auth: Optional[Tuple[str, str]] = _UNSET,
934
- auth: Optional[Tuple[str, str]] = None,
935
- verify: Optional[bool] = _UNSET,
936
- cert: Optional[Union[str, Tuple[str, str]]] = _UNSET,
937
- impersonate: Optional[BrowserTypeLiteral] = _UNSET,
938
- http3: Optional[bool] = _UNSET,
939
- stealthy_headers: Optional[bool] = _UNSET,
940
- **kwargs,
941
- ) -> Response:
942
- """
943
- Perform a DELETE request.
944
-
945
- :param url: Target URL for the request.
946
- :param data: Form data to include in the request body.
947
- :param json: A JSON serializable object to include in the body of the request.
948
- :param headers: Headers to include in the request.
949
- :param params: Query string parameters for the request.
950
- :param cookies: Cookies to use in the request.
951
- :param timeout: Number of seconds to wait before timing out.
952
- :param follow_redirects: Whether to follow redirects. Defaults to True.
953
- :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
954
- :param retries: Number of retry attempts. Defaults to 3.
955
- :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
956
- :param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
957
- :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
958
- Cannot be used together with the `proxies` parameter.
959
- :param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
960
- :param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
961
- :param verify: Whether to verify HTTPS certificates. Defaults to True.
962
- :param cert: Tuple of (cert, key) filenames for the client certificate.
963
- :param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
964
- :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
965
- :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
966
- :param kwargs: Additional keyword arguments to pass to the `curl_cffi.requests.AsyncSession().request()` method.
967
- :return: An awaitable `Response` object.
968
- """
969
- request_args = {
970
- "url": url,
971
- # Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
972
- # But some websites accept it, it depends on the implementation used.
973
- "data": data,
974
- "json": json,
975
- "headers": headers,
976
- "params": params,
977
- "cookies": cookies,
978
- "timeout": timeout,
979
- "retry_delay": retry_delay,
980
- "proxy": proxy,
981
- "impersonate": impersonate,
982
- "allow_redirects": follow_redirects,
983
- "max_redirects": max_redirects,
984
- "retries": retries,
985
- "proxies": proxies,
986
- "proxy_auth": proxy_auth,
987
- "auth": auth,
988
- "verify": verify,
989
- "cert": cert,
990
- "http3": http3,
991
- "stealthy_headers": stealthy_headers,
992
- **kwargs,
993
- }
994
- async with FetcherSession() as client:
995
- return await client.delete(**request_args)
 
265
  :param adaptor_arguments: Arguments passed when creating the final Adaptor class.
266
  :return: A `Response` object for synchronous requests or an awaitable for asynchronous.
267
  """
268
+ session = self._curl_session
269
+ if session is True and not any(
270
+ (self.__enter__, self.__exit__, self.__aenter__, self.__aexit__)
271
+ ):
272
+ # For usage inside FetcherClient
273
+ # It turns out `curl_cffi` caches impersonation state: if you turn it off, then on, then off again, it won't actually be off the last time.
274
+ session = CurlSession()
275
+
276
+ if session:
277
  for attempt in range(max_retries):
278
  try:
279
+ response = session.request(method, **request_args)
280
  # response.raise_for_status() # Retry responses with a status code between 200-400
281
  return ResponseFactory.from_http_request(
282
  response, adaptor_arguments
 
312
  :param adaptor_arguments: Arguments passed when creating the final Adaptor class.
313
  :return: A `Response` object for synchronous requests or an awaitable for asynchronous.
314
  """
315
+ session = self._async_curl_session
316
+ if session is True and not any(
317
+ (self.__enter__, self.__exit__, self.__aenter__, self.__aexit__)
318
+ ):
319
+ # For usage inside the `AsyncFetcherClient` class, for several reasons:
320
+ # 1. It turns out `curl_cffi` caches impersonation state: if you turn it off, then on, then off again, it won't actually be off the last time.
321
+ # 2. `curl_cffi` doesn't support making async requests without sessions
322
+ # 3. Using a single session for many requests at the same time in async doesn't sit well with curl_cffi.
323
+ session = AsyncCurlSession()
324
+
325
+ if session:
326
  for attempt in range(max_retries):
327
  try:
328
+ response = await session.request(method, **request_args)
 
 
329
  # response.raise_for_status() # Retry responses with a status code between 200-400
330
  return ResponseFactory.from_http_request(
331
  response, adaptor_arguments
 
693
  class FetcherClient(FetcherSession):
694
  def __init__(self, *args, **kwargs):
695
  super().__init__(*args, **kwargs)
 
696
  self.__enter__ = None
697
  self.__exit__ = None
698
  self.__aenter__ = None
699
  self.__aexit__ = None
700
+ self._curl_session = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
701
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
702
 
703
+ class AsyncFetcherClient(FetcherSession):
704
+ def __init__(self, *args, **kwargs):
705
+ super().__init__(*args, **kwargs)
706
+ self.__enter__ = None
707
+ self.__exit__ = None
708
+ self.__aenter__ = None
709
+ self.__aexit__ = None
710
+ self._async_curl_session = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scrapling/fetchers.py CHANGED
@@ -20,6 +20,7 @@ from scrapling.engines import (
20
  from scrapling.engines.toolbelt import BaseFetcher, Response
21
 
22
  __FetcherClientInstance__ = _FetcherClient()
 
23
 
24
 
25
  class Fetcher(BaseFetcher):
@@ -34,10 +35,10 @@ class Fetcher(BaseFetcher):
34
  class AsyncFetcher(BaseFetcher):
35
  """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`."""
36
 
37
- get = _AsyncFetcherClient.get
38
- post = _AsyncFetcherClient.post
39
- put = _AsyncFetcherClient.put
40
- delete = _AsyncFetcherClient.delete
41
 
42
 
43
  class StealthyFetcher(BaseFetcher):
 
20
  from scrapling.engines.toolbelt import BaseFetcher, Response
21
 
22
  __FetcherClientInstance__ = _FetcherClient()
23
+ __AsyncFetcherClientInstance__ = _AsyncFetcherClient()
24
 
25
 
26
  class Fetcher(BaseFetcher):
 
35
  class AsyncFetcher(BaseFetcher):
36
  """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`."""
37
 
38
+ get = __AsyncFetcherClientInstance__.get
39
+ post = __AsyncFetcherClientInstance__.post
40
+ put = __AsyncFetcherClientInstance__.put
41
+ delete = __AsyncFetcherClientInstance__.delete
42
 
43
 
44
  class StealthyFetcher(BaseFetcher):