Karim shoair committed on
Commit ·
76e6484
1
Parent(s): 313b59a
refactor(Fetcher): Fix the issue of caching impersonation state + Less code duplication
Browse files
It now creates a new session for each request, yet it is still 20% faster than the fetcher in v0.2.99, with more features enabled.
- scrapling/engines/static.py +31 -316
- scrapling/fetchers.py +5 -4
scrapling/engines/static.py
CHANGED
|
@@ -265,10 +265,18 @@ class FetcherSession:
|
|
| 265 |
:param adaptor_arguments: Arguments passed when creating the final Adaptor class.
|
| 266 |
:return: A `Response` object for synchronous requests or an awaitable for asynchronous.
|
| 267 |
"""
|
| 268 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
for attempt in range(max_retries):
|
| 270 |
try:
|
| 271 |
-
response =
|
| 272 |
# response.raise_for_status() # Retry responses with a status code between 200-400
|
| 273 |
return ResponseFactory.from_http_request(
|
| 274 |
response, adaptor_arguments
|
|
@@ -304,12 +312,20 @@ class FetcherSession:
|
|
| 304 |
:param adaptor_arguments: Arguments passed when creating the final Adaptor class.
|
| 305 |
:return: A `Response` object for synchronous requests or an awaitable for asynchronous.
|
| 306 |
"""
|
| 307 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
for attempt in range(max_retries):
|
| 309 |
try:
|
| 310 |
-
response = await
|
| 311 |
-
method, **request_args
|
| 312 |
-
)
|
| 313 |
# response.raise_for_status() # Retry responses with a status code between 200-400
|
| 314 |
return ResponseFactory.from_http_request(
|
| 315 |
response, adaptor_arguments
|
|
@@ -677,319 +693,18 @@ class FetcherSession:
|
|
| 677 |
class FetcherClient(FetcherSession):
|
| 678 |
def __init__(self, *args, **kwargs):
|
| 679 |
super().__init__(*args, **kwargs)
|
| 680 |
-
# Using one session for all requests is faster than using stateless `curl_cffi.get`
|
| 681 |
self.__enter__ = None
|
| 682 |
self.__exit__ = None
|
| 683 |
self.__aenter__ = None
|
| 684 |
self.__aexit__ = None
|
| 685 |
-
self._curl_session =
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
class AsyncFetcherClient:
|
| 689 |
-
# Since curl_cffi doesn't support making async requests without sessions
|
| 690 |
-
# And using a single session for many requests at the same time in async doesn't sit well with curl_cffi.
|
| 691 |
-
# We do this
|
| 692 |
-
|
| 693 |
-
@staticmethod
|
| 694 |
-
async def get(
|
| 695 |
-
url: str,
|
| 696 |
-
params: Optional[Union[Dict, List, Tuple]] = None,
|
| 697 |
-
headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
|
| 698 |
-
cookies: Optional[CookieTypes] = None,
|
| 699 |
-
timeout: Optional[Union[int, float]] = _UNSET,
|
| 700 |
-
follow_redirects: Optional[bool] = _UNSET,
|
| 701 |
-
max_redirects: Optional[int] = _UNSET,
|
| 702 |
-
retries: Optional[int] = _UNSET,
|
| 703 |
-
retry_delay: Optional[int] = _UNSET,
|
| 704 |
-
proxies: Optional[ProxySpec] = _UNSET,
|
| 705 |
-
proxy: Optional[str] = _UNSET,
|
| 706 |
-
proxy_auth: Optional[Tuple[str, str]] = _UNSET,
|
| 707 |
-
auth: Optional[Tuple[str, str]] = None,
|
| 708 |
-
verify: Optional[bool] = _UNSET,
|
| 709 |
-
cert: Optional[Union[str, Tuple[str, str]]] = _UNSET,
|
| 710 |
-
impersonate: Optional[BrowserTypeLiteral] = _UNSET,
|
| 711 |
-
http3: Optional[bool] = _UNSET,
|
| 712 |
-
stealthy_headers: Optional[bool] = _UNSET,
|
| 713 |
-
**kwargs,
|
| 714 |
-
) -> Response:
|
| 715 |
-
"""
|
| 716 |
-
Perform a GET request.
|
| 717 |
-
|
| 718 |
-
:param url: Target URL for the request.
|
| 719 |
-
:param params: Query string parameters for the request.
|
| 720 |
-
:param headers: Headers to include in the request.
|
| 721 |
-
:param cookies: Cookies to use in the request.
|
| 722 |
-
:param timeout: Number of seconds to wait before timing out.
|
| 723 |
-
:param follow_redirects: Whether to follow redirects. Defaults to True.
|
| 724 |
-
:param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
|
| 725 |
-
:param retries: Number of retry attempts. Defaults to 3.
|
| 726 |
-
:param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
|
| 727 |
-
:param proxies: Dict of proxies to use.
|
| 728 |
-
:param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
|
| 729 |
-
Cannot be used together with the `proxies` parameter.
|
| 730 |
-
:param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
|
| 731 |
-
:param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
|
| 732 |
-
:param verify: Whether to verify HTTPS certificates.
|
| 733 |
-
:param cert: Tuple of (cert, key) filenames for the client certificate.
|
| 734 |
-
:param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
|
| 735 |
-
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
|
| 736 |
-
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
|
| 737 |
-
:param kwargs: Additional keyword arguments to pass to the `curl_cffi.requests.AsyncSession().request()` method.
|
| 738 |
-
:return: An awaitable `Response` object.
|
| 739 |
-
"""
|
| 740 |
-
request_args = {
|
| 741 |
-
"url": url,
|
| 742 |
-
"params": params,
|
| 743 |
-
"headers": headers,
|
| 744 |
-
"cookies": cookies,
|
| 745 |
-
"timeout": timeout,
|
| 746 |
-
"retry_delay": retry_delay,
|
| 747 |
-
"allow_redirects": follow_redirects,
|
| 748 |
-
"max_redirects": max_redirects,
|
| 749 |
-
"retries": retries,
|
| 750 |
-
"proxies": proxies,
|
| 751 |
-
"proxy": proxy,
|
| 752 |
-
"proxy_auth": proxy_auth,
|
| 753 |
-
"auth": auth,
|
| 754 |
-
"verify": verify,
|
| 755 |
-
"cert": cert,
|
| 756 |
-
"impersonate": impersonate,
|
| 757 |
-
"http3": http3,
|
| 758 |
-
"stealthy_headers": stealthy_headers,
|
| 759 |
-
**kwargs,
|
| 760 |
-
}
|
| 761 |
-
async with FetcherSession() as client:
|
| 762 |
-
return await client.get(**request_args)
|
| 763 |
-
|
| 764 |
-
@staticmethod
|
| 765 |
-
async def post(
|
| 766 |
-
url: str,
|
| 767 |
-
data: Optional[Union[Dict, str]] = None,
|
| 768 |
-
json: Optional[Union[Dict, List]] = None,
|
| 769 |
-
headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
|
| 770 |
-
params: Optional[Union[Dict, List, Tuple]] = None,
|
| 771 |
-
cookies: Optional[CookieTypes] = None,
|
| 772 |
-
timeout: Optional[Union[int, float]] = _UNSET,
|
| 773 |
-
follow_redirects: Optional[bool] = _UNSET,
|
| 774 |
-
max_redirects: Optional[int] = _UNSET,
|
| 775 |
-
retries: Optional[int] = _UNSET,
|
| 776 |
-
retry_delay: Optional[int] = _UNSET,
|
| 777 |
-
proxies: Optional[ProxySpec] = _UNSET,
|
| 778 |
-
proxy: Optional[str] = _UNSET,
|
| 779 |
-
proxy_auth: Optional[Tuple[str, str]] = _UNSET,
|
| 780 |
-
auth: Optional[Tuple[str, str]] = None,
|
| 781 |
-
verify: Optional[bool] = _UNSET,
|
| 782 |
-
cert: Optional[Union[str, Tuple[str, str]]] = _UNSET,
|
| 783 |
-
impersonate: Optional[BrowserTypeLiteral] = _UNSET,
|
| 784 |
-
http3: Optional[bool] = _UNSET,
|
| 785 |
-
stealthy_headers: Optional[bool] = _UNSET,
|
| 786 |
-
**kwargs,
|
| 787 |
-
) -> Response:
|
| 788 |
-
"""
|
| 789 |
-
Perform a POST request.
|
| 790 |
-
|
| 791 |
-
:param url: Target URL for the request.
|
| 792 |
-
:param data: Form data to include in the request body.
|
| 793 |
-
:param json: A JSON serializable object to include in the body of the request.
|
| 794 |
-
:param headers: Headers to include in the request.
|
| 795 |
-
:param params: Query string parameters for the request.
|
| 796 |
-
:param cookies: Cookies to use in the request.
|
| 797 |
-
:param timeout: Number of seconds to wait before timing out.
|
| 798 |
-
:param follow_redirects: Whether to follow redirects. Defaults to True.
|
| 799 |
-
:param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
|
| 800 |
-
:param retries: Number of retry attempts. Defaults to 3.
|
| 801 |
-
:param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
|
| 802 |
-
:param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
|
| 803 |
-
:param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
|
| 804 |
-
Cannot be used together with the `proxies` parameter.
|
| 805 |
-
:param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
|
| 806 |
-
:param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
|
| 807 |
-
:param verify: Whether to verify HTTPS certificates. Defaults to True.
|
| 808 |
-
:param cert: Tuple of (cert, key) filenames for the client certificate.
|
| 809 |
-
:param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
|
| 810 |
-
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
|
| 811 |
-
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
|
| 812 |
-
:param kwargs: Additional keyword arguments to pass to the `curl_cffi.requests.AsyncSession().request()` method.
|
| 813 |
-
:return: An awaitable `Response` object.
|
| 814 |
-
"""
|
| 815 |
-
request_args = {
|
| 816 |
-
"url": url,
|
| 817 |
-
"data": data,
|
| 818 |
-
"json": json,
|
| 819 |
-
"headers": headers,
|
| 820 |
-
"params": params,
|
| 821 |
-
"cookies": cookies,
|
| 822 |
-
"timeout": timeout,
|
| 823 |
-
"retry_delay": retry_delay,
|
| 824 |
-
"proxy": proxy,
|
| 825 |
-
"impersonate": impersonate,
|
| 826 |
-
"allow_redirects": follow_redirects,
|
| 827 |
-
"max_redirects": max_redirects,
|
| 828 |
-
"retries": retries,
|
| 829 |
-
"proxies": proxies,
|
| 830 |
-
"proxy_auth": proxy_auth,
|
| 831 |
-
"auth": auth,
|
| 832 |
-
"verify": verify,
|
| 833 |
-
"cert": cert,
|
| 834 |
-
"http3": http3,
|
| 835 |
-
"stealthy_headers": stealthy_headers,
|
| 836 |
-
**kwargs,
|
| 837 |
-
}
|
| 838 |
-
async with FetcherSession() as client:
|
| 839 |
-
return await client.post(**request_args)
|
| 840 |
-
|
| 841 |
-
@staticmethod
|
| 842 |
-
async def put(
|
| 843 |
-
url: str,
|
| 844 |
-
data: Optional[Union[Dict, str]] = None,
|
| 845 |
-
json: Optional[Union[Dict, List]] = None,
|
| 846 |
-
headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
|
| 847 |
-
params: Optional[Union[Dict, List, Tuple]] = None,
|
| 848 |
-
cookies: Optional[CookieTypes] = None,
|
| 849 |
-
timeout: Optional[Union[int, float]] = _UNSET,
|
| 850 |
-
follow_redirects: Optional[bool] = _UNSET,
|
| 851 |
-
max_redirects: Optional[int] = _UNSET,
|
| 852 |
-
retries: Optional[int] = _UNSET,
|
| 853 |
-
retry_delay: Optional[int] = _UNSET,
|
| 854 |
-
proxies: Optional[ProxySpec] = _UNSET,
|
| 855 |
-
proxy: Optional[str] = _UNSET,
|
| 856 |
-
proxy_auth: Optional[Tuple[str, str]] = _UNSET,
|
| 857 |
-
auth: Optional[Tuple[str, str]] = None,
|
| 858 |
-
verify: Optional[bool] = _UNSET,
|
| 859 |
-
cert: Optional[Union[str, Tuple[str, str]]] = _UNSET,
|
| 860 |
-
impersonate: Optional[BrowserTypeLiteral] = _UNSET,
|
| 861 |
-
http3: Optional[bool] = _UNSET,
|
| 862 |
-
stealthy_headers: Optional[bool] = _UNSET,
|
| 863 |
-
**kwargs,
|
| 864 |
-
) -> Response:
|
| 865 |
-
"""
|
| 866 |
-
Perform a PUT request.
|
| 867 |
|
| 868 |
-
:param url: Target URL for the request.
|
| 869 |
-
:param data: Form data to include in the request body.
|
| 870 |
-
:param json: A JSON serializable object to include in the body of the request.
|
| 871 |
-
:param headers: Headers to include in the request.
|
| 872 |
-
:param params: Query string parameters for the request.
|
| 873 |
-
:param cookies: Cookies to use in the request.
|
| 874 |
-
:param timeout: Number of seconds to wait before timing out.
|
| 875 |
-
:param follow_redirects: Whether to follow redirects. Defaults to True.
|
| 876 |
-
:param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
|
| 877 |
-
:param retries: Number of retry attempts. Defaults to 3.
|
| 878 |
-
:param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
|
| 879 |
-
:param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
|
| 880 |
-
:param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
|
| 881 |
-
Cannot be used together with the `proxies` parameter.
|
| 882 |
-
:param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
|
| 883 |
-
:param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
|
| 884 |
-
:param verify: Whether to verify HTTPS certificates. Defaults to True.
|
| 885 |
-
:param cert: Tuple of (cert, key) filenames for the client certificate.
|
| 886 |
-
:param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
|
| 887 |
-
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
|
| 888 |
-
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
|
| 889 |
-
:param kwargs: Additional keyword arguments to pass to the `curl_cffi.requests.AsyncSession().request()` method.
|
| 890 |
-
:return: An awaitable `Response` object.
|
| 891 |
-
"""
|
| 892 |
-
request_args = {
|
| 893 |
-
"url": url,
|
| 894 |
-
"data": data,
|
| 895 |
-
"json": json,
|
| 896 |
-
"headers": headers,
|
| 897 |
-
"params": params,
|
| 898 |
-
"cookies": cookies,
|
| 899 |
-
"timeout": timeout,
|
| 900 |
-
"retry_delay": retry_delay,
|
| 901 |
-
"proxy": proxy,
|
| 902 |
-
"impersonate": impersonate,
|
| 903 |
-
"allow_redirects": follow_redirects,
|
| 904 |
-
"max_redirects": max_redirects,
|
| 905 |
-
"retries": retries,
|
| 906 |
-
"proxies": proxies,
|
| 907 |
-
"proxy_auth": proxy_auth,
|
| 908 |
-
"auth": auth,
|
| 909 |
-
"verify": verify,
|
| 910 |
-
"cert": cert,
|
| 911 |
-
"http3": http3,
|
| 912 |
-
"stealthy_headers": stealthy_headers,
|
| 913 |
-
**kwargs,
|
| 914 |
-
}
|
| 915 |
-
async with FetcherSession() as client:
|
| 916 |
-
return await client.put(**request_args)
|
| 917 |
|
| 918 |
-
|
| 919 |
-
|
| 920 |
-
|
| 921 |
-
|
| 922 |
-
|
| 923 |
-
|
| 924 |
-
|
| 925 |
-
|
| 926 |
-
timeout: Optional[Union[int, float]] = _UNSET,
|
| 927 |
-
follow_redirects: Optional[bool] = _UNSET,
|
| 928 |
-
max_redirects: Optional[int] = _UNSET,
|
| 929 |
-
retries: Optional[int] = _UNSET,
|
| 930 |
-
retry_delay: Optional[int] = _UNSET,
|
| 931 |
-
proxies: Optional[ProxySpec] = _UNSET,
|
| 932 |
-
proxy: Optional[str] = _UNSET,
|
| 933 |
-
proxy_auth: Optional[Tuple[str, str]] = _UNSET,
|
| 934 |
-
auth: Optional[Tuple[str, str]] = None,
|
| 935 |
-
verify: Optional[bool] = _UNSET,
|
| 936 |
-
cert: Optional[Union[str, Tuple[str, str]]] = _UNSET,
|
| 937 |
-
impersonate: Optional[BrowserTypeLiteral] = _UNSET,
|
| 938 |
-
http3: Optional[bool] = _UNSET,
|
| 939 |
-
stealthy_headers: Optional[bool] = _UNSET,
|
| 940 |
-
**kwargs,
|
| 941 |
-
) -> Response:
|
| 942 |
-
"""
|
| 943 |
-
Perform a DELETE request.
|
| 944 |
-
|
| 945 |
-
:param url: Target URL for the request.
|
| 946 |
-
:param data: Form data to include in the request body.
|
| 947 |
-
:param json: A JSON serializable object to include in the body of the request.
|
| 948 |
-
:param headers: Headers to include in the request.
|
| 949 |
-
:param params: Query string parameters for the request.
|
| 950 |
-
:param cookies: Cookies to use in the request.
|
| 951 |
-
:param timeout: Number of seconds to wait before timing out.
|
| 952 |
-
:param follow_redirects: Whether to follow redirects. Defaults to True.
|
| 953 |
-
:param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
|
| 954 |
-
:param retries: Number of retry attempts. Defaults to 3.
|
| 955 |
-
:param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
|
| 956 |
-
:param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
|
| 957 |
-
:param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
|
| 958 |
-
Cannot be used together with the `proxies` parameter.
|
| 959 |
-
:param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
|
| 960 |
-
:param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
|
| 961 |
-
:param verify: Whether to verify HTTPS certificates. Defaults to True.
|
| 962 |
-
:param cert: Tuple of (cert, key) filenames for the client certificate.
|
| 963 |
-
:param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
|
| 964 |
-
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
|
| 965 |
-
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
|
| 966 |
-
:param kwargs: Additional keyword arguments to pass to the `curl_cffi.requests.AsyncSession().request()` method.
|
| 967 |
-
:return: An awaitable `Response` object.
|
| 968 |
-
"""
|
| 969 |
-
request_args = {
|
| 970 |
-
"url": url,
|
| 971 |
-
# Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
|
| 972 |
-
# But some websites accept it, it depends on the implementation used.
|
| 973 |
-
"data": data,
|
| 974 |
-
"json": json,
|
| 975 |
-
"headers": headers,
|
| 976 |
-
"params": params,
|
| 977 |
-
"cookies": cookies,
|
| 978 |
-
"timeout": timeout,
|
| 979 |
-
"retry_delay": retry_delay,
|
| 980 |
-
"proxy": proxy,
|
| 981 |
-
"impersonate": impersonate,
|
| 982 |
-
"allow_redirects": follow_redirects,
|
| 983 |
-
"max_redirects": max_redirects,
|
| 984 |
-
"retries": retries,
|
| 985 |
-
"proxies": proxies,
|
| 986 |
-
"proxy_auth": proxy_auth,
|
| 987 |
-
"auth": auth,
|
| 988 |
-
"verify": verify,
|
| 989 |
-
"cert": cert,
|
| 990 |
-
"http3": http3,
|
| 991 |
-
"stealthy_headers": stealthy_headers,
|
| 992 |
-
**kwargs,
|
| 993 |
-
}
|
| 994 |
-
async with FetcherSession() as client:
|
| 995 |
-
return await client.delete(**request_args)
|
|
|
|
| 265 |
:param adaptor_arguments: Arguments passed when creating the final Adaptor class.
|
| 266 |
:return: A `Response` object for synchronous requests or an awaitable for asynchronous.
|
| 267 |
"""
|
| 268 |
+
session = self._curl_session
|
| 269 |
+
if session is True and not any(
|
| 270 |
+
(self.__enter__, self.__exit__, self.__aenter__, self.__aexit__)
|
| 271 |
+
):
|
| 272 |
+
# For usage inside FetcherClient
|
| 273 |
+
# It turns out `curl_cffi` caches impersonation state, so if you turn it off, then on, then off again, it won't actually be off the last time.
|
| 274 |
+
session = CurlSession()
|
| 275 |
+
|
| 276 |
+
if session:
|
| 277 |
for attempt in range(max_retries):
|
| 278 |
try:
|
| 279 |
+
response = session.request(method, **request_args)
|
| 280 |
# response.raise_for_status() # Retry responses with a status code between 200-400
|
| 281 |
return ResponseFactory.from_http_request(
|
| 282 |
response, adaptor_arguments
|
|
|
|
| 312 |
:param adaptor_arguments: Arguments passed when creating the final Adaptor class.
|
| 313 |
:return: A `Response` object for synchronous requests or an awaitable for asynchronous.
|
| 314 |
"""
|
| 315 |
+
session = self._async_curl_session
|
| 316 |
+
if session is True and not any(
|
| 317 |
+
(self.__enter__, self.__exit__, self.__aenter__, self.__aexit__)
|
| 318 |
+
):
|
| 319 |
+
# For usage inside the `AsyncFetcherClient` class, for several reasons:
|
| 320 |
+
# 1. It turns out `curl_cffi` caches impersonation state, so if you turn it off, then on, then off again, it won't actually be off the last time.
|
| 321 |
+
# 2. `curl_cffi` doesn't support making async requests without sessions
|
| 322 |
+
# 3. Using a single session for many requests at the same time in async doesn't sit well with curl_cffi.
|
| 323 |
+
session = AsyncCurlSession()
|
| 324 |
+
|
| 325 |
+
if session:
|
| 326 |
for attempt in range(max_retries):
|
| 327 |
try:
|
| 328 |
+
response = await session.request(method, **request_args)
|
|
|
|
|
|
|
| 329 |
# response.raise_for_status() # Retry responses with a status code between 200-400
|
| 330 |
return ResponseFactory.from_http_request(
|
| 331 |
response, adaptor_arguments
|
|
|
|
| 693 |
class FetcherClient(FetcherSession):
|
| 694 |
def __init__(self, *args, **kwargs):
|
| 695 |
super().__init__(*args, **kwargs)
|
|
|
|
| 696 |
self.__enter__ = None
|
| 697 |
self.__exit__ = None
|
| 698 |
self.__aenter__ = None
|
| 699 |
self.__aexit__ = None
|
| 700 |
+
self._curl_session = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 701 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 702 |
|
| 703 |
+
class AsyncFetcherClient(FetcherSession):
|
| 704 |
+
def __init__(self, *args, **kwargs):
|
| 705 |
+
super().__init__(*args, **kwargs)
|
| 706 |
+
self.__enter__ = None
|
| 707 |
+
self.__exit__ = None
|
| 708 |
+
self.__aenter__ = None
|
| 709 |
+
self.__aexit__ = None
|
| 710 |
+
self._async_curl_session = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scrapling/fetchers.py
CHANGED
|
@@ -20,6 +20,7 @@ from scrapling.engines import (
|
|
| 20 |
from scrapling.engines.toolbelt import BaseFetcher, Response
|
| 21 |
|
| 22 |
__FetcherClientInstance__ = _FetcherClient()
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
class Fetcher(BaseFetcher):
|
|
@@ -34,10 +35,10 @@ class Fetcher(BaseFetcher):
|
|
| 34 |
class AsyncFetcher(BaseFetcher):
|
| 35 |
"""A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`."""
|
| 36 |
|
| 37 |
-
get =
|
| 38 |
-
post =
|
| 39 |
-
put =
|
| 40 |
-
delete =
|
| 41 |
|
| 42 |
|
| 43 |
class StealthyFetcher(BaseFetcher):
|
|
|
|
| 20 |
from scrapling.engines.toolbelt import BaseFetcher, Response
|
| 21 |
|
| 22 |
__FetcherClientInstance__ = _FetcherClient()
|
| 23 |
+
__AsyncFetcherClientInstance__ = _AsyncFetcherClient()
|
| 24 |
|
| 25 |
|
| 26 |
class Fetcher(BaseFetcher):
|
|
|
|
| 35 |
class AsyncFetcher(BaseFetcher):
|
| 36 |
"""A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`."""
|
| 37 |
|
| 38 |
+
get = __AsyncFetcherClientInstance__.get
|
| 39 |
+
post = __AsyncFetcherClientInstance__.post
|
| 40 |
+
put = __AsyncFetcherClientInstance__.put
|
| 41 |
+
delete = __AsyncFetcherClientInstance__.delete
|
| 42 |
|
| 43 |
|
| 44 |
class StealthyFetcher(BaseFetcher):
|