Karim shoair committed on
Commit
2e15903
·
1 Parent(s): f3f8b20

feat(cli): Adding two new commands (uncurl/curl2fetcher)

Browse files
Files changed (2) hide show
  1. scrapling/cli.py +3 -3
  2. scrapling/core/shell.py +345 -16
scrapling/cli.py CHANGED
@@ -1,12 +1,10 @@
1
  import os
2
- import subprocess
3
  import sys
 
4
  from pathlib import Path
5
 
6
  import click
7
 
8
- from scrapling.core.shell import CustomShell
9
-
10
 
11
  def get_package_dir():
12
  return Path(os.path.dirname(__file__))
@@ -73,6 +71,8 @@ def install(force):
73
  help="Log level (default: DEBUG)",
74
  )
75
  def shell(code, level):
 
 
76
  console = CustomShell(code=code, log_level=level)
77
  console.start()
78
 
 
1
  import os
 
2
  import sys
3
+ import subprocess
4
  from pathlib import Path
5
 
6
  import click
7
 
 
 
8
 
9
  def get_package_dir():
10
  return Path(os.path.dirname(__file__))
 
71
  help="Log level (default: DEBUG)",
72
  )
73
  def shell(code, level):
74
+ from scrapling.core.shell import CustomShell
75
+
76
  console = CustomShell(code=code, log_level=level)
77
  console.start()
78
 
scrapling/core/shell.py CHANGED
@@ -1,36 +1,360 @@
 
1
  import os
2
- import logging
3
- import tempfile
4
- import webbrowser
5
  from functools import wraps
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  from IPython.terminal.embed import InteractiveShellEmbed
8
 
9
  from scrapling import __version__
10
  from scrapling.core.utils import log
11
  from scrapling.parser import Adaptor, Adaptors
12
- from scrapling.fetchers import Fetcher, AsyncFetcher, PlayWrightFetcher, StealthyFetcher
13
-
 
 
 
 
 
 
14
 
15
  _known_logging_levels = {
16
- "debug": logging.DEBUG,
17
- "info": logging.INFO,
18
- "warning": logging.WARNING,
19
- "error": logging.ERROR,
20
- "critical": logging.CRITICAL,
21
- "fatal": logging.FATAL,
22
  }
23
 
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  def show_page_in_browser(page):
26
  if not page:
27
  log.error("Input must be of type `Adaptor`")
28
  return
29
 
30
- fd, fname = tempfile.mkstemp(".html")
31
  os.write(fd, page.body.encode("utf-8"))
32
  os.close(fd)
33
- webbrowser.open(f"file://{fname}")
34
 
35
 
36
  class CustomShell:
@@ -40,13 +364,14 @@ class CustomShell:
40
  self.code = code
41
  self.page = None
42
  self.pages = Adaptors([])
 
43
  log_level = log_level.strip().lower()
44
 
45
  if _known_logging_levels.get(log_level):
46
  self.log_level = _known_logging_levels[log_level]
47
  else:
48
  log.error(f'Unknown log level "{log_level}", defaulting to "DEBUG"')
49
- self.log_level = logging.DEBUG
50
 
51
  self.shell = None
52
 
@@ -57,13 +382,13 @@ class CustomShell:
57
  """Initialize application components"""
58
  # This is where you'd set up your application-specific objects
59
  if self.log_level:
60
- logging.getLogger("scrapling").setLevel(self.log_level)
61
 
62
  settings = Fetcher.display_config()
63
  _ = settings.pop("storage")
64
  _ = settings.pop("storage_args")
65
  log.info(f"Scrapling {__version__} shell started")
66
- log.info(f"Logging level is set to '{logging.getLevelName(self.log_level)}'")
67
  log.info(f"Fetchers' parsing settings: {settings}")
68
 
69
  @staticmethod
@@ -87,6 +412,8 @@ class CustomShell:
87
  -> Useful commands
88
  - {"page / response":<30} The response object of the last page you fetched
89
  - {"pages":<30} Adaptors object of the last 5 response objects you fetched
 
 
90
  - {"view(page)":<30} View page in a browser
91
  - {"help()":<30} Show this help message (Shell help)
92
 
@@ -146,6 +473,8 @@ Type 'exit' or press Ctrl+D to exit.
146
  "response": self.page,
147
  "pages": self.pages,
148
  "view": show_page_in_browser,
 
 
149
  "help": self.show_help,
150
  }
151
 
 
1
+ # -*- coding: utf-8 -*-
2
  import os
3
+ import json
4
+ from sys import stderr
 
5
  from functools import wraps
6
+ from http import cookies as Cookie
7
+ from collections import namedtuple
8
+ from shlex import split as shlex_split
9
+ from tempfile import mkstemp as make_temp_file
10
+ from urllib.parse import urlparse, urlunparse, parse_qsl
11
+ from argparse import ArgumentParser, SUPPRESS
12
+ from webbrowser import open as open_in_browser
13
+ from logging import (
14
+ DEBUG,
15
+ INFO,
16
+ WARNING,
17
+ ERROR,
18
+ CRITICAL,
19
+ FATAL,
20
+ getLogger,
21
+ getLevelName,
22
+ )
23
 
24
  from IPython.terminal.embed import InteractiveShellEmbed
25
 
26
  from scrapling import __version__
27
  from scrapling.core.utils import log
28
  from scrapling.parser import Adaptor, Adaptors
29
+ from scrapling.core._types import List, Optional, Dict, Tuple, Any, Union
30
+ from scrapling.fetchers import (
31
+ Fetcher,
32
+ AsyncFetcher,
33
+ PlayWrightFetcher,
34
+ StealthyFetcher,
35
+ Response,
36
+ )
37
 
38
# Map user-supplied level names (lowercase) to the stdlib logging constants.
# NOTE: "fatal" and "critical" are the same numeric level in `logging`.
_known_logging_levels = dict(
    debug=DEBUG,
    info=INFO,
    warning=WARNING,
    error=ERROR,
    critical=CRITICAL,
    fatal=FATAL,
)
46
 
47
 
48
# Structured result of parsing a curl command — simplified down to the
# arguments a Fetcher call actually needs.
Request = namedtuple(
    "Request",
    [
        "method",
        "url",
        "params",
        "data",  # Can be str, bytes, or dict (for urlencoded)
        "json_data",  # Python object (dict/list) for JSON payload
        "headers",
        "cookies",
        "proxy",
        "follow_redirects",  # Added for -L flag
    ],
)
63
+
64
+
65
# argparse normally calls sys.exit() on bad input; that would kill the whole
# interactive shell. This subclass raises ValueError instead so callers can
# recover from a malformed curl command.
class NoExitArgumentParser(ArgumentParser):
    def error(self, message):
        """Log the parse problem and raise instead of printing usage + exiting."""
        error_text = f"Curl arguments parsing error: {message}"
        log.error(error_text)
        raise ValueError(error_text)

    def exit(self, status=0, message=None):
        """Replace argparse's sys.exit() with a recoverable ValueError."""
        if message:
            log.error(f"Scrapling shell exited with status {status}: {message}")
            self._print_message(message, stderr)
        raise ValueError(
            f"Scrapling shell exited with status {status}: {message or 'Unknown reason'}"
        )
78
+
79
+
80
class CurlParser:
    """Parses curl commands into `Request` objects and replays them with `Fetcher`.

    Uses argparse (not regex) to parse the command, and focuses on the flags
    that show up in curl commands copied from a browser DevTools' network tab.
    """

    def __init__(self):
        # We will use argparse parser to parse the curl command directly instead of regex
        # We will focus more on flags that will show up on curl commands copied from DevTools's network tab
        _parser = NoExitArgumentParser(add_help=False)  # Disable default help
        # Basic curl arguments
        _parser.add_argument("curl_command_placeholder", nargs="?", help=SUPPRESS)
        _parser.add_argument("url")
        _parser.add_argument("-X", "--request", dest="method", default=None)
        _parser.add_argument("-H", "--header", action="append", default=[])
        _parser.add_argument(
            "-A", "--user-agent", help="Will be parsed from -H if present"
        )  # Note: DevTools usually includes this in -H

        # Data arguments (prioritizing types common from DevTools)
        _parser.add_argument("-d", "--data", default=None)
        _parser.add_argument(
            "--data-raw", default=None
        )  # Often used by browsers for JSON body
        _parser.add_argument("--data-binary", default=None)
        # Keep urlencode for completeness, though less common from browser copy/paste
        _parser.add_argument("--data-urlencode", action="append", default=[])
        _parser.add_argument(
            "-G", "--get", action="store_true"
        )  # Use GET and put data in URL

        # Proxy
        _parser.add_argument("-x", "--proxy", default=None)
        _parser.add_argument("-U", "--proxy-user", default=None)  # Basic proxy auth

        # Connection/Security
        _parser.add_argument("-k", "--insecure", action="store_true")
        _parser.add_argument(
            "--compressed", action="store_true"
        )  # Very common from browsers

        # Other flags often included but may not map directly to request args
        _parser.add_argument("-i", "--include", action="store_true")
        _parser.add_argument("-s", "--silent", action="store_true")
        _parser.add_argument("-v", "--verbose", action="store_true")

        self.parser: NoExitArgumentParser = _parser
        self._supported_methods = ("get", "post", "put", "delete")

    # --- Helper Functions ---
    @staticmethod
    def parse_headers(header_lines: List[str]) -> Tuple[Dict[str, str], Dict[str, str]]:
        """Parses -H headers into separate header and cookie dictionaries.

        Cookie headers are expanded into individual name/value pairs via
        `http.cookies.SimpleCookie`; everything else goes into the header dict.
        """
        header_dict = dict()
        cookie_dict = dict()

        for header_line in header_lines:
            if ":" not in header_line:
                # curl's "Header;" syntax means "send this header with an empty value"
                if header_line.endswith(";"):
                    header_key = header_line[:-1].strip()
                    header_value = ""
                    header_dict[header_key] = header_value
                else:
                    log.warning(
                        f"Could not parse header without colon: '{header_line}', skipping."
                    )
                continue
            else:
                header_key, header_value = header_line.split(":", 1)
                header_key = header_key.strip()
                header_value = header_value.strip()

                if header_key.lower() == "cookie":
                    try:
                        cookie_parser = Cookie.SimpleCookie()
                        cookie_parser.load(header_value)
                        for key, morsel in cookie_parser.items():
                            cookie_dict[key] = morsel.value
                    except Exception as e:
                        log.error(
                            f"Could not parse cookie string '{header_value}': {e}"
                        )
                else:
                    header_dict[header_key] = header_value

        return header_dict, cookie_dict

    # --- Main Parsing Logic ---
    def parse(self, curl_command: str) -> Optional[Request]:
        """Parses the curl command string into a structured context for Fetcher.

        Returns a `Request` namedtuple, or None when the command can't be split
        or parsed (the reason is logged).
        """
        clean_command = curl_command.strip()
        # Remove the leading "curl" program name if present.
        # NOTE: `str.lstrip("curl")` would strip the CHARACTER SET {c,u,r,l}
        # and could eat the start of whatever follows — use a prefix check.
        if clean_command.startswith("curl"):
            clean_command = clean_command[len("curl"):]
        clean_command = clean_command.strip().replace("\\\n", " ")

        try:
            tokens = shlex_split(
                clean_command
            )  # Split the string using shell-like syntax
        except ValueError as e:
            log.error(f"Could not split command line: {e}")
            return None

        try:
            parsed_args, unknown = self.parser.parse_known_args(tokens)
            if unknown:
                log.warning(f"Ignored unknown curl arguments: {unknown}")

        except ValueError:
            # NoExitArgumentParser already logged the argparse error
            return None

        except Exception as e:
            log.error(
                f"An unexpected error occurred during curl arguments parsing: {e}"
            )
            return None

        # --- Determine Method ---
        method = "get"  # Default
        if parsed_args.get:  # -G forces GET
            method = "get"

        elif parsed_args.method:
            method = parsed_args.method.strip().lower()

        # Infer POST if data is present (unless overridden by -X or -G)
        elif any(
            [
                parsed_args.data,
                parsed_args.data_raw,
                parsed_args.data_binary,
                parsed_args.data_urlencode,
            ]
        ):
            method = "post"

        headers, cookies = self.parse_headers(parsed_args.header)

        # --- Process Data Payload ---
        params = dict()
        data_payload: Union[str, bytes, Dict, None] = None
        json_payload: Optional[Any] = None

        # DevTools often uses --data-raw for JSON bodies
        # Precedence: --data-binary > --data-raw / -d > --data-urlencode
        if parsed_args.data_binary is not None:
            try:
                data_payload = parsed_args.data_binary.encode("utf-8")
                log.debug("Using data from --data-binary as bytes.")
            except Exception as e:
                log.warning(
                    f"Could not encode binary data '{parsed_args.data_binary}' as bytes: {e}. Using raw string."
                )
                data_payload = parsed_args.data_binary  # Fallback to string

        elif parsed_args.data_raw is not None:
            data_payload = parsed_args.data_raw

        elif parsed_args.data is not None:
            data_payload = parsed_args.data

        elif parsed_args.data_urlencode:
            # Combine and parse urlencoded data
            combined_data = "&".join(parsed_args.data_urlencode)
            try:
                data_payload = dict(parse_qsl(combined_data, keep_blank_values=True))
            except Exception as e:
                log.warning(
                    f"Could not parse urlencoded data '{combined_data}': {e}. Treating as raw string."
                )
                data_payload = combined_data

        # Check if raw data looks like JSON, prefer 'json' param if so
        if isinstance(data_payload, str):
            try:
                maybe_json = json.loads(data_payload)
                if isinstance(maybe_json, (dict, list)):
                    json_payload = maybe_json
                    data_payload = None
            except json.JSONDecodeError:
                pass  # Not JSON, keep it in data_payload

        # Handle -G: Move data to params if method is GET
        if method == "get" and data_payload:
            if isinstance(data_payload, dict):  # From --data-urlencode likely
                params.update(data_payload)
            elif isinstance(data_payload, str):
                try:
                    params.update(dict(parse_qsl(data_payload, keep_blank_values=True)))
                except ValueError:
                    log.warning(
                        f"Could not parse data '{data_payload}' into GET parameters for -G."
                    )

            if params:
                data_payload = None  # Clear data as it's moved to params
                json_payload = None  # Should not have JSON body with -G

        # --- Process Proxy ---
        proxies: Optional[Dict[str, str]] = None
        if parsed_args.proxy:
            proxy_url = (
                f"http://{parsed_args.proxy}"
                if "://" not in parsed_args.proxy
                else parsed_args.proxy
            )

            if parsed_args.proxy_user:
                # Inject (or replace) userinfo in the proxy URL's netloc
                user_pass = parsed_args.proxy_user
                parts = urlparse(proxy_url)
                netloc_parts = parts.netloc.split("@")
                netloc = (
                    f"{user_pass}@{netloc_parts[-1]}"
                    if len(netloc_parts) > 1
                    else f"{user_pass}@{parts.netloc}"
                )
                proxy_url = urlunparse(
                    (
                        parts.scheme,
                        netloc,
                        parts.path,
                        parts.params,
                        parts.query,
                        parts.fragment,
                    )
                )

            # Standard proxy dict format
            proxies = {"http": proxy_url, "https": proxy_url}
            log.debug(f"Using proxy configuration: {proxies}")

        # --- Final Context ---
        return Request(
            method=method,
            url=parsed_args.url,
            params=params,
            data=data_payload,
            json_data=json_payload,
            headers=headers,
            cookies=cookies,
            proxy=proxies,
            follow_redirects=True,  # Scrapling default is True
        )

    def convert2fetcher(self, curl_command: Union[Request, str]) -> Optional[Response]:
        """Executes a curl command (or a pre-parsed `Request`) through `Fetcher`.

        Returns the fetched `Response`, or None when the input is invalid,
        parsing fails, or the HTTP method isn't supported.
        """
        if not isinstance(curl_command, (Request, str)):
            log.error(
                "This class accepts `Request` objects only generated by the `uncurl` command or a curl command passed as string."
            )
            return None

        request = (
            self.parse(curl_command)
            if isinstance(curl_command, str)
            else curl_command
        )
        if request is None:
            # `parse` failed and already logged the reason; previously this
            # case crashed with AttributeError on `request._asdict()`.
            return None

        request_args = request._asdict()
        method = request_args.pop("method").strip().lower()
        if method not in self._supported_methods:
            log.error(
                f'Request method "{method}" isn\'t supported by Scrapling yet'
            )
            return None

        request_args["json"] = request_args.pop("json_data")
        if method not in ("post", "put"):
            # Body arguments are only meaningful for methods that send a body
            _ = request_args.pop("data")
            _ = request_args.pop("json")

        return getattr(Fetcher, method)(**request_args)
347
+
348
+
349
def show_page_in_browser(page):
    """Dumps the page's HTML body to a temp file and opens it in the default browser.

    Logs an error and returns None when `page` is not an `Adaptor`.
    """
    # The original truthiness check (`if not page`) contradicted the error
    # message below and rejected valid-but-falsy pages; check the type instead.
    if not isinstance(page, Adaptor):
        log.error("Input must be of type `Adaptor`")
        return

    fd, fname = make_temp_file(".html")
    try:
        # `page.body` is assumed to be a str of HTML — TODO confirm against Adaptor
        os.write(fd, page.body.encode("utf-8"))
    finally:
        # Close the descriptor even if encoding/writing fails (it leaked before)
        os.close(fd)
    open_in_browser(f"file://{fname}")
358
 
359
 
360
  class CustomShell:
 
364
  self.code = code
365
  self.page = None
366
  self.pages = Adaptors([])
367
+ self._curl_parser = CurlParser()
368
  log_level = log_level.strip().lower()
369
 
370
  if _known_logging_levels.get(log_level):
371
  self.log_level = _known_logging_levels[log_level]
372
  else:
373
  log.error(f'Unknown log level "{log_level}", defaulting to "DEBUG"')
374
+ self.log_level = DEBUG
375
 
376
  self.shell = None
377
 
 
382
  """Initialize application components"""
383
  # This is where you'd set up your application-specific objects
384
  if self.log_level:
385
+ getLogger("scrapling").setLevel(self.log_level)
386
 
387
  settings = Fetcher.display_config()
388
  _ = settings.pop("storage")
389
  _ = settings.pop("storage_args")
390
  log.info(f"Scrapling {__version__} shell started")
391
+ log.info(f"Logging level is set to '{getLevelName(self.log_level)}'")
392
  log.info(f"Fetchers' parsing settings: {settings}")
393
 
394
  @staticmethod
 
412
  -> Useful commands
413
  - {"page / response":<30} The response object of the last page you fetched
414
  - {"pages":<30} Adaptors object of the last 5 response objects you fetched
415
+ - {"uncurl('curl_command')":<30} Convert a curl command to a Fetcher's request and return the Request object for you. (Optimized to handle curl commands copied from DevTools network tab.)
416
+ - {"curl2fetcher('curl_command')":<30} Convert a curl command to a Fetcher's request and execute it. (Optimized to handle curl commands copied from DevTools network tab.)
417
  - {"view(page)":<30} View page in a browser
418
  - {"help()":<30} Show this help message (Shell help)
419
 
 
473
  "response": self.page,
474
  "pages": self.pages,
475
  "view": show_page_in_browser,
476
+ "uncurl": self._curl_parser.parse,
477
+ "curl2fetcher": self._curl_parser.convert2fetcher,
478
  "help": self.show_help,
479
  }
480