Karim shoair committed on
Commit
db64073
·
1 Parent(s): 62bd429

feat(shell): Add support for curl `-b` argument

Browse files
Files changed (1) hide show
  1. scrapling/core/shell.py +73 -31
scrapling/core/shell.py CHANGED
@@ -35,6 +35,7 @@ from scrapling.fetchers import (
35
  Response,
36
  )
37
 
 
38
  _known_logging_levels = {
39
  "debug": DEBUG,
40
  "info": INFO,
@@ -105,6 +106,13 @@ class CurlParser:
105
  "-G", "--get", action="store_true"
106
  ) # Use GET and put data in URL
107
 
 
 
 
 
 
 
 
108
  # Proxy
109
  _parser.add_argument("-x", "--proxy", default=None)
110
  _parser.add_argument("-U", "--proxy-user", default=None) # Basic proxy auth
@@ -154,7 +162,7 @@ class CurlParser:
154
  cookie_dict[key] = morsel.value
155
  except Exception as e:
156
  log.error(
157
- f"Could not parse cookie string '{header_value}': {e}"
158
  )
159
  else:
160
  header_dict[header_key] = header_value
@@ -210,6 +218,21 @@ class CurlParser:
210
 
211
  headers, cookies = self.parse_headers(parsed_args.header)
212
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  # --- Process Data Payload ---
214
  params = dict()
215
  data_payload: Union[str, bytes, Dict, None] = None
@@ -316,7 +339,7 @@ class CurlParser:
316
  follow_redirects=True, # Scrapling default is True
317
  )
318
 
319
- def convert2fetcher(self, curl_command: [Request, str]) -> Optional[Response]:
320
  request = None
321
  if isinstance(curl_command, (Request, str)):
322
  request = (
@@ -324,37 +347,53 @@ class CurlParser:
324
  if isinstance(curl_command, str)
325
  else curl_command
326
  )
 
 
 
 
 
 
327
  request_args = request._asdict()
328
  method = request_args.pop("method").strip().lower()
329
  if method in self._supported_methods:
330
  request_args["json"] = request_args.pop("json_data")
 
 
331
  if method not in ("post", "put"):
332
- _ = request_args.pop("data")
333
- _ = request_args.pop("json")
334
 
335
- return getattr(Fetcher, method)(**request_args)
 
 
 
 
336
  else:
337
  log.error(
338
  f'Request method "{method}" isn\'t supported by Scrapling yet'
339
  )
 
340
 
341
- if request is None:
342
- log.error(
343
- "This class accepts `Request` objects only generated by the `uncurl` command or a curl command passed as string."
344
- )
345
 
346
  return None
347
 
348
 
349
- def show_page_in_browser(page):
350
- if not page:
351
  log.error("Input must be of type `Adaptor`")
352
  return
353
 
354
- fd, fname = make_temp_file(".html")
355
- os.write(fd, page.body.encode("utf-8"))
356
- os.close(fd)
357
- open_in_browser(f"file://{fname}")
 
 
 
 
 
358
 
359
 
360
  class CustomShell:
@@ -370,7 +409,7 @@ class CustomShell:
370
  if _known_logging_levels.get(log_level):
371
  self.log_level = _known_logging_levels[log_level]
372
  else:
373
- log.error(f'Unknown log level "{log_level}", defaulting to "DEBUG"')
374
  self.log_level = DEBUG
375
 
376
  self.shell = None
@@ -385,8 +424,8 @@ class CustomShell:
385
  getLogger("scrapling").setLevel(self.log_level)
386
 
387
  settings = Fetcher.display_config()
388
- _ = settings.pop("storage")
389
- _ = settings.pop("storage_args")
390
  log.info(f"Scrapling {__version__} shell started")
391
  log.info(f"Logging level is set to '{getLevelName(self.log_level)}'")
392
  log.info(f"Fetchers' parsing settings: {settings}")
@@ -412,8 +451,8 @@ class CustomShell:
412
  -> Useful commands
413
  - {"page / response":<30} The response object of the last page you fetched
414
  - {"pages":<30} Adaptors object of the last 5 response objects you fetched
415
- - {"uncurl('curl_command')":<30} Convert a curl command to a Fetcher's request and return the Request object for you. (Optimized to handle curl commands copied from DevTools network tab.)
416
- - {"curl2fetcher('curl_command')":<30} Convert a curl command to a Fetcher's request and execute it. (Optimized to handle curl commands copied from DevTools network tab.)
417
  - {"view(page)":<30} View page in a browser
418
  - {"help()":<30} Show this help message (Shell help)
419
 
@@ -423,15 +462,16 @@ Type 'exit' or press Ctrl+D to exit.
423
  def update_page(self, result):
424
  """Update current page and add to pages history"""
425
  self.page = result
426
- self.pages.append(result)
427
- if len(self.pages) > 5:
428
- self.pages.pop(0) # Remove oldest item
 
429
 
430
- # Update in IPython namespace too
431
- if self.shell:
432
- self.shell.user_ns["page"] = self.page
433
- self.shell.user_ns["response"] = self.page
434
- self.shell.user_ns["pages"] = self.pages
435
 
436
  return result
437
 
@@ -497,9 +537,11 @@ Type 'exit' or press Ctrl+D to exit.
497
  ipython_shell.user_ns.update(namespace)
498
  # If a command was provided, execute it and exit
499
  if self.code:
500
- # Execute the command in the namespace
501
- ipython_shell.run_cell(self.code, store_history=False)
 
 
 
502
  return
503
 
504
- # Start the shell with our namespace
505
  ipython_shell(local_ns=namespace)
 
35
  Response,
36
  )
37
 
38
+
39
  _known_logging_levels = {
40
  "debug": DEBUG,
41
  "info": INFO,
 
106
  "-G", "--get", action="store_true"
107
  ) # Use GET and put data in URL
108
 
109
+ _parser.add_argument(
110
+ "-b",
111
+ "--cookie",
112
+ default=None,
113
+ help="Send cookies from string/file (string format used by DevTools)",
114
+ )
115
+
116
  # Proxy
117
  _parser.add_argument("-x", "--proxy", default=None)
118
  _parser.add_argument("-U", "--proxy-user", default=None) # Basic proxy auth
 
162
  cookie_dict[key] = morsel.value
163
  except Exception as e:
164
  log.error(
165
+ f"Could not parse cookie string from -H '{header_value}': {e}"
166
  )
167
  else:
168
  header_dict[header_key] = header_value
 
218
 
219
  headers, cookies = self.parse_headers(parsed_args.header)
220
 
221
+ if parsed_args.cookie:
222
+ # We are focusing on the string format from DevTools.
223
+ try:
224
+ cookie_parser = Cookie.SimpleCookie()
225
+ cookie_parser.load(parsed_args.cookie)
226
+ for key, morsel in cookie_parser.items():
227
+ # Update the cookies dict, potentially overwriting
228
+ # cookies with the same name from -H 'Cookie:'
229
+ cookies[key] = morsel.value
230
+ log.debug(f"Parsed cookies from -b argument: {list(cookies.keys())}")
231
+ except Exception as e:
232
+ log.error(
233
+ f"Could not parse cookie string from -b '{parsed_args.cookie}': {e}"
234
+ )
235
+
236
  # --- Process Data Payload ---
237
  params = dict()
238
  data_payload: Union[str, bytes, Dict, None] = None
 
339
  follow_redirects=True, # Scrapling default is True
340
  )
341
 
342
+ def convert2fetcher(self, curl_command: Union[Request, str]) -> Optional[Response]:
343
  request = None
344
  if isinstance(curl_command, (Request, str)):
345
  request = (
 
347
  if isinstance(curl_command, str)
348
  else curl_command
349
  )
350
+
351
+ # Ensure request parsing was successful before proceeding
352
+ if request is None:
353
+ log.error("Failed to parse curl command, cannot convert to fetcher.")
354
+ return None
355
+
356
  request_args = request._asdict()
357
  method = request_args.pop("method").strip().lower()
358
  if method in self._supported_methods:
359
  request_args["json"] = request_args.pop("json_data")
360
+
361
+ # Ensure data/json are removed for non-POST/PUT methods
362
  if method not in ("post", "put"):
363
+ _ = request_args.pop("data", None)
364
+ _ = request_args.pop("json", None)
365
 
366
+ try:
367
+ return getattr(Fetcher, method)(**request_args)
368
+ except Exception as e:
369
+ log.error(f"Error calling Fetcher.{method}: {e}")
370
+ return None
371
  else:
372
  log.error(
373
  f'Request method "{method}" isn\'t supported by Scrapling yet'
374
  )
375
+ return None
376
 
377
+ else:
378
+ log.error("Input must be a valid curl command string or a Request object.")
 
 
379
 
380
  return None
381
 
382
 
383
+ def show_page_in_browser(page: Adaptor):
384
+ if not page or not isinstance(page, Adaptor):
385
  log.error("Input must be of type `Adaptor`")
386
  return
387
 
388
+ try:
389
+ fd, fname = make_temp_file(".html")
390
+ os.write(fd, page.body.encode("utf-8"))
391
+ os.close(fd)
392
+ open_in_browser(f"file://{fname}")
393
+ except IOError as e:
394
+ log.error(f"Failed to write temporary file for viewing: {e}")
395
+ except Exception as e:
396
+ log.error(f"An unexpected error occurred while viewing the page: {e}")
397
 
398
 
399
  class CustomShell:
 
409
  if _known_logging_levels.get(log_level):
410
  self.log_level = _known_logging_levels[log_level]
411
  else:
412
+ log.warning(f'Unknown log level "{log_level}", defaulting to "DEBUG"')
413
  self.log_level = DEBUG
414
 
415
  self.shell = None
 
424
  getLogger("scrapling").setLevel(self.log_level)
425
 
426
  settings = Fetcher.display_config()
427
+ settings.pop("storage", None)
428
+ settings.pop("storage_args", None)
429
  log.info(f"Scrapling {__version__} shell started")
430
  log.info(f"Logging level is set to '{getLevelName(self.log_level)}'")
431
  log.info(f"Fetchers' parsing settings: {settings}")
 
451
  -> Useful commands
452
  - {"page / response":<30} The response object of the last page you fetched
453
  - {"pages":<30} Adaptors object of the last 5 response objects you fetched
454
+ - {"uncurl('curl_command')":<30} Convert curl command to a Request object. (Optimized to handle curl commands copied from DevTools network tab.)
455
+ - {"curl2fetcher('curl_command')":<30} Convert curl command and make the request with Fetcher. (Optimized to handle curl commands copied from DevTools network tab.)
456
  - {"view(page)":<30} View page in a browser
457
  - {"help()":<30} Show this help message (Shell help)
458
 
 
462
  def update_page(self, result):
463
  """Update current page and add to pages history"""
464
  self.page = result
465
+ if isinstance(result, (Response, Adaptor)):
466
+ self.pages.append(result)
467
+ if len(self.pages) > 5:
468
+ self.pages.pop(0) # Remove oldest item
469
 
470
+ # Update in IPython namespace too
471
+ if self.shell:
472
+ self.shell.user_ns["page"] = self.page
473
+ self.shell.user_ns["response"] = self.page
474
+ self.shell.user_ns["pages"] = self.pages
475
 
476
  return result
477
 
 
537
  ipython_shell.user_ns.update(namespace)
538
  # If a command was provided, execute it and exit
539
  if self.code:
540
+ log.info(f"Executing provided code: {self.code}")
541
+ try:
542
+ ipython_shell.run_cell(self.code, store_history=False)
543
+ except Exception as e:
544
+ log.error(f"Error executing initial code: {e}")
545
  return
546
 
 
547
  ipython_shell(local_ns=namespace)