Karim shoair committed on
Commit ·
db64073
1
Parent(s): 62bd429
feat(shell): Add support to curl `-b` argument
Browse files - scrapling/core/shell.py +73 -31
scrapling/core/shell.py
CHANGED
|
@@ -35,6 +35,7 @@ from scrapling.fetchers import (
|
|
| 35 |
Response,
|
| 36 |
)
|
| 37 |
|
|
|
|
| 38 |
_known_logging_levels = {
|
| 39 |
"debug": DEBUG,
|
| 40 |
"info": INFO,
|
|
@@ -105,6 +106,13 @@ class CurlParser:
|
|
| 105 |
"-G", "--get", action="store_true"
|
| 106 |
) # Use GET and put data in URL
|
| 107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
# Proxy
|
| 109 |
_parser.add_argument("-x", "--proxy", default=None)
|
| 110 |
_parser.add_argument("-U", "--proxy-user", default=None) # Basic proxy auth
|
|
@@ -154,7 +162,7 @@ class CurlParser:
|
|
| 154 |
cookie_dict[key] = morsel.value
|
| 155 |
except Exception as e:
|
| 156 |
log.error(
|
| 157 |
-
f"Could not parse cookie string '{header_value}': {e}"
|
| 158 |
)
|
| 159 |
else:
|
| 160 |
header_dict[header_key] = header_value
|
|
@@ -210,6 +218,21 @@ class CurlParser:
|
|
| 210 |
|
| 211 |
headers, cookies = self.parse_headers(parsed_args.header)
|
| 212 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
# --- Process Data Payload ---
|
| 214 |
params = dict()
|
| 215 |
data_payload: Union[str, bytes, Dict, None] = None
|
|
@@ -316,7 +339,7 @@ class CurlParser:
|
|
| 316 |
follow_redirects=True, # Scrapling default is True
|
| 317 |
)
|
| 318 |
|
| 319 |
-
def convert2fetcher(self, curl_command: [Request, str]) -> Optional[Response]:
|
| 320 |
request = None
|
| 321 |
if isinstance(curl_command, (Request, str)):
|
| 322 |
request = (
|
|
@@ -324,37 +347,53 @@ class CurlParser:
|
|
| 324 |
if isinstance(curl_command, str)
|
| 325 |
else curl_command
|
| 326 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 327 |
request_args = request._asdict()
|
| 328 |
method = request_args.pop("method").strip().lower()
|
| 329 |
if method in self._supported_methods:
|
| 330 |
request_args["json"] = request_args.pop("json_data")
|
|
|
|
|
|
|
| 331 |
if method not in ("post", "put"):
|
| 332 |
-
_ = request_args.pop("data")
|
| 333 |
-
_ = request_args.pop("json")
|
| 334 |
|
| 335 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
else:
|
| 337 |
log.error(
|
| 338 |
f'Request method "{method}" isn\'t supported by Scrapling yet'
|
| 339 |
)
|
|
|
|
| 340 |
|
| 341 |
-
|
| 342 |
-
log.error(
|
| 343 |
-
"This class accepts `Request` objects only generated by the `uncurl` command or a curl command passed as string."
|
| 344 |
-
)
|
| 345 |
|
| 346 |
return None
|
| 347 |
|
| 348 |
|
| 349 |
-
def show_page_in_browser(page):
|
| 350 |
-
if not page:
|
| 351 |
log.error("Input must be of type `Adaptor`")
|
| 352 |
return
|
| 353 |
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
|
| 359 |
|
| 360 |
class CustomShell:
|
|
@@ -370,7 +409,7 @@ class CustomShell:
|
|
| 370 |
if _known_logging_levels.get(log_level):
|
| 371 |
self.log_level = _known_logging_levels[log_level]
|
| 372 |
else:
|
| 373 |
-
log.
|
| 374 |
self.log_level = DEBUG
|
| 375 |
|
| 376 |
self.shell = None
|
|
@@ -385,8 +424,8 @@ class CustomShell:
|
|
| 385 |
getLogger("scrapling").setLevel(self.log_level)
|
| 386 |
|
| 387 |
settings = Fetcher.display_config()
|
| 388 |
-
|
| 389 |
-
|
| 390 |
log.info(f"Scrapling {__version__} shell started")
|
| 391 |
log.info(f"Logging level is set to '{getLevelName(self.log_level)}'")
|
| 392 |
log.info(f"Fetchers' parsing settings: {settings}")
|
|
@@ -412,8 +451,8 @@ class CustomShell:
|
|
| 412 |
-> Useful commands
|
| 413 |
- {"page / response":<30} The response object of the last page you fetched
|
| 414 |
- {"pages":<30} Adaptors object of the last 5 response objects you fetched
|
| 415 |
-
- {"uncurl('curl_command')":<30} Convert
|
| 416 |
-
- {"curl2fetcher('curl_command')":<30} Convert
|
| 417 |
- {"view(page)":<30} View page in a browser
|
| 418 |
- {"help()":<30} Show this help message (Shell help)
|
| 419 |
|
|
@@ -423,15 +462,16 @@ Type 'exit' or press Ctrl+D to exit.
|
|
| 423 |
def update_page(self, result):
|
| 424 |
"""Update current page and add to pages history"""
|
| 425 |
self.page = result
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
self.pages
|
|
|
|
| 429 |
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
|
| 436 |
return result
|
| 437 |
|
|
@@ -497,9 +537,11 @@ Type 'exit' or press Ctrl+D to exit.
|
|
| 497 |
ipython_shell.user_ns.update(namespace)
|
| 498 |
# If a command was provided, execute it and exit
|
| 499 |
if self.code:
|
| 500 |
-
|
| 501 |
-
|
|
|
|
|
|
|
|
|
|
| 502 |
return
|
| 503 |
|
| 504 |
-
# Start the shell with our namespace
|
| 505 |
ipython_shell(local_ns=namespace)
|
|
|
|
| 35 |
Response,
|
| 36 |
)
|
| 37 |
|
| 38 |
+
|
| 39 |
_known_logging_levels = {
|
| 40 |
"debug": DEBUG,
|
| 41 |
"info": INFO,
|
|
|
|
| 106 |
"-G", "--get", action="store_true"
|
| 107 |
) # Use GET and put data in URL
|
| 108 |
|
| 109 |
+
_parser.add_argument(
|
| 110 |
+
"-b",
|
| 111 |
+
"--cookie",
|
| 112 |
+
default=None,
|
| 113 |
+
help="Send cookies from string/file (string format used by DevTools)",
|
| 114 |
+
)
|
| 115 |
+
|
| 116 |
# Proxy
|
| 117 |
_parser.add_argument("-x", "--proxy", default=None)
|
| 118 |
_parser.add_argument("-U", "--proxy-user", default=None) # Basic proxy auth
|
|
|
|
| 162 |
cookie_dict[key] = morsel.value
|
| 163 |
except Exception as e:
|
| 164 |
log.error(
|
| 165 |
+
f"Could not parse cookie string from -H '{header_value}': {e}"
|
| 166 |
)
|
| 167 |
else:
|
| 168 |
header_dict[header_key] = header_value
|
|
|
|
| 218 |
|
| 219 |
headers, cookies = self.parse_headers(parsed_args.header)
|
| 220 |
|
| 221 |
+
if parsed_args.cookie:
|
| 222 |
+
# We are focusing on the string format from DevTools.
|
| 223 |
+
try:
|
| 224 |
+
cookie_parser = Cookie.SimpleCookie()
|
| 225 |
+
cookie_parser.load(parsed_args.cookie)
|
| 226 |
+
for key, morsel in cookie_parser.items():
|
| 227 |
+
# Update the cookies dict, potentially overwriting
|
| 228 |
+
# cookies with the same name from -H 'Cookie:'
|
| 229 |
+
cookies[key] = morsel.value
|
| 230 |
+
log.debug(f"Parsed cookies from -b argument: {list(cookies.keys())}")
|
| 231 |
+
except Exception as e:
|
| 232 |
+
log.error(
|
| 233 |
+
f"Could not parse cookie string from -b '{parsed_args.cookie}': {e}"
|
| 234 |
+
)
|
| 235 |
+
|
| 236 |
# --- Process Data Payload ---
|
| 237 |
params = dict()
|
| 238 |
data_payload: Union[str, bytes, Dict, None] = None
|
|
|
|
| 339 |
follow_redirects=True, # Scrapling default is True
|
| 340 |
)
|
| 341 |
|
| 342 |
+
def convert2fetcher(self, curl_command: Union[Request, str]) -> Optional[Response]:
|
| 343 |
request = None
|
| 344 |
if isinstance(curl_command, (Request, str)):
|
| 345 |
request = (
|
|
|
|
| 347 |
if isinstance(curl_command, str)
|
| 348 |
else curl_command
|
| 349 |
)
|
| 350 |
+
|
| 351 |
+
# Ensure request parsing was successful before proceeding
|
| 352 |
+
if request is None:
|
| 353 |
+
log.error("Failed to parse curl command, cannot convert to fetcher.")
|
| 354 |
+
return None
|
| 355 |
+
|
| 356 |
request_args = request._asdict()
|
| 357 |
method = request_args.pop("method").strip().lower()
|
| 358 |
if method in self._supported_methods:
|
| 359 |
request_args["json"] = request_args.pop("json_data")
|
| 360 |
+
|
| 361 |
+
# Ensure data/json are removed for non-POST/PUT methods
|
| 362 |
if method not in ("post", "put"):
|
| 363 |
+
_ = request_args.pop("data", None)
|
| 364 |
+
_ = request_args.pop("json", None)
|
| 365 |
|
| 366 |
+
try:
|
| 367 |
+
return getattr(Fetcher, method)(**request_args)
|
| 368 |
+
except Exception as e:
|
| 369 |
+
log.error(f"Error calling Fetcher.{method}: {e}")
|
| 370 |
+
return None
|
| 371 |
else:
|
| 372 |
log.error(
|
| 373 |
f'Request method "{method}" isn\'t supported by Scrapling yet'
|
| 374 |
)
|
| 375 |
+
return None
|
| 376 |
|
| 377 |
+
else:
|
| 378 |
+
log.error("Input must be a valid curl command string or a Request object.")
|
|
|
|
|
|
|
| 379 |
|
| 380 |
return None
|
| 381 |
|
| 382 |
|
| 383 |
+
def show_page_in_browser(page: Adaptor):
|
| 384 |
+
if not page or not isinstance(page, Adaptor):
|
| 385 |
log.error("Input must be of type `Adaptor`")
|
| 386 |
return
|
| 387 |
|
| 388 |
+
try:
|
| 389 |
+
fd, fname = make_temp_file(".html")
|
| 390 |
+
os.write(fd, page.body.encode("utf-8"))
|
| 391 |
+
os.close(fd)
|
| 392 |
+
open_in_browser(f"file://{fname}")
|
| 393 |
+
except IOError as e:
|
| 394 |
+
log.error(f"Failed to write temporary file for viewing: {e}")
|
| 395 |
+
except Exception as e:
|
| 396 |
+
log.error(f"An unexpected error occurred while viewing the page: {e}")
|
| 397 |
|
| 398 |
|
| 399 |
class CustomShell:
|
|
|
|
| 409 |
if _known_logging_levels.get(log_level):
|
| 410 |
self.log_level = _known_logging_levels[log_level]
|
| 411 |
else:
|
| 412 |
+
log.warning(f'Unknown log level "{log_level}", defaulting to "DEBUG"')
|
| 413 |
self.log_level = DEBUG
|
| 414 |
|
| 415 |
self.shell = None
|
|
|
|
| 424 |
getLogger("scrapling").setLevel(self.log_level)
|
| 425 |
|
| 426 |
settings = Fetcher.display_config()
|
| 427 |
+
settings.pop("storage", None)
|
| 428 |
+
settings.pop("storage_args", None)
|
| 429 |
log.info(f"Scrapling {__version__} shell started")
|
| 430 |
log.info(f"Logging level is set to '{getLevelName(self.log_level)}'")
|
| 431 |
log.info(f"Fetchers' parsing settings: {settings}")
|
|
|
|
| 451 |
-> Useful commands
|
| 452 |
- {"page / response":<30} The response object of the last page you fetched
|
| 453 |
- {"pages":<30} Adaptors object of the last 5 response objects you fetched
|
| 454 |
+
- {"uncurl('curl_command')":<30} Convert curl command to a Request object. (Optimized to handle curl commands copied from DevTools network tab.)
|
| 455 |
+
- {"curl2fetcher('curl_command')":<30} Convert curl command and make the request with Fetcher. (Optimized to handle curl commands copied from DevTools network tab.)
|
| 456 |
- {"view(page)":<30} View page in a browser
|
| 457 |
- {"help()":<30} Show this help message (Shell help)
|
| 458 |
|
|
|
|
| 462 |
def update_page(self, result):
|
| 463 |
"""Update current page and add to pages history"""
|
| 464 |
self.page = result
|
| 465 |
+
if isinstance(result, (Response, Adaptor)):
|
| 466 |
+
self.pages.append(result)
|
| 467 |
+
if len(self.pages) > 5:
|
| 468 |
+
self.pages.pop(0) # Remove oldest item
|
| 469 |
|
| 470 |
+
# Update in IPython namespace too
|
| 471 |
+
if self.shell:
|
| 472 |
+
self.shell.user_ns["page"] = self.page
|
| 473 |
+
self.shell.user_ns["response"] = self.page
|
| 474 |
+
self.shell.user_ns["pages"] = self.pages
|
| 475 |
|
| 476 |
return result
|
| 477 |
|
|
|
|
| 537 |
ipython_shell.user_ns.update(namespace)
|
| 538 |
# If a command was provided, execute it and exit
|
| 539 |
if self.code:
|
| 540 |
+
log.info(f"Executing provided code: {self.code}")
|
| 541 |
+
try:
|
| 542 |
+
ipython_shell.run_cell(self.code, store_history=False)
|
| 543 |
+
except Exception as e:
|
| 544 |
+
log.error(f"Error executing initial code: {e}")
|
| 545 |
return
|
| 546 |
|
|
|
|
| 547 |
ipython_shell(local_ns=namespace)
|