Karim shoair committed on
Commit ·
b088bed
1
Parent(s): ce80cfd
refactor: Optimizations to CLI
Browse files- scrapling/cli.py +10 -30
- scrapling/core/shell.py +41 -36
scrapling/cli.py
CHANGED
|
@@ -3,7 +3,7 @@ from subprocess import check_output
|
|
| 3 |
from sys import executable as python_executable
|
| 4 |
|
| 5 |
from scrapling.core.utils import log
|
| 6 |
-
from scrapling.core.shell import Convertor, _CookieParser
|
| 7 |
from scrapling.fetchers import Fetcher, DynamicFetcher, StealthyFetcher
|
| 8 |
|
| 9 |
from orjson import loads as json_loads, JSONDecodeError
|
|
@@ -22,31 +22,6 @@ def run_command(cmd, line):
|
|
| 22 |
# I meant to not use try except here
|
| 23 |
|
| 24 |
|
| 25 |
-
def parse_headers(header_strings):
|
| 26 |
-
"""Parse header strings into a dictionary"""
|
| 27 |
-
headers = {}
|
| 28 |
-
for header in header_strings:
|
| 29 |
-
if ":" in header:
|
| 30 |
-
key, value = header.split(":", 1)
|
| 31 |
-
headers[key.strip()] = value.strip()
|
| 32 |
-
else:
|
| 33 |
-
log.warning(f"Invalid header format '{header}', should be 'Key: Value'")
|
| 34 |
-
return headers
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
def parse_cookies(cookie_string):
|
| 38 |
-
"""Parse cookie string into a dictionary"""
|
| 39 |
-
if not cookie_string:
|
| 40 |
-
return {}
|
| 41 |
-
|
| 42 |
-
try:
|
| 43 |
-
cookies = {key: value for key, value in _CookieParser(cookie_string)}
|
| 44 |
-
except Exception as e:
|
| 45 |
-
raise ValueError(f"Could not parse cookies '{cookie_string}': {e}")
|
| 46 |
-
|
| 47 |
-
return cookies
|
| 48 |
-
|
| 49 |
-
|
| 50 |
def parse_json_data(json_string):
|
| 51 |
"""Parse JSON string into a Python object"""
|
| 52 |
if not json_string:
|
|
@@ -140,8 +115,13 @@ def shell(code, level):
|
|
| 140 |
|
| 141 |
def parse_extract_arguments(headers, cookies, params, json=None):
|
| 142 |
"""Parse arguments for extract command"""
|
| 143 |
-
parsed_headers =
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
parsed_json = parse_json_data(json)
|
| 146 |
parsed_params = {}
|
| 147 |
for param in params:
|
|
@@ -673,7 +653,7 @@ def fetch(
|
|
| 673 |
"""
|
| 674 |
|
| 675 |
# Parse parameters
|
| 676 |
-
parsed_headers =
|
| 677 |
|
| 678 |
# Build request arguments
|
| 679 |
kwargs = {
|
|
@@ -821,7 +801,7 @@ def stealthy_fetch(
|
|
| 821 |
"""
|
| 822 |
|
| 823 |
# Parse parameters
|
| 824 |
-
parsed_headers =
|
| 825 |
|
| 826 |
# Build request arguments
|
| 827 |
kwargs = {
|
|
|
|
| 3 |
from sys import executable as python_executable
|
| 4 |
|
| 5 |
from scrapling.core.utils import log
|
| 6 |
+
from scrapling.core.shell import Convertor, _CookieParser, _ParseHeaders
|
| 7 |
from scrapling.fetchers import Fetcher, DynamicFetcher, StealthyFetcher
|
| 8 |
|
| 9 |
from orjson import loads as json_loads, JSONDecodeError
|
|
|
|
| 22 |
# I meant to not use try except here
|
| 23 |
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
def parse_json_data(json_string):
|
| 26 |
"""Parse JSON string into a Python object"""
|
| 27 |
if not json_string:
|
|
|
|
| 115 |
|
| 116 |
def parse_extract_arguments(headers, cookies, params, json=None):
|
| 117 |
"""Parse arguments for extract command"""
|
| 118 |
+
parsed_headers, parsed_cookies = _ParseHeaders(headers)
|
| 119 |
+
for key, value in _CookieParser(cookies):
|
| 120 |
+
try:
|
| 121 |
+
parsed_cookies[key] = value
|
| 122 |
+
except Exception as e:
|
| 123 |
+
raise ValueError(f"Could not parse cookies '{cookies}': {e}")
|
| 124 |
+
|
| 125 |
parsed_json = parse_json_data(json)
|
| 126 |
parsed_params = {}
|
| 127 |
for param in params:
|
|
|
|
| 653 |
"""
|
| 654 |
|
| 655 |
# Parse parameters
|
| 656 |
+
parsed_headers, _ = _ParseHeaders(extra_headers, False)
|
| 657 |
|
| 658 |
# Build request arguments
|
| 659 |
kwargs = {
|
|
|
|
| 801 |
"""
|
| 802 |
|
| 803 |
# Parse parameters
|
| 804 |
+
parsed_headers, _ = _ParseHeaders(extra_headers, False)
|
| 805 |
|
| 806 |
# Build request arguments
|
| 807 |
kwargs = {
|
scrapling/core/shell.py
CHANGED
|
@@ -73,6 +73,46 @@ def _CookieParser(cookie_string):
|
|
| 73 |
yield key, morsel.value
|
| 74 |
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
# Suppress exit on error to handle parsing errors gracefully
|
| 77 |
class NoExitArgumentParser(ArgumentParser):
|
| 78 |
def error(self, message):
|
|
@@ -142,41 +182,6 @@ class CurlParser:
|
|
| 142 |
self._supported_methods = ("get", "post", "put", "delete")
|
| 143 |
|
| 144 |
# --- Helper Functions ---
|
| 145 |
-
@staticmethod
|
| 146 |
-
def parse_headers(header_lines: List[str]) -> Tuple[Dict[str, str], Dict[str, str]]:
|
| 147 |
-
"""Parses -H headers into separate header and cookie dictionaries."""
|
| 148 |
-
header_dict = dict()
|
| 149 |
-
cookie_dict = dict()
|
| 150 |
-
|
| 151 |
-
for header_line in header_lines:
|
| 152 |
-
if ":" not in header_line:
|
| 153 |
-
if header_line.endswith(";"):
|
| 154 |
-
header_key = header_line[:-1].strip()
|
| 155 |
-
header_value = ""
|
| 156 |
-
header_dict[header_key] = header_value
|
| 157 |
-
else:
|
| 158 |
-
log.warning(
|
| 159 |
-
f"Could not parse header without colon: '{header_line}', skipping."
|
| 160 |
-
)
|
| 161 |
-
continue
|
| 162 |
-
else:
|
| 163 |
-
header_key, header_value = header_line.split(":", 1)
|
| 164 |
-
header_key = header_key.strip()
|
| 165 |
-
header_value = header_value.strip()
|
| 166 |
-
|
| 167 |
-
if header_key.lower() == "cookie":
|
| 168 |
-
try:
|
| 169 |
-
cookie_dict = {
|
| 170 |
-
key: value for key, value in _CookieParser(header_value)
|
| 171 |
-
}
|
| 172 |
-
except Exception as e:
|
| 173 |
-
raise ValueError(
|
| 174 |
-
f"Could not parse cookie string from -H '{header_value}': {e}"
|
| 175 |
-
)
|
| 176 |
-
else:
|
| 177 |
-
header_dict[header_key] = header_value
|
| 178 |
-
|
| 179 |
-
return header_dict, cookie_dict
|
| 180 |
|
| 181 |
# --- Main Parsing Logic ---
|
| 182 |
def parse(self, curl_command: str) -> Optional[Request]:
|
|
@@ -225,7 +230,7 @@ class CurlParser:
|
|
| 225 |
):
|
| 226 |
method = "post"
|
| 227 |
|
| 228 |
-
headers, cookies =
|
| 229 |
|
| 230 |
if parsed_args.cookie:
|
| 231 |
# We are focusing on the string format from DevTools.
|
|
|
|
| 73 |
yield key, morsel.value
|
| 74 |
|
| 75 |
|
| 76 |
+
def _ParseHeaders(
    header_lines: List[str], parse_cookies: bool = True
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """Split raw header lines into a header dictionary and a cookie dictionary.

    Each line is expected in the ``Key: Value`` form; the key and the value are
    stripped of surrounding whitespace and only the first colon separates them.
    A line that has no colon but ends with ``;`` (for example ``X-Custom;``) is
    stored as a header with an empty value.

    :param header_lines: Raw header strings. ``None`` or an empty sequence
        yields two empty dictionaries.
    :param parse_cookies: When true, every ``Cookie`` header (matched
        case-insensitively) is parsed with ``_CookieParser`` and merged into the
        cookie dictionary instead of being kept as a header. When false, such a
        header is stored like any other header and the cookie dictionary stays empty.
    :return: A ``(headers, cookies)`` tuple of dictionaries.
    :raises ValueError: If a line has neither a colon nor a trailing ``;``, or
        if the value of a ``Cookie`` header cannot be parsed.
    """
    header_dict: Dict[str, str] = {}
    cookie_dict: Dict[str, str] = {}

    for header_line in header_lines or ():
        if ":" not in header_line:
            if not header_line.endswith(";"):
                raise ValueError(
                    f"Could not parse header without colon: '{header_line}'."
                )
            # "Key;" form: a header explicitly sent with an empty value.
            header_dict[header_line[:-1].strip()] = ""
            continue

        # Split on the first colon only, so values may themselves contain colons.
        header_key, header_value = header_line.split(":", 1)
        header_key = header_key.strip()
        header_value = header_value.strip()

        if parse_cookies and header_key.lower() == "cookie":
            try:
                # Merge rather than rebind, so several Cookie headers accumulate
                # instead of the last one discarding the earlier ones.
                cookie_dict.update(_CookieParser(header_value))
            except Exception as e:
                raise ValueError(
                    f"Could not parse cookie string from header '{header_value}': {e}"
                ) from e
        else:
            header_dict[header_key] = header_value

    return header_dict, cookie_dict
|
| 114 |
+
|
| 115 |
+
|
| 116 |
# Suppress exit on error to handle parsing errors gracefully
|
| 117 |
class NoExitArgumentParser(ArgumentParser):
|
| 118 |
def error(self, message):
|
|
|
|
| 182 |
self._supported_methods = ("get", "post", "put", "delete")
|
| 183 |
|
| 184 |
# --- Helper Functions ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
# --- Main Parsing Logic ---
|
| 187 |
def parse(self, curl_command: str) -> Optional[Request]:
|
|
|
|
| 230 |
):
|
| 231 |
method = "post"
|
| 232 |
|
| 233 |
+
headers, cookies = _ParseHeaders(parsed_args.header)
|
| 234 |
|
| 235 |
if parsed_args.cookie:
|
| 236 |
# We are focusing on the string format from DevTools.
|