Karim shoair committed on
Commit
b088bed
·
1 Parent(s): ce80cfd

refactor: Optimizations to CLI

Browse files
Files changed (2) hide show
  1. scrapling/cli.py +10 -30
  2. scrapling/core/shell.py +41 -36
scrapling/cli.py CHANGED
@@ -3,7 +3,7 @@ from subprocess import check_output
3
  from sys import executable as python_executable
4
 
5
  from scrapling.core.utils import log
6
- from scrapling.core.shell import Convertor, _CookieParser
7
  from scrapling.fetchers import Fetcher, DynamicFetcher, StealthyFetcher
8
 
9
  from orjson import loads as json_loads, JSONDecodeError
@@ -22,31 +22,6 @@ def run_command(cmd, line):
22
  # I meant to not use try except here
23
 
24
 
25
- def parse_headers(header_strings):
26
- """Parse header strings into a dictionary"""
27
- headers = {}
28
- for header in header_strings:
29
- if ":" in header:
30
- key, value = header.split(":", 1)
31
- headers[key.strip()] = value.strip()
32
- else:
33
- log.warning(f"Invalid header format '{header}', should be 'Key: Value'")
34
- return headers
35
-
36
-
37
- def parse_cookies(cookie_string):
38
- """Parse cookie string into a dictionary"""
39
- if not cookie_string:
40
- return {}
41
-
42
- try:
43
- cookies = {key: value for key, value in _CookieParser(cookie_string)}
44
- except Exception as e:
45
- raise ValueError(f"Could not parse cookies '{cookie_string}': {e}")
46
-
47
- return cookies
48
-
49
-
50
  def parse_json_data(json_string):
51
  """Parse JSON string into a Python object"""
52
  if not json_string:
@@ -140,8 +115,13 @@ def shell(code, level):
140
 
141
  def parse_extract_arguments(headers, cookies, params, json=None):
142
  """Parse arguments for extract command"""
143
- parsed_headers = parse_headers(headers)
144
- parsed_cookies = parse_cookies(cookies)
 
 
 
 
 
145
  parsed_json = parse_json_data(json)
146
  parsed_params = {}
147
  for param in params:
@@ -673,7 +653,7 @@ def fetch(
673
  """
674
 
675
  # Parse parameters
676
- parsed_headers = parse_headers(extra_headers)
677
 
678
  # Build request arguments
679
  kwargs = {
@@ -821,7 +801,7 @@ def stealthy_fetch(
821
  """
822
 
823
  # Parse parameters
824
- parsed_headers = parse_headers(extra_headers)
825
 
826
  # Build request arguments
827
  kwargs = {
 
3
  from sys import executable as python_executable
4
 
5
  from scrapling.core.utils import log
6
+ from scrapling.core.shell import Convertor, _CookieParser, _ParseHeaders
7
  from scrapling.fetchers import Fetcher, DynamicFetcher, StealthyFetcher
8
 
9
  from orjson import loads as json_loads, JSONDecodeError
 
22
  # I meant to not use try except here
23
 
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  def parse_json_data(json_string):
26
  """Parse JSON string into a Python object"""
27
  if not json_string:
 
115
 
116
  def parse_extract_arguments(headers, cookies, params, json=None):
117
  """Parse arguments for extract command"""
118
+ parsed_headers, parsed_cookies = _ParseHeaders(headers)
119
+ for key, value in _CookieParser(cookies):
120
+ try:
121
+ parsed_cookies[key] = value
122
+ except Exception as e:
123
+ raise ValueError(f"Could not parse cookies '{cookies}': {e}")
124
+
125
  parsed_json = parse_json_data(json)
126
  parsed_params = {}
127
  for param in params:
 
653
  """
654
 
655
  # Parse parameters
656
+ parsed_headers, _ = _ParseHeaders(extra_headers, False)
657
 
658
  # Build request arguments
659
  kwargs = {
 
801
  """
802
 
803
  # Parse parameters
804
+ parsed_headers, _ = _ParseHeaders(extra_headers, False)
805
 
806
  # Build request arguments
807
  kwargs = {
scrapling/core/shell.py CHANGED
@@ -73,6 +73,46 @@ def _CookieParser(cookie_string):
73
  yield key, morsel.value
74
 
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  # Suppress exit on error to handle parsing errors gracefully
77
  class NoExitArgumentParser(ArgumentParser):
78
  def error(self, message):
@@ -142,41 +182,6 @@ class CurlParser:
142
  self._supported_methods = ("get", "post", "put", "delete")
143
 
144
  # --- Helper Functions ---
145
- @staticmethod
146
- def parse_headers(header_lines: List[str]) -> Tuple[Dict[str, str], Dict[str, str]]:
147
- """Parses -H headers into separate header and cookie dictionaries."""
148
- header_dict = dict()
149
- cookie_dict = dict()
150
-
151
- for header_line in header_lines:
152
- if ":" not in header_line:
153
- if header_line.endswith(";"):
154
- header_key = header_line[:-1].strip()
155
- header_value = ""
156
- header_dict[header_key] = header_value
157
- else:
158
- log.warning(
159
- f"Could not parse header without colon: '{header_line}', skipping."
160
- )
161
- continue
162
- else:
163
- header_key, header_value = header_line.split(":", 1)
164
- header_key = header_key.strip()
165
- header_value = header_value.strip()
166
-
167
- if header_key.lower() == "cookie":
168
- try:
169
- cookie_dict = {
170
- key: value for key, value in _CookieParser(header_value)
171
- }
172
- except Exception as e:
173
- raise ValueError(
174
- f"Could not parse cookie string from -H '{header_value}': {e}"
175
- )
176
- else:
177
- header_dict[header_key] = header_value
178
-
179
- return header_dict, cookie_dict
180
 
181
  # --- Main Parsing Logic ---
182
  def parse(self, curl_command: str) -> Optional[Request]:
@@ -225,7 +230,7 @@ class CurlParser:
225
  ):
226
  method = "post"
227
 
228
- headers, cookies = self.parse_headers(parsed_args.header)
229
 
230
  if parsed_args.cookie:
231
  # We are focusing on the string format from DevTools.
 
73
  yield key, morsel.value
74
 
75
 
76
+ def _ParseHeaders(
77
+ header_lines: List[str], parse_cookies: bool = True
78
+ ) -> Tuple[Dict[str, str], Dict[str, str]]:
79
+ """Parses headers into separate header and cookie dictionaries."""
80
+ header_dict = dict()
81
+ cookie_dict = dict()
82
+
83
+ for header_line in header_lines:
84
+ if ":" not in header_line:
85
+ if header_line.endswith(";"):
86
+ header_key = header_line[:-1].strip()
87
+ header_value = ""
88
+ header_dict[header_key] = header_value
89
+ else:
90
+ raise ValueError(
91
+ f"Could not parse header without colon: '{header_line}'."
92
+ )
93
+ else:
94
+ header_key, header_value = header_line.split(":", 1)
95
+ header_key = header_key.strip()
96
+ header_value = header_value.strip()
97
+
98
+ if parse_cookies:
99
+ if header_key.lower() == "cookie":
100
+ try:
101
+ cookie_dict = {
102
+ key: value for key, value in _CookieParser(header_value)
103
+ }
104
+ except Exception as e:
105
+ raise ValueError(
106
+ f"Could not parse cookie string from header '{header_value}': {e}"
107
+ )
108
+ else:
109
+ header_dict[header_key] = header_value
110
+ else:
111
+ header_dict[header_key] = header_value
112
+
113
+ return header_dict, cookie_dict
114
+
115
+
116
  # Suppress exit on error to handle parsing errors gracefully
117
  class NoExitArgumentParser(ArgumentParser):
118
  def error(self, message):
 
182
  self._supported_methods = ("get", "post", "put", "delete")
183
 
184
  # --- Helper Functions ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
186
  # --- Main Parsing Logic ---
187
  def parse(self, curl_command: str) -> Optional[Request]:
 
230
  ):
231
  method = "post"
232
 
233
+ headers, cookies = _ParseHeaders(parsed_args.header)
234
 
235
  if parsed_args.cookie:
236
  # We are focusing on the string format from DevTools.