Karim shoair committed on
Commit ·
31f70c8
1
Parent(s): 2110f64
refactor(Adaptor): Cleaner approach to `find_similar` method
Browse files
This code is slower than before by about 2-5μs, but it's worth it.
- scrapling/parser.py +74 -63
scrapling/parser.py
CHANGED
|
@@ -1002,6 +1002,55 @@ class Adaptor(SelectorsGeneration):
|
|
| 1002 |
regex, default, replace_entities, clean_match, case_sensitive
|
| 1003 |
)
|
| 1004 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1005 |
def find_similar(
|
| 1006 |
self,
|
| 1007 |
similarity_threshold: float = 0.2,
|
|
@@ -1031,74 +1080,36 @@ class Adaptor(SelectorsGeneration):
|
|
| 1031 |
|
| 1032 |
:return: A ``Adaptors`` container of ``Adaptor`` objects or empty list
|
| 1033 |
"""
|
| 1034 |
-
|
| 1035 |
-
def get_attributes(element: html.HtmlElement) -> Dict:
|
| 1036 |
-
"""Return attributes dictionary without the ignored list"""
|
| 1037 |
-
return {
|
| 1038 |
-
k: v for k, v in element.attrib.items() if k not in ignore_attributes
|
| 1039 |
-
}
|
| 1040 |
-
|
| 1041 |
-
def are_alike(
|
| 1042 |
-
original: html.HtmlElement,
|
| 1043 |
-
original_attributes: Dict,
|
| 1044 |
-
candidate: html.HtmlElement,
|
| 1045 |
-
) -> bool:
|
| 1046 |
-
"""Calculate a score of how much these elements are alike and return True
|
| 1047 |
-
if the score is higher or equals the threshold"""
|
| 1048 |
-
candidate_attributes = (
|
| 1049 |
-
get_attributes(candidate) if ignore_attributes else candidate.attrib
|
| 1050 |
-
)
|
| 1051 |
-
score, checks = 0, 0
|
| 1052 |
-
|
| 1053 |
-
if original_attributes:
|
| 1054 |
-
score += sum(
|
| 1055 |
-
SequenceMatcher(None, v, candidate_attributes.get(k, "")).ratio()
|
| 1056 |
-
for k, v in original_attributes.items()
|
| 1057 |
-
)
|
| 1058 |
-
checks += len(candidate_attributes)
|
| 1059 |
-
else:
|
| 1060 |
-
if not candidate_attributes:
|
| 1061 |
-
# Both don't have attributes, this must mean something
|
| 1062 |
-
score += 1
|
| 1063 |
-
checks += 1
|
| 1064 |
-
|
| 1065 |
-
if match_text:
|
| 1066 |
-
score += SequenceMatcher(
|
| 1067 |
-
None,
|
| 1068 |
-
clean_spaces(original.text or ""),
|
| 1069 |
-
clean_spaces(candidate.text or ""),
|
| 1070 |
-
).ratio()
|
| 1071 |
-
checks += 1
|
| 1072 |
-
|
| 1073 |
-
if checks:
|
| 1074 |
-
return round(score / checks, 2) >= similarity_threshold
|
| 1075 |
-
return False
|
| 1076 |
-
|
| 1077 |
# We will use the elements' root from now on to get the speed boost of using Lxml directly
|
| 1078 |
root = self._root
|
| 1079 |
-
current_depth = len(list(root.iterancestors()))
|
| 1080 |
-
target_attrs = get_attributes(root) if ignore_attributes else root.attrib
|
| 1081 |
similar_elements = list()
|
| 1082 |
-
|
| 1083 |
-
|
| 1084 |
-
|
| 1085 |
-
|
| 1086 |
-
if
|
| 1087 |
-
|
| 1088 |
-
|
| 1089 |
-
|
| 1090 |
-
|
| 1091 |
-
|
| 1092 |
-
|
| 1093 |
-
|
| 1094 |
-
|
| 1095 |
-
|
| 1096 |
-
|
| 1097 |
-
|
|
|
|
|
|
|
| 1098 |
|
| 1099 |
for potential_match in potential_matches:
|
| 1100 |
-
if potential_match != root and
|
| 1101 |
-
root,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1102 |
):
|
| 1103 |
similar_elements.append(potential_match)
|
| 1104 |
|
|
|
|
| 1002 |
regex, default, replace_entities, clean_match, case_sensitive
|
| 1003 |
)
|
| 1004 |
|
| 1005 |
+
@staticmethod
def __get_attributes(
    element: html.HtmlElement, ignore_attributes: Union[List, Tuple]
) -> Dict:
    """Build a copy of the element's attributes that skips every ignored name.

    :param element: The element whose ``attrib`` mapping is read.
    :param ignore_attributes: Attribute names to leave out of the result.
    :return: A new dictionary holding the remaining attribute name/value pairs.
    """
    kept = {}
    for name, value in element.attrib.items():
        if name in ignore_attributes:
            continue
        kept[name] = value
    return kept
|
| 1011 |
+
|
| 1012 |
+
def __are_alike(
    self,
    original: html.HtmlElement,
    original_attributes: Dict,
    candidate: html.HtmlElement,
    ignore_attributes: Union[List, Tuple],
    similarity_threshold: float,
    match_text: bool = False,
) -> bool:
    """Score how closely ``candidate`` resembles ``original`` and compare it to the threshold.

    :param original: The reference element; only its ``text`` is read here.
    :param original_attributes: The reference element's attributes, as prepared by the caller.
    :param candidate: The element being compared against the reference.
    :param ignore_attributes: Attribute names to drop from the candidate before comparing.
        When empty, the candidate's ``attrib`` is used as-is.
    :param similarity_threshold: Minimum average score (rounded to 2 decimals) to count as alike.
    :param match_text: When True, the similarity of both elements' text is added as one more check.
    :return: True if the rounded average score is higher than or equal to the threshold,
        False otherwise (including when no check could be made).
    """
    if ignore_attributes:
        other_attributes = self.__get_attributes(candidate, ignore_attributes)
    else:
        other_attributes = candidate.attrib

    total, comparisons = 0, 0

    if original_attributes:
        # Compare each reference attribute value with the candidate's value of the
        # same name; a missing attribute is compared against an empty string.
        for name, value in original_attributes.items():
            total += SequenceMatcher(
                None, value, other_attributes.get(name, "")
            ).ratio()
        # NOTE(review): the sum runs over the reference's attributes while the
        # divisor counts the candidate's attributes -- confirm this asymmetry is intended.
        comparisons += len(other_attributes)
    elif not other_attributes:
        # Both don't have attributes, this must mean something
        total += 1
        comparisons += 1

    if match_text:
        original_text = clean_spaces(original.text or "")
        candidate_text = clean_spaces(candidate.text or "")
        total += SequenceMatcher(None, original_text, candidate_text).ratio()
        comparisons += 1

    if not comparisons:
        # Nothing was comparable, so there is no basis to call the elements alike.
        return False
    return round(total / comparisons, 2) >= similarity_threshold
|
| 1053 |
+
|
| 1054 |
def find_similar(
|
| 1055 |
self,
|
| 1056 |
similarity_threshold: float = 0.2,
|
|
|
|
| 1080 |
|
| 1081 |
:return: A ``Adaptors`` container of ``Adaptor`` objects or empty list
|
| 1082 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1083 |
# We will use the elements' root from now on to get the speed boost of using Lxml directly
|
| 1084 |
root = self._root
|
|
|
|
|
|
|
| 1085 |
similar_elements = list()
|
| 1086 |
+
|
| 1087 |
+
current_depth = len(list(root.iterancestors()))
|
| 1088 |
+
target_attrs = (
|
| 1089 |
+
self.__get_attributes(root, ignore_attributes)
|
| 1090 |
+
if ignore_attributes
|
| 1091 |
+
else root.attrib
|
| 1092 |
+
)
|
| 1093 |
+
|
| 1094 |
+
path_parts = [self.tag]
|
| 1095 |
+
if (parent := root.getparent()) is not None:
|
| 1096 |
+
path_parts.insert(0, parent.tag)
|
| 1097 |
+
if (grandparent := parent.getparent()) is not None:
|
| 1098 |
+
path_parts.insert(0, grandparent.tag)
|
| 1099 |
+
|
| 1100 |
+
xpath_path = "//{}".format("/".join(path_parts))
|
| 1101 |
+
potential_matches = root.xpath(
|
| 1102 |
+
f"{xpath_path}[count(ancestor::*) = {current_depth}]"
|
| 1103 |
+
)
|
| 1104 |
|
| 1105 |
for potential_match in potential_matches:
|
| 1106 |
+
if potential_match != root and self.__are_alike(
|
| 1107 |
+
root,
|
| 1108 |
+
target_attrs,
|
| 1109 |
+
potential_match,
|
| 1110 |
+
ignore_attributes,
|
| 1111 |
+
similarity_threshold,
|
| 1112 |
+
match_text,
|
| 1113 |
):
|
| 1114 |
similar_elements.append(potential_match)
|
| 1115 |
|