Karim shoair committed on
Commit
31f70c8
·
1 Parent(s): 2110f64

refactor(Adaptor): Cleaner approach to `find_similar` method

Browse files

This code is slower than before by about 2-5μs, but it's worth it.

Files changed (1) hide show
  1. scrapling/parser.py +74 -63
scrapling/parser.py CHANGED
@@ -1002,6 +1002,55 @@ class Adaptor(SelectorsGeneration):
1002
  regex, default, replace_entities, clean_match, case_sensitive
1003
  )
1004
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1005
  def find_similar(
1006
  self,
1007
  similarity_threshold: float = 0.2,
@@ -1031,74 +1080,36 @@ class Adaptor(SelectorsGeneration):
1031
 
1032
  :return: A ``Adaptors`` container of ``Adaptor`` objects or empty list
1033
  """
1034
-
1035
- def get_attributes(element: html.HtmlElement) -> Dict:
1036
- """Return attributes dictionary without the ignored list"""
1037
- return {
1038
- k: v for k, v in element.attrib.items() if k not in ignore_attributes
1039
- }
1040
-
1041
- def are_alike(
1042
- original: html.HtmlElement,
1043
- original_attributes: Dict,
1044
- candidate: html.HtmlElement,
1045
- ) -> bool:
1046
- """Calculate a score of how much these elements are alike and return True
1047
- if the score is higher or equals the threshold"""
1048
- candidate_attributes = (
1049
- get_attributes(candidate) if ignore_attributes else candidate.attrib
1050
- )
1051
- score, checks = 0, 0
1052
-
1053
- if original_attributes:
1054
- score += sum(
1055
- SequenceMatcher(None, v, candidate_attributes.get(k, "")).ratio()
1056
- for k, v in original_attributes.items()
1057
- )
1058
- checks += len(candidate_attributes)
1059
- else:
1060
- if not candidate_attributes:
1061
- # Both don't have attributes, this must mean something
1062
- score += 1
1063
- checks += 1
1064
-
1065
- if match_text:
1066
- score += SequenceMatcher(
1067
- None,
1068
- clean_spaces(original.text or ""),
1069
- clean_spaces(candidate.text or ""),
1070
- ).ratio()
1071
- checks += 1
1072
-
1073
- if checks:
1074
- return round(score / checks, 2) >= similarity_threshold
1075
- return False
1076
-
1077
  # We will use the elements' root from now on to get the speed boost of using Lxml directly
1078
  root = self._root
1079
- current_depth = len(list(root.iterancestors()))
1080
- target_attrs = get_attributes(root) if ignore_attributes else root.attrib
1081
  similar_elements = list()
1082
- # + root.xpath(f"//{self.tag}[count(ancestor::*) = {current_depth-1}]")
1083
- parent = root.getparent()
1084
- if parent is not None:
1085
- grandparent = parent.getparent() # lol
1086
- if grandparent is not None:
1087
- potential_matches = root.xpath(
1088
- f"//{grandparent.tag}/{parent.tag}/{self.tag}[count(ancestor::*) = {current_depth}]"
1089
- )
1090
- else:
1091
- potential_matches = root.xpath(
1092
- f"//{parent.tag}/{self.tag}[count(ancestor::*) = {current_depth}]"
1093
- )
1094
- else:
1095
- potential_matches = root.xpath(
1096
- f"//{self.tag}[count(ancestor::*) = {current_depth}]"
1097
- )
 
 
1098
 
1099
  for potential_match in potential_matches:
1100
- if potential_match != root and are_alike(
1101
- root, target_attrs, potential_match
 
 
 
 
 
1102
  ):
1103
  similar_elements.append(potential_match)
1104
 
 
1002
  regex, default, replace_entities, clean_match, case_sensitive
1003
  )
1004
 
1005
+ @staticmethod
1006
+ def __get_attributes(
1007
+ element: html.HtmlElement, ignore_attributes: Union[List, Tuple]
1008
+ ) -> Dict:
1009
+ """Return attributes dictionary without the ignored list"""
1010
+ return {k: v for k, v in element.attrib.items() if k not in ignore_attributes}
1011
+
1012
+ def __are_alike(
1013
+ self,
1014
+ original: html.HtmlElement,
1015
+ original_attributes: Dict,
1016
+ candidate: html.HtmlElement,
1017
+ ignore_attributes: Union[List, Tuple],
1018
+ similarity_threshold: float,
1019
+ match_text: bool = False,
1020
+ ) -> bool:
1021
+ """Calculate a score of how much these elements are alike and return True
1022
+ if the score is higher or equals the threshold"""
1023
+ candidate_attributes = (
1024
+ self.__get_attributes(candidate, ignore_attributes)
1025
+ if ignore_attributes
1026
+ else candidate.attrib
1027
+ )
1028
+ score, checks = 0, 0
1029
+
1030
+ if original_attributes:
1031
+ score += sum(
1032
+ SequenceMatcher(None, v, candidate_attributes.get(k, "")).ratio()
1033
+ for k, v in original_attributes.items()
1034
+ )
1035
+ checks += len(candidate_attributes)
1036
+ else:
1037
+ if not candidate_attributes:
1038
+ # Both don't have attributes, this must mean something
1039
+ score += 1
1040
+ checks += 1
1041
+
1042
+ if match_text:
1043
+ score += SequenceMatcher(
1044
+ None,
1045
+ clean_spaces(original.text or ""),
1046
+ clean_spaces(candidate.text or ""),
1047
+ ).ratio()
1048
+ checks += 1
1049
+
1050
+ if checks:
1051
+ return round(score / checks, 2) >= similarity_threshold
1052
+ return False
1053
+
1054
  def find_similar(
1055
  self,
1056
  similarity_threshold: float = 0.2,
 
1080
 
1081
  :return: A ``Adaptors`` container of ``Adaptor`` objects or empty list
1082
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1083
  # We will use the elements' root from now on to get the speed boost of using Lxml directly
1084
  root = self._root
 
 
1085
  similar_elements = list()
1086
+
1087
+ current_depth = len(list(root.iterancestors()))
1088
+ target_attrs = (
1089
+ self.__get_attributes(root, ignore_attributes)
1090
+ if ignore_attributes
1091
+ else root.attrib
1092
+ )
1093
+
1094
+ path_parts = [self.tag]
1095
+ if (parent := root.getparent()) is not None:
1096
+ path_parts.insert(0, parent.tag)
1097
+ if (grandparent := parent.getparent()) is not None:
1098
+ path_parts.insert(0, grandparent.tag)
1099
+
1100
+ xpath_path = "//{}".format("/".join(path_parts))
1101
+ potential_matches = root.xpath(
1102
+ f"{xpath_path}[count(ancestor::*) = {current_depth}]"
1103
+ )
1104
 
1105
  for potential_match in potential_matches:
1106
+ if potential_match != root and self.__are_alike(
1107
+ root,
1108
+ target_attrs,
1109
+ potential_match,
1110
+ ignore_attributes,
1111
+ similarity_threshold,
1112
+ match_text,
1113
  ):
1114
  similar_elements.append(potential_match)
1115