dusan-presswhizz committed on
Commit
bfd5b70
·
verified ·
1 Parent(s): 195a2aa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -7
app.py CHANGED
@@ -8,6 +8,7 @@ from functools import lru_cache
8
  import hashlib
9
  from langdetect import detect, LangDetectException
10
  import html
 
11
  # =========================
12
  # Config
13
  # =========================
@@ -76,13 +77,36 @@ def is_likely_author_bio_or_footer(element, text):
76
  """Check if an element is likely an author bio, footer, or other non-main content."""
77
  # Check parent and element classes/ids
78
  parent = element.parent
79
- element_attrs = " ".join([
80
- element.get('class', []) if isinstance(element.get('class'), list) else [element.get('class', '')],
81
- [element.get('id', '')],
82
- parent.get('class', []) if parent and isinstance(parent.get('class'), list) else [parent.get('class', '') if parent else ''],
83
- [parent.get('id', '') if parent else '']
84
- ])
85
- element_attrs = " ".join(filter(None, element_attrs)).lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
  # Common indicators of non-main content
88
  bio_indicators = ['author', 'bio', 'about', 'profile', 'sidebar', 'footer',
 
8
  import hashlib
9
  from langdetect import detect, LangDetectException
10
  import html
11
+
12
  # =========================
13
  # Config
14
  # =========================
 
77
  """Check if an element is likely an author bio, footer, or other non-main content."""
78
  # Check parent and element classes/ids
79
  parent = element.parent
80
+
81
+ # Safely extract and flatten all attributes
82
+ attrs_list = []
83
+
84
+ # Get element classes
85
+ elem_classes = element.get('class', [])
86
+ if isinstance(elem_classes, list):
87
+ attrs_list.extend(elem_classes)
88
+ elif elem_classes:
89
+ attrs_list.append(str(elem_classes))
90
+
91
+ # Get element id
92
+ elem_id = element.get('id', '')
93
+ if elem_id:
94
+ attrs_list.append(str(elem_id))
95
+
96
+ # Get parent classes and id if parent exists
97
+ if parent:
98
+ parent_classes = parent.get('class', [])
99
+ if isinstance(parent_classes, list):
100
+ attrs_list.extend(parent_classes)
101
+ elif parent_classes:
102
+ attrs_list.append(str(parent_classes))
103
+
104
+ parent_id = parent.get('id', '')
105
+ if parent_id:
106
+ attrs_list.append(str(parent_id))
107
+
108
+ # Join all attributes into a single string
109
+ element_attrs = " ".join(str(attr) for attr in attrs_list if attr).lower()
110
 
111
  # Common indicators of non-main content
112
  bio_indicators = ['author', 'bio', 'about', 'profile', 'sidebar', 'footer',