File size: 19,789 Bytes
046723b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
from loguru import logger
from lxml import etree
from typing import List
import json
import re

# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"
TRANSLATE_WHITESPACE_TABLE = str.maketrans('', '', '\r\n\t ')
PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$'

# 'price' , 'lowPrice', 'highPrice' are usually under here
# All of those may or may not appear on different websites - I didnt find a way todo case-insensitive searching here
LD_JSON_PRODUCT_OFFER_SELECTORS = ["json:$..offers", "json:$..Offers"]

class JSONNotFound(ValueError):
    def __init__(self, msg):
        ValueError.__init__(self, msg)


# Doesn't look like python supports forward slash auto enclosure in re.findall
# So convert it to inline flag "(?i)foobar" type configuration
def perl_style_slash_enclosed_regex_to_options(regex):

    res = re.search(PERL_STYLE_REGEX, regex, re.IGNORECASE)

    if res:
        flags = res.group(2) if res.group(2) else 'i'
        regex = f"(?{flags}){res.group(1)}"
    else:
        # Fall back to just ignorecase as an option
        regex = f"(?i){regex}"

    return regex

# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
def include_filters(include_filters, html_content, append_pretty_line_formatting=False):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    html_block = ""
    r = soup.select(include_filters, separator="")

    for element in r:
        # When there's more than 1 match, then add the suffix to separate each line
        # And where the matched result doesn't include something that will cause Inscriptis to add a newline
        # (This way each 'match' reliably has a new-line in the diff)
        # Divs are converted to 4 whitespaces by inscriptis
        if append_pretty_line_formatting and len(html_block) and not element.name in (['br', 'hr', 'div', 'p']):
            html_block += TEXT_FILTER_LIST_LINE_SUFFIX

        html_block += str(element)

    return html_block

def subtractive_css_selector(css_selector, html_content):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")

    # So that the elements dont shift their index, build a list of elements here which will be pointers to their place in the DOM
    elements_to_remove = soup.select(css_selector)

    # Then, remove them in a separate loop
    for item in elements_to_remove:
        item.decompose()

    return str(soup)

def subtractive_xpath_selector(selectors: List[str], html_content: str) -> str:
    # Parse the HTML content using lxml
    html_tree = etree.HTML(html_content)

    # First, collect all elements to remove
    elements_to_remove = []

    # Iterate over the list of XPath selectors
    for selector in selectors:
        # Collect elements for each selector
        elements_to_remove.extend(html_tree.xpath(selector))

    # Then, remove them in a separate loop
    for element in elements_to_remove:
        if element.getparent() is not None:  # Ensure the element has a parent before removing
            element.getparent().remove(element)

    # Convert the modified HTML tree back to a string
    modified_html = etree.tostring(html_tree, method="html").decode("utf-8")
    return modified_html


def element_removal(selectors: List[str], html_content):
    """Removes elements that match a list of CSS or XPath selectors."""
    modified_html = html_content
    css_selectors = []
    xpath_selectors = []

    for selector in selectors:
        if selector.startswith(('xpath:', 'xpath1:', '//')):
            # Handle XPath selectors separately
            xpath_selector = selector.removeprefix('xpath:').removeprefix('xpath1:')
            xpath_selectors.append(xpath_selector)
        else:
            # Collect CSS selectors as one "hit", see comment in subtractive_css_selector
            css_selectors.append(selector.strip().strip(","))

    if xpath_selectors:
        modified_html = subtractive_xpath_selector(xpath_selectors, modified_html)

    if css_selectors:
        # Remove duplicates, then combine all CSS selectors into one string, separated by commas
        # This stops the elements index shifting
        unique_selectors = list(set(css_selectors))  # Ensure uniqueness
        combined_css_selector = " , ".join(unique_selectors)
        modified_html = subtractive_css_selector(combined_css_selector, modified_html)


    return modified_html

def elementpath_tostring(obj):
    """
    change elementpath.select results to string type
    # The MIT License (MIT), Copyright (c), 2018-2021, SISSA (Scuola Internazionale Superiore di Studi Avanzati)
    # https://github.com/sissaschool/elementpath/blob/dfcc2fd3d6011b16e02bf30459a7924f547b47d0/elementpath/xpath_tokens.py#L1038
    """

    import elementpath
    from decimal import Decimal
    import math

    if obj is None:
        return ''
    # https://elementpath.readthedocs.io/en/latest/xpath_api.html#elementpath.select
    elif isinstance(obj, elementpath.XPathNode):
        return obj.string_value
    elif isinstance(obj, bool):
        return 'true' if obj else 'false'
    elif isinstance(obj, Decimal):
        value = format(obj, 'f')
        if '.' in value:
            return value.rstrip('0').rstrip('.')
        return value

    elif isinstance(obj, float):
        if math.isnan(obj):
            return 'NaN'
        elif math.isinf(obj):
            return str(obj).upper()

        value = str(obj)
        if '.' in value:
            value = value.rstrip('0').rstrip('.')
        if '+' in value:
            value = value.replace('+', '')
        if 'e' in value:
            return value.upper()
        return value

    return str(obj)

# Return str Utf-8 of matched rules
def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False):
    from lxml import etree, html
    import elementpath
    # xpath 2.0-3.1
    from elementpath.xpath3 import XPath3Parser

    parser = etree.HTMLParser()
    if is_rss:
        # So that we can keep CDATA for cdata_in_document_to_text() to process
        parser = etree.XMLParser(strip_cdata=False)

    tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
    html_block = ""

    r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser)
    #@note: //title/text() wont work where <title>CDATA..

    if type(r) != list:
        r = [r]

    for element in r:
        # When there's more than 1 match, then add the suffix to separate each line
        # And where the matched result doesn't include something that will cause Inscriptis to add a newline
        # (This way each 'match' reliably has a new-line in the diff)
        # Divs are converted to 4 whitespaces by inscriptis
        if append_pretty_line_formatting and len(html_block) and (not hasattr( element, 'tag' ) or not element.tag in (['br', 'hr', 'div', 'p'])):
            html_block += TEXT_FILTER_LIST_LINE_SUFFIX

        if type(element) == str:
            html_block += element
        elif issubclass(type(element), etree._Element) or issubclass(type(element), etree._ElementTree):
            html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
        else:
            html_block += elementpath_tostring(element)

    return html_block

# Return str Utf-8 of matched rules
# 'xpath1:'
def xpath1_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False):
    from lxml import etree, html

    parser = None
    if is_rss:
        # So that we can keep CDATA for cdata_in_document_to_text() to process
        parser = etree.XMLParser(strip_cdata=False)

    tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
    html_block = ""

    r = tree.xpath(xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'})
    #@note: //title/text() wont work where <title>CDATA..

    for element in r:
        # When there's more than 1 match, then add the suffix to separate each line
        # And where the matched result doesn't include something that will cause Inscriptis to add a newline
        # (This way each 'match' reliably has a new-line in the diff)
        # Divs are converted to 4 whitespaces by inscriptis
        if append_pretty_line_formatting and len(html_block) and (not hasattr(element, 'tag') or not element.tag in (['br', 'hr', 'div', 'p'])):
            html_block += TEXT_FILTER_LIST_LINE_SUFFIX

        # Some kind of text, UTF-8 or other
        if isinstance(element, (str, bytes)):
            html_block += element
        else:
            # Return the HTML which will get parsed as text
            html_block += etree.tostring(element, pretty_print=True).decode('utf-8')

    return html_block

# Extract/find element
def extract_element(find='title', html_content=''):
    from bs4 import BeautifulSoup

    #Re #106, be sure to handle when its not found
    element_text = None

    soup = BeautifulSoup(html_content, 'html.parser')
    result = soup.find(find)
    if result and result.string:
        element_text = result.string.strip()

    return element_text

#
def _parse_json(json_data, json_filter):
    from jsonpath_ng.ext import parse

    if json_filter.startswith("json:"):
        jsonpath_expression = parse(json_filter.replace('json:', ''))
        match = jsonpath_expression.find(json_data)
        return _get_stripped_text_from_json_match(match)

    if json_filter.startswith("jq:") or json_filter.startswith("jqraw:"):

        try:
            import jq
        except ModuleNotFoundError:
            # `jq` requires full compilation in windows and so isn't generally available
            raise Exception("jq not support not found")

        if json_filter.startswith("jq:"):
            jq_expression = jq.compile(json_filter.removeprefix("jq:"))
            match = jq_expression.input(json_data).all()
            return _get_stripped_text_from_json_match(match)

        if json_filter.startswith("jqraw:"):
            jq_expression = jq.compile(json_filter.removeprefix("jqraw:"))
            match = jq_expression.input(json_data).all()
            return '\n'.join(str(item) for item in match)

def _get_stripped_text_from_json_match(match):
    s = []
    # More than one result, we will return it as a JSON list.
    if len(match) > 1:
        for i in match:
            s.append(i.value if hasattr(i, 'value') else i)

    # Single value, use just the value, as it could be later used in a token in notifications.
    if len(match) == 1:
        s = match[0].value if hasattr(match[0], 'value') else match[0]

    # Re #257 - Better handling where it does not exist, in the case the original 's' value was False..
    if not match:
        # Re 265 - Just return an empty string when filter not found
        return ''

    # Ticket #462 - allow the original encoding through, usually it's UTF-8 or similar
    stripped_text_from_html = json.dumps(s, indent=4, ensure_ascii=False)

    return stripped_text_from_html

# content - json
# json_filter - ie json:$..price
# ensure_is_ldjson_info_type - str "product", optional, "@type == product" (I dont know how to do that as a json selector)
def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):
    from bs4 import BeautifulSoup

    stripped_text_from_html = False
# https://github.com/dgtlmoon/changedetection.io/pull/2041#issuecomment-1848397161w
    # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded within HTML tags
    try:
        # .lstrip("\ufeff") strings ByteOrderMark from UTF8 and still lets the UTF work
        stripped_text_from_html = _parse_json(json.loads(content.lstrip("\ufeff") ), json_filter)
    except json.JSONDecodeError as e:
        logger.warning(str(e))

        # Foreach <script json></script> blob.. just return the first that matches json_filter
        # As a last resort, try to parse the whole <body>
        soup = BeautifulSoup(content, 'html.parser')

        if ensure_is_ldjson_info_type:
            bs_result = soup.find_all('script', {"type": "application/ld+json"})
        else:
            bs_result = soup.find_all('script')
        bs_result += soup.find_all('body')

        bs_jsons = []
        for result in bs_result:
            # Skip empty tags, and things that dont even look like JSON
            if not result.text or '{' not in result.text:
                continue
            try:
                json_data = json.loads(result.text)
                bs_jsons.append(json_data)
            except json.JSONDecodeError:
                # Skip objects which cannot be parsed
                continue

        if not bs_jsons:
            raise JSONNotFound("No parsable JSON found in this document")
        
        for json_data in bs_jsons:
            stripped_text_from_html = _parse_json(json_data, json_filter)

            if ensure_is_ldjson_info_type:
                # Could sometimes be list, string or something else random
                if isinstance(json_data, dict):
                    # If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
                    # (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
                    # @type could also be a list although non-standard ("@type": ["Product", "SubType"],)
                    # LD_JSON auto-extract also requires some content PLUS the ldjson to be present
                    # 1833 - could be either str or dict, should not be anything else

                    t = json_data.get('@type')
                    if t and stripped_text_from_html:

                        if isinstance(t, str) and t.lower() == ensure_is_ldjson_info_type.lower():
                            break
                        # The non-standard part, some have a list
                        elif isinstance(t, list):
                            if ensure_is_ldjson_info_type.lower() in [x.lower().strip() for x in t]:
                                break

            elif stripped_text_from_html:
                break

    if not stripped_text_from_html:
        # Re 265 - Just return an empty string when filter not found
        return ''

    return stripped_text_from_html

# Mode     - "content" return the content without the matches (default)
#          - "line numbers" return a list of line numbers that match (int list)
#
# wordlist - list of regex's (str) or words (str)
# Preserves all linefeeds and other whitespacing, its not the job of this to remove that
def strip_ignore_text(content, wordlist, mode="content"):
    ignore_text = []
    ignore_regex = []
    ignore_regex_multiline = []
    ignored_lines = []

    for k in wordlist:
        # Is it a regex?
        res = re.search(PERL_STYLE_REGEX, k, re.IGNORECASE)
        if res:
            res = re.compile(perl_style_slash_enclosed_regex_to_options(k))
            if res.flags & re.DOTALL or res.flags & re.MULTILINE:
                ignore_regex_multiline.append(res)
            else:
                ignore_regex.append(res)
        else:
            ignore_text.append(k.strip())

    for r in ignore_regex_multiline:
        for match in r.finditer(content):
            content_lines = content[:match.end()].splitlines(keepends=True)
            match_lines = content[match.start():match.end()].splitlines(keepends=True)

            end_line = len(content_lines)
            start_line = end_line - len(match_lines)

            if end_line - start_line <= 1:
                # Match is empty or in the middle of the line
                ignored_lines.append(start_line)
            else:
                for i in range(start_line, end_line):
                    ignored_lines.append(i)

    line_index = 0
    lines = content.splitlines(keepends=True)
    for line in lines:
        # Always ignore blank lines in this mode. (when this function gets called)
        got_match = False
        for l in ignore_text:
            if l.lower() in line.lower():
                got_match = True

        if not got_match:
            for r in ignore_regex:
                if r.search(line):
                    got_match = True

        if got_match:
            ignored_lines.append(line_index)

        line_index += 1

    ignored_lines = set([i for i in ignored_lines if i >= 0 and i < len(lines)])

    # Used for finding out what to highlight
    if mode == "line numbers":
        return [i + 1 for i in ignored_lines]

    output_lines = set(range(len(lines))) - ignored_lines
    return ''.join([lines[i] for i in output_lines])

def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str:
    from xml.sax.saxutils import escape as xml_escape
    pattern = '<!\[CDATA\[(\s*(?:.(?<!\]\]>)\s*)*)\]\]>'
    def repl(m):
        text = m.group(1)
        return xml_escape(html_to_text(html_content=text)).strip()

    return re.sub(pattern, repl, html_content)


# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON


def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
    from inscriptis import get_text
    from inscriptis.model.config import ParserConfig

    if render_anchor_tag_content:
        parser_config = ParserConfig(
            annotation_rules={"a": ["hyperlink"]},
            display_links=True
        )
    else:
        parser_config = None

    if is_rss:
        html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
        html_content = re.sub(r'</title>', r'</h1>', html_content)

    text_content = get_text(html_content, config=parser_config)
    return text_content

# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
def has_ldjson_product_info(content):
    try:
        # Better than .lower() which can use a lot of ram
        if (re.search(r'application/ld\+json', content, re.IGNORECASE) and
            re.search(r'"price"', content, re.IGNORECASE) and
            re.search(r'"pricecurrency"', content, re.IGNORECASE)):
            return True

#       On some pages this is really terribly expensive when they dont really need it
#       (For example you never want price monitoring, but this runs on every watch to suggest it)
#        for filter in LD_JSON_PRODUCT_OFFER_SELECTORS:
#            pricing_data += extract_json_as_string(content=content,
#                                                  json_filter=filter,
#                                                  ensure_is_ldjson_info_type="product")
    except Exception as e:
        # OK too
        return False

    return False



def workarounds_for_obfuscations(content):
    """
    Some sites are using sneaky tactics to make prices and other information un-renderable by Inscriptis
    This could go into its own Pip package in the future, for faster updates
    """

    # HomeDepot.com style <span>$<!-- -->90<!-- -->.<!-- -->74</span>
    # https://github.com/weblyzard/inscriptis/issues/45
    if not content:
        return content

    content = re.sub('<!--\s+-->', '', content)

    return content


def get_triggered_text(content, trigger_text):
    triggered_text = []
    result = strip_ignore_text(content=content,
                               wordlist=trigger_text,
                               mode="line numbers")

    i = 1
    for p in content.splitlines():
        if i in result:
            triggered_text.append(p)
        i += 1

    return triggered_text