Spaces:
Paused
Paused
File size: 6,924 Bytes
046723b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 |
#!/usr/bin/env python3
import time
from flask import url_for
from ..html_tools import *
from .util import live_server_setup, wait_for_all_checks
def set_response_with_multiple_index():
data= """<!DOCTYPE html>
<html>
<body>
<!-- NOTE!! CHROME WILL ADD TBODY HERE IF ITS NOT THERE!! -->
<table style="width:100%">
<tr>
<th>Person 1</th>
<th>Person 2</th>
<th>Person 3</th>
</tr>
<tr>
<td>Emil</td>
<td>Tobias</td>
<td>Linus</td>
</tr>
<tr>
<td>16</td>
<td>14</td>
<td>10</td>
</tr>
</table>
</body>
</html>
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(data)
def set_original_response():
test_return_data = """<html>
<header>
<h2>Header</h2>
</header>
<nav>
<ul>
<li><a href="#">A</a></li>
<li><a href="#">B</a></li>
<li><a href="#">C</a></li>
</ul>
</nav>
<body>
Some initial text<br>
<p>Which is across multiple lines</p>
<br>
So let's see what happens. <br>
<div id="changetext">Some text that will change</div>
</body>
<footer>
<p>Footer</p>
</footer>
</html>
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(test_return_data)
def set_modified_response():
test_return_data = """<html>
<header>
<h2>Header changed</h2>
</header>
<nav>
<ul>
<li><a href="#">A changed</a></li>
<li><a href="#">B</a></li>
<li><a href="#">C</a></li>
</ul>
</nav>
<body>
Some initial text<br>
<p>Which is across multiple lines</p>
<br>
So let's see what happens. <br>
<div id="changetext">Some text that changes</div>
</body>
<footer>
<p>Footer changed</p>
</footer>
</html>
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(test_return_data)
def test_element_removal_output():
from inscriptis import get_text
# Check text with sub-parts renders correctly
content = """<html>
<header>
<h2>Header</h2>
</header>
<nav>
<ul>
<li><a href="#">A</a></li>
</ul>
</nav>
<body>
Some initial text<br>
<p>across multiple lines</p>
<div id="changetext">Some text that changes</div>
<div>Some text should be matched by xPath // selector</div>
<div>Some text should be matched by xPath selector</div>
<div>Some text should be matched by xPath1 selector</div>
</body>
<footer>
<p>Footer</p>
</footer>
</html>
"""
html_blob = element_removal(
[
"header",
"footer",
"nav",
"#changetext",
"//*[contains(text(), 'xPath // selector')]",
"xpath://*[contains(text(), 'xPath selector')]",
"xpath1://*[contains(text(), 'xPath1 selector')]"
],
html_content=content
)
text = get_text(html_blob)
assert (
text
== """Some initial text
across multiple lines
"""
)
def test_element_removal_full(client, live_server, measure_memory_usage):
set_original_response()
# Add our URL to the import page
test_url = url_for("test_endpoint", _external=True)
res = client.post(
url_for("imports.import_page"), data={"urls": test_url}, follow_redirects=True
)
assert b"1 Imported" in res.data
wait_for_all_checks(client)
# Goto the edit page, add the filter data
# Not sure why \r needs to be added - absent of the #changetext this is not necessary
subtractive_selectors_data = "header\r\nfooter\r\nnav\r\n#changetext"
res = client.post(
url_for("ui.ui_edit.edit_page", uuid="first"),
data={
"subtractive_selectors": subtractive_selectors_data,
"url": test_url,
"tags": "",
"headers": "",
"fetch_backend": "html_requests",
},
follow_redirects=True,
)
assert b"Updated watch." in res.data
wait_for_all_checks(client)
# Check it saved
res = client.get(
url_for("ui.ui_edit.edit_page", uuid="first"),
)
assert bytes(subtractive_selectors_data.encode("utf-8")) in res.data
# Trigger a check
res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
assert b'Queued 1 watch for rechecking.' in res.data
wait_for_all_checks(client)
# so that we set the state to 'unviewed' after all the edits
client.get(url_for("ui.ui_views.diff_history_page", uuid="first"))
# Make a change to header/footer/nav
set_modified_response()
# Trigger a check
res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
assert b'Queued 1 watch for rechecking.' in res.data
# Give the thread time to pick it up
wait_for_all_checks(client)
# There should not be an unviewed change, as changes should be removed
res = client.get(url_for("watchlist.index"))
assert b"unviewed" not in res.data
# Re #2752
def test_element_removal_nth_offset_no_shift(client, live_server, measure_memory_usage):
set_response_with_multiple_index()
subtractive_selectors_data = ["""
body > table > tr:nth-child(1) > th:nth-child(2)
body > table > tr:nth-child(2) > td:nth-child(2)
body > table > tr:nth-child(3) > td:nth-child(2)
body > table > tr:nth-child(1) > th:nth-child(3)
body > table > tr:nth-child(2) > td:nth-child(3)
body > table > tr:nth-child(3) > td:nth-child(3)""",
"""//body/table/tr[1]/th[2]
//body/table/tr[2]/td[2]
//body/table/tr[3]/td[2]
//body/table/tr[1]/th[3]
//body/table/tr[2]/td[3]
//body/table/tr[3]/td[3]"""]
for selector_list in subtractive_selectors_data:
res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
# Add our URL to the import page
test_url = url_for("test_endpoint", _external=True)
res = client.post(
url_for("imports.import_page"), data={"urls": test_url}, follow_redirects=True
)
assert b"1 Imported" in res.data
wait_for_all_checks(client)
res = client.post(
url_for("ui.ui_edit.edit_page", uuid="first"),
data={
"subtractive_selectors": selector_list,
"url": test_url,
"tags": "",
"fetch_backend": "html_requests",
},
follow_redirects=True,
)
assert b"Updated watch." in res.data
wait_for_all_checks(client)
res = client.get(
url_for("ui.ui_views.preview_page", uuid="first"),
follow_redirects=True
)
assert b"Tobias" not in res.data
assert b"Linus" not in res.data
assert b"Person 2" not in res.data
assert b"Person 3" not in res.data
# First column should exist
assert b"Emil" in res.data
|