Spaces:
Sleeping
Sleeping
DEBUG: updating getLinks
Browse files - functions.py +4 -3
functions.py
CHANGED
|
@@ -255,7 +255,7 @@ def listTables(username: str):
|
|
| 255 |
|
| 256 |
def getLinks(url: str, timeout=30):
|
| 257 |
start = time.time()
|
| 258 |
-
|
| 259 |
def getLinksFromPage(url: str) -> list:
|
| 260 |
response = requests.get(url)
|
| 261 |
soup = BeautifulSoup(response.content, "lxml")
|
|
@@ -265,11 +265,12 @@ def getLinks(url: str, timeout=30):
|
|
| 265 |
if "href" in anchor.attrs:
|
| 266 |
if urlparse(anchor.attrs["href"]).netloc == urlparse(url).netloc:
|
| 267 |
links.append(anchor.attrs["href"])
|
| 268 |
-
elif anchor.attrs["href"].startswith("
|
| 269 |
links.append(urljoin(url + "/", anchor.attrs["href"]))
|
| 270 |
else:
|
| 271 |
pass
|
| 272 |
-
links =
|
|
|
|
| 273 |
else:
|
| 274 |
continue
|
| 275 |
return links
|
|
|
|
| 255 |
|
| 256 |
def getLinks(url: str, timeout=30):
|
| 257 |
start = time.time()
|
| 258 |
+
|
| 259 |
def getLinksFromPage(url: str) -> list:
|
| 260 |
response = requests.get(url)
|
| 261 |
soup = BeautifulSoup(response.content, "lxml")
|
|
|
|
| 265 |
if "href" in anchor.attrs:
|
| 266 |
if urlparse(anchor.attrs["href"]).netloc == urlparse(url).netloc:
|
| 267 |
links.append(anchor.attrs["href"])
|
| 268 |
+
elif not anchor.attrs["href"].startswith(("//", "file", "javascript", "tel", "mailto", "http")):
|
| 269 |
links.append(urljoin(url + "/", anchor.attrs["href"]))
|
| 270 |
else:
|
| 271 |
pass
|
| 272 |
+
links = [link for link in links if "#" not in link]
|
| 273 |
+
links = list(set(links))
|
| 274 |
else:
|
| 275 |
continue
|
| 276 |
return links
|