Spaces:

techconspartners
/

ConversAI

Sleeping

Rauhan committed on Aug 14, 2024

Commit

4a2e5ad

1 Parent(s): d7b4497

DEBUG: updating getLinks

Files changed (1) hide show

functions.py CHANGED Viewed

@@ -255,7 +255,7 @@ def listTables(username: str):
 def getLinks(url: str, timeout=30):
     start = time.time()
     def getLinksFromPage(url: str) -> list:
         response = requests.get(url)
         soup = BeautifulSoup(response.content, "lxml")
@@ -265,11 +265,12 @@ def getLinks(url: str, timeout=30):
             if "href" in anchor.attrs:
                 if urlparse(anchor.attrs["href"]).netloc == urlparse(url).netloc:
                     links.append(anchor.attrs["href"])
-                elif anchor.attrs["href"].startswith("/"):
                     links.append(urljoin(url + "/", anchor.attrs["href"]))
                 else:
                     pass
-                links = list(set([link for link in links if url in link]))
             else:
                 continue
         return links

 def getLinks(url: str, timeout=30):
     start = time.time()
     def getLinksFromPage(url: str) -> list:
         response = requests.get(url)
         soup = BeautifulSoup(response.content, "lxml")
             if "href" in anchor.attrs:
                 if urlparse(anchor.attrs["href"]).netloc == urlparse(url).netloc:
                     links.append(anchor.attrs["href"])
+                elif not anchor.attrs["href"].startswith(("//", "file", "javascript", "tel", "mailto", "http")):
                     links.append(urljoin(url + "/", anchor.attrs["href"]))
                 else:
                     pass
+                links = [link for link in links if "#" not in link]
+                links = list(set(links))
             else:
                 continue
         return links