Spaces:
Sleeping
Sleeping
DEBUG: WEB CRAWLER
Browse files- functions.py +32 -25
functions.py
CHANGED
|
@@ -19,7 +19,7 @@ from supabase.client import create_client
|
|
| 19 |
from qdrant_client import QdrantClient
|
| 20 |
from langchain_groq import ChatGroq
|
| 21 |
from bs4 import BeautifulSoup
|
| 22 |
-
from urllib.parse import urlparse
|
| 23 |
from supabase import create_client
|
| 24 |
from dotenv import load_dotenv
|
| 25 |
import os
|
|
@@ -258,29 +258,36 @@ def listTables(username: str):
|
|
| 258 |
}
|
| 259 |
|
| 260 |
|
|
|
|
| 261 |
def getLinks(url: str, timeout = 30):
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
else:
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
continue
|
| 277 |
-
return allLinks
|
| 278 |
-
links = getLinksFromPage(url)
|
| 279 |
-
uniqueLinks = set()
|
| 280 |
-
for link in links:
|
| 281 |
-
now = time.time()
|
| 282 |
-
if now - start > timeout:
|
| 283 |
-
break
|
| 284 |
-
else:
|
| 285 |
-
uniqueLinks = uniqueLinks.union(set(getLinksFromPage(link)))
|
| 286 |
-
return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
|
|
|
|
| 19 |
from qdrant_client import QdrantClient
|
| 20 |
from langchain_groq import ChatGroq
|
| 21 |
from bs4 import BeautifulSoup
|
| 22 |
+
from urllib.parse import urlparse, urljoin
|
| 23 |
from supabase import create_client
|
| 24 |
from dotenv import load_dotenv
|
| 25 |
import os
|
|
|
|
| 258 |
}
|
| 259 |
|
| 260 |
|
| 261 |
+
|
| 262 |
def getLinks(url: str, timeout = 30):
    """Crawl *url* one level deep and return de-duplicated same-domain links.

    Fetches the page at ``url``, collects every same-domain anchor link on it,
    then fetches each of those pages once and merges in their links. The
    second-level crawl stops early once ``timeout`` seconds have elapsed.

    Args:
        url: Absolute URL of the page to start crawling from.
        timeout: Soft wall-clock budget in seconds for the second-level crawl.

    Returns:
        A de-duplicated list of absolute same-domain http(s) URLs, each with a
        single trailing "/" stripped so ".../a" and ".../a/" collapse to one.
    """
    start = time.time()
    # Compare every discovered link against the starting page's host.
    baseNetloc = urlparse(url).netloc

    def getLinksFromPage(pageUrl: str):
        # Best effort: one unreachable or slow page must not abort the crawl,
        # so failures yield an empty link list instead of propagating.
        try:
            response = requests.get(pageUrl, timeout=10)
        except requests.RequestException:
            return []
        soup = BeautifulSoup(response.content, "lxml")
        pageLinks = []
        for tag in soup.find_all("a"):
            href = tag.attrs.get("href", "").strip()
            if not href:
                continue
            # urljoin resolves relative hrefs against the page URL (keeping
            # "?query" and "#fragment" intact) and passes absolute hrefs
            # through unchanged. This replaces the broken os.path.join()
            # reconstruction, which glued query/fragment onto the path with
            # "/" separators and produced malformed URLs.
            fullUrl = urljoin(pageUrl, href)
            parsed = urlparse(fullUrl)
            # Keep only crawlable same-domain http(s) links; this also drops
            # mailto:, javascript:, tel: etc., which the old scheme/netloc
            # test mishandled (scheme set but netloc empty).
            if parsed.scheme in ("http", "https") and parsed.netloc == baseNetloc:
                pageLinks.append(fullUrl)
        return pageLinks

    # Seed with the first-level links themselves: previously these were only
    # returned if a second-level page happened to link back to them.
    uniqueLinks = set(getLinksFromPage(url))
    # Iterate a de-duplicated snapshot so duplicate hrefs aren't re-fetched.
    for link in list(uniqueLinks):
        if time.time() - start > timeout:
            break  # time budget exhausted; return what we have so far
        uniqueLinks |= set(getLinksFromPage(link))
    # Normalize trailing slashes and de-duplicate once more after stripping.
    return list({x[:-1] if x.endswith("/") else x for x in uniqueLinks})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|