Spaces:

ayushsinghal1510
/

sts-llm-backend

Sleeping

Init Commit

cc65c1f 10 months ago

1.79 kB

	import requests
	from bs4 import BeautifulSoup
	from urllib.parse import urljoin
	from collections import deque
	from typing import Tuple , List , Set

	from scripts.services.services import process_link

	async def create_soup(url: str, *, timeout: float = 30.0) -> BeautifulSoup:
	    """Download ``url`` and parse the response body as HTML.

	    Args:
	        url: Address of the page to fetch.
	        timeout: Seconds to wait for the server before giving up. Without
	            a timeout ``requests.get`` can block indefinitely on an
	            unresponsive host.

	    Returns:
	        The response content parsed with the ``html.parser`` backend.

	    Raises:
	        requests.HTTPError: If the server answers with a 4xx/5xx status.
	        requests.RequestException: On connection failures or timeouts.
	    """
	    # NOTE: requests.get is a blocking call, so the event loop is stalled
	    # for the duration of the download even though this is a coroutine.
	    response = requests.get(url, timeout=timeout)
	    response.raise_for_status()
	    return BeautifulSoup(response.content, 'html.parser')

	async def process_page(url: str):
	    """Yield the processed links found in the anchor tags of ``url``.

	    The page is fetched through ``create_soup``; every ``href`` attribute is
	    passed to ``process_link`` and only truthy results are yielded.
	    """
	    page = await create_soup(url)

	    for anchor in page.find_all('a', href=True):
	        cleaned = await process_link(anchor['href'])
	        if not cleaned:
	            # process_link returned a falsy value: nothing to emit.
	            continue
	        yield cleaned

	async def get_pdf_links(base_html: str) -> Tuple[List[str], Set[str]]:
	    """Crawl breadth-first from ``base_html`` and collect PDF links.

	    Only pages whose absolute URL starts with ``base_html`` are followed.
	    Links ending in ``pdf`` are recorded but never crawled.

	    Args:
	        base_html: Start URL, also used as the prefix that keeps the crawl
	            from leaving the starting location.

	    Returns:
	        A tuple ``(pdf_links, all_links)``: the distinct PDF URLs in order
	        of discovery, and the set of every page URL that was queued
	        (including ``base_html`` itself).
	    """
	    pdf_links: List[str] = []
	    seen_pdfs: Set[str] = set()

	    # Doubles as the "already queued" set, so each page is visited once.
	    all_links: Set[str] = {base_html}
	    url_queue: deque = deque([base_html])

	    while url_queue:
	        current_url: str = url_queue.popleft()
	        print(current_url, len(url_queue))

	        try:
	            async for link in process_page(current_url):
	                # urljoin resolves relative hrefs against the page they
	                # were found on and leaves absolute URLs untouched.
	                absolute_url: str = urljoin(current_url, link)

	                if link.endswith('pdf'):
	                    # The same document is often linked from many pages.
	                    if absolute_url not in seen_pdfs:
	                        seen_pdfs.add(absolute_url)
	                        pdf_links.append(absolute_url)
	                    continue

	                if not absolute_url.startswith(base_html):
	                    continue
	                if absolute_url in all_links:
	                    continue

	                all_links.add(absolute_url)
	                url_queue.append(absolute_url)
	        except Exception as exc:
	            # Best-effort crawl: one broken page must not abort the run.
	            # Catching Exception (not a bare except) lets task
	            # cancellation and KeyboardInterrupt propagate.
	            print(f'Skipping {current_url}: {exc}')

	    return pdf_links, all_links