ayushsinghal1510's picture
Init COmmit
cc65c1f
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from collections import deque
from typing import Tuple , List , Set
from scripts.services.services import process_link
async def create_soup(url : str) -> BeautifulSoup :
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.content , 'html.parser')
# print(type(soup))
return soup
async def process_page(url : str) :
soup : BeautifulSoup = await create_soup(url)
links = soup.find_all('a' , href = True)
for a_tag in links :
href = a_tag['href']
href = await process_link(href)
if href : yield href
async def get_pdf_links(base_html : str) :
pdf_links : List[str] = []
all_links : Set[str] = set()
all_links.add(base_html)
visited_urls : set = set()
url_queue : deque = deque([base_html])
visited_urls.add(base_html)
while url_queue :
try :
current_url : str = url_queue.popleft()
print(current_url , len(url_queue))
async for link in process_page(current_url) :
if link.endswith('pdf') :
if not link.startswith('http') : link : str = f'{base_html}{link}'
pdf_links.append(link)
else :
absolute_url : str = urljoin(current_url , link)
if absolute_url.startswith(base_html) :
if absolute_url not in visited_urls :
visited_urls.add(absolute_url)
all_links.add(absolute_url)
url_queue.append(absolute_url)
except : pass
return pdf_links , all_links