# In [4]:
import os
import requests
from bs4 import BeautifulSoup
import re


def has_excluded_parent(tag, exclude_tags):
    """Return True if an ancestor of ``tag`` has a name in ``exclude_tags``.

    The walk starts at ``tag.parent`` and follows ``.parent`` upwards. It
    stops at the first ancestor named ``'html'`` or when the chain ends.

    Args:
        tag: Node exposing ``.parent`` and ``.name`` attributes.
        exclude_tags: Container of tag names that disqualify ``tag``.

    Returns:
        True when an ancestor below ``html`` is named in ``exclude_tags``,
        otherwise False.
    """
    ancestor = tag.parent
    # A fragment without an <html> element (or a detached node) ends its
    # parent chain with None; stop there instead of dereferencing None.
    while ancestor is not None and ancestor.name != 'html':
        if ancestor.name in exclude_tags:
            return True
        ancestor = ancestor.parent
    return False

def get_text(soup):
    """Collect the text of paragraph, heading and list-item tags in ``soup``.

    Tags for which ``has_excluded_parent`` reports a ``header``, ``nav`` or
    ``footer`` ancestor are left out.

    Args:
        soup: Object whose ``find_all`` returns the candidate tags.

    Returns:
        A list with the ``get_text()`` result of every kept tag, in the
        order ``find_all`` produced them.
    """
    wanted = {'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'}
    skipped_ancestors = {'header', 'nav', 'footer'}
    return [
        node.get_text()
        for node in soup.find_all(wanted)
        if not has_excluded_parent(node, skipped_ancestors)
    ]

# Matches the URL inside an inline ``background-image: url(...)`` declaration.
_BACKGROUND_URL_RE = re.compile(r'background-image\s*:\s*url\(([^)]+)\)')
# The lookbehind keeps ``min-width`` / ``max-height`` etc. from being read as
# the element's own size.
_STYLE_WIDTH_RE = re.compile(r'(?<![\w-])width\s*:\s*(\d+)px')
_STYLE_HEIGHT_RE = re.compile(r'(?<![\w-])height\s*:\s*(\d+)px')
# A pixel count as found in width/height attributes: "50", "50px", "99.5".
_PIXEL_VALUE_RE = re.compile(r'\s*(\d+)(?:\.\d*)?\s*(?:px)?\s*')
# Images narrower AND shorter than this are treated as logos/icons.
_ICON_MAX_SIDE = 100


def _to_pixels(value):
    """Parse a width/height attribute into whole pixels, or None if unknown."""
    if not value:
        return None
    try:
        return int(value)
    except (TypeError, ValueError):
        pass
    # Values such as "100%" or "auto" are not pixel counts; report them as
    # unknown rather than raising.
    parsed = _PIXEL_VALUE_RE.fullmatch(str(value))
    return int(parsed.group(1)) if parsed else None


def _is_unwanted_image(img_url, width, height):
    """Return True for images that look like a logo, an icon or an SVG."""
    if (width is not None and height is not None
            and width < _ICON_MAX_SIDE and height < _ICON_MAX_SIDE):
        return True
    lowered = img_url.lower()
    return 'logo' in lowered or 'icon' in lowered or lowered.endswith('.svg')


def get_images(soup):
    """Collect the URLs of content images referenced by ``soup``.

    Two sources are scanned: the ``src`` of ``<img>`` tags, and the
    ``url(...)`` of inline ``background-image`` styles. Nothing is
    downloaded; only the URL strings are gathered.

    An image is skipped when its URL is empty, when both its width and
    height are known and below 100 pixels, when its URL contains ``logo``
    or ``icon`` (case-insensitive), or when its URL ends with ``.svg``.

    Args:
        soup: Object whose ``find_all`` returns elements supporting
            ``.get(attribute_name)``.

    Returns:
        A list of URL strings, ``<img>`` sources first, then background
        images. Duplicates are kept.
    """
    image_urls = []

    for img in soup.find_all('img'):
        img_url = img.get('src')
        if not img_url:
            continue
        width = _to_pixels(img.get('width'))
        height = _to_pixels(img.get('height'))
        if _is_unwanted_image(img_url, width, height):
            continue
        image_urls.append(img_url)

    for elem in soup.find_all(style=re.compile('background-image')):
        style = elem.get('style')
        url_match = _BACKGROUND_URL_RE.search(style)
        if not url_match:
            continue
        # Drop padding inside url( ... ) before removing optional quotes.
        img_url = url_match.group(1).strip().strip('"\'')
        if not img_url:
            continue
        width_match = _STYLE_WIDTH_RE.search(style)
        height_match = _STYLE_HEIGHT_RE.search(style)
        width = int(width_match.group(1)) if width_match else None
        height = int(height_match.group(1)) if height_match else None
        if _is_unwanted_image(img_url, width, height):
            continue
        image_urls.append(img_url)

    return image_urls

def scrapePage(url: str, scrapeImages: bool = True, scrapeText: bool = True,
               timeout: float = 10) -> dict:
    """Fetch ``url`` and extract image URLs and/or text from its HTML.

    Args:
        url: Address of the page to fetch.
        scrapeImages: When true, the result gets an ``"images"`` entry
            produced by ``get_images``.
        scrapeText: When true, the result gets a ``"text"`` entry produced
            by ``get_text``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the requested ``"images"`` / ``"text"`` entries when the
        response status is 200. Otherwise a dict with a single ``"error"``
        entry; this covers non-200 responses as well as requests that fail
        or time out.
    """
    load_error = {"error": "scraping.py: webpage could not be loaded"}
    try:
        # Without a timeout, requests can wait on an unresponsive server
        # indefinitely.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Report network failures through the same error dict used for
        # non-200 responses.
        return load_error

    if response.status_code != 200:
        return load_error

    soup = BeautifulSoup(response.content, 'html.parser')
    res = {}
    if scrapeImages:
        res["images"] = get_images(soup)
    if scrapeText:
        res["text"] = get_text(soup)
    return res

# Example usage. Guarded so that importing this file does not trigger a
# network request; running it as a script or notebook cell still does.
if __name__ == "__main__":
    url = "http://gocusdom.com"
    res = scrapePage(url)
    print(res)

{'images': ['https://assets-global.website-files.com/64f6fcb22fc6650621823722/6604669ba540f869e121c7b0_PNG-500_h-single_full.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/64f8792a64b8b84068d0968c_Codehouse-removebg-preview.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/6604669ba540f869e121c7b0_PNG-500_h-single_full.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/64f8792a64b8b84068d0968c_Codehouse-removebg-preview.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/6604669ba540f869e121c7b0_PNG-500_h-single_full.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/64f8792a64b8b84068d0968c_Codehouse-removebg-preview.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/6604669ba540f869e121c7b0_PNG-500_h-single_full.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/64f8792a64b8b84068d0968c_Codehouse-removebg-preview.png', 'https://assets-