In [12]:
import os
import requests
from bs4 import BeautifulSoup
import re
import validators.url as urlvalid

#Helper for get text, iterates through parents of an html tag, to see whether it should be filtered.
def has_excluded_parent(tag, exclude_tags):
    parent = tag.parent
    while parent.name != 'html':
        if parent.name in exclude_tags:
            return True
        parent = parent.parent
    return False

#Retrieve text, restricted to certain tabs
def get_text(soup):
    target_tags = {'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'}    
    exclude_tags = {'header', 'nav', 'footer'}    
    text_list = []    
    for tag in soup.find_all(target_tags):
        if not has_excluded_parent(tag, exclude_tags):        
                text_list.append(tag.get_text())   
    return text_list  

# Find all images on the webpage
def get_images(soup):
    images = soup.find_all('img')
    # Find all elements with a style attribute that contains 'background-image'
    background_images = soup.find_all(style=re.compile('background-image'))
    # Check each image
    imagelist=[]
    for img in images:
        img_url = img.get('src')
        # Skip if the image URL is empty or None
        if(img_url and not (urlvalid(img_url))):
            img_url = f'https:{img_url}'
        if not img_url or not(urlvalid(img_url)):
            print("Invalid image url",img_url)
            continue
        # Check if the image is likely a logo or icon based on its size
        width = img.get('width')
        height = img.get('height')
        if width and height:
            if int(width) < 100 and int(height) < 100:
                #print(f"Skipping {img_url} as it's likely a logo or icon")
                continue
        
        # Check if the image is likely a logo or icon based on its URL
        if 'logo' in img_url.lower() or 'icon' in img_url.lower():
            #print(f"Skipping {img_url} as it's likely a logo or icon")
            continue
        
        # Check if the image is an SVG
        if img_url.lower().endswith('.svg'):
            #print(f"Skipping {img_url} as it's an SVG")
            continue
        imagelist.append(img_url)
    
    # Check each background_image
    for elem in background_images:
        style = elem.get('style')
        match = re.search(r'background-image\s*:\s*url\(([^)]+)\)', style)
        if match:
            img_url = match.group(1).strip('"\'')
            
            # Check if the image is likely a logo or icon based on its size
            width_match = re.search(r'width\s*:\s*(\d+)px', style)
            height_match = re.search(r'height\s*:\s*(\d+)px', style)
            if width_match and height_match:
                width = int(width_match.group(1))
                height = int(height_match.group(1))
                if width < 100 and height < 100:
                    #print(f"Skipping {img_url} as it's likely a logo or icon")
                    continue
            
            # Check if the image is likely a logo or icon based on its URL
            if 'logo' in img_url.lower() or 'icon' in img_url.lower() or not(urlvalid(img_url)):
                #print(f"Skipping {img_url} as it's likely a logo or icon")
                continue
            
            # Check if the image is an SVG
            if img_url.lower().endswith('.svg'):
                #print(f"Skipping {img_url} as it's an SVG")
                continue
            imagelist.append(img_url)
    return imagelist

def scrapePage(req:dict):
    # Send a GET request
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    if(not urlvalid(req["url"])):
        return {"error": "scraping.py: url is not recognized as a valid url."}
    try:
        response = requests.get(req["url"],headers=headers)
        response.raise_for_status()    
    except requests.exceptions.RequestException as e: 
        return {"error":"scraping.py: request error","message":e}
    res = {}
    # If the GET request is successful, the status code will be 200
    if response.status_code == 200:
        # Get the content of the response
        page_content = response.content
        # Create a BeautifulSoup object and specify the parser
        soup = BeautifulSoup(page_content, 'html.parser')
        if(req["use_images"]):
            res["images"]=get_images(soup)
        if(req["use_text"]):
            res["text"]=get_text(soup)
        return res
    else:
        return{"error":"scraping.py: webpage could not be loaded"}


In [14]:
exampleReq4 = {"url": "https://brandlume.com/12-proven-ways-to-make-your-website-stand-out/","use_images": True,"use_text":False,"num_images":1,"page": 0,"num_keywords_text": 10,"num_keywords_images": 10,"num_query_keywords":5,"result_images":24}
scrapePage(exampleReq4)

Invalid image url https:data:image/svg+xml;nitro-empty-id=MTMzNTo2MDU=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjAzIDY3IiB3aWR0aD0iMjAzIiBoZWlnaHQ9IjY3IiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==
Invalid image url https:data:image/svg+xml;nitro-empty-id=MTMzODo2MTA=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTcwIDU4IiB3aWR0aD0iMTcwIiBoZWlnaHQ9IjU4IiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==
Invalid image url https:data:image/svg+xml;nitro-empty-id=MTM0MTo2MDQ=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjAzIDY3IiB3aWR0aD0iMjAzIiBoZWlnaHQ9IjY3IiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==
Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ0MjoxNzU5-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTU3MiA4NzYiIHdpZHRoPSIxNTcyIiBoZWlnaHQ9Ijg3NiIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=
Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ1NToxNDM4-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA2NjciIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjY2NyIgeG1sbnM9Imh0dHA6Ly93d3cu

{'images': ['https://cdn-agiod.nitrocdn.com/IzoObPRaJTXqmzxBrypHgZRGhBszRtaj/assets/images/optimized/rev-32e7c69/brandlume.com/wp-content/uploads/2023/04/how-to-make-your-website-stand-out.jpg',
  'https://brandlume.com/wp-content/uploads/2023/04/how-to-make-your-website-stand-out.jpg',
  'https://brandlume.com/wp-content/uploads/2023/04/The-Importance-of-Having-a-Standout-Website-1.jpg',
  'https://brandlume.com/wp-content/uploads/2023/04/How-to-Make-Your-Business-Website-Stand-Out.jpg',
  'https://brandlume.com/wp-content/uploads/2023/04/Select-the-Ideal-Template.jpg',
  'https://brandlume.com/wp-content/uploads/2023/04/Enhance-User-Experience.jpg',
  'https://brandlume.com/wp-content/uploads/2023/04/Typography.jpg',
  'https://brandlume.com/wp-content/uploads/2023/04/Create-High-Quality-Content-1.jpg',
  'https://brandlume.com/wp-content/uploads/2023/04/Maintain-Fresh-Website-Content.jpg',
  'https://brandlume.com/wp-content/uploads/2023/04/Bios.jpg',
  'https://brandlume.com/wp-con

In [3]:
joined_string = " ".join(scraped_text)


<h1>Rake based</h1>

In [4]:
from rake_nltk import Rake
import nltk #you may need to download additional resources like punkt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the Punkt tokenizer
nltk.download('punkt')  
nltk.download('stopwords')

#Sample text
text = "Natural language processing (NLP) is an interdisciplinary field that focuses on the interactions between computers and human language. including speech recognition, machine translation, and text analysis."

# Initiate the RAKE object and run it on the text
r = Rake()

# Extraction given the text.
r.extract_keywords_from_text(joined_string)
r.get_ranked_phrases_with_scores()


[nltk_data] Downloading package punkt to
[nltk_data]     d:\Programms\Anaconda\envs\sneakpic\lib\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     d:\Programms\Anaconda\envs\sneakpic\lib\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[(275.00490720798564,
  'billed heron subfamily agamiinae genus agamia – agami heron genus agamia – agami heron subfamily botaurinae genus zebrilus – zigzag heron genus ixobrychus – small bitterns'),
 (249.06392694063925,
  '148747 france bnf data germany israel united states japan czech republic ardeidae herons extant paleocene first appearances taxa named'),
 (125.0750536284169,
  'genus pilherodius – capped heron genus zonerodius – forest bittern genus ardeola – pond herons'),
 (125.0750536284169,
  'genus pilherodius – capped heron genus zonerodius – forest bittern genus ardeola – pond herons'),
 (112.95397300068352,
  'billed heron agamia – agami heron zebrilus – zigzag heron botaurus – bitterns'),
 (85.98584252505485,
  'pilherodius – capped heron syrigma – whistling heron egretta – herons'),
 (72.83612300476749,
  'genus zebrilus – zigzag heron genus ixobrychus – small bitterns'),
 (67.35917020610415,
  'genus syrigma – whistling heron genus egretta – typical egrets'),
 (67.3591

In [5]:
#Take 2
import yake 

yake_kw = yake.KeywordExtractor() 
KeyWords = yake_kw.extract_keywords(joined_string) 
  
# Displaying the keywords 
print(KeyWords) 


[('Herons', 0.0019341151026289274), ('heron', 0.0020806389740402097), ('genus', 0.002779114198769202), ('night herons', 0.003407551995779094), ('heron Genus', 0.003440693868056681), ('species', 0.0035328448489132763), ('heron Genus Ixobrychus', 0.007802717741497088), ('heron Genus Egretta', 0.008165381990151557), ('heron Genus Zonerodius', 0.01088922519768467), ('heron Genus Taphophoyx', 0.012246806133592213), ('boat-billed heron', 0.012547724730764211), ('Agami heron Genus', 0.012552446379536716), ('heron Genus Cochlearius', 0.0126236034378057), ('heron Genus Agamia', 0.01263198211070613), ('tiger heron Genus', 0.01263302474300582), ('tiger herons', 0.013108091837444087), ('Ardeidae', 0.013458020786417972), ('genus Ardea', 0.014145228646781595), ('boat-billed heron Genus', 0.014222182748485945), ('zigzag heron Genus', 0.015083995855833034)]


In [6]:
#take 1
import nltk
from nltk.corpus import stopwords
from rake_nltk import Rake

# Download the Punkt tokenizer
nltk.download('punkt')

# Initialize the RAKE object
r = Rake(stopwords=stopwords.words("english"), min_length=1, max_length=4)

# Run RAKE on the joined string
keywords = r.run(joined_string)

# Get the top keywords with their scores
top_keywords = r.get_ranked_phrases()
print(top_keywords)  

[nltk_data] Downloading package punkt to
[nltk_data]     d:\Programms\Anaconda\envs\sneakpic\lib\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


AttributeError: 'Rake' object has no attribute 'run'

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

def extract_keywords_spacy(text):
    doc = nlp(text)
    keywords = [token.text for token in doc if token.is_alpha and token.is_stop != True and token.pos_ == "NOUN"]
    return keywords

<h1>Frequency Based</h1>

In [18]:

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
# Stop words list
stop_words = set(stopwords.words('english'))

print(pos_tag(["Halo"]))
def preprocess_text(text):
    wnl = WordNetLemmatizer()
    tokens = word_tokenize(text.lower())
    filtered_tokens = [word for (word,pos) in pos_tag(tokens) if word not in stop_words and len(word) > 2 and pos[:2] == 'NN']
    lemmatized_tokens = [wnl.lemmatize(word) for word in filtered_tokens]
    return lemmatized_tokens


def extract_keywords_nltk(text):
    words = preprocess_text(text)
    # Frequency distribution of words
    freq_dist = nltk.FreqDist(words)
    # Get the 10 most common words
    most_common_words = freq_dist.most_common(10)
    keywords = [word for word, _ in most_common_words]
    return keywords

text = "This is a sample text that contains some important keywords like machine learning, natural language processing, and Python."

#print("SpaCy Keywords:", extract_keywords_spacy(joined_string))
print("NLTK Keywords:", extract_keywords_nltk(joined_string))

[nltk_data] Downloading package punkt to
[nltk_data]     d:\Programms\Anaconda\envs\sneakpic\lib\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     d:\Programms\Anaconda\envs\sneakpic\lib\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     d:\Programms\Anaconda\envs\sneakpic\lib\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


[('Halo', 'NN')]
NLTK Keywords: ['heron', 'specie', 'genus', 'bittern', 'night', 'bird', 'egret', 'ardeidae', 'edit', 'prey']
