{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "import os\n", "import requests\n", "from bs4 import BeautifulSoup\n", "import re\n", "import validators.url as urlvalid\n", "\n", "#Helper for get text, iterates through parents of an html tag, to see whether it should be filtered.\n", "def has_excluded_parent(tag, exclude_tags):\n", " parent = tag.parent\n", " while parent.name != 'html':\n", " if parent.name in exclude_tags:\n", " return True\n", " parent = parent.parent\n", " return False\n", "\n", "#Retrieve text, restricted to certain tabs\n", "def get_text(soup):\n", " target_tags = {'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'} \n", " exclude_tags = {'header', 'nav', 'footer'} \n", " text_list = [] \n", " for tag in soup.find_all(target_tags):\n", " if not has_excluded_parent(tag, exclude_tags): \n", " text_list.append(tag.get_text()) \n", " return text_list \n", "\n", "# Find all images on the webpage\n", "def get_images(soup):\n", " images = soup.find_all('img')\n", " # Find all elements with a style attribute that contains 'background-image'\n", " background_images = soup.find_all(style=re.compile('background-image'))\n", " # Check each image\n", " imagelist=[]\n", " for img in images:\n", " img_url = img.get('src')\n", " # Skip if the image URL is empty or None\n", " if(img_url and not (urlvalid(img_url))):\n", " img_url = f'https:{img_url}'\n", " if not img_url or not(urlvalid(img_url)):\n", " print(\"Invalid image url\",img_url)\n", " continue\n", " # Check if the image is likely a logo or icon based on its size\n", " width = img.get('width')\n", " height = img.get('height')\n", " if width and height:\n", " if int(width) < 100 and int(height) < 100:\n", " #print(f\"Skipping {img_url} as it's likely a logo or icon\")\n", " continue\n", " \n", " # Check if the image is likely a logo or icon based on its URL\n", " if 'logo' in img_url.lower() or 'icon' in img_url.lower():\n", " #print(f\"Skipping {img_url} as it's likely a logo or icon\")\n", " continue\n", " \n", " # Check if the image is an SVG\n", " if img_url.lower().endswith('.svg'):\n", " #print(f\"Skipping {img_url} as it's an SVG\")\n", " continue\n", " imagelist.append(img_url)\n", " \n", " # Check each background_image\n", " for elem in background_images:\n", " style = elem.get('style')\n", " match = re.search(r'background-image\\s*:\\s*url\\(([^)]+)\\)', style)\n", " if match:\n", " img_url = match.group(1).strip('\"\\'')\n", " \n", " # Check if the image is likely a logo or icon based on its size\n", " width_match = re.search(r'width\\s*:\\s*(\\d+)px', style)\n", " height_match = re.search(r'height\\s*:\\s*(\\d+)px', style)\n", " if width_match and height_match:\n", " width = int(width_match.group(1))\n", " height = int(height_match.group(1))\n", " if width < 100 and height < 100:\n", " #print(f\"Skipping {img_url} as it's likely a logo or icon\")\n", " continue\n", " \n", " # Check if the image is likely a logo or icon based on its URL\n", " if 'logo' in img_url.lower() or 'icon' in img_url.lower() or not(urlvalid(img_url)):\n", " #print(f\"Skipping {img_url} as it's likely a logo or icon\")\n", " continue\n", " \n", " # Check if the image is an SVG\n", " if img_url.lower().endswith('.svg'):\n", " #print(f\"Skipping {img_url} as it's an SVG\")\n", " continue\n", " imagelist.append(img_url)\n", " return imagelist\n", "\n", "def scrapePage(req:dict):\n", " # Send a GET request\n", " headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}\n", " if(not urlvalid(req[\"url\"])):\n", " return {\"error\": \"scraping.py: url is not recognized as a valid url.\"}\n", " try:\n", " response = requests.get(req[\"url\"],headers=headers)\n", " response.raise_for_status() \n", " except requests.exceptions.RequestException as e: \n", " return {\"error\":\"scraping.py: request error\",\"message\":e}\n", " res = {}\n", " # If the GET request is successful, the status code will be 200\n", " if response.status_code == 200:\n", " # Get the content of the response\n", " page_content = response.content\n", " # Create a BeautifulSoup object and specify the parser\n", " soup = BeautifulSoup(page_content, 'html.parser')\n", " if(req[\"use_images\"]):\n", " res[\"images\"]=get_images(soup)\n", " if(req[\"use_text\"]):\n", " res[\"text\"]=get_text(soup)\n", " return res\n", " else:\n", " return{\"error\":\"scraping.py: webpage could not be loaded\"}\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTMzNTo2MDU=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjAzIDY3IiB3aWR0aD0iMjAzIiBoZWlnaHQ9IjY3IiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTMzODo2MTA=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTcwIDU4IiB3aWR0aD0iMTcwIiBoZWlnaHQ9IjU4IiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTM0MTo2MDQ=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjAzIDY3IiB3aWR0aD0iMjAzIiBoZWlnaHQ9IjY3IiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ0MjoxNzU5-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTU3MiA4NzYiIHdpZHRoPSIxNTcyIiBoZWlnaHQ9Ijg3NiIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ1NToxNDM4-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA2NjciIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjY2NyIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ1ODoxMjgw-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA2NjciIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjY2NyIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ2ODoxMTk1-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA2NjciIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjY2NyIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ3MjoxMDc1-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA2MDQiIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjYwNCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ3NjoxNTQy-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgODEwIDgwNiIgd2lkdGg9IjgxMCIgaGVpZ2h0PSI4MDYiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ4NToxMzI2-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA2NDYiIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjY0NiIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ4OToxMjMz-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTg5OCA4NzUiIHdpZHRoPSIxODk4IiBoZWlnaHQ9Ijg3NSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ5MzoxNjQ5-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA2MzUiIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjYzNSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTUxMjoxMjg0-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA1MzYiIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjUzNiIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTUxNjoxMzY4-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTYyNyA4MTQiIHdpZHRoPSIxNjI3IiBoZWlnaHQ9IjgxNCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTUyMToxMjQ4-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTUyMiA4MTciIHdpZHRoPSIxNTIyIiBoZWlnaHQ9IjgxNyIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTUzMToxNDg3-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTU0NSA4NjUiIHdpZHRoPSIxNTQ1IiBoZWlnaHQ9Ijg2NSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTUzOTo0MjQ=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjQwIDI0MCIgd2lkdGg9IjI0MCIgaGVpZ2h0PSIyNDAiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTU3MDo1MDA=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgNTAwIDM4MyIgd2lkdGg9IjUwMCIgaGVpZ2h0PSIzODMiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTYwODo1NTA=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgNTAwIDM4MyIgd2lkdGg9IjUwMCIgaGVpZ2h0PSIzODMiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTY0Njo1MDg=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgNTAwIDM4MyIgd2lkdGg9IjUwMCIgaGVpZ2h0PSIzODMiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTY4NDo1MjY=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgNTAwIDM4MyIgd2lkdGg9IjUwMCIgaGVpZ2h0PSIzODMiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTk0OTo3OTQ=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjAzIDY3IiB3aWR0aD0iMjAzIiBoZWlnaHQ9IjY3IiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MjAwNzoxMjMx-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMzAwIDEzOSIgd2lkdGg9IjMwMCIgaGVpZ2h0PSIxMzkiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MjAwNzoyNDM3-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTUwIDQwIiB3aWR0aD0iMTUwIiBoZWlnaHQ9IjQwIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\n" ] }, { "data": { "text/plain": [ "{'images': ['https://cdn-agiod.nitrocdn.com/IzoObPRaJTXqmzxBrypHgZRGhBszRtaj/assets/images/optimized/rev-32e7c69/brandlume.com/wp-content/uploads/2023/04/how-to-make-your-website-stand-out.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/how-to-make-your-website-stand-out.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/The-Importance-of-Having-a-Standout-Website-1.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/How-to-Make-Your-Business-Website-Stand-Out.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Select-the-Ideal-Template.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Enhance-User-Experience.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Typography.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Create-High-Quality-Content-1.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Maintain-Fresh-Website-Content.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Bios.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Demonstrate-Engaging-and-Relatable-Video-Content.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Showcase-Examples-and-Metrics.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Website-Menus.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/case-studies.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Stand-Out-from-the-Crowd.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2022/04/responsive-web-concept-500x383.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/05/Questions-to-Ask-a-Web-Design-Company-500x383.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Error-404-poor-website-500x383.png',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Website-Dos-and-Donts-500x383.png',\n", " 'https://brandlume.com/wp-content/uploads/2017/01/featured.png']}" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "exampleReq4 = {\"url\": \"https://brandlume.com/12-proven-ways-to-make-your-website-stand-out/\",\"use_images\": True,\"use_text\":False,\"num_images\":1,\"page\": 0,\"num_keywords_text\": 10,\"num_keywords_images\": 10,\"num_query_keywords\":5,\"result_images\":24}\n", "scrapePage(exampleReq4)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "joined_string = \" \".join(scraped_text)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "