{ "cells": [ { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'images': ['https://assets-global.website-files.com/64f6fcb22fc6650621823722/6604669ba540f869e121c7b0_PNG-500_h-single_full.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/64f8792a64b8b84068d0968c_Codehouse-removebg-preview.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/6604669ba540f869e121c7b0_PNG-500_h-single_full.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/64f8792a64b8b84068d0968c_Codehouse-removebg-preview.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/6604669ba540f869e121c7b0_PNG-500_h-single_full.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/64f8792a64b8b84068d0968c_Codehouse-removebg-preview.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/6604669ba540f869e121c7b0_PNG-500_h-single_full.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/64f8792a64b8b84068d0968c_Codehouse-removebg-preview.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/6604669ba540f869e121c7b0_PNG-500_h-single_full.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/64f8792a64b8b84068d0968c_Codehouse-removebg-preview.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/6604669ba540f869e121c7b0_PNG-500_h-single_full.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/64f8792a64b8b84068d0968c_Codehouse-removebg-preview.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/6604669ba540f869e121c7b0_PNG-500_h-single_full.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/64f8792a64b8b84068d0968c_Codehouse-removebg-preview.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/6604669ba540f869e121c7b0_PNG-500_h-single_full.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/64f8792a64b8b84068d0968c_Codehouse-removebg-preview.png', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/651488cfeb9835540251622a_drew_schwartz.jpeg', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/651488a8a9e33114581e251d_jas_espinosa.jpeg', 'https://assets-global.website-files.com/64f6fcb22fc6650621823722/6514887e5ff4e3a188f0f6e6_estelle-freedman_profilephoto.jpg', 'https://assets-global.website-files.com/65045b272270a1f11782fa52/660460830916d33cb1917ead_RenewScale-hero.jpg', 'https://assets-global.website-files.com/65045b272270a1f11782fa52/65b15e79a3fdcd7102133a2e_PDC-hero.jpg', 'https://assets-global.website-files.com/65045b272270a1f11782fa52/651dec8e708d32af7fbcb815_collegeF-hero.jpg', 'https://assets-global.website-files.com/65045b272270a1f11782fa52/651ded7f0c43865b9b116c3d_piggyback-hero.jpg'], 'text': ['Cusdom Agency', 'We help businesses make', 'Websites', 'Wireframes', 'MVPs', 'Apps', 'Websites', 'We help small businesses, startups, and non-profits make clean, good looking, and maintainable websites, mobile apps, and SAAS products.', 'What We Specialize In', 'Development', 'We are dedicated to delivering exceptional solutions that cater to the needs of our clients. Our expertise spans two key domains: crafting cutting-edge Webflow websites and architecting robust full-stack projects. With a user-centric design philosophy, we take pride in merging functionality and aesthetics to create digital experiences that leave a lasting impact. Discover the difference Cusdom can make for your business today.', 'Design', \"Whether it's crafting visually stunning user interfaces for Webflow websites or designing intuitive user experiences for web or mobile applications, our design team is your creative partner from concept to completion. We pride ourselves on infusing every project with a distinct visual identity that captures your brand's essence. When you choose Cusdom, you choose design that resonates, captivates, and elevates your online presence.\", 'Development', 'Design', 'We are dedicated to delivering exceptional solutions that cater to the needs of our clients. Our expertise spans two key domains: crafting cutting-edge Webflow websites and architecting robust full-stack projects. With a user-centric design philosophy, we take pride in merging functionality and aesthetics to create digital experiences that leave a lasting impact. Discover the difference Cusdom can make for your business today.', \"Whether it's crafting visually stunning user interfaces for Webflow websites or designing intuitive user experiences for web or mobile applications, our design team is your creative partner from concept to completion. We pride ourselves on infusing every project with a distinct visual identity that captures your brand's essence. When you choose Cusdom, you choose design that resonates, captivates, and elevates your online presence.\", '...and more', 'Our Past Clients', '\"What sets them apart is their ability to solve complex problems related to tech and business growth.\"', 'Their approach and expertise have proved vital and timely for our growth. What sets them apart is their ability to solve complex problems related to tech and business growth. While maintaining a sense of urgency, their clear communication and methodical approach to ensuring long-term goals has helped keep our customers happy as we grow.', 'Our Work', 'RenewScale', 'Branding and web design for an up-and-coming cleantech company.', 'RenewScale', 'Branding and web design for an up-and-coming cleantech company.', 'PDC Strategy', 'A rebrand and web redesign project for a consulting firm in the pharmaceutical industry.', 'PDC Strategy', 'A rebrand and web redesign project for a consulting firm in the pharmaceutical industry.', 'College Fans', 'Full-stack MVP for a startup that wanted to create a bidding site where college athletes would auction off signed memorabilia. ', 'College Fans', 'Full-stack MVP for a startup that wanted to create a bidding site where college athletes would auction off signed memorabilia. ', 'Piggyback', 'Custom full-stack web app built with React that helps kids with their reading comprehension.', 'Piggyback', 'Custom full-stack web app built with React that helps kids with their reading comprehension.', '\"GoCusdom delivered a quality product in a fraction of the time and cost.\"', 'GoCusdom delivered a quality product in a fraction of the time and cost. They took into consideration our non-profit budget and offered feasible options that still met our operational needs and ideal aesthetic. We were most impressed with their attention to detail, thoughtful feedback sessions, and rapid response times. All feedback was diligently implemented and we ended up with a website that exceeded our expectations. From start to finish, GoCusdom made the process seamless for us. Highly recommend them!', 'Get In touch', 'Partner with us!', 'Fill out this form or email us at contact@gocusdom.com to get in touch!', '\"Working with Cusdom Agency made designing and implementing a web app user interface more rewarding than I could have imagined.\"', 'Their technical, communication, and interpersonal skills -- including open-mindedness and the ability to work productively with a team, and within deadline and budget projections – exceeded even the glowing recommendation that led me to them.']}\n" ] } ], "source": [ "import os\n", "import requests\n", "from bs4 import BeautifulSoup\n", "import re\n", "\n", "\n", "def has_excluded_parent(tag, exclude_tags):\n", " parent = tag.parent\n", " while parent.name != 'html':\n", " if parent.name in exclude_tags:\n", " return True\n", " parent = parent.parent\n", " return False\n", "\n", "def get_text(soup):\n", " target_tags = {'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'} \n", " exclude_tags = {'header', 'nav', 'footer'} \n", " text_list = [] \n", " for tag in soup.find_all(target_tags):\n", " if not has_excluded_parent(tag, exclude_tags): \n", " text_list.append(tag.get_text()) \n", " return text_list \n", "\n", "def get_images(soup):\n", " # Find all images on the webpage\n", " images = soup.find_all('img')\n", " # Find all elements with a style attribute that contains 'background-image'\n", " background_images = soup.find_all(style=re.compile('background-image'))\n", " # Download each image\n", " imagelist=[]\n", " for img in images:\n", " img_url = img.get('src')\n", " # Skip if the image URL is empty or None\n", " if not img_url:\n", " continue\n", " # Check if the image is likely a logo or icon based on its size\n", " width = img.get('width')\n", " height = img.get('height')\n", " if width and height:\n", " if int(width) < 100 and int(height) < 100:\n", " #print(f\"Skipping {img_url} as it's likely a logo or icon\")\n", " continue\n", " \n", " # Check if the image is likely a logo or icon based on its URL\n", " if 'logo' in img_url.lower() or 'icon' in img_url.lower():\n", " #print(f\"Skipping {img_url} as it's likely a logo or icon\")\n", " continue\n", " \n", " # Check if the image is an SVG\n", " if img_url.lower().endswith('.svg'):\n", " #print(f\"Skipping {img_url} as it's an SVG\")\n", " continue\n", " imagelist.append(img_url)\n", " \n", " # Download each background image\n", " for elem in background_images:\n", " style = elem.get('style')\n", " match = re.search(r'background-image\\s*:\\s*url\\(([^)]+)\\)', style)\n", " if match:\n", " img_url = match.group(1).strip('\"\\'')\n", " \n", " # Check if the image is likely a logo or icon based on its size\n", " width_match = re.search(r'width\\s*:\\s*(\\d+)px', style)\n", " height_match = re.search(r'height\\s*:\\s*(\\d+)px', style)\n", " if width_match and height_match:\n", " width = int(width_match.group(1))\n", " height = int(height_match.group(1))\n", " if width < 100 and height < 100:\n", " #print(f\"Skipping {img_url} as it's likely a logo or icon\")\n", " continue\n", " \n", " # Check if the image is likely a logo or icon based on its URL\n", " if 'logo' in img_url.lower() or 'icon' in img_url.lower():\n", " #print(f\"Skipping {img_url} as it's likely a logo or icon\")\n", " continue\n", " \n", " # Check if the image is an SVG\n", " if img_url.lower().endswith('.svg'):\n", " #print(f\"Skipping {img_url} as it's an SVG\")\n", " continue\n", " imagelist.append(img_url)\n", " return imagelist\n", "\n", "def scrapePage(url:str,scrapeImages:bool=True, scrapeText:bool=True):\n", " # Send a GET request\n", " response = requests.get(url)\n", " res = {}\n", " # If the GET request is successful, the status code will be 200\n", " if response.status_code == 200:\n", " # Get the content of the response\n", " page_content = response.content\n", " # Create a BeautifulSoup object and specify the parser\n", " soup = BeautifulSoup(page_content, 'html.parser')\n", " if(scrapeImages):\n", " res[\"images\"]=get_images(soup)\n", " if(scrapeText):\n", " res[\"text\"]=get_text(soup)\n", " return res\n", " else:\n", " return{\"error\":\"scraping.py: webpage could not be loaded\"}\n", "\n", "# Example usage\n", "url = \"http://gocusdom.com\"\n", "res = scrapePage(url)\n", "print(res)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "sneakpic", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 2 }