{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "import os\n", "import requests\n", "from bs4 import BeautifulSoup\n", "import re\n", "import validators.url as urlvalid\n", "\n", "#Helper for get text, iterates through parents of an html tag, to see whether it should be filtered.\n", "def has_excluded_parent(tag, exclude_tags):\n", " parent = tag.parent\n", " while parent.name != 'html':\n", " if parent.name in exclude_tags:\n", " return True\n", " parent = parent.parent\n", " return False\n", "\n", "#Retrieve text, restricted to certain tabs\n", "def get_text(soup):\n", " target_tags = {'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'} \n", " exclude_tags = {'header', 'nav', 'footer'} \n", " text_list = [] \n", " for tag in soup.find_all(target_tags):\n", " if not has_excluded_parent(tag, exclude_tags): \n", " text_list.append(tag.get_text()) \n", " return text_list \n", "\n", "# Find all images on the webpage\n", "def get_images(soup):\n", " images = soup.find_all('img')\n", " # Find all elements with a style attribute that contains 'background-image'\n", " background_images = soup.find_all(style=re.compile('background-image'))\n", " # Check each image\n", " imagelist=[]\n", " for img in images:\n", " img_url = img.get('src')\n", " # Skip if the image URL is empty or None\n", " if(img_url and not (urlvalid(img_url))):\n", " img_url = f'https:{img_url}'\n", " if not img_url or not(urlvalid(img_url)):\n", " print(\"Invalid image url\",img_url)\n", " continue\n", " # Check if the image is likely a logo or icon based on its size\n", " width = img.get('width')\n", " height = img.get('height')\n", " if width and height:\n", " if int(width) < 100 and int(height) < 100:\n", " #print(f\"Skipping {img_url} as it's likely a logo or icon\")\n", " continue\n", " \n", " # Check if the image is likely a logo or icon based on its URL\n", " if 'logo' in img_url.lower() or 'icon' in img_url.lower():\n", " #print(f\"Skipping {img_url} as it's likely a logo or icon\")\n", " continue\n", " \n", " # Check if the image is an SVG\n", " if img_url.lower().endswith('.svg'):\n", " #print(f\"Skipping {img_url} as it's an SVG\")\n", " continue\n", " imagelist.append(img_url)\n", " \n", " # Check each background_image\n", " for elem in background_images:\n", " style = elem.get('style')\n", " match = re.search(r'background-image\\s*:\\s*url\\(([^)]+)\\)', style)\n", " if match:\n", " img_url = match.group(1).strip('\"\\'')\n", " \n", " # Check if the image is likely a logo or icon based on its size\n", " width_match = re.search(r'width\\s*:\\s*(\\d+)px', style)\n", " height_match = re.search(r'height\\s*:\\s*(\\d+)px', style)\n", " if width_match and height_match:\n", " width = int(width_match.group(1))\n", " height = int(height_match.group(1))\n", " if width < 100 and height < 100:\n", " #print(f\"Skipping {img_url} as it's likely a logo or icon\")\n", " continue\n", " \n", " # Check if the image is likely a logo or icon based on its URL\n", " if 'logo' in img_url.lower() or 'icon' in img_url.lower() or not(urlvalid(img_url)):\n", " #print(f\"Skipping {img_url} as it's likely a logo or icon\")\n", " continue\n", " \n", " # Check if the image is an SVG\n", " if img_url.lower().endswith('.svg'):\n", " #print(f\"Skipping {img_url} as it's an SVG\")\n", " continue\n", " imagelist.append(img_url)\n", " return imagelist\n", "\n", "def scrapePage(req:dict):\n", " # Send a GET request\n", " headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}\n", " if(not urlvalid(req[\"url\"])):\n", " return {\"error\": \"scraping.py: url is not recognized as a valid url.\"}\n", " try:\n", " response = requests.get(req[\"url\"],headers=headers)\n", " response.raise_for_status() \n", " except requests.exceptions.RequestException as e: \n", " return {\"error\":\"scraping.py: request error\",\"message\":e}\n", " res = {}\n", " # If the GET request is successful, the status code will be 200\n", " if response.status_code == 200:\n", " # Get the content of the response\n", " page_content = response.content\n", " # Create a BeautifulSoup object and specify the parser\n", " soup = BeautifulSoup(page_content, 'html.parser')\n", " if(req[\"use_images\"]):\n", " res[\"images\"]=get_images(soup)\n", " if(req[\"use_text\"]):\n", " res[\"text\"]=get_text(soup)\n", " return res\n", " else:\n", " return{\"error\":\"scraping.py: webpage could not be loaded\"}\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTMzNTo2MDU=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjAzIDY3IiB3aWR0aD0iMjAzIiBoZWlnaHQ9IjY3IiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTMzODo2MTA=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTcwIDU4IiB3aWR0aD0iMTcwIiBoZWlnaHQ9IjU4IiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTM0MTo2MDQ=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjAzIDY3IiB3aWR0aD0iMjAzIiBoZWlnaHQ9IjY3IiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ0MjoxNzU5-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTU3MiA4NzYiIHdpZHRoPSIxNTcyIiBoZWlnaHQ9Ijg3NiIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ1NToxNDM4-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA2NjciIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjY2NyIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ1ODoxMjgw-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA2NjciIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjY2NyIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ2ODoxMTk1-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA2NjciIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjY2NyIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ3MjoxMDc1-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA2MDQiIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjYwNCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ3NjoxNTQy-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgODEwIDgwNiIgd2lkdGg9IjgxMCIgaGVpZ2h0PSI4MDYiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ4NToxMzI2-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA2NDYiIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjY0NiIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ4OToxMjMz-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTg5OCA4NzUiIHdpZHRoPSIxODk4IiBoZWlnaHQ9Ijg3NSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ5MzoxNjQ5-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA2MzUiIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjYzNSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTUxMjoxMjg0-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA1MzYiIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjUzNiIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTUxNjoxMzY4-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTYyNyA4MTQiIHdpZHRoPSIxNjI3IiBoZWlnaHQ9IjgxNCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTUyMToxMjQ4-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTUyMiA4MTciIHdpZHRoPSIxNTIyIiBoZWlnaHQ9IjgxNyIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTUzMToxNDg3-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTU0NSA4NjUiIHdpZHRoPSIxNTQ1IiBoZWlnaHQ9Ijg2NSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTUzOTo0MjQ=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjQwIDI0MCIgd2lkdGg9IjI0MCIgaGVpZ2h0PSIyNDAiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTU3MDo1MDA=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgNTAwIDM4MyIgd2lkdGg9IjUwMCIgaGVpZ2h0PSIzODMiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTYwODo1NTA=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgNTAwIDM4MyIgd2lkdGg9IjUwMCIgaGVpZ2h0PSIzODMiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTY0Njo1MDg=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgNTAwIDM4MyIgd2lkdGg9IjUwMCIgaGVpZ2h0PSIzODMiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTY4NDo1MjY=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgNTAwIDM4MyIgd2lkdGg9IjUwMCIgaGVpZ2h0PSIzODMiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTk0OTo3OTQ=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjAzIDY3IiB3aWR0aD0iMjAzIiBoZWlnaHQ9IjY3IiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MjAwNzoxMjMx-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMzAwIDEzOSIgd2lkdGg9IjMwMCIgaGVpZ2h0PSIxMzkiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n", "Invalid image url https:data:image/svg+xml;nitro-empty-id=MjAwNzoyNDM3-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTUwIDQwIiB3aWR0aD0iMTUwIiBoZWlnaHQ9IjQwIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\n" ] }, { "data": { "text/plain": [ "{'images': ['https://cdn-agiod.nitrocdn.com/IzoObPRaJTXqmzxBrypHgZRGhBszRtaj/assets/images/optimized/rev-32e7c69/brandlume.com/wp-content/uploads/2023/04/how-to-make-your-website-stand-out.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/how-to-make-your-website-stand-out.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/The-Importance-of-Having-a-Standout-Website-1.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/How-to-Make-Your-Business-Website-Stand-Out.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Select-the-Ideal-Template.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Enhance-User-Experience.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Typography.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Create-High-Quality-Content-1.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Maintain-Fresh-Website-Content.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Bios.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Demonstrate-Engaging-and-Relatable-Video-Content.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Showcase-Examples-and-Metrics.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Website-Menus.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/case-studies.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Stand-Out-from-the-Crowd.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2022/04/responsive-web-concept-500x383.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/05/Questions-to-Ask-a-Web-Design-Company-500x383.jpg',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Error-404-poor-website-500x383.png',\n", " 'https://brandlume.com/wp-content/uploads/2023/04/Website-Dos-and-Donts-500x383.png',\n", " 'https://brandlume.com/wp-content/uploads/2017/01/featured.png']}" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "exampleReq4 = {\"url\": \"https://brandlume.com/12-proven-ways-to-make-your-website-stand-out/\",\"use_images\": True,\"use_text\":False,\"num_images\":1,\"page\": 0,\"num_keywords_text\": 10,\"num_keywords_images\": 10,\"num_query_keywords\":5,\"result_images\":24}\n", "scrapePage(exampleReq4)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "joined_string = \" \".join(scraped_text)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "

Rake based

" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to\n", "[nltk_data] d:\\Programms\\Anaconda\\envs\\sneakpic\\lib\\nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package stopwords to\n", "[nltk_data] d:\\Programms\\Anaconda\\envs\\sneakpic\\lib\\nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "data": { "text/plain": [ "[(275.00490720798564,\n", " 'billed heron subfamily agamiinae genus agamia – agami heron genus agamia – agami heron subfamily botaurinae genus zebrilus – zigzag heron genus ixobrychus – small bitterns'),\n", " (249.06392694063925,\n", " '148747 france bnf data germany israel united states japan czech republic ardeidae herons extant paleocene first appearances taxa named'),\n", " (125.0750536284169,\n", " 'genus pilherodius – capped heron genus zonerodius – forest bittern genus ardeola – pond herons'),\n", " (125.0750536284169,\n", " 'genus pilherodius – capped heron genus zonerodius – forest bittern genus ardeola – pond herons'),\n", " (112.95397300068352,\n", " 'billed heron agamia – agami heron zebrilus – zigzag heron botaurus – bitterns'),\n", " (85.98584252505485,\n", " 'pilherodius – capped heron syrigma – whistling heron egretta – herons'),\n", " (72.83612300476749,\n", " 'genus zebrilus – zigzag heron genus ixobrychus – small bitterns'),\n", " (67.35917020610415,\n", " 'genus syrigma – whistling heron genus egretta – typical egrets'),\n", " (67.35917020610415,\n", " 'genus syrigma – whistling heron genus egretta – typical egrets'),\n", " (58.909716776933756,\n", " 'crested tiger heron subfamily cochleariinae genus cochlearius – boat'),\n", " (54.388888888888886,\n", " 'william elford leach webarchive template wayback links articles'),\n", " (40.388888888888886,\n", " 'short description short description matches wikidata articles'),\n", " (37.45614534836233, 'billed heron genus cochlearius – boat'),\n", " (37.09006330427564, 'crested tiger heron tigrisoma – tiger herons'),\n", " (34.8699279888573, 'genus tigrisoma – typical tiger herons'),\n", " (34.8699279888573, 'genus tigrisoma – typical tiger herons'),\n", " (33.624689893619205, 'genus nycticorax – typical night herons'),\n", " (33.624689893619205, 'genus nycticorax – typical night herons'),\n", " (33.42857142857143, 'nkc identifiers toggle limited content width'),\n", " (31.8625, 'nine ), 15 – 20 secondaries'),\n", " (31.08024544917476, 'genus nyctanassa – american night herons'),\n", " (31.08024544917476, 'genus nyctanassa – american night herons'),\n", " (30.211556603773584, 'genus pikaihao – saint bathan'),\n", " (30.211556603773584, 'genus pikaihao – saint bathan'),\n", " (28.966453447050466, 'many species also opportunistically take larger prey'),\n", " (28.759760273972603, 'ardeola – pond herons'),\n", " (28.529359634076613, 'crested tiger heron genus taphophoyx'),\n", " (27.813310989738497, 'genus botaurus – large bitterns'),\n", " (27.813310989738497, 'genus botaurus – large bitterns'),\n", " (27.054807692307694, '5884 ): 1763 – 1768'),\n", " (27.046026300743282, 'genus undetermined easter island heron'),\n", " (27.046026300743282, 'genus undetermined easter island heron'),\n", " (26.90174617067548, 'genus ardea – typical herons'),\n", " (26.90174617067548, 'genus ardea – typical herons'),\n", " (26.333333333333336, 'active feeding behaviours include foot stirring'),\n", " (25.954807692307693, '3 ): 672 – 679'),\n", " (25.954807692307693, '3 ): 569 – 571'),\n", " (25.954807692307693, '3 ): 471 – 472'),\n", " (25.954807692307693, '3 ): 450 – 452'),\n", " (25.954807692307693, '3 ): 441 – 442'),\n", " (25.954807692307693, '3 ): 437 – 450'),\n", " (25.8625, 'measures 25 – 30 cm'),\n", " (24.911950549450548, '1 ): 242 – 246'),\n", " (24.911950549450548, '1 ): 127 – 141'),\n", " (24.799056603773586, 'genus bubulcus – cattle egrets'),\n", " (24.799056603773586, 'genus bubulcus – cattle egrets'),\n", " (24.724056603773583, 'subfamily tigriornithinae genus taphophoyx'),\n", " (24.388141025641026, '2 ): 383 – 389'),\n", " (24.254807692307693, '4 ): 860 – 865'),\n", " (23.5, 'congress catalogue card number 76'),\n", " (23.5, '12th century ), earlier hairo'),\n", " (23.224056603773583, 'subfamily ardeinae genus zeltornis'),\n", " (23.166666666666668, 'distinguished ,[ 19 ][ 20'),\n", " (22.461556603773584, 'genus gorsachius – asian'),\n", " (22.461556603773584, 'genus gorsachius – asian'),\n", " (22.280952380952378, 'ioc world bird list version 13'),\n", " (22.28030303030303, 'agami heron'),\n", " (22.196969696969695, 'word heron first appeared'),\n", " (21.649056603773587, 'monotypic genus zebrilus'),\n", " (21.011556603773585, 'genus butorides – green'),\n", " (21.011556603773585, 'genus butorides – green'),\n", " (21.0, 'united states'),\n", " (21.0, 'czech republic'),\n", " (20.468688845401175, 'night herons could warrant separation'),\n", " (20.370647512864494, 'genus tigriornis – white'),\n", " (20.370647512864494, 'genus tigriornis – white'),\n", " (20.333333333333332, 'skull morphology reflect convergent evolution'),\n", " (20.25, 'international ornithological congress reclassified ardeidae'),\n", " (19.664522178734508, 'nycticorax – night herons'),\n", " (19.099056603773583, 'genus ardeola rather'),\n", " (18.897260273972602, 'reported observing female herons attaching'),\n", " (18.8625, 'transcends elements – earth'),\n", " (18.731188845401174, 'nyctanassa – night herons'),\n", " (18.3625, 'ardeagradis proardeola – possibly'),\n", " (18.357142857142858, 'egretta novaehollandiae ), demonstrating'),\n", " (18.317460317460316, 'bnf identifiers articles'),\n", " (18.30234962406015, 'ixobrychus – bitterns'),\n", " (17.831188845401176, 'calherodius – night herons'),\n", " (17.797260273972604, 'herons may use items already'),\n", " (17.581188845401176, 'gorsachius – night herons'),\n", " (17.42857142857143, 'small indian city \".'),\n", " (16.505357142857143, 'cochlearius – boat'),\n", " (16.428358208955224, 'colonies may contain several species'),\n", " (16.214285714285715, 'internet bird collection wikidata'),\n", " (16.0, 'watched repeatedly dropping seeds'),\n", " (15.880533445049574, 'zigzag heron ).'),\n", " (15.387445887445889, 'small green heron'),\n", " (15.333333333333334, 'seeking sexual gratification elsewhere'),\n", " (15.190858208955225, '11 – 17 species'),\n", " (15.190858208955225, '11 – 17 species'),\n", " (14.833333333333334, 'retrieved 19 september 2023'),\n", " (14.696969696969697, 'heronconservation heron specialist group'),\n", " (14.666666666666666, 'males arrive first'),\n", " (14.662286780383797, '7 – 13 species'),\n", " (14.662286780383797, '7 – 13 species'),\n", " (14.657142857142858, 'one breeding season per year'),\n", " (14.512820512820513, 'forest bittern'),\n", " (14.471428571428572, 'prey capture increased 3'),\n", " (14.420485175202156, 'genus ixobrychus'),\n", " (14.4, 'following large grazing animals'),\n", " (14.36568132660418, 'boatbill bitterns day herons'),\n", " (14.349056603773585, 'genus calherodius peters'),\n", " (14.302022178734507, 'contain two night herons'),\n", " (14.196969696969695, 'easter island heron'),\n", " (14.029166666666667, '20 – 21'),\n", " (14.0, 'sister taxa threskiornithidae'),\n", " (13.897260273972602, 'see text cochlearidae herons'),\n", " (13.849056603773585, 'respective genus accounts'),\n", " (13.833333333333334, 'oxford english dictionary describes'),\n", " (13.828358208955224, 'ioc lists 72 species'),\n", " (13.816017316017316, 'zigzag heron'),\n", " (13.816017316017316, 'zigzag heron'),\n", " (13.780303030303031, 'whistling heron'),\n", " (13.6875, 'vijay k .; kittur'),\n", " (13.666666666666666, 'english language around 1300'),\n", " (13.666666666666666, 'displays involve visual cues'),\n", " (13.559760273972602, 'butorides – herons'),\n", " (13.520833333333334, 'frederick h .; jones'),\n", " (13.43030303030303, 'throated tiger heron'),\n", " (13.3625, '376 – 403'),\n", " (13.3625, '1566 – 1625'),\n", " (13.333333333333334, 'neck area may swell'),\n", " (13.1875, 'clare e .; mccracken'),\n", " (13.166666666666666, 'nesting intensity varies throughout'),\n", " (13.1017316017316, 'billed heron'),\n", " (13.1017316017316, 'billed heron'),\n", " (13.1017316017316, 'billed heron'),\n", " (13.1017316017316, 'billed heron'),\n", " (13.066666666666666, 'uses erectile neck feathers'),\n", " (12.941578455790783, 'ardea – herons'),\n", " (12.8, 'zebrilus undulatus'),\n", " (12.722222222222221, 'three forward pointing ones'),\n", " (12.682389937106919, 'genus botaurus'),\n", " (12.666666666666666, 'zonerodius heliosylus'),\n", " (12.631944444444445, 'kevin g .; stuebing'),\n", " (12.52159090909091, 'tigriornis – white'),\n", " (12.520833333333334, 'frederick h .; mccracken'),\n", " (12.5, 'named differently'),\n", " (12.5, 'j k l martínez'),\n", " (12.375, 'subfamily tigriornithinae'),\n", " (12.375, 'subfamily ardeinae'),\n", " (12.321428571428571, 'prey comes within range'),\n", " (12.166666666666666, 'light colour morphs exist'),\n", " (12.166666666666666, '18 extant'),\n", " (12.131868131868133, 'zigzag bittern'),\n", " (12.1017316017316, 'eared night heron'),\n", " (12.1017316017316, 'crowned night heron'),\n", " (11.857142857142858, 'egretta sacra )\"'),\n", " (11.683333333333334, 'may feed far away'),\n", " (11.647619047619047, 'reed beds may nest'),\n", " (11.566666666666666, 'london editions isbn 0'),\n", " (11.542643923240938, 'one species formerly considered'),\n", " (11.530303030303031, 'russia squacco heron'),\n", " (11.530303030303031, 'pacific reef heron'),\n", " (11.530303030303031, 'osteological heron phylogenies'),\n", " (11.530303030303031, 'iucn heron videos'),\n", " (11.530303030303031, 'eastern reef heron'),\n", " (11.5, '11th century ),'),\n", " (11.375, 'subfamily nycticoracinae'),\n", " (11.357142857142858, '1 external links'),\n", " (11.301731601731603, 'backed night heron'),\n", " (11.196969696969697, 'heron symbolizes purity'),\n", " (11.195833333333333, '10 – 12'),\n", " (10.897260273972602, 'although herons resemble birds'),\n", " (10.849056603773585, 'genus zeltornis'),\n", " (10.8, 'waterbirds use different sites'),\n", " (10.777777777777777, 'e f g h'),\n", " (10.666666666666668, 'courtship usually takes part'),\n", " (10.5, 'native american culture'),\n", " (10.5, 'include adopting postures'),\n", " (10.5, 'american midland naturalist'),\n", " (10.431944444444445, 'kevin g .; sheldon'),\n", " (10.357142857142858, 'small number'),\n", " (10.317460317460318, 'ndl identifiers articles'),\n", " (10.317460317460318, 'lccn identifiers articles'),\n", " (10.317460317460318, 'j9u identifiers articles'),\n", " (10.317460317460318, 'gnd identifiers articles'),\n", " (10.317460317460318, 'bnfdata identifiers articles'),\n", " (10.25, 'international ornithological committee'),\n", " (10.030874785591767, 'genus ardea'),\n", " (10.025641025641026, 'new ): 82e46850'),\n", " (10.0, 'studied taxa'),\n", " (10.0, 'english language unabridged'),\n", " (9.968688845401173, 'african night herons'),\n", " (9.968688845401173, 'african night herons'),\n", " (9.849056603773585, 'genus proardea'),\n", " (9.849056603773585, 'genus proardea'),\n", " (9.833333333333334, 'oxford english dictionary'),\n", " (9.8, 'use reed beds'),\n", " (9.666666666666666, 'colonies surveyed contained'),\n", " (9.596153846153847, 'crested bittern'),\n", " (9.5, 'ultraconserved elements resolve'),\n", " (9.497260273972602, 'larger day herons'),\n", " (9.4, 'sometimes includes nyctanassa'),\n", " (9.4, 'sometimes includes nyctanassa'),\n", " (9.4, 'many broadly overlap'),\n", " (9.397260273972602, 'herons eating acorns'),\n", " (9.397260273972602, 'herons ); differences'),\n", " (9.3625, 'rebirth –'),\n", " (9.3625, 'belgium –'),\n", " (9.333333333333334, 'oxford university press'),\n", " (9.333333333333334, 'morph varies geographically'),\n", " (9.25, 'within striking distance'),\n", " (9.25, 'third international dictionary'),\n", " (9.19047619047619, 'small groups'),\n", " (9.1875, 'h .; kaul'),\n", " (9.1875, 'cattle egrets improve'),\n", " (9.025641025641026, '2 ): ukad005'),\n", " (9.0, 'zagreb zoological garden'),\n", " (9.0, 'vegetable matter consumed'),\n", " (9.0, 'probably closely related'),\n", " (9.0, 'philip babcock gove'),\n", " (9.0, 'paleognath lithornis vulturinus'),\n", " (9.0, 'online etymology dictionary'),\n", " (9.0, 'old french heronçeau'),\n", " (9.0, 'old french hairon'),\n", " (9.0, 'croatia symbolic meaning'),\n", " (9.0, 'common hunting technique'),\n", " (9.0, 'clear consensus exists'),\n", " (9.0, 'b66d2f24e21b open tree'),\n", " (9.0, '609781 paleobiology database'),\n", " (9.0, '1ardef fauna europaea'),\n", " (9.0, '10741 fauna europaea'),\n", " (8.933333333333334, 'may walk slowly'),\n", " (8.916666666666666, 'monophyletic group within'),\n", " (8.904761904761905, '7 ][ 15'),\n", " (8.89502487562189, 'many species also'),\n", " (8.833333333333334, 'feeding grounds near'),\n", " (8.828358208955224, '72 recognised species'),\n", " (8.75, 'pond'),\n", " (8.714285714285715, 'one pointing backwards'),\n", " (8.666666666666666, 'oed also observes'),\n", " (8.666666666666666, 'different colour morphs'),\n", " (8.666666666666666, 'colonies commonly occur'),\n", " (8.666666666666666, 'bread ;[ 12'),\n", " (8.666666666666666, 'biologically distinct group'),\n", " (8.666666666666666, 'actively add items'),\n", " (8.666666666666666, '11 primary feathers'),\n", " (8.619047619047619, 'little sexual dimorphism'),\n", " (8.619047619047619, 'little sexual dimorphism'),\n", " (8.5875, 'c .; shakya'),\n", " (8.580593607305936, 'herons may feed'),\n", " (8.578358208955224, 'latitude within species'),\n", " (8.571428571428571, 'ixobrychus involucris'),\n", " (8.571428571428571, 'ixobrychus exilis'),\n", " (8.571428571428571, 'certain prey types'),\n", " (8.520833333333334, 'p .; mccracken'),\n", " (8.520833333333334, 'p .; holmes'),\n", " (8.520833333333334, 'including cattle egrets'),\n", " (8.5, 'temperate climates laying'),\n", " (8.5, 'derogatory terms meaning'),\n", " (8.428571428571429, 'western cattle egret'),\n", " (8.428571428571429, 'eastern cattle egret'),\n", " (8.380952380952381, 'bird symbolizes renewal'),\n", " (8.333333333333334, 'food usage patterns'),\n", " (8.328358208955224, 'eight living species'),\n", " (8.328358208955224, 'eight living species'),\n", " (8.314285714285715, 'one recently extinct'),\n", " (8.314285714285715, 'one recently extinct'),\n", " (8.314285714285715, 'one recently extinct'),\n", " (8.314285714285715, 'one recently extinct'),\n", " (8.313926940639268, 'tropical herons typically'),\n", " (8.297260273972602, 'tiger herons'),\n", " (8.297260273972602, 'tiger herons'),\n", " (8.297260273972602, 'tiger herons'),\n", " (8.254166666666666, 'c .; f'),\n", " (8.196969696969697, 'heron symbolizes'),\n", " (8.1875, 'j .; oliveros'),\n", " (8.166666666666666, 'migration typically occurs'),\n", " (8.161691542288558, 'two living species'),\n", " (8.161691542288558, 'two living species'),\n", " (8.101731601731602, 'heron \".'),\n", " (8.1, 'items used may'),\n", " (8.071428571428571, 'evolutionary history \".'),\n", " (8.066666666666666, 'row isbn 0'),\n", " (8.06392694063927, 'herons archived 2019'),\n", " (8.042643923240938, 'one living species'),\n", " (8.042643923240938, 'one living species'),\n", " (8.030303030303031, 'individual heron'),\n", " (8.0, 'syrigma'),\n", " (8.0, 'skeletal analyses focusing'),\n", " (8.0, 'separate monotypic family'),\n", " (8.0, 'highly mobile family'),\n", " (8.0, 'george frederic watts'),\n", " (8.0, 'extremely high mountains'),\n", " (8.0, 'ever present reminder'),\n", " (8.0, 'earliest temporally well'),\n", " (8.0, 'continents except antarctica'),\n", " (8.0, 'clutch size varies'),\n", " (8.0, '2008sci ... 320'),\n", " (7.928571428571429, 'eastern great egret'),\n", " (7.885714285714286, 'four recently extinct'),\n", " (7.885714285714286, 'four recently extinct'),\n", " (7.857142857142857, 'egretta garzetta'),\n", " (7.833333333333334, 'least partially migratory'),\n", " (7.8, 'partly colonial depending'),\n", " (7.768421052631579, 'large bitterns'),\n", " (7.761691542288558, 'individual species may'),\n", " (7.7214285714285715, 'bubulcus ibis \".'),\n", " (7.633333333333334, 'two major genera'),\n", " (7.6, 'another former name'),\n", " (7.555555555555555, 'three major groups'),\n", " (7.533333333333333, 'mostly solitary nesters'),\n", " (7.53030303030303, 'wounded heron'),\n", " (7.53030303030303, 'heron pictured'),\n", " (7.53030303030303, 'heron nuclear'),\n", " (7.53030303030303, 'headed heron'),\n", " (7.53030303030303, 'goliath heron'),\n", " (7.53030303030303, 'faced heron'),\n", " (7.53030303030303, 'egypt heron'),\n", " (7.5, 'order pelecaniformes instead'),\n", " (7.5, 'documented using bait'),\n", " (7.404761904761905, 'live aquatic prey'),\n", " (7.4, '2008 study suggests'),\n", " (7.4, '1971 isbn 0'),\n", " (7.3809523809523805, 'group nest colonially'),\n", " (7.333333333333334, 'usually found near'),\n", " (7.333333333333334, 'taxonomy published online'),\n", " (7.333333333333334, '1971 compact edition'),\n", " (7.33030303030303, 'grey heron'),\n", " (7.33030303030303, 'grey heron'),\n", " (7.33030303030303, 'grey heron'),\n", " (7.33030303030303, 'grey heron'),\n", " (7.25, 'tigrisoma mexicanum'),\n", " (7.2, 'molecular rate variation'),\n", " (7.166666666666667, 'new feeding areas'),\n", " (7.15, 'b c gill'),\n", " (7.114285714285714, 'one study found'),\n", " (7.1, 'molecular phylogenetic study'),\n", " (7.0, 'wayback machine'),\n", " (7.0, 'corruption took place'),\n", " (7.0, 'catching insects flushed'),\n", " (6.9950248756218905, 'whereas species living'),\n", " (6.9950248756218905, 'almost every species'),\n", " (6.968688845401174, 'night herons'),\n", " (6.968688845401174, 'night herons'),\n", " (6.968688845401174, 'night herons'),\n", " (6.968688845401174, 'night herons'),\n", " (6.966666666666666, 'immediately around water'),\n", " (6.933333333333334, 'birds may either'),\n", " (6.93030303030303, 'black heron'),\n", " (6.93030303030303, 'black heron'),\n", " (6.857142857142857, 'small'),\n", " (6.854166666666667, 'r .; post'),\n", " (6.833333333333333, 'botaurus stellaris'),\n", " (6.803030303030303, 'fossil heron'),\n", " (6.731182795698925, 'july 2023 ).'),\n", " (6.633333333333333, 'genera botaurus'),\n", " (6.571428571428571, 'ixobrychus'),\n", " (6.5, 'ardeidae could'),\n", " (6.432937181663837, 'bitterns ).'),\n", " (6.368421052631579, 'smaller bitterns'),\n", " (6.368421052631579, 'smaller bitterns'),\n", " (6.368421052631579, 'bitterns rather'),\n", " (6.346153846153847, 'great bittern'),\n", " (6.333333333333334, 'frederick h'),\n", " (6.228358208955224, '3 species'),\n", " (6.228358208955224, '3 species'),\n", " (6.1875, 'cattle egrets'),\n", " (6.1875, 'cattle egrets'),\n", " (6.1875, 'cattle egrets'),\n", " (6.17948717948718, 'least bittern'),\n", " (6.17948717948718, 'least bittern'),\n", " (6.131868131868131, 'little bittern'),\n", " (6.111111111111111, 'typical'),\n", " (6.064516129032258, 'ioc ).'),\n", " (6.046153846153846, 'backed bittern'),\n", " (5.997260273972603, 'day herons'),\n", " (5.997260273972603, 'day herons'),\n", " (5.997260273972603, 'day herons'),\n", " (5.888888888888889, 'microformats articles'),\n", " (5.880952380952381, 'also one'),\n", " (5.857142857142857, 'egretta'),\n", " (5.846153846153847, 'dwarf bittern'),\n", " (5.728358208955224, 'many species'),\n", " (5.728358208955224, 'many species'),\n", " (5.728358208955224, '13 species'),\n", " (5.597260273972603, 'backed herons'),\n", " (5.597260273972603, 'backed herons'),\n", " (5.597260273972603, 'backed herons'),\n", " (5.571428571428571, 'browned night'),\n", " (5.53030303030303, 'heron'),\n", " (5.53030303030303, 'heron'),\n", " (5.53030303030303, 'heron'),\n", " (5.53030303030303, 'heron'),\n", " (5.53030303030303, 'heron'),\n", " (5.53030303030303, 'heron'),\n", " (5.5, 'larger cycle'),\n", " (5.5, 'larger clutches'),\n", " (5.5, 'female risks'),\n", " (5.5, 'dictionary suggests'),\n", " (5.5, '152 cm'),\n", " (5.473214285714286, 'little egrets'),\n", " (5.473214285714286, 'little egrets'),\n", " (5.457142857142857, '1 extinct'),\n", " (5.457142857142857, '1 extinct'),\n", " (5.444444444444445, 'kevin g'),\n", " (5.428571428571429, 'cattle egret'),\n", " (5.4, 'bubulcus coromandus'),\n", " (5.397260273972602, 'striated herons'),\n", " (5.397260273972602, 'herons lay'),\n", " (5.397260273972602, 'herons )\".'),\n", " (5.333333333333334, 'aquatic animals'),\n", " (5.328358208955224, 'several species'),\n", " (5.25, 'tropical ones'),\n", " (5.25, 'tigriornis leucolophus'),\n", " (5.1875, '.; yahya'),\n", " (5.1875, '.; lavretsky'),\n", " (5.1875, '.; khan'),\n", " (5.166666666666666, 'also applied'),\n", " (5.161691542288557, 'two species'),\n", " (5.161691542288557, 'two species'),\n", " (5.15, 'bubulcus ibis'),\n", " (5.142857142857142, 'breeding sites'),\n", " (5.142857142857142, 'breeding season'),\n", " (5.0476190476190474, 'bird groups'),\n", " (5.0, 'several cities'),\n", " (5.0, 'ritual displays'),\n", " (5.0, 'rainy season'),\n", " (5.0, 'possibly attract'),\n", " (5.0, 'family ardeidae'),\n", " (5.0, 'family ardeidae'),\n", " (5.0, 'different challenges'),\n", " (5.0, 'description'),\n", " (5.0, 'compact edition'),\n", " (5.0, 'ardeidae gen'),\n", " (5.0, 'ardeidae gen'),\n", " (5.0, 'ardeidae gen'),\n", " (5.0, 'ardeidae bold'),\n", " (5.0, 'ardeidae afd'),\n", " (5.0, 'ardeidae adw'),\n", " (5.0, 'ardeidae )\".'),\n", " (4.933333333333334, 'may raise'),\n", " (4.909090909090909, 'white morphs'),\n", " (4.857142857142858, 'volume 1'),\n", " (4.833333333333334, 'usually considered'),\n", " (4.833333333333334, 'nighttime feeding'),\n", " (4.833333333333334, 'feeding behavior'),\n", " (4.828358208955224, 'temperate species'),\n", " (4.828358208955224, 'species live'),\n", " (4.800000000000001, '4 extinct'),\n", " (4.8, 'butorides virescens'),\n", " (4.756929637526652, 'egret species'),\n", " (4.75, 'international ornithologists'),\n", " (4.7142857142857135, 'little egret'),\n", " (4.7142857142857135, 'little egret'),\n", " (4.7, 'mostly migratory'),\n", " (4.666666666666667, 'july 2023'),\n", " (4.666666666666667, 'july 2023'),\n", " (4.666666666666666, 'typically placed'),\n", " (4.666666666666666, 'similar cues'),\n", " (4.666666666666666, 'nocturnal group'),\n", " (4.666666666666666, 'isbn 978'),\n", " (4.666666666666666, 'isbn 978'),\n", " (4.666666666666666, 'isbn 978'),\n", " (4.666666666666666, 'auditory cues'),\n", " (4.666666666666666, 'also known'),\n", " (4.666666666666666, '12 rectrices'),\n", " (4.661691542288557, 'solitary species'),\n", " (4.661691542288557, '2 species'),\n", " (4.661691542288557, '2 species'),\n", " (4.65, 'b c'),\n", " (4.614072494669509, 'four species'),\n", " (4.614072494669509, 'four species'),\n", " (4.614072494669509, 'four species'),\n", " (4.614072494669509, 'four species'),\n", " (4.578358208955224, 'tropical species'),\n", " (4.571428571428571, 'snatching prey'),\n", " (4.571428571428571, 'siege \".'),\n", " (4.571428571428571, 'seeing prey'),\n", " (4.571428571428571, 'pelicans \".'),\n", " (4.571428571428571, 'lure prey'),\n", " (4.571428571428571, 'kenya \".'),\n", " (4.571428571428571, 'hidden prey'),\n", " (4.571428571428571, 'frighten prey'),\n", " (4.564516129032258, '2008 ).'),\n", " (4.550580431177446, 'three species'),\n", " (4.550580431177446, 'three species'),\n", " (4.550580431177446, 'three species'),\n", " (4.550580431177446, 'three species'),\n", " (4.550580431177446, 'three species'),\n", " (4.528358208955224, '4 species'),\n", " (4.5, 'variously considered'),\n", " (4.5, 'using bait'),\n", " (4.5, 'success rate'),\n", " (4.5, 'previous order'),\n", " (4.5, 'placed high'),\n", " (4.5, 'oed speculates'),\n", " (4.5, 'held backwards'),\n", " (4.5, 'geographical variation'),\n", " (4.5, 'extremely fine'),\n", " (4.5, 'evolutionary adaptation'),\n", " (4.5, 'considered provisional'),\n", " (4.5, 'annual migration'),\n", " (4.4, 'phylogenomic study'),\n", " (4.397849462365592, '2023 ).'),\n", " (4.368421052631579, 'bitterns'),\n", " (4.368421052631579, 'bitterns'),\n", " (4.368421052631579, 'bitterns'),\n", " (4.368421052631579, 'bitterns'),\n", " (4.368421052631579, 'bitterns'),\n", " (4.368421052631579, 'bitterns'),\n", " (4.368421052631579, 'bitterns'),\n", " (4.333333333333334, 'usually blue'),\n", " (4.333333333333334, 'solitary foraging'),\n", " (4.333333333333334, 'relative patterns'),\n", " (4.333333333333334, 'new zealand'),\n", " (4.333333333333334, 'new zealand'),\n", " (4.333333333333334, 'new zealand'),\n", " (4.333333333333334, 'new york'),\n", " (4.333333333333334, 'lower part'),\n", " (4.333333333333334, 'including fish'),\n", " (4.333333333333334, 'including birds'),\n", " (4.333333333333334, 'corroborate patterns'),\n", " (4.333333333333334, 'collaborators published'),\n", " (4.333333333333334, 'aquatic insects'),\n", " (4.328358208955224, 'species occur'),\n", " (4.328358208955224, 'smallest species'),\n", " (4.328358208955224, 'six species'),\n", " (4.328358208955224, 'six species'),\n", " (4.328358208955224, 'largest species'),\n", " (4.314285714285715, 'bird eggs'),\n", " (4.285714285714286, 'four days'),\n", " (4.266666666666667, 'wings may'),\n", " (4.25, 'tropical birds'),\n", " (4.25, 'b hilaluddin'),\n", " (4.222222222222222, 'three broods'),\n", " (4.2, 'mostly sedentary'),\n", " (4.2, 'molecular studies'),\n", " (4.2, 'molecular biology'),\n", " (4.181818181818182, 'ardea modesta'),\n", " (4.181818181818182, 'ardea cinerea'),\n", " (4.166666666666667, 'phylogenetic relationships'),\n", " (4.166666666666666, 'diet includes'),\n", " (4.142857142857142, 'breeding strategies'),\n", " (4.128358208955224, 'colonial species'),\n", " (4.128358208955224, '6 species'),\n", " (4.128358208955224, '6 species'),\n", " (4.128358208955224, '6 species'),\n", " (4.066666666666666, 'retracted neck'),\n", " (4.064516129032258, '2024 ).'),\n", " (4.064516129032258, '2009 ).'),\n", " (4.064516129032258, '2009 ).'),\n", " (4.064516129032258, '2000 ).'),\n", " (4.064516129032258, '1998 ).'),\n", " (4.064516129032258, '1995 ).'),\n", " (4.064516129032258, '1994 ).'),\n", " (4.064516129032258, '1992 ).'),\n", " (4.064516129032258, '1991 ).'),\n", " (4.064516129032258, '1991 ).'),\n", " (4.064516129032258, '1988 ).'),\n", " (4.064516129032258, '1973 ).'),\n", " (4.064516129032258, '1966 ).'),\n", " (4.064516129032258, '1946 ).'),\n", " (4.0, 'ˈʃaɪtpoʊk /,'),\n", " (4.0, 'witt cc'),\n", " (4.0, 'wilson bulletin'),\n", " (4.0, 'widespread family'),\n", " (4.0, 'wider field'),\n", " (4.0, 'weakly person'),\n", " (4.0, 'watts gallery'),\n", " (4.0, 'watery environments'),\n", " (4.0, 'uttar pradesh'),\n", " (4.0, 'urban ecosystems'),\n", " (4.0, 'upright posture'),\n", " (4.0, 'traditionally done'),\n", " (4.0, 'terms used'),\n", " (4.0, 'tarsometatarsus assigned'),\n", " (4.0, 'swimming waterbirds'),\n", " (4.0, 'strikingly complex'),\n", " (4.0, 'steadman dw'),\n", " (4.0, 'sometimes included'),\n", " (4.0, 'sometimes included'),\n", " (4.0, 'sometimes included'),\n", " (4.0, 'sometimes included'),\n", " (4.0, 'sized birds'),\n", " (4.0, 'sit motionless'),\n", " (4.0, 's2cid 85622885'),\n", " (4.0, 's2cid 6472805'),\n", " (4.0, 'royal decree'),\n", " (4.0, 'robert gillmor'),\n", " (4.0, 'robert gillmor'),\n", " (4.0, 'results suggest'),\n", " (4.0, 'results conflict'),\n", " (4.0, 'reduce glare'),\n", " (4.0, 'q18789 wikispecies'),\n", " (4.0, 'published example'),\n", " (4.0, 'proardea matuku'),\n", " (4.0, 'previously placed'),\n", " (4.0, 'predominantly found'),\n", " (4.0, 'pmid 18583609'),\n", " (4.0, 'pmid 10723744'),\n", " (4.0, 'peter hayman'),\n", " (4.0, 'particularly inclined'),\n", " (4.0, 'particularly crabs'),\n", " (4.0, 'north america'),\n", " (4.0, 'norfolk broads'),\n", " (4.0, 'nbnsys0000159424 ncbi'),\n", " (4.0, 'mostly colonial'),\n", " (4.0, 'moore ws'),\n", " (4.0, 'mirriam company'),\n", " (4.0, 'miglia kj'),\n", " (4.0, 'melanophoyx ardesiaca'),\n", " (4.0, 'marks bd'),\n", " (4.0, 'male works'),\n", " (4.0, 'male employs'),\n", " (4.0, 'lynx edicions'),\n", " (4.0, 'lowland areas'),\n", " (4.0, 'locate roosting'),\n", " (4.0, 'levy county'),\n", " (4.0, 'levy county'),\n", " (4.0, 'lay olive'),\n", " (4.0, 'late oligocene'),\n", " (4.0, 'late miocene'),\n", " (4.0, 'late miocene'),\n", " (4.0, 'kimball rt'),\n", " (4.0, 'jstor 4163462'),\n", " (4.0, 'jstor 4088682'),\n", " (4.0, 'impotent mates'),\n", " (4.0, 'human persecution'),\n", " (4.0, 'huddleston cj'),\n", " (4.0, 'harshman j'),\n", " (4.0, 'han kl'),\n", " (4.0, 'hackett sj'),\n", " (4.0, 'habitats except'),\n", " (4.0, 'glossy blue'),\n", " (4.0, 'game birds'),\n", " (4.0, 'full canopy'),\n", " (4.0, 'frankish haigiro'),\n", " (4.0, 'frank gill'),\n", " (4.0, 'foraging success'),\n", " (4.0, 'foraging success'),\n", " (4.0, 'foraging implications'),\n", " (4.0, 'family exhibits'),\n", " (4.0, 'family belongs'),\n", " (4.0, 'extinct long'),\n", " (4.0, 'et sp'),\n", " (4.0, 'et sp'),\n", " (4.0, 'et sp'),\n", " (4.0, 'et al'),\n", " (4.0, 'essentially non'),\n", " (4.0, 'egg clutches'),\n", " (4.0, 'early oligocene'),\n", " (4.0, 'early miocene'),\n", " (4.0, 'early miocene'),\n", " (4.0, 'early miocene'),\n", " (4.0, 'early miocene'),\n", " (4.0, 'early miocene'),\n", " (4.0, 'driest deserts'),\n", " (4.0, 'djebel zelten'),\n", " (4.0, 'djebel zelten'),\n", " (4.0, 'disperse widely'),\n", " (4.0, 'del hoyo'),\n", " (4.0, 'deep booming'),\n", " (4.0, 'decorative plumes'),\n", " (4.0, 'cox wa'),\n", " (4.0, 'coral beaches'),\n", " (4.0, 'constrained record'),\n", " (4.0, 'completely resolved'),\n", " (4.0, 'collaborators resurrected'),\n", " (4.0, 'coldest extremes'),\n", " (4.0, 'coastal birds'),\n", " (4.0, 'cladogram shown'),\n", " (4.0, 'chojnowski jl'),\n", " (4.0, 'cervical vertebrae'),\n", " (4.0, 'catch fish'),\n", " (4.0, 'braun mj'),\n", " (4.0, 'braun el'),\n", " (4.0, 'bowie rc'),\n", " (4.0, 'birds reveals'),\n", " (4.0, 'banded killifish'),\n", " (4.0, 'badly resolved'),\n", " (4.0, 'attract fish'),\n", " (4.0, 'attract females'),\n", " (4.0, 'arrangement presented'),\n", " (4.0, 'amazonian peru'),\n", " (4.0, 'alpine areas'),\n", " (4.0, 'aggressive attack'),\n", " (4.0, 'additional observations'),\n", " (4.0, 'actually belongs'),\n", " (4.0, '8899 nzor'),\n", " (4.0, '8013 eppo'),\n", " (4.0, '6pb eol'),\n", " (4.0, '4929 irmng'),\n", " (4.0, '39541 worms'),\n", " (4.0, '3685 inaturalist'),\n", " (4.0, '174771 nbn'),\n", " (4.0, '1444 col'),\n", " (4.0, '101354 itis'),\n", " (4.0, '062322a50823 gbif'),\n", " (3.916666666666667, 'b hruska'),\n", " (3.904761904761905, 'seen prey'),\n", " (3.9, '3'),\n", " (3.8826979472140764, 'pdf ).'),\n", " (3.8826979472140764, 'pdf ).'),\n", " (3.8826979472140764, 'pdf ).'),\n", " (3.8826979472140764, 'pdf ).'),\n", " (3.8826979472140764, 'pdf ).'),\n", " (3.8826979472140764, 'pdf ).'),\n", " (3.8826979472140764, 'pdf ).'),\n", " (3.8826979472140764, 'pdf ).'),\n", " (3.8826979472140764, 'pdf ).'),\n", " (3.8461538461538463, 'bittern'),\n", " (3.8461538461538463, 'bittern'),\n", " (3.833333333333333, 'nesting site'),\n", " (3.828358208955224, '14 species'),\n", " (3.8, 'sheldon fh'),\n", " (3.8, '6 times'),\n", " (3.75, 'mitochondrial dna'),\n", " (3.75, 'dna studies'),\n", " (3.75, 'dna hybridization'),\n", " (3.7, 'mostly associated'),\n", " (3.666666666666667, 'usually yellow'),\n", " (3.666666666666667, 'relationships among'),\n", " (3.666666666666667, 'often coinciding'),\n", " (3.666666666666667, 'jack hruska'),\n", " (3.666666666666667, 'extreme example'),\n", " (3.6616915422885574, '8 species'),\n", " (3.642857142857143, 'breeding plumage'),\n", " (3.601085481682497, 'fossil species'),\n", " (3.6, 'seven eggs'),\n", " (3.6, 'name appears'),\n", " (3.5714285714285716, 'night'),\n", " (3.564516129032258, '2011 ).'),\n", " (3.564516129032258, '2006 ).'),\n", " (3.5, 'wing fishing'),\n", " (3.5, 'wide variety'),\n", " (3.5, 'thin toes'),\n", " (3.5, 'stretch display'),\n", " (3.5, 'stirring'),\n", " (3.5, 'round breeders'),\n", " (3.5, 'plumage polymorphism'),\n", " (3.5, 'necked birds'),\n", " (3.5, 'modified shape'),\n", " (3.5, 'list'),\n", " (3.5, 'less tied'),\n", " (3.5, 'legged waterfowl'),\n", " (3.5, 'jstor 4089118'),\n", " (3.5, 'jstor 4083060'),\n", " (3.5, 'jstor 4080141'),\n", " (3.5, 'jstor 2424157'),\n", " (3.5, 'jstor 1368954'),\n", " (3.5, 'jstor 1368843'),\n", " (3.5, 'james vi'),\n", " (3.5, 'james j'),\n", " (3.5, 'foot'),\n", " (3.5, 'eds .).'),\n", " (3.5, 'cosmopolitan distribution'),\n", " (3.5, 'correct placement'),\n", " (3.5, 'bare parts'),\n", " (3.5, '60 paces'),\n", " (3.466666666666667, '18 genera'),\n", " (3.409090909090909, 'mainly white'),\n", " (3.4, 'long beaks'),\n", " (3.4, 'large'),\n", " (3.4, '13'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3972602739726026, 'herons'),\n", " (3.3333333333333335, '15'),\n", " (3.333333333333333, 'suitable trees'),\n", " (3.333333333333333, 'rarely carrion'),\n", " (3.333333333333333, 'nesting ecology'),\n", " (3.333333333333333, 'crouched position'),\n", " (3.333333333333333, 'atypical bill'),\n", " (3.3, 'shallow water'),\n", " (3.2727272727272725, 'disputed fossil'),\n", " (3.25, 'gorsachius'),\n", " (3.1875, 'egrets'),\n", " (3.1875, 'egrets'),\n", " (3.1875, 'egrets'),\n", " (3.1875, 'egrets'),\n", " (3.1875, 'egrets'),\n", " (3.166666666666667, 'exhibiting 10'),\n", " (3.142857142857143, 'boat'),\n", " (3.142857142857143, 'boat'),\n", " (3.142857142857143, 'boat'),\n", " (3.142857142857143, 'boat'),\n", " (3.0, 'skull'),\n", " (3.0, 'pamela rasmussen'),\n", " (3.0, 'often referred'),\n", " (3.0, 'necks retracted'),\n", " (3.0, 'mccracken'),\n", " (3.0, 'h'),\n", " (3.0, 'green'),\n", " (3.0, 'e'),\n", " (3.0, 'david donsker'),\n", " (3.0, 'ardeidae'),\n", " (3.0, 'ardeidae'),\n", " (3.0, 'ardeidae'),\n", " (3.0, 'ardeidae'),\n", " (3.0, '11'),\n", " (2.9333333333333336, 'brown eggs'),\n", " (2.933333333333333, 'may'),\n", " (2.933333333333333, 'may'),\n", " (2.933333333333333, 'may'),\n", " (2.9, 'generally long'),\n", " (2.857142857142857, '1'),\n", " (2.8333333333333335, 'feeding'),\n", " (2.833333333333333, 'thick bill'),\n", " (2.8, 'use'),\n", " (2.8, 'use'),\n", " (2.8, 'butorides'),\n", " (2.7333333333333334, 'long legs'),\n", " (2.7142857142857144, 'one'),\n", " (2.7142857142857144, 'one'),\n", " (2.7142857142857144, 'one'),\n", " (2.7142857142857144, 'bird'),\n", " (2.7142857142857144, 'bird'),\n", " (2.6666666666666665, 'world'),\n", " (2.6666666666666665, 'world'),\n", " (2.6666666666666665, 'group'),\n", " (2.6666666666666665, 'feathers'),\n", " (2.6666666666666665, 'f'),\n", " (2.6666666666666665, 'colour'),\n", " (2.6666666666666665, 'around'),\n", " (2.5714285714285716, 'prey'),\n", " (2.5714285714285716, 'prey'),\n", " (2.5714285714285716, 'prey'),\n", " (2.5, 'range'),\n", " (2.5, 'london'),\n", " (2.5, 'k'),\n", " (2.5, 'k'),\n", " (2.5, 'heronconservation'),\n", " (2.5, 'exist'),\n", " (2.5, 'cochlearidae'),\n", " (2.5, 'away'),\n", " (2.5, '17'),\n", " (2.5, '17'),\n", " (2.4, 'study'),\n", " (2.4, 'neck'),\n", " (2.4, 'neck'),\n", " (2.4, 'c'),\n", " (2.4, '0'),\n", " (2.4, '0'),\n", " (2.3333333333333335, 'usually'),\n", " (2.3333333333333335, 'part'),\n", " (2.3333333333333335, 'p'),\n", " (2.3333333333333335, 'evolution'),\n", " (2.3333333333333335, 'evolution'),\n", " (2.3333333333333335, '2023'),\n", " (2.3333333333333335, '2'),\n", " (2.3333333333333335, '2'),\n", " (2.3333333333333335, '1971'),\n", " (2.328358208955224, 'species'),\n", " (2.328358208955224, 'species'),\n", " (2.328358208955224, 'species'),\n", " (2.328358208955224, 'species'),\n", " (2.328358208955224, 'species'),\n", " (2.328358208955224, 'species'),\n", " (2.328358208955224, 'species'),\n", " (2.328358208955224, 'species'),\n", " (2.328358208955224, 'species'),\n", " (2.328358208955224, 'species'),\n", " (2.25, 'feed'),\n", " (2.25, 'feed'),\n", " (2.2222222222222223, 'three'),\n", " (2.2, 'molecular'),\n", " (2.2, '4'),\n", " (2.2, '4'),\n", " (2.1818181818181817, 'ardea'),\n", " (2.1818181818181817, 'ardea'),\n", " (2.1818181818181817, 'ardea'),\n", " (2.1818181818181817, 'ardea'),\n", " (2.1818181818181817, 'ardea'),\n", " (2.142857142857143, 'breeding'),\n", " (2.142857142857143, 'breeding'),\n", " (2.111111111111111, 'g'),\n", " (2.111111111111111, 'g'),\n", " (2.111111111111111, 'g'),\n", " (2.111111111111111, 'g'),\n", " (2.111111111111111, 'g'),\n", " (2.0, 'well'),\n", " (2.0, 'watts'),\n", " (2.0, 'waterbirds'),\n", " (2.0, 'waterbirds'),\n", " (2.0, 'ukad005'),\n", " (2.0, 'taxonomy'),\n", " (2.0, 'reported'),\n", " (2.0, 'reported'),\n", " (2.0, 'reported'),\n", " (2.0, 'rather'),\n", " (2.0, 'present'),\n", " (2.0, 'post'),\n", " (2.0, 'place'),\n", " (2.0, 'pelecaniformes'),\n", " (2.0, 'occur'),\n", " (2.0, 'j'),\n", " (2.0, 'j'),\n", " (2.0, 'j'),\n", " (2.0, 'insects'),\n", " (2.0, 'found'),\n", " (2.0, 'found'),\n", " (2.0, 'flushed'),\n", " (2.0, 'family'),\n", " (2.0, 'family'),\n", " (2.0, 'except'),\n", " (2.0, 'either'),\n", " (2.0, 'courtship'),\n", " (2.0, 'courtship'),\n", " (2.0, 'corruption'),\n", " (2.0, 'birds'),\n", " (2.0, 'birds'),\n", " (2.0, 'birds'),\n", " (2.0, 'bait'),\n", " (2.0, 'areas'),\n", " (2.0, 'analyses'),\n", " (2.0, 'actively'),\n", " (2.0, '320'),\n", " (2.0, '21'),\n", " (1.9090909090909092, 'white'),\n", " (1.9090909090909092, 'white'),\n", " (1.9090909090909092, 'white'),\n", " (1.9090909090909092, 'white'),\n", " (1.9090909090909092, 'white'),\n", " (1.9090909090909092, 'white'),\n", " (1.8333333333333333, 'nesting'),\n", " (1.8333333333333333, 'nesting'),\n", " (1.8333333333333333, 'nesting'),\n", " (1.8181818181818181, 'pdf'),\n", " (1.8181818181818181, 'pdf'),\n", " (1.8, 'year'),\n", " (1.8, 'year'),\n", " (1.8, 'year'),\n", " (1.8, 'year'),\n", " (1.8, 'sheldon'),\n", " (1.8, 'sheldon'),\n", " (1.8, 'sheldon'),\n", " (1.8, 'grey'),\n", " (1.8, 'genera'),\n", " (1.8, 'genera'),\n", " (1.8, 'colonial'),\n", " (1.8, 'colonial'),\n", " (1.8, '6'),\n", " (1.75, 'ibis'),\n", " (1.75, 'ibis'),\n", " (1.75, 'dna'),\n", " (1.7142857142857142, 'nest'),\n", " (1.7142857142857142, 'nest'),\n", " (1.7142857142857142, 'nest'),\n", " (1.7142857142857142, 'nest'),\n", " (1.7142857142857142, 'nest'),\n", " (1.6666666666666667, 'whereas'),\n", " (1.6666666666666667, 'whereas'),\n", " (1.6666666666666667, 'size'),\n", " (1.6666666666666667, 'size'),\n", " (1.6666666666666667, 'retracted'),\n", " (1.6666666666666667, 'relationships'),\n", " (1.6666666666666667, 'r'),\n", " (1.6666666666666667, 'r'),\n", " (1.6666666666666667, 'often'),\n", " (1.6666666666666667, 'included'),\n", " (1.6666666666666667, 'included'),\n", " (1.6666666666666667, 'hruska'),\n", " (1.6666666666666667, 'example'),\n", " (1.6666666666666667, 'archived'),\n", " (1.6666666666666667, 'archived'),\n", " (1.6666666666666667, 'almost'),\n", " (1.6666666666666667, 'almost'),\n", " (1.6666666666666667, '18'),\n", " (1.6, 'name'),\n", " (1.6, 'name'),\n", " (1.6, 'name'),\n", " (1.6, 'eggs'),\n", " (1.6, 'eggs'),\n", " (1.5714285714285714, '7'),\n", " (1.5714285714285714, '7'),\n", " (1.5714285714285714, '7'),\n", " (1.5714285714285714, '7'),\n", " (1.5714285714285714, '7'),\n", " (1.5714285714285714, '7'),\n", " (1.5714285714285714, '7'),\n", " (1.5714285714285714, '7'),\n", " (1.5714285714285714, '7'),\n", " (1.5714285714285714, '7'),\n", " (1.5714285714285714, '7'),\n", " (1.5, 'variety'),\n", " (1.5, 'used'),\n", " (1.5, 'used'),\n", " (1.5, 'used'),\n", " (1.5, 'used'),\n", " ...]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from rake_nltk import Rake\n", "import nltk #you may need to download additional resources like punkt\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import word_tokenize\n", "\n", "# Download the Punkt tokenizer\n", "nltk.download('punkt') \n", "nltk.download('stopwords')\n", "\n", "#Sample text\n", "text = \"Natural language processing (NLP) is an interdisciplinary field that focuses on the interactions between computers and human language. including speech recognition, machine translation, and text analysis.\"\n", "\n", "# Initiate the RAKE object and run it on the text\n", "r = Rake()\n", "\n", "# Extraction given the text.\n", "r.extract_keywords_from_text(joined_string)\n", "r.get_ranked_phrases_with_scores()\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('Herons', 0.0019341151026289274), ('heron', 0.0020806389740402097), ('genus', 0.002779114198769202), ('night herons', 0.003407551995779094), ('heron Genus', 0.003440693868056681), ('species', 0.0035328448489132763), ('heron Genus Ixobrychus', 0.007802717741497088), ('heron Genus Egretta', 0.008165381990151557), ('heron Genus Zonerodius', 0.01088922519768467), ('heron Genus Taphophoyx', 0.012246806133592213), ('boat-billed heron', 0.012547724730764211), ('Agami heron Genus', 0.012552446379536716), ('heron Genus Cochlearius', 0.0126236034378057), ('heron Genus Agamia', 0.01263198211070613), ('tiger heron Genus', 0.01263302474300582), ('tiger herons', 0.013108091837444087), ('Ardeidae', 0.013458020786417972), ('genus Ardea', 0.014145228646781595), ('boat-billed heron Genus', 0.014222182748485945), ('zigzag heron Genus', 0.015083995855833034)]\n" ] } ], "source": [ "#Take 2\n", "import yake \n", "\n", "yake_kw = yake.KeywordExtractor() \n", "KeyWords = yake_kw.extract_keywords(joined_string) \n", " \n", "# Displaying the keywords \n", "print(KeyWords) \n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to\n", "[nltk_data] d:\\Programms\\Anaconda\\envs\\sneakpic\\lib\\nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ] }, { "ename": "AttributeError", "evalue": "'Rake' object has no attribute 'run'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[6], line 13\u001b[0m\n\u001b[0;32m 10\u001b[0m r \u001b[38;5;241m=\u001b[39m Rake(stopwords\u001b[38;5;241m=\u001b[39mstopwords\u001b[38;5;241m.\u001b[39mwords(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124menglish\u001b[39m\u001b[38;5;124m\"\u001b[39m), min_length\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, max_length\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m4\u001b[39m)\n\u001b[0;32m 12\u001b[0m \u001b[38;5;66;03m# Run RAKE on the joined string\u001b[39;00m\n\u001b[1;32m---> 13\u001b[0m keywords \u001b[38;5;241m=\u001b[39m \u001b[43mr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m(joined_string)\n\u001b[0;32m 15\u001b[0m \u001b[38;5;66;03m# Get the top keywords with their scores\u001b[39;00m\n\u001b[0;32m 16\u001b[0m top_keywords \u001b[38;5;241m=\u001b[39m r\u001b[38;5;241m.\u001b[39mget_ranked_phrases()\n", "\u001b[1;31mAttributeError\u001b[0m: 'Rake' object has no attribute 'run'" ] } ], "source": [ "#take 1\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from rake_nltk import Rake\n", "\n", "# Download the Punkt tokenizer\n", "nltk.download('punkt')\n", "\n", "# Initialize the RAKE object\n", "r = Rake(stopwords=stopwords.words(\"english\"), min_length=1, max_length=4)\n", "\n", "# Run RAKE on the joined string\n", "keywords = r.run(joined_string)\n", "\n", "# Get the top keywords with their scores\n", "top_keywords = r.get_ranked_phrases()\n", "print(top_keywords) " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import spacy\n", "nlp = spacy.load('en_core_web_sm')\n", "\n", "def extract_keywords_spacy(text):\n", " doc = nlp(text)\n", " keywords = [token.text for token in doc if token.is_alpha and token.is_stop != True and token.pos_ == \"NOUN\"]\n", " return keywords" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "

Frequency Based

" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to\n", "[nltk_data] d:\\Programms\\Anaconda\\envs\\sneakpic\\lib\\nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package averaged_perceptron_tagger to\n", "[nltk_data] d:\\Programms\\Anaconda\\envs\\sneakpic\\lib\\nltk_data...\n", "[nltk_data] Package averaged_perceptron_tagger is already up-to-\n", "[nltk_data] date!\n", "[nltk_data] Downloading package wordnet to\n", "[nltk_data] d:\\Programms\\Anaconda\\envs\\sneakpic\\lib\\nltk_data...\n", "[nltk_data] Unzipping corpora\\wordnet.zip.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[('Halo', 'NN')]\n", "NLTK Keywords: ['heron', 'specie', 'genus', 'bittern', 'night', 'bird', 'egret', 'ardeidae', 'edit', 'prey']\n" ] } ], "source": [ "\n", "import nltk\n", "from nltk.tokenize import word_tokenize\n", "from nltk.stem import WordNetLemmatizer\n", "from nltk.corpus import stopwords\n", "from nltk.tag import pos_tag\n", "nltk.download('punkt')\n", "nltk.download('averaged_perceptron_tagger')\n", "nltk.download('wordnet')\n", "# Stop words list\n", "stop_words = set(stopwords.words('english'))\n", "\n", "print(pos_tag([\"Halo\"]))\n", "def preprocess_text(text):\n", " wnl = WordNetLemmatizer()\n", " tokens = word_tokenize(text.lower())\n", " filtered_tokens = [word for (word,pos) in pos_tag(tokens) if word not in stop_words and len(word) > 2 and pos[:2] == 'NN']\n", " lemmatized_tokens = [wnl.lemmatize(word) for word in filtered_tokens]\n", " return lemmatized_tokens\n", "\n", "\n", "def extract_keywords_nltk(text):\n", " words = preprocess_text(text)\n", " # Frequency distribution of words\n", " freq_dist = nltk.FreqDist(words)\n", " # Get the 10 most common words\n", " most_common_words = freq_dist.most_common(10)\n", " keywords = [word for word, _ in most_common_words]\n", " return keywords\n", "\n", "text = \"This is a sample text that contains some important keywords like machine learning, natural language processing, and Python.\"\n", "\n", "#print(\"SpaCy Keywords:\", extract_keywords_spacy(joined_string))\n", "print(\"NLTK Keywords:\", extract_keywords_nltk(joined_string))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "sneakpic", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 2 }