{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import re\n",
    "import validators.url as urlvalid\n",
    "\n",
    "#Helper for get text, iterates through parents of an html tag, to see whether it should be filtered.\n",
    "def has_excluded_parent(tag, exclude_tags):\n",
    "    parent = tag.parent\n",
    "    while parent.name != 'html':\n",
    "        if parent.name in exclude_tags:\n",
    "            return True\n",
    "        parent = parent.parent\n",
    "    return False\n",
    "\n",
    "#Retrieve text, restricted to certain tabs\n",
    "def get_text(soup):\n",
    "    target_tags = {'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'}    \n",
    "    exclude_tags = {'header', 'nav', 'footer'}    \n",
    "    text_list = []    \n",
    "    for tag in soup.find_all(target_tags):\n",
    "        if not has_excluded_parent(tag, exclude_tags):        \n",
    "                text_list.append(tag.get_text())   \n",
    "    return text_list  \n",
    "\n",
    "# Find all images on the webpage\n",
    "def get_images(soup):\n",
    "    images = soup.find_all('img')\n",
    "    # Find all elements with a style attribute that contains 'background-image'\n",
    "    background_images = soup.find_all(style=re.compile('background-image'))\n",
    "    # Check each image\n",
    "    imagelist=[]\n",
    "    for img in images:\n",
    "        img_url = img.get('src')\n",
    "        # Skip if the image URL is empty or None\n",
    "        if(img_url and not (urlvalid(img_url))):\n",
    "            img_url = f'https:{img_url}'\n",
    "        if not img_url or not(urlvalid(img_url)):\n",
    "            print(\"Invalid image url\",img_url)\n",
    "            continue\n",
    "        # Check if the image is likely a logo or icon based on its size\n",
    "        width = img.get('width')\n",
    "        height = img.get('height')\n",
    "        if width and height:\n",
    "            if int(width) < 100 and int(height) < 100:\n",
    "                #print(f\"Skipping {img_url} as it's likely a logo or icon\")\n",
    "                continue\n",
    "        \n",
    "        # Check if the image is likely a logo or icon based on its URL\n",
    "        if 'logo' in img_url.lower() or 'icon' in img_url.lower():\n",
    "            #print(f\"Skipping {img_url} as it's likely a logo or icon\")\n",
    "            continue\n",
    "        \n",
    "        # Check if the image is an SVG\n",
    "        if img_url.lower().endswith('.svg'):\n",
    "            #print(f\"Skipping {img_url} as it's an SVG\")\n",
    "            continue\n",
    "        imagelist.append(img_url)\n",
    "    \n",
    "    # Check each background_image\n",
    "    for elem in background_images:\n",
    "        style = elem.get('style')\n",
    "        match = re.search(r'background-image\\s*:\\s*url\\(([^)]+)\\)', style)\n",
    "        if match:\n",
    "            img_url = match.group(1).strip('\"\\'')\n",
    "            \n",
    "            # Check if the image is likely a logo or icon based on its size\n",
    "            width_match = re.search(r'width\\s*:\\s*(\\d+)px', style)\n",
    "            height_match = re.search(r'height\\s*:\\s*(\\d+)px', style)\n",
    "            if width_match and height_match:\n",
    "                width = int(width_match.group(1))\n",
    "                height = int(height_match.group(1))\n",
    "                if width < 100 and height < 100:\n",
    "                    #print(f\"Skipping {img_url} as it's likely a logo or icon\")\n",
    "                    continue\n",
    "            \n",
    "            # Check if the image is likely a logo or icon based on its URL\n",
    "            if 'logo' in img_url.lower() or 'icon' in img_url.lower() or not(urlvalid(img_url)):\n",
    "                #print(f\"Skipping {img_url} as it's likely a logo or icon\")\n",
    "                continue\n",
    "            \n",
    "            # Check if the image is an SVG\n",
    "            if img_url.lower().endswith('.svg'):\n",
    "                #print(f\"Skipping {img_url} as it's an SVG\")\n",
    "                continue\n",
    "            imagelist.append(img_url)\n",
    "    return imagelist\n",
    "\n",
    "def scrapePage(req:dict):\n",
    "    # Send a GET request\n",
    "    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}\n",
    "    if(not urlvalid(req[\"url\"])):\n",
    "        return {\"error\": \"scraping.py: url is not recognized as a valid url.\"}\n",
    "    try:\n",
    "        response = requests.get(req[\"url\"],headers=headers)\n",
    "        response.raise_for_status()    \n",
    "    except requests.exceptions.RequestException as e: \n",
    "        return {\"error\":\"scraping.py: request error\",\"message\":e}\n",
    "    res = {}\n",
    "    # If the GET request is successful, the status code will be 200\n",
    "    if response.status_code == 200:\n",
    "        # Get the content of the response\n",
    "        page_content = response.content\n",
    "        # Create a BeautifulSoup object and specify the parser\n",
    "        soup = BeautifulSoup(page_content, 'html.parser')\n",
    "        if(req[\"use_images\"]):\n",
    "            res[\"images\"]=get_images(soup)\n",
    "        if(req[\"use_text\"]):\n",
    "            res[\"text\"]=get_text(soup)\n",
    "        return res\n",
    "    else:\n",
    "        return{\"error\":\"scraping.py: webpage could not be loaded\"}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTMzNTo2MDU=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjAzIDY3IiB3aWR0aD0iMjAzIiBoZWlnaHQ9IjY3IiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTMzODo2MTA=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTcwIDU4IiB3aWR0aD0iMTcwIiBoZWlnaHQ9IjU4IiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTM0MTo2MDQ=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjAzIDY3IiB3aWR0aD0iMjAzIiBoZWlnaHQ9IjY3IiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ0MjoxNzU5-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTU3MiA4NzYiIHdpZHRoPSIxNTcyIiBoZWlnaHQ9Ijg3NiIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ1NToxNDM4-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA2NjciIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjY2NyIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ1ODoxMjgw-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA2NjciIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjY2NyIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ2ODoxMTk1-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA2NjciIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjY2NyIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ3MjoxMDc1-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA2MDQiIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjYwNCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ3NjoxNTQy-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgODEwIDgwNiIgd2lkdGg9IjgxMCIgaGVpZ2h0PSI4MDYiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ4NToxMzI2-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA2NDYiIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjY0NiIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ4OToxMjMz-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTg5OCA4NzUiIHdpZHRoPSIxODk4IiBoZWlnaHQ9Ijg3NSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTQ5MzoxNjQ5-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA2MzUiIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjYzNSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTUxMjoxMjg0-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTAwMCA1MzYiIHdpZHRoPSIxMDAwIiBoZWlnaHQ9IjUzNiIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTUxNjoxMzY4-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTYyNyA4MTQiIHdpZHRoPSIxNjI3IiBoZWlnaHQ9IjgxNCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTUyMToxMjQ4-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTUyMiA4MTciIHdpZHRoPSIxNTIyIiBoZWlnaHQ9IjgxNyIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTUzMToxNDg3-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTU0NSA4NjUiIHdpZHRoPSIxNTQ1IiBoZWlnaHQ9Ijg2NSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48L3N2Zz4=\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTUzOTo0MjQ=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjQwIDI0MCIgd2lkdGg9IjI0MCIgaGVpZ2h0PSIyNDAiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTU3MDo1MDA=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgNTAwIDM4MyIgd2lkdGg9IjUwMCIgaGVpZ2h0PSIzODMiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTYwODo1NTA=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgNTAwIDM4MyIgd2lkdGg9IjUwMCIgaGVpZ2h0PSIzODMiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTY0Njo1MDg=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgNTAwIDM4MyIgd2lkdGg9IjUwMCIgaGVpZ2h0PSIzODMiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTY4NDo1MjY=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgNTAwIDM4MyIgd2lkdGg9IjUwMCIgaGVpZ2h0PSIzODMiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MTk0OTo3OTQ=-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjAzIDY3IiB3aWR0aD0iMjAzIiBoZWlnaHQ9IjY3IiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MjAwNzoxMjMx-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMzAwIDEzOSIgd2lkdGg9IjMwMCIgaGVpZ2h0PSIxMzkiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PC9zdmc+\n",
      "Invalid image url https:data:image/svg+xml;nitro-empty-id=MjAwNzoyNDM3-1;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMTUwIDQwIiB3aWR0aD0iMTUwIiBoZWlnaHQ9IjQwIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPjwvc3ZnPg==\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'images': ['https://cdn-agiod.nitrocdn.com/IzoObPRaJTXqmzxBrypHgZRGhBszRtaj/assets/images/optimized/rev-32e7c69/brandlume.com/wp-content/uploads/2023/04/how-to-make-your-website-stand-out.jpg',\n",
       "  'https://brandlume.com/wp-content/uploads/2023/04/how-to-make-your-website-stand-out.jpg',\n",
       "  'https://brandlume.com/wp-content/uploads/2023/04/The-Importance-of-Having-a-Standout-Website-1.jpg',\n",
       "  'https://brandlume.com/wp-content/uploads/2023/04/How-to-Make-Your-Business-Website-Stand-Out.jpg',\n",
       "  'https://brandlume.com/wp-content/uploads/2023/04/Select-the-Ideal-Template.jpg',\n",
       "  'https://brandlume.com/wp-content/uploads/2023/04/Enhance-User-Experience.jpg',\n",
       "  'https://brandlume.com/wp-content/uploads/2023/04/Typography.jpg',\n",
       "  'https://brandlume.com/wp-content/uploads/2023/04/Create-High-Quality-Content-1.jpg',\n",
       "  'https://brandlume.com/wp-content/uploads/2023/04/Maintain-Fresh-Website-Content.jpg',\n",
       "  'https://brandlume.com/wp-content/uploads/2023/04/Bios.jpg',\n",
       "  'https://brandlume.com/wp-content/uploads/2023/04/Demonstrate-Engaging-and-Relatable-Video-Content.jpg',\n",
       "  'https://brandlume.com/wp-content/uploads/2023/04/Showcase-Examples-and-Metrics.jpg',\n",
       "  'https://brandlume.com/wp-content/uploads/2023/04/Website-Menus.jpg',\n",
       "  'https://brandlume.com/wp-content/uploads/2023/04/case-studies.jpg',\n",
       "  'https://brandlume.com/wp-content/uploads/2023/04/Stand-Out-from-the-Crowd.jpg',\n",
       "  'https://brandlume.com/wp-content/uploads/2022/04/responsive-web-concept-500x383.jpg',\n",
       "  'https://brandlume.com/wp-content/uploads/2023/05/Questions-to-Ask-a-Web-Design-Company-500x383.jpg',\n",
       "  'https://brandlume.com/wp-content/uploads/2023/04/Error-404-poor-website-500x383.png',\n",
       "  'https://brandlume.com/wp-content/uploads/2023/04/Website-Dos-and-Donts-500x383.png',\n",
       "  'https://brandlume.com/wp-content/uploads/2017/01/featured.png']}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "exampleReq4 = {\"url\": \"https://brandlume.com/12-proven-ways-to-make-your-website-stand-out/\",\"use_images\": True,\"use_text\":False,\"num_images\":1,\"page\": 0,\"num_keywords_text\": 10,\"num_keywords_images\": 10,\"num_query_keywords\":5,\"result_images\":24}\n",
    "scrapePage(exampleReq4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "joined_string = \" \".join(scraped_text)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h1>Rake based</h1>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to\n",
      "[nltk_data]     d:\\Programms\\Anaconda\\envs\\sneakpic\\lib\\nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     d:\\Programms\\Anaconda\\envs\\sneakpic\\lib\\nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[(275.00490720798564,\n",
       "  'billed heron subfamily agamiinae genus agamia – agami heron genus agamia – agami heron subfamily botaurinae genus zebrilus – zigzag heron genus ixobrychus – small bitterns'),\n",
       " (249.06392694063925,\n",
       "  '148747 france bnf data germany israel united states japan czech republic ardeidae herons extant paleocene first appearances taxa named'),\n",
       " (125.0750536284169,\n",
       "  'genus pilherodius – capped heron genus zonerodius – forest bittern genus ardeola – pond herons'),\n",
       " (125.0750536284169,\n",
       "  'genus pilherodius – capped heron genus zonerodius – forest bittern genus ardeola – pond herons'),\n",
       " (112.95397300068352,\n",
       "  'billed heron agamia – agami heron zebrilus – zigzag heron botaurus – bitterns'),\n",
       " (85.98584252505485,\n",
       "  'pilherodius – capped heron syrigma – whistling heron egretta – herons'),\n",
       " (72.83612300476749,\n",
       "  'genus zebrilus – zigzag heron genus ixobrychus – small bitterns'),\n",
       " (67.35917020610415,\n",
       "  'genus syrigma – whistling heron genus egretta – typical egrets'),\n",
       " (67.35917020610415,\n",
       "  'genus syrigma – whistling heron genus egretta – typical egrets'),\n",
       " (58.909716776933756,\n",
       "  'crested tiger heron subfamily cochleariinae genus cochlearius – boat'),\n",
       " (54.388888888888886,\n",
       "  'william elford leach webarchive template wayback links articles'),\n",
       " (40.388888888888886,\n",
       "  'short description short description matches wikidata articles'),\n",
       " (37.45614534836233, 'billed heron genus cochlearius – boat'),\n",
       " (37.09006330427564, 'crested tiger heron tigrisoma – tiger herons'),\n",
       " (34.8699279888573, 'genus tigrisoma – typical tiger herons'),\n",
       " (34.8699279888573, 'genus tigrisoma – typical tiger herons'),\n",
       " (33.624689893619205, 'genus nycticorax – typical night herons'),\n",
       " (33.624689893619205, 'genus nycticorax – typical night herons'),\n",
       " (33.42857142857143, 'nkc identifiers toggle limited content width'),\n",
       " (31.8625, 'nine ), 15 – 20 secondaries'),\n",
       " (31.08024544917476, 'genus nyctanassa – american night herons'),\n",
       " (31.08024544917476, 'genus nyctanassa – american night herons'),\n",
       " (30.211556603773584, 'genus pikaihao – saint bathan'),\n",
       " (30.211556603773584, 'genus pikaihao – saint bathan'),\n",
       " (28.966453447050466, 'many species also opportunistically take larger prey'),\n",
       " (28.759760273972603, 'ardeola – pond herons'),\n",
       " (28.529359634076613, 'crested tiger heron genus taphophoyx'),\n",
       " (27.813310989738497, 'genus botaurus – large bitterns'),\n",
       " (27.813310989738497, 'genus botaurus – large bitterns'),\n",
       " (27.054807692307694, '5884 ): 1763 – 1768'),\n",
       " (27.046026300743282, 'genus undetermined easter island heron'),\n",
       " (27.046026300743282, 'genus undetermined easter island heron'),\n",
       " (26.90174617067548, 'genus ardea – typical herons'),\n",
       " (26.90174617067548, 'genus ardea – typical herons'),\n",
       " (26.333333333333336, 'active feeding behaviours include foot stirring'),\n",
       " (25.954807692307693, '3 ): 672 – 679'),\n",
       " (25.954807692307693, '3 ): 569 – 571'),\n",
       " (25.954807692307693, '3 ): 471 – 472'),\n",
       " (25.954807692307693, '3 ): 450 – 452'),\n",
       " (25.954807692307693, '3 ): 441 – 442'),\n",
       " (25.954807692307693, '3 ): 437 – 450'),\n",
       " (25.8625, 'measures 25 – 30 cm'),\n",
       " (24.911950549450548, '1 ): 242 – 246'),\n",
       " (24.911950549450548, '1 ): 127 – 141'),\n",
       " (24.799056603773586, 'genus bubulcus – cattle egrets'),\n",
       " (24.799056603773586, 'genus bubulcus – cattle egrets'),\n",
       " (24.724056603773583, 'subfamily tigriornithinae genus taphophoyx'),\n",
       " (24.388141025641026, '2 ): 383 – 389'),\n",
       " (24.254807692307693, '4 ): 860 – 865'),\n",
       " (23.5, 'congress catalogue card number 76'),\n",
       " (23.5, '12th century ), earlier hairo'),\n",
       " (23.224056603773583, 'subfamily ardeinae genus zeltornis'),\n",
       " (23.166666666666668, 'distinguished ,[ 19 ][ 20'),\n",
       " (22.461556603773584, 'genus gorsachius – asian'),\n",
       " (22.461556603773584, 'genus gorsachius – asian'),\n",
       " (22.280952380952378, 'ioc world bird list version 13'),\n",
       " (22.28030303030303, 'agami heron'),\n",
       " (22.196969696969695, 'word heron first appeared'),\n",
       " (21.649056603773587, 'monotypic genus zebrilus'),\n",
       " (21.011556603773585, 'genus butorides – green'),\n",
       " (21.011556603773585, 'genus butorides – green'),\n",
       " (21.0, 'united states'),\n",
       " (21.0, 'czech republic'),\n",
       " (20.468688845401175, 'night herons could warrant separation'),\n",
       " (20.370647512864494, 'genus tigriornis – white'),\n",
       " (20.370647512864494, 'genus tigriornis – white'),\n",
       " (20.333333333333332, 'skull morphology reflect convergent evolution'),\n",
       " (20.25, 'international ornithological congress reclassified ardeidae'),\n",
       " (19.664522178734508, 'nycticorax – night herons'),\n",
       " (19.099056603773583, 'genus ardeola rather'),\n",
       " (18.897260273972602, 'reported observing female herons attaching'),\n",
       " (18.8625, 'transcends elements – earth'),\n",
       " (18.731188845401174, 'nyctanassa – night herons'),\n",
       " (18.3625, 'ardeagradis proardeola – possibly'),\n",
       " (18.357142857142858, 'egretta novaehollandiae ), demonstrating'),\n",
       " (18.317460317460316, 'bnf identifiers articles'),\n",
       " (18.30234962406015, 'ixobrychus – bitterns'),\n",
       " (17.831188845401176, 'calherodius – night herons'),\n",
       " (17.797260273972604, 'herons may use items already'),\n",
       " (17.581188845401176, 'gorsachius – night herons'),\n",
       " (17.42857142857143, 'small indian city \".'),\n",
       " (16.505357142857143, 'cochlearius – boat'),\n",
       " (16.428358208955224, 'colonies may contain several species'),\n",
       " (16.214285714285715, 'internet bird collection wikidata'),\n",
       " (16.0, 'watched repeatedly dropping seeds'),\n",
       " (15.880533445049574, 'zigzag heron ).'),\n",
       " (15.387445887445889, 'small green heron'),\n",
       " (15.333333333333334, 'seeking sexual gratification elsewhere'),\n",
       " (15.190858208955225, '11 – 17 species'),\n",
       " (15.190858208955225, '11 – 17 species'),\n",
       " (14.833333333333334, 'retrieved 19 september 2023'),\n",
       " (14.696969696969697, 'heronconservation heron specialist group'),\n",
       " (14.666666666666666, 'males arrive first'),\n",
       " (14.662286780383797, '7 – 13 species'),\n",
       " (14.662286780383797, '7 – 13 species'),\n",
       " (14.657142857142858, 'one breeding season per year'),\n",
       " (14.512820512820513, 'forest bittern'),\n",
       " (14.471428571428572, 'prey capture increased 3'),\n",
       " (14.420485175202156, 'genus ixobrychus'),\n",
       " (14.4, 'following large grazing animals'),\n",
       " (14.36568132660418, 'boatbill bitterns day herons'),\n",
       " (14.349056603773585, 'genus calherodius peters'),\n",
       " (14.302022178734507, 'contain two night herons'),\n",
       " (14.196969696969695, 'easter island heron'),\n",
       " (14.029166666666667, '20 – 21'),\n",
       " (14.0, 'sister taxa threskiornithidae'),\n",
       " (13.897260273972602, 'see text cochlearidae herons'),\n",
       " (13.849056603773585, 'respective genus accounts'),\n",
       " (13.833333333333334, 'oxford english dictionary describes'),\n",
       " (13.828358208955224, 'ioc lists 72 species'),\n",
       " (13.816017316017316, 'zigzag heron'),\n",
       " (13.816017316017316, 'zigzag heron'),\n",
       " (13.780303030303031, 'whistling heron'),\n",
       " (13.6875, 'vijay k .; kittur'),\n",
       " (13.666666666666666, 'english language around 1300'),\n",
       " (13.666666666666666, 'displays involve visual cues'),\n",
       " (13.559760273972602, 'butorides – herons'),\n",
       " (13.520833333333334, 'frederick h .; jones'),\n",
       " (13.43030303030303, 'throated tiger heron'),\n",
       " (13.3625, '376 – 403'),\n",
       " (13.3625, '1566 – 1625'),\n",
       " (13.333333333333334, 'neck area may swell'),\n",
       " (13.1875, 'clare e .; mccracken'),\n",
       " (13.166666666666666, 'nesting intensity varies throughout'),\n",
       " (13.1017316017316, 'billed heron'),\n",
       " (13.1017316017316, 'billed heron'),\n",
       " (13.1017316017316, 'billed heron'),\n",
       " (13.1017316017316, 'billed heron'),\n",
       " (13.066666666666666, 'uses erectile neck feathers'),\n",
       " (12.941578455790783, 'ardea – herons'),\n",
       " (12.8, 'zebrilus undulatus'),\n",
       " (12.722222222222221, 'three forward pointing ones'),\n",
       " (12.682389937106919, 'genus botaurus'),\n",
       " (12.666666666666666, 'zonerodius heliosylus'),\n",
       " (12.631944444444445, 'kevin g .; stuebing'),\n",
       " (12.52159090909091, 'tigriornis – white'),\n",
       " (12.520833333333334, 'frederick h .; mccracken'),\n",
       " (12.5, 'named differently'),\n",
       " (12.5, 'j k l martínez'),\n",
       " (12.375, 'subfamily tigriornithinae'),\n",
       " (12.375, 'subfamily ardeinae'),\n",
       " (12.321428571428571, 'prey comes within range'),\n",
       " (12.166666666666666, 'light colour morphs exist'),\n",
       " (12.166666666666666, '18 extant'),\n",
       " (12.131868131868133, 'zigzag bittern'),\n",
       " (12.1017316017316, 'eared night heron'),\n",
       " (12.1017316017316, 'crowned night heron'),\n",
       " (11.857142857142858, 'egretta sacra )\"'),\n",
       " (11.683333333333334, 'may feed far away'),\n",
       " (11.647619047619047, 'reed beds may nest'),\n",
       " (11.566666666666666, 'london editions isbn 0'),\n",
       " (11.542643923240938, 'one species formerly considered'),\n",
       " (11.530303030303031, 'russia squacco heron'),\n",
       " (11.530303030303031, 'pacific reef heron'),\n",
       " (11.530303030303031, 'osteological heron phylogenies'),\n",
       " (11.530303030303031, 'iucn heron videos'),\n",
       " (11.530303030303031, 'eastern reef heron'),\n",
       " (11.5, '11th century ),'),\n",
       " (11.375, 'subfamily nycticoracinae'),\n",
       " (11.357142857142858, '1 external links'),\n",
       " (11.301731601731603, 'backed night heron'),\n",
       " (11.196969696969697, 'heron symbolizes purity'),\n",
       " (11.195833333333333, '10 – 12'),\n",
       " (10.897260273972602, 'although herons resemble birds'),\n",
       " (10.849056603773585, 'genus zeltornis'),\n",
       " (10.8, 'waterbirds use different sites'),\n",
       " (10.777777777777777, 'e f g h'),\n",
       " (10.666666666666668, 'courtship usually takes part'),\n",
       " (10.5, 'native american culture'),\n",
       " (10.5, 'include adopting postures'),\n",
       " (10.5, 'american midland naturalist'),\n",
       " (10.431944444444445, 'kevin g .; sheldon'),\n",
       " (10.357142857142858, 'small number'),\n",
       " (10.317460317460318, 'ndl identifiers articles'),\n",
       " (10.317460317460318, 'lccn identifiers articles'),\n",
       " (10.317460317460318, 'j9u identifiers articles'),\n",
       " (10.317460317460318, 'gnd identifiers articles'),\n",
       " (10.317460317460318, 'bnfdata identifiers articles'),\n",
       " (10.25, 'international ornithological committee'),\n",
       " (10.030874785591767, 'genus ardea'),\n",
       " (10.025641025641026, 'new ): 82e46850'),\n",
       " (10.0, 'studied taxa'),\n",
       " (10.0, 'english language unabridged'),\n",
       " (9.968688845401173, 'african night herons'),\n",
       " (9.968688845401173, 'african night herons'),\n",
       " (9.849056603773585, 'genus proardea'),\n",
       " (9.849056603773585, 'genus proardea'),\n",
       " (9.833333333333334, 'oxford english dictionary'),\n",
       " (9.8, 'use reed beds'),\n",
       " (9.666666666666666, 'colonies surveyed contained'),\n",
       " (9.596153846153847, 'crested bittern'),\n",
       " (9.5, 'ultraconserved elements resolve'),\n",
       " (9.497260273972602, 'larger day herons'),\n",
       " (9.4, 'sometimes includes nyctanassa'),\n",
       " (9.4, 'sometimes includes nyctanassa'),\n",
       " (9.4, 'many broadly overlap'),\n",
       " (9.397260273972602, 'herons eating acorns'),\n",
       " (9.397260273972602, 'herons ); differences'),\n",
       " (9.3625, 'rebirth –'),\n",
       " (9.3625, 'belgium –'),\n",
       " (9.333333333333334, 'oxford university press'),\n",
       " (9.333333333333334, 'morph varies geographically'),\n",
       " (9.25, 'within striking distance'),\n",
       " (9.25, 'third international dictionary'),\n",
       " (9.19047619047619, 'small groups'),\n",
       " (9.1875, 'h .; kaul'),\n",
       " (9.1875, 'cattle egrets improve'),\n",
       " (9.025641025641026, '2 ): ukad005'),\n",
       " (9.0, 'zagreb zoological garden'),\n",
       " (9.0, 'vegetable matter consumed'),\n",
       " (9.0, 'probably closely related'),\n",
       " (9.0, 'philip babcock gove'),\n",
       " (9.0, 'paleognath lithornis vulturinus'),\n",
       " (9.0, 'online etymology dictionary'),\n",
       " (9.0, 'old french heronçeau'),\n",
       " (9.0, 'old french hairon'),\n",
       " (9.0, 'croatia symbolic meaning'),\n",
       " (9.0, 'common hunting technique'),\n",
       " (9.0, 'clear consensus exists'),\n",
       " (9.0, 'b66d2f24e21b open tree'),\n",
       " (9.0, '609781 paleobiology database'),\n",
       " (9.0, '1ardef fauna europaea'),\n",
       " (9.0, '10741 fauna europaea'),\n",
       " (8.933333333333334, 'may walk slowly'),\n",
       " (8.916666666666666, 'monophyletic group within'),\n",
       " (8.904761904761905, '7 ][ 15'),\n",
       " (8.89502487562189, 'many species also'),\n",
       " (8.833333333333334, 'feeding grounds near'),\n",
       " (8.828358208955224, '72 recognised species'),\n",
       " (8.75, 'pond'),\n",
       " (8.714285714285715, 'one pointing backwards'),\n",
       " (8.666666666666666, 'oed also observes'),\n",
       " (8.666666666666666, 'different colour morphs'),\n",
       " (8.666666666666666, 'colonies commonly occur'),\n",
       " (8.666666666666666, 'bread ;[ 12'),\n",
       " (8.666666666666666, 'biologically distinct group'),\n",
       " (8.666666666666666, 'actively add items'),\n",
       " (8.666666666666666, '11 primary feathers'),\n",
       " (8.619047619047619, 'little sexual dimorphism'),\n",
       " (8.619047619047619, 'little sexual dimorphism'),\n",
       " (8.5875, 'c .; shakya'),\n",
       " (8.580593607305936, 'herons may feed'),\n",
       " (8.578358208955224, 'latitude within species'),\n",
       " (8.571428571428571, 'ixobrychus involucris'),\n",
       " (8.571428571428571, 'ixobrychus exilis'),\n",
       " (8.571428571428571, 'certain prey types'),\n",
       " (8.520833333333334, 'p .; mccracken'),\n",
       " (8.520833333333334, 'p .; holmes'),\n",
       " (8.520833333333334, 'including cattle egrets'),\n",
       " (8.5, 'temperate climates laying'),\n",
       " (8.5, 'derogatory terms meaning'),\n",
       " (8.428571428571429, 'western cattle egret'),\n",
       " (8.428571428571429, 'eastern cattle egret'),\n",
       " (8.380952380952381, 'bird symbolizes renewal'),\n",
       " (8.333333333333334, 'food usage patterns'),\n",
       " (8.328358208955224, 'eight living species'),\n",
       " (8.328358208955224, 'eight living species'),\n",
       " (8.314285714285715, 'one recently extinct'),\n",
       " (8.314285714285715, 'one recently extinct'),\n",
       " (8.314285714285715, 'one recently extinct'),\n",
       " (8.314285714285715, 'one recently extinct'),\n",
       " (8.313926940639268, 'tropical herons typically'),\n",
       " (8.297260273972602, 'tiger herons'),\n",
       " (8.297260273972602, 'tiger herons'),\n",
       " (8.297260273972602, 'tiger herons'),\n",
       " (8.254166666666666, 'c .; f'),\n",
       " (8.196969696969697, 'heron symbolizes'),\n",
       " (8.1875, 'j .; oliveros'),\n",
       " (8.166666666666666, 'migration typically occurs'),\n",
       " (8.161691542288558, 'two living species'),\n",
       " (8.161691542288558, 'two living species'),\n",
       " (8.101731601731602, 'heron \".'),\n",
       " (8.1, 'items used may'),\n",
       " (8.071428571428571, 'evolutionary history \".'),\n",
       " (8.066666666666666, 'row isbn 0'),\n",
       " (8.06392694063927, 'herons archived 2019'),\n",
       " (8.042643923240938, 'one living species'),\n",
       " (8.042643923240938, 'one living species'),\n",
       " (8.030303030303031, 'individual heron'),\n",
       " (8.0, 'syrigma'),\n",
       " (8.0, 'skeletal analyses focusing'),\n",
       " (8.0, 'separate monotypic family'),\n",
       " (8.0, 'highly mobile family'),\n",
       " (8.0, 'george frederic watts'),\n",
       " (8.0, 'extremely high mountains'),\n",
       " (8.0, 'ever present reminder'),\n",
       " (8.0, 'earliest temporally well'),\n",
       " (8.0, 'continents except antarctica'),\n",
       " (8.0, 'clutch size varies'),\n",
       " (8.0, '2008sci ... 320'),\n",
       " (7.928571428571429, 'eastern great egret'),\n",
       " (7.885714285714286, 'four recently extinct'),\n",
       " (7.885714285714286, 'four recently extinct'),\n",
       " (7.857142857142857, 'egretta garzetta'),\n",
       " (7.833333333333334, 'least partially migratory'),\n",
       " (7.8, 'partly colonial depending'),\n",
       " (7.768421052631579, 'large bitterns'),\n",
       " (7.761691542288558, 'individual species may'),\n",
       " (7.7214285714285715, 'bubulcus ibis \".'),\n",
       " (7.633333333333334, 'two major genera'),\n",
       " (7.6, 'another former name'),\n",
       " (7.555555555555555, 'three major groups'),\n",
       " (7.533333333333333, 'mostly solitary nesters'),\n",
       " (7.53030303030303, 'wounded heron'),\n",
       " (7.53030303030303, 'heron pictured'),\n",
       " (7.53030303030303, 'heron nuclear'),\n",
       " (7.53030303030303, 'headed heron'),\n",
       " (7.53030303030303, 'goliath heron'),\n",
       " (7.53030303030303, 'faced heron'),\n",
       " (7.53030303030303, 'egypt heron'),\n",
       " (7.5, 'order pelecaniformes instead'),\n",
       " (7.5, 'documented using bait'),\n",
       " (7.404761904761905, 'live aquatic prey'),\n",
       " (7.4, '2008 study suggests'),\n",
       " (7.4, '1971 isbn 0'),\n",
       " (7.3809523809523805, 'group nest colonially'),\n",
       " (7.333333333333334, 'usually found near'),\n",
       " (7.333333333333334, 'taxonomy published online'),\n",
       " (7.333333333333334, '1971 compact edition'),\n",
       " (7.33030303030303, 'grey heron'),\n",
       " (7.33030303030303, 'grey heron'),\n",
       " (7.33030303030303, 'grey heron'),\n",
       " (7.33030303030303, 'grey heron'),\n",
       " (7.25, 'tigrisoma mexicanum'),\n",
       " (7.2, 'molecular rate variation'),\n",
       " (7.166666666666667, 'new feeding areas'),\n",
       " (7.15, 'b c gill'),\n",
       " (7.114285714285714, 'one study found'),\n",
       " (7.1, 'molecular phylogenetic study'),\n",
       " (7.0, 'wayback machine'),\n",
       " (7.0, 'corruption took place'),\n",
       " (7.0, 'catching insects flushed'),\n",
       " (6.9950248756218905, 'whereas species living'),\n",
       " (6.9950248756218905, 'almost every species'),\n",
       " (6.968688845401174, 'night herons'),\n",
       " (6.968688845401174, 'night herons'),\n",
       " (6.968688845401174, 'night herons'),\n",
       " (6.968688845401174, 'night herons'),\n",
       " (6.966666666666666, 'immediately around water'),\n",
       " (6.933333333333334, 'birds may either'),\n",
       " (6.93030303030303, 'black heron'),\n",
       " (6.93030303030303, 'black heron'),\n",
       " (6.857142857142857, 'small'),\n",
       " (6.854166666666667, 'r .; post'),\n",
       " (6.833333333333333, 'botaurus stellaris'),\n",
       " (6.803030303030303, 'fossil heron'),\n",
       " (6.731182795698925, 'july 2023 ).'),\n",
       " (6.633333333333333, 'genera botaurus'),\n",
       " (6.571428571428571, 'ixobrychus'),\n",
       " (6.5, 'ardeidae could'),\n",
       " (6.432937181663837, 'bitterns ).'),\n",
       " (6.368421052631579, 'smaller bitterns'),\n",
       " (6.368421052631579, 'smaller bitterns'),\n",
       " (6.368421052631579, 'bitterns rather'),\n",
       " (6.346153846153847, 'great bittern'),\n",
       " (6.333333333333334, 'frederick h'),\n",
       " (6.228358208955224, '3 species'),\n",
       " (6.228358208955224, '3 species'),\n",
       " (6.1875, 'cattle egrets'),\n",
       " (6.1875, 'cattle egrets'),\n",
       " (6.1875, 'cattle egrets'),\n",
       " (6.17948717948718, 'least bittern'),\n",
       " (6.17948717948718, 'least bittern'),\n",
       " (6.131868131868131, 'little bittern'),\n",
       " (6.111111111111111, 'typical'),\n",
       " (6.064516129032258, 'ioc ).'),\n",
       " (6.046153846153846, 'backed bittern'),\n",
       " (5.997260273972603, 'day herons'),\n",
       " (5.997260273972603, 'day herons'),\n",
       " (5.997260273972603, 'day herons'),\n",
       " (5.888888888888889, 'microformats articles'),\n",
       " (5.880952380952381, 'also one'),\n",
       " (5.857142857142857, 'egretta'),\n",
       " (5.846153846153847, 'dwarf bittern'),\n",
       " (5.728358208955224, 'many species'),\n",
       " (5.728358208955224, 'many species'),\n",
       " (5.728358208955224, '13 species'),\n",
       " (5.597260273972603, 'backed herons'),\n",
       " (5.597260273972603, 'backed herons'),\n",
       " (5.597260273972603, 'backed herons'),\n",
       " (5.571428571428571, 'browned night'),\n",
       " (5.53030303030303, 'heron'),\n",
       " (5.53030303030303, 'heron'),\n",
       " (5.53030303030303, 'heron'),\n",
       " (5.53030303030303, 'heron'),\n",
       " (5.53030303030303, 'heron'),\n",
       " (5.53030303030303, 'heron'),\n",
       " (5.5, 'larger cycle'),\n",
       " (5.5, 'larger clutches'),\n",
       " (5.5, 'female risks'),\n",
       " (5.5, 'dictionary suggests'),\n",
       " (5.5, '152 cm'),\n",
       " (5.473214285714286, 'little egrets'),\n",
       " (5.473214285714286, 'little egrets'),\n",
       " (5.457142857142857, '1 extinct'),\n",
       " (5.457142857142857, '1 extinct'),\n",
       " (5.444444444444445, 'kevin g'),\n",
       " (5.428571428571429, 'cattle egret'),\n",
       " (5.4, 'bubulcus coromandus'),\n",
       " (5.397260273972602, 'striated herons'),\n",
       " (5.397260273972602, 'herons lay'),\n",
       " (5.397260273972602, 'herons )\".'),\n",
       " (5.333333333333334, 'aquatic animals'),\n",
       " (5.328358208955224, 'several species'),\n",
       " (5.25, 'tropical ones'),\n",
       " (5.25, 'tigriornis leucolophus'),\n",
       " (5.1875, '.; yahya'),\n",
       " (5.1875, '.; lavretsky'),\n",
       " (5.1875, '.; khan'),\n",
       " (5.166666666666666, 'also applied'),\n",
       " (5.161691542288557, 'two species'),\n",
       " (5.161691542288557, 'two species'),\n",
       " (5.15, 'bubulcus ibis'),\n",
       " (5.142857142857142, 'breeding sites'),\n",
       " (5.142857142857142, 'breeding season'),\n",
       " (5.0476190476190474, 'bird groups'),\n",
       " (5.0, 'several cities'),\n",
       " (5.0, 'ritual displays'),\n",
       " (5.0, 'rainy season'),\n",
       " (5.0, 'possibly attract'),\n",
       " (5.0, 'family ardeidae'),\n",
       " (5.0, 'family ardeidae'),\n",
       " (5.0, 'different challenges'),\n",
       " (5.0, 'description'),\n",
       " (5.0, 'compact edition'),\n",
       " (5.0, 'ardeidae gen'),\n",
       " (5.0, 'ardeidae gen'),\n",
       " (5.0, 'ardeidae gen'),\n",
       " (5.0, 'ardeidae bold'),\n",
       " (5.0, 'ardeidae afd'),\n",
       " (5.0, 'ardeidae adw'),\n",
       " (5.0, 'ardeidae )\".'),\n",
       " (4.933333333333334, 'may raise'),\n",
       " (4.909090909090909, 'white morphs'),\n",
       " (4.857142857142858, 'volume 1'),\n",
       " (4.833333333333334, 'usually considered'),\n",
       " (4.833333333333334, 'nighttime feeding'),\n",
       " (4.833333333333334, 'feeding behavior'),\n",
       " (4.828358208955224, 'temperate species'),\n",
       " (4.828358208955224, 'species live'),\n",
       " (4.800000000000001, '4 extinct'),\n",
       " (4.8, 'butorides virescens'),\n",
       " (4.756929637526652, 'egret species'),\n",
       " (4.75, 'international ornithologists'),\n",
       " (4.7142857142857135, 'little egret'),\n",
       " (4.7142857142857135, 'little egret'),\n",
       " (4.7, 'mostly migratory'),\n",
       " (4.666666666666667, 'july 2023'),\n",
       " (4.666666666666667, 'july 2023'),\n",
       " (4.666666666666666, 'typically placed'),\n",
       " (4.666666666666666, 'similar cues'),\n",
       " (4.666666666666666, 'nocturnal group'),\n",
       " (4.666666666666666, 'isbn 978'),\n",
       " (4.666666666666666, 'isbn 978'),\n",
       " (4.666666666666666, 'isbn 978'),\n",
       " (4.666666666666666, 'auditory cues'),\n",
       " (4.666666666666666, 'also known'),\n",
       " (4.666666666666666, '12 rectrices'),\n",
       " (4.661691542288557, 'solitary species'),\n",
       " (4.661691542288557, '2 species'),\n",
       " (4.661691542288557, '2 species'),\n",
       " (4.65, 'b c'),\n",
       " (4.614072494669509, 'four species'),\n",
       " (4.614072494669509, 'four species'),\n",
       " (4.614072494669509, 'four species'),\n",
       " (4.614072494669509, 'four species'),\n",
       " (4.578358208955224, 'tropical species'),\n",
       " (4.571428571428571, 'snatching prey'),\n",
       " (4.571428571428571, 'siege \".'),\n",
       " (4.571428571428571, 'seeing prey'),\n",
       " (4.571428571428571, 'pelicans \".'),\n",
       " (4.571428571428571, 'lure prey'),\n",
       " (4.571428571428571, 'kenya \".'),\n",
       " (4.571428571428571, 'hidden prey'),\n",
       " (4.571428571428571, 'frighten prey'),\n",
       " (4.564516129032258, '2008 ).'),\n",
       " (4.550580431177446, 'three species'),\n",
       " (4.550580431177446, 'three species'),\n",
       " (4.550580431177446, 'three species'),\n",
       " (4.550580431177446, 'three species'),\n",
       " (4.550580431177446, 'three species'),\n",
       " (4.528358208955224, '4 species'),\n",
       " (4.5, 'variously considered'),\n",
       " (4.5, 'using bait'),\n",
       " (4.5, 'success rate'),\n",
       " (4.5, 'previous order'),\n",
       " (4.5, 'placed high'),\n",
       " (4.5, 'oed speculates'),\n",
       " (4.5, 'held backwards'),\n",
       " (4.5, 'geographical variation'),\n",
       " (4.5, 'extremely fine'),\n",
       " (4.5, 'evolutionary adaptation'),\n",
       " (4.5, 'considered provisional'),\n",
       " (4.5, 'annual migration'),\n",
       " (4.4, 'phylogenomic study'),\n",
       " (4.397849462365592, '2023 ).'),\n",
       " (4.368421052631579, 'bitterns'),\n",
       " (4.368421052631579, 'bitterns'),\n",
       " (4.368421052631579, 'bitterns'),\n",
       " (4.368421052631579, 'bitterns'),\n",
       " (4.368421052631579, 'bitterns'),\n",
       " (4.368421052631579, 'bitterns'),\n",
       " (4.368421052631579, 'bitterns'),\n",
       " (4.333333333333334, 'usually blue'),\n",
       " (4.333333333333334, 'solitary foraging'),\n",
       " (4.333333333333334, 'relative patterns'),\n",
       " (4.333333333333334, 'new zealand'),\n",
       " (4.333333333333334, 'new zealand'),\n",
       " (4.333333333333334, 'new zealand'),\n",
       " (4.333333333333334, 'new york'),\n",
       " (4.333333333333334, 'lower part'),\n",
       " (4.333333333333334, 'including fish'),\n",
       " (4.333333333333334, 'including birds'),\n",
       " (4.333333333333334, 'corroborate patterns'),\n",
       " (4.333333333333334, 'collaborators published'),\n",
       " (4.333333333333334, 'aquatic insects'),\n",
       " (4.328358208955224, 'species occur'),\n",
       " (4.328358208955224, 'smallest species'),\n",
       " (4.328358208955224, 'six species'),\n",
       " (4.328358208955224, 'six species'),\n",
       " (4.328358208955224, 'largest species'),\n",
       " (4.314285714285715, 'bird eggs'),\n",
       " (4.285714285714286, 'four days'),\n",
       " (4.266666666666667, 'wings may'),\n",
       " (4.25, 'tropical birds'),\n",
       " (4.25, 'b hilaluddin'),\n",
       " (4.222222222222222, 'three broods'),\n",
       " (4.2, 'mostly sedentary'),\n",
       " (4.2, 'molecular studies'),\n",
       " (4.2, 'molecular biology'),\n",
       " (4.181818181818182, 'ardea modesta'),\n",
       " (4.181818181818182, 'ardea cinerea'),\n",
       " (4.166666666666667, 'phylogenetic relationships'),\n",
       " (4.166666666666666, 'diet includes'),\n",
       " (4.142857142857142, 'breeding strategies'),\n",
       " (4.128358208955224, 'colonial species'),\n",
       " (4.128358208955224, '6 species'),\n",
       " (4.128358208955224, '6 species'),\n",
       " (4.128358208955224, '6 species'),\n",
       " (4.066666666666666, 'retracted neck'),\n",
       " (4.064516129032258, '2024 ).'),\n",
       " (4.064516129032258, '2009 ).'),\n",
       " (4.064516129032258, '2009 ).'),\n",
       " (4.064516129032258, '2000 ).'),\n",
       " (4.064516129032258, '1998 ).'),\n",
       " (4.064516129032258, '1995 ).'),\n",
       " (4.064516129032258, '1994 ).'),\n",
       " (4.064516129032258, '1992 ).'),\n",
       " (4.064516129032258, '1991 ).'),\n",
       " (4.064516129032258, '1991 ).'),\n",
       " (4.064516129032258, '1988 ).'),\n",
       " (4.064516129032258, '1973 ).'),\n",
       " (4.064516129032258, '1966 ).'),\n",
       " (4.064516129032258, '1946 ).'),\n",
       " (4.0, 'ˈʃaɪtpoʊk /,'),\n",
       " (4.0, 'witt cc'),\n",
       " (4.0, 'wilson bulletin'),\n",
       " (4.0, 'widespread family'),\n",
       " (4.0, 'wider field'),\n",
       " (4.0, 'weakly person'),\n",
       " (4.0, 'watts gallery'),\n",
       " (4.0, 'watery environments'),\n",
       " (4.0, 'uttar pradesh'),\n",
       " (4.0, 'urban ecosystems'),\n",
       " (4.0, 'upright posture'),\n",
       " (4.0, 'traditionally done'),\n",
       " (4.0, 'terms used'),\n",
       " (4.0, 'tarsometatarsus assigned'),\n",
       " (4.0, 'swimming waterbirds'),\n",
       " (4.0, 'strikingly complex'),\n",
       " (4.0, 'steadman dw'),\n",
       " (4.0, 'sometimes included'),\n",
       " (4.0, 'sometimes included'),\n",
       " (4.0, 'sometimes included'),\n",
       " (4.0, 'sometimes included'),\n",
       " (4.0, 'sized birds'),\n",
       " (4.0, 'sit motionless'),\n",
       " (4.0, 's2cid 85622885'),\n",
       " (4.0, 's2cid 6472805'),\n",
       " (4.0, 'royal decree'),\n",
       " (4.0, 'robert gillmor'),\n",
       " (4.0, 'robert gillmor'),\n",
       " (4.0, 'results suggest'),\n",
       " (4.0, 'results conflict'),\n",
       " (4.0, 'reduce glare'),\n",
       " (4.0, 'q18789 wikispecies'),\n",
       " (4.0, 'published example'),\n",
       " (4.0, 'proardea matuku'),\n",
       " (4.0, 'previously placed'),\n",
       " (4.0, 'predominantly found'),\n",
       " (4.0, 'pmid 18583609'),\n",
       " (4.0, 'pmid 10723744'),\n",
       " (4.0, 'peter hayman'),\n",
       " (4.0, 'particularly inclined'),\n",
       " (4.0, 'particularly crabs'),\n",
       " (4.0, 'north america'),\n",
       " (4.0, 'norfolk broads'),\n",
       " (4.0, 'nbnsys0000159424 ncbi'),\n",
       " (4.0, 'mostly colonial'),\n",
       " (4.0, 'moore ws'),\n",
       " (4.0, 'mirriam company'),\n",
       " (4.0, 'miglia kj'),\n",
       " (4.0, 'melanophoyx ardesiaca'),\n",
       " (4.0, 'marks bd'),\n",
       " (4.0, 'male works'),\n",
       " (4.0, 'male employs'),\n",
       " (4.0, 'lynx edicions'),\n",
       " (4.0, 'lowland areas'),\n",
       " (4.0, 'locate roosting'),\n",
       " (4.0, 'levy county'),\n",
       " (4.0, 'levy county'),\n",
       " (4.0, 'lay olive'),\n",
       " (4.0, 'late oligocene'),\n",
       " (4.0, 'late miocene'),\n",
       " (4.0, 'late miocene'),\n",
       " (4.0, 'kimball rt'),\n",
       " (4.0, 'jstor 4163462'),\n",
       " (4.0, 'jstor 4088682'),\n",
       " (4.0, 'impotent mates'),\n",
       " (4.0, 'human persecution'),\n",
       " (4.0, 'huddleston cj'),\n",
       " (4.0, 'harshman j'),\n",
       " (4.0, 'han kl'),\n",
       " (4.0, 'hackett sj'),\n",
       " (4.0, 'habitats except'),\n",
       " (4.0, 'glossy blue'),\n",
       " (4.0, 'game birds'),\n",
       " (4.0, 'full canopy'),\n",
       " (4.0, 'frankish haigiro'),\n",
       " (4.0, 'frank gill'),\n",
       " (4.0, 'foraging success'),\n",
       " (4.0, 'foraging success'),\n",
       " (4.0, 'foraging implications'),\n",
       " (4.0, 'family exhibits'),\n",
       " (4.0, 'family belongs'),\n",
       " (4.0, 'extinct long'),\n",
       " (4.0, 'et sp'),\n",
       " (4.0, 'et sp'),\n",
       " (4.0, 'et sp'),\n",
       " (4.0, 'et al'),\n",
       " (4.0, 'essentially non'),\n",
       " (4.0, 'egg clutches'),\n",
       " (4.0, 'early oligocene'),\n",
       " (4.0, 'early miocene'),\n",
       " (4.0, 'early miocene'),\n",
       " (4.0, 'early miocene'),\n",
       " (4.0, 'early miocene'),\n",
       " (4.0, 'early miocene'),\n",
       " (4.0, 'driest deserts'),\n",
       " (4.0, 'djebel zelten'),\n",
       " (4.0, 'djebel zelten'),\n",
       " (4.0, 'disperse widely'),\n",
       " (4.0, 'del hoyo'),\n",
       " (4.0, 'deep booming'),\n",
       " (4.0, 'decorative plumes'),\n",
       " (4.0, 'cox wa'),\n",
       " (4.0, 'coral beaches'),\n",
       " (4.0, 'constrained record'),\n",
       " (4.0, 'completely resolved'),\n",
       " (4.0, 'collaborators resurrected'),\n",
       " (4.0, 'coldest extremes'),\n",
       " (4.0, 'coastal birds'),\n",
       " (4.0, 'cladogram shown'),\n",
       " (4.0, 'chojnowski jl'),\n",
       " (4.0, 'cervical vertebrae'),\n",
       " (4.0, 'catch fish'),\n",
       " (4.0, 'braun mj'),\n",
       " (4.0, 'braun el'),\n",
       " (4.0, 'bowie rc'),\n",
       " (4.0, 'birds reveals'),\n",
       " (4.0, 'banded killifish'),\n",
       " (4.0, 'badly resolved'),\n",
       " (4.0, 'attract fish'),\n",
       " (4.0, 'attract females'),\n",
       " (4.0, 'arrangement presented'),\n",
       " (4.0, 'amazonian peru'),\n",
       " (4.0, 'alpine areas'),\n",
       " (4.0, 'aggressive attack'),\n",
       " (4.0, 'additional observations'),\n",
       " (4.0, 'actually belongs'),\n",
       " (4.0, '8899 nzor'),\n",
       " (4.0, '8013 eppo'),\n",
       " (4.0, '6pb eol'),\n",
       " (4.0, '4929 irmng'),\n",
       " (4.0, '39541 worms'),\n",
       " (4.0, '3685 inaturalist'),\n",
       " (4.0, '174771 nbn'),\n",
       " (4.0, '1444 col'),\n",
       " (4.0, '101354 itis'),\n",
       " (4.0, '062322a50823 gbif'),\n",
       " (3.916666666666667, 'b hruska'),\n",
       " (3.904761904761905, 'seen prey'),\n",
       " (3.9, '3'),\n",
       " (3.8826979472140764, 'pdf ).'),\n",
       " (3.8826979472140764, 'pdf ).'),\n",
       " (3.8826979472140764, 'pdf ).'),\n",
       " (3.8826979472140764, 'pdf ).'),\n",
       " (3.8826979472140764, 'pdf ).'),\n",
       " (3.8826979472140764, 'pdf ).'),\n",
       " (3.8826979472140764, 'pdf ).'),\n",
       " (3.8826979472140764, 'pdf ).'),\n",
       " (3.8826979472140764, 'pdf ).'),\n",
       " (3.8461538461538463, 'bittern'),\n",
       " (3.8461538461538463, 'bittern'),\n",
       " (3.833333333333333, 'nesting site'),\n",
       " (3.828358208955224, '14 species'),\n",
       " (3.8, 'sheldon fh'),\n",
       " (3.8, '6 times'),\n",
       " (3.75, 'mitochondrial dna'),\n",
       " (3.75, 'dna studies'),\n",
       " (3.75, 'dna hybridization'),\n",
       " (3.7, 'mostly associated'),\n",
       " (3.666666666666667, 'usually yellow'),\n",
       " (3.666666666666667, 'relationships among'),\n",
       " (3.666666666666667, 'often coinciding'),\n",
       " (3.666666666666667, 'jack hruska'),\n",
       " (3.666666666666667, 'extreme example'),\n",
       " (3.6616915422885574, '8 species'),\n",
       " (3.642857142857143, 'breeding plumage'),\n",
       " (3.601085481682497, 'fossil species'),\n",
       " (3.6, 'seven eggs'),\n",
       " (3.6, 'name appears'),\n",
       " (3.5714285714285716, 'night'),\n",
       " (3.564516129032258, '2011 ).'),\n",
       " (3.564516129032258, '2006 ).'),\n",
       " (3.5, 'wing fishing'),\n",
       " (3.5, 'wide variety'),\n",
       " (3.5, 'thin toes'),\n",
       " (3.5, 'stretch display'),\n",
       " (3.5, 'stirring'),\n",
       " (3.5, 'round breeders'),\n",
       " (3.5, 'plumage polymorphism'),\n",
       " (3.5, 'necked birds'),\n",
       " (3.5, 'modified shape'),\n",
       " (3.5, 'list'),\n",
       " (3.5, 'less tied'),\n",
       " (3.5, 'legged waterfowl'),\n",
       " (3.5, 'jstor 4089118'),\n",
       " (3.5, 'jstor 4083060'),\n",
       " (3.5, 'jstor 4080141'),\n",
       " (3.5, 'jstor 2424157'),\n",
       " (3.5, 'jstor 1368954'),\n",
       " (3.5, 'jstor 1368843'),\n",
       " (3.5, 'james vi'),\n",
       " (3.5, 'james j'),\n",
       " (3.5, 'foot'),\n",
       " (3.5, 'eds .).'),\n",
       " (3.5, 'cosmopolitan distribution'),\n",
       " (3.5, 'correct placement'),\n",
       " (3.5, 'bare parts'),\n",
       " (3.5, '60 paces'),\n",
       " (3.466666666666667, '18 genera'),\n",
       " (3.409090909090909, 'mainly white'),\n",
       " (3.4, 'long beaks'),\n",
       " (3.4, 'large'),\n",
       " (3.4, '13'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3972602739726026, 'herons'),\n",
       " (3.3333333333333335, '15'),\n",
       " (3.333333333333333, 'suitable trees'),\n",
       " (3.333333333333333, 'rarely carrion'),\n",
       " (3.333333333333333, 'nesting ecology'),\n",
       " (3.333333333333333, 'crouched position'),\n",
       " (3.333333333333333, 'atypical bill'),\n",
       " (3.3, 'shallow water'),\n",
       " (3.2727272727272725, 'disputed fossil'),\n",
       " (3.25, 'gorsachius'),\n",
       " (3.1875, 'egrets'),\n",
       " (3.1875, 'egrets'),\n",
       " (3.1875, 'egrets'),\n",
       " (3.1875, 'egrets'),\n",
       " (3.1875, 'egrets'),\n",
       " (3.166666666666667, 'exhibiting 10'),\n",
       " (3.142857142857143, 'boat'),\n",
       " (3.142857142857143, 'boat'),\n",
       " (3.142857142857143, 'boat'),\n",
       " (3.142857142857143, 'boat'),\n",
       " (3.0, 'skull'),\n",
       " (3.0, 'pamela rasmussen'),\n",
       " (3.0, 'often referred'),\n",
       " (3.0, 'necks retracted'),\n",
       " (3.0, 'mccracken'),\n",
       " (3.0, 'h'),\n",
       " (3.0, 'green'),\n",
       " (3.0, 'e'),\n",
       " (3.0, 'david donsker'),\n",
       " (3.0, 'ardeidae'),\n",
       " (3.0, 'ardeidae'),\n",
       " (3.0, 'ardeidae'),\n",
       " (3.0, 'ardeidae'),\n",
       " (3.0, '11'),\n",
       " (2.9333333333333336, 'brown eggs'),\n",
       " (2.933333333333333, 'may'),\n",
       " (2.933333333333333, 'may'),\n",
       " (2.933333333333333, 'may'),\n",
       " (2.9, 'generally long'),\n",
       " (2.857142857142857, '1'),\n",
       " (2.8333333333333335, 'feeding'),\n",
       " (2.833333333333333, 'thick bill'),\n",
       " (2.8, 'use'),\n",
       " (2.8, 'use'),\n",
       " (2.8, 'butorides'),\n",
       " (2.7333333333333334, 'long legs'),\n",
       " (2.7142857142857144, 'one'),\n",
       " (2.7142857142857144, 'one'),\n",
       " (2.7142857142857144, 'one'),\n",
       " (2.7142857142857144, 'bird'),\n",
       " (2.7142857142857144, 'bird'),\n",
       " (2.6666666666666665, 'world'),\n",
       " (2.6666666666666665, 'world'),\n",
       " (2.6666666666666665, 'group'),\n",
       " (2.6666666666666665, 'feathers'),\n",
       " (2.6666666666666665, 'f'),\n",
       " (2.6666666666666665, 'colour'),\n",
       " (2.6666666666666665, 'around'),\n",
       " (2.5714285714285716, 'prey'),\n",
       " (2.5714285714285716, 'prey'),\n",
       " (2.5714285714285716, 'prey'),\n",
       " (2.5, 'range'),\n",
       " (2.5, 'london'),\n",
       " (2.5, 'k'),\n",
       " (2.5, 'k'),\n",
       " (2.5, 'heronconservation'),\n",
       " (2.5, 'exist'),\n",
       " (2.5, 'cochlearidae'),\n",
       " (2.5, 'away'),\n",
       " (2.5, '17'),\n",
       " (2.5, '17'),\n",
       " (2.4, 'study'),\n",
       " (2.4, 'neck'),\n",
       " (2.4, 'neck'),\n",
       " (2.4, 'c'),\n",
       " (2.4, '0'),\n",
       " (2.4, '0'),\n",
       " (2.3333333333333335, 'usually'),\n",
       " (2.3333333333333335, 'part'),\n",
       " (2.3333333333333335, 'p'),\n",
       " (2.3333333333333335, 'evolution'),\n",
       " (2.3333333333333335, 'evolution'),\n",
       " (2.3333333333333335, '2023'),\n",
       " (2.3333333333333335, '2'),\n",
       " (2.3333333333333335, '2'),\n",
       " (2.3333333333333335, '1971'),\n",
       " (2.328358208955224, 'species'),\n",
       " (2.328358208955224, 'species'),\n",
       " (2.328358208955224, 'species'),\n",
       " (2.328358208955224, 'species'),\n",
       " (2.328358208955224, 'species'),\n",
       " (2.328358208955224, 'species'),\n",
       " (2.328358208955224, 'species'),\n",
       " (2.328358208955224, 'species'),\n",
       " (2.328358208955224, 'species'),\n",
       " (2.328358208955224, 'species'),\n",
       " (2.25, 'feed'),\n",
       " (2.25, 'feed'),\n",
       " (2.2222222222222223, 'three'),\n",
       " (2.2, 'molecular'),\n",
       " (2.2, '4'),\n",
       " (2.2, '4'),\n",
       " (2.1818181818181817, 'ardea'),\n",
       " (2.1818181818181817, 'ardea'),\n",
       " (2.1818181818181817, 'ardea'),\n",
       " (2.1818181818181817, 'ardea'),\n",
       " (2.1818181818181817, 'ardea'),\n",
       " (2.142857142857143, 'breeding'),\n",
       " (2.142857142857143, 'breeding'),\n",
       " (2.111111111111111, 'g'),\n",
       " (2.111111111111111, 'g'),\n",
       " (2.111111111111111, 'g'),\n",
       " (2.111111111111111, 'g'),\n",
       " (2.111111111111111, 'g'),\n",
       " (2.0, 'well'),\n",
       " (2.0, 'watts'),\n",
       " (2.0, 'waterbirds'),\n",
       " (2.0, 'waterbirds'),\n",
       " (2.0, 'ukad005'),\n",
       " (2.0, 'taxonomy'),\n",
       " (2.0, 'reported'),\n",
       " (2.0, 'reported'),\n",
       " (2.0, 'reported'),\n",
       " (2.0, 'rather'),\n",
       " (2.0, 'present'),\n",
       " (2.0, 'post'),\n",
       " (2.0, 'place'),\n",
       " (2.0, 'pelecaniformes'),\n",
       " (2.0, 'occur'),\n",
       " (2.0, 'j'),\n",
       " (2.0, 'j'),\n",
       " (2.0, 'j'),\n",
       " (2.0, 'insects'),\n",
       " (2.0, 'found'),\n",
       " (2.0, 'found'),\n",
       " (2.0, 'flushed'),\n",
       " (2.0, 'family'),\n",
       " (2.0, 'family'),\n",
       " (2.0, 'except'),\n",
       " (2.0, 'either'),\n",
       " (2.0, 'courtship'),\n",
       " (2.0, 'courtship'),\n",
       " (2.0, 'corruption'),\n",
       " (2.0, 'birds'),\n",
       " (2.0, 'birds'),\n",
       " (2.0, 'birds'),\n",
       " (2.0, 'bait'),\n",
       " (2.0, 'areas'),\n",
       " (2.0, 'analyses'),\n",
       " (2.0, 'actively'),\n",
       " (2.0, '320'),\n",
       " (2.0, '21'),\n",
       " (1.9090909090909092, 'white'),\n",
       " (1.9090909090909092, 'white'),\n",
       " (1.9090909090909092, 'white'),\n",
       " (1.9090909090909092, 'white'),\n",
       " (1.9090909090909092, 'white'),\n",
       " (1.9090909090909092, 'white'),\n",
       " (1.8333333333333333, 'nesting'),\n",
       " (1.8333333333333333, 'nesting'),\n",
       " (1.8333333333333333, 'nesting'),\n",
       " (1.8181818181818181, 'pdf'),\n",
       " (1.8181818181818181, 'pdf'),\n",
       " (1.8, 'year'),\n",
       " (1.8, 'year'),\n",
       " (1.8, 'year'),\n",
       " (1.8, 'year'),\n",
       " (1.8, 'sheldon'),\n",
       " (1.8, 'sheldon'),\n",
       " (1.8, 'sheldon'),\n",
       " (1.8, 'grey'),\n",
       " (1.8, 'genera'),\n",
       " (1.8, 'genera'),\n",
       " (1.8, 'colonial'),\n",
       " (1.8, 'colonial'),\n",
       " (1.8, '6'),\n",
       " (1.75, 'ibis'),\n",
       " (1.75, 'ibis'),\n",
       " (1.75, 'dna'),\n",
       " (1.7142857142857142, 'nest'),\n",
       " (1.7142857142857142, 'nest'),\n",
       " (1.7142857142857142, 'nest'),\n",
       " (1.7142857142857142, 'nest'),\n",
       " (1.7142857142857142, 'nest'),\n",
       " (1.6666666666666667, 'whereas'),\n",
       " (1.6666666666666667, 'whereas'),\n",
       " (1.6666666666666667, 'size'),\n",
       " (1.6666666666666667, 'size'),\n",
       " (1.6666666666666667, 'retracted'),\n",
       " (1.6666666666666667, 'relationships'),\n",
       " (1.6666666666666667, 'r'),\n",
       " (1.6666666666666667, 'r'),\n",
       " (1.6666666666666667, 'often'),\n",
       " (1.6666666666666667, 'included'),\n",
       " (1.6666666666666667, 'included'),\n",
       " (1.6666666666666667, 'hruska'),\n",
       " (1.6666666666666667, 'example'),\n",
       " (1.6666666666666667, 'archived'),\n",
       " (1.6666666666666667, 'archived'),\n",
       " (1.6666666666666667, 'almost'),\n",
       " (1.6666666666666667, 'almost'),\n",
       " (1.6666666666666667, '18'),\n",
       " (1.6, 'name'),\n",
       " (1.6, 'name'),\n",
       " (1.6, 'name'),\n",
       " (1.6, 'eggs'),\n",
       " (1.6, 'eggs'),\n",
       " (1.5714285714285714, '7'),\n",
       " (1.5714285714285714, '7'),\n",
       " (1.5714285714285714, '7'),\n",
       " (1.5714285714285714, '7'),\n",
       " (1.5714285714285714, '7'),\n",
       " (1.5714285714285714, '7'),\n",
       " (1.5714285714285714, '7'),\n",
       " (1.5714285714285714, '7'),\n",
       " (1.5714285714285714, '7'),\n",
       " (1.5714285714285714, '7'),\n",
       " (1.5714285714285714, '7'),\n",
       " (1.5, 'variety'),\n",
       " (1.5, 'used'),\n",
       " (1.5, 'used'),\n",
       " (1.5, 'used'),\n",
       " (1.5, 'used'),\n",
       " ...]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from rake_nltk import Rake\n",
    "import nltk #you may need to download additional resources like punkt\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "\n",
    "# Download the Punkt tokenizer\n",
    "nltk.download('punkt')  \n",
    "nltk.download('stopwords')\n",
    "\n",
    "#Sample text\n",
    "text = \"Natural language processing (NLP) is an interdisciplinary field that focuses on the interactions between computers and human language. including speech recognition, machine translation, and text analysis.\"\n",
    "\n",
    "# Initiate the RAKE object and run it on the text\n",
    "r = Rake()\n",
    "\n",
    "# Extraction given the text.\n",
    "r.extract_keywords_from_text(joined_string)\n",
    "r.get_ranked_phrases_with_scores()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('Herons', 0.0019341151026289274), ('heron', 0.0020806389740402097), ('genus', 0.002779114198769202), ('night herons', 0.003407551995779094), ('heron Genus', 0.003440693868056681), ('species', 0.0035328448489132763), ('heron Genus Ixobrychus', 0.007802717741497088), ('heron Genus Egretta', 0.008165381990151557), ('heron Genus Zonerodius', 0.01088922519768467), ('heron Genus Taphophoyx', 0.012246806133592213), ('boat-billed heron', 0.012547724730764211), ('Agami heron Genus', 0.012552446379536716), ('heron Genus Cochlearius', 0.0126236034378057), ('heron Genus Agamia', 0.01263198211070613), ('tiger heron Genus', 0.01263302474300582), ('tiger herons', 0.013108091837444087), ('Ardeidae', 0.013458020786417972), ('genus Ardea', 0.014145228646781595), ('boat-billed heron Genus', 0.014222182748485945), ('zigzag heron Genus', 0.015083995855833034)]\n"
     ]
    }
   ],
   "source": [
    "#Take 2\n",
    "import yake \n",
    "\n",
    "yake_kw = yake.KeywordExtractor() \n",
    "KeyWords = yake_kw.extract_keywords(joined_string) \n",
    "  \n",
    "# Displaying the keywords \n",
    "print(KeyWords) \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to\n",
      "[nltk_data]     d:\\Programms\\Anaconda\\envs\\sneakpic\\lib\\nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n"
     ]
    },
    {
     "ename": "AttributeError",
     "evalue": "'Rake' object has no attribute 'run'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[6], line 13\u001b[0m\n\u001b[0;32m     10\u001b[0m r \u001b[38;5;241m=\u001b[39m Rake(stopwords\u001b[38;5;241m=\u001b[39mstopwords\u001b[38;5;241m.\u001b[39mwords(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124menglish\u001b[39m\u001b[38;5;124m\"\u001b[39m), min_length\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, max_length\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m4\u001b[39m)\n\u001b[0;32m     12\u001b[0m \u001b[38;5;66;03m# Run RAKE on the joined string\u001b[39;00m\n\u001b[1;32m---> 13\u001b[0m keywords \u001b[38;5;241m=\u001b[39m \u001b[43mr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m(joined_string)\n\u001b[0;32m     15\u001b[0m \u001b[38;5;66;03m# Get the top keywords with their scores\u001b[39;00m\n\u001b[0;32m     16\u001b[0m top_keywords \u001b[38;5;241m=\u001b[39m r\u001b[38;5;241m.\u001b[39mget_ranked_phrases()\n",
      "\u001b[1;31mAttributeError\u001b[0m: 'Rake' object has no attribute 'run'"
     ]
    }
   ],
   "source": [
    "#take 1\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from rake_nltk import Rake\n",
    "\n",
    "# Download the Punkt tokenizer\n",
    "nltk.download('punkt')\n",
    "\n",
    "# Initialize the RAKE object\n",
    "r = Rake(stopwords=stopwords.words(\"english\"), min_length=1, max_length=4)\n",
    "\n",
    "# Run RAKE on the joined string\n",
    "keywords = r.run(joined_string)\n",
    "\n",
    "# Get the top keywords with their scores\n",
    "top_keywords = r.get_ranked_phrases()\n",
    "print(top_keywords)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import spacy\n",
    "nlp = spacy.load('en_core_web_sm')\n",
    "\n",
    "def extract_keywords_spacy(text):\n",
    "    doc = nlp(text)\n",
    "    keywords = [token.text for token in doc if token.is_alpha and token.is_stop != True and token.pos_ == \"NOUN\"]\n",
    "    return keywords"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h1>Frequency Based</h1>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to\n",
      "[nltk_data]     d:\\Programms\\Anaconda\\envs\\sneakpic\\lib\\nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package averaged_perceptron_tagger to\n",
      "[nltk_data]     d:\\Programms\\Anaconda\\envs\\sneakpic\\lib\\nltk_data...\n",
      "[nltk_data]   Package averaged_perceptron_tagger is already up-to-\n",
      "[nltk_data]       date!\n",
      "[nltk_data] Downloading package wordnet to\n",
      "[nltk_data]     d:\\Programms\\Anaconda\\envs\\sneakpic\\lib\\nltk_data...\n",
      "[nltk_data]   Unzipping corpora\\wordnet.zip.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('Halo', 'NN')]\n",
      "NLTK Keywords: ['heron', 'specie', 'genus', 'bittern', 'night', 'bird', 'egret', 'ardeidae', 'edit', 'prey']\n"
     ]
    }
   ],
   "source": [
    "\n",
    "import nltk\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tag import pos_tag\n",
    "nltk.download('punkt')\n",
    "nltk.download('averaged_perceptron_tagger')\n",
    "nltk.download('wordnet')\n",
    "# Stop words list\n",
    "stop_words = set(stopwords.words('english'))\n",
    "\n",
    "print(pos_tag([\"Halo\"]))\n",
    "def preprocess_text(text):\n",
    "    wnl = WordNetLemmatizer()\n",
    "    tokens = word_tokenize(text.lower())\n",
    "    filtered_tokens = [word for (word,pos) in pos_tag(tokens) if word not in stop_words and len(word) > 2 and pos[:2] == 'NN']\n",
    "    lemmatized_tokens = [wnl.lemmatize(word) for word in filtered_tokens]\n",
    "    return lemmatized_tokens\n",
    "\n",
    "\n",
    "def extract_keywords_nltk(text):\n",
    "    words = preprocess_text(text)\n",
    "    # Frequency distribution of words\n",
    "    freq_dist = nltk.FreqDist(words)\n",
    "    # Get the 10 most common words\n",
    "    most_common_words = freq_dist.most_common(10)\n",
    "    keywords = [word for word, _ in most_common_words]\n",
    "    return keywords\n",
    "\n",
    "text = \"This is a sample text that contains some important keywords like machine learning, natural language processing, and Python.\"\n",
    "\n",
    "#print(\"SpaCy Keywords:\", extract_keywords_spacy(joined_string))\n",
    "print(\"NLTK Keywords:\", extract_keywords_nltk(joined_string))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sneakpic",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}