Spaces:

Trinay16
/

WIldCards

No application file

App Files Files Community

Trinay16 committed on Jul 1, 2025

Commit

c77c0e7

verified ·

1 Parent(s): cf38d99

Upload app.py

Browse files

Files changed (1) hide show

app.py +1343 -0

app.py ADDED Viewed

	@@ -0,0 +1,1343 @@

+import streamlit as st
+import requests
+import os
+import re
+from PIL import Image
+import tempfile
# Image extensions (lower-case, without the leading dot) accepted for upload.
ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif'}


def allowed_file(filename):
    """Tell whether `filename` carries one of the ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot in the name and is
    compared case-insensitively. A name without any dot is rejected.
    """
    if '.' not in filename:
        return False
    _, extension = filename.rsplit('.', 1)
    return extension.lower() in ALLOWED_EXTENSIONS
def main():
    """Render the Streamlit page with its two search tabs.

    Tab 1 looks a species up by the name typed by the user. Tab 2 accepts an
    image upload, derives a species name from the file name through
    `get_mock_species_from_filename`, and runs the same lookup. In both cases
    the lookup is `get_species_info` + `get_species_images`, and the outcome
    is rendered by `display_results`.
    """
    st.set_page_config(page_title="Species Information Finder", layout="wide")
    st.title("Species Information Finder")
    st.write("Discover information about any species by name or by uploading an image.")
    # Create tabs for different functionality
    name_tab, image_tab = st.tabs(["Search by Name", "Search by Image"])
    with name_tab:
        st.header("Search by Species Name")
        species_name = st.text_input("Enter a species name (common or scientific):")
        if st.button("Search"):
            # Strip first so that a whitespace-only entry counts as empty
            # instead of being sent to the APIs.
            species_name = species_name.strip()
            if not species_name:
                st.error("Please enter a species name")
            else:
                with st.spinner("Searching for species information..."):
                    # Get species info from the Wikispecies / Wikipedia APIs
                    species_data = get_species_info(species_name)
                    # Get images from Wikimedia Commons API
                    images = get_species_images(species_name)
                    display_results(species_data, images)
    with image_tab:
        st.header("Search by Image Upload")
        # `type` is documented as a str or list of str, so pass a (stable,
        # sorted) list rather than the raw set.
        uploaded_file = st.file_uploader(
            "Upload an image of a species", type=sorted(ALLOWED_EXTENSIONS)
        )
        if uploaded_file is not None:
            if not allowed_file(uploaded_file.name):
                st.error("File type not allowed. Please upload an image file (PNG, JPG, JPEG, GIF).")
            else:
                # A file with an image extension can still be corrupt or not
                # an image at all; PIL signals that with an OSError subclass.
                try:
                    image = Image.open(uploaded_file)
                except OSError:
                    image = None
                    st.error("The uploaded file could not be read as an image. Please upload a valid image file (PNG, JPG, JPEG, GIF).")
                if image is not None:
                    # Display the uploaded image
                    st.image(image, caption="Uploaded Image", use_column_width=True)
                    if st.button("Identify Species"):
                        with st.spinner("Identifying species from image..."):
                            # In a real app, an image recognition API would be
                            # called here; the demo uses the mock function.
                            species_name = get_mock_species_from_filename(uploaded_file.name)
                            species_data = get_species_info(species_name)
                            images = get_species_images(species_name)
                            display_results(species_data, images)
def display_results(species_data, images):
    """Display the lookup results in a formatted way.

    Args:
        species_data: Dict as produced by `get_species_info`. When it has an
            "error" key only that message is shown. Otherwise "title" is
            required and "classification", "habitat", "description" and
            "fun_facts" are used when present.
        images: List of image dicts as produced by `get_species_images`
            ("thumb_url", "url", "description", "author", "license").
            Entries without any usable URL are ignored.
    """
    if "error" in species_data:
        st.error(species_data["error"])
        return
    st.success(f"Found information for: {species_data['title']}")
    # Create columns for layout
    info_col, text_col = st.columns([1, 2])
    with info_col:
        # Display classification information
        st.subheader("Classification")
        classification = species_data.get("classification", {})
        for rank, value in classification.items():
            if value != "Unknown":
                # Both bold markers are needed; a lone closing "**" is
                # rendered literally by Markdown.
                st.write(f"**{rank.capitalize()}:** {value}")
        # Display habitat information
        if species_data.get("habitat", "Unknown") != "Unknown":
            st.subheader("Habitat")
            st.write(species_data["habitat"])
    with text_col:
        # Display description
        st.subheader("Description")
        st.write(species_data.get("description", "No description available."))
        # Display fun facts if available
        if species_data.get("fun_facts"):
            st.subheader("Interesting Facts")
            for i, fact in enumerate(species_data["fun_facts"], 1):
                st.write(f"{i}. {fact}")
    # Keep at most 4 entries that really have something to show. The image
    # dicts always carry a "thumb_url" key (possibly empty), and a failed
    # search may yield entries without any URL, so test the values.
    displayable = []
    for img in images or []:
        source = img.get("thumb_url") or img.get("url")
        if source:
            displayable.append((source, img))
        if len(displayable) == 4:
            break
    if displayable:
        st.subheader("Related Images")
        # One column per image that will actually be rendered
        cols = st.columns(len(displayable))
        for col, (source, img) in zip(cols, displayable):
            with col:
                st.image(source, caption=img.get("description", ""), use_column_width=True)
                st.caption(f"Credit: {img.get('author', 'Unknown')} | License: {img.get('license', 'Unknown')}")
    else:
        st.warning("No images found for this species.")
+# All the existing functions from your Flask app can remain exactly the same
+# (get_species_info, get_wikispecies_data, get_wikipedia_data, etc.)
+# I'll include them below for completeness, but they don't need to change
def get_species_info(species_name):
    """
    Get species information from both Wikispecies and Wikipedia and merge it.

    Wikispecies is queried first and, when it answers without an error, its
    fields replace the defaults. Wikipedia is then used to supplement the
    result: its description is used when the current one is missing or
    shorter than 50 characters, its habitat and classification values are
    preferred whenever they are known, and its fun facts are appended when
    they are not too similar (per `similarity_score`) to the existing ones.

    Args:
        species_name: Common or scientific name to look up.

    Returns:
        A dict with the keys "title", "description", "categories", "links",
        "last_modified", "classification", "habitat", "fun_facts" and
        "data_sources". An "error" key is added when neither source
        returned data.
    """
    # Create the base species info structure
    species_info = {
        "title": species_name,  # Default to the search query
        "description": "No description available.",
        "categories": [],
        "links": [],
        "last_modified": "Unknown",
        "classification": {
            "kingdom": "Unknown",
            "phylum": "Unknown",
            "class": "Unknown",
            "order": "Unknown",
            "family": "Unknown",
            "genus": "Unknown",
            "species": "Unknown"
        },
        "habitat": "Unknown",
        "fun_facts": [],
        "data_sources": []  # Track where we got data from
    }
    # Try to get data from Wikispecies first
    wikispecies_info = get_wikispecies_data(species_name)
    if not wikispecies_info.get("error"):
        species_info.update(wikispecies_info)
        species_info["data_sources"].append("Wikispecies")
    # Now try to get complementary data from Wikipedia
    wikipedia_info = get_wikipedia_data(species_name)
    if not wikipedia_info.get("error"):
        # Use the Wikipedia description if the current one is missing or very
        # short, but never replace it with an empty Wikipedia description.
        current_description = species_info["description"]
        wiki_description = wikipedia_info.get("description")
        description_is_weak = (
            current_description == "No description available."
            or len(current_description) < 50
        )
        if description_is_weak and wiki_description:
            species_info["description"] = wiki_description
        # Prefer the Wikipedia habitat as it is likely more detailed, but keep
        # what we already have when Wikipedia found nothing ("Unknown").
        wiki_habitat = wikipedia_info.get("habitat")
        if wiki_habitat and wiki_habitat != "Unknown":
            species_info["habitat"] = wiki_habitat
        # Merge classification info from Wikipedia, preferring known values
        for rank, value in (wikipedia_info.get("classification") or {}).items():
            if value != "Unknown":
                species_info["classification"][rank] = value
        # Add Wikipedia fun facts to our collection, avoiding near-duplicates
        if wikipedia_info.get("fun_facts"):
            existing_facts = species_info.get("fun_facts", [])
            for fact in wikipedia_info["fun_facts"]:
                if not any(similarity_score(fact, existing) > 0.7 for existing in existing_facts):
                    existing_facts.append(fact)
            species_info["fun_facts"] = existing_facts[:4]  # Limit to 4 facts
        species_info["data_sources"].append("Wikipedia")
    # If we didn't get any data from either source, return an error
    if not species_info["data_sources"]:
        species_info["error"] = "Species information not found in either Wikispecies or Wikipedia."
    return species_info
def get_wikispecies_data(species_name, timeout=10):
    """
    Get species information from the Wikispecies API.

    Args:
        species_name: Page title to look up on Wikispecies.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict with "title", "description", "categories", "links",
        "last_modified", "classification", "habitat" and "fun_facts".
        When the page does not exist or the request fails, the dict carries
        an "error" key describing the problem.
    """
    # Wikispecies API endpoint
    url = "https://species.wikimedia.org/w/api.php"
    # Parameters for the API request - get more info to work with
    params = {
        "action": "query",
        "format": "json",
        "titles": species_name,
        "prop": "extracts|categories|info|links",
        "exintro": True,  # Get only the intro section
        "explaintext": True,  # Get plain text, not HTML
        "cllimit": 50,  # Get more categories
        "pllimit": 50,  # Get more links
    }
    try:
        # Without a timeout a stalled connection would block the app forever.
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        # Extract page data
        pages = data.get("query", {}).get("pages", {})
        if not pages:
            return {"error": "No data found in Wikispecies"}
        # Get the first page (there should only be one)
        page_id = next(iter(pages))
        page = pages[page_id]
        # Default information structure with placeholders
        species_info = {
            "title": species_name,  # Default to the search query
            "description": "No description available.",
            "categories": [],
            "links": [],
            "last_modified": "Unknown",
            "classification": {
                "kingdom": "Unknown",
                "phylum": "Unknown",
                "class": "Unknown",
                "order": "Unknown",
                "family": "Unknown",
                "genus": "Unknown",
                "species": "Unknown"
            },
            "habitat": "Unknown",
            "fun_facts": []
        }
        # Missing pages are reported under a negative page id
        if int(page_id) < 0:
            species_info["error"] = "Species not found in Wikispecies. Try a different spelling or check for the scientific name."
            return species_info
        # Extract the relevant information
        species_info["title"] = page.get("title", species_name)
        species_info["description"] = page.get("extract", "No description available.")
        # Categories and links; entries without a title are dropped so the
        # string handling below never sees None.
        species_info["categories"] = [
            cat["title"] for cat in page.get("categories", []) if cat.get("title")
        ]
        species_info["links"] = [
            link["title"] for link in page.get("links", []) if link.get("title")
        ]
        species_info["last_modified"] = page.get("touched", "Unknown")
        # Clean up the description (remove line breaks and repeated spaces)
        if species_info["description"]:
            flattened = species_info["description"].replace("\n", " ").strip()
            species_info["description"] = re.sub(r' +', ' ', flattened)
        # Try different strategies to extract classification
        # Strategy 1: Extract from categories
        classification = extract_classification(species_info["categories"])
        species_info["classification"] = classification
        # Strategy 2: a two-word title might be a binomial name (genus + species)
        title_parts = species_info["title"].split()
        if len(title_parts) == 2:
            genus, species = title_parts
            if classification.get("genus") == "Unknown":
                classification["genus"] = genus
            if classification.get("species") == "Unknown":
                classification["species"] = species
        # Strategy 3: single-word links with a common taxonomic suffix
        for link in species_info["links"]:
            if len(link.split()) != 1:
                continue
            if link.endswith("idae"):  # Family suffix
                classification["family"] = link
            elif link.endswith("inae"):  # Subfamily suffix, kept in its own key
                classification["subfamily"] = link
            elif link.endswith("ales"):  # Order suffix for plants
                classification["order"] = link
            elif link.endswith("aceae"):  # Family suffix for plants
                classification["family"] = link
        # Extract habitat info
        species_info["habitat"] = extract_habitat(species_info["description"])
        # Extract fun facts
        species_info["fun_facts"] = extract_fun_facts(species_info["description"])
        # If the description is too short or missing, build a basic one from
        # the classification gathered above.
        if not species_info["description"] or len(species_info["description"]) < 20:
            parts = []
            if classification["genus"] != "Unknown" and classification["species"] != "Unknown":
                parts.append(f"{species_info['title']} is a species in the genus {classification['genus']}.")
            if classification["family"] != "Unknown":
                parts.append(f"It belongs to the family {classification['family']}.")
            if classification["order"] != "Unknown":
                parts.append(f"It is classified under the order {classification['order']}.")
            if parts:
                species_info["description"] = " ".join(parts)
            else:
                species_info["description"] = f"{species_info['title']} is a species documented in Wikispecies, the free species directory."
        return species_info
    except Exception as e:
        # Best-effort boundary: any network, decoding or parsing problem is
        # reported to the caller through the "error" key instead of raising.
        error_msg = str(e)
        return {
            "error": f"Error retrieving species information from Wikispecies: {error_msg}",
            "title": species_name,
            "description": "No information available due to an error. Please try a different species name.",
            "classification": {"kingdom": "Unknown", "phylum": "Unknown", "class": "Unknown", "order": "Unknown", "family": "Unknown", "genus": "Unknown", "species": "Unknown"},
            "habitat": "Unknown",
            "fun_facts": []
        }
def get_wikipedia_data(species_name, timeout=10):
    """
    Get species information from the Wikipedia API, focusing on description,
    habitat, and fun facts.

    The best search hit for `species_name` is resolved to a page title, then
    the plain-text extract of the whole page is fetched and mined for a
    description, a habitat paragraph, up to 4 fun facts and a classification.

    Args:
        species_name: Common or scientific name to search for.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A dict with "title", "description", "habitat", "fun_facts" and
        "classification". When nothing is found or a request fails, the dict
        carries an "error" key describing the problem.
    """
    # Wikipedia API endpoint
    url = "https://en.wikipedia.org/w/api.php"
    # First, try to search for the page to get the correct title
    search_params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": species_name,
        "srlimit": 1,  # Get just the best match
    }

    def add_unique(target, facts):
        # Append facts that are not already present, keeping their order.
        for fact in facts or []:
            if fact not in target:
                target.append(fact)

    try:
        # Search for the page first to get the exact title
        search_response = requests.get(url, params=search_params, timeout=timeout)
        search_response.raise_for_status()
        search_data = search_response.json()
        # Check if we found any search results
        search_results = search_data.get("query", {}).get("search", [])
        if not search_results:
            return {"error": "No matching Wikipedia page found for this species."}
        # Get the page title from the search result
        page_title = search_results[0].get("title")
        # Now get the full page content. "exintro" is deliberately left out:
        # MediaWiki treats a boolean parameter as true whenever it is present,
        # whatever its value, so sending exintro=False would still restrict
        # the extract to the intro. "sections" is not a prop of action=query.
        content_params = {
            "action": "query",
            "format": "json",
            "titles": page_title,
            "prop": "extracts|categories",
            "explaintext": True,  # Get plain text, not HTML
            "cllimit": 50,  # Get more categories
        }
        content_response = requests.get(url, params=content_params, timeout=timeout)
        content_response.raise_for_status()
        content_data = content_response.json()
        # Extract page data
        pages = content_data.get("query", {}).get("pages", {})
        if not pages:
            return {"error": "Failed to retrieve Wikipedia page content."}
        # Get the first page (there should only be one)
        page_id = next(iter(pages))
        page = pages[page_id]
        # Missing pages are reported under a negative page id
        if int(page_id) < 0:
            return {"error": "Wikipedia page not found."}
        # Get basic information
        species_info = {
            "title": page.get("title", species_name),
            "description": "",
            "habitat": "Unknown",
            "fun_facts": [],
            "classification": {
                "kingdom": "Unknown",
                "phylum": "Unknown",
                "class": "Unknown",
                "order": "Unknown",
                "family": "Unknown",
                "genus": "Unknown",
                "species": "Unknown"
            }
        }
        # Extract the content
        full_text = page.get("extract", "")
        if full_text:
            # Join wrapped lines while keeping blank-line paragraph breaks
            full_text = full_text.replace("\n\n", "||").replace("\n", " ").replace("||", "\n\n")
            # The first paragraph is usually a good description
            paragraphs = full_text.split("\n\n")
            if paragraphs:
                species_info["description"] = paragraphs[0].strip()
            # Look for habitat information in the full text
            habitat_section = extract_wikipedia_section(full_text, ["Habitat", "Distribution", "Range", "Ecology", "Environment"])
            if habitat_section:
                species_info["habitat"] = habitat_section
            else:
                # If no specific habitat section, use our habitat extraction on the full text
                habitat = extract_habitat(full_text)
                if habitat != "Unknown":
                    species_info["habitat"] = habitat
            fun_facts = species_info["fun_facts"]
            # Extract fun facts from various interesting sections
            behavior_section = extract_wikipedia_section(full_text, ["Behavior", "Behaviour", "Life cycle", "Diet", "Feeding", "Reproduction", "Biology"])
            if behavior_section:
                fun_facts.extend(extract_fun_facts(behavior_section) or [])
            # If we don't have enough facts, try conservation status or other sections
            if len(fun_facts) < 2:
                conservation_section = extract_wikipedia_section(full_text, ["Conservation", "Status", "Threats", "Population"])
                if conservation_section:
                    add_unique(fun_facts, extract_fun_facts(conservation_section))
            # If we still don't have enough facts, fall back to the full text
            if len(fun_facts) < 2:
                add_unique(fun_facts, extract_fun_facts(full_text))
            # Limit to 4 facts
            species_info["fun_facts"] = fun_facts[:4]
            # Extract classification from Wikipedia content
            wiki_classification = extract_wikipedia_classification(full_text, page.get("title", ""), search_data)
            if wiki_classification:
                species_info["classification"] = wiki_classification
        return species_info
    except Exception as e:
        # Best-effort boundary: any network, decoding or parsing problem is
        # reported to the caller through the "error" key instead of raising.
        error_msg = str(e)
        return {
            "error": f"Error retrieving information from Wikipedia: {error_msg}",
            "title": species_name,
            "description": "No information available from Wikipedia due to an error.",
            "habitat": "Unknown",
            "fun_facts": []
        }
def extract_wikipedia_section(text, section_keywords):
    """
    Try to extract a specific section from Wikipedia plain-text content.

    Headings are expected in the "== Title ==" form (any level, i.e. two or
    more "=" on each side). A section's body runs from the end of its heading
    to the start of the next heading of any level.

    Args:
        text: Plain-text page content.
        section_keywords: Keywords matched case-insensitively as substrings
            of the heading titles, in priority order.

    Returns:
        The bodies of the first two matching non-empty sections joined by a
        space. If no heading matches, the first paragraph (blank-line
        separated) containing one of the keywords. None when nothing matches
        or `text` is empty.
    """
    if not text:
        return None
    # The lazy title group plus the surrounding \s* keep whitespace out of the
    # captured title, and ={2,} consumes sub-heading markers ("===") entirely
    # so no stray "=" ends up inside a section body.
    heading_pattern = re.compile(r"={2,}\s*([^=\n]+?)\s*={2,}")
    headings = list(heading_pattern.finditer(text))
    matching_sections = []
    used = set()  # heading indexes already collected, to avoid duplicates
    for keyword in section_keywords:
        needle = keyword.lower()
        for index, heading in enumerate(headings):
            if index in used or needle not in heading.group(1).lower():
                continue
            used.add(index)
            if index + 1 < len(headings):
                body_end = headings[index + 1].start()
            else:
                # This is the last section
                body_end = len(text)
            body = text[heading.end():body_end].strip()
            if body:
                matching_sections.append(body)
    # If we found any matching sections, join them (limit to 2 for conciseness)
    if matching_sections:
        return " ".join(matching_sections[:2])
    # Alternative approach: look for paragraphs containing the keywords
    paragraphs = text.split("\n\n")
    for keyword in section_keywords:
        needle = keyword.lower()
        for paragraph in paragraphs:
            if needle in paragraph.lower():
                return paragraph
    return None
def get_species_images(species_name, timeout=10):
    """
    Get species images from the Wikimedia Commons API, trying progressively
    broader searches until some images are found.

    Args:
        species_name: Common or scientific name to search images for.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of dicts with the keys "title", "url", "thumb_url",
        "description", "author" and "license". The list is empty when no
        search produced a usable image or the requests failed.
    """
    # Wikimedia Commons API endpoint
    url = "https://commons.wikimedia.org/w/api.php"
    # File types that live in the File namespace but cannot be shown as photos
    skipped_suffixes = ('.pdf', '.svg', '.mp3', '.mp4', '.ogg', '.wav', '.webm')

    def search_images(search_term, limit=10):
        # Run one Commons file search and return the usable image entries.
        params = {
            "action": "query",
            "format": "json",
            "generator": "search",
            "gsrnamespace": 6,  # File namespace
            "gsrsearch": search_term,
            "gsrlimit": limit,  # Limit results
            "prop": "imageinfo",
            "iiprop": "url|extmetadata",
            "iiurlwidth": 800,  # Thumbnail width
        }
        try:
            response = requests.get(url, params=params, timeout=timeout)
            response.raise_for_status()
            pages = response.json().get("query", {}).get("pages", {})
        except (requests.RequestException, ValueError):
            # A failed search simply contributes no images; returning an
            # error entry here would be mistaken for an image by callers.
            return []
        found = []
        # Pages come back keyed by page id; "index" carries the search rank.
        for page in sorted(pages.values(), key=lambda p: p.get("index", 0)):
            title = page.get("title", "")
            # Skip non-image files (like pdfs, audio, etc.)
            if title.lower().endswith(skipped_suffixes):
                continue
            # "imageinfo" may be missing or empty for some pages
            image_info = (page.get("imageinfo") or [{}])[0]
            if not image_info.get("url"):
                continue
            metadata = image_info.get("extmetadata") or {}
            found.append({
                "title": title or "Unknown",
                "url": image_info.get("url", ""),
                "thumb_url": image_info.get("thumburl", ""),
                "description": metadata.get("ImageDescription", {}).get("value", "No description"),
                "author": metadata.get("Artist", {}).get("value", "Unknown"),
                "license": metadata.get("License", {}).get("value", "Unknown"),
            })
        return found

    # STRATEGY 1: Try exact file name search first
    images = search_images(f"file:{species_name}")
    # STRATEGY 2: Try removing the file: prefix for broader results
    if not images:
        images = search_images(species_name)
    # STRATEGY 3: with few results and a binomial name, add images found for
    # the genus (first word) alone.
    if len(images) < 3:
        name_parts = species_name.split()
        if len(name_parts) == 2:
            seen_urls = {img.get("url") for img in images}
            for img in search_images(name_parts[0]):
                if img.get("url") in seen_urls:
                    continue
                images.append(img)
                seen_urls.add(img.get("url"))
                # Stop if we now have enough images
                if len(images) >= 5:
                    break
    # If we found at least some images, return them
    if images:
        return images
    # STRATEGY 4: Last resort - try a very general search
    # This could be improved by using the taxonomy info
    return search_images("species taxonomy nature")
def _first_taxon_word(text, marker, whole_word=False):
    """Return the capitalized first word that follows `marker` in `text`.

    `marker` must not be glued to a preceding word character (so "species "
    does not match inside "subspecies "); with `whole_word` it must not be
    followed by one either (so "class" does not match "classification").
    Returns None when nothing usable follows the marker.
    """
    tail = r"(?!\w)" if whole_word else ""
    match = re.search(r"(?<!\w)" + re.escape(marker) + tail + r"[\s:]*(\S+)", text)
    if match is None:
        return None
    word = match.group(1).strip(".,;:()[]\"'")
    # Connecting words ("species of ...") are never taxon names.
    if not word or word in ("of", "by", "in", "and", "the"):
        return None
    return word.capitalize()


def extract_classification(categories):
    """
    Extract classification information from a list of category names.

    Three heuristics are applied in turn: explicit "rank: Name" / "rank Name"
    mentions (in several languages), single-word categories carrying a
    standard taxonomic suffix, and finally any remaining whole-word mention
    of a rank name for ranks that are still unknown.

    Args:
        categories: Iterable of category titles, with or without the
            "Category:" prefix. Non-string entries are ignored.

    Returns:
        A dict with the keys "kingdom", "phylum", "class", "order", "family",
        "genus" and "species" (value "Unknown" when not found), plus
        "subfamily" / "suborder" when a matching suffix was seen.
    """
    # Initialize with default "Unknown" values
    classification = {
        "kingdom": "Unknown",
        "phylum": "Unknown",
        "class": "Unknown",
        "order": "Unknown",
        "family": "Unknown",
        "genus": "Unknown",
        "species": "Unknown",
    }
    # Skip empty categories
    if not categories:
        return classification
    # Common taxonomy patterns in category names with more variations
    taxonomy_patterns = {
        "kingdom": ["kingdom:", "regnum:", "reino:", "regno:", "kingdom ", "regnum ", "reino ", "reino "],
        "phylum": ["phylum:", "division:", "división:", "divisio:", "phylum ", "division ", "división ", "divisio "],
        "class": ["class:", "clase:", "classis:", "class ", "clase ", "classis "],
        "order": ["order:", "orden:", "ordo:", "order ", "orden ", "ordo "],
        "family": ["family:", "familia:", "family ", "familia "],
        "genus": ["genus:", "género:", "genero:", "genus ", "género ", "genero "],
        "species": ["species:", "especie:", "specie:", "species ", "especie ", "specie "]
    }
    # Suffix -> rank, checked in this order; the first matching suffix wins.
    suffix_ranks = (
        ("idae", "family"),       # Family suffix for animals
        ("inae", "subfamily"),    # Subfamily suffix
        ("ales", "order"),        # Order suffix for plants
        ("aceae", "family"),      # Family suffix for plants
        ("ineae", "suborder"),    # Suborder suffix for plants
        ("oideae", "subfamily"),  # Subfamily suffix for plants
    )
    # Rank word -> result key. "division" is the botanical name of the phylum
    # rank and has no key of its own in the result.
    rank_words = {
        "kingdom": "kingdom",
        "phylum": "phylum",
        "division": "phylum",
        "class": "class",
        "order": "order",
        "family": "family",
        "genus": "genus",
        "species": "species",
    }
    # Drop unusable entries and the "Category:" prefix once for all strategies
    names = []
    for category in categories:
        if not isinstance(category, str):
            continue
        if category.startswith("Category:"):
            category = category[len("Category:"):]
        names.append(category)
    # STRATEGY 1: Direct matching from category names
    for name in names:
        lowered = name.lower()
        for rank, patterns in taxonomy_patterns.items():
            for pattern in patterns:
                value = _first_taxon_word(lowered, pattern)
                if value:
                    classification[rank] = value
                    break
    # STRATEGY 2: single-word categories that follow taxonomic naming conventions
    for name in names:
        words = name.split()
        if len(words) != 1:
            continue
        taxon = words[0]
        for suffix, rank in suffix_ranks:
            if taxon.endswith(suffix):
                classification[rank] = taxon
                break
    # STRATEGY 3: any whole-word rank mention, only for ranks still unknown
    for name in names:
        lowered = name.lower()
        for rank_word, rank in rank_words.items():
            if classification[rank] != "Unknown":
                continue
            value = _first_taxon_word(lowered, rank_word, whole_word=True)
            if value:
                classification[rank] = value
    # Final cleanup: ensure the first letter of every known value is upper-case
    for rank in classification:
        value = classification[rank]
        if value != "Unknown":
            classification[rank] = value[0].upper() + value[1:]
    return classification
def extract_habitat(description):
    """
    Extract habitat information from a free-text species description.

    The description is split into sentences and scanned against four keyword
    tiers (direct habitat phrases, climate/geography terms, region names,
    location-related verbs). The first tier that matches at least one sentence
    wins; later tiers are only consulted when the earlier ones found nothing.
    If no tier matches, the second (or else the first) sentence is used as a
    fallback when it is long enough to be informative.

    Args:
        description: Plain-text description of the species (may be empty/None).
    Returns:
        "Unknown" for a missing description, otherwise up to two habitat
        sentences joined by a single space, or a generic "not available"
        message when nothing suitable was found.
    """
    if not description or description == "No description available":
        return "Unknown"

    # Split on sentence terminators while keeping the terminator on each sentence.
    sentences = description.replace(". ", ".|").replace("! ", "!|").replace("? ", "?|").split("|")
    sentences = [s.strip() for s in sentences if s.strip()]
    lowered_sentences = [s.lower() for s in sentences]

    # STRATEGY 1: Direct habitat statements
    habitat_keywords = [
        "habitat", "lives in", "found in", "native to", "occurs in", "distribution",
        "range includes", "ecosystem", "biome", "environment", "inhabits", "dwelling in",
        "endemic to", "natural range", "geographical range", "distributed across",
        "prefers", "thrives in", "flourishes in", "resides in", "habitat type",
        "commonly found", "typically found", "often found", "usually found", "primarily found"
    ]
    # STRATEGY 2: Geography and climate context
    climate_keywords = [
        "tropical", "temperate", "polar", "arctic", "antarctic", "desert",
        "rainforest", "forest", "jungle", "grassland", "savanna", "wetland",
        "marsh", "swamp", "mountain", "alpine", "coastal", "marine", "freshwater",
        "ocean", "sea", "river", "lake", "stream", "pond", "terrestrial", "aquatic",
        "woodland", "meadow", "tundra", "taiga", "steppe", "continent", "island",
        "shore", "beach", "reef", "cave", "burrow", "nest", "canopy", "undergrowth"
    ]
    # STRATEGY 3: Regional indicators (continents, regions, countries)
    region_keywords = [
        "africa", "asia", "europe", "north america", "south america", "australia",
        "antarctica", "oceania", "mediterranean", "pacific", "atlantic", "indian ocean",
        "arctic ocean", "southern ocean", "northern", "southern", "eastern", "western",
        "central", "worldwide", "global", "cosmopolitan", "international"
    ]
    # STRATEGY 4: Verbs that might indicate location or movement patterns
    action_keywords = [
        "migrate", "roam", "travel", "swim", "fly", "climb", "burrow", "dig", "nest",
        "breed", "forage", "hunt", "territory", "range"
    ]

    # Try each tier in priority order; stop at the first tier that yields a hit.
    # NOTE: matching is plain substring matching, so e.g. "sea" also hits "research".
    habitat_sentences = []
    for keywords in (habitat_keywords, climate_keywords, region_keywords, action_keywords):
        habitat_sentences = [
            sentence
            for sentence, lowered in zip(sentences, lowered_sentences)
            if any(keyword in lowered for keyword in keywords)
        ]
        if habitat_sentences:
            break

    # Fallback: early sentences often say where the species lives.
    if not habitat_sentences and len(sentences) >= 2:
        # Prefer the second sentence (the first is often a bare definition),
        # but only when the text has more than two sentences and it is long enough.
        if len(sentences) > 2 and len(sentences[1].split()) > 5:
            habitat_sentences.append(sentences[1])
        elif len(sentences[0].split()) > 5:
            habitat_sentences.append(sentences[0])

    if habitat_sentences:
        # Limit to two sentences. Each sentence normally still carries its own
        # terminator, so add one only where missing and join with a space;
        # joining with ". " would produce a doubled period ("A.. B.").
        chosen = [
            sentence if sentence.endswith(('.', '!', '?')) else sentence + '.'
            for sentence in habitat_sentences[:2]
        ]
        return " ".join(chosen)

    # Last resort: construct a generic message if we couldn't find specific habitat info
    return "Specific habitat information not available from Wikispecies. Try searching online for more details about this species' natural environment."
def extract_fun_facts(description):
    """
    Pick up to four "fun fact" sentences out of a species description.

    Each sufficiently long sentence is put into exactly one bucket, chosen by
    the first keyword list that matches it (interesting, biological,
    behavioral, reproductive, comparative, then measurements when the sentence
    also contains a digit); unmatched sentences longer than five words go to a
    general bucket. Facts are then drawn from the buckets in priority order,
    skipping candidates that are too similar to facts already chosen.

    Args:
        description: Plain-text description of the species (may be empty/None).
    Returns:
        A list of one to four sentences, each ending with '.', '!' or '?'.
        A single placeholder sentence is returned when no description is
        available or no fact could be selected.
    """
    if not description or description == "No description available":
        return ["No specific information available for this species in Wikispecies."]

    # Split on sentence terminators while keeping the terminator on each sentence.
    sentences = description.replace(". ", ".|").replace("! ", "!|").replace("? ", "?|").split("|")
    sentences = [s.strip() for s in sentences if s.strip()]

    # If the description is too short, include it as a single fact
    if len(sentences) == 1 and len(description) < 100:
        only_sentence = sentences[0]
        if not only_sentence.endswith(('.', '!', '?')):
            only_sentence += '.'
        return [only_sentence]

    # STRATEGY 1: Identify sentences with interesting keywords
    interesting_keywords = [
        "interesting", "unique", "unusual", "remarkable", "notable", "surprising",
        "fascinating", "amazing", "extraordinary", "distinctive", "special", "rare",
        "strange", "curious", "unlike", "peculiar", "odd", "bizarre", "striking",
        "colorful", "beautiful", "impressive", "popular", "famous", "well-known",
        "largest", "smallest", "fastest", "slowest", "oldest", "youngest", "only",
        "record", "discovery", "first", "last", "origin", "discovered", "introduced",
        "revered", "sacred", "symbol", "iconic", "emblem", "represented", "mythology",
        "legend", "folklore", "traditional", "cultural", "significance", "historical"
    ]
    # STRATEGY 2: Physical characteristics and biology often make good facts
    biology_keywords = [
        "lifespan", "longevity", "size", "weight", "height", "length", "wingspan",
        "color", "pattern", "marking", "appearance", "physical", "morphology", "anatomy",
        "feature", "characteristic", "distinctive", "body", "shape", "structure",
        "adaptation", "evolved", "evolution", "mutation", "gene", "genetic", "chromosome",
        "hybrid", "species", "subspecies", "variety", "breed", "strain", "extinct",
        "endangered", "threatened", "vulnerable", "conservation", "protected"
    ]
    # STRATEGY 3: Behavior and lifestyle information
    behavior_keywords = [
        "diet", "eat", "feeding", "food", "prey", "predator", "hunt", "scavenge",
        "forage", "graze", "browse", "omnivore", "carnivore", "herbivore", "insectivore",
        "behavior", "behaviour", "habit", "activity", "social", "solitary", "group",
        "herd", "flock", "pack", "colony", "community", "family", "nocturnal", "diurnal",
        "crepuscular", "migrate", "migration", "hibernate", "hibernation", "estivate",
        "dormant", "sleep", "rest", "active", "territory", "defend", "aggressive",
        "docile", "tame", "wild", "domestic", "domesticated", "trained", "human"
    ]
    # STRATEGY 4: Reproduction is always interesting
    reproduction_keywords = [
        "reproduce", "reproduction", "breeding", "mate", "mating", "courtship", "display",
        "attract", "offspring", "young", "juvenile", "infant", "baby", "child", "adult",
        "egg", "spawn", "birth", "pregnant", "gestation", "incubation", "hatch", "nestling",
        "fledgling", "litter", "clutch", "brood", "parent", "care", "raise", "nurse", "wean"
    ]
    # Comparative patterns that often indicate interesting facts
    comparative_patterns = [
        "more than", "less than", "bigger than", "smaller than", "larger than",
        "faster than", "slower than", "better than", "worse than", "greater than",
        "unlike", "similar to", "compared to", "in contrast to", "differs from",
        "up to", "as many as", "can reach", "can grow", "can live", "known to",
        "capable of", "able to", "estimated", "approximately", "about", "around"
    ]
    # Measurement patterns that often indicate interesting statistics
    measurement_patterns = [
        "cm", "meter", "metre", "kilometer", "kilometre", "feet", "foot", "inch",
        "kg", "gram", "pound", "ton", "tonne", "year", "month", "week", "day", "hour",
        "percent", "°C", "°F", "degree", "celsius", "fahrenheit", "temperature",
        "speed", "mph", "kph", "knot", "altitude", "depth", "width", "height"
    ]

    # Keyword-only strategies, in the order they are tried for each sentence.
    keyword_strategies = (
        ("interesting", interesting_keywords),
        ("biological", biology_keywords),
        ("behavioral", behavior_keywords),
        ("reproductive", reproduction_keywords),
        ("comparative", comparative_patterns),
    )
    fact_candidates = {
        "interesting": [],
        "biological": [],
        "behavioral": [],
        "reproductive": [],
        "comparative": [],
        "measurements": [],
        "general": []
    }

    for sentence in sentences:
        word_count = len(sentence.split())
        # Skip very short sentences
        if word_count < 4:
            continue
        lowered = sentence.lower()

        # First keyword list with a (substring) hit decides the bucket.
        bucket = next(
            (name for name, keywords in keyword_strategies
             if any(keyword in lowered for keyword in keywords)),
            None,
        )
        # Measurements additionally require a digit somewhere in the sentence.
        if (
            bucket is None
            and any(char.isdigit() for char in sentence)
            and any(pattern.lower() in lowered for pattern in measurement_patterns)
        ):
            bucket = "measurements"
        # Anything left over is a general fact if it is long enough.
        if bucket is None and word_count > 5:
            bucket = "general"
        if bucket is not None:
            fact_candidates[bucket].append(sentence)

    selected_facts = []
    # Priority order for fact selection
    categories = ["interesting", "measurements", "biological", "reproductive", "behavioral", "comparative", "general"]

    # First, take one fact from each of the three highest-priority categories.
    for category in categories[:3]:
        if fact_candidates[category]:
            selected_facts.append(fact_candidates[category].pop(0))

    # Then make one pass over all categories to fill up to four facts in total.
    remaining_slots = 4 - len(selected_facts)
    for category in categories:
        if remaining_slots <= 0:
            break
        if not fact_candidates[category]:
            continue
        # The candidate is consumed whether or not it is accepted.
        next_fact = fact_candidates[category].pop(0)
        # Only add if not too similar to already selected facts
        if not any(similarity_score(next_fact, fact) > 0.7 for fact in selected_facts):
            selected_facts.append(next_fact)
            remaining_slots -= 1

    # If we still don't have enough facts, fall back to the first and middle sentences.
    if len(selected_facts) < 2 and sentences:
        first_sentence = sentences[0]
        if first_sentence not in selected_facts and len(first_sentence.split()) > 5:
            selected_facts.append(first_sentence)
        middle_sentence = sentences[len(sentences) // 2]
        if middle_sentence not in selected_facts and len(middle_sentence.split()) > 5:
            selected_facts.append(middle_sentence)

    # Last resort: if still no facts, create a generic fact
    if not selected_facts:
        selected_facts = ["This species is documented in Wikispecies, the free species directory."]

    # Ensure all facts end with proper punctuation
    selected_facts = [
        fact if fact.endswith(('.', '!', '?')) else fact + '.'
        for fact in selected_facts
    ]
    # Remove duplicates while preserving order, then cap at four facts.
    return list(dict.fromkeys(selected_facts))[:4]
def similarity_score(str1, str2):
    """
    Measure how alike two strings are by comparing their word sets.

    Both strings are lower-cased and split on whitespace; the score is the
    Jaccard index of the two resulting word sets (shared words divided by all
    distinct words). Used to avoid selecting facts that are near-duplicates.

    Returns 0 when either string is empty or neither contains any word,
    otherwise a float in the range 0 (no shared words) to 1 (same word set).
    """
    if not (str1 and str2):
        return 0
    tokens_a, tokens_b = (set(text.lower().split()) for text in (str1, str2))
    all_tokens = tokens_a | tokens_b
    # Whitespace-only inputs produce no tokens at all; avoid dividing by zero.
    return len(tokens_a & tokens_b) / len(all_tokens) if all_tokens else 0
def get_mock_species_from_filename(filename):
    """
    A mock function that simulates image recognition by looking at the filename.
    In a real application, this would be replaced with an actual image recognition API.

    The lower-cased filename is searched for known animal and plant keywords.
    When several keywords occur, the longest one wins so that a specific entry
    such as "sunflower" is not shadowed by a shorter keyword it contains
    ("flower"); ties keep the table order (animals first, then plants).

    Args:
        filename: Name of the uploaded file (may be empty or None).
    Returns:
        The species/taxon name mapped to the best matching keyword, or
        "Homo sapiens" when the filename is missing or nothing matches.
    """
    default_species = "Homo sapiens"
    if not filename:
        return default_species
    filename_lower = filename.lower()
    # List of common animals and their possible filenames
    animal_keywords = {
        "cat": "Felis catus",
        "dog": "Canis familiaris",
        "bird": "Aves",
        "eagle": "Aquila chrysaetos",
        "lion": "Panthera leo",
        "tiger": "Panthera tigris",
        "bear": "Ursus arctos",
        "wolf": "Canis lupus",
        "fox": "Vulpes vulpes",
        "deer": "Cervidae",
        "elephant": "Loxodonta africana",
        "giraffe": "Giraffa camelopardalis",
        "zebra": "Equus quagga",
        "monkey": "Primates",
        "gorilla": "Gorilla gorilla",
        "fish": "Actinopterygii",
        "shark": "Selachimorpha",
        "dolphin": "Tursiops truncatus",
        "whale": "Cetacea",
        "snake": "Serpentes",
        "lizard": "Lacertilia",
        "turtle": "Testudines",
        "frog": "Anura",
        "butterfly": "Lepidoptera",
        "bee": "Apis mellifera",
    }
    # List of common plants and their possible filenames
    plant_keywords = {
        "tree": "Arbor",
        "flower": "Anthophyta",
        "rose": "Rosa",
        "tulip": "Tulipa",
        "daisy": "Bellis perennis",
        "sunflower": "Helianthus annuus",
        "oak": "Quercus",
        "pine": "Pinus",
        "maple": "Acer",
        "fern": "Polypodiopsida",
        "moss": "Bryophyta",
        "grass": "Poaceae",
        "cactus": "Cactaceae",
        "palm": "Arecaceae",
        "orchid": "Orchidaceae",
    }
    # Collect every keyword that occurs in the filename, animals before plants.
    matches = [
        (keyword, species)
        for table in (animal_keywords, plant_keywords)
        for keyword, species in table.items()
        if keyword in filename_lower
    ]
    if not matches:
        # If no match is found, return a default species
        return default_species
    # max() returns the first of equally long keywords, preserving table order.
    return max(matches, key=lambda match: len(match[0]))[1]
def extract_wikipedia_classification(full_text, title, search_data=None):
    """
    Extract classification/taxonomy information from Wikipedia content.
    Uses various strategies including infobox parsing, section analysis, and text pattern matching.
    Args:
        full_text: The full text content of the Wikipedia page
        title: The title of the Wikipedia page (may be empty or None)
        search_data: Optional search data that might contain additional info
            (accepted for interface compatibility; not used by this function)
    Returns:
        A dictionary with the keys kingdom, phylum, class, order, family,
        genus and species; ranks that could not be determined are "Unknown".
        Extraction is best-effort: on an unexpected error the values found so
        far are returned.
    """
    # Initialize with default "Unknown" values
    classification = {
        "kingdom": "Unknown",
        "phylum": "Unknown",
        "class": "Unknown",
        "order": "Unknown",
        "family": "Unknown",
        "genus": "Unknown",
        "species": "Unknown"
    }
    if not full_text:
        return classification
    ranks = tuple(classification)
    try:
        # STRATEGY 1: Look for taxonomic information in specific sections
        taxonomy_section = extract_wikipedia_section(
            full_text, ["Taxonomy", "Classification", "Taxonomic", "Scientific classification"]
        )
        if taxonomy_section:
            classification = extract_taxonomy_from_text(taxonomy_section, classification)

        # STRATEGY 2: Infobox-like "Rank: Name" labels anywhere in the text.
        # These explicit labels override whatever strategy 1 found. The leading
        # \b keeps "Superfamily:" / "Subclass:" etc. from being read as
        # "Family:" / "Class:".
        for rank in ranks:
            labelled = re.search(rf"\b{rank}:\s*([A-Za-z]+)", full_text, re.IGNORECASE)
            if labelled:
                classification[rank] = labelled.group(1)

        # STRATEGY 3: Parse the first paragraph for taxonomic information
        first_para = full_text.split('\n\n', 1)[0]
        classification = extract_taxonomy_from_text(first_para, classification)

        # STRATEGY 4: Try to extract genus and species from the title
        title_parts = title.split() if title else []
        if len(title_parts) >= 2 and classification["genus"] == "Unknown":
            genus_word, epithet = title_parts[0], title_parts[1]
            # If title looks like a binomial name (e.g., "Panthera leo")
            if genus_word[0].isupper() and genus_word[1:].islower() and epithet.islower():
                classification["genus"] = genus_word
                if classification["species"] == "Unknown":
                    classification["species"] = epithet

        # STRATEGY 5: Look for taxonomic statements throughout the text, such as
        # "belongs to the family Felidae". "the" is optional together with its
        # trailing whitespace, so "belongs to family Felidae" matches as well.
        statement_templates = (
            (
                r"(?:belongs|belonging)\s+to\s+(?:the\s+)?{rank}\s+([A-Za-z]+)",
                ("kingdom", "phylum", "class", "order", "family"),
            ),
            (
                r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the\s+)?{rank}\s+([A-Za-z]+)",
                ("kingdom", "phylum", "class", "order", "family", "genus"),
            ),
        )
        for template, covered_ranks in statement_templates:
            for rank in covered_ranks:
                # Statements only fill ranks that are still unknown.
                if classification[rank] != "Unknown":
                    continue
                statement = re.search(template.format(rank=rank), full_text, re.IGNORECASE)
                if statement:
                    classification[rank] = statement.group(1)
    except Exception as e:
        # Deliberately broad: extraction is best-effort, so report the problem
        # and fall through with whatever data we managed to extract.
        print(f"Error extracting classification from Wikipedia: {str(e)}")

    # Final cleanup: capitalize the first letter of every extracted value.
    # Done after the try block so partial results are normalized as well.
    for rank, value in classification.items():
        if value and value != "Unknown":
            classification[rank] = value[0].upper() + value[1:]
    return classification
def extract_taxonomy_from_text(text, classification):
    """
    Extract taxonomic information from text using regular-expression heuristics.

    Two passes are made, and neither overwrites a rank that already has a value:
      1. Explicit mentions of a rank, either labelled ("Order: Carnivora") or
         in prose ("the order Carnivora"). The prose form requires a
         capitalized name so that ordinary English such as "in order to" is
         not mistaken for a taxon.
      2. Characteristic name endings (e.g. "-idae" for families). Only
         capitalized words are considered, and a name already assigned to
         another rank is not reused.

    Args:
        text: The text to analyze
        classification: The current classification dictionary to update
            (modified in place; ranks missing from it are not added)
    Returns:
        Updated classification dictionary
    """
    if not text:
        return classification
    try:
        # Pass 1: explicit rank mentions.
        for rank in ("kingdom", "phylum", "class", "order", "family", "genus"):
            if classification.get(rank) != "Unknown":
                continue  # Skip if we already have a value (or the rank is not tracked)
            # Labelled form. The \b and the mandatory colon stop the rank word
            # from matching inside other words ("classification", "border").
            found = re.search(rf"\b{rank}\s*:\s*([A-Za-z]+)", text, re.IGNORECASE)
            if not found:
                # Prose form: rank word in any case, followed by a capitalized name.
                found = re.search(rf"(?i:\b{rank})\s+([A-Z][A-Za-z]+)", text)
            if found:
                classification[rank] = found.group(1).capitalize()

        # Pass 2: characteristic taxonomic suffixes.
        # NOTE: this is a heuristic. Capitalized non-taxon words with the same
        # ending (e.g. place names ending in "ia") can still be picked up.
        suffix_patterns = {
            "family": [r"\b([A-Z][a-z]*idae)\b", r"\b([A-Z][a-z]*aceae)\b"],  # Animal and plant families
            "order": [r"\b([A-Z][a-z]*ales)\b", r"\b([A-Z][a-z]*ida)\b"],  # Plant orders and animal orders
            "class": [r"\b([A-Z][a-z]*ia)\b", r"\b([A-Z][a-z]*phyceae)\b"],  # Classes
            "phylum": [r"\b([A-Z][a-z]*phyta)\b", r"\b([A-Z][a-z]*zoa)\b"]  # Plant and animal phyla
        }
        for rank, patterns in suffix_patterns.items():
            if classification.get(rank) != "Unknown":
                continue  # Skip if we already have a value (or the rank is not tracked)
            # Names already used for another rank cannot also be this rank.
            taken = {
                value
                for other_rank, value in classification.items()
                if other_rank != rank and value != "Unknown"
            }
            for pattern in patterns:
                candidate = next(
                    (name for name in re.findall(pattern, text) if name not in taken),
                    None,
                )
                if candidate:
                    classification[rank] = candidate
                    break
    except Exception as e:
        # Deliberately broad: extraction is best-effort, so report the problem
        # and return the classification as it stands.
        print(f"Error in extract_taxonomy_from_text: {str(e)}")
    return classification
# Start the app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()