import streamlit as st
import requests
import os
import re
from PIL import Image
import tempfile

# File extensions (lower-case, without the dot) accepted by the image uploader
ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif'}

def allowed_file(filename):
    """Return True when *filename* has an extension listed in ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot and is compared
    case-insensitively; a name without any dot is rejected.
    """
    _, dot, suffix = filename.rpartition('.')
    if not dot:
        return False
    return suffix.lower() in ALLOWED_EXTENSIONS

def _render_name_search_tab():
    """Draw the "Search by Name" tab: a text box and a Search button."""
    st.header("Search by Species Name")
    species_name = st.text_input("Enter a species name (common or scientific):")

    if not st.button("Search"):
        return
    if not species_name:
        st.error("Please enter a species name")
        return

    with st.spinner("Searching for species information..."):
        # Text data first, then pictures, then render both together
        species_data = get_species_info(species_name)
        images = get_species_images(species_name)
        display_results(species_data, images)

def _render_image_search_tab():
    """Draw the "Search by Image" tab: an uploader, a preview and an Identify button."""
    st.header("Search by Image Upload")
    uploaded_file = st.file_uploader("Upload an image of a species", type=ALLOWED_EXTENSIONS)

    if uploaded_file is None:
        return
    if not allowed_file(uploaded_file.name):
        st.error("File type not allowed. Please upload an image file (PNG, JPG, JPEG, GIF).")
        return

    # Preview what the user uploaded
    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded Image", use_column_width=True)

    if not st.button("Identify Species"):
        return

    with st.spinner("Identifying species from image..."):
        # Demo only: the species is derived from the file name by a mock
        # helper; a real app would call an image recognition API here.
        species_name = get_mock_species_from_filename(uploaded_file.name)
        species_data = get_species_info(species_name)
        images = get_species_images(species_name)
        display_results(species_data, images)

def main():
    """Configure the Streamlit page and render the two search tabs."""
    st.set_page_config(page_title="Species Information Finder", layout="wide")

    st.title("Species Information Finder")
    st.write("Discover information about any species by name or by uploading an image.")

    # One tab per lookup mode
    name_tab, image_tab = st.tabs(["Search by Name", "Search by Image"])

    with name_tab:
        _render_name_search_tab()

    with image_tab:
        _render_image_search_tab()

def display_results(species_data, images):
    """Display a species lookup result in the Streamlit page.

    Args:
        species_data: Result dict. If it contains an "error" key, only that
            message is shown. Otherwise "title" is required and
            "classification", "habitat", "description" and "fun_facts" are
            rendered when present.
        images: List of image dicts (may be empty or None). Each dict is
            expected to carry "thumb_url" and/or "url", and optionally
            "description", "author" and "license". Entries without any
            usable URL are ignored.
    """
    if "error" in species_data:
        st.error(species_data["error"])
        return

    st.success(f"Found information for: {species_data['title']}")

    # Narrow column for taxonomy/habitat, wide column for prose
    col1, col2 = st.columns([1, 2])

    with col1:
        st.subheader("Classification")
        classification = species_data.get("classification", {})
        for rank, value in classification.items():
            if value != "Unknown":
                # Bold the rank label (opening and closing ** are both needed)
                st.write(f"**{rank.capitalize()}:** {value}")

        if species_data.get("habitat", "Unknown") != "Unknown":
            st.subheader("Habitat")
            st.write(species_data["habitat"])

    with col2:
        st.subheader("Description")
        st.write(species_data.get("description", "No description available."))

        if species_data.get("fun_facts"):
            st.subheader("Interesting Facts")
            for i, fact in enumerate(species_data["fun_facts"], 1):
                st.write(f"{i}. {fact}")

    # Prefer the thumbnail, fall back to the full-size URL, and drop entries
    # that have neither (an empty "thumb_url" string must not win over "url").
    displayable = [
        (source, img)
        for img in (images or [])
        for source in [img.get("thumb_url") or img.get("url")]
        if source
    ][:4]

    if not displayable:
        st.warning("No images found for this species.")
        return

    st.subheader("Related Images")

    # Up to 4 images side by side
    cols = st.columns(len(displayable))
    for column, (source, img) in zip(cols, displayable):
        with column:
            st.image(source, caption=img.get("description", ""), use_column_width=True)
            st.caption(f"Credit: {img.get('author', 'Unknown')} | License: {img.get('license', 'Unknown')}")

# Data-retrieval and parsing helpers (get_species_info, get_wikispecies_data,
# get_wikipedia_data, etc.). They are carried over unchanged from the original
# Flask app and are included below for completeness.

def get_species_info(species_name):
    """
    Get species information by combining Wikispecies and Wikipedia lookups.

    Wikispecies data (when its lookup succeeds) forms the base record;
    Wikipedia data then supplements it: description when the current one is
    missing or shorter than 50 characters, habitat, classification ranks and
    fun facts (de-duplicated with similarity_score, capped at 4).

    Args:
        species_name: Name typed by the user (common or scientific).

    Returns:
        A dict with the keys "title", "description", "categories", "links",
        "last_modified", "classification", "habitat", "fun_facts" and
        "data_sources". An "error" key is added when neither source
        returned data.
    """
    # Base record; every field has a placeholder so callers can rely on the keys
    species_info = {
        "title": species_name,  # Default to the search query
        "description": "No description available.",
        "categories": [],
        "links": [],
        "last_modified": "Unknown",
        "classification": {
            "kingdom": "Unknown",
            "phylum": "Unknown",
            "class": "Unknown",
            "order": "Unknown",
            "family": "Unknown",
            "genus": "Unknown",
            "species": "Unknown"
        },
        "habitat": "Unknown",
        "fun_facts": [],
        "data_sources": []  # Track where we got data from
    }

    # Wikispecies first: a successful result replaces the placeholders
    wikispecies_info = get_wikispecies_data(species_name)
    if not wikispecies_info.get("error"):
        species_info.update(wikispecies_info)
        species_info["data_sources"].append("Wikispecies")

    # Wikipedia second: complementary data
    wikipedia_info = get_wikipedia_data(species_name)
    if not wikipedia_info.get("error"):
        # Use the Wikipedia description only when it actually has text and
        # the current description is missing or very short.
        current_description = species_info["description"] or ""
        wikipedia_description = wikipedia_info.get("description")
        if wikipedia_description and (
            current_description == "No description available."
            or len(current_description) < 50
        ):
            species_info["description"] = wikipedia_description

        # Prefer Wikipedia habitat info, but never let its "Unknown"
        # placeholder erase a habitat that Wikispecies already provided.
        wikipedia_habitat = wikipedia_info.get("habitat")
        if wikipedia_habitat and wikipedia_habitat != "Unknown":
            species_info["habitat"] = wikipedia_habitat

        # Merge classification, preferring known Wikipedia values
        for rank, value in wikipedia_info.get("classification", {}).items():
            if value != "Unknown":
                species_info["classification"][rank] = value

        # Add Wikipedia fun facts, skipping near-duplicates
        if wikipedia_info.get("fun_facts"):
            existing_facts = species_info.get("fun_facts", [])
            for fact in wikipedia_info["fun_facts"]:
                if not any(similarity_score(fact, existing) > 0.7 for existing in existing_facts):
                    existing_facts.append(fact)
            species_info["fun_facts"] = existing_facts[:4]  # Limit to 4 facts

        species_info["data_sources"].append("Wikipedia")

    # Neither source produced data
    if not species_info["data_sources"]:
        species_info["error"] = "Species information not found in either Wikispecies or Wikipedia."

    return species_info

def get_wikispecies_data(species_name):
    """
    Get species information from the Wikispecies API.

    Queries the page titled *species_name* for its intro extract, categories,
    page info and links, then derives classification, habitat and fun facts
    from that data via extract_classification, extract_habitat and
    extract_fun_facts.

    Args:
        species_name: Page title to look up.

    Returns:
        A dict with "title", "description", "categories", "links",
        "last_modified", "classification", "habitat" and "fun_facts".
        On failure the dict contains an "error" key (missing page, empty
        response, or any exception raised during the request/parsing).
    """
    # Wikispecies API endpoint
    url = "https://species.wikimedia.org/w/api.php"

    params = {
        "action": "query",
        "format": "json",
        "titles": species_name,
        "prop": "extracts|categories|info|links",
        "exintro": True,  # Get only the intro section
        "explaintext": True,  # Get plain text, not HTML
        "cllimit": 50,  # Get more categories
        "pllimit": 50,  # Get more links
    }

    # Suffix -> rank used to recognise taxa among single-word link titles
    link_suffix_ranks = (
        ("idae", "family"),      # Family suffix
        ("inae", "subfamily"),   # Subfamily suffix
        ("ales", "order"),       # Order suffix for plants
        ("aceae", "family"),     # Family suffix for plants
    )

    try:
        # A timeout keeps the UI from hanging forever on a stalled connection
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        pages = data.get("query", {}).get("pages", {})
        if not pages:
            return {"error": "No data found in Wikispecies"}

        # Get the first page (there should only be one)
        page_id = next(iter(pages))
        page = pages[page_id]

        # Default information structure with placeholders
        species_info = {
            "title": species_name,  # Default to the search query
            "description": "No description available.",
            "categories": [],
            "links": [],
            "last_modified": "Unknown",
            "classification": {
                "kingdom": "Unknown",
                "phylum": "Unknown",
                "class": "Unknown",
                "order": "Unknown",
                "family": "Unknown",
                "genus": "Unknown",
                "species": "Unknown"
            },
            "habitat": "Unknown",
            "fun_facts": []
        }

        # A negative page id is treated as "page does not exist"
        if int(page_id) < 0:
            species_info["error"] = "Species not found in Wikispecies. Try a different spelling or check for the scientific name."
            return species_info

        species_info["title"] = page.get("title", species_name)
        species_info["description"] = page.get("extract", "No description available.")

        # Entries without a title are dropped so later string handling cannot fail on None
        species_info["categories"] = [
            cat.get("title") for cat in page.get("categories", []) if cat.get("title")
        ]
        species_info["links"] = [
            link.get("title") for link in page.get("links", []) if link.get("title")
        ]

        species_info["last_modified"] = page.get("touched", "Unknown")

        # Clean up the description: single line, single spaces
        if species_info["description"]:
            flattened = species_info["description"].replace("\n", " ").strip()
            species_info["description"] = re.sub(r' +', ' ', flattened)

        # Strategy 1: classification from categories
        classification = extract_classification(species_info["categories"])
        species_info["classification"] = classification

        # Strategy 2: a two-word title might be a binomial name (genus + species)
        title_parts = species_info.get("title", "").split()
        if len(title_parts) == 2:
            genus, species = title_parts
            if classification.get("genus") == "Unknown":
                classification["genus"] = genus
            if classification.get("species") == "Unknown":
                classification["species"] = species

        # Strategy 3: single-word links with a taxonomic suffix
        for link in species_info["links"]:
            if len(link.split()) != 1:
                continue
            for suffix, rank in link_suffix_ranks:
                if link.endswith(suffix):
                    classification[rank] = link
                    break

        species_info["habitat"] = extract_habitat(species_info["description"])
        species_info["fun_facts"] = extract_fun_facts(species_info["description"])

        # If the description is too short or missing, build a basic one
        if not species_info["description"] or len(species_info["description"]) < 20:
            parts = []

            if classification["genus"] != "Unknown" and classification["species"] != "Unknown":
                parts.append(f"{species_info['title']} is a species in the genus {classification['genus']}.")

            if classification["family"] != "Unknown":
                parts.append(f"It belongs to the family {classification['family']}.")

            if classification["order"] != "Unknown":
                parts.append(f"It is classified under the order {classification['order']}.")

            if parts:
                species_info["description"] = " ".join(parts)
            else:
                species_info["description"] = f"{species_info['title']} is a species documented in Wikispecies, the free species directory."

        return species_info

    except Exception as e:
        # Deliberate best-effort boundary: any failure becomes an error record
        # so the caller can fall back to other sources.
        error_msg = str(e)
        return {
            "error": f"Error retrieving species information from Wikispecies: {error_msg}",
            "title": species_name,
            "description": "No information available due to an error. Please try a different species name.",
            "classification": {"kingdom": "Unknown", "phylum": "Unknown", "class": "Unknown", "order": "Unknown", "family": "Unknown", "genus": "Unknown", "species": "Unknown"},
            "habitat": "Unknown",
            "fun_facts": []
        }

def get_wikipedia_data(species_name):
    """
    Get species information from the English Wikipedia API, focusing on
    description, habitat, and fun facts.

    A search request resolves *species_name* to the best-matching page title;
    a second request fetches that page's plain-text extract, from which the
    description, habitat, fun facts and classification are derived.

    Args:
        species_name: Name to search for.

    Returns:
        A dict with "title", "description", "habitat", "fun_facts" and
        "classification". On failure the dict contains an "error" key
        (no search hit, missing page, or any exception raised during the
        requests/parsing).
    """
    # Wikipedia API endpoint
    url = "https://en.wikipedia.org/w/api.php"

    # First, search for the page to get the correct title
    search_params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": species_name,
        "srlimit": 1,  # Get just the best match
    }

    try:
        # A timeout keeps the UI from hanging forever on a stalled connection
        search_response = requests.get(url, params=search_params, timeout=10)
        search_response.raise_for_status()
        search_data = search_response.json()

        search_results = search_data.get("query", {}).get("search", [])
        if not search_results:
            return {"error": "No matching Wikipedia page found for this species."}

        page_title = search_results[0].get("title")

        # Now get the full page content.
        # "exintro" is intentionally omitted: the MediaWiki API treats a
        # boolean parameter as true whenever it is present (even as "False"),
        # which would limit the extract to the intro and hide every section.
        content_params = {
            "action": "query",
            "format": "json",
            "titles": page_title,
            "prop": "extracts|categories|sections",
            "explaintext": True,  # Get plain text, not HTML
            "cllimit": 50,  # Get more categories
        }

        content_response = requests.get(url, params=content_params, timeout=10)
        content_response.raise_for_status()
        content_data = content_response.json()

        pages = content_data.get("query", {}).get("pages", {})
        if not pages:
            return {"error": "Failed to retrieve Wikipedia page content."}

        # Get the first page (there should only be one)
        page_id = next(iter(pages))
        page = pages[page_id]

        # A negative page id is treated as "page does not exist"
        if int(page_id) < 0:
            return {"error": "Wikipedia page not found."}

        species_info = {
            "title": page.get("title", species_name),
            "description": "",
            "habitat": "Unknown",
            "fun_facts": [],
            "classification": {
                "kingdom": "Unknown",
                "phylum": "Unknown",
                "class": "Unknown",
                "order": "Unknown",
                "family": "Unknown",
                "genus": "Unknown",
                "species": "Unknown"
            }
        }

        full_text = page.get("extract", "")
        if not full_text:
            return species_info

        # Keep paragraph breaks (blank lines) but flatten single newlines
        full_text = full_text.replace("\n\n", "||").replace("\n", " ").replace("||", "\n\n")

        # The first paragraph is usually a good description
        paragraphs = full_text.split("\n\n")
        if paragraphs:
            species_info["description"] = paragraphs[0].strip()

        # Habitat: dedicated section first, keyword extraction as fallback
        habitat_section = extract_wikipedia_section(full_text, ["Habitat", "Distribution", "Range", "Ecology", "Environment"])
        if habitat_section:
            species_info["habitat"] = habitat_section
        else:
            habitat = extract_habitat(full_text)
            if habitat != "Unknown":
                species_info["habitat"] = habitat

        fun_facts = species_info["fun_facts"]

        # Fun facts from behaviour-related sections
        behavior_section = extract_wikipedia_section(full_text, ["Behavior", "Behaviour", "Life cycle", "Diet", "Feeding", "Reproduction", "Biology"])
        if behavior_section:
            fun_facts.extend(extract_fun_facts(behavior_section) or [])

        # Not enough facts yet: try conservation-related sections
        if len(fun_facts) < 2:
            conservation_section = extract_wikipedia_section(full_text, ["Conservation", "Status", "Threats", "Population"])
            if conservation_section:
                for fact in extract_fun_facts(conservation_section) or []:
                    if fact not in fun_facts:
                        fun_facts.append(fact)

        # Still not enough: fall back to the whole text
        if len(fun_facts) < 2:
            for fact in extract_fun_facts(full_text) or []:
                if fact not in fun_facts:
                    fun_facts.append(fact)

        # Limit to 4 facts
        species_info["fun_facts"] = fun_facts[:4]

        # Extract classification from Wikipedia content
        wiki_classification = extract_wikipedia_classification(full_text, page.get("title", ""), search_data)
        if wiki_classification:
            species_info["classification"] = wiki_classification

        return species_info

    except Exception as e:
        # Deliberate best-effort boundary: any failure becomes an error record
        error_msg = str(e)
        return {
            "error": f"Error retrieving information from Wikipedia: {error_msg}",
            "title": species_name,
            "description": "No information available from Wikipedia due to an error.",
            "habitat": "Unknown",
            "fun_facts": []
        }

def extract_wikipedia_section(text, section_keywords):
    """
    Extract the body of sections whose heading contains one of the keywords.

    Headings are recognised in wiki plain-text form ("== Title ==",
    "=== Title ===", ...). A section body runs from the end of its heading
    to the start of the next heading of any level. Keywords are tried in the
    given order and matched case-insensitively as substrings of the heading.

    Args:
        text: Plain text that may contain wiki-style headings.
        section_keywords: Keywords to look for, in priority order.

    Returns:
        The bodies of up to two matching sections joined by a space; if no
        heading matches, the first paragraph (blank-line separated) that
        contains a keyword; otherwise None.
    """
    if not text:
        return None

    # The title is captured lazily and without surrounding blanks so that
    # "== Habitat ==" yields "Habitat" rather than "Habitat ".
    heading_re = re.compile(r"={2,}[ \t]*([^=\n]+?)[ \t]*={2,}")
    headings = list(heading_re.finditer(text))

    matching_sections = []
    used = set()  # heading indexes already collected, so each section appears once
    for keyword in section_keywords:
        needle = keyword.lower()
        for index, heading in enumerate(headings):
            if index in used or needle not in heading.group(1).lower():
                continue
            used.add(index)

            body_start = heading.end()
            if index + 1 < len(headings):
                body_end = headings[index + 1].start()
            else:
                # This is the last section
                body_end = len(text)

            section_text = text[body_start:body_end].strip()
            if section_text:
                matching_sections.append(section_text)

    # Join the matches (limit to 2 for conciseness)
    if matching_sections:
        return " ".join(matching_sections[:2])

    # Alternative approach: look for paragraphs containing the keywords
    paragraphs = text.split("\n\n")
    for keyword in section_keywords:
        needle = keyword.lower()
        for paragraph in paragraphs:
            if needle in paragraph.lower():
                return paragraph

    return None

def get_species_images(species_name):
    """
    Get species images from the Wikimedia Commons API, trying progressively
    broader searches until something is found.

    Search order: "file:<name>", then "<name>", then (for two-word names with
    fewer than 3 results) the first word alone, and finally a generic
    fallback query.

    Args:
        species_name: Name to search for.

    Returns:
        A list of dicts with "title", "url", "thumb_url", "description",
        "author" and "license". Every entry has a non-empty "url"; a failed
        request contributes no entries.
    """
    # Wikimedia Commons API endpoint
    url = "https://commons.wikimedia.org/w/api.php"

    # File types that are not displayable images (documents, audio, video, ...)
    non_image_suffixes = ('.pdf', '.svg', '.mp3', '.mp4', '.ogg', '.wav', '.webm')

    def search_images(search_term, limit=10):
        """Run one Commons file search and return the usable image records."""
        params = {
            "action": "query",
            "format": "json",
            "generator": "search",
            "gsrnamespace": 6,  # File namespace
            "gsrsearch": search_term,
            "gsrlimit": limit,  # Limit results
            "prop": "imageinfo",
            "iiprop": "url|extmetadata",
            "iiurlwidth": 800,  # Thumbnail width
        }

        try:
            # A timeout keeps the UI from hanging forever on a stalled connection
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()

            pages = data.get("query", {}).get("pages", {})

            found = []
            for page in pages.values():
                title = page.get("title", "")
                if title.lower().endswith(non_image_suffixes):
                    continue

                image_info = (page.get("imageinfo") or [{}])[0]
                file_url = image_info.get("url", "")
                if not file_url:
                    # Nothing to display for this entry
                    continue

                metadata = image_info.get("extmetadata", {})
                found.append({
                    "title": page.get("title", "Unknown"),
                    "url": file_url,
                    "thumb_url": image_info.get("thumburl", ""),
                    "description": metadata.get("ImageDescription", {}).get("value", "No description"),
                    "author": metadata.get("Artist", {}).get("value", "Unknown"),
                    "license": metadata.get("License", {}).get("value", "Unknown"),
                })

            return found

        except Exception:
            # Best effort: a failed search simply yields no images. Returning
            # an error record here would be mistaken for an image by callers.
            return []

    # STRATEGY 1: Try exact file name search first
    images = search_images(f"file:{species_name}")

    # STRATEGY 2: Drop the file: prefix for broader results
    if not images:
        images = search_images(species_name)

    # STRATEGY 3: Few results and a binomial name -> also search the genus alone
    if len(images) < 3:
        name_parts = species_name.split()
        if len(name_parts) == 2:
            genus_images = search_images(name_parts[0])

            seen_urls = {img.get("url") for img in images}
            for img in genus_images:
                if img.get("url") in seen_urls:
                    continue
                images.append(img)
                seen_urls.add(img.get("url"))

                # Stop if we now have enough images
                if len(images) >= 5:
                    break

    if images:
        return images

    # STRATEGY 4: Last resort - try a very general search
    # This could be improved by using the taxonomy info
    return search_images("species taxonomy nature")

def _strip_category_prefix(category):
    """Return *category* without a leading "Category:" namespace prefix."""
    prefix = "Category:"
    return category[len(prefix):] if category.startswith(prefix) else category

def _first_word(text):
    """Return the first word of *text*, capitalized and without surrounding
    punctuation, or None when *text* contains no word."""
    for token in text.split():
        cleaned = re.sub(r"^\W+|\W+$", "", token)
        if cleaned:
            return cleaned.capitalize()
    return None

def extract_classification(categories):
    """
    Extract taxonomic classification from a list of category names.

    Three heuristics are applied in order:
      1. explicit markers such as "Family: Felidae" or "Order Carnivora";
      2. single-word categories with a taxonomic suffix (-idae, -aceae, ...);
      3. a bare rank word followed by a value, used only for ranks that are
         still unknown.

    Markers and rank words must start at a word boundary, so "Subfamily X"
    is not read as a family and "Classification ..." is not read as a class.

    Args:
        categories: Iterable of category titles, with or without the
            "Category:" prefix. Non-string entries are ignored.

    Returns:
        A dict with the keys "kingdom", "phylum", "class", "order",
        "family", "genus" and "species" (value "Unknown" when not found),
        plus "subfamily"/"suborder" when detected by suffix.
    """
    classification = {
        "kingdom": "Unknown",
        "phylum": "Unknown",
        "class": "Unknown",
        "order": "Unknown",
        "family": "Unknown",
        "genus": "Unknown",
        "species": "Unknown",
    }

    if not categories:
        return classification

    names = [_strip_category_prefix(c) for c in categories if isinstance(c, str)]

    # Rank markers in several languages, "rank:" and "rank " forms
    taxonomy_patterns = {
        "kingdom": ["kingdom:", "regnum:", "reino:", "regno:", "kingdom ", "regnum ", "reino ", "regno "],
        "phylum": ["phylum:", "division:", "división:", "divisio:", "phylum ", "division ", "división ", "divisio "],
        "class": ["class:", "clase:", "classis:", "class ", "clase ", "classis "],
        "order": ["order:", "orden:", "ordo:", "order ", "orden ", "ordo "],
        "family": ["family:", "familia:", "family ", "familia "],
        "genus": ["genus:", "género:", "genero:", "genus ", "género ", "genero "],
        "species": ["species:", "especie:", "specie:", "species ", "especie ", "specie "]
    }

    # STRATEGY 1: Direct matching from category names
    for name in names:
        lowered = name.lower()
        for rank, patterns in taxonomy_patterns.items():
            for pattern in patterns:
                # The marker must not be the tail of a longer word
                match = re.search(r"(?<!\w)" + re.escape(pattern), lowered)
                if not match:
                    continue
                value = _first_word(lowered[match.end():])
                if value:
                    classification[rank] = value
                    break

    # STRATEGY 2: Single-word categories with a taxonomic suffix
    suffix_ranks = (
        ("idae", "family"),       # Family suffix for animals
        ("inae", "subfamily"),    # Subfamily suffix
        ("ales", "order"),        # Order suffix for plants
        ("aceae", "family"),      # Family suffix for plants
        ("ineae", "suborder"),    # Suborder suffix for plants
        ("oideae", "subfamily"),  # Subfamily suffix for plants
    )
    for name in names:
        words = name.split()
        if len(words) != 1:
            continue
        taxon = words[0]
        for suffix, rank in suffix_ranks:
            if taxon.endswith(suffix):
                classification[rank] = taxon
                break

    # STRATEGY 3: Bare rank word followed by a value; only fills gaps.
    # "division" is the botanical name for the phylum rank, so it is stored
    # under "phylum" (there is no "division" key in the result).
    rank_words = {
        "kingdom": "kingdom",
        "phylum": "phylum",
        "division": "phylum",
        "class": "class",
        "order": "order",
        "family": "family",
        "genus": "genus",
        "species": "species",
    }
    for name in names:
        lowered = name.lower()
        for word, rank in rank_words.items():
            if classification[rank] != "Unknown":
                continue
            match = re.search(rf"\b{word}\b", lowered)
            if not match:
                continue
            value = _first_word(lowered[match.end():])
            if value:
                classification[rank] = value

    # Final cleanup: make sure every found value starts with a capital letter
    for rank, value in classification.items():
        if value != "Unknown":
            classification[rank] = value[0].upper() + value[1:]

    return classification

def extract_habitat(description):
    """
    Extract a short habitat summary from a free-text species description.

    The description is split into sentences and scanned with four keyword
    tiers, from most to least specific (direct habitat phrases, climate and
    terrain words, region names, movement verbs). The first tier that matches
    at least one sentence wins; later tiers are not consulted. If no tier
    matches, a sufficiently long second or first sentence is used instead.

    Args:
        description: Free-text description of the species.

    Returns:
        "Unknown" when there is no usable description; otherwise up to two
        matching sentences joined by a single space, each ending with
        terminal punctuation; or a generic "not available" message when
        nothing suitable is found.
    """
    if not description or description == "No description available":
        return "Unknown"

    # Split the description into sentences (terminal punctuation is kept on each piece)
    sentences = description.replace(". ", ".|").replace("! ", "!|").replace("? ", "?|").split("|")
    sentences = [s.strip() for s in sentences if s.strip()]

    # STRATEGY 1: Direct habitat statements
    habitat_keywords = [
        "habitat", "lives in", "found in", "native to", "occurs in", "distribution",
        "range includes", "ecosystem", "biome", "environment", "inhabits", "dwelling in",
        "endemic to", "natural range", "geographical range", "distributed across",
        "prefers", "thrives in", "flourishes in", "resides in", "habitat type",
        "commonly found", "typically found", "often found", "usually found", "primarily found"
    ]

    # STRATEGY 2: Geography and climate context
    climate_keywords = [
        "tropical", "temperate", "polar", "arctic", "antarctic", "desert",
        "rainforest", "forest", "jungle", "grassland", "savanna", "wetland",
        "marsh", "swamp", "mountain", "alpine", "coastal", "marine", "freshwater",
        "ocean", "sea", "river", "lake", "stream", "pond", "terrestrial", "aquatic",
        "woodland", "meadow", "tundra", "taiga", "steppe", "continent", "island",
        "shore", "beach", "reef", "cave", "burrow", "nest", "canopy", "undergrowth"
    ]

    # STRATEGY 3: Regional indicators (continents, regions, countries)
    region_keywords = [
        "africa", "asia", "europe", "north america", "south america", "australia",
        "antarctica", "oceania", "mediterranean", "pacific", "atlantic", "indian ocean",
        "arctic ocean", "southern ocean", "northern", "southern", "eastern", "western",
        "central", "worldwide", "global", "cosmopolitan", "international"
    ]

    # STRATEGY 4: Verbs that might indicate location or movement patterns
    action_keywords = [
        "migrate", "roam", "travel", "swim", "fly", "climb", "burrow", "dig", "nest",
        "breed", "forage", "hunt", "territory", "range"
    ]

    # Lower-case each sentence once instead of once per keyword comparison.
    lowered_sentences = [(sentence, sentence.lower()) for sentence in sentences]

    # Try each tier in priority order; stop at the first tier that yields anything.
    habitat_sentences = []
    for keywords in (habitat_keywords, climate_keywords, region_keywords, action_keywords):
        habitat_sentences = [
            sentence
            for sentence, lowered in lowered_sentences
            if any(keyword in lowered for keyword in keywords)
        ]
        if habitat_sentences:
            break

    # Fallback: with no keyword hit, the opening sentences often still say
    # where the species lives. Only used when there are at least two sentences.
    if not habitat_sentences and len(sentences) >= 2:
        # With three or more sentences, prefer the second one (the first is
        # often just a definition), provided it is long enough to be informative.
        if len(sentences) > 2 and len(sentences[1].split()) > 5:
            habitat_sentences.append(sentences[1])

        # Otherwise fall back to the first sentence under the same length rule.
        if not habitat_sentences and len(sentences[0].split()) > 5:
            habitat_sentences.append(sentences[0])

    if habitat_sentences:
        # Limit to two sentences for conciseness. Each sentence normally still
        # carries its own terminal punctuation, so join with a plain space;
        # joining with ". " would produce a doubled period ("A.. B.").
        chosen = []
        for sentence in habitat_sentences[:2]:
            sentence = sentence.strip()
            if not sentence.endswith(('.', '!', '?')):
                sentence += '.'
            chosen.append(sentence)
        return " ".join(chosen)

    # Last resort: construct a generic message if we couldn't find specific habitat info
    return "Specific habitat information not available from Wikispecies. Try searching online for more details about this species' natural environment."

def extract_fun_facts(description):
    """
    Pick up to four "fun fact" sentences from a species description.

    Each sentence of at least four words is placed in the first matching
    category, checked in this order: interesting, biological, behavioral,
    reproductive, comparative, measurements (which also requires a digit in
    the sentence). Unmatched sentences longer than five words go to "general".
    Facts are then drawn by priority, skipping candidates whose word overlap
    with an already selected fact exceeds 0.7 (see similarity_score).

    Args:
        description: Free-text description of the species.

    Returns:
        A list of one to four sentences, each ending with terminal
        punctuation. A placeholder sentence is returned when the description
        is missing or yields no usable fact.
    """
    if not description or description == "No description available":
        return ["No specific information available for this species in Wikispecies."]

    # Split the description into sentences
    sentences = description.replace(". ", ".|").replace("! ", "!|").replace("? ", "?|").split("|")
    sentences = [s.strip() for s in sentences if s.strip()]

    # If the description is too short, include it as a single fact
    if len(sentences) == 1 and len(description) < 100:
        if not sentences[0].endswith(('.', '!', '?')):
            sentences[0] += '.'
        return [sentences[0]]

    # STRATEGY 1: Identify sentences with interesting keywords
    interesting_keywords = [
        "interesting", "unique", "unusual", "remarkable", "notable", "surprising",
        "fascinating", "amazing", "extraordinary", "distinctive", "special", "rare",
        "strange", "curious", "unlike", "peculiar", "odd", "bizarre", "striking",
        "colorful", "beautiful", "impressive", "popular", "famous", "well-known",
        "largest", "smallest", "fastest", "slowest", "oldest", "youngest", "only",
        "record", "discovery", "first", "last", "origin", "discovered", "introduced",
        "revered", "sacred", "symbol", "iconic", "emblem", "represented", "mythology",
        "legend", "folklore", "traditional", "cultural", "significance", "historical"
    ]

    # STRATEGY 2: Physical characteristics and biology often make good facts
    biology_keywords = [
        "lifespan", "longevity", "size", "weight", "height", "length", "wingspan",
        "color", "pattern", "marking", "appearance", "physical", "morphology", "anatomy",
        "feature", "characteristic", "distinctive", "body", "shape", "structure",
        "adaptation", "evolved", "evolution", "mutation", "gene", "genetic", "chromosome",
        "hybrid", "species", "subspecies", "variety", "breed", "strain", "extinct",
        "endangered", "threatened", "vulnerable", "conservation", "protected"
    ]

    # STRATEGY 3: Behavior and lifestyle information
    behavior_keywords = [
        "diet", "eat", "feeding", "food", "prey", "predator", "hunt", "scavenge",
        "forage", "graze", "browse", "omnivore", "carnivore", "herbivore", "insectivore",
        "behavior", "behaviour", "habit", "activity", "social", "solitary", "group",
        "herd", "flock", "pack", "colony", "community", "family", "nocturnal", "diurnal",
        "crepuscular", "migrate", "migration", "hibernate", "hibernation", "estivate",
        "dormant", "sleep", "rest", "active", "territory", "defend", "aggressive",
        "docile", "tame", "wild", "domestic", "domesticated", "trained", "human"
    ]

    # STRATEGY 4: Reproduction is always interesting
    reproduction_keywords = [
        "reproduce", "reproduction", "breeding", "mate", "mating", "courtship", "display",
        "attract", "offspring", "young", "juvenile", "infant", "baby", "child", "adult",
        "egg", "spawn", "birth", "pregnant", "gestation", "incubation", "hatch", "nestling",
        "fledgling", "litter", "clutch", "brood", "parent", "care", "raise", "nurse", "wean"
    ]

    # Comparative patterns that often indicate interesting facts
    comparative_patterns = [
        "more than", "less than", "bigger than", "smaller than", "larger than",
        "faster than", "slower than", "better than", "worse than", "greater than",
        "unlike", "similar to", "compared to", "in contrast to", "differs from",
        "up to", "as many as", "can reach", "can grow", "can live", "known to",
        "capable of", "able to", "estimated", "approximately", "about", "around"
    ]

    # Measurement patterns that often indicate interesting statistics
    measurement_patterns = [
        "cm", "meter", "metre", "kilometer", "kilometre", "feet", "foot", "inch",
        "kg", "gram", "pound", "ton", "tonne", "year", "month", "week", "day", "hour",
        "percent", "°C", "°F", "degree", "celsius", "fahrenheit", "temperature",
        "speed", "mph", "kph", "knot", "altitude", "depth", "width", "height"
    ]

    # (category, keywords, requires_digit) in classification order: a sentence
    # lands in the FIRST group that matches, so this order is significant.
    keyword_groups = (
        ("interesting", interesting_keywords, False),
        ("biological", biology_keywords, False),
        ("behavioral", behavior_keywords, False),
        ("reproductive", reproduction_keywords, False),
        ("comparative", comparative_patterns, False),
        ("measurements", measurement_patterns, True),
    )

    fact_candidates = {name: [] for name, _, _ in keyword_groups}
    fact_candidates["general"] = []

    for sentence in sentences:
        word_count = len(sentence.split())
        # Skip very short sentences
        if word_count < 4:
            continue

        lowered = sentence.lower()
        has_digit = any(c.isdigit() for c in sentence)

        matched = None
        for name, keywords, requires_digit in keyword_groups:
            if requires_digit and not has_digit:
                continue
            # keyword.lower() is kept because a few patterns ("°C", "°F") are not lower-case
            if any(keyword.lower() in lowered for keyword in keywords):
                matched = name
                break

        if matched is not None:
            fact_candidates[matched].append(sentence)
        elif word_count > 5:
            # Uncategorised but reasonably long sentences form the general pool
            fact_candidates["general"].append(sentence)

    # Select facts from each category to ensure diversity (prioritizing the most interesting ones)
    selected_facts = []

    # Priority order for fact selection (differs from the classification order above)
    categories = ["interesting", "measurements", "biological", "reproductive", "behavioral", "comparative", "general"]

    # First, take one fact from each of the three highest-priority categories
    for category in categories[:3]:
        if fact_candidates[category]:
            selected_facts.append(fact_candidates[category].pop(0))

    # Then make a single pass over all categories, taking at most one more
    # fact from each until four facts are selected.
    remaining_slots = 4 - len(selected_facts)
    for category in categories:
        if remaining_slots <= 0:
            break
        if not fact_candidates[category]:
            continue
        # The candidate is consumed even when rejected as too similar.
        next_fact = fact_candidates[category].pop(0)
        if not any(similarity_score(next_fact, fact) > 0.7 for fact in selected_facts):
            selected_facts.append(next_fact)
            remaining_slots -= 1

    # If we still don't have enough facts, fall back to raw sentences
    if len(selected_facts) < 2 and sentences:
        # Add the first sentence if it's not already included
        if sentences[0] not in selected_facts and len(sentences[0].split()) > 5:
            selected_facts.append(sentences[0])

        # Add another sentence from the middle of the text if available
        middle_sentence = sentences[len(sentences) // 2]
        if middle_sentence not in selected_facts and len(middle_sentence.split()) > 5:
            selected_facts.append(middle_sentence)

    # Last resort: if still no facts, create a generic fact
    if not selected_facts:
        selected_facts = ["This species is documented in Wikispecies, the free species directory."]

    # Ensure all facts end with proper punctuation
    selected_facts = [
        fact if fact.endswith(('.', '!', '?')) else fact + '.'
        for fact in selected_facts
    ]

    # Remove duplicates while preserving order, then cap at four facts
    return list(dict.fromkeys(selected_facts))[:4]

def similarity_score(str1, str2):
    """
    Return the Jaccard word-overlap similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    number of distinct shared words divided by the number of distinct words
    overall. It is used to avoid selecting near-duplicate facts.

    Returns 0 when either string is empty/None or when neither contains any
    words; otherwise a value between 0 (no shared words) and 1 (same word set).
    """
    if not (str1 and str2):
        return 0

    tokens_a = set(str1.lower().split())
    tokens_b = set(str2.lower().split())

    all_tokens = tokens_a | tokens_b
    if not all_tokens:
        # Both inputs were whitespace only.
        return 0

    shared_tokens = tokens_a & tokens_b
    return len(shared_tokens) / len(all_tokens)

def get_mock_species_from_filename(filename):
    """
    Simulate image recognition by looking for known keywords in the filename.

    In a real application, this would be replaced with an actual image
    recognition API. Matching is a case-insensitive substring test; animal
    keywords are checked before plant keywords, and within each table the
    first keyword (in declaration order) found in the filename wins.

    Args:
        filename: Name of the uploaded file.

    Returns:
        The taxon name mapped to the first matching keyword, or
        "Homo sapiens" when no keyword matches.
    """
    filename_lower = filename.lower()

    # List of common animals and their possible filenames
    animal_keywords = {
        "cat": "Felis catus",
        "dog": "Canis familiaris",
        "bird": "Aves",
        "eagle": "Aquila chrysaetos",
        "lion": "Panthera leo",
        "tiger": "Panthera tigris",
        "bear": "Ursus arctos",
        "wolf": "Canis lupus",
        "fox": "Vulpes vulpes",
        "deer": "Cervidae",
        "elephant": "Loxodonta africana",
        "giraffe": "Giraffa camelopardalis",
        "zebra": "Equus quagga",
        "monkey": "Primates",
        "gorilla": "Gorilla gorilla",
        "fish": "Actinopterygii",
        "shark": "Selachimorpha",
        "dolphin": "Tursiops truncatus",
        "whale": "Cetacea",
        "snake": "Serpentes",
        "lizard": "Lacertilia",
        "turtle": "Testudines",
        "frog": "Anura",
        "butterfly": "Lepidoptera",
        "bee": "Apis mellifera",
    }

    # List of common plants and their possible filenames.
    # Order matters because matching is by substring: a keyword that contains
    # another keyword must be listed first ("sunflower" before "flower"),
    # otherwise the shorter keyword always wins and the longer never matches.
    plant_keywords = {
        "tree": "Arbor",
        "sunflower": "Helianthus annuus",
        "flower": "Anthophyta",
        "rose": "Rosa",
        "tulip": "Tulipa",
        "daisy": "Bellis perennis",
        "oak": "Quercus",
        "pine": "Pinus",
        "maple": "Acer",
        "fern": "Polypodiopsida",
        "moss": "Bryophyta",
        "grass": "Poaceae",
        "cactus": "Cactaceae",
        "palm": "Arecaceae",
        "orchid": "Orchidaceae",
    }

    # Animals take precedence over plants; the first hit is returned.
    for keyword_table in (animal_keywords, plant_keywords):
        for keyword, species in keyword_table.items():
            if keyword in filename_lower:
                return species

    # If no match is found, return a default species
    return "Homo sapiens"

def extract_wikipedia_classification(full_text, title, search_data=None):
    """
    Extract classification/taxonomy information from Wikipedia content.

    Strategies are applied in order: taxonomy-like sections, infobox-style
    "Rank: Name" lines, the first paragraph, a binomial-looking page title,
    and finally prose statements such as "belongs to the family Felidae".
    Infobox-style matches overwrite whatever the section strategy found; all
    other strategies only fill ranks that are still "Unknown".

    Args:
        full_text: The full text content of the Wikipedia page
        title: The title of the Wikipedia page
        search_data: Optional search data; accepted for interface
            compatibility but not used by this function

    Returns:
        A dictionary mapping kingdom, phylum, class, order, family, genus and
        species to the extracted name, or "Unknown" where nothing was found.
        Any exception raised while extracting is printed and the partially
        filled dictionary is returned.
    """
    ranks = ("kingdom", "phylum", "class", "order", "family", "genus", "species")

    # Initialize with default "Unknown" values
    classification = {rank: "Unknown" for rank in ranks}

    if not full_text:
        return classification

    try:
        # STRATEGY 1: Look for taxonomic information in specific sections
        taxonomy_section = extract_wikipedia_section(
            full_text,
            ["Taxonomy", "Classification", "Taxonomic", "Scientific classification"],
        )
        if taxonomy_section:
            classification = extract_taxonomy_from_text(taxonomy_section, classification)

        # STRATEGY 2: Infobox-like "Rank: Name" lines.
        # The leading \b stops "Family:" from matching inside "Superfamily:"
        # (likewise "Suborder:", "Subclass:", ...), which would otherwise
        # assign the wrong taxon to the rank.
        for rank in ranks:
            found = re.search(
                rf"\b{rank.capitalize()}:\s*([A-Za-z]+)", full_text, re.IGNORECASE
            )
            if found:
                classification[rank] = found.group(1).strip()

        # STRATEGY 3: Parse the first paragraph for taxonomic information
        first_para = full_text.split('\n\n', 1)[0]
        classification = extract_taxonomy_from_text(first_para, classification)

        # STRATEGY 4: Try to extract genus and species from the title
        title_parts = title.split()
        if len(title_parts) >= 2 and classification["genus"] == "Unknown":
            genus_part, species_part = title_parts[0], title_parts[1]
            # If title looks like a binomial name (e.g., "Panthera leo")
            if genus_part[0].isupper() and genus_part[1:].islower() and species_part.islower():
                classification["genus"] = genus_part
                if classification["species"] == "Unknown":
                    classification["species"] = species_part

        # STRATEGY 5: Look for taxonomic statements throughout the text,
        # e.g. "belongs to the family Felidae". The article is optional:
        # "(?:the\s+)?" also accepts "belongs to family Felidae", whereas the
        # form "(?:the)?\s+" demands a second run of whitespace when "the" is
        # absent and therefore never matches that phrasing.
        statement_templates = (
            (r"(?:belongs|belonging)\s+to\s+(?:the\s+)?{rank}\s+([A-Za-z]+)", ranks[:5]),
            (r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the\s+)?{rank}\s+([A-Za-z]+)", ranks[:6]),
        )
        for template, template_ranks in statement_templates:
            for rank in template_ranks:
                if classification[rank] != "Unknown":
                    continue
                found = re.search(template.format(rank=rank), full_text, re.IGNORECASE)
                if found:
                    classification[rank] = found.group(1).strip()

        # Final cleanup: ensure the first letter of every found name is upper-case
        for rank, value in classification.items():
            if value != "Unknown":
                classification[rank] = value[0].upper() + value[1:]

    except Exception as e:
        print(f"Error extracting classification from Wikipedia: {str(e)}")
        # If an error occurs, we'll return the classification with whatever data we managed to extract

    return classification

def extract_taxonomy_from_text(text, classification):
    """
    Fill missing taxonomic ranks in `classification` from free text.

    Two regex passes are made and only ranks still set to "Unknown" are
    touched:

    1. Rank keywords (kingdom, phylum, class, order) followed by a
       capitalised name, e.g. "Kingdom: Animalia" or "the order Carnivora".
    2. Characteristic name endings (-idae/-aceae for family, -ales/-ida for
       order, -ia/-phyceae for class, -phyta/-zoa for phylum).

    These are heuristics: a capitalised ordinary word with a matching ending
    (e.g. a place name ending in "ia") can still be picked up by pass 2.

    Args:
        text: The text to analyze
        classification: The current classification dictionary to update;
            it is modified in place

    Returns:
        The same classification dictionary, updated. Any exception raised
        while matching is printed and the dictionary is returned as is.
    """
    if not text:
        return classification

    try:
        # Pass 1: explicit rank keywords.
        #  - \b and the mandatory ":" / whitespace separator keep the keyword
        #    from matching inside longer words ("Classification", "Suborder").
        #  - The keyword itself is case-insensitive, but the name must start
        #    with a capital, so phrases like "in order to" are not taken as
        #    the order "To".
        for rank in ("kingdom", "phylum", "class", "order"):
            if classification[rank] != "Unknown":
                continue  # Skip if we already have a value

            found = re.search(rf"\b(?i:{rank})(?::\s*|\s+)([A-Z][A-Za-z]+)", text)
            if found:
                classification[rank] = found.group(1).capitalize()

        # Pass 2: taxonomic name endings. Names must be capitalised so that
        # ordinary lower-case words ("females", "scales", "via") do not count.
        suffix_patterns = {
            "family": [r"\b([A-Z][a-z]+idae)\b", r"\b([A-Z][a-z]+aceae)\b"],  # Animal and plant families
            "order": [r"\b([A-Z][a-z]+ales)\b", r"\b([A-Z][a-z]+ida)\b"],  # Plant orders and animal orders
            "class": [r"\b([A-Z][a-z]+ia)\b", r"\b([A-Z][a-z]+phyceae)\b"],  # Classes
            "phylum": [r"\b([A-Z][a-z]+phyta)\b", r"\b([A-Z][a-z]+zoa)\b"]  # Plant and animal phyla
        }

        for rank, patterns in suffix_patterns.items():
            if classification[rank] != "Unknown":
                continue  # Skip if we already have a value

            # A name already assigned to another rank (e.g. kingdom
            # "Animalia", which also ends in "ia") must not be reused.
            taken = set(classification.values())
            for pattern in patterns:
                candidate = next(
                    (name for name in re.findall(pattern, text) if name not in taken),
                    None,
                )
                if candidate is not None:
                    classification[rank] = candidate
                    break

    except Exception as e:
        print(f"Error in extract_taxonomy_from_text: {str(e)}")
        # If an error occurs, return the classification as is

    return classification

# Run the app only when this file is executed as a script. The dunder names
# need double underscores: a bare `_name_` is undefined and raises NameError.
if __name__ == "__main__":
    main()