# Species Information Finder — Streamlit app (Hugging Face Space)
| import streamlit as st | |
| import requests | |
| import os | |
| import re | |
| from PIL import Image | |
| import tempfile | |
| import os | |
| os.environ["STREAMLIT_HOME"] = "/tmp" | |
# File extensions accepted by the image-upload tab.
ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif'}

def allowed_file(filename):
    """Return True if *filename* has an extension listed in ALLOWED_EXTENSIONS."""
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in ALLOWED_EXTENSIONS
def main():
    """Render the Streamlit UI: a name-search tab and an image-upload tab."""
    st.set_page_config(page_title="Species Information Finder", layout="wide")
    st.title("Species Information Finder")
    st.write("Discover information about any species by name or by uploading an image.")

    # Two entry points into the same lookup pipeline.
    name_tab, image_tab = st.tabs(["Search by Name", "Search by Image"])

    with name_tab:
        st.header("Search by Species Name")
        query = st.text_input("Enter a species name (common or scientific):")
        if st.button("Search"):
            if query:
                with st.spinner("Searching for species information..."):
                    # Wikispecies/Wikipedia text data plus Commons images.
                    info = get_species_info(query)
                    pictures = get_species_images(query)
                    display_results(info, pictures)
            else:
                st.error("Please enter a species name")

    with image_tab:
        st.header("Search by Image Upload")
        upload = st.file_uploader("Upload an image of a species", type=ALLOWED_EXTENSIONS)
        if upload is not None:
            if not allowed_file(upload.name):
                st.error("File type not allowed. Please upload an image file (PNG, JPG, JPEG, GIF).")
            else:
                st.image(Image.open(upload), caption="Uploaded Image", use_column_width=True)
                if st.button("Identify Species"):
                    with st.spinner("Identifying species from image..."):
                        # In a real app an image-recognition API would be
                        # called here; the mock infers a name from the file name.
                        guessed = get_mock_species_from_filename(upload.name)
                        info = get_species_info(guessed)
                        pictures = get_species_images(guessed)
                        display_results(info, pictures)
def display_results(species_data, images):
    """Render the species record and its related images.

    Parameters
    ----------
    species_data : dict
        Result of get_species_info(); may contain an "error" key.
    images : list of dict
        Image records from get_species_images() (url/thumb_url/description/
        author/license keys).
    """
    if "error" in species_data:
        st.error(species_data["error"])
        return
    st.success(f"Found information for: {species_data['title']}")
    # Narrow classification column, wide description column.
    col1, col2 = st.columns([1, 2])
    with col1:
        st.subheader("Classification")
        classification = species_data.get("classification", {})
        for rank, value in classification.items():
            if value != "Unknown":
                # Bug fix: the opening "**" was missing, which rendered a
                # stray "**" instead of a bold rank label.
                st.write(f"**{rank.capitalize()}:** {value}")
        if species_data.get("habitat", "Unknown") != "Unknown":
            st.subheader("Habitat")
            st.write(species_data["habitat"])
    with col2:
        st.subheader("Description")
        st.write(species_data.get("description", "No description available."))
        if species_data.get("fun_facts"):
            st.subheader("Interesting Facts")
            for i, fact in enumerate(species_data["fun_facts"], 1):
                st.write(f"{i}. {fact}")
    if images:
        st.subheader("Related Images")
        # Show up to 4 images in a grid.
        cols = st.columns(min(4, len(images)))
        for idx, img in enumerate(images[:4]):
            with cols[idx]:
                # Bug fix: the record always has a "thumb_url" key (possibly
                # ""), so the old membership test could pass an empty URL to
                # st.image. Test truthiness and fall back to the full URL.
                if img.get("thumb_url"):
                    st.image(img["thumb_url"], caption=img.get("description", ""), use_column_width=True)
                else:
                    st.image(img.get("url", ""), caption=img.get("description", ""), use_column_width=True)
                st.caption(f"Credit: {img.get('author', 'Unknown')} | License: {img.get('license', 'Unknown')}")
    else:
        st.warning("No images found for this species.")
# Helper functions below: API access (Wikispecies, Wikipedia, Wikimedia
# Commons) and text-extraction utilities used by the UI above.
def get_species_info(species_name):
    """
    Combine Wikispecies and Wikipedia data for *species_name* into one record.

    Wikispecies is queried first; Wikipedia results then fill in or refine
    the description, habitat, classification and fun facts. The returned
    dict lists its contributing "data_sources" and, when both lookups fail,
    carries an "error" message.
    """
    # Start from a fully populated skeleton so callers can rely on every key.
    result = {
        "title": species_name,  # default to the search query
        "description": "No description available.",
        "categories": [],
        "links": [],
        "last_modified": "Unknown",
        "classification": {
            "kingdom": "Unknown",
            "phylum": "Unknown",
            "class": "Unknown",
            "order": "Unknown",
            "family": "Unknown",
            "genus": "Unknown",
            "species": "Unknown",
        },
        "habitat": "Unknown",
        "fun_facts": [],
        "data_sources": [],  # track where the data came from
    }

    wikispecies = get_wikispecies_data(species_name)
    if not wikispecies.get("error"):
        result.update(wikispecies)
        result["data_sources"].append("Wikispecies")

    wikipedia = get_wikipedia_data(species_name)
    if not wikipedia.get("error"):
        # Prefer Wikipedia's description when Wikispecies gave none or a stub.
        if result["description"] == "No description available." or len(result["description"]) < 50:
            result["description"] = wikipedia.get("description", result["description"])
        # Wikipedia habitat text is usually richer, so it always wins.
        result["habitat"] = wikipedia.get("habitat", result["habitat"])
        # Fold in Wikipedia's classification, preferring its known ranks.
        for rank, value in wikipedia.get("classification", {}).items():
            if value != "Unknown":
                result["classification"][rank] = value
        # Append new facts, skipping near-duplicates of ones we already have.
        if wikipedia.get("fun_facts"):
            facts = result.get("fun_facts", [])
            for candidate in wikipedia["fun_facts"]:
                if all(similarity_score(candidate, known) <= 0.7 for known in facts):
                    facts.append(candidate)
            result["fun_facts"] = facts[:4]  # cap at 4 facts
        result["data_sources"].append("Wikipedia")

    if not result["data_sources"]:
        result["error"] = "Species information not found in either Wikispecies or Wikipedia."
    return result
def get_wikispecies_data(species_name):
    """
    Query the Wikispecies API for *species_name*.

    Returns a dict with title, description, categories, links,
    last_modified, classification, habitat and fun_facts keys. On any
    failure the dict carries an "error" key describing the problem.
    """
    # Wikispecies API endpoint.
    url = "https://species.wikimedia.org/w/api.php"
    # Ask for intro text, categories, page info and links in one request.
    params = {
        "action": "query",
        "format": "json",
        "titles": species_name,
        "prop": "extracts|categories|info|links",
        "exintro": True,  # get only the intro section
        "explaintext": True,  # plain text, not HTML
        "cllimit": 50,  # get more categories
        "pllimit": 50,  # get more links
    }
    try:
        # Bug fix: requests.get() had no timeout, so a stalled connection
        # could hang the app indefinitely.
        response = requests.get(url, params=params, timeout=10)
        data = response.json()
        pages = data.get("query", {}).get("pages", {})
        if not pages:
            return {"error": "No data found in Wikispecies"}
        # Only one title was requested, hence only one page entry.
        page_id = next(iter(pages))
        page = pages[page_id]
        # Default information structure with placeholders.
        species_info = {
            "title": species_name,  # default to the search query
            "description": "No description available.",
            "categories": [],
            "links": [],
            "last_modified": "Unknown",
            "classification": {
                "kingdom": "Unknown",
                "phylum": "Unknown",
                "class": "Unknown",
                "order": "Unknown",
                "family": "Unknown",
                "genus": "Unknown",
                "species": "Unknown"
            },
            "habitat": "Unknown",
            "fun_facts": []
        }
        # A negative page id means the title does not exist.
        if int(page_id) < 0:
            species_info["error"] = "Species not found in Wikispecies. Try a different spelling or check for the scientific name."
            return species_info
        species_info["title"] = page.get("title", species_name)
        species_info["description"] = page.get("extract", "No description available.")
        if "categories" in page:
            species_info["categories"] = [cat.get("title") for cat in page.get("categories", [])]
        # Links can hint at related taxa (used by Strategy 3 below).
        if "links" in page:
            species_info["links"] = [link.get("title") for link in page.get("links", [])]
        species_info["last_modified"] = page.get("touched", "Unknown")
        # Normalise whitespace in the description (the module-level `re`
        # import is used; the redundant function-local import was removed).
        if species_info["description"]:
            species_info["description"] = species_info["description"].replace("\n", " ").strip()
            species_info["description"] = re.sub(r' +', ' ', species_info["description"])
        # Strategy 1: derive classification from the page categories.
        species_info["classification"] = extract_classification(species_info["categories"])
        # Strategy 2: a two-word title is likely a binomial (genus + species).
        title_parts = species_info.get("title", "").split()
        if len(title_parts) == 2:
            genus, species = title_parts
            classification = species_info.get("classification", {})
            if classification.get("genus") == "Unknown":
                classification["genus"] = genus
            if classification.get("species") == "Unknown":
                classification["species"] = species
            species_info["classification"] = classification
        # Strategy 3: infer ranks from taxonomic suffixes in linked titles.
        if species_info.get("links"):
            for link in species_info["links"]:
                link_parts = link.split()
                if len(link_parts) == 1:
                    if link.endswith("idae"):  # animal family suffix
                        species_info["classification"]["family"] = link
                    elif link.endswith("inae"):  # subfamily suffix
                        species_info["classification"]["subfamily"] = link
                    elif link.endswith("ales"):  # plant order suffix
                        species_info["classification"]["order"] = link
                    elif link.endswith("aceae"):  # plant family suffix
                        species_info["classification"]["family"] = link
        species_info["habitat"] = extract_habitat(species_info["description"])
        species_info["fun_facts"] = extract_fun_facts(species_info["description"])
        # If the description is missing or tiny, build one from what we know.
        if not species_info["description"] or len(species_info["description"]) < 20:
            classification = species_info["classification"]
            parts = []
            if classification["genus"] != "Unknown" and classification["species"] != "Unknown":
                parts.append(f"{species_info['title']} is a species in the genus {classification['genus']}.")
            if classification["family"] != "Unknown":
                parts.append(f"It belongs to the family {classification['family']}.")
            if classification["order"] != "Unknown":
                parts.append(f"It is classified under the order {classification['order']}.")
            if parts:
                species_info["description"] = " ".join(parts)
            else:
                species_info["description"] = f"{species_info['title']} is a species documented in Wikispecies, the free species directory."
        return species_info
    except Exception as e:
        return {
            "error": f"Error retrieving species information from Wikispecies: {e}",
            "title": species_name,
            "description": "No information available due to an error. Please try a different species name.",
            "classification": {"kingdom": "Unknown", "phylum": "Unknown", "class": "Unknown", "order": "Unknown", "family": "Unknown", "genus": "Unknown", "species": "Unknown"},
            "habitat": "Unknown",
            "fun_facts": []
        }
def get_wikipedia_data(species_name):
    """
    Query the English Wikipedia API for *species_name*, focusing on the
    description, habitat, classification and interesting facts.

    Returns a dict with title, description, habitat, fun_facts and
    classification keys, or a dict with an "error" key on failure.
    """
    # Wikipedia API endpoint.
    url = "https://en.wikipedia.org/w/api.php"
    # Step 1: search so misspellings/common names resolve to the right page.
    search_params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": species_name,
        "srlimit": 1,  # best match only
    }
    try:
        # Bug fix: both requests.get() calls lacked timeouts, so a stalled
        # connection could hang the app indefinitely.
        search_response = requests.get(url, params=search_params, timeout=10)
        search_data = search_response.json()
        search_results = search_data.get("query", {}).get("search", [])
        if not search_results:
            return {"error": "No matching Wikipedia page found for this species."}
        page_title = search_results[0].get("title")
        # Step 2: fetch the full plain-text content of that page.
        content_params = {
            "action": "query",
            "format": "json",
            "titles": page_title,
            "prop": "extracts|categories|sections",
            "exintro": False,  # full content, not just the intro
            "explaintext": True,  # plain text, not HTML
            "cllimit": 50,
        }
        content_response = requests.get(url, params=content_params, timeout=10)
        content_data = content_response.json()
        pages = content_data.get("query", {}).get("pages", {})
        if not pages:
            return {"error": "Failed to retrieve Wikipedia page content."}
        page_id = next(iter(pages))
        page = pages[page_id]
        # A negative page id means the page does not exist.
        if int(page_id) < 0:
            return {"error": "Wikipedia page not found."}
        species_info = {
            "title": page.get("title", species_name),
            "description": "",
            "habitat": "Unknown",
            "fun_facts": [],
            "classification": {
                "kingdom": "Unknown",
                "phylum": "Unknown",
                "class": "Unknown",
                "order": "Unknown",
                "family": "Unknown",
                "genus": "Unknown",
                "species": "Unknown"
            }
        }
        full_text = page.get("extract", "")
        if full_text:
            # Preserve paragraph breaks while collapsing single newlines.
            full_text = full_text.replace("\n\n", "||").replace("\n", " ").replace("||", "\n\n")
        sections = full_text.split("\n\n")
        if sections:
            # The lead section usually makes a good description.
            species_info["description"] = sections[0].strip()
        # Habitat: prefer a dedicated section, else mine the whole article.
        habitat_section = extract_wikipedia_section(full_text, ["Habitat", "Distribution", "Range", "Ecology", "Environment"])
        if habitat_section:
            species_info["habitat"] = habitat_section
        else:
            habitat = extract_habitat(full_text)
            if habitat != "Unknown":
                species_info["habitat"] = habitat
        # Fun facts: behaviour/biology sections first ...
        behavior_section = extract_wikipedia_section(full_text, ["Behavior", "Behaviour", "Life cycle", "Diet", "Feeding", "Reproduction", "Biology"])
        if behavior_section:
            facts = extract_fun_facts(behavior_section)
            if facts:
                species_info["fun_facts"].extend(facts)
        # ... then conservation sections if we still need more ...
        if len(species_info["fun_facts"]) < 2:
            conservation_section = extract_wikipedia_section(full_text, ["Conservation", "Status", "Threats", "Population"])
            if conservation_section:
                for fact in extract_fun_facts(conservation_section):
                    if fact not in species_info["fun_facts"]:
                        species_info["fun_facts"].append(fact)
        # ... finally the whole article text.
        if len(species_info["fun_facts"]) < 2:
            for fact in extract_fun_facts(full_text):
                if fact not in species_info["fun_facts"]:
                    species_info["fun_facts"].append(fact)
        species_info["fun_facts"] = species_info["fun_facts"][:4]  # cap at 4
        # Classification extracted from the article body/title.
        wiki_classification = extract_wikipedia_classification(full_text, page.get("title", ""), search_data)
        if wiki_classification:
            species_info["classification"] = wiki_classification
        return species_info
    except Exception as e:
        return {
            "error": f"Error retrieving information from Wikipedia: {e}",
            "title": species_name,
            "description": "No information available from Wikipedia due to an error.",
            "habitat": "Unknown",
            "fun_facts": []
        }
def extract_wikipedia_section(text, section_keywords):
    """
    Extract the content of Wikipedia section(s) whose heading contains one
    of *section_keywords* (case-insensitive).

    Returns up to two matching sections joined by a space, a keyword-matching
    paragraph as a fallback, or None when nothing matches.
    """
    if not text:
        return None
    # Headings look like "== Habitat ==" (spacing around the title varies).
    heading_pattern = r"==\s*([^=]+?)\s*=="
    headings = re.findall(heading_pattern, text)
    matching_sections = []
    for keyword in section_keywords:
        for heading in headings:
            if keyword.lower() not in heading.lower():
                continue
            # Bug fix: the heading was previously re-assembled as
            # f"== {heading} ==", which failed whenever the article used
            # different spacing (e.g. "==Habitat==") or the captured title
            # kept trailing whitespace. Match with flexible whitespace.
            heading_regex = r"==\s*" + re.escape(heading.strip()) + r"\s*=="
            try:
                start_match = re.search(heading_regex, text)
                if start_match:
                    start_pos = start_match.end()
                    # Section runs until the next heading or end of text.
                    next_heading = re.search(r"==\s*[^=]+\s*==", text[start_pos:])
                    if next_heading:
                        section_text = text[start_pos:start_pos + next_heading.start()].strip()
                    else:
                        section_text = text[start_pos:].strip()  # last section
                    matching_sections.append(section_text)
            except Exception:
                # Skip this heading if there's any error processing it.
                continue
    if matching_sections:
        # Limit to 2 sections for conciseness.
        return " ".join(matching_sections[:2])
    # Fallback: return the first paragraph mentioning any keyword.
    for keyword in section_keywords:
        for paragraph in text.split("\n\n"):
            if keyword.lower() in paragraph.lower():
                return paragraph
    return None
def get_species_images(species_name):
    """
    Fetch candidate images for *species_name* from Wikimedia Commons.

    Tries progressively broader searches: exact file-name search, plain
    search, genus-only search for binomial names, and finally a generic
    nature search. Returns a list of image dicts (possibly empty).
    """
    # Wikimedia Commons API endpoint.
    url = "https://commons.wikimedia.org/w/api.php"

    def search_images(search_term, limit=10):
        """Run one Commons search and normalise the hits into image dicts."""
        params = {
            "action": "query",
            "format": "json",
            "generator": "search",
            "gsrnamespace": 6,  # File namespace
            "gsrsearch": search_term,
            "gsrlimit": limit,
            "prop": "imageinfo",
            "iiprop": "url|extmetadata",
            "iiurlwidth": 800,  # thumbnail width
        }
        try:
            # Bug fix: added a timeout so a stalled connection cannot hang.
            response = requests.get(url, params=params, timeout=10)
            data = response.json()
            pages = data.get("query", {}).get("pages", {})
            if not pages:
                return []
            images = []
            for page_id, page in pages.items():
                image_info = page.get("imageinfo", [{}])[0]
                metadata = image_info.get("extmetadata", {})
                description = metadata.get("ImageDescription", {}).get("value", "No description")
                author = metadata.get("Artist", {}).get("value", "Unknown")
                license = metadata.get("License", {}).get("value", "Unknown")
                # Skip non-image files (pdfs, audio, video, ...).
                title = page.get("title", "").lower()
                if any(ext in title for ext in ['.pdf', '.svg', '.mp3', '.mp4', '.ogg', '.wav', '.webm']):
                    continue
                images.append({
                    "title": page.get("title", "Unknown"),
                    "url": image_info.get("url", ""),
                    "thumb_url": image_info.get("thumburl", ""),
                    "description": description,
                    "author": author,
                    "license": license,
                })
            return images
        except Exception:
            # Bug fix: previously returned [{"error": ...}]; that fake record
            # reached the UI as an "image" and crashed on the missing "url"
            # key. An empty list lets the fallback strategies run instead.
            return []

    # STRATEGY 1: exact file-name search.
    images = search_images(f"file:{species_name}")
    # STRATEGY 2: plain search without the file: prefix.
    if not images:
        images = search_images(species_name)
    # STRATEGY 3: for binomial names, pad with genus-only results.
    if len(images) < 3:
        name_parts = species_name.split()
        if len(name_parts) == 2:
            genus_images = search_images(f"{name_parts[0]}")
            # Add only images we do not already have.
            existing_urls = [img.get("url") for img in images]
            for img in genus_images:
                if img.get("url") not in existing_urls:
                    images.append(img)
                    existing_urls.append(img.get("url"))
                # Stop once we have enough images.
                if len(images) >= 5:
                    break
    if images:
        return images
    # STRATEGY 4: last resort — a very general search.
    # This could be improved by using the taxonomy info.
    return search_images("species taxonomy nature")
def extract_classification(categories):
    """
    Derive a taxonomic classification dict from Wikispecies category names.

    Three strategies are applied in order: explicit "rank: value" patterns,
    taxonomic name suffixes (-idae, -aceae, ...), and bare rank-name
    mentions. Ranks that cannot be resolved stay "Unknown".
    """
    classification = {
        "kingdom": "Unknown",
        "phylum": "Unknown",
        "class": "Unknown",
        "order": "Unknown",
        "family": "Unknown",
        "genus": "Unknown",
        "species": "Unknown",
    }
    # Skip empty/missing categories.
    if not categories:
        return classification

    # Multilingual "rank:"/"rank " patterns (English/Spanish/Latin).
    taxonomy_patterns = {
        "kingdom": ["kingdom:", "regnum:", "reino:", "regno:", "kingdom ", "regnum ", "reino "],
        "phylum": ["phylum:", "division:", "división:", "divisio:", "phylum ", "division ", "división ", "divisio "],
        "class": ["class:", "clase:", "classis:", "class ", "clase ", "classis "],
        "order": ["order:", "orden:", "ordo:", "order ", "orden ", "ordo "],
        "family": ["family:", "familia:", "family ", "familia "],
        "genus": ["genus:", "género:", "genero:", "genus ", "género ", "genero "],
        "species": ["species:", "especie:", "specie:", "species ", "especie ", "specie "]
    }

    # STRATEGY 1: direct "pattern value" matches in category names.
    for category in categories:
        if category.startswith("Category:"):
            category = category[9:]
        category_lower = category.lower()
        for rank, patterns in taxonomy_patterns.items():
            for pattern in patterns:
                if pattern in category_lower:
                    parts = category_lower.split(pattern)
                    # Bug fix: guard against an empty remainder (a category
                    # ending exactly with the pattern), which previously
                    # raised IndexError on .strip().split()[0].
                    if len(parts) > 1 and parts[1].strip():
                        value = parts[1].strip().split()[0].capitalize()
                        classification[rank] = value
                    break

    # STRATEGY 2: single-word categories with taxonomic suffixes.
    for category in categories:
        if category.startswith("Category:"):
            category = category[9:]
        category_parts = category.split()
        if len(category_parts) == 1:
            name = category_parts[0]
            if name.endswith("idae"):  # animal family suffix
                classification["family"] = name
            elif name.endswith("inae"):  # subfamily suffix
                classification["subfamily"] = name
            elif name.endswith("ales"):  # plant order suffix
                classification["order"] = name
            elif name.endswith("aceae"):  # plant family suffix
                classification["family"] = name
            elif name.endswith("ineae"):  # plant suborder suffix
                classification["suborder"] = name
            elif name.endswith("oideae"):  # plant subfamily suffix
                classification["subfamily"] = name

    # STRATEGY 3: bare rank names followed by a value.
    taxonomic_rank_names = ["kingdom", "phylum", "division", "class", "order", "family", "genus", "species"]
    for category in categories:
        if category.startswith("Category:"):
            category = category[9:]
        category_lower = category.lower()
        for rank in taxonomic_rank_names:
            if rank in category_lower:
                parts = category_lower.split(rank)
                if len(parts) > 1 and parts[1].strip():
                    value = parts[1].strip().split()[0].capitalize()
                    # Bug fix: "division" is not a pre-seeded key, so the old
                    # classification[rank] lookup raised KeyError; .get()
                    # fills only ranks that are still unknown (or new).
                    if classification.get(rank, "Unknown") == "Unknown":
                        classification[rank] = value

    # Final cleanup: normalise capitalisation of every resolved value.
    for rank, value in classification.items():
        if value != "Unknown":
            classification[rank] = value[0].upper() + value[1:]
    return classification
def extract_habitat(description):
    """
    Extract habitat information from *description*.

    Four keyword families are tried in priority order (explicit habitat
    phrases, climate/geography terms, regions, movement verbs). If none
    match, an early descriptive sentence is used; otherwise a generic
    message is returned.
    """
    # Bug fix: the original guard compared against "No description available"
    # (no trailing period) and so never matched the module's actual default
    # string "No description available.".
    if not description or description in ("No description available", "No description available."):
        return "Unknown"

    # Crude sentence splitter: mark sentence boundaries, then split.
    sentences = description.replace(". ", ".|").replace("! ", "!|").replace("? ", "?|").split("|")
    sentences = [s.strip() for s in sentences if s.strip()]

    # STRATEGY 1: direct habitat statements.
    habitat_keywords = [
        "habitat", "lives in", "found in", "native to", "occurs in", "distribution",
        "range includes", "ecosystem", "biome", "environment", "inhabits", "dwelling in",
        "endemic to", "natural range", "geographical range", "distributed across",
        "prefers", "thrives in", "flourishes in", "resides in", "habitat type",
        "commonly found", "typically found", "often found", "usually found", "primarily found"
    ]
    # STRATEGY 2: climate and geography context.
    climate_keywords = [
        "tropical", "temperate", "polar", "arctic", "antarctic", "desert",
        "rainforest", "forest", "jungle", "grassland", "savanna", "wetland",
        "marsh", "swamp", "mountain", "alpine", "coastal", "marine", "freshwater",
        "ocean", "sea", "river", "lake", "stream", "pond", "terrestrial", "aquatic",
        "woodland", "meadow", "tundra", "taiga", "steppe", "continent", "island",
        "shore", "beach", "reef", "cave", "burrow", "nest", "canopy", "undergrowth"
    ]
    # STRATEGY 3: regional indicators (continents, regions, oceans).
    region_keywords = [
        "africa", "asia", "europe", "north america", "south america", "australia",
        "antarctica", "oceania", "mediterranean", "pacific", "atlantic", "indian ocean",
        "arctic ocean", "southern ocean", "northern", "southern", "eastern", "western",
        "central", "worldwide", "global", "cosmopolitan", "international"
    ]
    # STRATEGY 4: verbs indicating location or movement patterns.
    action_keywords = [
        "migrate", "roam", "travel", "swim", "fly", "climb", "burrow", "dig", "nest",
        "breed", "forage", "hunt", "territory", "range"
    ]

    habitat_sentences = []
    # Try each keyword family in priority order; stop at the first family
    # that yields any sentence (same cascade as the original strategies 1-4).
    for keywords in (habitat_keywords, climate_keywords, region_keywords, action_keywords):
        for sentence in sentences:
            if any(keyword.lower() in sentence.lower() for keyword in keywords):
                habitat_sentences.append(sentence)
        if habitat_sentences:
            break

    # Fallback: early sentences often say where the species lives.
    if not habitat_sentences and len(sentences) >= 2:
        # Prefer the second sentence (the first is often just a definition),
        # but only if it is long enough to be informative.
        if len(sentences) > 2 and len(sentences[1].split()) > 5:
            habitat_sentences.append(sentences[1])
        if not habitat_sentences and len(sentences[0].split()) > 5:
            habitat_sentences.append(sentences[0])

    if habitat_sentences:
        # Join at most two sentences for conciseness.
        if len(habitat_sentences) > 1:
            combined = ". ".join(habitat_sentences[:2]).strip()
            if not combined.endswith(('.', '!', '?')):
                combined += '.'
            return combined
        single = habitat_sentences[0].strip()
        if not single.endswith(('.', '!', '?')):
            single += '.'
        return single

    # Last resort: generic message when no specific habitat info was found.
    return "Specific habitat information not available from Wikispecies. Try searching online for more details about this species' natural environment."
def extract_fun_facts(description):
    """
    Extract interesting fun facts from the description using keyword-based identification,
    with improved pattern recognition and a structured approach to generate fun facts
    even with limited information.

    Each sentence is assigned to the first matching category (interesting >
    biological > behavioral > reproductive > comparative > measurements >
    general), then up to 4 diverse facts are selected across categories.

    Args:
        description: Free-text species description (e.g. from Wikispecies).

    Returns:
        A non-empty list of 1-4 fact sentences, each ending with punctuation.
        A generic placeholder is returned when no usable text is available.
    """
    if not description or description == "No description available":
        return ["No specific information available for this species in Wikispecies."]
    # Split the description into sentences; the '|' placeholder lets us split
    # on '.', '!' and '?' sentence boundaries in a single pass.
    sentences = description.replace(". ", ".|").replace("! ", "!|").replace("? ", "?|").split("|")
    sentences = [s.strip() for s in sentences if s.strip()]
    # If the description is too short, include it as a single fact
    if len(sentences) == 1 and len(description) < 100:
        if not sentences[0].endswith(('.', '!', '?')):
            sentences[0] += '.'
        return [sentences[0]]
    # STRATEGY 1: Identify sentences with interesting keywords
    interesting_keywords = [
        "interesting", "unique", "unusual", "remarkable", "notable", "surprising",
        "fascinating", "amazing", "extraordinary", "distinctive", "special", "rare",
        "strange", "curious", "unlike", "peculiar", "odd", "bizarre", "striking",
        "colorful", "beautiful", "impressive", "popular", "famous", "well-known",
        "largest", "smallest", "fastest", "slowest", "oldest", "youngest", "only",
        "record", "discovery", "first", "last", "origin", "discovered", "introduced",
        "revered", "sacred", "symbol", "iconic", "emblem", "represented", "mythology",
        "legend", "folklore", "traditional", "cultural", "significance", "historical"
    ]
    # STRATEGY 2: Physical characteristics and biology often make good facts
    biology_keywords = [
        "lifespan", "longevity", "size", "weight", "height", "length", "wingspan",
        "color", "pattern", "marking", "appearance", "physical", "morphology", "anatomy",
        "feature", "characteristic", "distinctive", "body", "shape", "structure",
        "adaptation", "evolved", "evolution", "mutation", "gene", "genetic", "chromosome",
        "hybrid", "species", "subspecies", "variety", "breed", "strain", "extinct",
        "endangered", "threatened", "vulnerable", "conservation", "protected"
    ]
    # STRATEGY 3: Behavior and lifestyle information
    behavior_keywords = [
        "diet", "eat", "feeding", "food", "prey", "predator", "hunt", "scavenge",
        "forage", "graze", "browse", "omnivore", "carnivore", "herbivore", "insectivore",
        "behavior", "behaviour", "habit", "activity", "social", "solitary", "group",
        "herd", "flock", "pack", "colony", "community", "family", "nocturnal", "diurnal",
        "crepuscular", "migrate", "migration", "hibernate", "hibernation", "estivate",
        "dormant", "sleep", "rest", "active", "territory", "defend", "aggressive",
        "docile", "tame", "wild", "domestic", "domesticated", "trained", "human"
    ]
    # STRATEGY 4: Reproduction is always interesting
    reproduction_keywords = [
        "reproduce", "reproduction", "breeding", "mate", "mating", "courtship", "display",
        "attract", "offspring", "young", "juvenile", "infant", "baby", "child", "adult",
        "egg", "spawn", "birth", "pregnant", "gestation", "incubation", "hatch", "nestling",
        "fledgling", "litter", "clutch", "brood", "parent", "care", "raise", "nurse", "wean"
    ]
    # Comparative patterns that often indicate interesting facts
    comparative_patterns = [
        "more than", "less than", "bigger than", "smaller than", "larger than",
        "faster than", "slower than", "better than", "worse than", "greater than",
        "unlike", "similar to", "compared to", "in contrast to", "differs from",
        "up to", "as many as", "can reach", "can grow", "can live", "known to",
        "capable of", "able to", "estimated", "approximately", "about", "around"
    ]
    # Candidate facts bucketed by the strategy that matched them.
    fact_candidates = {
        "interesting": [],
        "biological": [],
        "behavioral": [],
        "reproductive": [],
        "comparative": [],
        "measurements": [],
        "general": []
    }
    # Apply strategies to collect potential facts
    for sentence in sentences:
        # Skip very short sentences
        if len(sentence.split()) < 4:
            continue
        sentence_lower = sentence.lower()  # hoisted: reused by every strategy below
        # Flag to track if the sentence has been categorized
        categorized = False
        # Strategy 1: Interesting keywords
        for keyword in interesting_keywords:
            if keyword.lower() in sentence_lower:
                fact_candidates["interesting"].append(sentence)
                categorized = True
                break
        if not categorized:
            # Strategy 2: Biological characteristics
            for keyword in biology_keywords:
                if keyword.lower() in sentence_lower:
                    fact_candidates["biological"].append(sentence)
                    categorized = True
                    break
        if not categorized:
            # Strategy 3: Behavior keywords
            for keyword in behavior_keywords:
                if keyword.lower() in sentence_lower:
                    fact_candidates["behavioral"].append(sentence)
                    categorized = True
                    break
        if not categorized:
            # Strategy 4: Reproduction keywords
            for keyword in reproduction_keywords:
                if keyword.lower() in sentence_lower:
                    fact_candidates["reproductive"].append(sentence)
                    categorized = True
                    break
        if not categorized:
            # Check for comparative patterns
            for pattern in comparative_patterns:
                if pattern.lower() in sentence_lower:
                    fact_candidates["comparative"].append(sentence)
                    categorized = True
                    break
        if not categorized:
            # Sentences containing digits usually carry measurements or
            # statistics, which make good facts.
            # BUG FIX: the original appended such sentences twice (once when a
            # unit keyword matched, then again unconditionally) and executed a
            # stray `break` that aborted the *sentence* loop, silently
            # discarding every remaining sentence. Append once and keep
            # iterating instead.
            if any(ch.isdigit() for ch in sentence):
                fact_candidates["measurements"].append(sentence)
                categorized = True
        # If sentence wasn't categorized by any specific strategy, add to general
        if not categorized and len(sentence.split()) > 5:
            fact_candidates["general"].append(sentence)
    # Select facts from each category to ensure diversity (prioritizing the most interesting ones)
    selected_facts = []
    # Priority order for fact selection
    categories = ["interesting", "measurements", "biological", "reproductive", "behavioral", "comparative", "general"]
    # First, try to get at least one fact from high-priority categories
    for category in categories[:3]:  # First 3 are highest priority
        if fact_candidates[category]:
            # pop(0) both selects and removes the used fact
            selected_facts.append(fact_candidates[category].pop(0))
    # Now fill remaining slots with a mix of all categories
    remaining_slots = 4 - len(selected_facts)  # Maximum 4 facts total
    if remaining_slots > 0:
        for category in categories:
            if fact_candidates[category] and remaining_slots > 0:
                next_fact = fact_candidates[category][0]
                # Only add if not too similar to already selected facts
                if not any(similarity_score(next_fact, fact) > 0.7 for fact in selected_facts):
                    selected_facts.append(next_fact)
                    remaining_slots -= 1
                    fact_candidates[category].pop(0)  # Remove the used fact
    # If we still don't have enough facts, add more from general pool
    if len(selected_facts) < 2 and sentences:
        # Add the first sentence if it's not already included
        if sentences[0] not in selected_facts and len(sentences[0].split()) > 5:
            selected_facts.append(sentences[0])
        # Add another sentence from middle of the text if available
        middle_idx = len(sentences) // 2
        if len(sentences) > middle_idx and sentences[middle_idx] not in selected_facts and len(sentences[middle_idx].split()) > 5:
            selected_facts.append(sentences[middle_idx])
    # Last resort: if still no facts, create a generic fact
    if not selected_facts:
        selected_facts = ["This species is documented in Wikispecies, the free species directory."]
    # Ensure all facts end with proper punctuation
    for i in range(len(selected_facts)):
        if not selected_facts[i].endswith(('.', '!', '?')):
            selected_facts[i] += '.'
    # Remove duplicates while preserving order
    unique_facts = []
    for fact in selected_facts:
        if fact not in unique_facts:
            unique_facts.append(fact)
    return unique_facts[:4]  # Limit to max 4 facts
def similarity_score(str1, str2):
    """
    Jaccard word-overlap similarity between two strings.

    Both strings are lowercased and tokenized on whitespace; the score is
    |intersection| / |union| of the resulting word sets, ranging from 0
    (no shared words) to 1 (identical word sets). Used to avoid selecting
    near-duplicate fact sentences.
    """
    # Empty or None input cannot be similar to anything.
    if not str1 or not str2:
        return 0
    tokens_a = set(str1.lower().split())
    tokens_b = set(str2.lower().split())
    shared = tokens_a & tokens_b
    combined = tokens_a | tokens_b
    # Guard against whitespace-only inputs, which tokenize to empty sets.
    return len(shared) / len(combined) if combined else 0
def get_mock_species_from_filename(filename):
    """
    A mock function that simulates image recognition by looking at the filename.
    In a real application, this would be replaced with an actual image recognition API.

    Performs a case-insensitive substring match of known animal keywords
    (checked first) and then plant keywords against the filename, returning
    the associated scientific name, or "Homo sapiens" when nothing matches.
    """
    needle = filename.lower()
    # List of common animals and their possible filenames
    animal_keywords = {
        "cat": "Felis catus",
        "dog": "Canis familiaris",
        "bird": "Aves",
        "eagle": "Aquila chrysaetos",
        "lion": "Panthera leo",
        "tiger": "Panthera tigris",
        "bear": "Ursus arctos",
        "wolf": "Canis lupus",
        "fox": "Vulpes vulpes",
        "deer": "Cervidae",
        "elephant": "Loxodonta africana",
        "giraffe": "Giraffa camelopardalis",
        "zebra": "Equus quagga",
        "monkey": "Primates",
        "gorilla": "Gorilla gorilla",
        "fish": "Actinopterygii",
        "shark": "Selachimorpha",
        "dolphin": "Tursiops truncatus",
        "whale": "Cetacea",
        "snake": "Serpentes",
        "lizard": "Lacertilia",
        "turtle": "Testudines",
        "frog": "Anura",
        "butterfly": "Lepidoptera",
        "bee": "Apis mellifera",
    }
    # List of common plants and their possible filenames
    plant_keywords = {
        "tree": "Arbor",
        "flower": "Anthophyta",
        "rose": "Rosa",
        "tulip": "Tulipa",
        "daisy": "Bellis perennis",
        "sunflower": "Helianthus annuus",
        "oak": "Quercus",
        "pine": "Pinus",
        "maple": "Acer",
        "fern": "Polypodiopsida",
        "moss": "Bryophyta",
        "grass": "Poaceae",
        "cactus": "Cactaceae",
        "palm": "Arecaceae",
        "orchid": "Orchidaceae",
    }
    # Animals are checked before plants, preserving the original lookup order;
    # within each table the insertion order above decides ties.
    for mapping in (animal_keywords, plant_keywords):
        for keyword, species in mapping.items():
            if keyword in needle:
                return species
    # If no match is found, return a default species
    return "Homo sapiens"
def extract_wikipedia_classification(full_text, title, search_data=None):
    """
    Extract classification/taxonomy information from Wikipedia content.
    Uses various strategies including infobox parsing, section analysis, and text pattern matching.

    Five strategies run in order:
      1. Parse a dedicated "Taxonomy"/"Classification" section.
      2. Match infobox-style "Rank: Name" lines anywhere in the text.
      3. Re-parse the first paragraph for taxonomic statements.
      4. Treat a binomial-looking title (e.g. "Panthera leo") as genus + species.
      5. Match prose statements like "belongs to the family Felidae".

    Args:
        full_text: The full text content of the Wikipedia page
        title: The title of the Wikipedia page
        search_data: Optional search data that might contain additional info
            (currently unused by this function)
    Returns:
        A dictionary with taxonomic ranks and their values ("Unknown" where
        nothing could be extracted)
    """
    # Initialize with default "Unknown" values.
    # NOTE: insertion order of these keys matters — strategy 2 below maps
    # pattern index i to the i-th key of this dict.
    classification = {
        "kingdom": "Unknown",
        "phylum": "Unknown",
        "class": "Unknown",
        "order": "Unknown",
        "family": "Unknown",
        "genus": "Unknown",
        "species": "Unknown"
    }
    # No text means nothing to parse; return the all-Unknown template.
    if not full_text:
        return classification
    try:
        # STRATEGY 1: Look for taxonomic information in specific sections
        taxonomy_section = extract_wikipedia_section(full_text, ["Taxonomy", "Classification", "Taxonomic", "Scientific classification"])
        if taxonomy_section:
            # Extract taxonomic information from the section
            classification = extract_taxonomy_from_text(taxonomy_section, classification)
        # STRATEGY 2: Look for taxonomic information in infobox-like structures
        # Wikipedia infoboxes often appear at the beginning of the text with structured format.
        # Pattern order must mirror the key order of `classification` above.
        infobox_patterns = [
            r"Kingdom:\s*([A-Za-z]+)",
            r"Phylum:\s*([A-Za-z]+)",
            r"Class:\s*([A-Za-z]+)",
            r"Order:\s*([A-Za-z]+)",
            r"Family:\s*([A-Za-z]+)",
            r"Genus:\s*([A-Za-z]+)",
            r"Species:\s*([A-Za-z]+)"
        ]
        # Apply each pattern to extract taxonomic information.
        # NOTE(review): unlike strategies 3 and 5, this pass has no "Unknown"
        # guard, so an infobox match OVERWRITES values found by strategy 1 —
        # confirm this precedence is intended.
        for i, pattern in enumerate(infobox_patterns):
            # Relies on dict insertion order (guaranteed in Python 3.7+).
            rank = list(classification.keys())[i]
            matches = re.findall(pattern, full_text, re.IGNORECASE)
            if matches:
                classification[rank] = matches[0].strip()
        # STRATEGY 3: Parse the first paragraph for taxonomic information
        # First paragraphs in Wikipedia often contain taxonomic statements
        first_para = full_text.split('\n\n')[0] if '\n\n' in full_text else full_text
        classification = extract_taxonomy_from_text(first_para, classification)
        # STRATEGY 4: Try to extract genus and species from the title
        title_parts = title.split()
        if len(title_parts) >= 2 and classification["genus"] == "Unknown":
            # If title looks like a binomial name (e.g., "Panthera leo"):
            # capitalized first word, all-lowercase second word.
            if title_parts[0][0].isupper() and title_parts[0][1:].islower() and title_parts[1].islower():
                classification["genus"] = title_parts[0]
                if classification["species"] == "Unknown":
                    classification["species"] = title_parts[1]
        # STRATEGY 5: Look for taxonomic statements throughout the text
        # These patterns match statements like "belongs to the family Felidae"
        taxonomy_statement_patterns = [
            r"(?:belongs|belonging)\s+to\s+(?:the)?\s+kingdom\s+([A-Za-z]+)",
            r"(?:belongs|belonging)\s+to\s+(?:the)?\s+phylum\s+([A-Za-z]+)",
            r"(?:belongs|belonging)\s+to\s+(?:the)?\s+class\s+([A-Za-z]+)",
            r"(?:belongs|belonging)\s+to\s+(?:the)?\s+order\s+([A-Za-z]+)",
            r"(?:belongs|belonging)\s+to\s+(?:the)?\s+family\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+kingdom\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+phylum\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+class\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+order\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+family\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+genus\s+([A-Za-z]+)"
        ]
        # Map pattern index -> taxonomic rank (two pattern families cover the
        # same five ranks, plus one genus pattern at index 10).
        rank_map = {
            0: "kingdom", 1: "phylum", 2: "class", 3: "order", 4: "family",
            5: "kingdom", 6: "phylum", 7: "class", 8: "order", 9: "family", 10: "genus"
        }
        # Apply statement patterns to extract taxonomic information;
        # only fills ranks that are still "Unknown" (no overwriting here).
        for i, pattern in enumerate(taxonomy_statement_patterns):
            rank = rank_map.get(i)
            if not rank:
                continue
            matches = re.findall(pattern, full_text, re.IGNORECASE)
            if matches and classification[rank] == "Unknown":
                classification[rank] = matches[0].strip()
        # Final cleanup: ensure proper capitalization and formatting
        for rank, value in classification.items():
            if value != "Unknown":
                # Capitalize first letter for taxonomic ranks
                classification[rank] = value[0].upper() + value[1:]
    except Exception as e:
        print(f"Error extracting classification from Wikipedia: {str(e)}")
        # If an error occurs, we'll return the classification with whatever data we managed to extract
    return classification
def extract_taxonomy_from_text(text, classification):
    """
    Extract taxonomic information from text using pattern matching.

    Only ranks still set to "Unknown" in *classification* are filled in;
    values found earlier are never overwritten. Two passes are made:

    1. Explicit rank statements ("Kingdom: Animalia", "a member of the
       order Carnivora") for kingdom/phylum/class/order (case-insensitive).
    2. Latin-suffix heuristics (e.g. "-idae"/"-aceae" for families,
       "-ales"/"-ida" for orders) for family/order/class/phylum
       (case-sensitive, values stored exactly as matched).

    Args:
        text: The text to analyze
        classification: The current classification dictionary to update
            (mutated in place)
    Returns:
        Updated classification dictionary
    """
    if not text:
        return classification
    try:
        # Pass 1: explicit rank statements. (Genus/species have no patterns
        # here; they are handled by the title heuristic in the caller.)
        taxonomy_patterns = {
            "kingdom": [r"Kingdom:?\s*([A-Za-z]+)", r"Kingdom\s+([A-Za-z]+)", r"a member of the kingdom\s+([A-Za-z]+)"],
            "phylum": [r"Phylum:?\s*([A-Za-z]+)", r"Phylum\s+([A-Za-z]+)", r"a member of the phylum\s+([A-Za-z]+)"],
            "class": [r"Class:?\s*([A-Za-z]+)", r"Class\s+([A-Za-z]+)", r"a member of the class\s+([A-Za-z]+)"],
            "order": [r"Order:?\s*([A-Za-z]+)", r"Order\s+([A-Za-z]+)", r"a member of the order\s+([A-Za-z]+)"],
        }
        for rank, patterns in taxonomy_patterns.items():
            if classification[rank] != "Unknown":
                continue  # Skip if we already have a value
            for pattern in patterns:
                matches = re.findall(pattern, text, re.IGNORECASE)
                if matches:
                    # Non-species ranks are conventionally capitalized, e.g.
                    # "animalia" -> "Animalia". (The original also had a
                    # genus/species branch here, but it was unreachable: this
                    # dict only ever contains the four ranks above.)
                    classification[rank] = matches[0].strip().capitalize()
                    break  # Stop after finding a match for this rank
        # Pass 2: Latin taxonomic suffixes for ranks still unresolved.
        suffix_patterns = {
            "family": [r"\b([A-Za-z]+idae)\b", r"\b([A-Za-z]+aceae)\b"],  # Animal and plant families
            "order": [r"\b([A-Za-z]+ales)\b", r"\b([A-Za-z]+ida)\b"],  # Plant orders and animal orders
            # NOTE(review): the "+ia" class pattern is very loose and can match
            # non-taxa such as place names — consider tightening it.
            "class": [r"\b([A-Za-z]+ia)\b", r"\b([A-Za-z]+phyceae)\b"],  # Classes
            "phylum": [r"\b([A-Za-z]+phyta)\b", r"\b([A-Za-z]+zoa)\b"]  # Plant and animal phyla
        }
        for rank, patterns in suffix_patterns.items():
            if classification[rank] != "Unknown":
                continue  # Skip if we already have a value
            for pattern in patterns:
                # Case-sensitive on purpose: real taxa are capitalized in
                # running text, and the match is stored verbatim.
                matches = re.findall(pattern, text)
                if matches:
                    classification[rank] = matches[0].strip()
                    break
    except Exception as e:
        print(f"Error in extract_taxonomy_from_text: {str(e)}")
        # If an error occurs, return the classification as is
    return classification
# Standard script entry point: launch the Streamlit app only when this file
# is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()