# WildCards / app.py
# Streamlit species-information finder (uploaded by Trinay16, commit c77c0e7).
import streamlit as st
import requests
import os
import re
from PIL import Image
import tempfile
# File extensions the image uploader will accept.
ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif'}


def allowed_file(filename):
    """Return True when *filename* has an extension listed in ALLOWED_EXTENSIONS."""
    _stem, dot, extension = filename.rpartition('.')
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
def main():
    """Render the Streamlit UI: a search-by-name tab and an image-upload tab."""
    st.set_page_config(page_title="Species Information Finder", layout="wide")
    st.title("Species Information Finder")
    st.write("Discover information about any species by name or by uploading an image.")

    # Create tabs for different functionality
    tab1, tab2 = st.tabs(["Search by Name", "Search by Image"])

    with tab1:
        st.header("Search by Species Name")
        species_name = st.text_input("Enter a species name (common or scientific):")
        if st.button("Search"):
            if not species_name:
                st.error("Please enter a species name")
            else:
                with st.spinner("Searching for species information..."):
                    # Get species info from Wikispecies API
                    species_data = get_species_info(species_name)
                    # Get images from Wikimedia Commons API
                    images = get_species_images(species_name)
                    display_results(species_data, images)

    with tab2:
        st.header("Search by Image Upload")
        # FIX: ``type`` is documented as a list of extensions; a set has no
        # stable order, so pass a sorted list for deterministic behaviour.
        uploaded_file = st.file_uploader("Upload an image of a species", type=sorted(ALLOWED_EXTENSIONS))
        if uploaded_file is not None:
            # Second line of defence; the uploader widget already filters by type.
            if allowed_file(uploaded_file.name):
                # Display the uploaded image
                image = Image.open(uploaded_file)
                st.image(image, caption="Uploaded Image", use_column_width=True)
                if st.button("Identify Species"):
                    with st.spinner("Identifying species from image..."):
                        # In a real app, you would call an image recognition API here.
                        # For demo purposes, we use our mock function.
                        species_name = get_mock_species_from_filename(uploaded_file.name)
                        # Get species info from Wikispecies API
                        species_data = get_species_info(species_name)
                        # Get images from Wikimedia Commons API
                        images = get_species_images(species_name)
                        display_results(species_data, images)
            else:
                st.error("File type not allowed. Please upload an image file (PNG, JPG, JPEG, GIF).")
def display_results(species_data, images):
    """Display the results in a formatted way.

    Parameters
    ----------
    species_data : dict
        Result of ``get_species_info``; may contain an ``"error"`` key.
    images : list[dict]
        Image records from ``get_species_images``.
    """
    if "error" in species_data:
        st.error(species_data["error"])
        return

    st.success(f"Found information for: {species_data['title']}")

    # Create columns for layout
    col1, col2 = st.columns([1, 2])

    with col1:
        # Display classification information
        st.subheader("Classification")
        classification = species_data.get("classification", {})
        for rank, value in classification.items():
            if value != "Unknown":
                # BUG FIX: the opening ** was missing, so the label rendered
                # as a literal "Rank:**" instead of bold markdown.
                st.write(f"**{rank.capitalize()}:** {value}")
        # Display habitat information
        if species_data.get("habitat", "Unknown") != "Unknown":
            st.subheader("Habitat")
            st.write(species_data["habitat"])

    with col2:
        # Display description
        st.subheader("Description")
        st.write(species_data.get("description", "No description available."))
        # Display fun facts if available
        if species_data.get("fun_facts"):
            st.subheader("Interesting Facts")
            for i, fact in enumerate(species_data["fun_facts"], 1):
                st.write(f"{i}. {fact}")

    # Display images if available (full-width grid below the columns)
    if images:
        st.subheader("Related Images")
        # Display up to 4 images in a grid
        cols = st.columns(min(4, len(images)))
        for idx, img in enumerate(images[:4]):
            with cols[idx]:
                # Prefer the resized thumbnail; fall back to the original URL.
                # .get() also guards against error records without these keys.
                if img.get("thumb_url"):
                    st.image(img["thumb_url"], caption=img.get("description", ""), use_column_width=True)
                else:
                    st.image(img.get("url", ""), caption=img.get("description", ""), use_column_width=True)
                st.caption(f"Credit: {img.get('author', 'Unknown')} | License: {img.get('license', 'Unknown')}")
    else:
        st.warning("No images found for this species.")
# The helper functions below (get_species_info, get_wikispecies_data,
# get_wikipedia_data, etc.) were carried over from the original Flask version
# of this app; they are framework-independent and need no Streamlit changes.
def get_species_info(species_name):
    """
    Get species information from both Wikispecies and Wikipedia APIs
    with improved extraction and fallback strategies for better results.

    Returns a dict that always carries title/description/classification/
    habitat/fun_facts/data_sources keys; an "error" key is added when
    neither source returned usable data.
    """
    # Create the base species info structure
    species_info = {
        "title": species_name,  # Default to the search query
        "description": "No description available.",
        "categories": [],
        "links": [],
        "last_modified": "Unknown",
        "classification": {
            "kingdom": "Unknown",
            "phylum": "Unknown",
            "class": "Unknown",
            "order": "Unknown",
            "family": "Unknown",
            "genus": "Unknown",
            "species": "Unknown"
        },
        "habitat": "Unknown",
        "fun_facts": [],
        "data_sources": []  # Track where we got data from
    }

    # Try to get data from Wikispecies first
    wikispecies_info = get_wikispecies_data(species_name)
    # If we got a valid response, update our species_info
    if not wikispecies_info.get("error"):
        species_info.update(wikispecies_info)
        species_info["data_sources"].append("Wikispecies")

    # Now try to get complementary data from Wikipedia
    wikipedia_info = get_wikipedia_data(species_name)
    # If Wikipedia returned valid data, supplement our existing info
    if not wikipedia_info.get("error"):
        # Use Wikipedia description if Wikispecies didn't have one
        if species_info["description"] == "No description available." or len(species_info["description"]) < 50:
            species_info["description"] = wikipedia_info.get("description", species_info["description"])

        # Prefer Wikipedia habitat info as it is usually more detailed.
        # BUG FIX: only adopt it when Wikipedia actually found habitat data;
        # previously the "Unknown" placeholder could clobber a real habitat
        # string already obtained from Wikispecies.
        wiki_habitat = wikipedia_info.get("habitat", "Unknown")
        if wiki_habitat != "Unknown":
            species_info["habitat"] = wiki_habitat

        # Merge classification info from Wikipedia, preferring Wikipedia data
        if "classification" in wikipedia_info:
            for rank, value in wikipedia_info["classification"].items():
                if value != "Unknown":
                    species_info["classification"][rank] = value

        # Add Wikipedia fun facts to our collection, avoiding duplicates
        # (similarity_score is presumably defined later in this file).
        if wikipedia_info.get("fun_facts"):
            existing_facts = species_info.get("fun_facts", [])
            for fact in wikipedia_info["fun_facts"]:
                if not any(similarity_score(fact, existing) > 0.7 for existing in existing_facts):
                    existing_facts.append(fact)
            species_info["fun_facts"] = existing_facts[:4]  # Limit to 4 facts

        species_info["data_sources"].append("Wikipedia")

    # If we didn't get any data from either source, return an error
    if not species_info["data_sources"]:
        species_info["error"] = "Species information not found in either Wikispecies or Wikipedia."
    return species_info
def get_wikispecies_data(species_name):
    """
    Get species information from the Wikispecies API.

    Returns a dict with title/description/categories/links/classification/
    habitat/fun_facts; on any failure the returned dict carries an "error" key.
    """
    # Wikispecies API endpoint
    url = "https://species.wikimedia.org/w/api.php"
    # Parameters for the API request - get more info to work with
    params = {
        "action": "query",
        "format": "json",
        "titles": species_name,
        "prop": "extracts|categories|info|links",
        "exintro": True,  # Get only the intro section
        "explaintext": True,  # Get plain text, not HTML
        "cllimit": 50,  # Get more categories
        "pllimit": 50,  # Get more links
    }
    try:
        response = requests.get(url, params=params)
        data = response.json()
        # Extract page data
        pages = data.get("query", {}).get("pages", {})
        if not pages:
            return {"error": "No data found in Wikispecies"}
        # Get the first page (there should only be one)
        page_id = next(iter(pages))
        page = pages[page_id]

        # Default information structure with placeholders
        species_info = {
            "title": species_name,  # Default to the search query
            "description": "No description available.",
            "categories": [],
            "links": [],
            "last_modified": "Unknown",
            "classification": {
                "kingdom": "Unknown",
                "phylum": "Unknown",
                "class": "Unknown",
                "order": "Unknown",
                "family": "Unknown",
                "genus": "Unknown",
                "species": "Unknown"
            },
            "habitat": "Unknown",
            "fun_facts": []
        }

        # Check if the page exists (the MediaWiki API uses negative ids for misses)
        if int(page_id) < 0:
            species_info["error"] = "Species not found in Wikispecies. Try a different spelling or check for the scientific name."
            return species_info

        # Extract the relevant information
        species_info["title"] = page.get("title", species_name)
        species_info["description"] = page.get("extract", "No description available.")
        # Get all categories
        if "categories" in page:
            species_info["categories"] = [cat.get("title") for cat in page.get("categories", [])]
        # Get all links (can be useful for finding related info)
        if "links" in page:
            species_info["links"] = [link.get("title") for link in page.get("links", [])]
        species_info["last_modified"] = page.get("touched", "Unknown")

        # Clean up the description (remove unnecessary line breaks, etc.)
        if species_info["description"]:
            species_info["description"] = species_info["description"].replace("\n", " ").strip()
            # Collapse runs of spaces.
            # FIX: dropped the redundant function-local ``import re`` —
            # ``re`` is already imported at module level.
            species_info["description"] = re.sub(r' +', ' ', species_info["description"])

        # Try different strategies to extract classification
        # Strategy 1: Extract from categories
        species_info["classification"] = extract_classification(species_info["categories"])

        # Strategy 2: Try to extract genus and species from the title if available
        title = species_info.get("title", "")
        title_parts = title.split()
        # If the title consists of two words, it might be a binomial name (genus + species)
        if len(title_parts) == 2:
            genus = title_parts[0]
            species = title_parts[1]
            # Update classification with this information
            classification = species_info.get("classification", {})
            if classification.get("genus") == "Unknown":
                classification["genus"] = genus
            if classification.get("species") == "Unknown":
                classification["species"] = species
            species_info["classification"] = classification

        # Strategy 3: Look for classification information in links
        if species_info.get("links"):
            for link in species_info["links"]:
                # Check if link might be a taxonomic rank
                link_parts = link.split()
                if len(link_parts) == 1:
                    # Check common taxonomic suffixes for families, orders, etc.
                    if link.endswith("idae"):  # Family suffix
                        species_info["classification"]["family"] = link
                    elif link.endswith("inae"):  # Subfamily suffix
                        # Store subfamily info in a separate key
                        species_info["classification"]["subfamily"] = link
                    elif link.endswith("ales"):  # Order suffix for plants
                        species_info["classification"]["order"] = link
                    elif link.endswith("aceae"):  # Family suffix for plants
                        species_info["classification"]["family"] = link

        # Extract habitat info
        species_info["habitat"] = extract_habitat(species_info["description"])
        # Extract fun facts
        species_info["fun_facts"] = extract_fun_facts(species_info["description"])

        # If the description is too short or missing, try to create a basic description
        if not species_info["description"] or len(species_info["description"]) < 20:
            # Create a basic description from available information
            classification = species_info["classification"]
            parts = []
            if classification["genus"] != "Unknown" and classification["species"] != "Unknown":
                parts.append(f"{species_info['title']} is a species in the genus {classification['genus']}.")
            if classification["family"] != "Unknown":
                parts.append(f"It belongs to the family {classification['family']}.")
            if classification["order"] != "Unknown":
                parts.append(f"It is classified under the order {classification['order']}.")
            if parts:
                species_info["description"] = " ".join(parts)
            else:
                species_info["description"] = f"{species_info['title']} is a species documented in Wikispecies, the free species directory."

        return species_info
    except Exception as e:
        error_msg = str(e)
        return {
            "error": f"Error retrieving species information from Wikispecies: {error_msg}",
            "title": species_name,
            "description": "No information available due to an error. Please try a different species name.",
            "classification": {"kingdom": "Unknown", "phylum": "Unknown", "class": "Unknown", "order": "Unknown", "family": "Unknown", "genus": "Unknown", "species": "Unknown"},
            "habitat": "Unknown",
            "fun_facts": []
        }
def get_wikipedia_data(species_name):
    """
    Get species information from Wikipedia API, focusing on description,
    habitat, and fun facts.

    Two requests are made: a search to resolve the exact page title, then a
    content fetch. Returns a dict with title/description/habitat/fun_facts/
    classification, or a dict with an "error" key on any failure.
    """
    # Wikipedia API endpoint
    url = "https://en.wikipedia.org/w/api.php"
    # First, try to search for the page to get the correct title
    search_params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": species_name,
        "srlimit": 1,  # Get just the best match
    }
    try:
        # Search for the page first to get the exact title
        search_response = requests.get(url, params=search_params)
        search_data = search_response.json()
        # Check if we found any search results
        search_results = search_data.get("query", {}).get("search", [])
        if not search_results:
            return {"error": "No matching Wikipedia page found for this species."}
        # Get the page title from the search result
        page_title = search_results[0].get("title")
        # Now get the full page content
        # NOTE(review): "sections" is not a standard prop for action=query
        # (it belongs to action=parse) — confirm it isn't silently ignored.
        content_params = {
            "action": "query",
            "format": "json",
            "titles": page_title,
            "prop": "extracts|categories|sections",
            "exintro": False,  # Get the full content, not just the intro
            "explaintext": True,  # Get plain text, not HTML
            "cllimit": 50,  # Get more categories
        }
        content_response = requests.get(url, params=content_params)
        content_data = content_response.json()
        # Extract page data
        pages = content_data.get("query", {}).get("pages", {})
        if not pages:
            return {"error": "Failed to retrieve Wikipedia page content."}
        # Get the first page (there should only be one)
        page_id = next(iter(pages))
        page = pages[page_id]
        # Check if the page exists (negative page ids mean "missing")
        if int(page_id) < 0:
            return {"error": "Wikipedia page not found."}
        # Get basic information
        species_info = {
            "title": page.get("title", species_name),
            "description": "",
            "habitat": "Unknown",
            "fun_facts": [],
            "classification": {
                "kingdom": "Unknown",
                "phylum": "Unknown",
                "class": "Unknown",
                "order": "Unknown",
                "family": "Unknown",
                "genus": "Unknown",
                "species": "Unknown"
            }
        }
        # Extract the content
        full_text = page.get("extract", "")
        # Clean up the text: protect paragraph breaks with a "||" sentinel,
        # flatten single newlines to spaces, then restore the paragraph breaks.
        if full_text:
            full_text = full_text.replace("\n\n", "||").replace("\n", " ").replace("||", "\n\n")
        # Get sections from the content
        sections = full_text.split("\n\n")
        # The first section is usually a good description
        if sections:
            species_info["description"] = sections[0].strip()
        # Look for habitat information in the full text
        habitat_section = extract_wikipedia_section(full_text, ["Habitat", "Distribution", "Range", "Ecology", "Environment"])
        if habitat_section:
            species_info["habitat"] = habitat_section
        else:
            # If no specific habitat section, use our habitat extraction on the full text
            habitat = extract_habitat(full_text)
            if habitat != "Unknown":
                species_info["habitat"] = habitat
        # Extract fun facts from various interesting sections
        behavior_section = extract_wikipedia_section(full_text, ["Behavior", "Behaviour", "Life cycle", "Diet", "Feeding", "Reproduction", "Biology"])
        if behavior_section:
            facts = extract_fun_facts(behavior_section)
            if facts:
                species_info["fun_facts"].extend(facts)
        # If we don't have enough facts, try conservation status or other sections
        if len(species_info["fun_facts"]) < 2:
            conservation_section = extract_wikipedia_section(full_text, ["Conservation", "Status", "Threats", "Population"])
            if conservation_section:
                facts = extract_fun_facts(conservation_section)
                if facts:
                    for fact in facts:
                        if fact not in species_info["fun_facts"]:
                            species_info["fun_facts"].append(fact)
        # If we still don't have enough facts, use our fun facts extraction on the full text
        if len(species_info["fun_facts"]) < 2:
            general_facts = extract_fun_facts(full_text)
            if general_facts:
                for fact in general_facts:
                    if fact not in species_info["fun_facts"]:
                        species_info["fun_facts"].append(fact)
        # Limit to 4 facts
        species_info["fun_facts"] = species_info["fun_facts"][:4]
        # Extract classification from Wikipedia content
        # (extract_wikipedia_classification is presumably defined later in this file)
        wiki_classification = extract_wikipedia_classification(full_text, page.get("title", ""), search_data)
        if wiki_classification:
            species_info["classification"] = wiki_classification
        return species_info
    except Exception as e:
        error_msg = str(e)
        return {
            "error": f"Error retrieving information from Wikipedia: {error_msg}",
            "title": species_name,
            "description": "No information available from Wikipedia due to an error.",
            "habitat": "Unknown",
            "fun_facts": []
        }
def extract_wikipedia_section(text, section_keywords):
    """
    Try to extract a specific section from Wikipedia plain-text content.

    Looks for "== Heading ==" style headings matching any of
    *section_keywords*; falls back to returning the first paragraph that
    mentions a keyword. Returns the matched text or None.
    """
    if not text:
        return None
    # Try to find section headings in the text
    section_pattern = r"==\s*([^=]+)\s*=="
    sections = re.findall(section_pattern, text)
    # Check if any of our target sections exist
    matching_sections = []
    for keyword in section_keywords:
        for section in sections:
            if keyword.lower() in section.lower():
                # BUG FIX: the captured heading may retain trailing whitespace
                # and articles may not pad headings with single spaces
                # ("==Habitat=="), so escaping the exact "== name ==" string
                # often failed to match. Build a whitespace-tolerant regex
                # around the stripped heading instead.
                section_regex = r"==\s*" + re.escape(section.strip()) + r"\s*=="
                try:
                    # Find where this section starts
                    start_match = re.search(section_regex, text)
                    if start_match:
                        start_pos = start_match.end()
                        # Find where the next section starts
                        next_section = re.search(r"==\s*[^=]+\s*==", text[start_pos:])
                        if next_section:
                            end_pos = start_pos + next_section.start()
                            section_text = text[start_pos:end_pos].strip()
                        else:
                            # This is the last section
                            section_text = text[start_pos:].strip()
                        matching_sections.append(section_text)
                except Exception:
                    # Skip this section if there's any error processing it
                    continue
    # If we found any matching sections, join them (limit to 2 for conciseness)
    if matching_sections:
        return " ".join(matching_sections[:2])
    # Alternative approach: look for paragraphs containing the keywords
    paragraphs = text.split("\n\n")
    for keyword in section_keywords:
        for paragraph in paragraphs:
            if keyword.lower() in paragraph.lower():
                return paragraph
    return None
def get_species_images(species_name):
    """
    Get species images from the Wikimedia Commons API, trying progressively
    broader search strategies until some results are found.

    Returns a list of dicts with title/url/thumb_url/description/author/license.
    """
    # Wikimedia Commons API endpoint
    url = "https://commons.wikimedia.org/w/api.php"

    # Function to perform a search with given parameters
    def search_images(search_term, limit=10):
        params = {
            "action": "query",
            "format": "json",
            "generator": "search",
            "gsrnamespace": 6,  # File namespace
            "gsrsearch": search_term,
            "gsrlimit": limit,  # Limit results
            "prop": "imageinfo",
            "iiprop": "url|extmetadata",
            "iiurlwidth": 800,  # Thumbnail width
        }
        try:
            response = requests.get(url, params=params)
            data = response.json()
            pages = data.get("query", {}).get("pages", {})
            if not pages:
                return []
            images = []
            # Page ids are not needed, so iterate the values only.
            for page in pages.values():
                image_info = page.get("imageinfo", [{}])[0]
                # Extract metadata
                metadata = image_info.get("extmetadata", {})
                description = metadata.get("ImageDescription", {}).get("value", "No description")
                author = metadata.get("Artist", {}).get("value", "Unknown")
                # Renamed from ``license`` to avoid shadowing the builtin.
                license_name = metadata.get("License", {}).get("value", "Unknown")
                # Skip non-image files (like pdfs, audio, etc.)
                title = page.get("title", "").lower()
                if any(ext in title for ext in ['.pdf', '.svg', '.mp3', '.mp4', '.ogg', '.wav', '.webm']):
                    continue
                images.append({
                    "title": page.get("title", "Unknown"),
                    "url": image_info.get("url", ""),
                    "thumb_url": image_info.get("thumburl", ""),
                    "description": description,
                    "author": author,
                    "license": license_name,
                })
            return images
        except Exception as e:
            return [{"error": str(e)}]

    # STRATEGY 1: Try exact file name search first
    images = search_images(f"file:{species_name}")

    # STRATEGY 2: If no results, drop the file: prefix for broader results
    if not images:
        images = search_images(species_name)

    # If still no results or very few, try some variations
    if len(images) < 3:
        name_parts = species_name.split()
        # STRATEGY 3: If it's a binomial name, try with just the genus part
        if len(name_parts) == 2:
            genus_images = search_images(f"{name_parts[0]}")
            # Add unique images from the genus search
            existing_urls = [img.get("url") for img in images]
            for img in genus_images:
                if img.get("url") not in existing_urls:
                    images.append(img)
                    existing_urls.append(img.get("url"))
                # Stop if we now have enough images
                if len(images) >= 5:
                    break

    # If we found at least some images, return them
    if images:
        return images

    # STRATEGY 4: Last resort - try a very general search
    # This could be improved by using the taxonomy info
    return search_images("species taxonomy nature")
def extract_classification(categories):
    """
    Extract taxonomic classification from a list of category names using
    pattern matching on rank prefixes, suffix conventions, and rank keywords.

    Returns a dict with kingdom/phylum/class/order/family/genus/species keys
    ("Unknown" when not found); subfamily/suborder keys may be added.
    """
    # Initialize with default "Unknown" values
    classification = {
        "kingdom": "Unknown",
        "phylum": "Unknown",
        "class": "Unknown",
        "order": "Unknown",
        "family": "Unknown",
        "genus": "Unknown",
        "species": "Unknown",
    }
    # Skip empty categories
    if not categories:
        return classification

    # Common taxonomy patterns in category names with more variations
    taxonomy_patterns = {
        "kingdom": ["kingdom:", "regnum:", "reino:", "regno:", "kingdom ", "regnum ", "reino ", "reino "],
        "phylum": ["phylum:", "division:", "división:", "divisio:", "phylum ", "division ", "división ", "divisio "],
        "class": ["class:", "clase:", "classis:", "class ", "clase ", "classis "],
        "order": ["order:", "orden:", "ordo:", "order ", "orden ", "ordo "],
        "family": ["family:", "familia:", "family ", "familia "],
        "genus": ["genus:", "género:", "genero:", "genus ", "género ", "genero "],
        "species": ["species:", "especie:", "specie:", "species ", "especie ", "specie "]
    }

    # STRATEGY 1: Direct matching from category names
    for category in categories:
        # Skip Categories: prefix if present
        if category.startswith("Category:"):
            category = category[9:]
        category_lower = category.lower()
        # Check for direct taxonomy mentions
        for rank, patterns in taxonomy_patterns.items():
            for pattern in patterns:
                if pattern in category_lower:
                    # Extract the value after the pattern
                    parts = category_lower.split(pattern)
                    if len(parts) > 1:
                        # BUG FIX: guard against an empty remainder (e.g. a
                        # category ending exactly at the pattern, "Family:"),
                        # which previously raised IndexError on split()[0].
                        remainder = parts[1].strip()
                        if remainder:
                            classification[rank] = remainder.split()[0].capitalize()
                            break

    # STRATEGY 2: Look for categories that directly match taxonomic naming conventions
    for category in categories:
        # Skip Categories: prefix if present
        if category.startswith("Category:"):
            category = category[9:]
        category_parts = category.split()
        # Check for single-word categories that might be taxonomic names
        if len(category_parts) == 1:
            name = category_parts[0]
            # Check for common taxonomic suffixes
            if name.endswith("idae"):  # Family suffix for animals
                classification["family"] = name
            elif name.endswith("inae"):  # Subfamily suffix
                classification["subfamily"] = name
            elif name.endswith("ales"):  # Order suffix for plants
                classification["order"] = name
            elif name.endswith("aceae"):  # Family suffix for plants
                classification["family"] = name
            elif name.endswith("ineae"):  # Suborder suffix for plants
                classification["suborder"] = name
            elif name.endswith("oideae"):  # Subfamily suffix for plants
                classification["subfamily"] = name

    # STRATEGY 3: Check for categories that contain common taxonomic rank names
    taxonomic_rank_names = ["kingdom", "phylum", "division", "class", "order", "family", "genus", "species"]
    for category in categories:
        # Skip Categories: prefix if present
        if category.startswith("Category:"):
            category = category[9:]
        category_lower = category.lower()
        for rank in taxonomic_rank_names:
            if rank in category_lower:
                # Look for words after the rank name
                parts = category_lower.split(rank)
                if len(parts) > 1 and parts[1].strip():
                    # Get the first word after the rank
                    value = parts[1].strip().split()[0].capitalize()
                    # BUG FIX: "division" is a rank name but not a key of the
                    # classification dict; the unguarded lookup raised KeyError.
                    if rank in classification and classification[rank] == "Unknown":
                        classification[rank] = value

    # Final cleanup: ensure proper capitalization and formatting
    for rank, value in classification.items():
        if value != "Unknown":
            # Capitalize first letter for taxonomic ranks
            classification[rank] = value[0].upper() + value[1:]
    return classification
def extract_habitat(description):
    """
    Extract habitat information from a description using layered keyword
    strategies with multiple fallbacks.

    Returns a short habitat string, "Unknown" when there is no usable
    description, or a generic message when nothing habitat-like was found.
    """
    # BUG FIX: callers pass the default string "No description available."
    # (with a trailing period); the old exact comparison missed it and the
    # function then produced a meaningless generic answer. startswith covers
    # both variants.
    if not description or description.startswith("No description available"):
        return "Unknown"

    # Split the description into sentences
    sentences = description.replace(". ", ".|").replace("! ", "!|").replace("? ", "?|").split("|")
    sentences = [s.strip() for s in sentences if s.strip()]

    # STRATEGY 1: Direct habitat statements
    habitat_keywords = [
        "habitat", "lives in", "found in", "native to", "occurs in", "distribution",
        "range includes", "ecosystem", "biome", "environment", "inhabits", "dwelling in",
        "endemic to", "natural range", "geographical range", "distributed across",
        "prefers", "thrives in", "flourishes in", "resides in", "habitat type",
        "commonly found", "typically found", "often found", "usually found", "primarily found"
    ]
    # STRATEGY 2: Geography and climate context
    climate_keywords = [
        "tropical", "temperate", "polar", "arctic", "antarctic", "desert",
        "rainforest", "forest", "jungle", "grassland", "savanna", "wetland",
        "marsh", "swamp", "mountain", "alpine", "coastal", "marine", "freshwater",
        "ocean", "sea", "river", "lake", "stream", "pond", "terrestrial", "aquatic",
        "woodland", "meadow", "tundra", "taiga", "steppe", "continent", "island",
        "shore", "beach", "reef", "cave", "burrow", "nest", "canopy", "undergrowth"
    ]
    # STRATEGY 3: Regional indicators (continents, regions, countries)
    region_keywords = [
        "africa", "asia", "europe", "north america", "south america", "australia",
        "antarctica", "oceania", "mediterranean", "pacific", "atlantic", "indian ocean",
        "arctic ocean", "southern ocean", "northern", "southern", "eastern", "western",
        "central", "worldwide", "global", "cosmopolitan", "international"
    ]
    # STRATEGY 4: Verbs that might indicate location or movement patterns
    action_keywords = [
        "migrate", "roam", "travel", "swim", "fly", "climb", "burrow", "dig", "nest",
        "breed", "forage", "hunt", "territory", "range"
    ]

    # Sentences that might contain habitat information
    habitat_sentences = []

    # Apply Strategy 1: Direct habitat statements
    for sentence in sentences:
        for keyword in habitat_keywords:
            if keyword.lower() in sentence.lower():
                habitat_sentences.append(sentence)
                break
    # Apply Strategy 2: Geography and climate context (if strategy 1 didn't yield results)
    if not habitat_sentences:
        for sentence in sentences:
            for keyword in climate_keywords:
                if keyword.lower() in sentence.lower():
                    habitat_sentences.append(sentence)
                    break
    # Apply Strategy 3: Regional indicators (if strategies 1-2 didn't yield results)
    if not habitat_sentences:
        for sentence in sentences:
            for keyword in region_keywords:
                if keyword.lower() in sentence.lower():
                    habitat_sentences.append(sentence)
                    break
    # Apply Strategy 4: Action verbs related to habitat (if strategies 1-3 didn't yield results)
    if not habitat_sentences:
        for sentence in sentences:
            for keyword in action_keywords:
                if keyword.lower() in sentence.lower():
                    habitat_sentences.append(sentence)
                    break

    # Fallback Strategy: If no habitat information was found, try to use the first or second sentence
    # as they often contain general information about where the species lives
    if not habitat_sentences and len(sentences) >= 2:
        # Skip the first sentence if it's just a definition and take the second
        if len(sentences) > 2:
            second_sentence = sentences[1]
            # Check if the second sentence has reasonable length to be informative
            if len(second_sentence.split()) > 5:
                habitat_sentences.append(second_sentence)
        # If second sentence wasn't suitable or not available, use the first
        if not habitat_sentences:
            first_sentence = sentences[0]
            if len(first_sentence.split()) > 5:
                habitat_sentences.append(first_sentence)

    # Format the habitat information
    if habitat_sentences:
        # If we have multiple sentences, join them (but limit to 2 for conciseness)
        if len(habitat_sentences) > 1:
            combined = ". ".join(habitat_sentences[:2]).strip()
            # Make sure it ends with proper punctuation
            if not combined.endswith(('.', '!', '?')):
                combined += '.'
            return combined
        single = habitat_sentences[0].strip()
        # Make sure it ends with proper punctuation
        if not single.endswith(('.', '!', '?')):
            single += '.'
        return single

    # Last resort: construct a generic message if we couldn't find specific habitat info
    return "Specific habitat information not available from Wikispecies. Try searching online for more details about this species' natural environment."
def extract_fun_facts(description):
    """
    Extract up to four interesting "fun facts" from a species description.

    Sentences are bucketed by keyword matching into themed categories
    (interesting trivia, biology, behavior, reproduction, comparisons,
    measurements, general), then selected in priority order while skipping
    near-duplicate sentences so the returned facts are diverse.

    Args:
        description: Free-text species description (e.g. from Wikispecies).

    Returns:
        A list of 1-4 fact sentences, each ending with sentence punctuation.
        A generic placeholder list is returned when no usable information
        is found.
    """
    if not description or description == "No description available":
        return ["No specific information available for this species in Wikispecies."]
    # Split the description into sentences. This is a simple delimiter-based
    # splitter; it does not treat abbreviations like "e.g." specially.
    sentences = description.replace(". ", ".|").replace("! ", "!|").replace("? ", "?|").split("|")
    sentences = [s.strip() for s in sentences if s.strip()]
    # If the description is a single short sentence, return it as the only fact
    if len(sentences) == 1 and len(description) < 100:
        if not sentences[0].endswith(('.', '!', '?')):
            sentences[0] += '.'
        return [sentences[0]]
    # STRATEGY 1: Sentences with explicitly "interesting" wording
    interesting_keywords = [
        "interesting", "unique", "unusual", "remarkable", "notable", "surprising",
        "fascinating", "amazing", "extraordinary", "distinctive", "special", "rare",
        "strange", "curious", "unlike", "peculiar", "odd", "bizarre", "striking",
        "colorful", "beautiful", "impressive", "popular", "famous", "well-known",
        "largest", "smallest", "fastest", "slowest", "oldest", "youngest", "only",
        "record", "discovery", "first", "last", "origin", "discovered", "introduced",
        "revered", "sacred", "symbol", "iconic", "emblem", "represented", "mythology",
        "legend", "folklore", "traditional", "cultural", "significance", "historical"
    ]
    # STRATEGY 2: Physical characteristics and biology often make good facts
    biology_keywords = [
        "lifespan", "longevity", "size", "weight", "height", "length", "wingspan",
        "color", "pattern", "marking", "appearance", "physical", "morphology", "anatomy",
        "feature", "characteristic", "distinctive", "body", "shape", "structure",
        "adaptation", "evolved", "evolution", "mutation", "gene", "genetic", "chromosome",
        "hybrid", "species", "subspecies", "variety", "breed", "strain", "extinct",
        "endangered", "threatened", "vulnerable", "conservation", "protected"
    ]
    # STRATEGY 3: Behavior and lifestyle information
    behavior_keywords = [
        "diet", "eat", "feeding", "food", "prey", "predator", "hunt", "scavenge",
        "forage", "graze", "browse", "omnivore", "carnivore", "herbivore", "insectivore",
        "behavior", "behaviour", "habit", "activity", "social", "solitary", "group",
        "herd", "flock", "pack", "colony", "community", "family", "nocturnal", "diurnal",
        "crepuscular", "migrate", "migration", "hibernate", "hibernation", "estivate",
        "dormant", "sleep", "rest", "active", "territory", "defend", "aggressive",
        "docile", "tame", "wild", "domestic", "domesticated", "trained", "human"
    ]
    # STRATEGY 4: Reproduction is always interesting
    reproduction_keywords = [
        "reproduce", "reproduction", "breeding", "mate", "mating", "courtship", "display",
        "attract", "offspring", "young", "juvenile", "infant", "baby", "child", "adult",
        "egg", "spawn", "birth", "pregnant", "gestation", "incubation", "hatch", "nestling",
        "fledgling", "litter", "clutch", "brood", "parent", "care", "raise", "nurse", "wean"
    ]
    # Comparative patterns that often indicate interesting facts
    comparative_patterns = [
        "more than", "less than", "bigger than", "smaller than", "larger than",
        "faster than", "slower than", "better than", "worse than", "greater than",
        "unlike", "similar to", "compared to", "in contrast to", "differs from",
        "up to", "as many as", "can reach", "can grow", "can live", "known to",
        "capable of", "able to", "estimated", "approximately", "about", "around"
    ]
    # Measurement patterns that often indicate interesting statistics
    measurement_patterns = [
        "cm", "meter", "metre", "kilometer", "kilometre", "feet", "foot", "inch",
        "kg", "gram", "pound", "ton", "tonne", "year", "month", "week", "day", "hour",
        "percent", "°C", "°F", "degree", "celsius", "fahrenheit", "temperature",
        "speed", "mph", "kph", "knot", "altitude", "depth", "width", "height"
    ]
    # Candidate facts, bucketed by category
    fact_candidates = {
        "interesting": [],
        "biological": [],
        "behavioral": [],
        "reproductive": [],
        "comparative": [],
        "measurements": [],
        "general": []
    }
    # Keyword-driven categories, checked in this order; the first category
    # with a matching keyword claims the sentence.
    keyword_categories = [
        ("interesting", interesting_keywords),
        ("biological", biology_keywords),
        ("behavioral", behavior_keywords),
        ("reproductive", reproduction_keywords),
        ("comparative", comparative_patterns),
    ]
    for sentence in sentences:
        # Skip very short sentences: too little content to be a fact
        if len(sentence.split()) < 4:
            continue
        lowered = sentence.lower()
        categorized = False
        for category, keywords in keyword_categories:
            if any(keyword.lower() in lowered for keyword in keywords):
                fact_candidates[category].append(sentence)
                categorized = True
                break
        # Measurements require both a digit and a unit-like pattern.
        # BUG FIX: previously every digit-containing sentence was appended
        # unconditionally (and twice when a unit matched), and a stray
        # `break` aborted the whole sentence loop so later sentences were
        # never examined.
        if not categorized and any(ch.isdigit() for ch in sentence):
            if any(pattern.lower() in lowered for pattern in measurement_patterns):
                fact_candidates["measurements"].append(sentence)
                categorized = True
        # Anything uncategorized but reasonably long goes to the general pool
        if not categorized and len(sentence.split()) > 5:
            fact_candidates["general"].append(sentence)
    # Select facts from each category to ensure diversity
    selected_facts = []
    # Priority order for fact selection (most interesting categories first)
    categories = ["interesting", "measurements", "biological", "reproductive", "behavioral", "comparative", "general"]
    # First, try to get at least one fact from the three highest-priority categories
    for category in categories[:3]:
        if fact_candidates[category]:
            selected_facts.append(fact_candidates[category].pop(0))
    # Now fill remaining slots (maximum 4 facts total) with a mix of all
    # categories, skipping candidates too similar to already-selected facts.
    remaining_slots = 4 - len(selected_facts)
    if remaining_slots > 0:
        for category in categories:
            if fact_candidates[category] and remaining_slots > 0:
                next_fact = fact_candidates[category][0]
                # Only add if not too similar to already selected facts
                if not any(similarity_score(next_fact, fact) > 0.7 for fact in selected_facts):
                    selected_facts.append(next_fact)
                    remaining_slots -= 1
                    fact_candidates[category].pop(0)  # Remove the used fact
    # If we still don't have enough facts, pull sentences straight from the text
    if len(selected_facts) < 2 and sentences:
        # Add the first sentence if it's not already included
        if sentences[0] not in selected_facts and len(sentences[0].split()) > 5:
            selected_facts.append(sentences[0])
        # Add another sentence from the middle of the text if available
        middle_idx = len(sentences) // 2
        if len(sentences) > middle_idx and sentences[middle_idx] not in selected_facts and len(sentences[middle_idx].split()) > 5:
            selected_facts.append(sentences[middle_idx])
    # Last resort: if still no facts, create a generic fact
    if not selected_facts:
        selected_facts = ["This species is documented in Wikispecies, the free species directory."]
    # Ensure all facts end with proper punctuation
    for i in range(len(selected_facts)):
        if not selected_facts[i].endswith(('.', '!', '?')):
            selected_facts[i] += '.'
    # Remove duplicates while preserving order, and cap at four facts
    unique_facts = []
    for fact in selected_facts:
        if fact not in unique_facts:
            unique_facts.append(fact)
    return unique_facts[:4]
def similarity_score(str1, str2):
    """
    Jaccard similarity between two strings based on word overlap.

    Used to avoid selecting facts that are near-duplicates of each other.
    Returns a value between 0 (completely different) and 1 (identical).
    """
    # Either string empty/None means no meaningful overlap
    if not str1 or not str2:
        return 0
    # Case-insensitive word sets
    left = set(str1.lower().split())
    right = set(str2.lower().split())
    union = left | right
    # Guard against division by zero (e.g. whitespace-only inputs)
    return len(left & right) / len(union) if union else 0
def get_mock_species_from_filename(filename):
    """
    A mock function that simulates image recognition by looking at the filename.
    In a real application, this would be replaced with an actual image recognition API.
    """
    name = filename.lower()
    # Single keyword -> species lookup. Animal keywords are listed first so
    # they take precedence over plant keywords, and within each group the
    # insertion order decides which keyword wins for ambiguous filenames.
    keyword_to_species = {
        # Common animals
        "cat": "Felis catus",
        "dog": "Canis familiaris",
        "bird": "Aves",
        "eagle": "Aquila chrysaetos",
        "lion": "Panthera leo",
        "tiger": "Panthera tigris",
        "bear": "Ursus arctos",
        "wolf": "Canis lupus",
        "fox": "Vulpes vulpes",
        "deer": "Cervidae",
        "elephant": "Loxodonta africana",
        "giraffe": "Giraffa camelopardalis",
        "zebra": "Equus quagga",
        "monkey": "Primates",
        "gorilla": "Gorilla gorilla",
        "fish": "Actinopterygii",
        "shark": "Selachimorpha",
        "dolphin": "Tursiops truncatus",
        "whale": "Cetacea",
        "snake": "Serpentes",
        "lizard": "Lacertilia",
        "turtle": "Testudines",
        "frog": "Anura",
        "butterfly": "Lepidoptera",
        "bee": "Apis mellifera",
        # Common plants
        "tree": "Arbor",
        "flower": "Anthophyta",
        "rose": "Rosa",
        "tulip": "Tulipa",
        "daisy": "Bellis perennis",
        "sunflower": "Helianthus annuus",
        "oak": "Quercus",
        "pine": "Pinus",
        "maple": "Acer",
        "fern": "Polypodiopsida",
        "moss": "Bryophyta",
        "grass": "Poaceae",
        "cactus": "Cactaceae",
        "palm": "Arecaceae",
        "orchid": "Orchidaceae",
    }
    for keyword, species in keyword_to_species.items():
        if keyword in name:
            return species
    # No keyword matched: fall back to a default species
    return "Homo sapiens"
def extract_wikipedia_classification(full_text, title, search_data=None):
    """
    Extract classification/taxonomy information from Wikipedia content.
    Uses various strategies including infobox parsing, section analysis, and text pattern matching.
    Args:
        full_text: The full text content of the Wikipedia page
        title: The title of the Wikipedia page
        search_data: Optional search data that might contain additional info
            (currently unused by this function)
    Returns:
        A dictionary with taxonomic ranks and their values; ranks that could
        not be determined keep the value "Unknown"
    """
    # Initialize with default "Unknown" values.
    # NOTE: the insertion order of these keys matters — Strategy 2 below
    # indexes into list(classification.keys()) positionally.
    classification = {
        "kingdom": "Unknown",
        "phylum": "Unknown",
        "class": "Unknown",
        "order": "Unknown",
        "family": "Unknown",
        "genus": "Unknown",
        "species": "Unknown"
    }
    # No text to analyze: return the all-"Unknown" skeleton
    if not full_text:
        return classification
    try:
        # STRATEGY 1: Look for taxonomic information in specific sections
        taxonomy_section = extract_wikipedia_section(full_text, ["Taxonomy", "Classification", "Taxonomic", "Scientific classification"])
        if taxonomy_section:
            # Extract taxonomic information from the section
            classification = extract_taxonomy_from_text(taxonomy_section, classification)
        # STRATEGY 2: Look for taxonomic information in infobox-like structures
        # Wikipedia infoboxes often appear at the beginning of the text with structured format
        # NOTE(review): unlike Strategy 5, this strategy overwrites values even
        # when already filled by Strategy 1 — confirm that precedence is intended.
        infobox_patterns = [
            r"Kingdom:\s*([A-Za-z]+)",
            r"Phylum:\s*([A-Za-z]+)",
            r"Class:\s*([A-Za-z]+)",
            r"Order:\s*([A-Za-z]+)",
            r"Family:\s*([A-Za-z]+)",
            r"Genus:\s*([A-Za-z]+)",
            r"Species:\s*([A-Za-z]+)"
        ]
        # Apply each pattern to extract taxonomic information
        # (pattern i corresponds positionally to the i-th key of `classification`)
        for i, pattern in enumerate(infobox_patterns):
            rank = list(classification.keys())[i]
            matches = re.findall(pattern, full_text, re.IGNORECASE)
            if matches:
                # First occurrence in the page wins
                classification[rank] = matches[0].strip()
        # STRATEGY 3: Parse the first paragraph for taxonomic information
        # First paragraphs in Wikipedia often contain taxonomic statements
        first_para = full_text.split('\n\n')[0] if '\n\n' in full_text else full_text
        classification = extract_taxonomy_from_text(first_para, classification)
        # STRATEGY 4: Try to extract genus and species from the title
        title_parts = title.split()
        if len(title_parts) >= 2 and classification["genus"] == "Unknown":
            # If title looks like a binomial name (e.g., "Panthera leo"):
            # capitalized first word, all-lowercase second word
            if title_parts[0][0].isupper() and title_parts[0][1:].islower() and title_parts[1].islower():
                classification["genus"] = title_parts[0]
                if classification["species"] == "Unknown":
                    classification["species"] = title_parts[1]
        # STRATEGY 5: Look for taxonomic statements throughout the text
        # These patterns match statements like "belongs to the family Felidae"
        taxonomy_statement_patterns = [
            r"(?:belongs|belonging)\s+to\s+(?:the)?\s+kingdom\s+([A-Za-z]+)",
            r"(?:belongs|belonging)\s+to\s+(?:the)?\s+phylum\s+([A-Za-z]+)",
            r"(?:belongs|belonging)\s+to\s+(?:the)?\s+class\s+([A-Za-z]+)",
            r"(?:belongs|belonging)\s+to\s+(?:the)?\s+order\s+([A-Za-z]+)",
            r"(?:belongs|belonging)\s+to\s+(?:the)?\s+family\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+kingdom\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+phylum\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+class\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+order\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+family\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+genus\s+([A-Za-z]+)"
        ]
        # Map pattern index -> taxonomic rank (two pattern families share ranks)
        rank_map = {
            0: "kingdom", 1: "phylum", 2: "class", 3: "order", 4: "family",
            5: "kingdom", 6: "phylum", 7: "class", 8: "order", 9: "family", 10: "genus"
        }
        # Apply statement patterns to extract taxonomic information
        # (these only fill ranks that are still "Unknown")
        for i, pattern in enumerate(taxonomy_statement_patterns):
            rank = rank_map.get(i)
            if not rank:
                continue
            matches = re.findall(pattern, full_text, re.IGNORECASE)
            if matches and classification[rank] == "Unknown":
                classification[rank] = matches[0].strip()
        # Final cleanup: ensure proper capitalization and formatting
        for rank, value in classification.items():
            if value != "Unknown":
                # Capitalize first letter for taxonomic ranks
                classification[rank] = value[0].upper() + value[1:]
    except Exception as e:
        # Best-effort extraction: log and fall through with partial results
        print(f"Error extracting classification from Wikipedia: {str(e)}")
    # If an error occurs, we'll return the classification with whatever data we managed to extract
    return classification
def extract_taxonomy_from_text(text, classification):
    """
    Fill in "Unknown" taxonomic ranks in `classification` by pattern-matching
    the given text (explicit "Rank: Name" statements first, then Latin
    suffix heuristics such as -idae for animal families).
    Args:
        text: The text to analyze
        classification: The current classification dictionary to update
    Returns:
        The updated classification dictionary (mutated in place)
    """
    # Nothing to analyze
    if not text:
        return classification
    try:
        # Explicit statements of taxonomic ranks, e.g. "Kingdom: Animalia"
        explicit_patterns = {
            "kingdom": [r"Kingdom:?\s*([A-Za-z]+)", r"Kingdom\s+([A-Za-z]+)", r"a member of the kingdom\s+([A-Za-z]+)"],
            "phylum": [r"Phylum:?\s*([A-Za-z]+)", r"Phylum\s+([A-Za-z]+)", r"a member of the phylum\s+([A-Za-z]+)"],
            "class": [r"Class:?\s*([A-Za-z]+)", r"Class\s+([A-Za-z]+)", r"a member of the class\s+([A-Za-z]+)"],
            "order": [r"Order:?\s*([A-Za-z]+)", r"Order\s+([A-Za-z]+)", r"a member of the order\s+([A-Za-z]+)"],
        }
        for rank, patterns in explicit_patterns.items():
            # Don't overwrite a rank that already has a value
            if classification[rank] != "Unknown":
                continue
            for pattern in patterns:
                found = re.findall(pattern, text, re.IGNORECASE)
                if not found:
                    continue
                value = found[0].strip()
                # Latin binomial parts get Genus-case; other ranks are capitalized
                if rank in ["genus", "species"]:
                    value = value[0].upper() + value[1:].lower()
                elif rank != "species":
                    value = value.capitalize()
                classification[rank] = value
                break  # First matching pattern wins for this rank
        # Heuristic: characteristic Latin suffixes imply specific ranks
        suffix_patterns = {
            "family": [r"\b([A-Za-z]+idae)\b", r"\b([A-Za-z]+aceae)\b"],  # Animal and plant families
            "order": [r"\b([A-Za-z]+ales)\b", r"\b([A-Za-z]+ida)\b"],  # Plant orders and animal orders
            "class": [r"\b([A-Za-z]+ia)\b", r"\b([A-Za-z]+phyceae)\b"],  # Classes
            "phylum": [r"\b([A-Za-z]+phyta)\b", r"\b([A-Za-z]+zoa)\b"]  # Plant and animal phyla
        }
        for rank, patterns in suffix_patterns.items():
            # Don't overwrite a rank that already has a value
            if classification[rank] != "Unknown":
                continue
            for pattern in patterns:
                found = re.findall(pattern, text)
                if found:
                    classification[rank] = found[0].strip()
                    break  # First matching pattern wins for this rank
    except Exception as e:
        # Best-effort: log and return whatever was extracted so far
        print(f"Error in extract_taxonomy_from_text: {str(e)}")
    return classification
# BUG FIX: the guard was written as `if _name_ == "_main_":` (single
# underscores), which raises NameError the moment the module is executed
# and never runs main(). The dunder spelling below is the correct idiom.
if __name__ == "__main__":
    main()