|
|
import streamlit as st
|
|
|
import requests
|
|
|
import os
|
|
|
import re
|
|
|
from PIL import Image
|
|
|
import tempfile
|
|
|
|
|
|
|
|
|
# Image file extensions accepted by the upload widget (lower-case, no dot).
ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif'}
|
|
|
|
|
|
def allowed_file(filename):
    """Return True when *filename* has an extension listed in ALLOWED_EXTENSIONS."""
    _stem, dot, extension = filename.rpartition('.')
    # A missing dot yields an empty separator, which rejects the file.
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
|
|
|
|
|
|
def main():
    """Entry point: render the Streamlit UI with two search workflows (name / image)."""
    st.set_page_config(page_title="Species Information Finder", layout="wide")

    st.title("Species Information Finder")
    st.write("Discover information about any species by name or by uploading an image.")

    # Both tabs feed the same lookup/display pipeline (get_species_info,
    # get_species_images, display_results).
    tab1, tab2 = st.tabs(["Search by Name", "Search by Image"])

    with tab1:
        st.header("Search by Species Name")
        species_name = st.text_input("Enter a species name (common or scientific):")

        if st.button("Search"):
            if not species_name:
                st.error("Please enter a species name")
            else:
                with st.spinner("Searching for species information..."):
                    # Text metadata and images come from separate wiki APIs.
                    species_data = get_species_info(species_name)
                    images = get_species_images(species_name)
                    display_results(species_data, images)

    with tab2:
        st.header("Search by Image Upload")
        uploaded_file = st.file_uploader("Upload an image of a species", type=ALLOWED_EXTENSIONS)

        if uploaded_file is not None:
            if allowed_file(uploaded_file.name):
                image = Image.open(uploaded_file)
                st.image(image, caption="Uploaded Image", use_column_width=True)

                if st.button("Identify Species"):
                    with st.spinner("Identifying species from image..."):
                        # NOTE: "identification" is mocked from the filename —
                        # not real image recognition (see
                        # get_mock_species_from_filename).
                        species_name = get_mock_species_from_filename(uploaded_file.name)
                        species_data = get_species_info(species_name)
                        images = get_species_images(species_name)
                        display_results(species_data, images)
            else:
                st.error("File type not allowed. Please upload an image file (PNG, JPG, JPEG, GIF).")
|
|
|
|
|
|
def display_results(species_data, images):
    """Display the results in a formatted way.

    Parameters:
        species_data: dict produced by get_species_info (keys: title,
            description, classification, habitat, fun_facts, optionally error).
        images: list of image dicts from get_species_images (keys: url,
            thumb_url, description, author, license).
    """
    if "error" in species_data:
        st.error(species_data["error"])
        return

    st.success(f"Found information for: {species_data['title']}")

    # Narrow column for taxonomy/habitat, wide column for prose.
    col1, col2 = st.columns([1, 2])

    with col1:
        st.subheader("Classification")
        classification = species_data.get("classification", {})
        for rank, value in classification.items():
            if value != "Unknown":
                # Bug fix: the opening "**" was missing, so the Markdown
                # rendered literally as "Kingdom:** Animalia".
                st.write(f"**{rank.capitalize()}:** {value}")

        if species_data.get("habitat", "Unknown") != "Unknown":
            st.subheader("Habitat")
            st.write(species_data["habitat"])

    with col2:
        st.subheader("Description")
        st.write(species_data.get("description", "No description available."))

        if species_data.get("fun_facts"):
            st.subheader("Interesting Facts")
            for i, fact in enumerate(species_data["fun_facts"], 1):
                st.write(f"{i}. {fact}")

    if images:
        st.subheader("Related Images")

        # Show at most four images side by side.
        cols = st.columns(min(4, len(images)))
        for idx, img in enumerate(images[:4]):
            with cols[idx]:
                if "thumb_url" in img:
                    st.image(img["thumb_url"], caption=img.get("description", ""), use_column_width=True)
                else:
                    st.image(img["url"], caption=img.get("description", ""), use_column_width=True)
                st.caption(f"Credit: {img.get('author', 'Unknown')} | License: {img.get('license', 'Unknown')}")
    else:
        st.warning("No images found for this species.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_species_info(species_name):
    """
    Get species information from both Wikispecies and Wikipedia APIs
    with improved extraction and fallback strategies for better results.
    """
    # Baseline record; display_results relies on these exact keys, so every
    # field is pre-filled with a safe default.
    species_info = {
        "title": species_name,
        "description": "No description available.",
        "categories": [],
        "links": [],
        "last_modified": "Unknown",
        "classification": {
            "kingdom": "Unknown",
            "phylum": "Unknown",
            "class": "Unknown",
            "order": "Unknown",
            "family": "Unknown",
            "genus": "Unknown",
            "species": "Unknown"
        },
        "habitat": "Unknown",
        "fun_facts": [],
        "data_sources": []
    }

    # Primary source: Wikispecies.
    wikispecies_info = get_wikispecies_data(species_name)

    if not wikispecies_info.get("error"):
        # Overwrites the defaults wholesale with the Wikispecies payload
        # ("data_sources" survives because the payload never carries that key).
        species_info.update(wikispecies_info)
        species_info["data_sources"].append("Wikispecies")

    # Secondary source: Wikipedia, used to fill gaps and enrich the record.
    wikipedia_info = get_wikipedia_data(species_name)

    if not wikipedia_info.get("error"):
        # Replace the description only when Wikispecies gave nothing usable
        # (still the default, or very short).
        if species_info["description"] == "No description available." or len(species_info["description"]) < 50:
            species_info["description"] = wikipedia_info.get("description", species_info["description"])

        # NOTE(review): Wikipedia's habitat overwrites even with "Unknown"
        # whenever the key is present in its payload — confirm intended.
        species_info["habitat"] = wikipedia_info.get("habitat", species_info["habitat"])

        # Wikipedia's resolved ranks take precedence over Wikispecies'.
        if "classification" in wikipedia_info:
            for rank, value in wikipedia_info["classification"].items():
                if value != "Unknown":
                    species_info["classification"][rank] = value

        # Merge fun facts, skipping near-duplicates (word overlap > 0.7),
        # and cap the list at four entries.
        if wikipedia_info.get("fun_facts"):
            existing_facts = species_info.get("fun_facts", [])
            for fact in wikipedia_info["fun_facts"]:
                if not any(similarity_score(fact, existing) > 0.7 for existing in existing_facts):
                    existing_facts.append(fact)
            species_info["fun_facts"] = existing_facts[:4]

        species_info["data_sources"].append("Wikipedia")

    # Both lookups failed: surface an error the UI can display.
    if not species_info["data_sources"]:
        species_info["error"] = "Species information not found in either Wikispecies or Wikipedia."

    return species_info
|
|
|
|
|
|
def get_wikispecies_data(species_name):
    """
    Get species information from the Wikispecies API.

    Returns a dict with title/description/categories/links/classification/
    habitat/fun_facts, or a dict containing an "error" key when the lookup
    fails or the page does not exist.
    """
    url = "https://species.wikimedia.org/w/api.php"

    params = {
        "action": "query",
        "format": "json",
        "titles": species_name,
        "prop": "extracts|categories|info|links",
        "exintro": True,       # intro section only
        "explaintext": True,   # plain text, no HTML
        "cllimit": 50,
        "pllimit": 50,
    }

    try:
        # Fix: timeout added so a stalled API call cannot hang the UI forever.
        response = requests.get(url, params=params, timeout=10)
        data = response.json()

        pages = data.get("query", {}).get("pages", {})

        if not pages:
            return {"error": "No data found in Wikispecies"}

        # The API keys results by page id; a single title yields one entry.
        page_id = next(iter(pages))
        page = pages[page_id]

        species_info = {
            "title": species_name,
            "description": "No description available.",
            "categories": [],
            "links": [],
            "last_modified": "Unknown",
            "classification": {
                "kingdom": "Unknown",
                "phylum": "Unknown",
                "class": "Unknown",
                "order": "Unknown",
                "family": "Unknown",
                "genus": "Unknown",
                "species": "Unknown"
            },
            "habitat": "Unknown",
            "fun_facts": []
        }

        # Negative page ids mean "missing page" in the MediaWiki API.
        if int(page_id) < 0:
            species_info["error"] = "Species not found in Wikispecies. Try a different spelling or check for the scientific name."
            return species_info

        species_info["title"] = page.get("title", species_name)
        species_info["description"] = page.get("extract", "No description available.")

        if "categories" in page:
            species_info["categories"] = [cat.get("title") for cat in page.get("categories", [])]

        if "links" in page:
            species_info["links"] = [link.get("title") for link in page.get("links", [])]

        species_info["last_modified"] = page.get("touched", "Unknown")

        # Normalise whitespace in the extract. (The redundant function-local
        # "import re" was removed; re is imported at module level.)
        if species_info["description"]:
            species_info["description"] = species_info["description"].replace("\n", " ").strip()
            species_info["description"] = re.sub(r' +', ' ', species_info["description"])

        species_info["classification"] = extract_classification(species_info["categories"])

        # A two-word page title is a binomial name: "<Genus> <species>".
        title = species_info.get("title", "")
        title_parts = title.split()

        if len(title_parts) == 2:
            genus = title_parts[0]
            species = title_parts[1]

            classification = species_info.get("classification", {})
            if classification.get("genus") == "Unknown":
                classification["genus"] = genus
            if classification.get("species") == "Unknown":
                classification["species"] = species
            species_info["classification"] = classification

        # Heuristic: single-word page links with Latin rank suffixes hint at
        # higher taxa (-idae animal family, -aceae plant family, -ales order).
        if species_info.get("links"):
            for link in species_info["links"]:
                link_parts = link.split()
                if len(link_parts) == 1:
                    if link.endswith("idae"):
                        species_info["classification"]["family"] = link
                    elif link.endswith("inae"):
                        species_info["classification"]["subfamily"] = link
                    elif link.endswith("ales"):
                        species_info["classification"]["order"] = link
                    elif link.endswith("aceae"):
                        species_info["classification"]["family"] = link

        species_info["habitat"] = extract_habitat(species_info["description"])

        species_info["fun_facts"] = extract_fun_facts(species_info["description"])

        # Synthesize a minimal description from the classification when the
        # extract is missing or too short to be useful.
        if not species_info["description"] or len(species_info["description"]) < 20:
            classification = species_info["classification"]
            parts = []

            if classification["genus"] != "Unknown" and classification["species"] != "Unknown":
                parts.append(f"{species_info['title']} is a species in the genus {classification['genus']}.")

            if classification["family"] != "Unknown":
                parts.append(f"It belongs to the family {classification['family']}.")

            if classification["order"] != "Unknown":
                parts.append(f"It is classified under the order {classification['order']}.")

            if parts:
                species_info["description"] = " ".join(parts)
            else:
                species_info["description"] = f"{species_info['title']} is a species documented in Wikispecies, the free species directory."

        return species_info

    except Exception as e:
        # Best-effort error record with the same shape callers expect.
        error_msg = str(e)
        return {
            "error": f"Error retrieving species information from Wikispecies: {error_msg}",
            "title": species_name,
            "description": "No information available due to an error. Please try a different species name.",
            "classification": {"kingdom": "Unknown", "phylum": "Unknown", "class": "Unknown", "order": "Unknown", "family": "Unknown", "genus": "Unknown", "species": "Unknown"},
            "habitat": "Unknown",
            "fun_facts": []
        }
|
|
|
|
|
|
def get_wikipedia_data(species_name):
    """
    Get species information from Wikipedia API, focusing on description,
    habitat, and fun facts.

    Returns a dict with title/description/habitat/fun_facts/classification,
    or a dict containing an "error" key when no matching page is found.
    """
    url = "https://en.wikipedia.org/w/api.php"

    # Step 1: find the best-matching page title for the query.
    search_params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": species_name,
        "srlimit": 1,
    }

    try:
        # Fix: timeout added so a stalled API call cannot hang the UI forever.
        search_response = requests.get(url, params=search_params, timeout=10)
        search_data = search_response.json()

        search_results = search_data.get("query", {}).get("search", [])
        if not search_results:
            return {"error": "No matching Wikipedia page found for this species."}

        page_title = search_results[0].get("title")

        # Step 2: fetch the full plain-text extract for that page.
        content_params = {
            "action": "query",
            "format": "json",
            "titles": page_title,
            "prop": "extracts|categories|sections",
            "exintro": False,    # whole article, not just the intro
            "explaintext": True,
            "cllimit": 50,
        }

        content_response = requests.get(url, params=content_params, timeout=10)
        content_data = content_response.json()

        pages = content_data.get("query", {}).get("pages", {})

        if not pages:
            return {"error": "Failed to retrieve Wikipedia page content."}

        page_id = next(iter(pages))
        page = pages[page_id]

        # Negative page ids mean "missing page" in the MediaWiki API.
        if int(page_id) < 0:
            return {"error": "Wikipedia page not found."}

        species_info = {
            "title": page.get("title", species_name),
            "description": "",
            "habitat": "Unknown",
            "fun_facts": [],
            "classification": {
                "kingdom": "Unknown",
                "phylum": "Unknown",
                "class": "Unknown",
                "order": "Unknown",
                "family": "Unknown",
                "genus": "Unknown",
                "species": "Unknown"
            }
        }

        full_text = page.get("extract", "")

        # Collapse single newlines to spaces while preserving paragraph breaks.
        if full_text:
            full_text = full_text.replace("\n\n", "||").replace("\n", " ").replace("||", "\n\n")

        sections = full_text.split("\n\n")

        # The first paragraph doubles as the description.
        if sections:
            species_info["description"] = sections[0].strip()

        # Habitat: prefer a dedicated article section, else keyword scan.
        habitat_section = extract_wikipedia_section(full_text, ["Habitat", "Distribution", "Range", "Ecology", "Environment"])
        if habitat_section:
            species_info["habitat"] = habitat_section
        else:
            habitat = extract_habitat(full_text)
            if habitat != "Unknown":
                species_info["habitat"] = habitat

        # Fun facts: behaviour/biology sections first.
        behavior_section = extract_wikipedia_section(full_text, ["Behavior", "Behaviour", "Life cycle", "Diet", "Feeding", "Reproduction", "Biology"])
        if behavior_section:
            facts = extract_fun_facts(behavior_section)
            if facts:
                species_info["fun_facts"].extend(facts)

        # Top up from conservation info when we have fewer than two facts.
        if len(species_info["fun_facts"]) < 2:
            conservation_section = extract_wikipedia_section(full_text, ["Conservation", "Status", "Threats", "Population"])
            if conservation_section:
                facts = extract_fun_facts(conservation_section)
                if facts:
                    for fact in facts:
                        if fact not in species_info["fun_facts"]:
                            species_info["fun_facts"].append(fact)

        # Last resort: mine the whole article.
        if len(species_info["fun_facts"]) < 2:
            general_facts = extract_fun_facts(full_text)
            if general_facts:
                for fact in general_facts:
                    if fact not in species_info["fun_facts"]:
                        species_info["fun_facts"].append(fact)

        species_info["fun_facts"] = species_info["fun_facts"][:4]

        wiki_classification = extract_wikipedia_classification(full_text, page.get("title", ""), search_data)
        if wiki_classification:
            species_info["classification"] = wiki_classification

        return species_info

    except Exception as e:
        # Best-effort error record with the same shape callers expect.
        error_msg = str(e)
        return {
            "error": f"Error retrieving information from Wikipedia: {error_msg}",
            "title": species_name,
            "description": "No information available from Wikipedia due to an error.",
            "habitat": "Unknown",
            "fun_facts": []
        }
|
|
|
|
|
|
def extract_wikipedia_section(text, section_keywords):
    """
    Try to extract a specific section from Wikipedia plain-text content.

    Parameters:
        text: plain-text article body with "== Heading ==" markers.
        section_keywords: heading names to look for (case-insensitive substring).

    Returns the text of the first matching section(s), a keyword-bearing
    paragraph as a fallback, or None if nothing matches.
    """
    if not text:
        return None

    # Capture heading names. Bug fix: the old greedy "([^=]+)" swallowed the
    # space before the closing "==", so the rebuilt literal heading
    # ("== Habitat  ==") never matched the text and extraction silently
    # failed. Non-greedy capture + trailing \s* keeps the name clean.
    section_pattern = r"==\s*([^=]+?)\s*=="
    sections = re.findall(section_pattern, text)

    matching_sections = []
    for keyword in section_keywords:
        for section in sections:
            if keyword.lower() in section.lower():
                # Match the heading with any amount of surrounding whitespace.
                section_regex = r"==\s*" + re.escape(section) + r"\s*=="
                try:
                    start_match = re.search(section_regex, text)
                    if start_match:
                        start_pos = start_match.end()

                        # Section body runs until the next heading (or EOF).
                        next_section = re.search(r"==\s*[^=]+\s*==", text[start_pos:])
                        if next_section:
                            end_pos = start_pos + next_section.start()
                            section_text = text[start_pos:end_pos].strip()
                        else:
                            section_text = text[start_pos:].strip()

                        matching_sections.append(section_text)
                except Exception:
                    # A malformed heading should not abort the whole scan.
                    continue

    if matching_sections:
        return " ".join(matching_sections[:2])

    # Fallback: return any paragraph that mentions one of the keywords.
    paragraphs = text.split("\n\n")
    for keyword in section_keywords:
        for paragraph in paragraphs:
            if keyword.lower() in paragraph.lower():
                return paragraph

    return None
|
|
|
|
|
|
def get_species_images(species_name):
    """
    Get species images from Wikimedia Commons API with layered search
    strategies (exact file search, plain search, genus-level search,
    generic fallback) so the UI always has something to show.

    Returns a list of dicts with title/url/thumb_url/description/author/license.
    """
    url = "https://commons.wikimedia.org/w/api.php"

    def search_images(search_term, limit=10):
        # Query the Commons File: namespace (6) for image URLs and credits.
        params = {
            "action": "query",
            "format": "json",
            "generator": "search",
            "gsrnamespace": 6,
            "gsrsearch": search_term,
            "gsrlimit": limit,
            "prop": "imageinfo",
            "iiprop": "url|extmetadata",
            "iiurlwidth": 800,   # request 800px-wide thumbnails
        }

        try:
            # Fix: timeout added so a stalled API call cannot hang the UI.
            response = requests.get(url, params=params, timeout=10)
            data = response.json()

            pages = data.get("query", {}).get("pages", {})

            if not pages:
                return []

            images = []
            for page_id, page in pages.items():
                image_info = page.get("imageinfo", [{}])[0]

                metadata = image_info.get("extmetadata", {})
                description = metadata.get("ImageDescription", {}).get("value", "No description")
                author = metadata.get("Artist", {}).get("value", "Unknown")
                # Renamed from `license` to avoid shadowing the builtin.
                license_name = metadata.get("License", {}).get("value", "Unknown")

                # Skip non-image media (documents, audio, video).
                title = page.get("title", "").lower()
                if any(ext in title for ext in ['.pdf', '.svg', '.mp3', '.mp4', '.ogg', '.wav', '.webm']):
                    continue

                image = {
                    "title": page.get("title", "Unknown"),
                    "url": image_info.get("url", ""),
                    "thumb_url": image_info.get("thumburl", ""),
                    "description": description,
                    "author": author,
                    "license": license_name,
                }

                images.append(image)

            return images

        except Exception as e:
            return [{"error": str(e)}]

    # Strategy 1: explicit file-name search.
    images = search_images(f"file:{species_name}")

    # Strategy 2: plain text search.
    if not images:
        images = search_images(species_name)

    # Strategy 3: for binomial names, top up with genus-level images
    # (deduplicated by URL, capped at five total).
    if len(images) < 3:
        name_parts = species_name.split()

        if len(name_parts) == 2:
            genus_images = search_images(f"{name_parts[0]}")

            existing_urls = [img.get("url") for img in images]
            for img in genus_images:
                if img.get("url") not in existing_urls:
                    images.append(img)
                    existing_urls.append(img.get("url"))

                if len(images) >= 5:
                    break

    if images:
        return images

    # Last resort: generic nature imagery so the results are never empty.
    return search_images("species taxonomy nature")
|
|
|
|
|
|
def extract_classification(categories):
    """
    Extract classification information from Wikispecies category titles.

    Parameters:
        categories: list of category title strings, possibly prefixed with
            "Category:".

    Returns a dict mapping rank -> name; unresolved ranks stay "Unknown".
    Suffix heuristics may add extra keys ("subfamily", "suborder").
    """
    classification = {
        "kingdom": "Unknown",
        "phylum": "Unknown",
        "class": "Unknown",
        "order": "Unknown",
        "family": "Unknown",
        "genus": "Unknown",
        "species": "Unknown",
    }

    if not categories:
        return classification

    # Multilingual "<rank>:" / "<rank> " prefixes seen in category names.
    taxonomy_patterns = {
        "kingdom": ["kingdom:", "regnum:", "reino:", "regno:", "kingdom ", "regnum ", "reino ", "reino "],
        "phylum": ["phylum:", "division:", "división:", "divisio:", "phylum ", "division ", "división ", "divisio "],
        "class": ["class:", "clase:", "classis:", "class ", "clase ", "classis "],
        "order": ["order:", "orden:", "ordo:", "order ", "orden ", "ordo "],
        "family": ["family:", "familia:", "family ", "familia "],
        "genus": ["genus:", "género:", "genero:", "genus ", "género ", "genero "],
        "species": ["species:", "especie:", "specie:", "species ", "especie ", "specie "]
    }

    # Pass 1: explicit "<rank>: <name>" style categories.
    for category in categories:
        if category.startswith("Category:"):
            category = category[9:]

        category_lower = category.lower()

        for rank, patterns in taxonomy_patterns.items():
            for pattern in patterns:
                if pattern in category_lower:
                    parts = category_lower.split(pattern)
                    if len(parts) > 1:
                        # Bug fix: the text after the pattern can be empty
                        # (e.g. "Division "), which previously raised
                        # IndexError on the bare [0] index.
                        tokens = parts[1].strip().split()
                        if tokens:
                            classification[rank] = tokens[0].capitalize()
                    break

    # Pass 2: single-word categories whose Latin suffix implies a rank.
    for category in categories:
        if category.startswith("Category:"):
            category = category[9:]

        category_parts = category.split()

        if len(category_parts) == 1:
            name = category_parts[0]

            if name.endswith("idae"):
                classification["family"] = name
            elif name.endswith("inae"):
                classification["subfamily"] = name
            elif name.endswith("ales"):
                classification["order"] = name
            elif name.endswith("aceae"):
                classification["family"] = name
            elif name.endswith("ineae"):
                classification["suborder"] = name
            elif name.endswith("oideae"):
                classification["subfamily"] = name

    # Pass 3: loose "<rank><name>" matches, filling only still-unknown ranks.
    taxonomic_rank_names = ["kingdom", "phylum", "division", "class", "order", "family", "genus", "species"]
    for category in categories:
        if category.startswith("Category:"):
            category = category[9:]

        category_lower = category.lower()

        for rank in taxonomic_rank_names:
            if rank in category_lower:
                parts = category_lower.split(rank)
                if len(parts) > 1 and parts[1].strip():
                    value = parts[1].strip().split()[0].capitalize()
                    # Bug fix: "division" is not a key of the dict, so
                    # classification["division"] raised KeyError; it is the
                    # botanical synonym of phylum, so map it there.
                    target = "phylum" if rank == "division" else rank
                    if classification.get(target, "Unknown") == "Unknown":
                        classification[target] = value

    # Ensure every resolved value starts with a capital letter.
    for rank, value in classification.items():
        if value != "Unknown":
            classification[rank] = value[0].upper() + value[1:]

    return classification
|
|
|
|
|
|
def extract_habitat(description):
    """
    Extract habitat information from a description using tiered keyword
    matching with positional fallbacks.

    Returns one or two habitat sentences, "Unknown" when there is no usable
    description, or a generic fallback message when nothing matches.
    """
    # Bug fix: callers pass the placeholder "No description available."
    # (with a trailing period), which the old equality check never matched.
    if not description or description in ("No description available", "No description available."):
        return "Unknown"

    # Naive sentence split on terminator-plus-space.
    sentences = description.replace(". ", ".|").replace("! ", "!|").replace("? ", "?|").split("|")
    sentences = [s.strip() for s in sentences if s.strip()]

    # Tier 1: explicit habitat wording.
    habitat_keywords = [
        "habitat", "lives in", "found in", "native to", "occurs in", "distribution",
        "range includes", "ecosystem", "biome", "environment", "inhabits", "dwelling in",
        "endemic to", "natural range", "geographical range", "distributed across",
        "prefers", "thrives in", "flourishes in", "resides in", "habitat type",
        "commonly found", "typically found", "often found", "usually found", "primarily found"
    ]

    # Tier 2: climate / biome terms.
    climate_keywords = [
        "tropical", "temperate", "polar", "arctic", "antarctic", "desert",
        "rainforest", "forest", "jungle", "grassland", "savanna", "wetland",
        "marsh", "swamp", "mountain", "alpine", "coastal", "marine", "freshwater",
        "ocean", "sea", "river", "lake", "stream", "pond", "terrestrial", "aquatic",
        "woodland", "meadow", "tundra", "taiga", "steppe", "continent", "island",
        "shore", "beach", "reef", "cave", "burrow", "nest", "canopy", "undergrowth"
    ]

    # Tier 3: geographic regions.
    region_keywords = [
        "africa", "asia", "europe", "north america", "south america", "australia",
        "antarctica", "oceania", "mediterranean", "pacific", "atlantic", "indian ocean",
        "arctic ocean", "southern ocean", "northern", "southern", "eastern", "western",
        "central", "worldwide", "global", "cosmopolitan", "international"
    ]

    # Tier 4: movement / range behaviour verbs.
    action_keywords = [
        "migrate", "roam", "travel", "swim", "fly", "climb", "burrow", "dig", "nest",
        "breed", "forage", "hunt", "territory", "range"
    ]

    habitat_sentences = []

    for sentence in sentences:
        for keyword in habitat_keywords:
            if keyword.lower() in sentence.lower():
                habitat_sentences.append(sentence)
                break

    if not habitat_sentences:
        for sentence in sentences:
            for keyword in climate_keywords:
                if keyword.lower() in sentence.lower():
                    habitat_sentences.append(sentence)
                    break

    if not habitat_sentences:
        for sentence in sentences:
            for keyword in region_keywords:
                if keyword.lower() in sentence.lower():
                    habitat_sentences.append(sentence)
                    break

    if not habitat_sentences:
        for sentence in sentences:
            for keyword in action_keywords:
                if keyword.lower() in sentence.lower():
                    habitat_sentences.append(sentence)
                    break

    # Positional fallback: try the second sentence, then the first, provided
    # it is long enough to be informative.
    if not habitat_sentences and len(sentences) >= 2:
        if len(sentences) > 2:
            second_sentence = sentences[1]
            if len(second_sentence.split()) > 5:
                habitat_sentences.append(second_sentence)

        if not habitat_sentences:
            first_sentence = sentences[0]
            if len(first_sentence.split()) > 5:
                habitat_sentences.append(first_sentence)

    if habitat_sentences:
        # Return up to two matches, guaranteeing terminal punctuation.
        if len(habitat_sentences) > 1:
            combined = ". ".join(habitat_sentences[:2]).strip()
            if not combined.endswith(('.', '!', '?')):
                combined += '.'
            return combined

        single = habitat_sentences[0].strip()
        if not single.endswith(('.', '!', '?')):
            single += '.'
        return single

    return "Specific habitat information not available from Wikispecies. Try searching online for more details about this species' natural environment."
|
|
|
|
|
|
def extract_fun_facts(description):
    """
    Extract interesting fun facts from the description using keyword-based
    categorisation with a priority order and positional fallbacks.

    Returns between one and four sentence strings; never an empty list.
    """
    # Bug fix: callers pass the placeholder "No description available."
    # (with a trailing period), which the old equality check never matched.
    if not description or description in ("No description available", "No description available."):
        return ["No specific information available for this species in Wikispecies."]

    # Naive sentence split on terminator-plus-space.
    sentences = description.replace(". ", ".|").replace("! ", "!|").replace("? ", "?|").split("|")
    sentences = [s.strip() for s in sentences if s.strip()]

    # Very short descriptions are returned verbatim as the single fact.
    if len(sentences) == 1 and len(description) < 100:
        if not sentences[0].endswith(('.', '!', '?')):
            sentences[0] += '.'
        return [sentences[0]]

    interesting_keywords = [
        "interesting", "unique", "unusual", "remarkable", "notable", "surprising",
        "fascinating", "amazing", "extraordinary", "distinctive", "special", "rare",
        "strange", "curious", "unlike", "peculiar", "odd", "bizarre", "striking",
        "colorful", "beautiful", "impressive", "popular", "famous", "well-known",
        "largest", "smallest", "fastest", "slowest", "oldest", "youngest", "only",
        "record", "discovery", "first", "last", "origin", "discovered", "introduced",
        "revered", "sacred", "symbol", "iconic", "emblem", "represented", "mythology",
        "legend", "folklore", "traditional", "cultural", "significance", "historical"
    ]

    biology_keywords = [
        "lifespan", "longevity", "size", "weight", "height", "length", "wingspan",
        "color", "pattern", "marking", "appearance", "physical", "morphology", "anatomy",
        "feature", "characteristic", "distinctive", "body", "shape", "structure",
        "adaptation", "evolved", "evolution", "mutation", "gene", "genetic", "chromosome",
        "hybrid", "species", "subspecies", "variety", "breed", "strain", "extinct",
        "endangered", "threatened", "vulnerable", "conservation", "protected"
    ]

    behavior_keywords = [
        "diet", "eat", "feeding", "food", "prey", "predator", "hunt", "scavenge",
        "forage", "graze", "browse", "omnivore", "carnivore", "herbivore", "insectivore",
        "behavior", "behaviour", "habit", "activity", "social", "solitary", "group",
        "herd", "flock", "pack", "colony", "community", "family", "nocturnal", "diurnal",
        "crepuscular", "migrate", "migration", "hibernate", "hibernation", "estivate",
        "dormant", "sleep", "rest", "active", "territory", "defend", "aggressive",
        "docile", "tame", "wild", "domestic", "domesticated", "trained", "human"
    ]

    reproduction_keywords = [
        "reproduce", "reproduction", "breeding", "mate", "mating", "courtship", "display",
        "attract", "offspring", "young", "juvenile", "infant", "baby", "child", "adult",
        "egg", "spawn", "birth", "pregnant", "gestation", "incubation", "hatch", "nestling",
        "fledgling", "litter", "clutch", "brood", "parent", "care", "raise", "nurse", "wean"
    ]

    comparative_patterns = [
        "more than", "less than", "bigger than", "smaller than", "larger than",
        "faster than", "slower than", "better than", "worse than", "greater than",
        "unlike", "similar to", "compared to", "in contrast to", "differs from",
        "up to", "as many as", "can reach", "can grow", "can live", "known to",
        "capable of", "able to", "estimated", "approximately", "about", "around"
    ]

    fact_candidates = {
        "interesting": [],
        "biological": [],
        "behavioral": [],
        "reproductive": [],
        "comparative": [],
        "measurements": [],
        "general": []
    }

    # Bucket each sentence into the first matching category.
    for sentence in sentences:
        # Very short fragments are never good facts.
        if len(sentence.split()) < 4:
            continue

        categorized = False

        for keyword in interesting_keywords:
            if keyword.lower() in sentence.lower():
                fact_candidates["interesting"].append(sentence)
                categorized = True
                break

        if not categorized:
            for keyword in biology_keywords:
                if keyword.lower() in sentence.lower():
                    fact_candidates["biological"].append(sentence)
                    categorized = True
                    break

        if not categorized:
            for keyword in behavior_keywords:
                if keyword.lower() in sentence.lower():
                    fact_candidates["behavioral"].append(sentence)
                    categorized = True
                    break

        if not categorized:
            for keyword in reproduction_keywords:
                if keyword.lower() in sentence.lower():
                    fact_candidates["reproductive"].append(sentence)
                    categorized = True
                    break

        if not categorized:
            for pattern in comparative_patterns:
                if pattern.lower() in sentence.lower():
                    fact_candidates["comparative"].append(sentence)
                    categorized = True
                    break

        if not categorized:
            # Bug fix: the original contained a duplicated append/break block
            # here that added every digit-bearing sentence regardless of
            # whether a unit pattern matched (and could double-add it). The
            # net effect — any sentence containing a digit counts as a
            # measurement fact — is preserved with one guarded append.
            if any(c.isdigit() for c in sentence):
                fact_candidates["measurements"].append(sentence)
                categorized = True

        if not categorized and len(sentence.split()) > 5:
            fact_candidates["general"].append(sentence)

    selected_facts = []

    # Priority order for picking facts.
    categories = ["interesting", "measurements", "biological", "reproductive", "behavioral", "comparative", "general"]

    # One fact from each of the top three priority categories.
    for category in categories[:3]:
        if fact_candidates[category]:
            selected_facts.append(fact_candidates[category][0])
            fact_candidates[category].pop(0)

    remaining_slots = 4 - len(selected_facts)

    # Fill remaining slots, skipping near-duplicates (word overlap > 0.7).
    if remaining_slots > 0:
        for category in categories:
            if fact_candidates[category] and remaining_slots > 0:
                next_fact = fact_candidates[category][0]
                if not any(similarity_score(next_fact, fact) > 0.7 for fact in selected_facts):
                    selected_facts.append(next_fact)
                    remaining_slots -= 1
                    fact_candidates[category].pop(0)

    # Positional fallback when almost nothing matched.
    if len(selected_facts) < 2 and sentences:
        if sentences[0] not in selected_facts and len(sentences[0].split()) > 5:
            selected_facts.append(sentences[0])

        middle_idx = len(sentences) // 2
        if len(sentences) > middle_idx and sentences[middle_idx] not in selected_facts and len(sentences[middle_idx].split()) > 5:
            selected_facts.append(sentences[middle_idx])

    if not selected_facts:
        selected_facts = ["This species is documented in Wikispecies, the free species directory."]

    # Ensure every fact ends with terminal punctuation.
    for i in range(len(selected_facts)):
        if not selected_facts[i].endswith(('.', '!', '?')):
            selected_facts[i] += '.'

    # De-duplicate while preserving order; cap at four facts.
    unique_facts = []
    for fact in selected_facts:
        if fact not in unique_facts:
            unique_facts.append(fact)

    return unique_facts[:4]
|
|
|
|
|
|
def similarity_score(str1, str2):
    """
    Return a word-overlap (Jaccard) similarity between two strings.

    Used to avoid selecting near-duplicate facts: 0 means no shared
    words (or an empty input), 1 means identical word sets.
    """
    # An empty string can't meaningfully overlap with anything.
    if not (str1 and str2):
        return 0

    tokens_a = set(str1.lower().split())
    tokens_b = set(str2.lower().split())
    combined = tokens_a | tokens_b

    # Both strings were pure whitespace — nothing to compare.
    if not combined:
        return 0

    shared = tokens_a & tokens_b
    return len(shared) / len(combined)
|
|
|
|
|
|
def get_mock_species_from_filename(filename):
    """
    A mock function that simulates image recognition by looking at the filename.
    In a real application, this would be replaced with an actual image recognition API.

    Args:
        filename: Name of the uploaded image file.

    Returns:
        A species (or higher-taxon) name guessed from keywords found in the
        filename, or "Homo sapiens" when no keyword matches.
    """
    filename_lower = filename.lower()

    animal_keywords = {
        "cat": "Felis catus",
        "dog": "Canis familiaris",
        "bird": "Aves",
        "eagle": "Aquila chrysaetos",
        "lion": "Panthera leo",
        "tiger": "Panthera tigris",
        "bear": "Ursus arctos",
        "wolf": "Canis lupus",
        "fox": "Vulpes vulpes",
        "deer": "Cervidae",
        "elephant": "Loxodonta africana",
        "giraffe": "Giraffa camelopardalis",
        "zebra": "Equus quagga",
        "monkey": "Primates",
        "gorilla": "Gorilla gorilla",
        "fish": "Actinopterygii",
        "shark": "Selachimorpha",
        "dolphin": "Tursiops truncatus",
        "whale": "Cetacea",
        "snake": "Serpentes",
        "lizard": "Lacertilia",
        "turtle": "Testudines",
        "frog": "Anura",
        "butterfly": "Lepidoptera",
        "bee": "Apis mellifera",
    }

    plant_keywords = {
        "tree": "Arbor",
        "flower": "Anthophyta",
        "rose": "Rosa",
        "tulip": "Tulipa",
        "daisy": "Bellis perennis",
        "sunflower": "Helianthus annuus",
        "oak": "Quercus",
        "pine": "Pinus",
        "maple": "Acer",
        "fern": "Polypodiopsida",
        "moss": "Bryophyta",
        "grass": "Poaceae",
        "cactus": "Cactaceae",
        "palm": "Arecaceae",
        "orchid": "Orchidaceae",
    }

    # Animals take precedence over plants, matching the original lookup order.
    keyword_map = {**animal_keywords, **plant_keywords}

    # First pass: whole-word matches only. This fixes false positives such as
    # "cactus.jpg" being identified as Felis catus because "cat" is a
    # substring of "cactus" and animals were checked first.
    tokens = set(re.findall(r"[a-z]+", filename_lower))
    for keyword, species in keyword_map.items():
        if keyword in tokens:
            return species

    # Second pass: fall back to plain substring matching so filenames with no
    # separators (e.g. "mycat.jpg") still resolve as they did before.
    for keyword, species in keyword_map.items():
        if keyword in filename_lower:
            return species

    # No keyword recognized; default placeholder used by the demo.
    return "Homo sapiens"
|
|
|
|
|
|
def extract_wikipedia_classification(full_text, title, search_data=None):
    """
    Extract classification/taxonomy information from Wikipedia content.
    Uses various strategies including infobox parsing, section analysis, and text pattern matching.

    Args:
        full_text: The full text content of the Wikipedia page
        title: The title of the Wikipedia page
        search_data: Optional search data that might contain additional info
            (currently unused by this function)

    Returns:
        A dictionary with taxonomic ranks and their values
    """
    # Start with every rank unresolved; the strategies below fill in what
    # they can and leave the rest as "Unknown".
    classification = {
        "kingdom": "Unknown",
        "phylum": "Unknown",
        "class": "Unknown",
        "order": "Unknown",
        "family": "Unknown",
        "genus": "Unknown",
        "species": "Unknown"
    }

    if not full_text:
        return classification

    try:
        # Strategy 1: parse a dedicated taxonomy section, if the page has
        # one. (extract_wikipedia_section is defined elsewhere in this file;
        # presumably it returns the section body or a falsy value.)
        taxonomy_section = extract_wikipedia_section(full_text, ["Taxonomy", "Classification", "Taxonomic", "Scientific classification"])
        if taxonomy_section:
            classification = extract_taxonomy_from_text(taxonomy_section, classification)

        # Strategy 2: "Rank: Value" pairs as they appear in a flattened
        # species infobox, scanned over the whole page text.
        infobox_patterns = [
            r"Kingdom:\s*([A-Za-z]+)",
            r"Phylum:\s*([A-Za-z]+)",
            r"Class:\s*([A-Za-z]+)",
            r"Order:\s*([A-Za-z]+)",
            r"Family:\s*([A-Za-z]+)",
            r"Genus:\s*([A-Za-z]+)",
            r"Species:\s*([A-Za-z]+)"
        ]

        # Pattern order mirrors the key order of `classification`, so the
        # i-th pattern maps to the i-th rank.
        # NOTE(review): a match here overwrites any value already set by
        # strategy 1 — confirm this precedence is intended.
        for i, pattern in enumerate(infobox_patterns):
            rank = list(classification.keys())[i]
            matches = re.findall(pattern, full_text, re.IGNORECASE)
            if matches:
                classification[rank] = matches[0].strip()

        # Strategy 3: scan the lead paragraph (text before the first blank
        # line) for taxonomy phrasing; this only fills ranks still "Unknown".
        first_para = full_text.split('\n\n')[0] if '\n\n' in full_text else full_text
        classification = extract_taxonomy_from_text(first_para, classification)

        # Strategy 4: if the page title looks like a binomial name
        # ("Genus species": capitalized first word, lowercase second),
        # derive genus and species from it.
        title_parts = title.split()
        if len(title_parts) >= 2 and classification["genus"] == "Unknown":
            # Single-letter first words fail this check because ""[1:] is
            # not considered lowercase by str.islower().
            if title_parts[0][0].isupper() and title_parts[0][1:].islower() and title_parts[1].islower():
                classification["genus"] = title_parts[0]
                if classification["species"] == "Unknown":
                    classification["species"] = title_parts[1]

        # Strategy 5: "belongs to the kingdom X" / "is a member of the
        # family Y" style statements in the running text.
        taxonomy_statement_patterns = [
            r"(?:belongs|belonging)\s+to\s+(?:the)?\s+kingdom\s+([A-Za-z]+)",
            r"(?:belongs|belonging)\s+to\s+(?:the)?\s+phylum\s+([A-Za-z]+)",
            r"(?:belongs|belonging)\s+to\s+(?:the)?\s+class\s+([A-Za-z]+)",
            r"(?:belongs|belonging)\s+to\s+(?:the)?\s+order\s+([A-Za-z]+)",
            r"(?:belongs|belonging)\s+to\s+(?:the)?\s+family\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+kingdom\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+phylum\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+class\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+order\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+family\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+genus\s+([A-Za-z]+)"
        ]

        # Maps pattern index -> rank: indices 0-4 are the "belongs to"
        # variants, 5-10 the "is a member/species of" variants.
        rank_map = {
            0: "kingdom", 1: "phylum", 2: "class", 3: "order", 4: "family",
            5: "kingdom", 6: "phylum", 7: "class", 8: "order", 9: "family", 10: "genus"
        }

        for i, pattern in enumerate(taxonomy_statement_patterns):
            rank = rank_map.get(i)
            if not rank:
                continue

            # Unlike strategy 2, these matches never overwrite an
            # already-resolved rank.
            matches = re.findall(pattern, full_text, re.IGNORECASE)
            if matches and classification[rank] == "Unknown":
                classification[rank] = matches[0].strip()

        # Normalize: uppercase the first letter of every resolved value,
        # leaving the rest of the string untouched.
        for rank, value in classification.items():
            if value != "Unknown":
                classification[rank] = value[0].upper() + value[1:]

    except Exception as e:
        # Best-effort extraction: on any failure, log and return whatever
        # ranks were resolved before the error.
        print(f"Error extracting classification from Wikipedia: {str(e)}")

    return classification
|
|
|
|
|
|
def extract_taxonomy_from_text(text, classification):
    """
    Extract taxonomic information from text using pattern matching
    and natural language processing techniques.

    Only ranks still set to "Unknown" are updated; ranks resolved by an
    earlier strategy are left untouched. The dictionary is mutated in place
    and also returned.

    Args:
        text: The text to analyze
        classification: The current classification dictionary to update

    Returns:
        Updated classification dictionary
    """
    if not text:
        return classification

    try:
        # Explicit "Rank: Value" / "Rank Value" / "a member of the rank X"
        # phrasings. The \b after the rank word prevents substring false
        # positives (e.g. the word "classification" previously matched the
        # Class pattern and yielded class="Ification").
        # "species" is deliberately omitted: epithets are often abbreviated
        # ("P. leo"), so species is left to the caller's binomial-title
        # heuristic.
        taxonomy_patterns = {
            "kingdom": [r"Kingdom\b:?\s*([A-Za-z]+)", r"Kingdom\s+([A-Za-z]+)", r"a member of the kingdom\s+([A-Za-z]+)"],
            "phylum": [r"Phylum\b:?\s*([A-Za-z]+)", r"Phylum\s+([A-Za-z]+)", r"a member of the phylum\s+([A-Za-z]+)"],
            "class": [r"Class\b:?\s*([A-Za-z]+)", r"Class\s+([A-Za-z]+)", r"a member of the class\s+([A-Za-z]+)"],
            "order": [r"Order\b:?\s*([A-Za-z]+)", r"Order\s+([A-Za-z]+)", r"a member of the order\s+([A-Za-z]+)"],
            # family and genus were previously missing, which left those
            # ranks unextracted here and made the genus casing branch below
            # unreachable.
            "family": [r"Family\b:?\s*([A-Za-z]+)", r"Family\s+([A-Za-z]+)", r"a member of the family\s+([A-Za-z]+)"],
            "genus": [r"Genus\b:?\s*([A-Za-z]+)", r"Genus\s+([A-Za-z]+)", r"a member of the genus\s+([A-Za-z]+)"],
        }

        for rank, patterns in taxonomy_patterns.items():
            # Keep values resolved by earlier strategies.
            if classification.get(rank, "Unknown") != "Unknown":
                continue

            for pattern in patterns:
                matches = re.findall(pattern, text, re.IGNORECASE)
                if matches:
                    match = matches[0].strip()

                    # Genus/species names: capital initial, lowercase rest;
                    # other ranks get simple capitalization.
                    if rank in ["genus", "species"]:
                        match = match[0].upper() + match[1:].lower()
                    elif rank != "species":
                        match = match.capitalize()

                    classification[rank] = match
                    break

        # Fallback: infer ranks from conventional Latin suffixes
        # (-idae = animal family, -aceae = plant family, -ales = order, ...).
        # The -ia class pattern is very loose and can match ordinary words;
        # it only runs when nothing better was found.
        suffix_patterns = {
            "family": [r"\b([A-Za-z]+idae)\b", r"\b([A-Za-z]+aceae)\b"],
            "order": [r"\b([A-Za-z]+ales)\b", r"\b([A-Za-z]+ida)\b"],
            "class": [r"\b([A-Za-z]+ia)\b", r"\b([A-Za-z]+phyceae)\b"],
            "phylum": [r"\b([A-Za-z]+phyta)\b", r"\b([A-Za-z]+zoa)\b"]
        }

        for rank, patterns in suffix_patterns.items():
            if classification.get(rank, "Unknown") != "Unknown":
                continue

            for pattern in patterns:
                matches = re.findall(pattern, text)
                if matches:
                    classification[rank] = matches[0].strip()
                    break

    except Exception as e:
        # Best-effort: log and return whatever has been resolved so far.
        print(f"Error in extract_taxonomy_from_text: {str(e)}")

    return classification
|
|
|
|
|
|
# Script entry point: run the Streamlit app when executed directly.
# Fixed: the guard used single underscores (_name_/_main_), which raises
# NameError instead of launching the app.
if __name__ == "__main__":
    main()