# Species Information Finder — Streamlit app (Hugging Face Space)
| import streamlit as st | |
| import requests | |
| import os | |
| import re | |
| from PIL import Image | |
| import tempfile | |
| import os | |
| os.environ["STREAMLIT_HOME"] = "/tmp" | |
# File extensions accepted by the image-upload tab.
ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif'}

def allowed_file(filename):
    """Return True if *filename* has an extension listed in ALLOWED_EXTENSIONS."""
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in ALLOWED_EXTENSIONS
def main():
    """Render the Streamlit UI: a name-search tab and an image-upload tab."""
    st.set_page_config(page_title="Species Information Finder", layout="wide")
    st.title("Species Information Finder")
    st.write("Discover information about any species by name or by uploading an image.")

    # Two entry points into the same lookup pipeline.
    name_tab, image_tab = st.tabs(["Search by Name", "Search by Image"])

    with name_tab:
        st.header("Search by Species Name")
        query = st.text_input("Enter a species name (common or scientific):")
        if st.button("Search"):
            if query:
                with st.spinner("Searching for species information..."):
                    # Wikispecies/Wikipedia text data plus Commons images.
                    info = get_species_info(query)
                    pictures = get_species_images(query)
                    display_results(info, pictures)
            else:
                st.error("Please enter a species name")

    with image_tab:
        st.header("Search by Image Upload")
        upload = st.file_uploader("Upload an image of a species", type=ALLOWED_EXTENSIONS)
        if upload is not None:
            if not allowed_file(upload.name):
                st.error("File type not allowed. Please upload an image file (PNG, JPG, JPEG, GIF).")
            else:
                st.image(Image.open(upload), caption="Uploaded Image", use_column_width=True)
                if st.button("Identify Species"):
                    with st.spinner("Identifying species from image..."):
                        # In a real app an image-recognition API would be
                        # called here; the mock infers a name from the file name.
                        guessed = get_mock_species_from_filename(upload.name)
                        info = get_species_info(guessed)
                        pictures = get_species_images(guessed)
                        display_results(info, pictures)
def display_results(species_data, images):
    """Render the species record and its related images.

    Parameters
    ----------
    species_data : dict
        Result of get_species_info(); may contain an "error" key.
    images : list of dict
        Image records from get_species_images() (url/thumb_url/description/
        author/license keys).
    """
    if "error" in species_data:
        st.error(species_data["error"])
        return
    st.success(f"Found information for: {species_data['title']}")
    # Narrow classification column, wide description column.
    col1, col2 = st.columns([1, 2])
    with col1:
        st.subheader("Classification")
        classification = species_data.get("classification", {})
        for rank, value in classification.items():
            if value != "Unknown":
                # Bug fix: the opening "**" was missing, which rendered a
                # stray "**" instead of a bold rank label.
                st.write(f"**{rank.capitalize()}:** {value}")
        if species_data.get("habitat", "Unknown") != "Unknown":
            st.subheader("Habitat")
            st.write(species_data["habitat"])
    with col2:
        st.subheader("Description")
        st.write(species_data.get("description", "No description available."))
        if species_data.get("fun_facts"):
            st.subheader("Interesting Facts")
            for i, fact in enumerate(species_data["fun_facts"], 1):
                st.write(f"{i}. {fact}")
    if images:
        st.subheader("Related Images")
        # Show up to 4 images in a grid.
        cols = st.columns(min(4, len(images)))
        for idx, img in enumerate(images[:4]):
            with cols[idx]:
                # Bug fix: the record always has a "thumb_url" key (possibly
                # ""), so the old membership test could pass an empty URL to
                # st.image. Test truthiness and fall back to the full URL.
                if img.get("thumb_url"):
                    st.image(img["thumb_url"], caption=img.get("description", ""), use_column_width=True)
                else:
                    st.image(img.get("url", ""), caption=img.get("description", ""), use_column_width=True)
                st.caption(f"Credit: {img.get('author', 'Unknown')} | License: {img.get('license', 'Unknown')}")
    else:
        st.warning("No images found for this species.")
# Helper functions below: API access (Wikispecies, Wikipedia, Wikimedia
# Commons) and text-extraction utilities used by the UI above.
def get_species_info(species_name):
    """
    Combine Wikispecies and Wikipedia data for *species_name* into one record.

    Wikispecies is queried first; Wikipedia results then fill in or refine
    the description, habitat, classification and fun facts. The returned
    dict lists its contributing "data_sources" and, when both lookups fail,
    carries an "error" message.
    """
    # Start from a fully populated skeleton so callers can rely on every key.
    result = {
        "title": species_name,  # default to the search query
        "description": "No description available.",
        "categories": [],
        "links": [],
        "last_modified": "Unknown",
        "classification": {
            "kingdom": "Unknown",
            "phylum": "Unknown",
            "class": "Unknown",
            "order": "Unknown",
            "family": "Unknown",
            "genus": "Unknown",
            "species": "Unknown",
        },
        "habitat": "Unknown",
        "fun_facts": [],
        "data_sources": [],  # track where the data came from
    }

    wikispecies = get_wikispecies_data(species_name)
    if not wikispecies.get("error"):
        result.update(wikispecies)
        result["data_sources"].append("Wikispecies")

    wikipedia = get_wikipedia_data(species_name)
    if not wikipedia.get("error"):
        # Prefer Wikipedia's description when Wikispecies gave none or a stub.
        if result["description"] == "No description available." or len(result["description"]) < 50:
            result["description"] = wikipedia.get("description", result["description"])
        # Wikipedia habitat text is usually richer, so it always wins.
        result["habitat"] = wikipedia.get("habitat", result["habitat"])
        # Fold in Wikipedia's classification, preferring its known ranks.
        for rank, value in wikipedia.get("classification", {}).items():
            if value != "Unknown":
                result["classification"][rank] = value
        # Append new facts, skipping near-duplicates of ones we already have.
        if wikipedia.get("fun_facts"):
            facts = result.get("fun_facts", [])
            for candidate in wikipedia["fun_facts"]:
                if all(similarity_score(candidate, known) <= 0.7 for known in facts):
                    facts.append(candidate)
            result["fun_facts"] = facts[:4]  # cap at 4 facts
        result["data_sources"].append("Wikipedia")

    if not result["data_sources"]:
        result["error"] = "Species information not found in either Wikispecies or Wikipedia."
    return result
def get_wikispecies_data(species_name):
    """
    Query the Wikispecies API for *species_name*.

    Returns a dict with title, description, categories, links,
    last_modified, classification, habitat and fun_facts keys. On any
    failure the dict carries an "error" key describing the problem.
    """
    # Wikispecies API endpoint.
    url = "https://species.wikimedia.org/w/api.php"
    # Ask for intro text, categories, page info and links in one request.
    params = {
        "action": "query",
        "format": "json",
        "titles": species_name,
        "prop": "extracts|categories|info|links",
        "exintro": True,  # get only the intro section
        "explaintext": True,  # plain text, not HTML
        "cllimit": 50,  # get more categories
        "pllimit": 50,  # get more links
    }
    try:
        # Bug fix: requests.get() had no timeout, so a stalled connection
        # could hang the app indefinitely.
        response = requests.get(url, params=params, timeout=10)
        data = response.json()
        pages = data.get("query", {}).get("pages", {})
        if not pages:
            return {"error": "No data found in Wikispecies"}
        # Only one title was requested, hence only one page entry.
        page_id = next(iter(pages))
        page = pages[page_id]
        # Default information structure with placeholders.
        species_info = {
            "title": species_name,  # default to the search query
            "description": "No description available.",
            "categories": [],
            "links": [],
            "last_modified": "Unknown",
            "classification": {
                "kingdom": "Unknown",
                "phylum": "Unknown",
                "class": "Unknown",
                "order": "Unknown",
                "family": "Unknown",
                "genus": "Unknown",
                "species": "Unknown"
            },
            "habitat": "Unknown",
            "fun_facts": []
        }
        # A negative page id means the title does not exist.
        if int(page_id) < 0:
            species_info["error"] = "Species not found in Wikispecies. Try a different spelling or check for the scientific name."
            return species_info
        species_info["title"] = page.get("title", species_name)
        species_info["description"] = page.get("extract", "No description available.")
        if "categories" in page:
            species_info["categories"] = [cat.get("title") for cat in page.get("categories", [])]
        # Links can hint at related taxa (used by Strategy 3 below).
        if "links" in page:
            species_info["links"] = [link.get("title") for link in page.get("links", [])]
        species_info["last_modified"] = page.get("touched", "Unknown")
        # Normalise whitespace in the description (the module-level `re`
        # import is used; the redundant function-local import was removed).
        if species_info["description"]:
            species_info["description"] = species_info["description"].replace("\n", " ").strip()
            species_info["description"] = re.sub(r' +', ' ', species_info["description"])
        # Strategy 1: derive classification from the page categories.
        species_info["classification"] = extract_classification(species_info["categories"])
        # Strategy 2: a two-word title is likely a binomial (genus + species).
        title_parts = species_info.get("title", "").split()
        if len(title_parts) == 2:
            genus, species = title_parts
            classification = species_info.get("classification", {})
            if classification.get("genus") == "Unknown":
                classification["genus"] = genus
            if classification.get("species") == "Unknown":
                classification["species"] = species
            species_info["classification"] = classification
        # Strategy 3: infer ranks from taxonomic suffixes in linked titles.
        if species_info.get("links"):
            for link in species_info["links"]:
                link_parts = link.split()
                if len(link_parts) == 1:
                    if link.endswith("idae"):  # animal family suffix
                        species_info["classification"]["family"] = link
                    elif link.endswith("inae"):  # subfamily suffix
                        species_info["classification"]["subfamily"] = link
                    elif link.endswith("ales"):  # plant order suffix
                        species_info["classification"]["order"] = link
                    elif link.endswith("aceae"):  # plant family suffix
                        species_info["classification"]["family"] = link
        species_info["habitat"] = extract_habitat(species_info["description"])
        species_info["fun_facts"] = extract_fun_facts(species_info["description"])
        # If the description is missing or tiny, build one from what we know.
        if not species_info["description"] or len(species_info["description"]) < 20:
            classification = species_info["classification"]
            parts = []
            if classification["genus"] != "Unknown" and classification["species"] != "Unknown":
                parts.append(f"{species_info['title']} is a species in the genus {classification['genus']}.")
            if classification["family"] != "Unknown":
                parts.append(f"It belongs to the family {classification['family']}.")
            if classification["order"] != "Unknown":
                parts.append(f"It is classified under the order {classification['order']}.")
            if parts:
                species_info["description"] = " ".join(parts)
            else:
                species_info["description"] = f"{species_info['title']} is a species documented in Wikispecies, the free species directory."
        return species_info
    except Exception as e:
        return {
            "error": f"Error retrieving species information from Wikispecies: {e}",
            "title": species_name,
            "description": "No information available due to an error. Please try a different species name.",
            "classification": {"kingdom": "Unknown", "phylum": "Unknown", "class": "Unknown", "order": "Unknown", "family": "Unknown", "genus": "Unknown", "species": "Unknown"},
            "habitat": "Unknown",
            "fun_facts": []
        }
def get_wikipedia_data(species_name):
    """
    Query the English Wikipedia API for *species_name*, focusing on the
    description, habitat, classification and interesting facts.

    Returns a dict with title, description, habitat, fun_facts and
    classification keys, or a dict with an "error" key on failure.
    """
    # Wikipedia API endpoint.
    url = "https://en.wikipedia.org/w/api.php"
    # Step 1: search so misspellings/common names resolve to the right page.
    search_params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": species_name,
        "srlimit": 1,  # best match only
    }
    try:
        # Bug fix: both requests.get() calls lacked timeouts, so a stalled
        # connection could hang the app indefinitely.
        search_response = requests.get(url, params=search_params, timeout=10)
        search_data = search_response.json()
        search_results = search_data.get("query", {}).get("search", [])
        if not search_results:
            return {"error": "No matching Wikipedia page found for this species."}
        page_title = search_results[0].get("title")
        # Step 2: fetch the full plain-text content of that page.
        content_params = {
            "action": "query",
            "format": "json",
            "titles": page_title,
            "prop": "extracts|categories|sections",
            "exintro": False,  # full content, not just the intro
            "explaintext": True,  # plain text, not HTML
            "cllimit": 50,
        }
        content_response = requests.get(url, params=content_params, timeout=10)
        content_data = content_response.json()
        pages = content_data.get("query", {}).get("pages", {})
        if not pages:
            return {"error": "Failed to retrieve Wikipedia page content."}
        page_id = next(iter(pages))
        page = pages[page_id]
        # A negative page id means the page does not exist.
        if int(page_id) < 0:
            return {"error": "Wikipedia page not found."}
        species_info = {
            "title": page.get("title", species_name),
            "description": "",
            "habitat": "Unknown",
            "fun_facts": [],
            "classification": {
                "kingdom": "Unknown",
                "phylum": "Unknown",
                "class": "Unknown",
                "order": "Unknown",
                "family": "Unknown",
                "genus": "Unknown",
                "species": "Unknown"
            }
        }
        full_text = page.get("extract", "")
        if full_text:
            # Preserve paragraph breaks while collapsing single newlines.
            full_text = full_text.replace("\n\n", "||").replace("\n", " ").replace("||", "\n\n")
        sections = full_text.split("\n\n")
        if sections:
            # The lead section usually makes a good description.
            species_info["description"] = sections[0].strip()
        # Habitat: prefer a dedicated section, else mine the whole article.
        habitat_section = extract_wikipedia_section(full_text, ["Habitat", "Distribution", "Range", "Ecology", "Environment"])
        if habitat_section:
            species_info["habitat"] = habitat_section
        else:
            habitat = extract_habitat(full_text)
            if habitat != "Unknown":
                species_info["habitat"] = habitat
        # Fun facts: behaviour/biology sections first ...
        behavior_section = extract_wikipedia_section(full_text, ["Behavior", "Behaviour", "Life cycle", "Diet", "Feeding", "Reproduction", "Biology"])
        if behavior_section:
            facts = extract_fun_facts(behavior_section)
            if facts:
                species_info["fun_facts"].extend(facts)
        # ... then conservation sections if we still need more ...
        if len(species_info["fun_facts"]) < 2:
            conservation_section = extract_wikipedia_section(full_text, ["Conservation", "Status", "Threats", "Population"])
            if conservation_section:
                for fact in extract_fun_facts(conservation_section):
                    if fact not in species_info["fun_facts"]:
                        species_info["fun_facts"].append(fact)
        # ... finally the whole article text.
        if len(species_info["fun_facts"]) < 2:
            for fact in extract_fun_facts(full_text):
                if fact not in species_info["fun_facts"]:
                    species_info["fun_facts"].append(fact)
        species_info["fun_facts"] = species_info["fun_facts"][:4]  # cap at 4
        # Classification extracted from the article body/title.
        wiki_classification = extract_wikipedia_classification(full_text, page.get("title", ""), search_data)
        if wiki_classification:
            species_info["classification"] = wiki_classification
        return species_info
    except Exception as e:
        return {
            "error": f"Error retrieving information from Wikipedia: {e}",
            "title": species_name,
            "description": "No information available from Wikipedia due to an error.",
            "habitat": "Unknown",
            "fun_facts": []
        }
def extract_wikipedia_section(text, section_keywords):
    """
    Extract the content of Wikipedia section(s) whose heading contains one
    of *section_keywords* (case-insensitive).

    Returns up to two matching sections joined by a space, a keyword-matching
    paragraph as a fallback, or None when nothing matches.
    """
    if not text:
        return None
    # Headings look like "== Habitat ==" (spacing around the title varies).
    heading_pattern = r"==\s*([^=]+?)\s*=="
    headings = re.findall(heading_pattern, text)
    matching_sections = []
    for keyword in section_keywords:
        for heading in headings:
            if keyword.lower() not in heading.lower():
                continue
            # Bug fix: the heading was previously re-assembled as
            # f"== {heading} ==", which failed whenever the article used
            # different spacing (e.g. "==Habitat==") or the captured title
            # kept trailing whitespace. Match with flexible whitespace.
            heading_regex = r"==\s*" + re.escape(heading.strip()) + r"\s*=="
            try:
                start_match = re.search(heading_regex, text)
                if start_match:
                    start_pos = start_match.end()
                    # Section runs until the next heading or end of text.
                    next_heading = re.search(r"==\s*[^=]+\s*==", text[start_pos:])
                    if next_heading:
                        section_text = text[start_pos:start_pos + next_heading.start()].strip()
                    else:
                        section_text = text[start_pos:].strip()  # last section
                    matching_sections.append(section_text)
            except Exception:
                # Skip this heading if there's any error processing it.
                continue
    if matching_sections:
        # Limit to 2 sections for conciseness.
        return " ".join(matching_sections[:2])
    # Fallback: return the first paragraph mentioning any keyword.
    for keyword in section_keywords:
        for paragraph in text.split("\n\n"):
            if keyword.lower() in paragraph.lower():
                return paragraph
    return None
def get_species_images(species_name):
    """
    Fetch candidate images for *species_name* from Wikimedia Commons.

    Tries progressively broader searches: exact file-name search, plain
    search, genus-only search for binomial names, and finally a generic
    nature search. Returns a list of image dicts (possibly empty).
    """
    # Wikimedia Commons API endpoint.
    url = "https://commons.wikimedia.org/w/api.php"

    def search_images(search_term, limit=10):
        """Run one Commons search and normalise the hits into image dicts."""
        params = {
            "action": "query",
            "format": "json",
            "generator": "search",
            "gsrnamespace": 6,  # File namespace
            "gsrsearch": search_term,
            "gsrlimit": limit,
            "prop": "imageinfo",
            "iiprop": "url|extmetadata",
            "iiurlwidth": 800,  # thumbnail width
        }
        try:
            # Bug fix: added a timeout so a stalled connection cannot hang.
            response = requests.get(url, params=params, timeout=10)
            data = response.json()
            pages = data.get("query", {}).get("pages", {})
            if not pages:
                return []
            images = []
            for page_id, page in pages.items():
                image_info = page.get("imageinfo", [{}])[0]
                metadata = image_info.get("extmetadata", {})
                description = metadata.get("ImageDescription", {}).get("value", "No description")
                author = metadata.get("Artist", {}).get("value", "Unknown")
                license = metadata.get("License", {}).get("value", "Unknown")
                # Skip non-image files (pdfs, audio, video, ...).
                title = page.get("title", "").lower()
                if any(ext in title for ext in ['.pdf', '.svg', '.mp3', '.mp4', '.ogg', '.wav', '.webm']):
                    continue
                images.append({
                    "title": page.get("title", "Unknown"),
                    "url": image_info.get("url", ""),
                    "thumb_url": image_info.get("thumburl", ""),
                    "description": description,
                    "author": author,
                    "license": license,
                })
            return images
        except Exception:
            # Bug fix: previously returned [{"error": ...}]; that fake record
            # reached the UI as an "image" and crashed on the missing "url"
            # key. An empty list lets the fallback strategies run instead.
            return []

    # STRATEGY 1: exact file-name search.
    images = search_images(f"file:{species_name}")
    # STRATEGY 2: plain search without the file: prefix.
    if not images:
        images = search_images(species_name)
    # STRATEGY 3: for binomial names, pad with genus-only results.
    if len(images) < 3:
        name_parts = species_name.split()
        if len(name_parts) == 2:
            genus_images = search_images(f"{name_parts[0]}")
            # Add only images we do not already have.
            existing_urls = [img.get("url") for img in images]
            for img in genus_images:
                if img.get("url") not in existing_urls:
                    images.append(img)
                    existing_urls.append(img.get("url"))
                # Stop once we have enough images.
                if len(images) >= 5:
                    break
    if images:
        return images
    # STRATEGY 4: last resort — a very general search.
    # This could be improved by using the taxonomy info.
    return search_images("species taxonomy nature")
def extract_classification(categories):
    """
    Derive a taxonomic classification dict from Wikispecies category names.

    Three strategies are applied in order: explicit "rank: value" patterns,
    taxonomic name suffixes (-idae, -aceae, ...), and bare rank-name
    mentions. Ranks that cannot be resolved stay "Unknown".
    """
    classification = {
        "kingdom": "Unknown",
        "phylum": "Unknown",
        "class": "Unknown",
        "order": "Unknown",
        "family": "Unknown",
        "genus": "Unknown",
        "species": "Unknown",
    }
    # Skip empty/missing categories.
    if not categories:
        return classification

    # Multilingual "rank:"/"rank " patterns (English/Spanish/Latin).
    taxonomy_patterns = {
        "kingdom": ["kingdom:", "regnum:", "reino:", "regno:", "kingdom ", "regnum ", "reino "],
        "phylum": ["phylum:", "division:", "división:", "divisio:", "phylum ", "division ", "división ", "divisio "],
        "class": ["class:", "clase:", "classis:", "class ", "clase ", "classis "],
        "order": ["order:", "orden:", "ordo:", "order ", "orden ", "ordo "],
        "family": ["family:", "familia:", "family ", "familia "],
        "genus": ["genus:", "género:", "genero:", "genus ", "género ", "genero "],
        "species": ["species:", "especie:", "specie:", "species ", "especie ", "specie "]
    }

    # STRATEGY 1: direct "pattern value" matches in category names.
    for category in categories:
        if category.startswith("Category:"):
            category = category[9:]
        category_lower = category.lower()
        for rank, patterns in taxonomy_patterns.items():
            for pattern in patterns:
                if pattern in category_lower:
                    parts = category_lower.split(pattern)
                    # Bug fix: guard against an empty remainder (a category
                    # ending exactly with the pattern), which previously
                    # raised IndexError on .strip().split()[0].
                    if len(parts) > 1 and parts[1].strip():
                        value = parts[1].strip().split()[0].capitalize()
                        classification[rank] = value
                    break

    # STRATEGY 2: single-word categories with taxonomic suffixes.
    for category in categories:
        if category.startswith("Category:"):
            category = category[9:]
        category_parts = category.split()
        if len(category_parts) == 1:
            name = category_parts[0]
            if name.endswith("idae"):  # animal family suffix
                classification["family"] = name
            elif name.endswith("inae"):  # subfamily suffix
                classification["subfamily"] = name
            elif name.endswith("ales"):  # plant order suffix
                classification["order"] = name
            elif name.endswith("aceae"):  # plant family suffix
                classification["family"] = name
            elif name.endswith("ineae"):  # plant suborder suffix
                classification["suborder"] = name
            elif name.endswith("oideae"):  # plant subfamily suffix
                classification["subfamily"] = name

    # STRATEGY 3: bare rank names followed by a value.
    taxonomic_rank_names = ["kingdom", "phylum", "division", "class", "order", "family", "genus", "species"]
    for category in categories:
        if category.startswith("Category:"):
            category = category[9:]
        category_lower = category.lower()
        for rank in taxonomic_rank_names:
            if rank in category_lower:
                parts = category_lower.split(rank)
                if len(parts) > 1 and parts[1].strip():
                    value = parts[1].strip().split()[0].capitalize()
                    # Bug fix: "division" is not a pre-seeded key, so the old
                    # classification[rank] lookup raised KeyError; .get()
                    # fills only ranks that are still unknown (or new).
                    if classification.get(rank, "Unknown") == "Unknown":
                        classification[rank] = value

    # Final cleanup: normalise capitalisation of every resolved value.
    for rank, value in classification.items():
        if value != "Unknown":
            classification[rank] = value[0].upper() + value[1:]
    return classification
def extract_habitat(description):
    """
    Extract habitat information from *description*.

    Four keyword families are tried in priority order (explicit habitat
    phrases, climate/geography terms, regions, movement verbs). If none
    match, an early descriptive sentence is used; otherwise a generic
    message is returned.
    """
    # Bug fix: the original guard compared against "No description available"
    # (no trailing period) and so never matched the module's actual default
    # string "No description available.".
    if not description or description in ("No description available", "No description available."):
        return "Unknown"

    # Crude sentence splitter: mark sentence boundaries, then split.
    sentences = description.replace(". ", ".|").replace("! ", "!|").replace("? ", "?|").split("|")
    sentences = [s.strip() for s in sentences if s.strip()]

    # STRATEGY 1: direct habitat statements.
    habitat_keywords = [
        "habitat", "lives in", "found in", "native to", "occurs in", "distribution",
        "range includes", "ecosystem", "biome", "environment", "inhabits", "dwelling in",
        "endemic to", "natural range", "geographical range", "distributed across",
        "prefers", "thrives in", "flourishes in", "resides in", "habitat type",
        "commonly found", "typically found", "often found", "usually found", "primarily found"
    ]
    # STRATEGY 2: climate and geography context.
    climate_keywords = [
        "tropical", "temperate", "polar", "arctic", "antarctic", "desert",
        "rainforest", "forest", "jungle", "grassland", "savanna", "wetland",
        "marsh", "swamp", "mountain", "alpine", "coastal", "marine", "freshwater",
        "ocean", "sea", "river", "lake", "stream", "pond", "terrestrial", "aquatic",
        "woodland", "meadow", "tundra", "taiga", "steppe", "continent", "island",
        "shore", "beach", "reef", "cave", "burrow", "nest", "canopy", "undergrowth"
    ]
    # STRATEGY 3: regional indicators (continents, regions, oceans).
    region_keywords = [
        "africa", "asia", "europe", "north america", "south america", "australia",
        "antarctica", "oceania", "mediterranean", "pacific", "atlantic", "indian ocean",
        "arctic ocean", "southern ocean", "northern", "southern", "eastern", "western",
        "central", "worldwide", "global", "cosmopolitan", "international"
    ]
    # STRATEGY 4: verbs indicating location or movement patterns.
    action_keywords = [
        "migrate", "roam", "travel", "swim", "fly", "climb", "burrow", "dig", "nest",
        "breed", "forage", "hunt", "territory", "range"
    ]

    habitat_sentences = []
    # Try each keyword family in priority order; stop at the first family
    # that yields any sentence (same cascade as the original strategies 1-4).
    for keywords in (habitat_keywords, climate_keywords, region_keywords, action_keywords):
        for sentence in sentences:
            if any(keyword.lower() in sentence.lower() for keyword in keywords):
                habitat_sentences.append(sentence)
        if habitat_sentences:
            break

    # Fallback: early sentences often say where the species lives.
    if not habitat_sentences and len(sentences) >= 2:
        # Prefer the second sentence (the first is often just a definition),
        # but only if it is long enough to be informative.
        if len(sentences) > 2 and len(sentences[1].split()) > 5:
            habitat_sentences.append(sentences[1])
        if not habitat_sentences and len(sentences[0].split()) > 5:
            habitat_sentences.append(sentences[0])

    if habitat_sentences:
        # Join at most two sentences for conciseness.
        if len(habitat_sentences) > 1:
            combined = ". ".join(habitat_sentences[:2]).strip()
            if not combined.endswith(('.', '!', '?')):
                combined += '.'
            return combined
        single = habitat_sentences[0].strip()
        if not single.endswith(('.', '!', '?')):
            single += '.'
        return single

    # Last resort: generic message when no specific habitat info was found.
    return "Specific habitat information not available from Wikispecies. Try searching online for more details about this species' natural environment."
def extract_fun_facts(description):
    """
    Extract interesting fun facts from the description using keyword-based identification,
    with improved pattern recognition and a structured approach to generate fun facts
    even with limited information.

    Each sentence is assigned to the first matching category (interesting >
    biological > behavioral > reproductive > comparative > measurements >
    general), then up to 4 diverse facts are selected across categories.

    Args:
        description: Free-text species description (e.g. from Wikispecies).

    Returns:
        A non-empty list of 1-4 fact sentences, each ending with punctuation.
        A generic placeholder is returned when no usable text is available.
    """
    if not description or description == "No description available":
        return ["No specific information available for this species in Wikispecies."]
    # Split the description into sentences; the '|' placeholder lets us split
    # on '.', '!' and '?' sentence boundaries in a single pass.
    sentences = description.replace(". ", ".|").replace("! ", "!|").replace("? ", "?|").split("|")
    sentences = [s.strip() for s in sentences if s.strip()]
    # If the description is too short, include it as a single fact
    if len(sentences) == 1 and len(description) < 100:
        if not sentences[0].endswith(('.', '!', '?')):
            sentences[0] += '.'
        return [sentences[0]]
    # STRATEGY 1: Identify sentences with interesting keywords
    interesting_keywords = [
        "interesting", "unique", "unusual", "remarkable", "notable", "surprising",
        "fascinating", "amazing", "extraordinary", "distinctive", "special", "rare",
        "strange", "curious", "unlike", "peculiar", "odd", "bizarre", "striking",
        "colorful", "beautiful", "impressive", "popular", "famous", "well-known",
        "largest", "smallest", "fastest", "slowest", "oldest", "youngest", "only",
        "record", "discovery", "first", "last", "origin", "discovered", "introduced",
        "revered", "sacred", "symbol", "iconic", "emblem", "represented", "mythology",
        "legend", "folklore", "traditional", "cultural", "significance", "historical"
    ]
    # STRATEGY 2: Physical characteristics and biology often make good facts
    biology_keywords = [
        "lifespan", "longevity", "size", "weight", "height", "length", "wingspan",
        "color", "pattern", "marking", "appearance", "physical", "morphology", "anatomy",
        "feature", "characteristic", "distinctive", "body", "shape", "structure",
        "adaptation", "evolved", "evolution", "mutation", "gene", "genetic", "chromosome",
        "hybrid", "species", "subspecies", "variety", "breed", "strain", "extinct",
        "endangered", "threatened", "vulnerable", "conservation", "protected"
    ]
    # STRATEGY 3: Behavior and lifestyle information
    behavior_keywords = [
        "diet", "eat", "feeding", "food", "prey", "predator", "hunt", "scavenge",
        "forage", "graze", "browse", "omnivore", "carnivore", "herbivore", "insectivore",
        "behavior", "behaviour", "habit", "activity", "social", "solitary", "group",
        "herd", "flock", "pack", "colony", "community", "family", "nocturnal", "diurnal",
        "crepuscular", "migrate", "migration", "hibernate", "hibernation", "estivate",
        "dormant", "sleep", "rest", "active", "territory", "defend", "aggressive",
        "docile", "tame", "wild", "domestic", "domesticated", "trained", "human"
    ]
    # STRATEGY 4: Reproduction is always interesting
    reproduction_keywords = [
        "reproduce", "reproduction", "breeding", "mate", "mating", "courtship", "display",
        "attract", "offspring", "young", "juvenile", "infant", "baby", "child", "adult",
        "egg", "spawn", "birth", "pregnant", "gestation", "incubation", "hatch", "nestling",
        "fledgling", "litter", "clutch", "brood", "parent", "care", "raise", "nurse", "wean"
    ]
    # Comparative patterns that often indicate interesting facts
    comparative_patterns = [
        "more than", "less than", "bigger than", "smaller than", "larger than",
        "faster than", "slower than", "better than", "worse than", "greater than",
        "unlike", "similar to", "compared to", "in contrast to", "differs from",
        "up to", "as many as", "can reach", "can grow", "can live", "known to",
        "capable of", "able to", "estimated", "approximately", "about", "around"
    ]
    # Candidate facts bucketed by the strategy that matched them.
    fact_candidates = {
        "interesting": [],
        "biological": [],
        "behavioral": [],
        "reproductive": [],
        "comparative": [],
        "measurements": [],
        "general": []
    }
    # Apply strategies to collect potential facts
    for sentence in sentences:
        # Skip very short sentences
        if len(sentence.split()) < 4:
            continue
        sentence_lower = sentence.lower()  # hoisted: reused by every strategy below
        # Flag to track if the sentence has been categorized
        categorized = False
        # Strategy 1: Interesting keywords
        for keyword in interesting_keywords:
            if keyword.lower() in sentence_lower:
                fact_candidates["interesting"].append(sentence)
                categorized = True
                break
        if not categorized:
            # Strategy 2: Biological characteristics
            for keyword in biology_keywords:
                if keyword.lower() in sentence_lower:
                    fact_candidates["biological"].append(sentence)
                    categorized = True
                    break
        if not categorized:
            # Strategy 3: Behavior keywords
            for keyword in behavior_keywords:
                if keyword.lower() in sentence_lower:
                    fact_candidates["behavioral"].append(sentence)
                    categorized = True
                    break
        if not categorized:
            # Strategy 4: Reproduction keywords
            for keyword in reproduction_keywords:
                if keyword.lower() in sentence_lower:
                    fact_candidates["reproductive"].append(sentence)
                    categorized = True
                    break
        if not categorized:
            # Check for comparative patterns
            for pattern in comparative_patterns:
                if pattern.lower() in sentence_lower:
                    fact_candidates["comparative"].append(sentence)
                    categorized = True
                    break
        if not categorized:
            # Sentences containing digits usually carry measurements or
            # statistics, which make good facts.
            # BUG FIX: the original appended such sentences twice (once when a
            # unit keyword matched, then again unconditionally) and executed a
            # stray `break` that aborted the *sentence* loop, silently
            # discarding every remaining sentence. Append once and keep
            # iterating instead.
            if any(ch.isdigit() for ch in sentence):
                fact_candidates["measurements"].append(sentence)
                categorized = True
        # If sentence wasn't categorized by any specific strategy, add to general
        if not categorized and len(sentence.split()) > 5:
            fact_candidates["general"].append(sentence)
    # Select facts from each category to ensure diversity (prioritizing the most interesting ones)
    selected_facts = []
    # Priority order for fact selection
    categories = ["interesting", "measurements", "biological", "reproductive", "behavioral", "comparative", "general"]
    # First, try to get at least one fact from high-priority categories
    for category in categories[:3]:  # First 3 are highest priority
        if fact_candidates[category]:
            # pop(0) both selects and removes the used fact
            selected_facts.append(fact_candidates[category].pop(0))
    # Now fill remaining slots with a mix of all categories
    remaining_slots = 4 - len(selected_facts)  # Maximum 4 facts total
    if remaining_slots > 0:
        for category in categories:
            if fact_candidates[category] and remaining_slots > 0:
                next_fact = fact_candidates[category][0]
                # Only add if not too similar to already selected facts
                if not any(similarity_score(next_fact, fact) > 0.7 for fact in selected_facts):
                    selected_facts.append(next_fact)
                    remaining_slots -= 1
                    fact_candidates[category].pop(0)  # Remove the used fact
    # If we still don't have enough facts, add more from general pool
    if len(selected_facts) < 2 and sentences:
        # Add the first sentence if it's not already included
        if sentences[0] not in selected_facts and len(sentences[0].split()) > 5:
            selected_facts.append(sentences[0])
        # Add another sentence from middle of the text if available
        middle_idx = len(sentences) // 2
        if len(sentences) > middle_idx and sentences[middle_idx] not in selected_facts and len(sentences[middle_idx].split()) > 5:
            selected_facts.append(sentences[middle_idx])
    # Last resort: if still no facts, create a generic fact
    if not selected_facts:
        selected_facts = ["This species is documented in Wikispecies, the free species directory."]
    # Ensure all facts end with proper punctuation
    for i in range(len(selected_facts)):
        if not selected_facts[i].endswith(('.', '!', '?')):
            selected_facts[i] += '.'
    # Remove duplicates while preserving order
    unique_facts = []
    for fact in selected_facts:
        if fact not in unique_facts:
            unique_facts.append(fact)
    return unique_facts[:4]  # Limit to max 4 facts
def similarity_score(str1, str2):
    """
    Jaccard word-overlap similarity between two strings.

    Both strings are lowercased and tokenized on whitespace; the score is
    |intersection| / |union| of the resulting word sets, ranging from 0
    (no shared words) to 1 (identical word sets). Used to avoid selecting
    near-duplicate fact sentences.
    """
    # Empty or None input cannot be similar to anything.
    if not str1 or not str2:
        return 0
    tokens_a = set(str1.lower().split())
    tokens_b = set(str2.lower().split())
    shared = tokens_a & tokens_b
    combined = tokens_a | tokens_b
    # Guard against whitespace-only inputs, which tokenize to empty sets.
    return len(shared) / len(combined) if combined else 0
def get_mock_species_from_filename(filename):
    """
    A mock function that simulates image recognition by looking at the filename.
    In a real application, this would be replaced with an actual image recognition API.

    Performs a case-insensitive substring match of known animal keywords
    (checked first) and then plant keywords against the filename, returning
    the associated scientific name, or "Homo sapiens" when nothing matches.
    """
    needle = filename.lower()
    # List of common animals and their possible filenames
    animal_keywords = {
        "cat": "Felis catus",
        "dog": "Canis familiaris",
        "bird": "Aves",
        "eagle": "Aquila chrysaetos",
        "lion": "Panthera leo",
        "tiger": "Panthera tigris",
        "bear": "Ursus arctos",
        "wolf": "Canis lupus",
        "fox": "Vulpes vulpes",
        "deer": "Cervidae",
        "elephant": "Loxodonta africana",
        "giraffe": "Giraffa camelopardalis",
        "zebra": "Equus quagga",
        "monkey": "Primates",
        "gorilla": "Gorilla gorilla",
        "fish": "Actinopterygii",
        "shark": "Selachimorpha",
        "dolphin": "Tursiops truncatus",
        "whale": "Cetacea",
        "snake": "Serpentes",
        "lizard": "Lacertilia",
        "turtle": "Testudines",
        "frog": "Anura",
        "butterfly": "Lepidoptera",
        "bee": "Apis mellifera",
    }
    # List of common plants and their possible filenames
    plant_keywords = {
        "tree": "Arbor",
        "flower": "Anthophyta",
        "rose": "Rosa",
        "tulip": "Tulipa",
        "daisy": "Bellis perennis",
        "sunflower": "Helianthus annuus",
        "oak": "Quercus",
        "pine": "Pinus",
        "maple": "Acer",
        "fern": "Polypodiopsida",
        "moss": "Bryophyta",
        "grass": "Poaceae",
        "cactus": "Cactaceae",
        "palm": "Arecaceae",
        "orchid": "Orchidaceae",
    }
    # Animals are checked before plants, preserving the original lookup order;
    # within each table the insertion order above decides ties.
    for mapping in (animal_keywords, plant_keywords):
        for keyword, species in mapping.items():
            if keyword in needle:
                return species
    # If no match is found, return a default species
    return "Homo sapiens"
def extract_wikipedia_classification(full_text, title, search_data=None):
    """
    Extract classification/taxonomy information from Wikipedia content.
    Uses various strategies including infobox parsing, section analysis, and text pattern matching.

    Five strategies run in order:
      1. Parse a dedicated "Taxonomy"/"Classification" section.
      2. Match infobox-style "Rank: Name" lines anywhere in the text.
      3. Re-parse the first paragraph for taxonomic statements.
      4. Treat a binomial-looking title (e.g. "Panthera leo") as genus + species.
      5. Match prose statements like "belongs to the family Felidae".

    Args:
        full_text: The full text content of the Wikipedia page
        title: The title of the Wikipedia page
        search_data: Optional search data that might contain additional info
            (currently unused by this function)
    Returns:
        A dictionary with taxonomic ranks and their values ("Unknown" where
        nothing could be extracted)
    """
    # Initialize with default "Unknown" values.
    # NOTE: insertion order of these keys matters — strategy 2 below maps
    # pattern index i to the i-th key of this dict.
    classification = {
        "kingdom": "Unknown",
        "phylum": "Unknown",
        "class": "Unknown",
        "order": "Unknown",
        "family": "Unknown",
        "genus": "Unknown",
        "species": "Unknown"
    }
    # No text means nothing to parse; return the all-Unknown template.
    if not full_text:
        return classification
    try:
        # STRATEGY 1: Look for taxonomic information in specific sections
        taxonomy_section = extract_wikipedia_section(full_text, ["Taxonomy", "Classification", "Taxonomic", "Scientific classification"])
        if taxonomy_section:
            # Extract taxonomic information from the section
            classification = extract_taxonomy_from_text(taxonomy_section, classification)
        # STRATEGY 2: Look for taxonomic information in infobox-like structures
        # Wikipedia infoboxes often appear at the beginning of the text with structured format.
        # Pattern order must mirror the key order of `classification` above.
        infobox_patterns = [
            r"Kingdom:\s*([A-Za-z]+)",
            r"Phylum:\s*([A-Za-z]+)",
            r"Class:\s*([A-Za-z]+)",
            r"Order:\s*([A-Za-z]+)",
            r"Family:\s*([A-Za-z]+)",
            r"Genus:\s*([A-Za-z]+)",
            r"Species:\s*([A-Za-z]+)"
        ]
        # Apply each pattern to extract taxonomic information.
        # NOTE(review): unlike strategies 3 and 5, this pass has no "Unknown"
        # guard, so an infobox match OVERWRITES values found by strategy 1 —
        # confirm this precedence is intended.
        for i, pattern in enumerate(infobox_patterns):
            # Relies on dict insertion order (guaranteed in Python 3.7+).
            rank = list(classification.keys())[i]
            matches = re.findall(pattern, full_text, re.IGNORECASE)
            if matches:
                classification[rank] = matches[0].strip()
        # STRATEGY 3: Parse the first paragraph for taxonomic information
        # First paragraphs in Wikipedia often contain taxonomic statements
        first_para = full_text.split('\n\n')[0] if '\n\n' in full_text else full_text
        classification = extract_taxonomy_from_text(first_para, classification)
        # STRATEGY 4: Try to extract genus and species from the title
        title_parts = title.split()
        if len(title_parts) >= 2 and classification["genus"] == "Unknown":
            # If title looks like a binomial name (e.g., "Panthera leo"):
            # capitalized first word, all-lowercase second word.
            if title_parts[0][0].isupper() and title_parts[0][1:].islower() and title_parts[1].islower():
                classification["genus"] = title_parts[0]
                if classification["species"] == "Unknown":
                    classification["species"] = title_parts[1]
        # STRATEGY 5: Look for taxonomic statements throughout the text
        # These patterns match statements like "belongs to the family Felidae"
        taxonomy_statement_patterns = [
            r"(?:belongs|belonging)\s+to\s+(?:the)?\s+kingdom\s+([A-Za-z]+)",
            r"(?:belongs|belonging)\s+to\s+(?:the)?\s+phylum\s+([A-Za-z]+)",
            r"(?:belongs|belonging)\s+to\s+(?:the)?\s+class\s+([A-Za-z]+)",
            r"(?:belongs|belonging)\s+to\s+(?:the)?\s+order\s+([A-Za-z]+)",
            r"(?:belongs|belonging)\s+to\s+(?:the)?\s+family\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+kingdom\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+phylum\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+class\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+order\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+family\s+([A-Za-z]+)",
            r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the)?\s+genus\s+([A-Za-z]+)"
        ]
        # Map pattern index -> taxonomic rank (two pattern families cover the
        # same five ranks, plus one genus pattern at index 10).
        rank_map = {
            0: "kingdom", 1: "phylum", 2: "class", 3: "order", 4: "family",
            5: "kingdom", 6: "phylum", 7: "class", 8: "order", 9: "family", 10: "genus"
        }
        # Apply statement patterns to extract taxonomic information;
        # only fills ranks that are still "Unknown" (no overwriting here).
        for i, pattern in enumerate(taxonomy_statement_patterns):
            rank = rank_map.get(i)
            if not rank:
                continue
            matches = re.findall(pattern, full_text, re.IGNORECASE)
            if matches and classification[rank] == "Unknown":
                classification[rank] = matches[0].strip()
        # Final cleanup: ensure proper capitalization and formatting
        for rank, value in classification.items():
            if value != "Unknown":
                # Capitalize first letter for taxonomic ranks
                classification[rank] = value[0].upper() + value[1:]
    except Exception as e:
        print(f"Error extracting classification from Wikipedia: {str(e)}")
        # If an error occurs, we'll return the classification with whatever data we managed to extract
    return classification
def extract_taxonomy_from_text(text, classification):
    """
    Extract taxonomic information from text using pattern matching.

    Only ranks still set to "Unknown" in *classification* are filled in;
    values found earlier are never overwritten. Two passes are made:

    1. Explicit rank statements ("Kingdom: Animalia", "a member of the
       order Carnivora") for kingdom/phylum/class/order (case-insensitive).
    2. Latin-suffix heuristics (e.g. "-idae"/"-aceae" for families,
       "-ales"/"-ida" for orders) for family/order/class/phylum
       (case-sensitive, values stored exactly as matched).

    Args:
        text: The text to analyze
        classification: The current classification dictionary to update
            (mutated in place)
    Returns:
        Updated classification dictionary
    """
    if not text:
        return classification
    try:
        # Pass 1: explicit rank statements. (Genus/species have no patterns
        # here; they are handled by the title heuristic in the caller.)
        taxonomy_patterns = {
            "kingdom": [r"Kingdom:?\s*([A-Za-z]+)", r"Kingdom\s+([A-Za-z]+)", r"a member of the kingdom\s+([A-Za-z]+)"],
            "phylum": [r"Phylum:?\s*([A-Za-z]+)", r"Phylum\s+([A-Za-z]+)", r"a member of the phylum\s+([A-Za-z]+)"],
            "class": [r"Class:?\s*([A-Za-z]+)", r"Class\s+([A-Za-z]+)", r"a member of the class\s+([A-Za-z]+)"],
            "order": [r"Order:?\s*([A-Za-z]+)", r"Order\s+([A-Za-z]+)", r"a member of the order\s+([A-Za-z]+)"],
        }
        for rank, patterns in taxonomy_patterns.items():
            if classification[rank] != "Unknown":
                continue  # Skip if we already have a value
            for pattern in patterns:
                matches = re.findall(pattern, text, re.IGNORECASE)
                if matches:
                    # Non-species ranks are conventionally capitalized, e.g.
                    # "animalia" -> "Animalia". (The original also had a
                    # genus/species branch here, but it was unreachable: this
                    # dict only ever contains the four ranks above.)
                    classification[rank] = matches[0].strip().capitalize()
                    break  # Stop after finding a match for this rank
        # Pass 2: Latin taxonomic suffixes for ranks still unresolved.
        suffix_patterns = {
            "family": [r"\b([A-Za-z]+idae)\b", r"\b([A-Za-z]+aceae)\b"],  # Animal and plant families
            "order": [r"\b([A-Za-z]+ales)\b", r"\b([A-Za-z]+ida)\b"],  # Plant orders and animal orders
            # NOTE(review): the "+ia" class pattern is very loose and can match
            # non-taxa such as place names — consider tightening it.
            "class": [r"\b([A-Za-z]+ia)\b", r"\b([A-Za-z]+phyceae)\b"],  # Classes
            "phylum": [r"\b([A-Za-z]+phyta)\b", r"\b([A-Za-z]+zoa)\b"]  # Plant and animal phyla
        }
        for rank, patterns in suffix_patterns.items():
            if classification[rank] != "Unknown":
                continue  # Skip if we already have a value
            for pattern in patterns:
                # Case-sensitive on purpose: real taxa are capitalized in
                # running text, and the match is stored verbatim.
                matches = re.findall(pattern, text)
                if matches:
                    classification[rank] = matches[0].strip()
                    break
    except Exception as e:
        print(f"Error in extract_taxonomy_from_text: {str(e)}")
        # If an error occurs, return the classification as is
    return classification
# Standard script entry point: launch the Streamlit app only when this file
# is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()