Trinay16 committed on
Commit
c77c0e7
·
verified ·
1 Parent(s): cf38d99

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +1343 -0
app.py ADDED
@@ -0,0 +1,1343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ import os
4
+ import re
5
+ from PIL import Image
6
+ import tempfile
7
+
8
# File extensions accepted for image uploads (lower-case, without the dot).
ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif'}


def allowed_file(filename):
    """Return True when *filename* ends in one of ALLOWED_EXTENSIONS.

    The extension is the text after the last dot and is compared
    case-insensitively. Anything that is not a string (for example ``None``)
    or that has no dot is rejected instead of raising.
    """
    if not isinstance(filename, str) or '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in ALLOWED_EXTENSIONS
13
+
14
def main():
    """Render the Streamlit page: a name-search tab and an image-upload tab.

    Both tabs resolve a species name, fetch its data with get_species_info()
    and get_species_images(), and hand the results to display_results().
    """
    st.set_page_config(page_title="Species Information Finder", layout="wide")

    st.title("Species Information Finder")
    st.write("Discover information about any species by name or by uploading an image.")

    # Create tabs for different functionality
    tab1, tab2 = st.tabs(["Search by Name", "Search by Image"])

    with tab1:
        st.header("Search by Species Name")
        species_name = st.text_input("Enter a species name (common or scientific):")

        if st.button("Search"):
            # Strip first so a whitespace-only query is rejected as empty
            # instead of being sent to the remote APIs.
            query = (species_name or "").strip()
            if not query:
                st.error("Please enter a species name")
            else:
                with st.spinner("Searching for species information..."):
                    species_data = get_species_info(query)
                    images = get_species_images(query)

                    display_results(species_data, images)

    with tab2:
        st.header("Search by Image Upload")
        # Pass a list rather than the raw set: the uploader expects a
        # string or a sequence of extensions.
        uploaded_file = st.file_uploader(
            "Upload an image of a species", type=sorted(ALLOWED_EXTENSIONS)
        )

        if uploaded_file is not None:
            if allowed_file(uploaded_file.name):
                try:
                    image = Image.open(uploaded_file)
                except OSError:
                    # PIL raises an OSError subclass for unreadable or
                    # corrupt image data; report it instead of crashing.
                    image = None
                    st.error("The uploaded file could not be read as an image.")

                if image is not None:
                    st.image(image, caption="Uploaded Image", use_column_width=True)

                    if st.button("Identify Species"):
                        with st.spinner("Identifying species from image..."):
                            # No real image recognition yet: the species name
                            # comes from the mock helper, based on the file name.
                            species_name = get_mock_species_from_filename(uploaded_file.name)

                            species_data = get_species_info(species_name)
                            images = get_species_images(species_name)

                            display_results(species_data, images)
            else:
                st.error("File type not allowed. Please upload an image file (PNG, JPG, JPEG, GIF).")
65
+
66
def display_results(species_data, images):
    """Render species information and up to four related images.

    Args:
        species_data: dict as produced by get_species_info(). If it contains
            an "error" key, only that message is shown.
        images: list of image dicts with "url"/"thumb_url" and optional
            "description", "author" and "license" keys. Entries without a
            usable URL (for example error placeholders) are ignored.
    """
    if "error" in species_data:
        st.error(species_data["error"])
        return

    st.success(f"Found information for: {species_data['title']}")

    # Narrow column for taxonomy/habitat, wide column for the text.
    col1, col2 = st.columns([1, 2])

    with col1:
        st.subheader("Classification")
        classification = species_data.get("classification", {})
        for rank, value in classification.items():
            if value != "Unknown":
                # Both ** markers are needed for the rank label to render bold.
                st.write(f"**{rank.capitalize()}:** {value}")

        if species_data.get("habitat", "Unknown") != "Unknown":
            st.subheader("Habitat")
            st.write(species_data["habitat"])

    with col2:
        st.subheader("Description")
        st.write(species_data.get("description", "No description available."))

        if species_data.get("fun_facts"):
            st.subheader("Interesting Facts")
            for i, fact in enumerate(species_data["fun_facts"], 1):
                st.write(f"{i}. {fact}")

    # Keep only entries that actually carry a URL. "thumb_url" may be present
    # but empty, so test the value rather than the key.
    displayable = [
        img for img in (images or [])
        if isinstance(img, dict) and (img.get("thumb_url") or img.get("url"))
    ][:4]

    if displayable:
        st.subheader("Related Images")

        # One column per image, at most four.
        cols = st.columns(len(displayable))
        for col, img in zip(cols, displayable):
            with col:
                st.image(
                    img.get("thumb_url") or img["url"],
                    caption=img.get("description", ""),
                    use_column_width=True,
                )
                st.caption(f"Credit: {img.get('author', 'Unknown')} | License: {img.get('license', 'Unknown')}")
    else:
        st.warning("No images found for this species.")
116
+
117
# Data-retrieval and text-extraction helpers used by the UI code above
# (get_species_info, get_wikispecies_data, get_wikipedia_data, etc.).
# They are carried over unchanged from the earlier Flask version of the app.
120
+
121
def get_species_info(species_name):
    """Combine Wikispecies and Wikipedia data for *species_name*.

    Wikispecies is queried first and its result becomes the base record.
    Wikipedia then supplements it: description (when the current one is
    missing or shorter than 50 characters), habitat, classification ranks
    and fun facts.

    Returns:
        dict with "title", "description", "categories", "links",
        "last_modified", "classification", "habitat", "fun_facts" and
        "data_sources". An "error" key is added when neither source
        returned usable data.
    """
    species_info = {
        "title": species_name,  # Default to the search query
        "description": "No description available.",
        "categories": [],
        "links": [],
        "last_modified": "Unknown",
        "classification": {
            "kingdom": "Unknown",
            "phylum": "Unknown",
            "class": "Unknown",
            "order": "Unknown",
            "family": "Unknown",
            "genus": "Unknown",
            "species": "Unknown"
        },
        "habitat": "Unknown",
        "fun_facts": [],
        "data_sources": []  # Track where we got data from
    }

    # Wikispecies first: a successful result replaces the defaults.
    wikispecies_info = get_wikispecies_data(species_name)
    if not wikispecies_info.get("error"):
        species_info.update(wikispecies_info)
        species_info["data_sources"].append("Wikispecies")

    # Wikipedia supplements whatever we already have.
    wikipedia_info = get_wikipedia_data(species_name)
    if not wikipedia_info.get("error"):
        # Only take the Wikipedia description when it is non-empty, so an
        # empty extract cannot erase an existing description.
        current = species_info["description"]
        if current == "No description available." or len(current) < 50:
            species_info["description"] = wikipedia_info.get("description") or current

        # Prefer Wikipedia's habitat text, but never let a placeholder
        # "Unknown" overwrite a habitat already found in Wikispecies.
        wiki_habitat = wikipedia_info.get("habitat")
        if wiki_habitat and wiki_habitat != "Unknown":
            species_info["habitat"] = wiki_habitat

        # Known Wikipedia ranks win over Wikispecies ones.
        for rank, value in wikipedia_info.get("classification", {}).items():
            if value != "Unknown":
                species_info["classification"][rank] = value

        # Add Wikipedia facts that are not near-duplicates of existing ones.
        if wikipedia_info.get("fun_facts"):
            existing_facts = species_info.get("fun_facts", [])
            for fact in wikipedia_info["fun_facts"]:
                if not any(similarity_score(fact, existing) > 0.7 for existing in existing_facts):
                    existing_facts.append(fact)
            species_info["fun_facts"] = existing_facts[:4]  # Limit to 4 facts

        species_info["data_sources"].append("Wikipedia")

    # Neither source produced data: flag the record as an error.
    if not species_info["data_sources"]:
        species_info["error"] = "Species information not found in either Wikispecies or Wikipedia."

    return species_info
188
+
189
def get_wikispecies_data(species_name):
    """Fetch a species record from the Wikispecies MediaWiki API.

    Requests the intro extract, categories, page info and links for the page
    titled *species_name*, then derives classification (from categories, a
    two-word title and suffix-bearing links), habitat and fun facts.

    Returns:
        dict describing the species. It contains an "error" key when the
        page does not exist, the API returned no pages, or the request or
        parsing failed; the function does not raise.
    """
    url = "https://species.wikimedia.org/w/api.php"

    params = {
        "action": "query",
        "format": "json",
        "titles": species_name,
        "prop": "extracts|categories|info|links",
        "exintro": True,  # Get only the intro section
        "explaintext": True,  # Get plain text, not HTML
        "cllimit": 50,  # Get more categories
        "pllimit": 50,  # Get more links
    }

    try:
        # A timeout keeps the UI from hanging forever on a stalled connection.
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        pages = data.get("query", {}).get("pages", {})
        if not pages:
            return {"error": "No data found in Wikispecies"}

        # Only one title was requested, so take the first (only) page.
        page_id = next(iter(pages))
        page = pages[page_id]

        species_info = {
            "title": species_name,  # Default to the search query
            "description": "No description available.",
            "categories": [],
            "links": [],
            "last_modified": "Unknown",
            "classification": {
                "kingdom": "Unknown",
                "phylum": "Unknown",
                "class": "Unknown",
                "order": "Unknown",
                "family": "Unknown",
                "genus": "Unknown",
                "species": "Unknown"
            },
            "habitat": "Unknown",
            "fun_facts": []
        }

        # A negative page id is treated as "page does not exist".
        if int(page_id) < 0:
            species_info["error"] = "Species not found in Wikispecies. Try a different spelling or check for the scientific name."
            return species_info

        species_info["title"] = page.get("title", species_name)
        species_info["description"] = page.get("extract", "No description available.")

        # Drop entries without a title so later string handling cannot fail.
        species_info["categories"] = [
            cat.get("title") for cat in page.get("categories", []) if cat.get("title")
        ]
        species_info["links"] = [
            link.get("title") for link in page.get("links", []) if link.get("title")
        ]

        species_info["last_modified"] = page.get("touched", "Unknown")

        # Flatten line breaks and collapse runs of spaces in the description.
        if species_info["description"]:
            flattened = species_info["description"].replace("\n", " ").strip()
            species_info["description"] = re.sub(r' +', ' ', flattened)

        # Strategy 1: classification from category names.
        species_info["classification"] = extract_classification(species_info["categories"])
        classification = species_info["classification"]

        # Strategy 2: a two-word title may be a binomial name (genus + species).
        title_parts = species_info.get("title", "").split()
        if len(title_parts) == 2:
            genus, species = title_parts
            if classification.get("genus") == "Unknown":
                classification["genus"] = genus
            if classification.get("species") == "Unknown":
                classification["species"] = species

        # Strategy 3: single-word links with a recognisable taxonomic suffix.
        for link in species_info["links"]:
            if len(link.split()) != 1:
                continue
            if link.endswith("idae"):  # Family suffix
                classification["family"] = link
            elif link.endswith("inae"):  # Subfamily suffix
                classification["subfamily"] = link
            elif link.endswith("ales"):  # Order suffix for plants
                classification["order"] = link
            elif link.endswith("aceae"):  # Family suffix for plants
                classification["family"] = link

        species_info["habitat"] = extract_habitat(species_info["description"])
        species_info["fun_facts"] = extract_fun_facts(species_info["description"])

        # If the description is missing or too short, build one from the taxonomy.
        if not species_info["description"] or len(species_info["description"]) < 20:
            parts = []

            if classification["genus"] != "Unknown" and classification["species"] != "Unknown":
                parts.append(f"{species_info['title']} is a species in the genus {classification['genus']}.")

            if classification["family"] != "Unknown":
                parts.append(f"It belongs to the family {classification['family']}.")

            if classification["order"] != "Unknown":
                parts.append(f"It is classified under the order {classification['order']}.")

            if parts:
                species_info["description"] = " ".join(parts)
            else:
                species_info["description"] = f"{species_info['title']} is a species documented in Wikispecies, the free species directory."

        return species_info

    except Exception as e:
        # Deliberate best-effort boundary: any network, HTTP, JSON or parsing
        # failure is reported to the caller as an error record.
        error_msg = str(e)
        return {
            "error": f"Error retrieving species information from Wikispecies: {error_msg}",
            "title": species_name,
            "description": "No information available due to an error. Please try a different species name.",
            "classification": {"kingdom": "Unknown", "phylum": "Unknown", "class": "Unknown", "order": "Unknown", "family": "Unknown", "genus": "Unknown", "species": "Unknown"},
            "habitat": "Unknown",
            "fun_facts": []
        }
344
+
345
def get_wikipedia_data(species_name):
    """Fetch description, habitat, fun facts and classification from Wikipedia.

    Runs a search for *species_name* to resolve the best-matching page title,
    then downloads that page's plain-text extract and mines it for a
    description (first paragraph), habitat text, up to four fun facts and a
    classification.

    Returns:
        dict with "title", "description", "habitat", "fun_facts" and
        "classification". A dict with an "error" key is returned when no
        page is found or the request or parsing fails; the function does
        not raise.
    """
    url = "https://en.wikipedia.org/w/api.php"

    # First, search for the page to get the correct title.
    search_params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": species_name,
        "srlimit": 1,  # Get just the best match
    }

    try:
        # Timeouts keep the UI from hanging forever on a stalled connection.
        search_response = requests.get(url, params=search_params, timeout=10)
        search_response.raise_for_status()
        search_data = search_response.json()

        search_results = search_data.get("query", {}).get("search", [])
        if not search_results:
            return {"error": "No matching Wikipedia page found for this species."}

        page_title = search_results[0].get("title")

        # Fetch the full page text. "exintro" is intentionally omitted:
        # MediaWiki treats any boolean parameter that is present as true
        # (even "exintro=False"), which would restrict the extract to the
        # intro. "sections" is not requested because it is not a prop
        # module of action=query.
        content_params = {
            "action": "query",
            "format": "json",
            "titles": page_title,
            "prop": "extracts|categories",
            "explaintext": True,  # Get plain text, not HTML
            "cllimit": 50,  # Get more categories
        }

        content_response = requests.get(url, params=content_params, timeout=10)
        content_response.raise_for_status()
        content_data = content_response.json()

        pages = content_data.get("query", {}).get("pages", {})
        if not pages:
            return {"error": "Failed to retrieve Wikipedia page content."}

        # Only one title was requested, so take the first (only) page.
        page_id = next(iter(pages))
        page = pages[page_id]

        # A negative page id is treated as "page does not exist".
        if int(page_id) < 0:
            return {"error": "Wikipedia page not found."}

        species_info = {
            "title": page.get("title", species_name),
            "description": "",
            "habitat": "Unknown",
            "fun_facts": [],
            "classification": {
                "kingdom": "Unknown",
                "phylum": "Unknown",
                "class": "Unknown",
                "order": "Unknown",
                "family": "Unknown",
                "genus": "Unknown",
                "species": "Unknown"
            }
        }

        full_text = page.get("extract", "")

        if full_text:
            # Keep paragraph breaks (blank lines) but turn single newlines
            # into spaces.
            full_text = full_text.replace("\n\n", "||").replace("\n", " ").replace("||", "\n\n")

            # The first paragraph is usually a good description.
            sections = full_text.split("\n\n")
            if sections:
                species_info["description"] = sections[0].strip()

            # Habitat: prefer a dedicated section, else scan the whole text.
            habitat_section = extract_wikipedia_section(full_text, ["Habitat", "Distribution", "Range", "Ecology", "Environment"])
            if habitat_section:
                species_info["habitat"] = habitat_section
            else:
                habitat = extract_habitat(full_text)
                if habitat != "Unknown":
                    species_info["habitat"] = habitat

            # Fun facts: behaviour-type sections first.
            behavior_section = extract_wikipedia_section(full_text, ["Behavior", "Behaviour", "Life cycle", "Diet", "Feeding", "Reproduction", "Biology"])
            if behavior_section:
                facts = extract_fun_facts(behavior_section)
                if facts:
                    species_info["fun_facts"].extend(facts)

            # Not enough facts yet: try conservation-type sections.
            if len(species_info["fun_facts"]) < 2:
                conservation_section = extract_wikipedia_section(full_text, ["Conservation", "Status", "Threats", "Population"])
                if conservation_section:
                    for fact in extract_fun_facts(conservation_section) or []:
                        if fact not in species_info["fun_facts"]:
                            species_info["fun_facts"].append(fact)

            # Still not enough: fall back to the whole text.
            if len(species_info["fun_facts"]) < 2:
                for fact in extract_fun_facts(full_text) or []:
                    if fact not in species_info["fun_facts"]:
                        species_info["fun_facts"].append(fact)

            # Limit to 4 facts
            species_info["fun_facts"] = species_info["fun_facts"][:4]

            # Classification is derived from the text, title and search data.
            wiki_classification = extract_wikipedia_classification(full_text, page.get("title", ""), search_data)
            if wiki_classification:
                species_info["classification"] = wiki_classification

        return species_info

    except Exception as e:
        # Deliberate best-effort boundary: any network, HTTP, JSON or parsing
        # failure is reported to the caller as an error record.
        error_msg = str(e)
        return {
            "error": f"Error retrieving information from Wikipedia: {error_msg}",
            "title": species_name,
            "description": "No information available from Wikipedia due to an error.",
            "habitat": "Unknown",
            "fun_facts": []
        }
488
+
489
def extract_wikipedia_section(text, section_keywords):
    """Return the body of the section(s) whose heading matches a keyword.

    Headings are recognised in wiki plain-text form (``== Title ==``,
    ``=== Title ===``, ...). A section's body runs from the end of its
    heading to the start of the next heading of any level. Keywords are
    matched case-insensitively as substrings of the heading, in the order
    given; at most two distinct non-empty sections are joined with a space.

    If no heading matches, the first paragraph (blank-line separated) that
    contains a keyword is returned instead.

    Args:
        text: plain-text page content; may be empty or None.
        section_keywords: iterable of heading keywords, most preferred first.

    Returns:
        The extracted text, or None when nothing matches.
    """
    if not text:
        return None

    # Capture the title without surrounding whitespace and consume the full
    # run of "=" on both sides, so sub-headings do not leak "=" into bodies.
    heading_re = re.compile(r"={2,}\s*([^=]+?)\s*={2,}")
    headings = list(heading_re.finditer(text))

    # Pair every heading with the text up to the next heading.
    sections = []
    for idx, heading in enumerate(headings):
        body_end = headings[idx + 1].start() if idx + 1 < len(headings) else len(text)
        sections.append((heading.group(1).lower(), text[heading.end():body_end].strip()))

    matching_sections = []
    used = set()  # section indexes already taken, to avoid duplicates
    for keyword in section_keywords:
        needle = keyword.lower()
        for idx, (title, body) in enumerate(sections):
            if needle in title and body and idx not in used:
                used.add(idx)
                matching_sections.append(body)

    # Join the matches (limit to 2 for conciseness).
    if matching_sections:
        return " ".join(matching_sections[:2])

    # Fallback: first paragraph mentioning any keyword.
    paragraphs = text.split("\n\n")
    for keyword in section_keywords:
        needle = keyword.lower()
        for paragraph in paragraphs:
            if needle in paragraph.lower():
                return paragraph

    return None
540
+
541
def get_species_images(species_name):
    """Search Wikimedia Commons for images of *species_name*.

    Search strategies, tried in order:
      1. ``file:<name>`` search.
      2. Plain ``<name>`` search when strategy 1 found nothing.
      3. For a two-word name with fewer than three results, add unique
         results of a genus-only search until five images are collected.
      4. If there are still no images, a generic nature search.

    Returns:
        list of dicts with "title", "url", "thumb_url", "description",
        "author" and "license". The list is empty when nothing is found or
        the requests fail; error placeholders are never returned.
    """
    url = "https://commons.wikimedia.org/w/api.php"

    # File types that cannot be shown as an inline image.
    non_image_suffixes = ('.pdf', '.svg', '.mp3', '.mp4', '.ogg', '.wav', '.webm')

    def search_images(search_term, limit=10):
        """Run one Commons file search and return a list of image dicts."""
        params = {
            "action": "query",
            "format": "json",
            "generator": "search",
            "gsrnamespace": 6,  # File namespace
            "gsrsearch": search_term,
            "gsrlimit": limit,  # Limit results
            "prop": "imageinfo",
            "iiprop": "url|extmetadata",
            "iiurlwidth": 800,  # Thumbnail width
        }

        try:
            # A timeout keeps the UI from hanging on a stalled connection.
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()
            pages = response.json().get("query", {}).get("pages", {})
        except (requests.RequestException, ValueError):
            # Best effort: a failed search means "no images", not a fake
            # image entry that callers would try to render.
            return []

        found = []
        for page in pages.values():
            # Skip non-image files by their actual suffix, not a substring.
            if page.get("title", "").lower().endswith(non_image_suffixes):
                continue

            # "imageinfo" may be missing or empty for a page.
            image_info = (page.get("imageinfo") or [{}])[0]
            if not image_info.get("url"):
                continue

            metadata = image_info.get("extmetadata", {})
            found.append({
                "title": page.get("title", "Unknown"),
                "url": image_info.get("url", ""),
                "thumb_url": image_info.get("thumburl", ""),
                "description": metadata.get("ImageDescription", {}).get("value", "No description"),
                "author": metadata.get("Artist", {}).get("value", "Unknown"),
                "license": metadata.get("License", {}).get("value", "Unknown"),
            })

        return found

    # STRATEGY 1: search restricted with the file: prefix.
    images = search_images(f"file:{species_name}")

    # STRATEGY 2: broader search without the prefix.
    if not images:
        images = search_images(species_name)

    # STRATEGY 3: too few results for a binomial name, so add genus-level hits.
    if len(images) < 3:
        name_parts = species_name.split()
        if len(name_parts) == 2:
            genus_images = search_images(name_parts[0])

            existing_urls = {img.get("url") for img in images}
            for img in genus_images:
                if img.get("url") not in existing_urls:
                    images.append(img)
                    existing_urls.add(img.get("url"))

                # Stop if we now have enough images
                if len(images) >= 5:
                    break

    if images:
        return images

    # STRATEGY 4: last resort, a very general search. The results are not
    # specific to the requested species.
    return search_images("species taxonomy nature")
641
+
642
def extract_classification(categories):
    """Derive a taxonomic classification from wiki category names.

    Three strategies are applied in order:
      1. Explicit "<rank>:" / "<rank> " markers (English, Latin, Spanish
         and Italian spellings); the word after the marker is the value.
      2. Single-word categories with a taxonomic suffix (-idae, -aceae,
         -ales, ...), which may also add "subfamily" or "suborder" keys.
      3. A whole-word English rank name followed by a value, used only for
         ranks that are still "Unknown". "division" is stored under
         "phylum".

    Args:
        categories: iterable of category titles, with or without the
            "Category:" prefix. Non-string entries are ignored.

    Returns:
        dict with the keys kingdom, phylum, class, order, family, genus and
        species (value "Unknown" when not found), plus optional "subfamily"
        and "suborder".
    """
    classification = {
        "kingdom": "Unknown",
        "phylum": "Unknown",
        "class": "Unknown",
        "order": "Unknown",
        "family": "Unknown",
        "genus": "Unknown",
        "species": "Unknown",
    }

    if not categories:
        return classification

    # Strip the "Category:" prefix once for all strategies.
    names = [
        cat[9:] if cat.startswith("Category:") else cat
        for cat in categories
        if isinstance(cat, str)
    ]

    taxonomy_patterns = {
        "kingdom": ["kingdom:", "regnum:", "reino:", "regno:", "kingdom ", "regnum ", "reino "],
        "phylum": ["phylum:", "division:", "división:", "divisio:", "phylum ", "division ", "división ", "divisio "],
        "class": ["class:", "clase:", "classis:", "class ", "clase ", "classis "],
        "order": ["order:", "orden:", "ordo:", "order ", "orden ", "ordo "],
        "family": ["family:", "familia:", "family ", "familia "],
        "genus": ["genus:", "género:", "genero:", "genus ", "género ", "genero "],
        "species": ["species:", "especie:", "specie:", "species ", "especie ", "specie "]
    }

    # STRATEGY 1: direct "<rank> <value>" mentions in category names.
    for name in names:
        lowered = name.lower()
        for rank, patterns in taxonomy_patterns.items():
            for pattern in patterns:
                if pattern not in lowered:
                    continue
                # Take the first word after the pattern; nothing may follow it.
                words = lowered.split(pattern)[1].split()
                if words:
                    classification[rank] = words[0].capitalize()
                    break

    # STRATEGY 2: single-word categories with a taxonomic suffix.
    # Checked in this order; the first matching suffix wins.
    suffix_ranks = (
        ("idae", "family"),       # Family suffix for animals
        ("inae", "subfamily"),    # Subfamily suffix
        ("ales", "order"),        # Order suffix for plants
        ("aceae", "family"),      # Family suffix for plants
        ("ineae", "suborder"),    # Suborder suffix for plants
        ("oideae", "subfamily"),  # Subfamily suffix for plants
    )
    for name in names:
        words = name.split()
        if len(words) != 1:
            continue
        for suffix, rank in suffix_ranks:
            if words[0].endswith(suffix):
                classification[rank] = words[0]
                break

    # STRATEGY 3: English rank names as whole words, so "class" does not
    # match inside "classification". "division" is stored under "phylum"
    # because the result has no "division" key.
    rank_aliases = {
        "kingdom": "kingdom",
        "phylum": "phylum",
        "division": "phylum",
        "class": "class",
        "order": "order",
        "family": "family",
        "genus": "genus",
        "species": "species",
    }
    for name in names:
        lowered = name.lower()
        for word, rank in rank_aliases.items():
            if classification[rank] != "Unknown":
                continue
            match = re.search(rf"\b{word}\b[\s:]*([^\s:]+)", lowered)
            if match:
                classification[rank] = match.group(1).capitalize()

    # Final cleanup: make sure every found value starts with a capital.
    for rank, value in classification.items():
        if value != "Unknown":
            classification[rank] = value[0].upper() + value[1:]

    return classification
748
+
749
def extract_habitat(description):
    """
    Pick the sentence(s) of a species description most likely to describe its habitat.

    The description is split into sentences and scanned with four keyword
    tiers, from most to least specific. The first tier that matches any
    sentence wins and later tiers are not consulted. If no tier matches, the
    second or first sentence is used as a guess when it is long enough.

    Args:
        description: Free-text description of the species.

    Returns:
        "Unknown" when the description is empty or the placeholder
        "No description available"; otherwise up to two sentences joined by a
        single space, each ending in terminal punctuation; or a generic
        "not available" message when nothing suitable was found.
    """
    if not description or description == "No description available":
        return "Unknown"

    # Naive sentence split: break after ". ", "! " and "? " while keeping the
    # terminating punctuation attached to the sentence it closes.
    marked = description.replace(". ", ".|").replace("! ", "!|").replace("? ", "?|")
    sentences = [part.strip() for part in marked.split("|") if part.strip()]

    # TIER 1: direct habitat statements
    habitat_keywords = [
        "habitat", "lives in", "found in", "native to", "occurs in", "distribution",
        "range includes", "ecosystem", "biome", "environment", "inhabits", "dwelling in",
        "endemic to", "natural range", "geographical range", "distributed across",
        "prefers", "thrives in", "flourishes in", "resides in", "habitat type",
        "commonly found", "typically found", "often found", "usually found", "primarily found"
    ]

    # TIER 2: geography and climate context
    climate_keywords = [
        "tropical", "temperate", "polar", "arctic", "antarctic", "desert",
        "rainforest", "forest", "jungle", "grassland", "savanna", "wetland",
        "marsh", "swamp", "mountain", "alpine", "coastal", "marine", "freshwater",
        "ocean", "sea", "river", "lake", "stream", "pond", "terrestrial", "aquatic",
        "woodland", "meadow", "tundra", "taiga", "steppe", "continent", "island",
        "shore", "beach", "reef", "cave", "burrow", "nest", "canopy", "undergrowth"
    ]

    # TIER 3: regional indicators (continents, oceans, compass regions)
    region_keywords = [
        "africa", "asia", "europe", "north america", "south america", "australia",
        "antarctica", "oceania", "mediterranean", "pacific", "atlantic", "indian ocean",
        "arctic ocean", "southern ocean", "northern", "southern", "eastern", "western",
        "central", "worldwide", "global", "cosmopolitan", "international"
    ]

    # TIER 4: verbs that hint at location or movement patterns
    action_keywords = [
        "migrate", "roam", "travel", "swim", "fly", "climb", "burrow", "dig", "nest",
        "breed", "forage", "hunt", "territory", "range"
    ]

    # Lower-case each sentence once; matching is a plain substring test, so a
    # keyword also matches inside longer words (e.g. "sea" in "season").
    lowered = [(sentence, sentence.lower()) for sentence in sentences]

    habitat_sentences = []
    for keywords in (habitat_keywords, climate_keywords, region_keywords, action_keywords):
        habitat_sentences = [
            sentence
            for sentence, sentence_lower in lowered
            if any(keyword in sentence_lower for keyword in keywords)
        ]
        if habitat_sentences:
            break  # a more specific tier matched; skip the weaker ones

    # Fallback: with no keyword hit, guess from the opening sentences.
    if not habitat_sentences and len(sentences) >= 2:
        # With three or more sentences the first is often just a definition,
        # so prefer the second when it is long enough to be informative.
        if len(sentences) > 2 and len(sentences[1].split()) > 5:
            habitat_sentences.append(sentences[1])

        # Otherwise fall back to the first sentence.
        if not habitat_sentences and len(sentences[0].split()) > 5:
            habitat_sentences.append(sentences[0])

    if habitat_sentences:
        # Keep at most two sentences. Each one normally already carries its
        # own terminal punctuation from the split above, so join with a plain
        # space; joining with ". " would produce a doubled period.
        chosen = []
        for sentence in habitat_sentences[:2]:
            sentence = sentence.strip()
            if not sentence.endswith(('.', '!', '?')):
                sentence += '.'
            chosen.append(sentence)
        return " ".join(chosen)

    # Last resort: generic message when no habitat information could be found
    return "Specific habitat information not available from Wikispecies. Try searching online for more details about this species' natural environment."
864
+
865
def extract_fun_facts(description):
    """
    Select up to four "fun fact" sentences from a species description.

    Sentences are sorted into categories by substring keyword matching
    (first matching category wins), then picked in a fixed priority order.
    A candidate is skipped in the fill-up pass when it is too similar
    (word-overlap score above 0.7) to an already selected fact.

    Args:
        description: Free-text description of the species.

    Returns:
        A list of 1 to 4 sentences, each ending in '.', '!' or '?'. A single
        placeholder sentence is returned when the description is empty, is
        the "No description available" placeholder, or yields no usable
        sentence.
    """
    if not description or description == "No description available":
        return ["No specific information available for this species in Wikispecies."]

    # Naive sentence split that keeps terminal punctuation with its sentence.
    marked = description.replace(". ", ".|").replace("! ", "!|").replace("? ", "?|")
    sentences = [part.strip() for part in marked.split("|") if part.strip()]

    # A very short, single-sentence description is returned as the only fact.
    if len(sentences) == 1 and len(description) < 100:
        if not sentences[0].endswith(('.', '!', '?')):
            sentences[0] += '.'
        return [sentences[0]]

    # STRATEGY 1: sentences with "interesting" keywords
    interesting_keywords = [
        "interesting", "unique", "unusual", "remarkable", "notable", "surprising",
        "fascinating", "amazing", "extraordinary", "distinctive", "special", "rare",
        "strange", "curious", "unlike", "peculiar", "odd", "bizarre", "striking",
        "colorful", "beautiful", "impressive", "popular", "famous", "well-known",
        "largest", "smallest", "fastest", "slowest", "oldest", "youngest", "only",
        "record", "discovery", "first", "last", "origin", "discovered", "introduced",
        "revered", "sacred", "symbol", "iconic", "emblem", "represented", "mythology",
        "legend", "folklore", "traditional", "cultural", "significance", "historical"
    ]

    # STRATEGY 2: physical characteristics and biology
    biology_keywords = [
        "lifespan", "longevity", "size", "weight", "height", "length", "wingspan",
        "color", "pattern", "marking", "appearance", "physical", "morphology", "anatomy",
        "feature", "characteristic", "distinctive", "body", "shape", "structure",
        "adaptation", "evolved", "evolution", "mutation", "gene", "genetic", "chromosome",
        "hybrid", "species", "subspecies", "variety", "breed", "strain", "extinct",
        "endangered", "threatened", "vulnerable", "conservation", "protected"
    ]

    # STRATEGY 3: behavior and lifestyle
    behavior_keywords = [
        "diet", "eat", "feeding", "food", "prey", "predator", "hunt", "scavenge",
        "forage", "graze", "browse", "omnivore", "carnivore", "herbivore", "insectivore",
        "behavior", "behaviour", "habit", "activity", "social", "solitary", "group",
        "herd", "flock", "pack", "colony", "community", "family", "nocturnal", "diurnal",
        "crepuscular", "migrate", "migration", "hibernate", "hibernation", "estivate",
        "dormant", "sleep", "rest", "active", "territory", "defend", "aggressive",
        "docile", "tame", "wild", "domestic", "domesticated", "trained", "human"
    ]

    # STRATEGY 4: reproduction
    reproduction_keywords = [
        "reproduce", "reproduction", "breeding", "mate", "mating", "courtship", "display",
        "attract", "offspring", "young", "juvenile", "infant", "baby", "child", "adult",
        "egg", "spawn", "birth", "pregnant", "gestation", "incubation", "hatch", "nestling",
        "fledgling", "litter", "clutch", "brood", "parent", "care", "raise", "nurse", "wean"
    ]

    # Comparative phrasing that often signals an interesting fact
    comparative_patterns = [
        "more than", "less than", "bigger than", "smaller than", "larger than",
        "faster than", "slower than", "better than", "worse than", "greater than",
        "unlike", "similar to", "compared to", "in contrast to", "differs from",
        "up to", "as many as", "can reach", "can grow", "can live", "known to",
        "capable of", "able to", "estimated", "approximately", "about", "around"
    ]

    # Units and quantities; only considered when the sentence contains a digit
    measurement_patterns = [
        "cm", "meter", "metre", "kilometer", "kilometre", "feet", "foot", "inch",
        "kg", "gram", "pound", "ton", "tonne", "year", "month", "week", "day", "hour",
        "percent", "°C", "°F", "degree", "celsius", "fahrenheit", "temperature",
        "speed", "mph", "kph", "knot", "altitude", "depth", "width", "height"
    ]

    # Keyword tiers in the order they are tried; the first tier that matches
    # a sentence claims it.
    keyword_tiers = [
        ("interesting", interesting_keywords),
        ("biological", biology_keywords),
        ("behavioral", behavior_keywords),
        ("reproductive", reproduction_keywords),
        ("comparative", comparative_patterns),
    ]

    fact_candidates = {
        "interesting": [],
        "biological": [],
        "behavioral": [],
        "reproductive": [],
        "comparative": [],
        "measurements": [],
        "general": []
    }

    for sentence in sentences:
        word_count = len(sentence.split())
        # Skip very short sentences
        if word_count < 4:
            continue

        sentence_lower = sentence.lower()
        bucket = next(
            (
                name
                for name, keywords in keyword_tiers
                if any(keyword in sentence_lower for keyword in keywords)
            ),
            None,
        )

        # A measurement needs both a digit and a unit/quantity word.
        if (
            bucket is None
            and any(char.isdigit() for char in sentence)
            and any(pattern.lower() in sentence_lower for pattern in measurement_patterns)
        ):
            bucket = "measurements"

        # Anything left over goes to the general pool when it is long enough.
        if bucket is None and word_count > 5:
            bucket = "general"

        if bucket is not None:
            fact_candidates[bucket].append(sentence)

    selected_facts = []

    # Priority order for fact selection
    categories = ["interesting", "measurements", "biological", "reproductive", "behavioral", "comparative", "general"]

    # First take one fact from each of the three highest-priority categories,
    # removing it so the fill-up pass below sees the next candidate.
    for category in categories[:3]:
        if fact_candidates[category]:
            selected_facts.append(fact_candidates[category].pop(0))

    # Then fill the remaining slots (4 facts maximum) with one pass over all
    # categories, taking at most one further fact per category.
    remaining_slots = 4 - len(selected_facts)
    for category in categories:
        if remaining_slots <= 0:
            break
        if not fact_candidates[category]:
            continue
        next_fact = fact_candidates[category][0]
        # Only add if not too similar to already selected facts
        if not any(similarity_score(next_fact, fact) > 0.7 for fact in selected_facts):
            selected_facts.append(next_fact)
            remaining_slots -= 1

    # Still fewer than two facts: fall back to the first and middle sentences.
    if len(selected_facts) < 2 and sentences:
        first = sentences[0]
        if first not in selected_facts and len(first.split()) > 5:
            selected_facts.append(first)

        middle = sentences[len(sentences) // 2]
        if middle not in selected_facts and len(middle.split()) > 5:
            selected_facts.append(middle)

    # Last resort: if still no facts, use a generic fact
    if not selected_facts:
        selected_facts = ["This species is documented in Wikispecies, the free species directory."]

    # Ensure terminal punctuation, then drop duplicates while preserving order.
    unique_facts = []
    for fact in selected_facts:
        if not fact.endswith(('.', '!', '?')):
            fact += '.'
        if fact not in unique_facts:
            unique_facts.append(fact)

    return unique_facts[:4]  # Limit to max 4 facts
1070
+
1071
def similarity_score(str1, str2):
    """
    Calculate a simple word-overlap (Jaccard) similarity between two strings.

    Words are compared case-insensitively with surrounding punctuation
    removed, so "forest" and "forest." count as the same word. Used to avoid
    selecting facts that are too similar.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float between 0.0 (no shared words, or an empty input) and
        1.0 (identical word sets).
    """
    if not str1 or not str2:
        return 0.0

    def word_set(text):
        # Strip punctuation from both ends of each whitespace-separated token
        # and drop tokens that were punctuation only.
        stripped = (token.strip(".,;:!?\"'()[]{}") for token in text.lower().split())
        return {token for token in stripped if token}

    words1 = word_set(str1)
    words2 = word_set(str2)

    union = words1 | words2
    if not union:
        return 0.0

    # Jaccard similarity: shared words over all distinct words
    return len(words1 & words2) / len(union)
1092
+
1093
def get_mock_species_from_filename(filename):
    """
    Simulate image recognition by looking for a known keyword in the filename.

    Matching is a case-insensitive substring test. When several keywords
    occur in the filename, the longest one wins, so "sunflower.jpg" resolves
    to the sunflower entry rather than the generic "flower" entry. Keywords
    of equal length keep their table order (animals before plants).

    Args:
        filename: Name of the uploaded file.

    Returns:
        The scientific name mapped to the best-matching keyword, or
        "Homo sapiens" when the filename is empty or matches nothing.
    """
    default_species = "Homo sapiens"
    if not filename:
        return default_species

    filename_lower = filename.lower()

    # Common animals and the filename keywords that identify them
    animal_keywords = {
        "cat": "Felis catus",
        "dog": "Canis familiaris",
        "bird": "Aves",
        "eagle": "Aquila chrysaetos",
        "lion": "Panthera leo",
        "tiger": "Panthera tigris",
        "bear": "Ursus arctos",
        "wolf": "Canis lupus",
        "fox": "Vulpes vulpes",
        "deer": "Cervidae",
        "elephant": "Loxodonta africana",
        "giraffe": "Giraffa camelopardalis",
        "zebra": "Equus quagga",
        "monkey": "Primates",
        "gorilla": "Gorilla gorilla",
        "fish": "Actinopterygii",
        "shark": "Selachimorpha",
        "dolphin": "Tursiops truncatus",
        "whale": "Cetacea",
        "snake": "Serpentes",
        "lizard": "Lacertilia",
        "turtle": "Testudines",
        "frog": "Anura",
        "butterfly": "Lepidoptera",
        "bee": "Apis mellifera",
    }

    # Common plants and the filename keywords that identify them
    plant_keywords = {
        "tree": "Arbor",
        "flower": "Anthophyta",
        "rose": "Rosa",
        "tulip": "Tulipa",
        "daisy": "Bellis perennis",
        "sunflower": "Helianthus annuus",
        "oak": "Quercus",
        "pine": "Pinus",
        "maple": "Acer",
        "fern": "Polypodiopsida",
        "moss": "Bryophyta",
        "grass": "Poaceae",
        "cactus": "Cactaceae",
        "palm": "Arecaceae",
        "orchid": "Orchidaceae",
    }

    # Animals first, then plants, as one ordered lookup table.
    species_by_keyword = {**animal_keywords, **plant_keywords}

    # Try longer keywords first so a specific entry is not shadowed by a
    # shorter keyword it contains. sorted() is stable, so equal-length
    # keywords keep their table order.
    for keyword in sorted(species_by_keyword, key=len, reverse=True):
        if keyword in filename_lower:
            return species_by_keyword[keyword]

    # If no match is found, return the default species
    return default_species
1160
+
1161
def extract_wikipedia_classification(full_text, title, search_data=None):
    """
    Extract classification/taxonomy information from Wikipedia content.

    Several strategies are applied in turn: a dedicated taxonomy section,
    infobox-style "Rank: Name" pairs, the first paragraph, the page title
    (when it looks like a binomial name), and prose statements such as
    "belongs to the family Felidae". Extraction is best-effort: any error is
    printed and whatever was collected so far is returned.

    Args:
        full_text: The full text content of the Wikipedia page
        title: The title of the Wikipedia page
        search_data: Optional search data; accepted for interface
            compatibility but not used by this function

    Returns:
        A dictionary with the keys kingdom, phylum, class, order, family,
        genus and species; ranks that could not be determined are "Unknown".
    """
    ranks = ("kingdom", "phylum", "class", "order", "family", "genus", "species")
    classification = {rank: "Unknown" for rank in ranks}

    if not full_text:
        return classification

    try:
        # STRATEGY 1: a dedicated taxonomy/classification section
        taxonomy_section = extract_wikipedia_section(full_text, ["Taxonomy", "Classification", "Taxonomic", "Scientific classification"])
        if taxonomy_section:
            classification = extract_taxonomy_from_text(taxonomy_section, classification)

        # STRATEGY 2: infobox-like "Rank: Name" pairs anywhere in the text.
        # A hit here overrides whatever strategy 1 found. The leading \b keeps
        # "Subclass:" or "Superorder:" from being read as "Class:" / "Order:".
        for rank in ranks:
            infobox_hit = re.search(rf"\b{rank}:\s*([A-Za-z]+)", full_text, re.IGNORECASE)
            if infobox_hit:
                classification[rank] = infobox_hit.group(1).strip()

        # STRATEGY 3: the first paragraph (text before the first blank line)
        first_para = full_text.split('\n\n')[0]
        classification = extract_taxonomy_from_text(first_para, classification)

        # STRATEGY 4: genus and species from a binomial-looking title,
        # e.g. "Panthera leo" (capitalised word followed by a lowercase word)
        title_parts = title.split()
        if len(title_parts) >= 2 and classification["genus"] == "Unknown":
            genus_word, epithet_word = title_parts[0], title_parts[1]
            if genus_word[0].isupper() and genus_word[1:].islower() and epithet_word.islower():
                classification["genus"] = genus_word
                if classification["species"] == "Unknown":
                    classification["species"] = epithet_word

        # STRATEGY 5: prose statements throughout the text. "the" is optional
        # together with its trailing whitespace, so both "belongs to the
        # family X" and "belongs to family X" match.
        statement_templates = (
            # (ranks the template applies to, regex template)
            (ranks[:5], r"(?:belongs|belonging)\s+to\s+(?:the\s+)?{rank}\s+([A-Za-z]+)"),
            (ranks[:6], r"(?:is|as)\s+a\s+(?:member|species)\s+of\s+(?:the\s+)?{rank}\s+([A-Za-z]+)"),
        )
        for applicable_ranks, template in statement_templates:
            for rank in applicable_ranks:
                if classification[rank] != "Unknown":
                    continue  # earlier strategies take precedence
                statement_hit = re.search(template.format(rank=rank), full_text, re.IGNORECASE)
                if statement_hit:
                    classification[rank] = statement_hit.group(1).strip()

        # Final cleanup: capitalize the first letter of every value found
        for rank, value in classification.items():
            if value != "Unknown":
                classification[rank] = value[0].upper() + value[1:]

    except Exception as e:
        # Deliberate best-effort: report the problem and return the
        # classification with whatever data was extracted before the error.
        print(f"Error extracting classification from Wikipedia: {str(e)}")

    return classification
1271
+
1272
def extract_taxonomy_from_text(text, classification):
    """
    Fill in missing taxonomic ranks from free text using regex heuristics.

    Two passes are made, and neither overwrites a rank that already has a
    value other than "Unknown":

    1. Explicit rank names followed by a taxon ("Order: Carnivora",
       "the order Carnivora"), matched case-insensitively as whole words.
    2. Capitalised words ending in a typical taxonomic suffix
       (e.g. "-idae" for families, "-ales" for orders).

    Args:
        text: The text to analyze
        classification: The current classification dictionary to update;
            it is modified in place and must contain the keys kingdom,
            phylum, class, order and family

    Returns:
        The same classification dictionary, updated where matches were found.
    """
    if not text:
        return classification

    try:
        # Pass 1: explicit rank names. The word boundaries stop the rank name
        # from matching inside longer words ("classified", "subclass",
        # "borders"), which would otherwise capture a word fragment.
        for rank in ("kingdom", "phylum", "class", "order"):
            if classification[rank] != "Unknown":
                continue  # Skip if we already have a value

            named_hit = re.search(rf"\b{rank}\b:?\s*([A-Za-z]+)", text, re.IGNORECASE)
            if named_hit:
                classification[rank] = named_hit.group(1).strip().capitalize()

        # Pass 2: characteristic suffixes. Taxon names are written with a
        # capital initial, so lowercase words such as "males" or "via" are
        # not accepted as taxa.
        suffix_patterns = {
            "family": [r"\b([A-Z][a-z]+idae)\b", r"\b([A-Z][a-z]+aceae)\b"],  # Animal and plant families
            "order": [r"\b([A-Z][a-z]+ales)\b", r"\b([A-Z][a-z]+ida)\b"],  # Plant orders and animal orders
            "class": [r"\b([A-Z][a-z]+ia)\b", r"\b([A-Z][a-z]+phyceae)\b"],  # Classes
            "phylum": [r"\b([A-Z][a-z]+phyta)\b", r"\b([A-Z][a-z]+zoa)\b"]  # Plant and animal phyla
        }

        for rank, patterns in suffix_patterns.items():
            if classification[rank] != "Unknown":
                continue  # Skip if we already have a value

            for pattern in patterns:
                suffix_hit = re.search(pattern, text)
                if suffix_hit:
                    classification[rank] = suffix_hit.group(1).strip()
                    break  # first pattern that matches decides this rank

    except Exception as e:
        # Best-effort: report the problem and return the classification as is
        print(f"Error in extract_taxonomy_from_text: {str(e)}")

    return classification
1341
+
1342
# Run the application only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()