Spaces:

MCP-1st-Birthday
/

BirdScopeAI

Paused

App Files Files Community

facemelter committed on Nov 30, 2025

Commit

acf4dc8

verified ·

1 Parent(s): 17f468c

Fixed regex pattern for Unsplash images

Browse files

Files changed (1) hide show

langgraph_agent/structured_output.py +16 -6

langgraph_agent/structured_output.py CHANGED Viewed

@@ -38,28 +38,38 @@ def extract_urls_from_text(text: str) -> tuple[List[str], List[str]]:
     Extract image and audio URLs from text using regex.
     Updated to handle URLs within markdown, JSON, and plain text.
     Returns:
         tuple: (image_urls, audio_urls)
     """
-    # Updated pattern for image URLs - more permissive to catch URLs in various contexts
     # Matches URLs ending in image extensions, allowing most characters before the extension
     # Stops at whitespace or common delimiters like ), ], }
-    image_pattern = r'https?://[^\s)}\]]+?\.(?:jpg|jpeg|png|gif|webp|svg)(?:\?[^\s)}\]]*)?'
     # Pattern for audio URLs - handles both direct audio files AND xeno-canto links
     # Updated to be more permissive like image pattern
     audio_pattern_files = r'https?://[^\s)}\]]+?\.(?:mp3|wav|ogg|m4a)(?:\?[^\s)}\]]*)?'
-    audio_pattern_xenocanto = r'https?://xeno-canto\.org/\d+/download'
     print(f"[EXTRACT_URLS] Searching text of length {len(text)}")
-    # Extract all URLs
-    raw_image_urls = re.findall(image_pattern, text, re.IGNORECASE)
     raw_audio_urls_files = re.findall(audio_pattern_files, text, re.IGNORECASE)
     audio_urls_xenocanto = list(set(re.findall(audio_pattern_xenocanto, text, re.IGNORECASE)))
-    print(f"[EXTRACT_URLS] Found {len(raw_image_urls)} raw image URLs")
     print(f"[EXTRACT_URLS] Found {len(raw_audio_urls_files)} audio file URLs")
     print(f"[EXTRACT_URLS] Found {len(audio_urls_xenocanto)} xeno-canto URLs")

     Extract image and audio URLs from text using regex.
     Updated to handle URLs within markdown, JSON, and plain text.
+    Supports both extension-based URLs (.jpg, .png) and domain-based (Unsplash).
     Returns:
         tuple: (image_urls, audio_urls)
     """
+    # Pattern 1: Image URLs with file extensions
     # Matches URLs ending in image extensions, allowing most characters before the extension
     # Stops at whitespace or common delimiters like ), ], }
+    image_pattern_ext = r'https?://[^\s)}\]]+?\.(?:jpg|jpeg|png|gif|webp|svg)(?:\?[^\s)}\]]*)?'
+    # Pattern 2: Unsplash image URLs (no file extension needed)
+    # Matches: https://images.unsplash.com/photo-XXXXXXX or similar
+    image_pattern_unsplash = r'https?://images\.unsplash\.com/[^\s)}\]]*'
     # Pattern for audio URLs - handles both direct audio files AND xeno-canto links
     # Updated to be more permissive like image pattern
     audio_pattern_files = r'https?://[^\s)}\]]+?\.(?:mp3|wav|ogg|m4a)(?:\?[^\s)}\]]*)?'
+    audio_pattern_xenocanto = r'https?://xeno-canto\.org/\d+(?:/download)?'
     print(f"[EXTRACT_URLS] Searching text of length {len(text)}")
+    # Extract all URLs - combine both image patterns
+    raw_image_urls_ext = re.findall(image_pattern_ext, text, re.IGNORECASE)
+    raw_image_urls_unsplash = re.findall(image_pattern_unsplash, text, re.IGNORECASE)
     raw_audio_urls_files = re.findall(audio_pattern_files, text, re.IGNORECASE)
     audio_urls_xenocanto = list(set(re.findall(audio_pattern_xenocanto, text, re.IGNORECASE)))
+    # Combine image URLs from both patterns
+    raw_image_urls = raw_image_urls_ext + raw_image_urls_unsplash
+    print(f"[EXTRACT_URLS] Found {len(raw_image_urls_ext)} extension-based image URLs")
+    print(f"[EXTRACT_URLS] Found {len(raw_image_urls_unsplash)} Unsplash image URLs")
     print(f"[EXTRACT_URLS] Found {len(raw_audio_urls_files)} audio file URLs")
     print(f"[EXTRACT_URLS] Found {len(audio_urls_xenocanto)} xeno-canto URLs")