Fixed regex pattern for Unsplash images
Browse files
langgraph_agent/structured_output.py
CHANGED
|
@@ -38,28 +38,38 @@ def extract_urls_from_text(text: str) -> tuple[List[str], List[str]]:
|
|
| 38 |
Extract image and audio URLs from text using regex.
|
| 39 |
|
| 40 |
Updated to handle URLs within markdown, JSON, and plain text.
|
|
|
|
| 41 |
|
| 42 |
Returns:
|
| 43 |
tuple: (image_urls, audio_urls)
|
| 44 |
"""
|
| 45 |
-
#
|
| 46 |
# Matches URLs ending in image extensions, allowing most characters before the extension
|
| 47 |
# Stops at whitespace or common delimiters like ), ], }
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
# Pattern for audio URLs - handles both direct audio files AND xeno-canto links
|
| 51 |
# Updated to be more permissive like image pattern
|
| 52 |
audio_pattern_files = r'https?://[^\s)}\]]+?\.(?:mp3|wav|ogg|m4a)(?:\?[^\s)}\]]*)?'
|
| 53 |
-
audio_pattern_xenocanto = r'https?://xeno-canto\.org/\d
|
| 54 |
|
| 55 |
print(f"[EXTRACT_URLS] Searching text of length {len(text)}")
|
| 56 |
|
| 57 |
-
# Extract all URLs
|
| 58 |
-
|
|
|
|
| 59 |
raw_audio_urls_files = re.findall(audio_pattern_files, text, re.IGNORECASE)
|
| 60 |
audio_urls_xenocanto = list(set(re.findall(audio_pattern_xenocanto, text, re.IGNORECASE)))
|
| 61 |
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
print(f"[EXTRACT_URLS] Found {len(raw_audio_urls_files)} audio file URLs")
|
| 64 |
print(f"[EXTRACT_URLS] Found {len(audio_urls_xenocanto)} xeno-canto URLs")
|
| 65 |
|
|
|
|
| 38 |
Extract image and audio URLs from text using regex.
|
| 39 |
|
| 40 |
Updated to handle URLs within markdown, JSON, and plain text.
|
| 41 |
+
Supports both extension-based URLs (e.g. .jpg, .png) and domain-based URLs (Unsplash).
|
| 42 |
|
| 43 |
Returns:
|
| 44 |
tuple: (image_urls, audio_urls)
|
| 45 |
"""
|
| 46 |
+
# Pattern 1: Image URLs with file extensions
|
| 47 |
# Matches URLs ending in image extensions, allowing most characters before the extension
|
| 48 |
# Stops at whitespace or common delimiters like ), ], }
|
| 49 |
+
image_pattern_ext = r'https?://[^\s)}\]]+?\.(?:jpg|jpeg|png|gif|webp|svg)(?:\?[^\s)}\]]*)?'
|
| 50 |
+
|
| 51 |
+
# Pattern 2: Unsplash image URLs (no file extension needed)
|
| 52 |
+
# Matches: https://images.unsplash.com/photo-XXXXXXX or similar
|
| 53 |
+
image_pattern_unsplash = r'https?://images\.unsplash\.com/[^\s)}\]]*'
|
| 54 |
|
| 55 |
# Pattern for audio URLs - handles both direct audio files AND xeno-canto links
|
| 56 |
# Updated to be more permissive like image pattern
|
| 57 |
audio_pattern_files = r'https?://[^\s)}\]]+?\.(?:mp3|wav|ogg|m4a)(?:\?[^\s)}\]]*)?'
|
| 58 |
+
audio_pattern_xenocanto = r'https?://xeno-canto\.org/\d+(?:/download)?'
|
| 59 |
|
| 60 |
print(f"[EXTRACT_URLS] Searching text of length {len(text)}")
|
| 61 |
|
| 62 |
+
# Extract all URLs - combine both image patterns
|
| 63 |
+
raw_image_urls_ext = re.findall(image_pattern_ext, text, re.IGNORECASE)
|
| 64 |
+
raw_image_urls_unsplash = re.findall(image_pattern_unsplash, text, re.IGNORECASE)
|
| 65 |
raw_audio_urls_files = re.findall(audio_pattern_files, text, re.IGNORECASE)
|
| 66 |
audio_urls_xenocanto = list(set(re.findall(audio_pattern_xenocanto, text, re.IGNORECASE)))
|
| 67 |
|
| 68 |
+
# Combine image URLs from both patterns
|
| 69 |
+
raw_image_urls = raw_image_urls_ext + raw_image_urls_unsplash
|
| 70 |
+
|
| 71 |
+
print(f"[EXTRACT_URLS] Found {len(raw_image_urls_ext)} extension-based image URLs")
|
| 72 |
+
print(f"[EXTRACT_URLS] Found {len(raw_image_urls_unsplash)} Unsplash image URLs")
|
| 73 |
print(f"[EXTRACT_URLS] Found {len(raw_audio_urls_files)} audio file URLs")
|
| 74 |
print(f"[EXTRACT_URLS] Found {len(audio_urls_xenocanto)} xeno-canto URLs")
|
| 75 |
|