facemelter committed on
Commit
acf4dc8
·
verified ·
1 Parent(s): 17f468c

Fixed regex pattern for Unsplash images

Browse files
langgraph_agent/structured_output.py CHANGED
@@ -38,28 +38,38 @@ def extract_urls_from_text(text: str) -> tuple[List[str], List[str]]:
38
  Extract image and audio URLs from text using regex.
39
 
40
  Updated to handle URLs within markdown, JSON, and plain text.
 
41
 
42
  Returns:
43
  tuple: (image_urls, audio_urls)
44
  """
45
- # Updated pattern for image URLs - more permissive to catch URLs in various contexts
46
  # Matches URLs ending in image extensions, allowing most characters before the extension
47
  # Stops at whitespace or common delimiters like ), ], }
48
- image_pattern = r'https?://[^\s)}\]]+?\.(?:jpg|jpeg|png|gif|webp|svg)(?:\?[^\s)}\]]*)?'
 
 
 
 
49
 
50
  # Pattern for audio URLs - handles both direct audio files AND xeno-canto links
51
  # Updated to be more permissive like image pattern
52
  audio_pattern_files = r'https?://[^\s)}\]]+?\.(?:mp3|wav|ogg|m4a)(?:\?[^\s)}\]]*)?'
53
- audio_pattern_xenocanto = r'https?://xeno-canto\.org/\d+/download'
54
 
55
  print(f"[EXTRACT_URLS] Searching text of length {len(text)}")
56
 
57
- # Extract all URLs
58
- raw_image_urls = re.findall(image_pattern, text, re.IGNORECASE)
 
59
  raw_audio_urls_files = re.findall(audio_pattern_files, text, re.IGNORECASE)
60
  audio_urls_xenocanto = list(set(re.findall(audio_pattern_xenocanto, text, re.IGNORECASE)))
61
 
62
- print(f"[EXTRACT_URLS] Found {len(raw_image_urls)} raw image URLs")
 
 
 
 
63
  print(f"[EXTRACT_URLS] Found {len(raw_audio_urls_files)} audio file URLs")
64
  print(f"[EXTRACT_URLS] Found {len(audio_urls_xenocanto)} xeno-canto URLs")
65
 
 
38
  Extract image and audio URLs from text using regex.
39
 
40
  Updated to handle URLs within markdown, JSON, and plain text.
41
+ Supports both extension-based URLs (.jpg, .png) and domain-based (Unsplash).
42
 
43
  Returns:
44
  tuple: (image_urls, audio_urls)
45
  """
46
+ # Pattern 1: Image URLs with file extensions
47
  # Matches URLs ending in image extensions, allowing most characters before the extension
48
  # Stops at whitespace or common delimiters like ), ], }
49
+ image_pattern_ext = r'https?://[^\s)}\]]+?\.(?:jpg|jpeg|png|gif|webp|svg)(?:\?[^\s)}\]]*)?'
50
+
51
+ # Pattern 2: Unsplash image URLs (no file extension needed)
52
+ # Matches: https://images.unsplash.com/photo-XXXXXXX or similar
53
+ image_pattern_unsplash = r'https?://images\.unsplash\.com/[^\s)}\]]*'
54
 
55
  # Pattern for audio URLs - handles both direct audio files AND xeno-canto links
56
  # Updated to be more permissive like image pattern
57
  audio_pattern_files = r'https?://[^\s)}\]]+?\.(?:mp3|wav|ogg|m4a)(?:\?[^\s)}\]]*)?'
58
+ audio_pattern_xenocanto = r'https?://xeno-canto\.org/\d+(?:/download)?'
59
 
60
  print(f"[EXTRACT_URLS] Searching text of length {len(text)}")
61
 
62
+ # Extract all URLs - combine both image patterns
63
+ raw_image_urls_ext = re.findall(image_pattern_ext, text, re.IGNORECASE)
64
+ raw_image_urls_unsplash = re.findall(image_pattern_unsplash, text, re.IGNORECASE)
65
  raw_audio_urls_files = re.findall(audio_pattern_files, text, re.IGNORECASE)
66
  audio_urls_xenocanto = list(set(re.findall(audio_pattern_xenocanto, text, re.IGNORECASE)))
67
 
68
+ # Combine image URLs from both patterns
69
+ raw_image_urls = raw_image_urls_ext + raw_image_urls_unsplash
70
+
71
+ print(f"[EXTRACT_URLS] Found {len(raw_image_urls_ext)} extension-based image URLs")
72
+ print(f"[EXTRACT_URLS] Found {len(raw_image_urls_unsplash)} Unsplash image URLs")
73
  print(f"[EXTRACT_URLS] Found {len(raw_audio_urls_files)} audio file URLs")
74
  print(f"[EXTRACT_URLS] Found {len(audio_urls_xenocanto)} xeno-canto URLs")
75