File size: 8,865 Bytes
ff0e97f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db789ae
acf4dc8
db789ae
ff0e97f
 
 
acf4dc8
db789ae
 
acf4dc8
 
 
 
 
ff0e97f
 
db789ae
 
acf4dc8
ff0e97f
17f468c
 
acf4dc8
 
 
db789ae
ff0e97f
 
acf4dc8
 
 
 
 
17f468c
 
 
db789ae
 
0588003
 
 
 
 
 
 
db789ae
0588003
 
 
 
 
db789ae
ff0e97f
 
 
0588003
 
 
 
ff0e97f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17f468c
 
 
ff0e97f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0588003
 
 
ff0e97f
 
 
 
 
 
 
 
 
 
 
0588003
 
ff0e97f
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
"""
Structured output parsing using LlamaIndex Pydantic Programs.
Ensures consistent image formatting in agent responses.

HACKATHON OPTIMIZED: Uses regex extraction instead of LLM calls for speed.
"""
from typing import List, Optional
import re
from pydantic import BaseModel, Field


class BirdIdentificationResponse(BaseModel):
    """Structured response for bird identification.

    A plain Pydantic model used as the canonical shape for agent output:
    ``parse_agent_response`` fills it from regex-extracted data and then
    renders it back to markdown. Each field carries its own ``Field``
    description, which doubles as schema documentation if this model is
    ever handed to an LLM structured-output program.
    """

    # Main response text with the identification / facts; the full raw
    # agent response is stored here unmodified.
    summary: str = Field(
        description="Main response text with bird identification, facts, or information"
    )
    # Common species name when one could be extracted, else None.
    species_name: Optional[str] = Field(
        default=None,
        description="Common name of the bird species (e.g., 'Northern Cardinal')"
    )
    # Image URLs to display for this bird (may be empty).
    image_urls: List[str] = Field(
        default_factory=list,
        description="List of image URLs to display for this bird"
    )
    # Audio URLs (bird calls/songs); xeno-canto links may carry /download.
    audio_urls: List[str] = Field(
        default_factory=list,
        description="List of audio URLs (bird calls/songs)"
    )
    # Classifier confidence in [0.0, 1.0]; currently always None — the
    # regex pipeline never populates it (see parse_agent_response).
    confidence_score: Optional[float] = Field(
        default=None,
        description="Confidence score from classifier (0.0-1.0)"
    )


def extract_urls_from_text(text: str) -> tuple[List[str], List[str]]:
    """
    Extract image and audio URLs from text using regex.

    Handles URLs inside markdown, JSON, and plain text. Supports both
    extension-based URLs (.jpg, .png, ...) and domain-based ones
    (Unsplash images, xeno-canto recording pages).

    Args:
        text: Raw response text to scan.

    Returns:
        tuple: (image_urls, audio_urls). Each list is deduplicated while
        preserving first-seen order — ``dict.fromkeys`` instead of
        ``set()``, so the output order is deterministic across runs
        (``set`` order varies under hash randomization).
    """
    # Pattern 1: image URLs with file extensions. Non-greedy up to the
    # extension; stops at whitespace or common delimiters ) ] } and
    # allows an optional query string.
    image_pattern_ext = r'https?://[^\s)}\]]+?\.(?:jpg|jpeg|png|gif|webp|svg)(?:\?[^\s)}\]]*)?'

    # Pattern 2: Unsplash image URLs (no file extension needed),
    # e.g. https://images.unsplash.com/photo-XXXXXXX
    image_pattern_unsplash = r'https?://images\.unsplash\.com/[^\s)}\]]*'

    # Audio: direct audio files AND xeno-canto recording links.
    audio_pattern_files = r'https?://[^\s)}\]]+?\.(?:mp3|wav|ogg|m4a)(?:\?[^\s)}\]]*)?'
    audio_pattern_xenocanto = r'https?://xeno-canto\.org/\d+(?:/download)?'

    print(f"[EXTRACT_URLS] Searching text of length {len(text)}")

    raw_image_urls_ext = re.findall(image_pattern_ext, text, re.IGNORECASE)
    raw_image_urls_unsplash = re.findall(image_pattern_unsplash, text, re.IGNORECASE)
    raw_audio_urls_files = re.findall(audio_pattern_files, text, re.IGNORECASE)
    raw_audio_urls_xenocanto = re.findall(audio_pattern_xenocanto, text, re.IGNORECASE)

    # Combine image URLs from both patterns
    raw_image_urls = raw_image_urls_ext + raw_image_urls_unsplash

    print(f"[EXTRACT_URLS] Found {len(raw_image_urls_ext)} extension-based image URLs")
    print(f"[EXTRACT_URLS] Found {len(raw_image_urls_unsplash)} Unsplash image URLs")
    print(f"[EXTRACT_URLS] Found {len(raw_audio_urls_files)} audio file URLs")
    print(f"[EXTRACT_URLS] Found {len(raw_audio_urls_xenocanto)} xeno-canto URLs")

    def clean_url(url: str) -> Optional[str]:
        """Strip trailing quote/comma/paren junk; None if the result is malformed."""
        # FIX: return type was annotated ``-> str`` but None is a valid result.
        cleaned = url.rstrip('",;)')
        if cleaned.startswith(('http://', 'https://')):
            return cleaned
        print(f"[EXTRACT_URLS] ⚠️ Rejected malformed URL after cleaning: {cleaned}")
        return None

    def _dedupe(urls) -> List[str]:
        """Order-preserving, deterministic deduplication."""
        return list(dict.fromkeys(urls))

    image_urls = _dedupe(u for u in map(clean_url, raw_image_urls) if u is not None)
    audio_urls_files = _dedupe(u for u in map(clean_url, raw_audio_urls_files) if u is not None)

    # xeno-canto links match a strict pattern so they need no cleaning, but
    # dedupe the combined list in case a URL matched both audio patterns.
    audio_urls = _dedupe(audio_urls_files + raw_audio_urls_xenocanto)

    # Log the actual URLs extracted
    print(f"[EXTRACT_URLS] ✅ Cleaned image URLs ({len(image_urls)}): {image_urls}")
    print(f"[EXTRACT_URLS] ✅ Cleaned audio URLs ({len(audio_urls)}): {audio_urls}")

    return image_urls, audio_urls


def extract_species_name(text: str) -> Optional[str]:
    """Pull a likely bird species name out of free-form response text.

    Tries a small set of phrasings ("identified as X", "species: X",
    "This is a X") and returns the first capitalized-phrase capture,
    or None when nothing matches.
    """
    for candidate in (
        r'identified as[:\s]+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})',
        r'species[:\s]+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})',
        r'This is (?:a |an )?([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})',
    ):
        if (hit := re.search(candidate, text)) is not None:
            return hit.group(1)
    return None


async def parse_agent_response(
    raw_response: str,
    provider: str,
    api_key: str,
    model: str
) -> str:
    """
    Parse agent response into structured format and reformat with guaranteed markdown.

    OPTIMIZED FOR HACKATHON: uses regex extraction instead of an LLM call,
    but still funnels data through the Pydantic model for structure.

    Args:
        raw_response: The agent's raw text response
        provider: LLM provider ("openai", "anthropic", "huggingface")
        api_key: API key (unused in optimized version)
        model: Model name (unused in optimized version)

    Returns:
        Formatted markdown response with guaranteed image syntax, or the
        original text unchanged when no media is found or parsing fails.
    """
    try:
        print("[STRUCTURED OUTPUT] Starting parsing...")
        print(f"[STRUCTURED OUTPUT] Raw response length: {len(raw_response)} characters")
        print(f"[STRUCTURED OUTPUT] First 500 chars: {raw_response[:500]}")
        print(f"[STRUCTURED OUTPUT] Last 500 chars: {raw_response[-500:]}")

        # Extract URLs using regex (fast, no API call)
        image_urls, audio_urls = extract_urls_from_text(raw_response)

        print(f"[STRUCTURED OUTPUT] Found {len(image_urls)} images, {len(audio_urls)} audio files")

        # Extract species name if possible
        species_name = extract_species_name(raw_response)

        # Create structured response using the Pydantic model
        structured = BirdIdentificationResponse(
            summary=raw_response,  # Keep full response as summary
            species_name=species_name,
            image_urls=image_urls,
            audio_urls=audio_urls,
            confidence_score=None  # Could extract with regex if needed
        )

        # Nothing to reformat — hand the original text straight back.
        if not structured.image_urls and not structured.audio_urls:
            print("[STRUCTURED OUTPUT] No images or audio found, returning original")
            return raw_response

        # Reformat into markdown with guaranteed images
        formatted_parts = []

        # Strip already-present media from the summary to avoid duplication.
        # FIX: remove longer URLs first so a URL that is a prefix of another
        # (e.g. the same Unsplash link with and without its query string)
        # doesn't leave "?w=800"-style debris behind after replacement.
        clean_summary = raw_response
        for url in sorted(image_urls, key=len, reverse=True):
            # Remove existing markdown images
            clean_summary = re.sub(rf'!\[([^\]]*)\]\({re.escape(url)}\)', '', clean_summary)
            # Remove plain URLs
            clean_summary = clean_summary.replace(url, '')

        for url in sorted(audio_urls, key=len, reverse=True):
            # Remove audio URLs from summary
            clean_summary = clean_summary.replace(url, '')

        formatted_parts.append(clean_summary.strip())

        # Add images with markdown syntax
        if structured.image_urls:
            formatted_parts.append("\n### Images\n")
            for idx, url in enumerate(structured.image_urls, 1):
                # Use species name if available, otherwise generic
                alt_text = structured.species_name or f"Bird {idx}"
                img_markdown = f"![{alt_text}]({url})"
                print(f"[STRUCTURED OUTPUT] Generated image markdown: {img_markdown}")
                formatted_parts.append(img_markdown)

        # Add audio links if present
        if structured.audio_urls:
            formatted_parts.append("\n### Audio Recordings\n")
            for idx, url in enumerate(structured.audio_urls, 1):
                # Strip /download from xeno-canto URLs for browser-friendly links
                display_url = url.replace("/download", "") if "xeno-canto.org" in url else url
                formatted_parts.append(f"🔊 [Listen to recording {idx}]({display_url})")

        result = "\n\n".join(formatted_parts)
        # FIX: was an f-string with no placeholders
        print("[STRUCTURED OUTPUT] ✅ Successfully formatted response")
        print(f"[STRUCTURED OUTPUT] Final markdown length: {len(result)} characters")
        print(f"[STRUCTURED OUTPUT] Final markdown (last 500 chars): {result[-500:]}")
        return result

    except Exception as e:
        # Fallback: never let formatting break the user-facing response.
        print(f"[STRUCTURED OUTPUT] ❌ Parsing failed: {e}")
        return raw_response