File size: 8,449 Bytes
08615f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6cf3d0d
 
08615f0
 
 
 
 
6cf3d0d
08615f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23fc6bf
08615f0
23fc6bf
6cf3d0d
 
08615f0
 
 
 
23fc6bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6cf3d0d
 
 
 
 
 
 
 
 
 
08615f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8740e76
 
 
 
 
 
 
 
 
 
08615f0
 
 
8740e76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08615f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
"""

Utility functions for the IITM LLM Quiz Solver.

"""
import hmac
import json
import logging
import re
from typing import Any, Dict, Optional
from urllib.parse import urljoin, urlparse

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def extract_submit_url(text: str, base_url: str) -> Optional[str]:
    """Extract a submit URL from page text.

    Looks for patterns like:
    - "Submit your answer to: https://example.com/submit"
    - "POST to JSON to https://example.com/submit"
    - "URL: https://example.com/submit"
    - relative links such as href="/submit" (resolved against base_url)

    Falls back to "<scheme>://<host>/submit" derived from base_url when
    nothing explicit is found in the text.

    Args:
        text: The page text content.
        base_url: Base URL for relative URL resolution and for the
            default fallback endpoint.

    Returns:
        Extracted submit URL or None.
    """
    # getLogger returns the same module-level logger instance; fetching it
    # here keeps the function self-contained.
    logger = logging.getLogger(__name__)

    # Absolute-URL patterns, ordered most-specific first.  Every pattern is
    # applied with re.IGNORECASE, so case variants ("Submit"/"submit",
    # "URL:"/"url:") need not be spelled out separately.
    patterns = [
        r'submit\s+(?:your\s+)?(?:answer\s+)?(?:to|at|via):\s*(https?://[^\s<>"\'\)]+)',
        r'post\s+(?:to|at|JSON\s+to):\s*(https?://[^\s<>"\'\)]+)',
        r'post\s+to\s+JSON\s+to\s*(https?://[^\s<>"\'\)]+)',  # "POST to JSON to https://..."
        r'url:\s*(https?://[^\s<>"\'\)]+)',
        r'send\s+(?:to|at):\s*(https?://[^\s<>"\'\)]+)',
        # Any absolute URL whose text mentions submit/answer.  These two
        # also cover the old secondary "scan all URLs" pass, which was
        # redundant with them and has been removed.
        r'(https?://[^\s<>"\'\)]*submit[^\s<>"\'\)]*)',
        r'(https?://[^\s<>"\'\)]*answer[^\s<>"\'\)]*)',
    ]

    for pattern in patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if not matches:
            continue
        url = matches[0].strip().rstrip('.,;:!?)}]{["\'')
        # urlparse only raises ValueError (e.g. malformed ports/IPv6).
        try:
            parsed = urlparse(url)
        except ValueError as e:
            logger.warning(f"Invalid URL pattern found: {url}, error: {e}")
            continue
        if parsed.scheme and parsed.netloc:
            logger.info(f"Found submit URL: {url}")
            return url

    # Relative submit links (e.g. href="/submit" or "POST to: /submit").
    # Be strict: only match actual submit endpoints, not paths that merely
    # appear near the word "submit" in prose.
    rel_patterns = [
        r'href=["\'](/[^"\']*submit[^"\']*)["\']',  # href="/submit" or href="/api/submit"
        r'POST\s+to\s+JSON\s+to\s+(/[^\s<>"\'\)]+)',  # "POST to JSON to /submit"
        r'post\s+(?:to|at):\s+(/[^\s<>"\'\)]+)',  # "POST to: /submit"
    ]
    for pattern in rel_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if not matches:
            continue
        # Cut at the first space or parenthesis so surrounding prose such as
        # "/path (description).Submit" is not captured, then strip trailing
        # punctuation.
        candidate = re.sub(r'[\s\(\)].*$', '', matches[0].strip())
        candidate = candidate.rstrip('.,;:!?)}]{["\'')
        lowered = candidate.lower()
        # Require "submit" in the path and reject document-looking paths.
        if 'submit' not in lowered:
            continue
        if any(ext in lowered for ext in ('.md', '.txt', '.pdf', '.html')):
            continue
        try:
            joined = urljoin(base_url, candidate)
            parsed = urlparse(joined)
        except ValueError as e:
            logger.warning(f"Invalid relative URL candidate: {candidate}, error: {e}")
            continue
        if parsed.scheme and parsed.netloc:
            logger.info(f"Found relative submit URL: {joined}")
            return joined

    # Last resort: assume a /submit endpoint on the base domain — but only
    # when base_url actually yields a usable scheme and host (the old code
    # could fabricate invalid URLs like ":///submit" from junk input).
    if base_url:
        parsed = urlparse(base_url)
        if parsed.scheme and parsed.netloc:
            submit_url = f"{parsed.scheme}://{parsed.netloc}/submit"
            logger.info(f"Trying default submit URL: {submit_url}")
            return submit_url

    logger.warning("No submit URL found in page text")
    return None


def validate_secret(secret: str, expected_secret: str) -> bool:
    """Validate the secret key.

    Uses hmac.compare_digest for a constant-time comparison, so the
    check does not leak information about the expected secret through
    timing differences (a plain == short-circuits on the first
    mismatching character).

    Args:
        secret: Provided secret.
        expected_secret: Expected secret from environment.

    Returns:
        True if valid, False otherwise.
    """
    # Preserve the old ==-semantics for non-string inputs (e.g. None).
    if not isinstance(secret, str) or not isinstance(expected_secret, str):
        return secret == expected_secret
    # Encode to bytes: compare_digest on str raises for non-ASCII input.
    return hmac.compare_digest(secret.encode("utf-8"),
                               expected_secret.encode("utf-8"))


def clean_text(text: str) -> str:
    """Clean and normalize text content.

    Collapses every run of whitespace (spaces, tabs, newlines) to a
    single space and trims the ends.

    Args:
        text: Raw text content.

    Returns:
        Cleaned text; empty string for falsy input.
    """
    if not text:
        return ""
    # split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, so joining normalizes in one pass.
    return " ".join(text.split())


def extract_json_from_text(text: str) -> Optional[Dict[str, Any]]:
    """Try to extract a JSON object embedded in text.

    Scans for each "{" and attempts a full parse from that position with
    json.JSONDecoder.raw_decode, which handles arbitrarily deep nesting
    (the previous regex only matched one level of nested braces).  Falls
    back to stripping markdown code fences and parsing the whole text.

    Args:
        text: Text that may contain JSON.

    Returns:
        Parsed JSON dict or None.
    """
    decoder = json.JSONDecoder()
    for start, ch in enumerate(text):
        if ch != '{':
            continue
        try:
            obj, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            continue
        # raw_decode can yield any JSON value; only objects qualify here.
        if isinstance(obj, dict):
            return obj

    # Fallback: remove markdown code fences (```json ... ```) and try to
    # parse the cleaned text as a whole.
    cleaned = re.sub(r'```json\s*', '', text)
    cleaned = re.sub(r'```\s*', '', cleaned)
    try:
        return json.loads(cleaned.strip())
    except json.JSONDecodeError:
        return None


def safe_extract_json(text: str, max_retries: int = 1) -> Optional[Dict[str, Any]]:
    """Safely extract JSON with better error handling.

    First delegates to extract_json_from_text; on failure, strips
    markdown code fences and retries once.

    Args:
        text: Text that may contain JSON.
        max_retries: Maximum retry attempts (currently unused; kept for
            interface compatibility).

    Returns:
        Parsed JSON dict or None.
    """
    parsed = extract_json_from_text(text)
    if parsed:
        return parsed

    # Strip common markdown formatting (``` / ```json fences) and retry.
    stripped = text.strip()
    stripped = re.sub(r'^```(?:json)?\s*', '', stripped, flags=re.MULTILINE)
    stripped = re.sub(r'\s*```$', '', stripped, flags=re.MULTILINE)
    return extract_json_from_text(stripped)


def is_valid_url(url: str) -> bool:
    """Validate if a string is a valid URL.

    A URL is considered valid when it parses with both a scheme
    (e.g. "https") and a network location (host).

    Args:
        url: URL string to validate.

    Returns:
        True if valid URL, False otherwise.
    """
    try:
        parts = urlparse(url)
    except Exception:
        return False
    return bool(parts.scheme) and bool(parts.netloc)


def sanitize_filename(filename: str) -> str:
    """Sanitize a filename by removing invalid characters.

    Each character that is illegal in common filesystems is replaced
    with an underscore, then leading/trailing dots and spaces are
    stripped.

    Args:
        filename: Original filename.

    Returns:
        Sanitized filename.
    """
    # Map every invalid character to "_" in a single translate pass.
    invalid_chars = '<>:"/\\|?*'
    table = str.maketrans(dict.fromkeys(invalid_chars, '_'))
    return filename.translate(table).strip('. ')