# Spaces: iitmbs24f / Prj2 (Sleeping)
# File size: 5,125 Bytes
# 2f95553

"""

Utility functions for the IITM LLM Quiz Solver.

"""
import hmac
import json
import logging
import re
from typing import Any, Dict, Optional
from urllib.parse import urljoin, urlparse

# Configure root logging at import time: INFO and above, each record prefixed
# with timestamp, logger name and level.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-scoped logger used by the helpers in this file.
logger = logging.getLogger(__name__)


# Punctuation that commonly trails a URL in running text and is not part of it.
_URL_TRAILING_PUNCTUATION = '.,;:!?)}]{["\''


def _clean_absolute_url(candidate: str) -> Optional[str]:
    """Strip trailing punctuation from *candidate*; return it only if it has a scheme and host."""
    url = candidate.strip().rstrip(_URL_TRAILING_PUNCTUATION)
    try:
        parsed = urlparse(url)
    except ValueError as e:
        # urlparse raises ValueError for malformed authorities (e.g. "http://[bad").
        logger.warning("Invalid URL pattern found: %s, error: %s", url, e)
        return None
    if parsed.scheme and parsed.netloc:
        return url
    return None


def extract_submit_url(text: str, base_url: str) -> Optional[str]:
    """Extract the answer-submission URL from page text.

    Absolute URLs are searched first, from the most specific phrasing to the
    most generic:

    - "Submit your answer to: https://example.com/submit"
    - "Submit to: https://example.com/submit"
    - "URL: https://example.com/submit"
    - "Post to: ..." / "Send to: ..."
    - any absolute URL containing "submit" or "answer"

    If no absolute URL qualifies, relative paths containing "submit"
    (e.g. ``href="/submit"``) are resolved against ``base_url``.

    Args:
        text: The page text (or HTML) content.
        base_url: Base URL used to resolve relative submit paths.

    Returns:
        The extracted submit URL with trailing punctuation removed, or None
        if nothing suitable was found.
    """
    if not text:
        logger.warning("No submit URL found in page text")
        return None

    # Ordered by specificity; all are applied case-insensitively.
    absolute_patterns = [
        r'[Ss]ubmit\s+(?:your\s+)?(?:answer\s+)?(?:to|at|via):\s*(https?://[^\s<>"\'\)]+)',
        r'[Ss]ubmit\s+[Tt]o:\s*(https?://[^\s<>"\'\)]+)',
        r'[Uu][Rr][Ll]:\s*(https?://[^\s<>"\'\)]+)',
        r'[Pp]ost\s+(?:to|at):\s*(https?://[^\s<>"\'\)]+)',
        r'[Ss]end\s+(?:to|at):\s*(https?://[^\s<>"\'\)]+)',
        # The last two match ANY absolute URL mentioning submit/answer, so no
        # separate "scan every URL" pass is needed afterwards.
        r'(https?://[^\s<>"\'\)]*submit[^\s<>"\'\)]*)',
        r'(https?://[^\s<>"\'\)]*answer[^\s<>"\'\)]*)',
    ]
    for pattern in absolute_patterns:
        # Try every match, not only the first: a malformed first hit must not
        # hide a valid later one for the same pattern.
        for match in re.findall(pattern, text, re.IGNORECASE):
            url = _clean_absolute_url(match)
            if url is not None:
                logger.info("Found submit URL: %s", url)
                return url

    # Relative submit links. The href form tolerates JSON-escaped quotes
    # (href=\"/submit\"); the bare form stops at whitespace, quotes and angle
    # brackets so the path is neither truncated nor over-extended.
    relative_patterns = [
        r'href=\\?["\'](/[^"\'\\]*submit[^"\'\\]*)\\?["\']',
        r'(/[^\s"<>\']*submit[^\s"<>\']*)',
    ]
    for pattern in relative_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            candidate = matches[0].strip().rstrip(_URL_TRAILING_PUNCTUATION)
            joined = urljoin(base_url, candidate)
            logger.info("Found relative submit URL: %s", joined)
            return joined

    logger.warning("No submit URL found in page text")
    return None


def validate_secret(secret: str, expected_secret: str) -> bool:
    """Check a caller-provided secret against the expected one.

    The comparison is constant-time so response timing does not reveal how
    many leading characters of a guess were correct.

    Args:
        secret: Secret provided by the caller.
        expected_secret: Expected secret from the environment.

    Returns:
        True if both values are strings and are equal, False otherwise.
        A missing (non-string, e.g. None) value on either side never
        validates, so an unset expected secret cannot be matched by an
        absent one.
    """
    if not isinstance(secret, str) or not isinstance(expected_secret, str):
        return False
    # compare_digest only accepts ASCII-only str, so compare encoded bytes;
    # surrogatepass keeps lone surrogates (possible in JSON input) from raising.
    return hmac.compare_digest(
        secret.encode("utf-8", "surrogatepass"),
        expected_secret.encode("utf-8", "surrogatepass"),
    )


def clean_text(text: str) -> str:
    """Normalize whitespace in a piece of text.

    Args:
        text: Raw text content.

    Returns:
        The text with every run of whitespace collapsed to a single space
        and surrounding whitespace removed; an empty string for falsy input.
    """
    if text:
        # Collapse first, then trim, so at most one edge space needs removing.
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()
    return ""


def extract_json_from_text(text: str) -> Optional[Dict[str, Any]]:
    """Extract the first parseable JSON object embedded in text.

    Every ``{`` is tried as a potential start, and the JSON decoder itself
    determines where the object ends. Objects nested to any depth and braces
    inside string values are therefore handled correctly.

    Args:
        text: Text that may contain JSON.

    Returns:
        The first JSON object (as a dict) that parses, or None if there is none.
    """
    if not text:
        return None

    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            # raw_decode parses one JSON value beginning at `start` and
            # ignores whatever trails it.
            parsed, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(parsed, dict):
                return parsed
        start = text.find('{', start + 1)

    return None


def is_valid_url(url: str) -> bool:
    """Report whether a string parses as a URL with both a scheme and a host.

    Args:
        url: URL string to validate.

    Returns:
        True if the parsed URL has a non-empty scheme and network location,
        False otherwise (including when parsing raises).
    """
    try:
        parts = urlparse(url)
        has_scheme, has_host = bool(parts.scheme), bool(parts.netloc)
    except Exception:
        return False
    return has_scheme and has_host


def sanitize_filename(filename: str) -> str:
    """Make a filename safe by replacing reserved characters.

    Args:
        filename: Original filename.

    Returns:
        The filename with each of ``< > : " / \\ | ? *`` replaced by an
        underscore and any leading/trailing dots and spaces removed.
    """
    # Substitute first, then trim: the replacement never produces dots or
    # spaces, so only characters already at the edges are stripped.
    replaced = re.sub(r'[<>:"/\\|?*]', '_', filename)
    return replaced.strip('. ')