File size: 5,125 Bytes
2f95553
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
"""

Utility functions for the IITM LLM Quiz Solver.

"""
import hmac
import json
import logging
import re
from typing import Any, Dict, Optional
from urllib.parse import urljoin, urlparse

# Set up root logging at import time: INFO level, with timestamp, logger name,
# level and message in every record.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-scoped logger, named after this module, used by the helpers below.
logger = logging.getLogger(__name__)


# Characters that commonly trail a URL in prose (sentence punctuation, closing
# brackets/quotes) and are not part of the URL itself.
_TRAILING_PUNCTUATION = '.,;:!?)}]{["\''


def _usable_absolute_url(candidate: str) -> Optional[str]:
    """Return candidate without trailing punctuation if it has a scheme and host, else None."""
    url = candidate.strip().rstrip(_TRAILING_PUNCTUATION)
    try:
        parsed = urlparse(url)
    except ValueError as exc:
        # urlparse rejects malformed authorities (e.g. an unbalanced IPv6 bracket).
        logger.warning("Invalid URL pattern found: %s, error: %s", url, exc)
        return None
    if parsed.scheme and parsed.netloc:
        return url
    return None


def extract_submit_url(text: str, base_url: str) -> Optional[str]:
    """
    Extract the submit URL from page text.

    The search runs in three stages and returns the first hit:

    1. Labelled absolute URLs, e.g.
       - "Submit your answer to: https://example.com/submit"
       - "Submit to: https://example.com/submit"
       - "URL: https://example.com/submit"
       followed by any absolute URL containing "submit" or "answer".
    2. Any absolute http(s) URL whose text contains "submit" or "answer".
    3. Relative paths containing "submit" (e.g. href="/submit"), resolved
       against ``base_url``.

    Trailing sentence punctuation is stripped from every candidate.

    Args:
        text: The page text content. Empty or None yields None.
        base_url: Base URL used to resolve relative submit paths.

    Returns:
        The extracted submit URL, or None when nothing suitable is found.
    """
    if not text:
        logger.warning("No submit URL found in page text")
        return None

    # Stage 1: labelled URLs first, then any URL mentioning submit/answer.
    patterns = [
        r'[Ss]ubmit\s+(?:your\s+)?(?:answer\s+)?(?:to|at|via):\s*(https?://[^\s<>"\'\)]+)',
        r'[Ss]ubmit\s+[Tt]o:\s*(https?://[^\s<>"\'\)]+)',
        r'[Uu][Rr][Ll]:\s*(https?://[^\s<>"\'\)]+)',
        r'[Pp]ost\s+(?:to|at):\s*(https?://[^\s<>"\'\)]+)',
        r'[Ss]end\s+(?:to|at):\s*(https?://[^\s<>"\'\)]+)',
        r'(https?://[^\s<>"\'\)]*submit[^\s<>"\'\)]*)',
        r'(https?://[^\s<>"\'\)]*answer[^\s<>"\'\)]*)',
    ]
    for pattern in patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if not matches:
            continue
        url = _usable_absolute_url(matches[0])
        if url is not None:
            logger.info("Found submit URL: %s", url)
            return url

    # Stage 2: stage 1 only inspects the first match per pattern, so walk every
    # absolute URL in case an earlier candidate was malformed.
    for candidate in re.findall(r'https?://[^\s<>"\'\)]+', text):
        lowered = candidate.lower()
        if 'submit' not in lowered and 'answer' not in lowered:
            continue
        url = _usable_absolute_url(candidate)
        if url is not None:
            logger.info("Found potential submit URL: %s", url)
            return url

    # Stage 3: relative submit links (e.g. href="/submit").
    rel_patterns = [
        r'href=["\\\'](/[^"\\\']*submit[^"\\\']*)["\\\']',
        # \s (not \\s): inside a raw string a doubled backslash would exclude
        # a literal backslash and the letter "s" instead of whitespace.
        r'(/[^\s"<>\']*submit[^\s"<>\']*)',
    ]
    for pattern in rel_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if not matches:
            continue
        candidate = matches[0].strip().rstrip(_TRAILING_PUNCTUATION)
        try:
            joined = urljoin(base_url, candidate)
        except ValueError as exc:
            # urljoin parses both arguments and can reject malformed input.
            logger.warning(
                "Could not resolve relative submit URL %s against %s: %s",
                candidate, base_url, exc,
            )
            continue
        logger.info("Found relative submit URL: %s", joined)
        return joined

    logger.warning("No submit URL found in page text")
    return None


def validate_secret(secret: str, expected_secret: str) -> bool:
    """
    Validate the secret key.

    The comparison uses ``hmac.compare_digest`` so the time taken does not
    reveal how many leading characters of the provided secret were correct.

    Args:
        secret: Provided secret
        expected_secret: Expected secret from environment

    Returns:
        True if both values are strings and are equal, False otherwise.
        Non-string values (e.g. None for a missing secret) never validate,
        so an unset expected secret cannot be matched by an absent one.
    """
    if not isinstance(secret, str) or not isinstance(expected_secret, str):
        return False
    # compare_digest only accepts ASCII str, so compare the UTF-8 bytes;
    # surrogatepass keeps lone surrogates from raising UnicodeEncodeError.
    provided = secret.encode("utf-8", "surrogatepass")
    expected = expected_secret.encode("utf-8", "surrogatepass")
    return hmac.compare_digest(provided, expected)


def clean_text(text: str) -> str:
    """
    Collapse whitespace runs and trim both ends of a piece of text.

    Args:
        text: Raw text content; any falsy value (such as "" or None) yields "".

    Returns:
        The text with each run of whitespace replaced by a single space and
        with no leading or trailing whitespace.
    """
    # Falsy input maps straight to the empty string.
    if text:
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()
    return ""


def extract_json_from_text(text: str) -> Optional[Dict[str, Any]]:
    """
    Try to extract the first JSON object embedded in text.

    Every "{" in the text is tried, left to right, as the start of a JSON
    document using the real JSON parser. This handles arbitrarily nested
    objects and braces inside string values, which a brace-matching regex
    cannot.

    Args:
        text: Text that may contain JSON. Empty or None yields None.

    Returns:
        The first JSON object that parses, as a dict, or None if there is none.
    """
    if not text:
        return None

    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            # raw_decode parses one document starting at `start` and ignores
            # whatever follows it, so surrounding prose is fine.
            parsed, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(parsed, dict):
                return parsed
        start = text.find('{', start + 1)

    return None


def is_valid_url(url: str) -> bool:
    """
    Report whether a string parses as a URL with both a scheme and a host.

    Args:
        url: URL string to validate

    Returns:
        True when parsing succeeds and both the scheme and the network
        location are non-empty; False otherwise, including when parsing
        raises.
    """
    try:
        parts = urlparse(url)
    except Exception:
        # Anything the parser rejects is simply not a valid URL.
        return False
    return bool(parts.scheme and parts.netloc)


def sanitize_filename(filename: str) -> str:
    """
    Make a filename safe by replacing reserved characters.

    Args:
        filename: Original filename

    Returns:
        The filename with each of < > : " / \\ | ? * replaced by an
        underscore and with leading/trailing dots and spaces removed. The
        result can be empty (e.g. for "...").
    """
    # Replace reserved characters first, then trim dots/spaces from both ends.
    replaced = re.sub(r'[<>:"/\\|?*]', '_', filename)
    return replaced.strip('. ')