File size: 8,449 Bytes
08615f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6cf3d0d
 
08615f0
 
 
 
 
6cf3d0d
08615f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23fc6bf
08615f0
23fc6bf
6cf3d0d
 
08615f0
 
 
 
23fc6bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6cf3d0d
 
 
 
 
 
 
 
 
 
08615f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8740e76
 
 
 
 
 
 
 
 
 
08615f0
 
 
8740e76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08615f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
"""

Utility functions for the IITM LLM Quiz Solver.

"""
import hmac
import json
import logging
import re
from typing import Any, Dict, Optional
from urllib.parse import urljoin, urlparse

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def extract_submit_url(text: str, base_url: str) -> Optional[str]:
    """Extract a submit URL from page text.

    Looks for patterns like:
    - "Submit your answer to: https://example.com/submit"
    - "POST to JSON to https://example.com/submit"
    - "URL: https://example.com/submit"
    - relative links such as href="/submit" (resolved against base_url)

    Falls back to "<scheme>://<host>/submit" derived from base_url when
    nothing explicit is found in the text.

    Args:
        text: The page text content.
        base_url: Base URL for relative URL resolution and for the
            default fallback endpoint.

    Returns:
        Extracted submit URL or None.
    """
    # getLogger returns the same module-level logger instance; fetching it
    # here keeps the function self-contained.
    logger = logging.getLogger(__name__)

    # Absolute-URL patterns, ordered most-specific first.  Every pattern is
    # applied with re.IGNORECASE, so case variants ("Submit"/"submit",
    # "URL:"/"url:") need not be spelled out separately.
    patterns = [
        r'submit\s+(?:your\s+)?(?:answer\s+)?(?:to|at|via):\s*(https?://[^\s<>"\'\)]+)',
        r'post\s+(?:to|at|JSON\s+to):\s*(https?://[^\s<>"\'\)]+)',
        r'post\s+to\s+JSON\s+to\s*(https?://[^\s<>"\'\)]+)',  # "POST to JSON to https://..."
        r'url:\s*(https?://[^\s<>"\'\)]+)',
        r'send\s+(?:to|at):\s*(https?://[^\s<>"\'\)]+)',
        # Any absolute URL whose text mentions submit/answer.  These two
        # also cover the old secondary "scan all URLs" pass, which was
        # redundant with them and has been removed.
        r'(https?://[^\s<>"\'\)]*submit[^\s<>"\'\)]*)',
        r'(https?://[^\s<>"\'\)]*answer[^\s<>"\'\)]*)',
    ]

    for pattern in patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if not matches:
            continue
        url = matches[0].strip().rstrip('.,;:!?)}]{["\'')
        # urlparse only raises ValueError (e.g. malformed ports/IPv6).
        try:
            parsed = urlparse(url)
        except ValueError as e:
            logger.warning(f"Invalid URL pattern found: {url}, error: {e}")
            continue
        if parsed.scheme and parsed.netloc:
            logger.info(f"Found submit URL: {url}")
            return url

    # Relative submit links (e.g. href="/submit" or "POST to: /submit").
    # Be strict: only match actual submit endpoints, not paths that merely
    # appear near the word "submit" in prose.
    rel_patterns = [
        r'href=["\'](/[^"\']*submit[^"\']*)["\']',  # href="/submit" or href="/api/submit"
        r'POST\s+to\s+JSON\s+to\s+(/[^\s<>"\'\)]+)',  # "POST to JSON to /submit"
        r'post\s+(?:to|at):\s+(/[^\s<>"\'\)]+)',  # "POST to: /submit"
    ]
    for pattern in rel_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if not matches:
            continue
        # Cut at the first space or parenthesis so surrounding prose such as
        # "/path (description).Submit" is not captured, then strip trailing
        # punctuation.
        candidate = re.sub(r'[\s\(\)].*$', '', matches[0].strip())
        candidate = candidate.rstrip('.,;:!?)}]{["\'')
        lowered = candidate.lower()
        # Require "submit" in the path and reject document-looking paths.
        if 'submit' not in lowered:
            continue
        if any(ext in lowered for ext in ('.md', '.txt', '.pdf', '.html')):
            continue
        try:
            joined = urljoin(base_url, candidate)
            parsed = urlparse(joined)
        except ValueError as e:
            logger.warning(f"Invalid relative URL candidate: {candidate}, error: {e}")
            continue
        if parsed.scheme and parsed.netloc:
            logger.info(f"Found relative submit URL: {joined}")
            return joined

    # Last resort: assume a /submit endpoint on the base domain — but only
    # when base_url actually yields a usable scheme and host (the old code
    # could fabricate invalid URLs like ":///submit" from junk input).
    if base_url:
        parsed = urlparse(base_url)
        if parsed.scheme and parsed.netloc:
            submit_url = f"{parsed.scheme}://{parsed.netloc}/submit"
            logger.info(f"Trying default submit URL: {submit_url}")
            return submit_url

    logger.warning("No submit URL found in page text")
    return None


def validate_secret(secret: str, expected_secret: str) -> bool:
    """Validate the secret key.

    Uses hmac.compare_digest for a constant-time comparison, so the
    check does not leak information about the expected secret through
    timing differences (a plain == short-circuits on the first
    mismatching character).

    Args:
        secret: Provided secret.
        expected_secret: Expected secret from environment.

    Returns:
        True if valid, False otherwise.
    """
    # Preserve the old ==-semantics for non-string inputs (e.g. None).
    if not isinstance(secret, str) or not isinstance(expected_secret, str):
        return secret == expected_secret
    # Encode to bytes: compare_digest on str raises for non-ASCII input.
    return hmac.compare_digest(secret.encode("utf-8"),
                               expected_secret.encode("utf-8"))


def clean_text(text: str) -> str:
    """Clean and normalize text content.

    Collapses every run of whitespace (spaces, tabs, newlines) to a
    single space and trims the ends.

    Args:
        text: Raw text content.

    Returns:
        Cleaned text; empty string for falsy input.
    """
    if not text:
        return ""
    # split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, so joining normalizes in one pass.
    return " ".join(text.split())


def extract_json_from_text(text: str) -> Optional[Dict[str, Any]]:
    """Try to extract a JSON object embedded in text.

    Scans for each "{" and attempts a full parse from that position with
    json.JSONDecoder.raw_decode, which handles arbitrarily deep nesting
    (the previous regex only matched one level of nested braces).  Falls
    back to stripping markdown code fences and parsing the whole text.

    Args:
        text: Text that may contain JSON.

    Returns:
        Parsed JSON dict or None.
    """
    decoder = json.JSONDecoder()
    for start, ch in enumerate(text):
        if ch != '{':
            continue
        try:
            obj, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            continue
        # raw_decode can yield any JSON value; only objects qualify here.
        if isinstance(obj, dict):
            return obj

    # Fallback: remove markdown code fences (```json ... ```) and try to
    # parse the cleaned text as a whole.
    cleaned = re.sub(r'```json\s*', '', text)
    cleaned = re.sub(r'```\s*', '', cleaned)
    try:
        return json.loads(cleaned.strip())
    except json.JSONDecodeError:
        return None


def safe_extract_json(text: str, max_retries: int = 1) -> Optional[Dict[str, Any]]:
    """Safely extract JSON with better error handling.

    First delegates to extract_json_from_text; on failure, strips
    markdown code fences and retries once.

    Args:
        text: Text that may contain JSON.
        max_retries: Maximum retry attempts (currently unused; kept for
            interface compatibility).

    Returns:
        Parsed JSON dict or None.
    """
    parsed = extract_json_from_text(text)
    if parsed:
        return parsed

    # Strip common markdown formatting (``` / ```json fences) and retry.
    stripped = text.strip()
    stripped = re.sub(r'^```(?:json)?\s*', '', stripped, flags=re.MULTILINE)
    stripped = re.sub(r'\s*```$', '', stripped, flags=re.MULTILINE)
    return extract_json_from_text(stripped)


def is_valid_url(url: str) -> bool:
    """Validate if a string is a valid URL.

    A URL is considered valid when it parses with both a scheme
    (e.g. "https") and a network location (host).

    Args:
        url: URL string to validate.

    Returns:
        True if valid URL, False otherwise.
    """
    try:
        parts = urlparse(url)
    except Exception:
        return False
    return bool(parts.scheme) and bool(parts.netloc)


def sanitize_filename(filename: str) -> str:
    """Sanitize a filename by removing invalid characters.

    Each character that is illegal in common filesystems is replaced
    with an underscore, then leading/trailing dots and spaces are
    stripped.

    Args:
        filename: Original filename.

    Returns:
        Sanitized filename.
    """
    # Map every invalid character to "_" in a single translate pass.
    invalid_chars = '<>:"/\\|?*'
    table = str.maketrans(dict.fromkeys(invalid_chars, '_'))
    return filename.translate(table).strip('. ')