|
|
"""
|
|
|
Utility functions for the IITM LLM Quiz Solver.
|
|
|
"""
|
|
|
import hmac
import json
import logging
import re
from typing import Any, Dict, Optional
from urllib.parse import urljoin, urlparse
|
|
|
|
|
|
|
|
|
logging.basicConfig(
|
|
|
level=logging.INFO,
|
|
|
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
|
|
)
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
def extract_submit_url(text: str, base_url: str) -> Optional[str]:
    """
    Extract submit URL from page text.

    Looks for patterns like:
    - "Submit your answer to: https://example.com/submit"
    - "Submit to: https://example.com/submit"
    - "URL: https://example.com/submit"

    Falls back to any absolute URL containing "submit"/"answer", and
    finally to relative hrefs/paths containing "submit", resolved
    against base_url.

    Args:
        text: The page text content
        base_url: Base URL for relative URL resolution

    Returns:
        Extracted submit URL or None
    """
    # Ordered from most to least specific; the first valid match wins.
    patterns = [
        r'[Ss]ubmit\s+(?:your\s+)?(?:answer\s+)?(?:to|at|via):\s*(https?://[^\s<>"\'\)]+)',
        r'[Ss]ubmit\s+[Tt]o:\s*(https?://[^\s<>"\'\)]+)',
        r'[Uu][Rr][Ll]:\s*(https?://[^\s<>"\'\)]+)',
        r'[Pp]ost\s+(?:to|at):\s*(https?://[^\s<>"\'\)]+)',
        r'[Ss]end\s+(?:to|at):\s*(https?://[^\s<>"\'\)]+)',
        r'(https?://[^\s<>"\'\)]*submit[^\s<>"\'\)]*)',
        r'(https?://[^\s<>"\'\)]*answer[^\s<>"\'\)]*)',
    ]

    for pattern in patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            # Trim trailing punctuation the greedy URL pattern may capture.
            url = matches[0].strip().rstrip('.,;:!?)}]{["\'')
            try:
                parsed = urlparse(url)
                if parsed.scheme and parsed.netloc:
                    logger.info(f"Found submit URL: {url}")
                    return url
            except Exception as e:
                logger.warning(f"Invalid URL pattern found: {url}, error: {e}")
                continue

    # Fallback: any absolute URL mentioning submit/answer anywhere.
    url_pattern = r'https?://[^\s<>"\'\)]+'
    all_urls = re.findall(url_pattern, text)
    for url in all_urls:
        url_lower = url.lower()
        if 'submit' in url_lower or 'answer' in url_lower:
            try:
                parsed = urlparse(url)
                if parsed.scheme and parsed.netloc:
                    logger.info(f"Found potential submit URL: {url}")
                    return url
            except Exception:  # was a bare except: — narrowed to Exception
                continue

    # Last resort: relative links/paths containing "submit", resolved
    # against base_url.
    # NOTE: the original character classes were written as ["\\\'] and
    # [^\\s...] inside raw strings, which matched a literal backslash and
    # excluded the letter "s" instead of whitespace — fixed below.
    rel_patterns = [
        r'href=["\'](/[^"\']*submit[^"\']*)["\']',
        r'(/[^\s"<>\']*submit[^\s"<>\']*)',
    ]
    for pattern in rel_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            candidate = matches[0].strip().rstrip('.,;:!?)}]{["\'')
            joined = urljoin(base_url, candidate)
            logger.info(f"Found relative submit URL: {joined}")
            return joined

    logger.warning("No submit URL found in page text")
    return None
|
|
|
|
|
|
|
|
|
def validate_secret(secret: str, expected_secret: str) -> bool:
    """
    Validate the secret key.

    Uses hmac.compare_digest for a constant-time comparison so the
    check does not leak information about the expected secret via
    timing differences (plain `==` short-circuits on the first
    mismatching character).

    Args:
        secret: Provided secret
        expected_secret: Expected secret from environment

    Returns:
        True if valid, False otherwise
    """
    # Non-string input can never equal a string secret; guard so
    # .encode() below cannot raise (preserves the original behavior of
    # returning False for e.g. None).
    if not isinstance(secret, str) or not isinstance(expected_secret, str):
        return False
    # Encode to bytes: compare_digest rejects non-ASCII str operands.
    return hmac.compare_digest(secret.encode("utf-8"),
                               expected_secret.encode("utf-8"))
|
|
|
|
|
|
|
|
|
def clean_text(text: str) -> str:
    """
    Clean and normalize text content.

    Collapses every run of whitespace (spaces, tabs, newlines) into a
    single space and trims the result. Falsy input (empty string, None)
    yields an empty string.

    Args:
        text: Raw text content

    Returns:
        Cleaned text
    """
    if not text:
        return ""
    return re.sub(r'\s+', ' ', text).strip()
|
|
|
|
|
|
|
|
|
def extract_json_from_text(text: str) -> Optional[Dict[str, Any]]:
    """
    Try to extract JSON objects from text.

    Scans for brace-delimited candidates (supporting one level of
    nesting) and returns the first one that parses as valid JSON.

    Args:
        text: Text that may contain JSON

    Returns:
        Parsed JSON dict or None
    """
    # Matches {...} with optional single-level nested {...} inside.
    brace_pattern = re.compile(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', re.DOTALL)

    for candidate in brace_pattern.finditer(text):
        try:
            return json.loads(candidate.group(0))
        except json.JSONDecodeError:
            # Not valid JSON — keep scanning for a later candidate.
            continue

    return None
|
|
|
|
|
|
|
|
|
def is_valid_url(url: str) -> bool:
    """
    Validate if a string is a valid URL.

    A URL is considered valid when it carries both a scheme
    (e.g. "https") and a network location (host).

    Args:
        url: URL string to validate

    Returns:
        True if valid URL, False otherwise
    """
    try:
        parts = urlparse(url)
    except Exception:
        return False
    return bool(parts.scheme) and bool(parts.netloc)
|
|
|
|
|
|
|
|
|
def sanitize_filename(filename: str) -> str:
    """
    Sanitize a filename by removing invalid characters.

    Replaces characters that are illegal in common filesystems
    (< > : " / \\ | ? *) with underscores, then strips leading and
    trailing dots and spaces.

    Args:
        filename: Original filename

    Returns:
        Sanitized filename
    """
    safe_name = re.sub(r'[<>:"/\\|?*]', '_', filename)
    return safe_name.strip('. ')
|
|
|
|
|
|
|