File size: 5,125 Bytes
2f95553 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
"""
Utility functions for the IITM LLM Quiz Solver.
"""
import hmac
import json
import logging
import re
from typing import Optional, Dict, Any
from urllib.parse import urlparse, urljoin
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def extract_submit_url(text: str, base_url: str) -> Optional[str]:
    """
    Extract the answer-submission URL from page text.

    Candidates are tried in priority order and the first valid one wins:

    1. Absolute URLs introduced by a label, for example:
       - "Submit your answer to: https://example.com/submit"
       - "Submit to: https://example.com/submit"
       - "URL: https://example.com/submit"
       - "Post to: https://example.com/submit"
       - "Send to: https://example.com/submit"
    2. Any absolute URL containing "submit", then any containing "answer".
    3. Relative paths containing "submit" (e.g. href="/submit"), which are
       resolved against base_url.

    Trailing punctuation such as '.', ',', ')' or quotes is stripped from
    every candidate before it is returned.

    Args:
        text: The page text content
        base_url: Base URL for relative URL resolution

    Returns:
        Extracted submit URL or None
    """
    if not text:
        logger.warning("No submit URL found in page text")
        return None

    # Characters that commonly trail a URL in prose but are not part of it.
    trailing_junk = '.,;:!?)}]{["\''

    # Absolute-URL patterns, most specific first. The last two are
    # catch-alls: together they match every http(s) URL that contains
    # "submit" or "answer", so no separate scan over all URLs is needed.
    patterns = [
        r'[Ss]ubmit\s+(?:your\s+)?(?:answer\s+)?(?:to|at|via):\s*(https?://[^\s<>"\'\)]+)',
        r'[Ss]ubmit\s+[Tt]o:\s*(https?://[^\s<>"\'\)]+)',
        r'[Uu][Rr][Ll]:\s*(https?://[^\s<>"\'\)]+)',
        r'[Pp]ost\s+(?:to|at):\s*(https?://[^\s<>"\'\)]+)',
        r'[Ss]end\s+(?:to|at):\s*(https?://[^\s<>"\'\)]+)',
        r'(https?://[^\s<>"\'\)]*submit[^\s<>"\'\)]*)',
        r'(https?://[^\s<>"\'\)]*answer[^\s<>"\'\)]*)',
    ]
    for pattern in patterns:
        # Check every hit, not just the first: one malformed URL must not
        # hide a valid one matched by the same pattern later in the text.
        for match in re.findall(pattern, text, re.IGNORECASE):
            url = match.strip().rstrip(trailing_junk)
            try:
                parsed = urlparse(url)
            except ValueError as e:
                # urlparse rejects e.g. unbalanced IPv6 brackets.
                logger.warning("Invalid URL pattern found: %s, error: %s", url, e)
                continue
            if parsed.scheme and parsed.netloc:
                logger.info("Found submit URL: %s", url)
                return url

    # Try to find relative submit links (e.g. href="/submit")
    rel_patterns = [
        r'href=["\\\'](/[^"\\\']*submit[^"\\\']*)["\\\']',
        # \s (not \\s): the path must stop at whitespace. A doubled
        # backslash here would exclude the letter "s" instead.
        r'(/[^\s"<>\']*submit[^\s"<>\']*)',
    ]
    for pattern in rel_patterns:
        found = re.search(pattern, text, re.IGNORECASE)
        if found:
            candidate = found.group(1).strip().rstrip(trailing_junk)
            joined = urljoin(base_url, candidate)
            logger.info("Found relative submit URL: %s", joined)
            return joined

    logger.warning("No submit URL found in page text")
    return None
def validate_secret(secret: str, expected_secret: str) -> bool:
    """
    Validate the secret key.

    The comparison uses hmac.compare_digest rather than ``==`` so that the
    time taken does not depend on where the first mismatching character is.

    Args:
        secret: Provided secret
        expected_secret: Expected secret from environment

    Returns:
        True if both values are strings and are equal, False otherwise.
        A missing value (anything that is not a str, e.g. None) on either
        side never validates.
    """
    # Fail closed: an unset expected secret must not match an absent one.
    if not isinstance(secret, str) or not isinstance(expected_secret, str):
        return False
    # compare_digest only accepts ASCII-only str, so compare the UTF-8 bytes;
    # surrogatepass keeps the encoding total over all str values.
    return hmac.compare_digest(
        secret.encode("utf-8", "surrogatepass"),
        expected_secret.encode("utf-8", "surrogatepass"),
    )
def clean_text(text: str) -> str:
    """
    Normalize whitespace in a piece of text.

    Every run of whitespace characters is collapsed to a single space and
    whitespace at either end is dropped.

    Args:
        text: Raw text content

    Returns:
        Cleaned text; the empty string when the input is empty or falsy
    """
    if text:
        # Collapse first, then trim what is left at the edges.
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()
    return ""
def extract_json_from_text(text: str) -> Optional[Dict[str, Any]]:
    """
    Try to extract the first JSON object embedded in text.

    The text is scanned left to right; at every '{' a JSON decode is
    attempted starting from that position, and the first object that parses
    is returned. Because the real JSON parser finds the end of the object,
    arbitrarily deep nesting and braces inside string values are handled.

    Args:
        text: Text that may contain JSON

    Returns:
        Parsed JSON dict or None
    """
    if not text:
        return None

    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            # raw_decode parses one JSON value at `start` and ignores
            # whatever follows it in the string.
            parsed, _end = decoder.raw_decode(text, start)
        except (json.JSONDecodeError, RecursionError):
            # Not valid JSON from this brace; try the next one.
            pass
        else:
            if isinstance(parsed, dict):
                return parsed
        start = text.find('{', start + 1)
    return None
def is_valid_url(url: str) -> bool:
    """
    Check whether a string looks like an absolute URL.

    A value counts as valid when it parses and has both a scheme and a
    network location (host) component.

    Args:
        url: URL string to validate

    Returns:
        True if valid URL, False otherwise (including when parsing fails)
    """
    try:
        parts = urlparse(url)
        has_scheme_and_host = bool(parts.scheme and parts.netloc)
    except Exception:
        return False
    return has_scheme_and_host
def sanitize_filename(filename: str) -> str:
    """
    Make a filename safe by replacing characters that are not allowed.

    Each of ``< > : " / \\ | ? *`` becomes an underscore, after which any
    dots and spaces at the start or end of the name are trimmed.

    Args:
        filename: Original filename

    Returns:
        Sanitized filename
    """
    invalid_chars = r'[<>:"/\\|?*]'
    replaced = re.sub(invalid_chars, '_', filename)
    # Trim after replacing, so only real dots/spaces at the edges go.
    return replaced.strip('. ')
|