Prj2 / utils.py
iitmbs24f's picture
Upload 37 files
2f95553 verified
"""
Utility functions for the IITM LLM Quiz Solver.
"""
import hmac
import json
import logging
import re
from typing import Optional, Dict, Any
from urllib.parse import urlparse, urljoin
# Set up root logging at INFO with a "time - logger - level - message" layout.
# Note: logging.basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger shared by the helpers in this file.
logger = logging.getLogger(__name__)
# Punctuation that often trails a URL in running prose but is not part of it.
_TRAILING_URL_PUNCTUATION = '.,;:!?)}]{["\''

# Absolute-URL patterns, tried in order: explicit "submit to:"-style phrasing
# first, then any http(s) URL whose text contains "submit" or "answer".
# All of them are applied case-insensitively.
_ABSOLUTE_SUBMIT_PATTERNS = (
    r'[Ss]ubmit\s+(?:your\s+)?(?:answer\s+)?(?:to|at|via):\s*(https?://[^\s<>"\'\)]+)',
    r'[Ss]ubmit\s+[Tt]o:\s*(https?://[^\s<>"\'\)]+)',
    r'[Uu][Rr][Ll]:\s*(https?://[^\s<>"\'\)]+)',
    r'[Pp]ost\s+(?:to|at):\s*(https?://[^\s<>"\'\)]+)',
    r'[Ss]end\s+(?:to|at):\s*(https?://[^\s<>"\'\)]+)',
    r'(https?://[^\s<>"\'\)]*submit[^\s<>"\'\)]*)',
    r'(https?://[^\s<>"\'\)]*answer[^\s<>"\'\)]*)',
)

# Generic absolute URL, used by the unstripped fallback scan.
_ANY_URL_PATTERN = r'https?://[^\s<>"\'\)]+'

# Root-relative paths containing "submit": first inside an href attribute,
# then anywhere in the text. The bare-path pattern stops at whitespace,
# quotes, angle brackets and backslashes so it cannot run on into the
# following words or into an escaped quote.
_RELATIVE_SUBMIT_PATTERNS = (
    r'href=["\\\'](/[^"\\\']*submit[^"\\\']*)["\\\']',
    r'(/[^\s"<>\'\\]*submit[^\s"<>\'\\]*)',
)


def extract_submit_url(text: str, base_url: str) -> Optional[str]:
    """
    Extract submit URL from page text.

    Looks for patterns like:
    - "Submit your answer to: https://example.com/submit"
    - "Submit to: https://example.com/submit"
    - "URL: https://example.com/submit"

    If no absolute URL is found, root-relative paths containing "submit"
    (e.g. href="/submit") are resolved against base_url.

    Args:
        text: The page text content
        base_url: Base URL for relative URL resolution

    Returns:
        Extracted submit URL or None
    """
    if not text:
        # Nothing to scan (empty string or None).
        logger.warning("No submit URL found in page text")
        return None

    # 1. Absolute URLs. Every match of a pattern is tried, so one malformed
    #    candidate does not hide a valid one further down the page.
    for pattern in _ABSOLUTE_SUBMIT_PATTERNS:
        for raw in re.findall(pattern, text, re.IGNORECASE):
            url = raw.strip().rstrip(_TRAILING_URL_PUNCTUATION)
            try:
                parsed = urlparse(url)
            except ValueError as e:
                # urlparse rejects e.g. unbalanced IPv6 brackets.
                logger.warning("Invalid URL pattern found: %s, error: %s", url, e)
                continue
            if parsed.scheme and parsed.netloc:
                logger.info("Found submit URL: %s", url)
                return url

    # 2. Fallback: the same kind of URL, but without stripping trailing
    #    punctuation, in case stripping broke an otherwise valid URL
    #    (for instance one ending in a bracketed host).
    for url in re.findall(_ANY_URL_PATTERN, text):
        url_lower = url.lower()
        if 'submit' not in url_lower and 'answer' not in url_lower:
            continue
        try:
            parsed = urlparse(url)
        except ValueError:
            continue
        if parsed.scheme and parsed.netloc:
            logger.info("Found potential submit URL: %s", url)
            return url

    # 3. Root-relative paths, resolved against the page's base URL.
    for pattern in _RELATIVE_SUBMIT_PATTERNS:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            candidate = matches[0].strip().rstrip(_TRAILING_URL_PUNCTUATION)
            joined = urljoin(base_url, candidate)
            logger.info("Found relative submit URL: %s", joined)
            return joined

    logger.warning("No submit URL found in page text")
    return None
def validate_secret(secret: str, expected_secret: str) -> bool:
    """
    Validate the secret key.

    The comparison is done with hmac.compare_digest so that the time taken
    does not reveal how many leading characters of a guess were correct.

    Args:
        secret: Provided secret
        expected_secret: Expected secret from environment

    Returns:
        True if valid, False otherwise. A non-string value on either side
        (e.g. None because the secret was never configured or never sent)
        is always rejected.
    """
    # Plain equality would accept None == None, i.e. an unset expected
    # secret would authenticate a request that carries no secret at all.
    if not isinstance(secret, str) or not isinstance(expected_secret, str):
        return False

    # compare_digest only accepts ASCII-only str, so compare encoded bytes.
    # 'surrogatepass' keeps odd input (lone surrogates) from raising.
    provided = secret.encode('utf-8', 'surrogatepass')
    expected = expected_secret.encode('utf-8', 'surrogatepass')
    return hmac.compare_digest(provided, expected)
def clean_text(text: str) -> str:
    """
    Normalize whitespace in a piece of text.

    Every run of whitespace is collapsed to a single space and the result
    is trimmed at both ends.

    Args:
        text: Raw text content

    Returns:
        The normalized text, or an empty string for falsy input
    """
    # Collapse first, then trim whatever single space is left at the edges.
    return re.sub(r'\s+', ' ', text).strip() if text else ""
def extract_json_from_text(text: str) -> Optional[Dict[str, Any]]:
    """
    Try to extract the first JSON object embedded in text.

    Each "{" in the text is tried as the start of a JSON object, left to
    right, using the JSON decoder itself. This handles arbitrarily nested
    objects and braces inside string values.

    Args:
        text: Text that may contain JSON

    Returns:
        Parsed JSON dict or None
    """
    if not text:
        return None

    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            # raw_decode parses one JSON value beginning at `start` and
            # ignores whatever text follows it.
            parsed, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(parsed, dict):
                return parsed
        # Not valid JSON from this brace; move on to the next candidate.
        start = text.find('{', start + 1)
    return None
def is_valid_url(url: str) -> bool:
    """
    Check whether a string looks like an absolute URL.

    A URL counts as valid when parsing it yields both a scheme and a
    network location.

    Args:
        url: URL string to validate

    Returns:
        True if valid URL, False otherwise (including when parsing fails)
    """
    try:
        parts = urlparse(url)
    except Exception:
        # Anything urlparse cannot handle is simply "not a URL".
        return False
    if not parts.scheme:
        return False
    return bool(parts.netloc)
def sanitize_filename(filename: str) -> str:
    """
    Sanitize a filename by replacing invalid characters.

    Each invalid character is replaced with an underscore, then dots and
    spaces are trimmed from both ends of the name.

    Args:
        filename: Original filename

    Returns:
        Sanitized filename (may be empty if nothing is left after trimming)
    """
    # Characters swapped for "_":  < > : " / \ | ? *
    underscore_map = str.maketrans({ch: '_' for ch in '<>:"/\\|?*'})
    replaced = filename.translate(underscore_map)
    return replaced.strip('. ')