"""Utility functions for the IITM LLM Quiz Solver."""

import re
import json
import logging
from typing import Optional, Dict, Any
from urllib.parse import urlparse, urljoin

# Configure logging once at import time for the whole module.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Body of an absolute/relative URL: stop at whitespace, angle brackets,
# quotes, and a closing paren (common delimiters in surrounding prose).
_URL_BODY = r'[^\s<>"\'\)]+'

# Punctuation that frequently clings to the end of URLs embedded in text.
_TRAILING_PUNCT = '.,;:!?)}]{["\''


def extract_submit_url(text: str, base_url: str) -> Optional[str]:
    """Extract a submit URL from page text.

    Looks for patterns like:
    - "Submit your answer to: https://example.com/submit"
    - "POST to JSON to https://example.com/submit"
    - "URL: https://example.com/submit"
    - relative links such as href="/submit" or "POST to: /submit"

    Falls back to ``<scheme>://<netloc>/submit`` derived from *base_url*
    when nothing explicit is found (deliberate best-effort behavior).

    Args:
        text: The page text content.
        base_url: Base URL used to resolve relative candidates and to
            build the default fallback.

    Returns:
        The extracted submit URL, or None only when *base_url* is empty
        and no URL could be found.
    """
    # Ordered patterns: explicit "submit/post/send to:" phrasing first,
    # then any absolute URL whose path mentions submit/answer.
    # All are matched with re.IGNORECASE, so no [Ss]-style classes needed.
    patterns = [
        rf'submit\s+(?:your\s+)?(?:answer\s+)?(?:to|at|via):\s*(https?://{_URL_BODY})',
        rf'post\s+(?:to|at|JSON\s+to):\s*(https?://{_URL_BODY})',
        rf'post\s+to\s+JSON\s+to\s*(https?://{_URL_BODY})',  # no colon variant
        rf'url:\s*(https?://{_URL_BODY})',
        rf'post\s+(?:to|at):\s*(https?://{_URL_BODY})',
        rf'send\s+(?:to|at):\s*(https?://{_URL_BODY})',
        r'(https?://[^\s<>"\'\)]*submit[^\s<>"\'\)]*)',
        r'(https?://[^\s<>"\'\)]*answer[^\s<>"\'\)]*)',
    ]

    for pattern in patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if not matches:
            continue
        url = matches[0].strip().rstrip(_TRAILING_PUNCT)
        try:
            parsed = urlparse(url)
            if parsed.scheme and parsed.netloc:
                logger.info("Found submit URL: %s", url)
                return url
        except Exception as e:  # urlparse rarely raises; be defensive anyway
            logger.warning("Invalid URL pattern found: %s, error: %s", url, e)
            continue

    # Fallback: scan every absolute URL for a submit/answer hint.
    all_urls = re.findall(rf'https?://{_URL_BODY}', text)
    for url in all_urls:
        url_lower = url.lower()
        if 'submit' in url_lower or 'answer' in url_lower:
            try:
                parsed = urlparse(url)
                if parsed.scheme and parsed.netloc:
                    logger.info("Found potential submit URL: %s", url)
                    return url
            except Exception:
                continue

    # Relative submit links (e.g. href="/submit"). Be strict: only accept
    # actual submit endpoints, not document paths that mention "submit"
    # in the surrounding prose (e.g. "...data-preparation.md ... Submit").
    rel_patterns = [
        r'href=["\'](/[^"\']*submit[^"\']*)["\']',       # href="/api/submit"
        rf'post\s+to\s+JSON\s+to\s+(/{_URL_BODY})',      # "POST to JSON to /submit"
        rf'post\s+(?:to|at):\s+(/{_URL_BODY})',          # "POST to: /submit"
    ]
    for pattern in rel_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if not matches:
            continue
        candidate = matches[0].strip()
        # Stop at the first space or parenthesis so trailing prose like
        # "/path (description).Submit" cannot leak into the path.
        candidate = re.sub(r'[\s\(\)].*$', '', candidate)
        candidate = candidate.rstrip(_TRAILING_PUNCT)
        # Require "submit" in the path itself and reject document-looking
        # extensions that indicate a file rather than an endpoint.
        doc_exts = ('.md', '.txt', '.pdf', '.html')
        if 'submit' in candidate.lower() and not any(
            ext in candidate.lower() for ext in doc_exts
        ):
            try:
                joined = urljoin(base_url, candidate)
                parsed = urlparse(joined)
                if parsed.scheme and parsed.netloc:
                    logger.info("Found relative submit URL: %s", joined)
                    return joined
            except Exception as e:
                logger.warning(
                    "Invalid relative URL candidate: %s, error: %s",
                    candidate, e,
                )
                continue

    # Last resort: assume <base domain>/submit.
    if base_url:
        try:
            parsed = urlparse(base_url)
            submit_url = f"{parsed.scheme}://{parsed.netloc}/submit"
            logger.info("Trying default submit URL: %s", submit_url)
            return submit_url
        except Exception:
            pass

    logger.warning("No submit URL found in page text")
    return None


def validate_secret(secret: str, expected_secret: str) -> bool:
    """Validate the secret key.

    Args:
        secret: Provided secret.
        expected_secret: Expected secret from environment.

    Returns:
        True if valid, False otherwise.
    """
    return secret == expected_secret


def clean_text(text: str) -> str:
    """Clean and normalize text content.

    Collapses all runs of whitespace to single spaces and strips the ends.

    Args:
        text: Raw text content.

    Returns:
        Cleaned text ("" for falsy input).
    """
    if not text:
        return ""
    return re.sub(r'\s+', ' ', text).strip()


def extract_json_from_text(text: str) -> Optional[Dict[str, Any]]:
    """Try to extract a JSON object from text.

    First scans for brace-balanced object literals (one level of nesting),
    then falls back to stripping markdown code fences and parsing the
    whole remainder.

    Args:
        text: Text that may contain JSON.

    Returns:
        Parsed JSON dict or None.
    """
    # Matches {...} with at most one level of nested objects.
    json_pattern = r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}'
    for match in re.findall(json_pattern, text, re.DOTALL):
        try:
            return json.loads(match)
        except json.JSONDecodeError:
            continue

    # Fallback: remove markdown code fences and parse what remains.
    try:
        cleaned = re.sub(r'```json\s*', '', text)
        cleaned = re.sub(r'```\s*', '', cleaned)
        parsed = json.loads(cleaned.strip())
        # Guard the declared return type: the fallback parse could yield
        # a list/scalar, which callers expecting a dict must not receive.
        if isinstance(parsed, dict):
            return parsed
    except json.JSONDecodeError:
        pass

    return None


def safe_extract_json(text: str, max_retries: int = 1) -> Optional[Dict[str, Any]]:
    """Safely extract JSON with better error handling.

    Args:
        text: Text that may contain JSON.
        max_retries: Unused; kept for backward compatibility.

    Returns:
        Parsed JSON dict or None.
    """
    result = extract_json_from_text(text)
    # "is not None": an empty-but-valid JSON object {} must count as success.
    if result is not None:
        return result

    # Retry after stripping whitespace and markdown fences.
    fixed_text = text.strip()
    fixed_text = re.sub(r'^```(?:json)?\s*', '', fixed_text, flags=re.MULTILINE)
    fixed_text = re.sub(r'\s*```$', '', fixed_text, flags=re.MULTILINE)
    return extract_json_from_text(fixed_text)


def is_valid_url(url: str) -> bool:
    """Validate if a string is a valid URL.

    Args:
        url: URL string to validate.

    Returns:
        True if the URL has both a scheme and a network location.
    """
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except Exception:
        return False


def sanitize_filename(filename: str) -> str:
    """Sanitize a filename by removing invalid characters.

    Replaces characters invalid on common filesystems with underscores
    and strips leading/trailing dots and spaces.

    Args:
        filename: Original filename.

    Returns:
        Sanitized filename.
    """
    filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
    return filename.strip('. ')