|
|
"""
|
|
|
Utility functions for the IITM LLM Quiz Solver.
|
|
|
"""
|
|
|
import hmac
import json
import logging
import re
from typing import Any, Dict, Optional
from urllib.parse import urljoin, urlparse
|
|
|
|
|
|
|
|
|
logging.basicConfig(
|
|
|
level=logging.INFO,
|
|
|
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
|
|
)
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
def extract_submit_url(text: str, base_url: str) -> Optional[str]:
    """
    Extract submit URL from page text.

    Looks for patterns like:
    - "Submit your answer to: https://example.com/submit"
    - "Submit to: https://example.com/submit"
    - "URL: https://example.com/submit"

    Falls back to any absolute URL containing "submit"/"answer", and
    finally to relative hrefs/paths containing "submit", resolved
    against base_url.

    Args:
        text: The page text content
        base_url: Base URL for relative URL resolution

    Returns:
        Extracted submit URL or None
    """
    # Ordered from most to least specific; the first valid match wins.
    patterns = [
        r'[Ss]ubmit\s+(?:your\s+)?(?:answer\s+)?(?:to|at|via):\s*(https?://[^\s<>"\'\)]+)',
        r'[Ss]ubmit\s+[Tt]o:\s*(https?://[^\s<>"\'\)]+)',
        r'[Uu][Rr][Ll]:\s*(https?://[^\s<>"\'\)]+)',
        r'[Pp]ost\s+(?:to|at):\s*(https?://[^\s<>"\'\)]+)',
        r'[Ss]end\s+(?:to|at):\s*(https?://[^\s<>"\'\)]+)',
        r'(https?://[^\s<>"\'\)]*submit[^\s<>"\'\)]*)',
        r'(https?://[^\s<>"\'\)]*answer[^\s<>"\'\)]*)',
    ]

    for pattern in patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            # Trim trailing punctuation the greedy URL pattern may capture.
            url = matches[0].strip().rstrip('.,;:!?)}]{["\'')
            try:
                parsed = urlparse(url)
                if parsed.scheme and parsed.netloc:
                    logger.info(f"Found submit URL: {url}")
                    return url
            except Exception as e:
                logger.warning(f"Invalid URL pattern found: {url}, error: {e}")
                continue

    # Fallback: any absolute URL mentioning submit/answer anywhere.
    url_pattern = r'https?://[^\s<>"\'\)]+'
    all_urls = re.findall(url_pattern, text)
    for url in all_urls:
        url_lower = url.lower()
        if 'submit' in url_lower or 'answer' in url_lower:
            try:
                parsed = urlparse(url)
                if parsed.scheme and parsed.netloc:
                    logger.info(f"Found potential submit URL: {url}")
                    return url
            except Exception:  # was a bare except: — narrowed to Exception
                continue

    # Last resort: relative links/paths containing "submit", resolved
    # against base_url.
    # NOTE: the original character classes were written as ["\\\'] and
    # [^\\s...] inside raw strings, which matched a literal backslash and
    # excluded the letter "s" instead of whitespace — fixed below.
    rel_patterns = [
        r'href=["\'](/[^"\']*submit[^"\']*)["\']',
        r'(/[^\s"<>\']*submit[^\s"<>\']*)',
    ]
    for pattern in rel_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            candidate = matches[0].strip().rstrip('.,;:!?)}]{["\'')
            joined = urljoin(base_url, candidate)
            logger.info(f"Found relative submit URL: {joined}")
            return joined

    logger.warning("No submit URL found in page text")
    return None
|
|
|
|
|
|
|
|
|
def validate_secret(secret: str, expected_secret: str) -> bool:
    """
    Validate the secret key.

    Uses hmac.compare_digest for a constant-time comparison so the
    check does not leak information about the expected secret via
    timing differences (plain `==` short-circuits on the first
    mismatching character).

    Args:
        secret: Provided secret
        expected_secret: Expected secret from environment

    Returns:
        True if valid, False otherwise
    """
    # Non-string input can never equal a string secret; guard so
    # .encode() below cannot raise (preserves the original behavior of
    # returning False for e.g. None).
    if not isinstance(secret, str) or not isinstance(expected_secret, str):
        return False
    # Encode to bytes: compare_digest rejects non-ASCII str operands.
    return hmac.compare_digest(secret.encode("utf-8"),
                               expected_secret.encode("utf-8"))
|
|
|
|
|
|
|
|
|
def clean_text(text: str) -> str:
    """
    Clean and normalize text content.

    Collapses every run of whitespace (spaces, tabs, newlines) into a
    single space and trims the result. Falsy input (empty string, None)
    yields an empty string.

    Args:
        text: Raw text content

    Returns:
        Cleaned text
    """
    if not text:
        return ""
    return re.sub(r'\s+', ' ', text).strip()
|
|
|
|
|
|
|
|
|
def extract_json_from_text(text: str) -> Optional[Dict[str, Any]]:
    """
    Try to extract JSON objects from text.

    Scans for brace-delimited candidates (supporting one level of
    nesting) and returns the first one that parses as valid JSON.

    Args:
        text: Text that may contain JSON

    Returns:
        Parsed JSON dict or None
    """
    # Matches {...} with optional single-level nested {...} inside.
    brace_pattern = re.compile(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', re.DOTALL)

    for candidate in brace_pattern.finditer(text):
        try:
            return json.loads(candidate.group(0))
        except json.JSONDecodeError:
            # Not valid JSON — keep scanning for a later candidate.
            continue

    return None
|
|
|
|
|
|
|
|
|
def is_valid_url(url: str) -> bool:
    """
    Validate if a string is a valid URL.

    A URL is considered valid when it carries both a scheme
    (e.g. "https") and a network location (host).

    Args:
        url: URL string to validate

    Returns:
        True if valid URL, False otherwise
    """
    try:
        parts = urlparse(url)
    except Exception:
        return False
    return bool(parts.scheme) and bool(parts.netloc)
|
|
|
|
|
|
|
|
|
def sanitize_filename(filename: str) -> str:
    """
    Sanitize a filename by removing invalid characters.

    Replaces characters that are illegal in common filesystems
    (< > : " / \\ | ? *) with underscores, then strips leading and
    trailing dots and spaces.

    Args:
        filename: Original filename

    Returns:
        Sanitized filename
    """
    safe_name = re.sub(r'[<>:"/\\|?*]', '_', filename)
    return safe_name.strip('. ')
|
|
|
|
|
|
|