Spaces:

WORKWITHSHAFISK
/

segmentopulse-backend

Paused

segmentopulse-backend / app /utils /url_canonicalization.py

SHAFI

chore: Backend updates - latest changes

1bf7bbd 3 months ago

5.02 kB

	"""
	URL Canonicalization for Better Deduplication

	Normalizes URLs before hashing to catch duplicate stories from different sources.

	Removes:
	- Tracking parameters (utm_*, ref, fbclid, etc.)
	- Session IDs
	- Protocol differences (http vs https)
	- Trailing slashes
	- www prefix

	Example:
	IN: https://www.cnn.com/story?utm_source=twitter&id=123/
	OUT: cnn.com/story?id=123

	Impact: +15% deduplication accuracy
	"""

	from urllib.parse import urlparse, parse_qs, urlencode
	import re
	from typing import Optional

	# Tracking parameters to remove
	TRACKING_PARAMS = [
	'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
	'utm_id', 'utm_source_platform', 'utm_creative_format', 'utm_marketing_tactic',
	'ref', 'fbclid', 'gclid', 'msclkid', 'mc_cid', 'mc_eid',
	'_ga', '_gl', 'igshid', 'ncid', 'sr_share'
	]

	# Session/tracking patterns to remove from path
	SESSION_PATTERNS = [
	r'/\d{10,}/', # Timestamp paths
	r';jsessionid=[^/]+', # Java session IDs
	r'\?PHPSESSID=[^&]+', # PHP session IDs
	]


	def canonicalize_url(url: str) -> str:
	"""
	Normalize URL for better deduplication

	Args:
	url: Original URL from news source

	Returns:
	Canonical URL string (normalized)

	Example:
	>>> canonicalize_url("https://www.cnn.com/tech?utm_source=twitter")
	'cnn.com/tech'
	"""
	if not url:
	return ''

	try:
	# Parse URL
	parsed = urlparse(url.strip())

	# 1. Normalize domain (lowercase, remove www)
	domain = parsed.netloc.lower()
	domain = domain.replace('www.', '')
	domain = domain.replace('m.', '') # Remove mobile prefix too

	if not domain:
	return url # Invalid URL, return as-is

	# 2. Normalize path
	path = parsed.path

	# Remove trailing slash
	path = path.rstrip('/')

	# Remove session IDs from path
	for pattern in SESSION_PATTERNS:
	path = re.sub(pattern, '', path)

	# Remove index.html, index.php, etc
	path = re.sub(r'/index\.(html\|php\|asp\|jsp)$', '', path)

	# 3. Clean query parameters
	query_params = parse_qs(parsed.query)

	# Remove tracking parameters
	clean_params = {
	k: v for k, v in query_params.items()
	if k.lower() not in TRACKING_PARAMS
	}

	# Sort parameters for consistency
	# parse_qs returns lists, take first value
	normalized_params = {
	k: v[0] if isinstance(v, list) else v
	for k, v in clean_params.items()
	}
	sorted_query = urlencode(sorted(normalized_params.items()))

	# 4. Rebuild canonical URL
	canonical = domain + path

	if sorted_query:
	canonical += '?' + sorted_query

	return canonical

	except Exception as e:
	# If canonicalization fails, return original URL
	# Better to have duplicates than lose articles
	print(f"Warning: Failed to canonicalize URL '{url}': {e}")
	return url


	def get_url_hash(url: str, length: int = 16) -> str:
	"""
	Generate hash from canonical URL

	Args:
	url: Original URL
	length: Hash length (default: 16 chars)

	Returns:
	Hex string hash

	Example:
	>>> get_url_hash("https://cnn.com/story?utm_source=twitter")
	>>> get_url_hash("https://www.cnn.com/story?ref=homepage")
	# Both return same hash!
	"""
	import hashlib

	canonical = canonicalize_url(url)
	hash_bytes = hashlib.sha256(canonical.encode('utf-8')).hexdigest()
	return hash_bytes[:length]


	# Test cases for validation
	if __name__ == '__main__':
	# Test 1: Tracking parameters removed
	url1 = "https://www.cnn.com/story?utm_source=twitter&id=123"
	url2 = "https://cnn.com/story?id=123&ref=homepage"

	assert canonicalize_url(url1) == canonicalize_url(url2)
	print("✓ Test 1 passed: Tracking params removed")

	# Test 2: Protocol and www normalized
	url3 = "http://www.example.com/article"
	url4 = "https://example.com/article"

	assert canonicalize_url(url3) == canonicalize_url(url4)
	print("✓ Test 2 passed: Protocol/www normalized")

	# Test 3: Trailing slash removed
	url5 = "https://example.com/article/"
	url6 = "https://example.com/article"

	assert canonicalize_url(url5) == canonicalize_url(url6)
	print("✓ Test 3 passed: Trailing slash removed")

	# Test 4: Query params sorted
	url7 = "https://example.com?b=2&a=1"
	url8 = "https://example.com?a=1&b=2"

	assert canonicalize_url(url7) == canonicalize_url(url8)
	print("✓ Test 4 passed: Query params sorted")

	print("\n✅ All tests passed!")
	print(f"\nExample canonical URL: {canonicalize_url('https://www.cnn.com/tech/ai-breakthrough?utm_source=twitter')}")