Segmento-Pulse-Backend / app /utils /redis_dedup.py
SHAFI
the full ingestion pipline of the pulse is upgraded, worked on flaws and rectified them, solved over 10 major flaws
82bd507
"""
Redis URL Deduplication Bouncer
================================
This is the 48-hour memory for the ingestion pipeline.
How it works (simple version):
Imagine a nightclub bouncer who keeps a list of everyone who came in
today and yesterday. If you try to enter again while still on the list,
you are turned away. After 48 hours, your name falls off the list and
you are welcome back.
That is exactly what this module does for article URLs.
Each article URL is:
1. Cleaned and normalized (canonicalized).
2. Converted to a short SHA-256 fingerprint (so we store 16 chars not full URLs).
3. Checked against Upstash Redis with the command: SET key 1 EX 172800 NX
- EX 172800 = expire after 172800 seconds = 48 hours
- NX = only set if Not eXists
Redis response:
- "OK" → key did NOT exist → article is NEW → return False (not seen before)
- null → key already existed → article is DUPLICATE → return True (seen before)
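Fallback:
If Upstash is not configured or Redis is unreachable, and the check raises
an exception as a result, this function logs a warning and returns False
(treats every article as new). The Appwrite database constraint is still
the final safety net in that case. Any reply other than "OK" that arrives
without an exception is treated as a duplicate.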
"""
import logging
from app.utils.url_canonicalization import canonicalize_url, get_url_hash
from app.services.upstash_cache import get_upstash_cache
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Prefix prepended to every URL-hash key written by this module.
# Keeps the deduplication keys clearly separate from the article-cache keys.
_KEY_PREFIX = "seen_url:"
# Lifetime of a "seen" marker: 48 hours expressed in seconds.
# This is intended to match the cleanup janitor in scheduler.py, which
# (per the original author's note) also deletes articles older than 48
# hours. When an article is deleted from the database, its Redis key
# expires around the same time, allowing the article to be re-ingested
# if it genuinely resurfaces.
_TTL_SECONDS = 172_800 # 48 * 60 * 60
async def is_url_seen_or_mark(raw_url: str) -> bool:
    """
    Report whether an article URL is already marked as seen, marking it if not.

    The URL is canonicalized, hashed, and written to Redis with a single
    ``SET <key> 1 EX <_TTL_SECONDS> NX`` command, so the "check" and the
    "mark" happen in one step.

    Args:
        raw_url: The article URL in any format; it is normalized internally.
            A falsy value (empty string, None) cannot be deduplicated.

    Returns:
        True  -> the SET was not acknowledged with "OK": the key already
                 existed, so the URL is a duplicate and should be skipped.
        False -> the URL is new (and is now marked in Redis), OR the URL
                 was falsy, OR the check raised and we fell back to
                 letting the article through.
    """
    # Guard: without a URL there is nothing to fingerprint, so let it pass.
    if not raw_url:
        return False

    try:
        # Normalize first so variants of one link (http vs https, trailing
        # slash, utm_ params) collapse onto the same key, then hash so the
        # stored key is short and uniform.
        normalized_url = canonicalize_url(str(raw_url))
        dedup_key = f"{_KEY_PREFIX}{get_url_hash(normalized_url)}"

        # NX = create only if absent; EX = expire after _TTL_SECONDS.
        # Redis applies both as part of one SET, so there is no gap
        # between the existence check and the write.
        set_if_absent = ["SET", dedup_key, "1", "EX", _TTL_SECONDS, "NX"]

        # The Upstash client is the shared instance also used by the cache.
        upstash = get_upstash_cache()
        reply = await upstash._execute_command(set_if_absent)

        # "OK" means the key was created just now, so the URL is new.
        if reply == "OK":
            return False

        # Anything else is treated as "key already existed".
        # NOTE(review): this assumes the client raises on failure rather
        # than returning None; a silent None would be counted as a
        # duplicate here. Confirm against the Upstash client.
        logger.debug("[REDIS DEDUP] Duplicate detected: %s", dedup_key)
        return True
    except Exception as err:
        # A Redis problem (network error, timeout, etc.) must never block
        # an article. The Appwrite database is expected to catch true
        # duplicates as the final safety net.
        logger.warning(
            "[REDIS DEDUP] Redis check failed (%s) β€” letting article through as safe fallback.",
            err,
        )
        return False