# ido/services/analytics_service.py
# (removed GitHub page residue: author "Parthnuwal7", commit "Adding backend to HF spaces", 27d04ef)
"""
Analytics Service
Computes insights from preprocessed session data
"""
from collections import Counter
from typing import Optional
import json
from pathlib import Path
def load_session_events(token: str) -> tuple[list[dict], dict]:
    """Load the events list and stats dict for a preprocessed session.

    Looks for ``preprocessed_<token>.json`` under the ``storage`` directory
    one level above this module; returns ``([], {})`` when the file is absent.
    """
    session_file = Path(__file__).parent.parent / "storage" / f"preprocessed_{token}.json"
    if not session_file.exists():
        return [], {}
    with open(session_file, 'r', encoding='utf-8') as fh:
        payload = json.load(fh)
    return payload.get("events", []), payload.get("stats", {})
def get_session_summary(token: str) -> dict:
    """
    Get overall session summary.
    Returns:
    - total_events, total_watch, total_search, total_subscribe
    - unique_channels
    - date_range (first and last timestamp)
    - language_breakdown
    """
    events, stats = load_session_events(token)
    if not events:
        return {"error": "Session not found or empty"}

    # Unique channels are derived from watch events only.
    watch_events = [ev for ev in events if ev.get("type") == "watch"]
    unique_channels = {ev.get("channel_clean") for ev in watch_events if ev.get("channel_clean")}

    # Sorted UTC timestamps give us the first/last event of the session.
    ts_values = sorted(ev.get("timestamp_utc") for ev in events if ev.get("timestamp_utc"))
    first_ts = ts_values[0] if ts_values else None
    last_ts = ts_values[-1] if ts_values else None

    # Prefer precomputed stats; fall back to counts derived from the events.
    return {
        "total_events": stats.get("total_events", len(events)),
        "total_watch": stats.get("total_watch", len(watch_events)),
        "total_search": stats.get("total_search", 0),
        "total_subscribe": stats.get("total_subscribe", 0),
        "unique_channels": len(unique_channels),
        "date_range": {"first": first_ts, "last": last_ts},
        "language_breakdown": stats.get("language_breakdown", {}),
    }
def get_channel_analytics(token: str, top_n: int = 20, engagement_filter: str = "all") -> dict:
    """
    Get channel analytics.
    Args:
        token: Session token
        top_n: Number of top channels to return
        engagement_filter: "all" | "watch" (active) | "view" (passive)
    Returns:
        - top_channels: list of dicts with display name, count, percentage
        - total_unique_channels, total_count, other_count
        - view_distribution: histogram of per-channel view counts
        - engagement_summary: active vs passive totals
    """
    events, _ = load_session_events(token)
    if not events:
        return {"error": "Session not found or empty"}

    watch_events = [ev for ev in events if ev.get("type") == "watch"]

    # Map the filter keyword to an engagement value; anything else keeps all.
    engagement_value = {"watch": "active", "view": "passive"}.get(engagement_filter)
    if engagement_value is None:
        selected = watch_events
    else:
        selected = [ev for ev in watch_events if ev.get("engagement") == engagement_value]

    channel_counts = Counter(
        ev.get("channel_clean") for ev in selected if ev.get("channel_clean")
    )

    # First occurrence wins for the original-case display name.
    display_names = {}
    for ev in watch_events:
        clean, original = ev.get("channel_clean"), ev.get("channel")
        if clean and original:
            display_names.setdefault(clean, original)

    total_count = sum(channel_counts.values())

    top_channels_result = []
    for clean_name, hits in channel_counts.most_common(top_n):
        share = round((hits / total_count) * 100, 2) if total_count > 0 else 0
        top_channels_result.append({
            "channel": display_names.get(clean_name, clean_name),
            "channel_clean": clean_name,
            "count": hits,
            "percentage": share,
        })

    # Histogram of views-per-channel.
    # Buckets: 1, 2-5, 6-10, 11-20, 21-50, 51-100, 100+
    bucket_edges = [
        (1, "1"), (5, "2-5"), (10, "6-10"), (20, "11-20"),
        (50, "21-50"), (100, "51-100"),
    ]
    view_buckets = {label: 0 for _, label in bucket_edges}
    view_buckets["100+"] = 0
    for hits in channel_counts.values():
        for upper, label in bucket_edges:
            if hits <= upper:
                view_buckets[label] += 1
                break
        else:
            view_buckets["100+"] += 1

    # Convert to list format for frontend
    view_distribution = [{"bucket": label, "count": n} for label, n in view_buckets.items()]

    # Summary stats across both engagement types.
    active_count = sum(1 for ev in watch_events if ev.get("engagement") == "active")
    passive_count = sum(1 for ev in watch_events if ev.get("engagement") == "passive")

    return {
        "total_unique_channels": len(channel_counts),
        "total_count": total_count,
        "engagement_filter": engagement_filter,
        "top_channels": top_channels_result,
        "other_count": total_count - sum(c["count"] for c in top_channels_result),
        "view_distribution": view_distribution,
        "engagement_summary": {
            "total_watch": active_count,
            "total_view": passive_count,
            "total_all": len(watch_events),
        },
    }
def get_watch_patterns(token: str) -> dict:
    """
    Get watch time patterns.
    Returns:
    - hourly_distribution: Watches per hour (0-23)
    - daily_distribution: Watches per day of week (0-6)
    - peak_hour, peak_day
    - weekly_peak_days: For each week, which day had most watches
    - time_intervals: Grouped by time periods (morning, afternoon, etc.)
    - circular_activity: Average watches per hour for radial chart
    """
    events, _ = load_session_events(token)
    if not events:
        return {"error": "Session not found or empty"}
    watch_events = [e for e in events if e.get("type") == "watch"]
    # Count by hour (None hour_local values are skipped)
    hourly = Counter(e.get("hour_local") for e in watch_events if e.get("hour_local") is not None)
    # Count by day of week (0=Monday ... 6=Sunday, matching day_names below)
    daily = Counter(e.get("day_of_week") for e in watch_events if e.get("day_of_week") is not None)
    # Build full distributions (fill missing with 0)
    hourly_dist = [{"hour": h, "count": hourly.get(h, 0)} for h in range(24)]
    daily_dist = [{"day": d, "count": daily.get(d, 0)} for d in range(7)]
    # Find peaks (max keeps the first maximum, so earlier hours/days win ties)
    peak_hour = max(hourly_dist, key=lambda x: x["count"]) if hourly_dist else None
    peak_day = max(daily_dist, key=lambda x: x["count"]) if daily_dist else None
    day_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    # === Weekly Peak Day Analysis ===
    # Group events by week (using ISO week number) and find peak day for each week
    from datetime import datetime
    weekly_data = {}  # {(year, week): {day_of_week: count}}
    for e in watch_events:
        ts = e.get("timestamp_local")
        dow = e.get("day_of_week")
        if ts and dow is not None:
            try:
                dt = datetime.fromisoformat(ts.replace('Z', '+00:00'))
                year, week, _ = dt.isocalendar()
                key = (year, week)
                if key not in weekly_data:
                    weekly_data[key] = Counter()
                weekly_data[key][dow] += 1
            except:
                # NOTE(review): bare except silently drops events with
                # unparseable timestamps; consider narrowing to ValueError.
                pass
    # Find peak day for each week
    weekly_peak_days = []
    peak_day_counter = Counter()  # Count how many weeks each day "wins"
    for (year, week), day_counts in sorted(weekly_data.items()):
        if day_counts:
            peak_dow = max(day_counts.keys(), key=lambda d: day_counts[d])
            peak_count = day_counts[peak_dow]
            weekly_peak_days.append({
                "year": year,
                "week": week,
                "peak_day": day_names[peak_dow],
                "peak_day_num": peak_dow,
                "count": peak_count
            })
            peak_day_counter[peak_dow] += 1
    # Overall winner: which day wins the most weeks
    overall_peak_day = None
    overall_peak_wins = 0
    if peak_day_counter:
        winner_dow = max(peak_day_counter.keys(), key=lambda d: peak_day_counter[d])
        overall_peak_day = day_names[winner_dow]
        overall_peak_wins = peak_day_counter[winner_dow]
    # === Time Intervals ===
    # Group hours into intervals
    intervals = {
        "Night (12AM-6AM)": range(0, 6),
        "Morning (6AM-12PM)": range(6, 12),
        "Afternoon (12PM-6PM)": range(12, 18),
        "Evening (6PM-12AM)": range(18, 24)
    }
    interval_counts = {}
    for name, hour_range in intervals.items():
        count = sum(hourly.get(h, 0) for h in hour_range)
        interval_counts[name] = count
    # The single-element inner `for count in [...]` is a trick to bind `count`
    # per interval inside the comprehension; "hours" renders e.g. "0-6" from
    # the range endpoints.
    time_intervals = [
        {"interval": name, "count": count, "hours": f"{list(hours)[0]}-{list(hours)[-1]+1}"}
        for name, hours in intervals.items()
        for count in [interval_counts[name]]
    ]
    peak_interval = max(time_intervals, key=lambda x: x["count"]) if time_intervals else None
    # === Circular Activity (Average per hour) ===
    # Calculate average watches per hour across all days in the dataset
    # NOTE(review): total_days is computed but never used below; the
    # percentages are shares of total watches, not per-day averages.
    total_days = len(weekly_data) * 7 if weekly_data else 1  # Approximate
    circular_activity = []
    total_watches = sum(hourly.values())
    for h in range(24):
        count = hourly.get(h, 0)
        # Percentage of total watches
        percentage = round((count / total_watches) * 100, 2) if total_watches > 0 else 0
        # Format hour label as 12-hour clock text
        if h == 0:
            label = "12 AM"
        elif h < 12:
            label = f"{h} AM"
        elif h == 12:
            label = "12 PM"
        else:
            label = f"{h-12} PM"
        circular_activity.append({
            "hour": h,
            "label": label,
            "count": count,
            "percentage": percentage
        })
    return {
        "hourly_distribution": hourly_dist,
        "daily_distribution": daily_dist,
        "peak_hour": peak_hour["hour"] if peak_hour else None,
        "peak_hour_count": peak_hour["count"] if peak_hour else 0,
        "peak_day": day_names[peak_day["day"]] if peak_day and peak_day["day"] is not None else None,
        "peak_day_count": peak_day["count"] if peak_day else 0,
        # New fields
        "weekly_peak_days": weekly_peak_days[-12:],  # Last 12 weeks
        "overall_peak_day": overall_peak_day,
        "overall_peak_wins": overall_peak_wins,
        "total_weeks": len(weekly_data),
        "time_intervals": time_intervals,
        "peak_interval": peak_interval["interval"] if peak_interval else None,
        "circular_activity": circular_activity
    }
def get_temporal_trends(token: str) -> dict:
    """
    Analyze how watching patterns change month-to-month.

    Args:
        token: Session token identifying the preprocessed data file.

    Returns:
        dict with:
        - monthly_stats: watch count, peak hour, peak day for each month
        - peak_hour_trend: how the peak hour shifts over months
        - peak_day_trend: how the peak day shifts over months
        - activity_trend: total watches per month
        - pattern_shifts: up to 10 significant month-over-month shifts
        - total_months, summary
    """
    events, _ = load_session_events(token)
    if not events:
        return {"error": "Session not found or empty"}
    watch_events = [e for e in events if e.get("type") == "watch"]
    if not watch_events:
        return {
            "monthly_stats": [],
            "peak_hour_trend": [],
            "peak_day_trend": [],
            "activity_trend": [],
            "summary": "No watch events found"
        }
    from collections import defaultdict
    from datetime import datetime

    day_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    # Hoisted out of the loop (was rebuilt every iteration); index 0 is a
    # placeholder so month numbers 1-12 index directly.
    month_names = ["", "Jan", "Feb", "Mar", "Apr", "May", "Jun",
                   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

    def _hour_label(hour):
        """Format an hour (0-23) as a 12-hour clock label; 'N/A' for None."""
        if hour is None:
            return "N/A"
        if hour == 0:
            return "12 AM"
        if hour < 12:
            return f"{hour} AM"
        if hour == 12:
            return "12 PM"
        return f"{hour - 12} PM"

    # Group watch events by calendar month.
    monthly_data = defaultdict(lambda: {
        "watches": 0,
        "hourly": Counter(),
        "daily": Counter()
    })
    for event in watch_events:
        ts = event.get("timestamp_local") or event.get("timestamp_utc")
        hour = event.get("hour_local")
        dow = event.get("day_of_week")
        if not ts:
            continue
        try:
            # Accept full ISO timestamps or bare YYYY-MM-DD dates.
            if "T" in ts:
                dt = datetime.fromisoformat(ts.replace('Z', '+00:00'))
            else:
                dt = datetime.strptime(ts[:10], "%Y-%m-%d")
        except (ValueError, TypeError):
            # Narrowed from a bare `except:` so only malformed timestamps are
            # skipped instead of swallowing every exception.
            continue
        month_key = f"{dt.year}-{dt.month:02d}"
        monthly_data[month_key]["watches"] += 1
        if hour is not None:
            monthly_data[month_key]["hourly"][hour] += 1
        if dow is not None:
            monthly_data[month_key]["daily"][dow] += 1

    # Build per-month stats plus chart-ready trend series.
    monthly_stats = []
    peak_hour_trend = []
    peak_day_trend = []
    activity_trend = []
    for month_key in sorted(monthly_data.keys()):
        data = monthly_data[month_key]
        # Peak hour for the month (first maximum wins ties).
        peak_hour = None
        peak_hour_count = 0
        if data["hourly"]:
            peak_hour = max(data["hourly"].keys(), key=lambda h: data["hourly"][h])
            peak_hour_count = data["hourly"][peak_hour]
        # Peak day of week for the month.
        peak_day = None
        peak_day_name = None
        peak_day_count = 0
        if data["daily"]:
            peak_day = max(data["daily"].keys(), key=lambda d: data["daily"][d])
            peak_day_name = day_names[peak_day]
            peak_day_count = data["daily"][peak_day]
        peak_hour_label = _hour_label(peak_hour)
        # Human-readable month label, e.g. "Jan 2024".
        year, month = month_key.split("-")
        month_label = f"{month_names[int(month)]} {year}"
        monthly_stats.append({
            "month": month_key,
            "month_label": month_label,
            "total_watches": data["watches"],
            "peak_hour": peak_hour,
            "peak_hour_label": peak_hour_label,
            "peak_hour_count": peak_hour_count,
            "peak_day": peak_day,
            "peak_day_name": peak_day_name,
            "peak_day_count": peak_day_count
        })
        # Trend data for charts
        peak_hour_trend.append({
            "month": month_key,
            "month_label": month_label,
            "peak_hour": peak_hour,
            "peak_hour_label": peak_hour_label
        })
        peak_day_trend.append({
            "month": month_key,
            "month_label": month_label,
            "peak_day": peak_day,
            "peak_day_name": peak_day_name
        })
        activity_trend.append({
            "month": month_key,
            "month_label": month_label,
            "watches": data["watches"]
        })

    # Detect significant month-over-month shifts.
    shifts = []
    for i in range(1, len(monthly_stats)):
        prev = monthly_stats[i - 1]
        curr = monthly_stats[i]
        # Hour shift: significant if the peak hour moves by 4+ hours.
        if prev["peak_hour"] is not None and curr["peak_hour"] is not None:
            hour_diff = abs(curr["peak_hour"] - prev["peak_hour"])
            if hour_diff >= 4:
                shifts.append({
                    "type": "peak_hour",
                    "from_month": prev["month_label"],
                    "to_month": curr["month_label"],
                    "from_value": prev["peak_hour_label"],
                    "to_value": curr["peak_hour_label"],
                    "description": f"Peak hour shifted from {prev['peak_hour_label']} to {curr['peak_hour_label']}"
                })
        # Day shift: only flagged when it crosses the weekday/weekend boundary.
        if prev["peak_day"] is not None and curr["peak_day"] is not None:
            if prev["peak_day"] != curr["peak_day"]:
                prev_weekend = prev["peak_day"] >= 5
                curr_weekend = curr["peak_day"] >= 5
                if prev_weekend != curr_weekend:
                    shifts.append({
                        "type": "peak_day",
                        "from_month": prev["month_label"],
                        "to_month": curr["month_label"],
                        "from_value": prev["peak_day_name"],
                        "to_value": curr["peak_day_name"],
                        "description": f"Shifted from {'weekend' if prev_weekend else 'weekday'} to {'weekend' if curr_weekend else 'weekday'}"
                    })

    # Generate summary
    if monthly_stats:
        first = monthly_stats[0]
        last = monthly_stats[-1]
        summary = f"Tracked {len(monthly_stats)} months from {first['month_label']} to {last['month_label']}"
        if shifts:
            summary += f" | {len(shifts)} significant pattern shifts detected"
    else:
        summary = "No monthly data available"
    return {
        "monthly_stats": monthly_stats,
        "peak_hour_trend": peak_hour_trend,
        "peak_day_trend": peak_day_trend,
        "activity_trend": activity_trend,
        "pattern_shifts": shifts[:10],  # Top 10 shifts
        "total_months": len(monthly_stats),
        "summary": summary
    }
def get_search_analytics(token: str, top_n: int = 20) -> dict:
    """
    Get search analytics.
    Returns:
    - total_searches
    - top_search_terms
    - language_breakdown for searches
    """
    events, _ = load_session_events(token)
    if not events:
        return {"error": "Session not found or empty"}

    searches = [ev for ev in events if ev.get("type") == "search"]

    # Frequency of normalized search terms.
    term_counts = Counter(ev.get("text_clean") for ev in searches if ev.get("text_clean"))

    # First occurrence wins for the raw (display) form of each term.
    raw_by_clean = {}
    for ev in searches:
        clean, raw = ev.get("text_clean"), ev.get("text_raw")
        if clean and raw:
            raw_by_clean.setdefault(clean, raw)

    top_searches = [
        {
            "term": raw_by_clean.get(clean, clean),
            "term_clean": clean,
            "count": hits,
        }
        for clean, hits in term_counts.most_common(top_n)
    ]

    # Language breakdown for searches
    lang_counts = Counter(ev.get("language_type") for ev in searches if ev.get("language_type"))

    return {
        "total_searches": len(searches),
        "unique_searches": len(term_counts),
        "top_searches": top_searches,
        "language_breakdown": dict(lang_counts),
    }
def get_subscription_overlap(token: str) -> dict:
    """
    Analyze overlap between subscriptions and watch history.
    Returns:
    - total_subscriptions
    - subscribed_and_watched: Channels you're subscribed to AND watched
    - watched_not_subscribed: Channels you watch but aren't subscribed to
    - subscribed_not_watched: Channels you're subscribed to but haven't watched
    """
    events, _ = load_session_events(token)
    if not events:
        return {"error": "Session not found or empty"}

    # Normalized (lowercase) channel names per event type.
    subscribed = {
        ev.get("channel_clean")
        for ev in events
        if ev.get("type") == "subscribe" and ev.get("channel_clean")
    }
    watched = {
        ev.get("channel_clean")
        for ev in events
        if ev.get("type") == "watch" and ev.get("channel_clean")
    }

    # Set algebra gives the three overlap groups.
    both = subscribed & watched
    watch_only = watched - subscribed
    sub_only = subscribed - watched

    # Display names: last occurrence wins (plain assignment, matching the
    # original behavior).
    display = {}
    for ev in events:
        clean, original = ev.get("channel_clean"), ev.get("channel")
        if clean and original:
            display[clean] = original

    def _names(channel_set):
        # Cap at 20 display names per group.
        return [display.get(c, c) for c in list(channel_set)[:20]]

    overlap_pct = round(len(both) / len(subscribed) * 100, 1) if subscribed else 0

    return {
        "total_subscriptions": len(subscribed),
        "total_watched_channels": len(watched),
        "subscribed_and_watched": {
            "count": len(both),
            "percentage": overlap_pct,
            "channels": _names(both),
        },
        "watched_not_subscribed": {
            "count": len(watch_only),
            "channels": _names(watch_only),
        },
        "subscribed_not_watched": {
            "count": len(sub_only),
            "channels": _names(sub_only),
        },
    }
def get_behavior_anomalies(token: str) -> dict:
    """
    Detect deviations from normal watching patterns.
    Identifies:
    - Late night sessions (watching after midnight when normally don't)
    - Binge periods (unusually high watch counts)
    - Off-peak hour activity
    - Weekly pattern changes
    """
    from datetime import datetime
    events, _ = load_session_events(token)
    if not events:
        return {"error": "Session not found or empty"}
    watch_events = [e for e in events if e.get("type") == "watch"]
    if not watch_events:
        return {"anomalies": [], "late_night_sessions": [], "binge_days": []}
    # === Calculate baseline patterns ===
    # Average watches per hour across all data
    hourly_counts = Counter(e.get("hour_local") for e in watch_events if e.get("hour_local") is not None)
    total_watches = sum(hourly_counts.values())
    # Define "late night" as 12AM-5AM (hours 0-4)
    late_night_hours = {0, 1, 2, 3, 4}
    late_night_baseline = sum(hourly_counts.get(h, 0) for h in late_night_hours)
    late_night_percentage = (late_night_baseline / total_watches * 100) if total_watches > 0 else 0
    # === Group by date ===
    daily_data = {}  # {date_str: {hour: count, total: count}}
    for e in watch_events:
        ts = e.get("timestamp_local")
        hour = e.get("hour_local")
        if ts and hour is not None:
            try:
                dt = datetime.fromisoformat(ts.replace('Z', '+00:00'))
                date_str = dt.strftime("%Y-%m-%d")
                if date_str not in daily_data:
                    daily_data[date_str] = {"hours": Counter(), "total": 0}
                daily_data[date_str]["hours"][hour] += 1
                daily_data[date_str]["total"] += 1
            except:
                # NOTE(review): bare except silently drops events with
                # unparseable timestamps; consider narrowing to ValueError.
                pass
    # Calculate average daily watches
    if daily_data:
        avg_daily_watches = sum(d["total"] for d in daily_data.values()) / len(daily_data)
        # Population standard deviation of the daily watch totals.
        std_dev = (sum((d["total"] - avg_daily_watches) ** 2 for d in daily_data.values()) / len(daily_data)) ** 0.5
    else:
        avg_daily_watches = 0
        std_dev = 0
    # === Detect Late Night Sessions ===
    # Days where user watched significantly in late night hours (> usual)
    late_night_sessions = []
    for date_str, data in sorted(daily_data.items()):
        late_night_count = sum(data["hours"].get(h, 0) for h in late_night_hours)
        if late_night_count >= 3:  # At least 3 videos in late night
            late_night_sessions.append({
                "date": date_str,
                "late_night_count": late_night_count,
                "total_count": data["total"],
                "peak_hour": max(data["hours"].keys(), key=lambda h: data["hours"][h]) if data["hours"] else None
            })
    # === Detect Binge Days ===
    # Days with watch count > mean + 2*std_dev
    binge_threshold = avg_daily_watches + 2 * std_dev if std_dev > 0 else avg_daily_watches * 2
    binge_days = []
    for date_str, data in sorted(daily_data.items()):
        if data["total"] > binge_threshold and data["total"] >= 10:  # At least 10 videos
            binge_days.append({
                "date": date_str,
                "count": data["total"],
                "above_average_by": round(data["total"] - avg_daily_watches, 1),
                "multiplier": round(data["total"] / avg_daily_watches, 2) if avg_daily_watches > 0 else 0
            })
    # === Detect Weekly Pattern Shifts ===
    # Group by week and detect if hourly pattern changed significantly
    weekly_patterns = {}  # {(year, week): Counter of hours}
    for e in watch_events:
        ts = e.get("timestamp_local")
        hour = e.get("hour_local")
        if ts and hour is not None:
            try:
                dt = datetime.fromisoformat(ts.replace('Z', '+00:00'))
                year, week, _ = dt.isocalendar()
                key = (year, week)
                if key not in weekly_patterns:
                    weekly_patterns[key] = Counter()
                weekly_patterns[key][hour] += 1
            except:
                # NOTE(review): same bare-except concern as above.
                pass
    # Find weeks with unusual late-night activity
    unusual_weeks = []
    for (year, week), hour_counts in sorted(weekly_patterns.items()):
        week_total = sum(hour_counts.values())
        week_late_night = sum(hour_counts.get(h, 0) for h in late_night_hours)
        week_late_pct = (week_late_night / week_total * 100) if week_total > 0 else 0
        # If this week's late night % is significantly higher than baseline
        if week_late_pct > late_night_percentage * 1.5 and week_late_night >= 5:
            unusual_weeks.append({
                "year": year,
                "week": week,
                "late_night_count": week_late_night,
                "late_night_percentage": round(week_late_pct, 1),
                "baseline_percentage": round(late_night_percentage, 1),
                "total_watches": week_total
            })
    # === Detect Streaks (Consecutive Days) ===
    # NOTE(review): timedelta is imported here but never used in this function.
    from datetime import timedelta
    def find_streaks(date_list, max_gap=1):
        """Find streaks of dates with at most max_gap days between them."""
        # A gap of (max_gap + 1) calendar days between two dates means up to
        # max_gap fully-skipped days are tolerated inside a streak.
        if not date_list:
            return []
        sorted_dates = sorted([datetime.strptime(d, "%Y-%m-%d") for d in date_list])
        streaks = []
        current_streak = [sorted_dates[0]]
        for i in range(1, len(sorted_dates)):
            gap = (sorted_dates[i] - sorted_dates[i-1]).days
            if gap <= max_gap + 1:  # Allow gap of max_gap days
                current_streak.append(sorted_dates[i])
            else:
                if len(current_streak) >= 2:
                    streaks.append(current_streak)
                current_streak = [sorted_dates[i]]
        if len(current_streak) >= 2:
            streaks.append(current_streak)
        return streaks
    # Find binge streaks (consecutive binge days, allowing 1 day gap)
    binge_dates = [d["date"] for d in binge_days]
    binge_streaks = find_streaks(binge_dates, max_gap=1)
    binge_watching_periods = []
    for streak in binge_streaks:
        start = streak[0].strftime("%Y-%m-%d")
        end = streak[-1].strftime("%Y-%m-%d")
        days = (streak[-1] - streak[0]).days + 1
        # Sum videos from all binge days that fall inside the streak window
        # (lexicographic comparison works because dates are YYYY-MM-DD).
        total_videos = sum(d["count"] for d in binge_days if d["date"] >= start and d["date"] <= end)
        binge_watching_periods.append({
            "start_date": start,
            "end_date": end,
            "duration_days": days,
            "total_videos": total_videos,
            "avg_per_day": round(total_videos / len(streak), 1)
        })
    # Find late night mood (consecutive late night sessions, allowing 1 day gap)
    late_night_dates = [s["date"] for s in late_night_sessions]
    late_night_streaks = find_streaks(late_night_dates, max_gap=1)
    late_night_moods = []
    for streak in late_night_streaks:
        start = streak[0].strftime("%Y-%m-%d")
        end = streak[-1].strftime("%Y-%m-%d")
        days = (streak[-1] - streak[0]).days + 1
        total_late = sum(s["late_night_count"] for s in late_night_sessions if s["date"] >= start and s["date"] <= end)
        late_night_moods.append({
            "start_date": start,
            "end_date": end,
            "duration_days": days,
            "total_late_videos": total_late
        })
    # === Additional Pattern Analysis ===
    # 1. Weekend Warrior Detection
    weekend_watches = 0
    weekday_watches = 0
    for e in watch_events:
        dow = e.get("day_of_week")
        if dow is not None:
            if dow >= 5:  # Saturday=5, Sunday=6
                weekend_watches += 1
            else:
                weekday_watches += 1
    total_dow = weekend_watches + weekday_watches
    weekend_pct = (weekend_watches / total_dow * 100) if total_dow > 0 else 0
    # If weekend (2 days) has > 35% of watches, that's weekend warrior territory
    is_weekend_warrior = weekend_pct > 35
    # 2. Night Owl vs Morning Person
    night_hours = {20, 21, 22, 23, 0, 1, 2, 3, 4}  # 8PM - 5AM
    morning_hours = {5, 6, 7, 8, 9, 10, 11}  # 5AM - 12PM
    night_count = sum(hourly_counts.get(h, 0) for h in night_hours)
    morning_count = sum(hourly_counts.get(h, 0) for h in morning_hours)
    # Classified by a 1.5x dominance threshold either way.
    if night_count > morning_count * 1.5:
        chronotype = "Night Owl"
    elif morning_count > night_count * 1.5:
        chronotype = "Early Bird"
    else:
        chronotype = "Balanced"
    # 3. Inactive Periods (gaps of 3+ days with no watching)
    sorted_dates = sorted(daily_data.keys())
    inactive_periods = []
    for i in range(1, len(sorted_dates)):
        prev = datetime.strptime(sorted_dates[i-1], "%Y-%m-%d")
        curr = datetime.strptime(sorted_dates[i], "%Y-%m-%d")
        gap = (curr - prev).days
        if gap >= 4:  # 3+ days gap
            inactive_periods.append({
                "start": sorted_dates[i-1],
                "end": sorted_dates[i],
                "gap_days": gap - 1
            })
    # === Build anomaly summary ===
    # Assembly order matters: streak anomalies first, then single-day ones,
    # and the final list is truncated to 12 below.
    anomalies = []
    # Add binge watching periods
    for period in sorted(binge_watching_periods, key=lambda x: x["duration_days"], reverse=True)[:3]:
        anomalies.append({
            "type": "binge_streak",
            "date": f"{period['start_date']} to {period['end_date']}",
            "description": f"Binge watching period: {period['duration_days']} days, {period['total_videos']} videos",
            "severity": "high" if period["duration_days"] >= 5 else "medium"
        })
    # Add late night moods
    for mood in sorted(late_night_moods, key=lambda x: x["duration_days"], reverse=True)[:3]:
        anomalies.append({
            "type": "late_night_mood",
            "date": f"{mood['start_date']} to {mood['end_date']}",
            "description": f"Late night mood: {mood['duration_days']} consecutive nights",
            "severity": "high" if mood["duration_days"] >= 4 else "medium"
        })
    # Add top single-day anomalies
    for session in sorted(late_night_sessions, key=lambda x: x["late_night_count"], reverse=True)[:3]:
        anomalies.append({
            "type": "late_night",
            "date": session["date"],
            "description": f"Watched {session['late_night_count']} videos after midnight",
            "severity": "high" if session["late_night_count"] >= 10 else "medium"
        })
    for day in sorted(binge_days, key=lambda x: x["count"], reverse=True)[:3]:
        anomalies.append({
            "type": "binge",
            "date": day["date"],
            "description": f"Watched {day['count']} videos ({day['multiplier']}x above average)",
            "severity": "high" if day["multiplier"] >= 3 else "medium"
        })
    return {
        "baseline": {
            "avg_daily_watches": round(avg_daily_watches, 1),
            "std_dev": round(std_dev, 1),
            "late_night_baseline_pct": round(late_night_percentage, 1),
            "total_days": len(daily_data)
        },
        "anomalies": anomalies[:12],  # Top 12 anomalies
        "late_night_sessions": late_night_sessions[-20:],
        "binge_days": binge_days[-20:],
        "unusual_weeks": unusual_weeks[-10:],
        # New streak data
        "binge_watching_periods": binge_watching_periods,
        "late_night_moods": late_night_moods,
        # Behavior patterns
        "patterns": {
            "weekend_warrior": is_weekend_warrior,
            "weekend_pct": round(weekend_pct, 1),
            "chronotype": chronotype,
            "night_watches": night_count,
            "morning_watches": morning_count,
            "inactive_periods": inactive_periods[-5:]  # Last 5
        }
    }
def get_habit_formation(token: str, min_streak_days: int = 3) -> dict:
    """
    Detect habit formation patterns.
    Identifies:
    - Channels watched daily for consecutive days
    - Videos watched multiple times on different days
    - Content patterns that indicate habitual watching
    Args:
        token: Session token
        min_streak_days: Minimum consecutive days to count as a habit (default: 3)
    Returns:
    - channel_habits: Channels with daily watching streaks
    - video_habits: Videos watched on multiple days
    - content_habits: Topics/keywords watched daily
    - habit_strength: Overall habit formation score
    """
    events, _ = load_session_events(token)
    if not events:
        return {"error": "Session not found or empty"}
    # Get watch events with dates
    watch_events = [e for e in events if e.get("type") == "watch"]
    if not watch_events:
        return {
            "channel_habits": [],
            "video_habits": [],
            "content_habits": [],
            "habit_strength": 0,
            "summary": "No watch events found"
        }
    # Group events by date and channel
    # NOTE(review): timedelta is imported but never used in this function.
    from collections import defaultdict
    from datetime import datetime, timedelta
    # NOTE(review): channel_by_date and topic_by_date are populated below but
    # never read afterwards.
    channel_by_date = defaultdict(set)  # date -> set of channels
    date_by_channel = defaultdict(set)  # channel -> set of dates
    # Track videos by date (using text_clean as identifier since we don't have video_id)
    date_by_video = defaultdict(set)  # video title -> set of dates
    video_info = {}  # video title -> {channel, first_seen, total_watches}
    # Also track micro_topics by date
    topic_by_date = defaultdict(set)  # date -> set of topics
    date_by_topic = defaultdict(set)  # topic -> set of dates
    for event in watch_events:
        ts = event.get("timestamp_local") or event.get("timestamp_utc")
        if not ts:
            continue
        # Extract date (YYYY-MM-DD prefix of the timestamp)
        try:
            date_str = ts.split("T")[0] if "T" in ts else ts[:10]
        except:
            continue
        # Track channel
        channel = event.get("channel_clean")
        if channel:
            channel_by_date[date_str].add(channel)
            date_by_channel[channel].add(date_str)
        # Track video (using text_clean as identifier)
        video_title = event.get("text_clean") or event.get("text_raw")
        if video_title and len(video_title) > 5:  # Skip very short titles
            date_by_video[video_title].add(date_str)
            if video_title not in video_info:
                video_info[video_title] = {
                    "channel": channel or "Unknown",
                    "first_seen": date_str,
                    "total_watches": 0
                }
            video_info[video_title]["total_watches"] += 1
        # Track micro_topics if available
        topics = event.get("micro_topics", [])
        for topic in topics:
            topic_by_date[date_str].add(topic)
            date_by_topic[topic].add(date_str)
    # Find consecutive day streaks for each channel
    def find_daily_streaks(dates_set: set, min_days: int) -> list:
        """Find streaks of strictly consecutive days (gap of exactly 1 day)."""
        if len(dates_set) < min_days:
            return []
        sorted_dates = sorted(dates_set)
        streaks = []
        current_streak = [sorted_dates[0]]
        for i in range(1, len(sorted_dates)):
            try:
                prev_date = datetime.strptime(sorted_dates[i-1], "%Y-%m-%d")
                curr_date = datetime.strptime(sorted_dates[i], "%Y-%m-%d")
                if (curr_date - prev_date).days == 1:
                    current_streak.append(sorted_dates[i])
                else:
                    if len(current_streak) >= min_days:
                        streaks.append({
                            "start": current_streak[0],
                            "end": current_streak[-1],
                            "days": len(current_streak)
                        })
                    current_streak = [sorted_dates[i]]
            except:
                # NOTE(review): a malformed date silently resets the running
                # streak; consider narrowing to ValueError.
                current_streak = [sorted_dates[i]]
        # Don't forget the last streak
        if len(current_streak) >= min_days:
            streaks.append({
                "start": current_streak[0],
                "end": current_streak[-1],
                "days": len(current_streak)
            })
        return streaks
    # Find channel habits
    channel_habits = []
    for channel, dates in date_by_channel.items():
        streaks = find_daily_streaks(dates, min_streak_days)
        if streaks:
            # Calculate total days in habits
            total_habit_days = sum(s["days"] for s in streaks)
            longest_streak = max(s["days"] for s in streaks)
            channel_habits.append({
                "channel": channel,
                "total_days_watched": len(dates),
                "habit_streaks": streaks,
                "longest_streak": longest_streak,
                "total_habit_days": total_habit_days,
                "habit_score": min(100, total_habit_days * 10)  # Score 0-100
            })
    # Sort by longest streak, then total habit days
    channel_habits.sort(key=lambda x: (x["longest_streak"], x["total_habit_days"]), reverse=True)
    # Find video habits (videos watched on multiple different days)
    video_habits = []
    for video_title, dates in date_by_video.items():
        if len(dates) >= 2:  # Watched on at least 2 different days
            streaks = find_daily_streaks(dates, min_streak_days)
            info = video_info.get(video_title, {})
            video_habits.append({
                "title": video_title[:80] + "..." if len(video_title) > 80 else video_title,
                "channel": info.get("channel", "Unknown"),
                "days_watched": len(dates),
                "total_watches": info.get("total_watches", 0),
                "first_seen": info.get("first_seen", ""),
                "has_streak": len(streaks) > 0,
                "longest_streak": max((s["days"] for s in streaks), default=0)
            })
    # Sort video habits by days watched, then total watches
    video_habits.sort(key=lambda x: (x["days_watched"], x["total_watches"]), reverse=True)
    # Find content/topic habits (only for topics with >= 5 total occurrences)
    content_habits = []
    for topic, dates in date_by_topic.items():
        if len(dates) < 5:  # Skip rare topics
            continue
        streaks = find_daily_streaks(dates, min_streak_days)
        if streaks:
            total_habit_days = sum(s["days"] for s in streaks)
            longest_streak = max(s["days"] for s in streaks)
            content_habits.append({
                "topic": topic,
                "total_days": len(dates),
                "habit_streaks": streaks,
                "longest_streak": longest_streak,
                "total_habit_days": total_habit_days
            })
    # Sort content habits
    content_habits.sort(key=lambda x: (x["longest_streak"], x["total_habit_days"]), reverse=True)
    # Calculate overall habit strength
    total_channels_with_habits = len(channel_habits)
    max_channel_streak = max((h["longest_streak"] for h in channel_habits), default=0)
    # Habit strength: 0-100 score
    habit_strength = 0
    if total_channels_with_habits > 0:
        # Factors: number of habitual channels, longest streak, total habit days
        habit_strength = min(100, (
            total_channels_with_habits * 10 +
            max_channel_streak * 5 +
            sum(h["total_habit_days"] for h in channel_habits[:5])  # Top 5
        ))
    # Generate habit summary
    summary_parts = []
    if channel_habits:
        top_habit = channel_habits[0]
        summary_parts.append(
            f"Strongest habit: {top_habit['channel']} watched {top_habit['longest_streak']} days in a row"
        )
        if total_channels_with_habits > 1:
            summary_parts.append(f"{total_channels_with_habits} channels with daily habits")
    if video_habits:
        summary_parts.append(f"{len(video_habits)} rewatched videos")
    if content_habits:
        summary_parts.append(f"{len(content_habits)} recurring topics")
    return {
        "channel_habits": channel_habits,  # Return ALL channel habits
        "video_habits": video_habits[:30],  # Top 30 rewatched videos
        "content_habits": content_habits[:30],  # Top 30 topics
        "habit_strength": habit_strength,
        "total_channels_with_habits": total_channels_with_habits,
        "total_videos_rewatched": len(video_habits),
        "total_topics_with_habits": len(content_habits),
        "max_streak_days": max_channel_streak,
        "summary": " | ".join(summary_parts) if summary_parts else "No strong habits detected"
    }
def get_time_spent(token: str, break_threshold_minutes: int = 60, last_video_minutes: int = 5) -> dict:
    """
    Calculate approximate time spent on YouTube.

    Uses session detection: groups continuous watching periods separated by
    significant breaks (> break_threshold_minutes). A session's duration is the
    span from its first to its last event, plus a flat estimate for the final
    video (last_video_minutes), since watch events only record start times.

    Args:
        token: Session token
        break_threshold_minutes: Gap that ends a session (default 60 min)
        last_video_minutes: Duration estimate for the last video of each session (default 5 min)

    Returns:
        dict with:
        - total_minutes, total_hours
        - average_daily_minutes, total_days
        - sessions: {total_count, average_duration_minutes, longest_session_minutes}
        - summary: human-readable one-liner
        or {"error": ...} when the session is missing or empty.
    """
    events, _ = load_session_events(token)
    if not events:
        return {"error": "Session not found or empty"}
    from datetime import datetime, timedelta

    # Collect watch events that carry a parseable timestamp.
    watch_events = []
    for e in events:
        if e.get("type") != "watch":
            continue
        ts = e.get("timestamp_local") or e.get("timestamp_utc")
        if not ts:
            continue
        try:
            if "T" in ts:
                # ISO-8601, possibly Zulu-suffixed.
                dt = datetime.fromisoformat(ts.replace('Z', '+00:00'))
            else:
                # BUG FIX: original used "%Y-%m-%dT%H:%M:%S" in this branch,
                # which can never match a string lacking "T" — such events were
                # silently discarded. Space-separated datetimes are the intent.
                dt = datetime.strptime(ts[:19], "%Y-%m-%d %H:%M:%S")
        except (ValueError, TypeError):
            # Unparseable timestamp: skip this event (narrowed from bare except).
            continue
        # Normalize to naive so aware/naive values never get compared/subtracted,
        # which would raise TypeError during sorting or gap computation.
        if dt.tzinfo is not None:
            dt = dt.replace(tzinfo=None)
        watch_events.append({"timestamp": dt, "event": e})

    if not watch_events:
        return {
            "total_minutes": 0,
            "total_hours": 0,
            "average_daily_minutes": 0,
            "total_days": 0,
            "sessions": {
                "total_count": 0,
                "average_duration_minutes": 0,
                "longest_session_minutes": 0
            },
            "summary": "No watch events with timestamps"
        }

    # Chronological order is required for gap-based session detection.
    watch_events.sort(key=lambda x: x["timestamp"])

    # Detect sessions: consecutive events closer than break_threshold belong together.
    sessions = []
    break_threshold = timedelta(minutes=break_threshold_minutes)
    session_start = watch_events[0]["timestamp"]
    session_end = watch_events[0]["timestamp"]
    session_event_count = 1
    for i in range(1, len(watch_events)):
        current = watch_events[i]["timestamp"]
        previous = watch_events[i - 1]["timestamp"]
        if current - previous > break_threshold:
            # Close the current session; pad with the last-video estimate.
            duration = (session_end - session_start).total_seconds() / 60.0
            duration += last_video_minutes
            sessions.append({
                "start": session_start,
                "end": session_end,
                "duration_minutes": duration,
                "event_count": session_event_count
            })
            # Start a new session at the current event.
            session_start = current
            session_end = current
            session_event_count = 1
        else:
            session_end = current
            session_event_count += 1
    # Close the final (still-open) session.
    duration = (session_end - session_start).total_seconds() / 60.0
    duration += last_video_minutes
    sessions.append({
        "start": session_start,
        "end": session_end,
        "duration_minutes": duration,
        "event_count": session_event_count
    })

    # Totals and per-day average (days = distinct calendar dates with activity).
    total_minutes = sum(s["duration_minutes"] for s in sessions)
    total_hours = round(total_minutes / 60, 1)
    unique_days = {we["timestamp"].date() for we in watch_events}
    total_days = len(unique_days)
    average_daily = round(total_minutes / total_days, 1) if total_days > 0 else 0

    # Session stats
    session_durations = [s["duration_minutes"] for s in sessions]
    avg_session = round(sum(session_durations) / len(session_durations), 1) if sessions else 0
    longest_session = round(max(session_durations), 1) if sessions else 0

    # Human-readable summary; switch to "X days Y hours" past 24h.
    if total_hours >= 24:
        time_str = f"{int(total_hours // 24)} days {int(total_hours % 24)} hours"
    else:
        time_str = f"{total_hours} hours"
    summary = f"Spent approximately {time_str} on YouTube across {total_days} days ({len(sessions)} sessions)"

    return {
        "total_minutes": round(total_minutes, 1),
        "total_hours": total_hours,
        "average_daily_minutes": average_daily,
        "total_days": total_days,
        "sessions": {
            "total_count": len(sessions),
            "average_duration_minutes": avg_session,
            "longest_session_minutes": longest_session
        },
        "summary": summary
    }
def get_channel_distribution(token: str) -> dict:
    """
    Get channel distribution by view count bins.

    Returns:
        dict with:
        - bin_distribution: channels grouped by per-channel view count
          [1, 2-5, 6-10, 11-20, 21-50, 51-100, 100+], each entry carrying
          the bin label, channel_count, and video_count
        - temporal_by_bin: monthly breakdown of videos watched per bin
        - stats: total_channels, total_videos, single_view_channels
          (+ single_view_percentage)
        or {"error": ...} when the session is missing or empty.
    """
    events, _ = load_session_events(token)
    if not events:
        return {"error": "Session not found or empty"}
    from collections import defaultdict
    from datetime import datetime

    watch_events = [e for e in events if e.get("type") == "watch"]

    # Views per channel (events without a cleaned channel name are ignored).
    channel_counts = Counter(
        e.get("channel_clean") for e in watch_events
        if e.get("channel_clean")
    )

    # (min, max, label) view-count bins; the last bin is open-ended.
    bins = [
        (1, 1, "1"),
        (2, 5, "2-5"),
        (6, 10, "6-10"),
        (11, 20, "11-20"),
        (21, 50, "21-50"),
        (51, 100, "51-100"),
        (101, float('inf'), "100+")
    ]

    # Assign each channel to its bin and aggregate per-bin channel/video counts.
    bin_distribution = []
    channel_bin_map = {}  # channel -> bin label
    for min_val, max_val, label in bins:
        channels_in_bin = [
            ch for ch, count in channel_counts.items()
            if min_val <= count <= max_val
        ]
        video_count = sum(channel_counts[ch] for ch in channels_in_bin)
        bin_distribution.append({
            "bin": label,
            "channel_count": len(channels_in_bin),
            "video_count": video_count
        })
        for ch in channels_in_bin:
            channel_bin_map[ch] = label

    # Monthly breakdown: {month: {bin_label: videos watched}}
    monthly_data = defaultdict(lambda: defaultdict(int))
    for event in watch_events:
        channel = event.get("channel_clean")
        ts = event.get("timestamp_local") or event.get("timestamp_utc")
        if not channel or not ts:
            continue
        # Bins cover every count >= 1, so this fallback is effectively unreachable.
        bin_label = channel_bin_map.get(channel, "1")
        if not isinstance(ts, str):
            continue
        try:
            dt = datetime.fromisoformat(ts.replace('Z', '+00:00'))
        except ValueError:
            # Unparseable timestamp: skip this event (narrowed from bare except).
            continue
        month_key = f"{dt.year}-{dt.month:02d}"
        monthly_data[month_key][bin_label] += 1

    # Flatten into a month-sorted list with every bin present (0-filled).
    temporal_by_bin = []
    for month in sorted(monthly_data):
        month_entry = {"month": month, "bins": {}}
        for _, _, label in bins:
            month_entry["bins"][label] = monthly_data[month].get(label, 0)
        temporal_by_bin.append(month_entry)

    # Summary stats
    total_channels = len(channel_counts)
    total_videos = sum(channel_counts.values())
    single_view_channels = sum(1 for count in channel_counts.values() if count == 1)
    return {
        "bin_distribution": bin_distribution,
        "temporal_by_bin": temporal_by_bin,
        "stats": {
            "total_channels": total_channels,
            "total_videos": total_videos,
            "single_view_channels": single_view_channels,
            "single_view_percentage": round(single_view_channels / total_channels * 100, 1) if total_channels > 0 else 0
        }
    }
def get_full_analytics(token: str) -> dict:
    """Run every analytics computation for a session and bundle the results.

    Returns a dict keyed by section name; each value is the corresponding
    analytics function's result for the given session token.
    """
    sections = {
        "summary": get_session_summary,
        "channels": get_channel_analytics,
        "watch_patterns": get_watch_patterns,
        "searches": get_search_analytics,
        "subscription_overlap": get_subscription_overlap,
        "behavior_anomalies": get_behavior_anomalies,
        "habit_formation": get_habit_formation,
        "temporal_trends": get_temporal_trends,
        "time_spent": get_time_spent,
        "channel_distribution": get_channel_distribution,
    }
    # Dicts preserve insertion order, so the output keys match the original layout.
    return {name: compute(token) for name, compute in sections.items()}