""" Path Utilities ============== Centralized helpers for building filesystem paths used across the pipeline. - get_project_root() → absolute path to the News-Whisper repo root - get_timestamp_folder() → human-readable timestamp for output dirs (e.g. "24_mar_10_37_pm") - sanitize_query_folder() → filesystem-safe folder name from search queries (Unicode-safe) - find_latest_json() → most recently modified JSON file in a directory - find_latest_audio_dir() → most recently modified timestamp subfolder for a category These are imported via: from backend.common import get_project_root, get_timestamp_folder or: from backend.common.paths import get_project_root """ import re from pathlib import Path from datetime import datetime from typing import Optional def get_project_root() -> Path: """Return the absolute path to the News-Whisper project root. Works regardless of which module calls it — always resolves to the directory containing main.py/app.py (two levels up from this file). """ return Path(__file__).parent.parent.parent.absolute() def get_timestamp_folder() -> str: """Generate a human-readable timestamp folder name. Format: {day}_{month}_{hour}_{minute}_{am/pm} Examples: "1_feb_2_30_pm", "15_jan_9_45_am" """ now = datetime.now() day = now.day month = now.strftime("%b").lower() hour = now.strftime("%I").lstrip("0") minute = now.strftime("%M") am_pm = now.strftime("%p").lower() return f"{day}_{month}_{hour}_{minute}_{am_pm}" def sanitize_query_folder(query: str) -> str: """Build a filesystem-safe folder name while preserving Unicode queries. Used for search query output directories. Hindi/Devanagari characters are preserved; only OS-illegal characters are replaced with underscores. Args: query: The raw search query string (e.g. "climate change" or "पुणे") Returns: A safe folder name string (e.g. "climate_change" or "पुणे") """ safe = query.strip() safe = re.sub(r'[<>:"/\\|?*\x00-\x1F]', '_', safe) safe = re.sub(r'\s+', '_', safe) safe = re.sub(r'_+', '_', safe).strip(' ._') return safe or "query" def find_latest_json(directory: Path) -> Optional[Path]: """Return the most recently modified JSON file in a directory. Returns None if the directory doesn't exist or contains no JSON files. """ if not directory.exists(): return None json_files = list(directory.glob("*.json")) return max(json_files, key=lambda f: f.stat().st_mtime) if json_files else None def find_latest_audio_dir(base_dir: Path, category_name: str) -> Optional[Path]: """Return the most recently modified timestamp sub-directory for a category. Looks inside: base_dir/categories/{category_name_lower}/ Returns None if the directory doesn't exist or has no subdirectories. """ category_dir = base_dir / "categories" / category_name.lower().replace(" ", "_") if not category_dir.exists(): return None dirs = [d for d in category_dir.iterdir() if d.is_dir()] return max(dirs, key=lambda d: d.stat().st_mtime) if dirs else None