Spaces:
Sleeping
Sleeping
| """ | |
| Path Utilities | |
| ============== | |
| Centralized helpers for building filesystem paths used across the pipeline. | |
| - get_project_root() → absolute path to the News-Whisper repo root | |
| - get_timestamp_folder() → human-readable timestamp for output dirs (e.g. "24_mar_10_37_pm") | |
| - sanitize_query_folder() → filesystem-safe folder name from search queries (Unicode-safe) | |
| - find_latest_json() → most recently modified JSON file in a directory | |
| - find_latest_audio_dir() → most recently modified timestamp subfolder for a category | |
| These are imported via: | |
| from backend.common import get_project_root, get_timestamp_folder | |
| or: | |
| from backend.common.paths import get_project_root | |
| """ | |
| import re | |
| from pathlib import Path | |
| from datetime import datetime | |
| from typing import Optional | |
def get_project_root() -> Path:
    """Locate the News-Whisper project root as an absolute path.

    The answer is derived from this module's own location instead of the
    current working directory, so every caller sees the same result: the
    directory reached by walking three ``.parent`` hops up from this file
    (for a module living at ``backend/common/paths.py`` that is the
    directory holding ``backend/``).

    Returns:
        The root directory as an absolute ``Path``. Symlinks are not
        resolved, because ``Path.absolute()`` is used.
    """
    module_file = Path(__file__)
    containing_dir = module_file.parent
    # Two more hops: package directory -> its parent -> project root.
    root = containing_dir.parent.parent
    return root.absolute()
def get_timestamp_folder(now: Optional[datetime] = None) -> str:
    """Generate a human-readable timestamp folder name.

    Format: {day}_{month}_{hour}_{minute}_{am/pm}
    Examples: "1_feb_2_30_pm", "15_jan_9_45_am"

    The month abbreviation and am/pm marker come from fixed English strings
    rather than ``strftime("%b")`` / ``strftime("%p")``, whose output follows
    the process locale. Folder names therefore keep the documented shape
    even if the application changes ``LC_TIME``.

    Args:
        now: The moment to format. When omitted, the current local time
            (``datetime.now()``) is used, which matches the original
            zero-argument behaviour.

    Returns:
        The folder name. Day and hour are unpadded; the minute is always
        two digits.
    """
    months = (
        "jan", "feb", "mar", "apr", "may", "jun",
        "jul", "aug", "sep", "oct", "nov", "dec",
    )
    moment = datetime.now() if now is None else now
    # 12-hour clock: 0 -> 12 (midnight), 12 -> 12 (noon), 13 -> 1, ...
    hour_12 = moment.hour % 12 or 12
    meridiem = "am" if moment.hour < 12 else "pm"
    month = months[moment.month - 1]
    return f"{moment.day}_{month}_{hour_12}_{moment.minute:02d}_{meridiem}"
def sanitize_query_folder(query: str) -> str:
    """Turn a raw search query into a folder name that is safe on disk.

    Only characters in the set ``< > : " / \\ | ? *`` and ASCII control
    characters are replaced, so non-ASCII text such as Hindi/Devanagari
    passes through unchanged. Whitespace runs become a single underscore,
    repeated underscores are collapsed, and leading/trailing spaces, dots
    and underscores are trimmed.

    Args:
        query: The raw search query string (e.g. "climate change" or "पुणे").

    Returns:
        The sanitized folder name (e.g. "climate_change" or "पुणे"), or the
        literal "query" when nothing usable is left after cleaning.
    """
    # Applied in order: illegal characters, whitespace runs, underscore runs.
    substitutions = (
        (r'[<>:"/\\|?*\x00-\x1F]', '_'),
        (r'\s+', '_'),
        (r'_+', '_'),
    )
    cleaned = query.strip()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip(' ._')
    if not cleaned:
        return "query"
    return cleaned
def find_latest_json(directory: Path) -> Optional[Path]:
    """Return the most recently modified JSON file in a directory.

    Only direct children whose name matches ``*.json`` and which are regular
    files are considered. Sub-directories or dangling symlinks that merely
    carry a ``.json`` name are ignored.

    Args:
        directory: The directory to search (not searched recursively).

    Returns:
        The path of the ``*.json`` file with the greatest modification time,
        or None if ``directory`` is not an existing directory or contains no
        JSON files.
    """
    if not directory.is_dir():
        return None
    # glob() matches on name alone, so filter down to real files before
    # comparing modification times.
    candidates = [path for path in directory.glob("*.json") if path.is_file()]
    if not candidates:
        return None
    return max(candidates, key=lambda path: path.stat().st_mtime)
def find_latest_audio_dir(base_dir: Path, category_name: str) -> Optional[Path]:
    """Return the most recently modified sub-directory for a category.

    Looks inside ``base_dir/categories/{folder}/``, where ``folder`` is
    ``category_name`` lower-cased with spaces replaced by underscores
    (e.g. "Top Stories" -> "top_stories").

    Args:
        base_dir: Directory that contains the ``categories`` folder.
        category_name: Category name; it is normalised as described above.

    Returns:
        The immediate sub-directory with the greatest modification time, or
        None if the category path is not an existing directory or has no
        sub-directories.
    """
    folder = category_name.lower().replace(" ", "_")
    category_dir = base_dir / "categories" / folder
    # is_dir() rather than exists(): a stray file at this path would make
    # iterdir() raise instead of yielding the documented None.
    if not category_dir.is_dir():
        return None
    subdirs = [entry for entry in category_dir.iterdir() if entry.is_dir()]
    if not subdirs:
        return None
    return max(subdirs, key=lambda entry: entry.stat().st_mtime)