# Devang1290
# feat: deploy News Whisper on-demand search API (FastAPI + Docker)
# 2cb327c
"""
Path Utilities
==============
Centralized helpers for building filesystem paths used across the pipeline.
- get_project_root() → absolute path to the News-Whisper repo root
- get_timestamp_folder() → human-readable timestamp for output dirs (e.g. "24_mar_10_37_pm")
- sanitize_query_folder() → filesystem-safe folder name from search queries (Unicode-safe)
- find_latest_json() → most recently modified JSON file in a directory
- find_latest_audio_dir() → most recently modified timestamp subfolder for a category
These are imported via:
from backend.common import get_project_root, get_timestamp_folder
or:
from backend.common.paths import get_project_root
"""
import re
from pathlib import Path
from datetime import datetime
from typing import Optional
def get_project_root() -> Path:
    """Return the absolute path of the News-Whisper project root.

    The root is found by climbing three parents from this module's own
    path, so the result does not depend on the caller or on the current
    working directory. It is expected to be the directory that holds
    main.py/app.py.

    Returns:
        The project root as an absolute ``Path`` (symlinks are not resolved).
    """
    module_file = Path(__file__)
    # First hop: the directory that contains this module.
    package_dir = module_file.parent
    # Second and third hops: up through the enclosing package to the root.
    root = package_dir.parent.parent
    return root.absolute()
def get_timestamp_folder(now: Optional[datetime] = None) -> str:
    """Generate a human-readable timestamp folder name.

    Format: {day}_{month}_{hour}_{minute}_{am/pm}
    Examples: "1_feb_2_30_pm", "15_jan_9_45_am"

    The name is assembled from the datetime's numeric fields rather than
    from the locale-aware ``strftime`` directives ``%b`` and ``%p``, so the
    month abbreviation and the am/pm marker are always lowercase English
    regardless of the locale the process runs under.

    Args:
        now: Moment to format. Defaults to the current local time
            (``datetime.now()``); pass an explicit value to get a
            reproducible name, e.g. in tests.

    Returns:
        The folder name. The day and the 12-hour-clock hour carry no
        leading zero; the minute is always two digits.
    """
    month_abbrevs = (
        "jan", "feb", "mar", "apr", "may", "jun",
        "jul", "aug", "sep", "oct", "nov", "dec",
    )
    moment = datetime.now() if now is None else now

    # 12-hour clock: 0 -> 12 (midnight), 12 stays 12 (noon), 13..23 -> 1..11.
    hour_12 = moment.hour % 12 or 12
    meridiem = "am" if moment.hour < 12 else "pm"
    month = month_abbrevs[moment.month - 1]

    return f"{moment.day}_{month}_{hour_12}_{moment.minute:02d}_{meridiem}"
def sanitize_query_folder(query: str, max_bytes: int = 255) -> str:
    """Build a filesystem-safe folder name while preserving Unicode queries.

    Used for search query output directories. Hindi/Devanagari characters
    are preserved; OS-illegal characters (``< > : " / \\ | ? *`` and ASCII
    control characters) and whitespace runs are replaced with underscores,
    repeated underscores are collapsed, and leading/trailing spaces, dots
    and underscores are removed.

    The result is also capped at ``max_bytes`` of UTF-8, because common
    filesystems reject a single path component longer than 255 bytes and a
    long query would otherwise fail when the directory is created. The cut
    never splits a multi-byte character. Names already within the limit
    are returned unchanged.

    Args:
        query: The raw search query string (e.g. "climate change" or "पुणे")
        max_bytes: Maximum UTF-8 size of the returned name; expected to be
            a positive integer. Defaults to 255.

    Returns:
        A safe folder name string (e.g. "climate_change" or "पुणे"), or
        "query" when nothing usable remains after sanitizing.
    """
    safe = query.strip()
    safe = re.sub(r'[<>:"/\\|?*\x00-\x1F]', '_', safe)
    safe = re.sub(r'\s+', '_', safe)
    safe = re.sub(r'_+', '_', safe).strip(' ._')

    # "surrogatepass" keeps the size check from raising on lone surrogates,
    # which a plain UTF-8 encode would reject.
    encoded = safe.encode("utf-8", errors="surrogatepass")
    if len(encoded) > max_bytes:
        # errors="ignore" drops a multi-byte character cut in half by the
        # slice (and any bytes that are not valid UTF-8).
        safe = encoded[:max_bytes].decode("utf-8", errors="ignore")
        # The cut may expose a separator at the new end of the name.
        safe = safe.strip(' ._')

    return safe or "query"
def find_latest_json(directory: Path) -> Optional[Path]:
    """Return the most recently modified JSON file in a directory.

    Only direct children whose name matches ``*.json`` are considered (the
    search is not recursive), and only regular files qualify: a
    sub-directory that happens to be named like ``something.json`` is
    skipped rather than returned as if it were a JSON file.

    Args:
        directory: Directory to search.

    Returns:
        The path of the matching file with the greatest modification time,
        or None if ``directory`` is not an existing directory or contains
        no JSON files.
    """
    if not directory.is_dir():
        return None

    json_files = [path for path in directory.glob("*.json") if path.is_file()]
    if not json_files:
        return None

    return max(json_files, key=lambda path: path.stat().st_mtime)
def find_latest_audio_dir(base_dir: Path, category_name: str) -> Optional[Path]:
    """Return the most recently modified timestamp sub-directory for a category.

    Looks inside: base_dir/categories/{category_name_lower}/
    where the category name is lower-cased and its spaces are replaced
    with underscores (e.g. "Top Stories" -> "top_stories").

    Args:
        base_dir: Directory that contains the ``categories`` folder.
        category_name: Category name as displayed.

    Returns:
        The sub-directory with the greatest modification time, or None if
        the category path is not an existing directory (including when a
        regular file sits at that path) or has no subdirectories.
    """
    category_dir = base_dir / "categories" / category_name.lower().replace(" ", "_")
    # is_dir() rather than exists(): a plain file at this path must yield
    # None instead of making iterdir() raise NotADirectoryError.
    if not category_dir.is_dir():
        return None

    subdirs = [entry for entry in category_dir.iterdir() if entry.is_dir()]
    if not subdirs:
        return None

    return max(subdirs, key=lambda entry: entry.stat().st_mtime)