# src/hf_logging.py
"""Hugging Face dataset logging with CommitScheduler."""
import json
import logging
import uuid
from datetime import datetime, timezone
from pathlib import Path

from huggingface_hub import CommitScheduler

from src.config import HF_LOG_EVERY_MINUTES, HF_LOG_REPO_ID

logger = logging.getLogger(__name__)

# Unique ID for this app instance.  Also used to name the local log file so
# the file name can be correlated with the `session_id` field stored inside
# each record, and so concurrent app instances never write to the same file.
SESSION_ID = uuid.uuid4().hex

# Initialize scheduler & local file if logging is configured
if HF_LOG_REPO_ID:
    feedback_file = Path("query_logs") / f"queries_{SESSION_ID}.jsonl"
    feedback_folder = feedback_file.parent
    feedback_folder.mkdir(parents=True, exist_ok=True)

    # CommitScheduler periodically pushes everything in `feedback_folder`
    # to the dataset repo from a background thread.
    scheduler = CommitScheduler(
        repo_id=HF_LOG_REPO_ID,
        repo_type="dataset",
        folder_path=feedback_folder,
        path_in_repo="data",  # files go in data/ on the dataset
        every=HF_LOG_EVERY_MINUTES,  # in minutes
    )

    # Print initialization info
    print("\n" + "=" * 80)
    print("HF DATASET LOGGING INITIALIZED")
    print("=" * 80)
    print(f"Session ID: {SESSION_ID}")
    print(f"Repository: {HF_LOG_REPO_ID}")
    print(f"Local directory: {feedback_folder.absolute()}")
    print(f"Local file: {feedback_file.name}")
    print(f"Commit frequency: every {HF_LOG_EVERY_MINUTES} minutes")
    print("=" * 80 + "\n")
else:
    scheduler = None
    feedback_file = None
    print("HF dataset logging disabled (HF_LOG_REPO_ID not set)")


def log_query_event(payload: dict) -> None:
    """Append one JSON log line that CommitScheduler will push to the Hub.

    payload must be JSON-serializable; it is NOT mutated — session_id and
    timestamp are added to a copy if missing.
    No-op if HF_LOG_REPO_ID is not configured.

    Expected payload fields:
    - log_type: Type of event (aql_query, zilliz_query_stats, click_event)
    - request_id: Unique ID for the request/search (optional, but recommended)
    - session_id: Session ID (auto-added if missing)
    - timestamp: ISO timestamp (auto-added if missing)
    - error_occurred: Boolean indicating if an error occurred (optional)
    - error_message: Error message if error_occurred is True (optional)
    - error_type: Type of error (optional)
    ... plus other event-specific fields
    """
    if scheduler is None or feedback_file is None:
        return

    # Copy so the caller's dict is left untouched.
    record = dict(payload)
    record.setdefault("session_id", SESSION_ID)
    # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated since
    # Python 3.12 and produced a naive (ambiguous) value.
    record.setdefault("timestamp", datetime.now(timezone.utc).isoformat())

    try:
        # scheduler.lock prevents the background commit thread from reading
        # the file while we are appending to it.
        with scheduler.lock:
            with feedback_file.open("a", encoding="utf-8") as f:
                f.write(json.dumps(record))
                f.write("\n")
        print(f"✓ Logged to HF dataset: {record.get('log_type', 'unknown')}")
    except Exception as e:
        # Best-effort logging: never let a log failure break the request path.
        logger.error("Failed to write query log locally: %s", e)