AION-Search / src /hf_logging.py
astronolan's picture
Enhance search functionality and UI components
fa7eb7f
# src/hf_logging.py
"""Hugging Face dataset logging with CommitScheduler."""
import json
import uuid
import logging
from pathlib import Path
from datetime import datetime
from huggingface_hub import CommitScheduler
from src.config import HF_LOG_REPO_ID, HF_LOG_EVERY_MINUTES
# Module-level logger, used for reporting local write failures.
logger = logging.getLogger(__name__)

# Random hex identifier created once at import time; it tags every record
# written by this app instance.
SESSION_ID = uuid.uuid4().hex

# Logging to the Hub is optional: without a target dataset repo there is no
# scheduler and no local file, and log_query_event() turns into a no-op.
if not HF_LOG_REPO_ID:
    scheduler = None
    feedback_file = None
    print("HF dataset logging disabled (HF_LOG_REPO_ID not set)")
else:
    # Each app instance appends to its own uniquely named JSONL file inside
    # the folder that the scheduler watches.
    feedback_folder = Path("query_logs")
    feedback_file = feedback_folder / f"queries_{uuid.uuid4().hex}.jsonl"
    feedback_folder.mkdir(parents=True, exist_ok=True)

    scheduler = CommitScheduler(
        repo_id=HF_LOG_REPO_ID,
        repo_type="dataset",
        folder_path=feedback_folder,
        path_in_repo="data",  # files go in data/ on the dataset
        every=HF_LOG_EVERY_MINUTES,  # in minutes
    )

    # Startup banner summarising the logging configuration.
    print(
        "\n" + "=" * 80,
        "HF DATASET LOGGING INITIALIZED",
        "=" * 80,
        f"Session ID: {SESSION_ID}",
        f"Repository: {HF_LOG_REPO_ID}",
        f"Local directory: {feedback_folder.absolute()}",
        f"Local file: {feedback_file.name}",
        f"Commit frequency: every {HF_LOG_EVERY_MINUTES} minutes",
        "=" * 80 + "\n",
        sep="\n",
    )
def log_query_event(payload: dict) -> None:
    """Append one JSON log line that CommitScheduler will push to the Hub.

    This is a best-effort operation: any failure while building, serializing
    or writing the record is reported through the module logger and is not
    raised to the caller. It is a no-op if HF_LOG_REPO_ID is not configured.

    The caller's dict is left untouched. ``session_id`` and ``timestamp`` are
    filled in on a shallow copy when they are missing, so a payload dict that
    is reused for several events gets a fresh timestamp each time.

    Args:
        payload: JSON-serializable mapping describing the event.

    Expected payload fields:
        - log_type: Type of event (aql_query, zilliz_query_stats, click_event)
        - request_id: Unique ID for the request/search (optional, but recommended)
        - session_id: Session ID (auto-added if missing)
        - timestamp: ISO timestamp (auto-added if missing)
        - error_occurred: Boolean indicating if an error occurred (optional)
        - error_message: Error message if error_occurred is True (optional)
        - error_type: Type of error (optional)
        ... plus other event-specific fields
    """
    if scheduler is None or feedback_file is None:
        return

    # Imported locally so this function does not depend on the module's
    # import block providing `timezone`.
    from datetime import timezone

    try:
        # Work on a copy so the caller's dict is never modified.
        record = dict(payload)
        if "session_id" not in record:
            record["session_id"] = SESSION_ID
        if "timestamp" not in record:
            # Same naive-UTC ISO string that datetime.utcnow().isoformat()
            # produced, without calling the deprecated utcnow().
            record["timestamp"] = (
                datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
            )

        # Serialize before taking the lock and write the record and its
        # newline in one call, so a failure cannot leave a line without its
        # terminator in the JSONL file.
        line = json.dumps(record) + "\n"

        # The write is done while holding the scheduler's lock.
        with scheduler.lock, feedback_file.open("a", encoding="utf-8") as f:
            f.write(line)

        # \u2713 is the check mark; the escape keeps this source line ASCII.
        print(f"\u2713 Logged to HF dataset: {record.get('log_type', 'unknown')}")
    except Exception as exc:
        # Log with traceback instead of raising.
        logger.exception("Failed to write query log locally: %s", exc)