Spaces:
Running
Running
| # src/hf_logging.py | |
| """Hugging Face dataset logging with CommitScheduler.""" | |
| import json | |
| import uuid | |
| import logging | |
| from pathlib import Path | |
| from datetime import datetime | |
| from huggingface_hub import CommitScheduler | |
| from src.config import HF_LOG_REPO_ID, HF_LOG_EVERY_MINUTES | |
logger = logging.getLogger(__name__)

# Random identifier generated once per app instance; log_query_event stamps it
# on every record that does not already carry a session_id.
SESSION_ID = uuid.uuid4().hex

# Module-level setup: either wire up the CommitScheduler and the local JSONL
# file it watches, or leave both as None so log_query_event becomes a no-op.
if not HF_LOG_REPO_ID:
    scheduler = None
    feedback_file = None
    print("HF dataset logging disabled (HF_LOG_REPO_ID not set)")
else:
    # Each app instance appends to its own uniquely named file inside the
    # folder handed to the scheduler.
    feedback_folder = Path("query_logs")
    feedback_file = feedback_folder / f"queries_{uuid.uuid4().hex}.jsonl"
    feedback_folder.mkdir(parents=True, exist_ok=True)

    scheduler = CommitScheduler(
        repo_id=HF_LOG_REPO_ID,
        repo_type="dataset",
        folder_path=feedback_folder,
        path_in_repo="data",  # files land under data/ in the dataset repo
        every=HF_LOG_EVERY_MINUTES,  # commit interval, in minutes
    )

    # Startup banner summarising the logging configuration.
    rule = "=" * 80
    banner_lines = (
        "\n" + rule,
        "HF DATASET LOGGING INITIALIZED",
        rule,
        f"Session ID: {SESSION_ID}",
        f"Repository: {HF_LOG_REPO_ID}",
        f"Local directory: {feedback_folder.absolute()}",
        f"Local file: {feedback_file.name}",
        f"Commit frequency: every {HF_LOG_EVERY_MINUTES} minutes",
        rule + "\n",
    )
    for banner_line in banner_lines:
        print(banner_line)


def log_query_event(payload: dict) -> None:
    """Append one JSON log line that CommitScheduler will push to the Hub.

    The record written is a shallow copy of ``payload`` with ``session_id``
    and ``timestamp`` filled in when they are missing; the caller's dict is
    left untouched. The call is a no-op when logging is not configured
    (``scheduler`` or ``feedback_file`` is ``None``, i.e. HF_LOG_REPO_ID is
    not set).

    Logging is best-effort: any exception raised while serializing or writing
    the record is logged through the module logger and swallowed, so a logging
    failure never propagates to the caller.

    Args:
        payload: JSON-serializable mapping describing the event.

    Expected payload fields:
        - log_type: Type of event (aql_query, zilliz_query_stats, click_event)
        - request_id: Unique ID for the request/search (optional, but recommended)
        - session_id: Session ID (auto-added if missing)
        - timestamp: ISO timestamp (auto-added if missing)
        - error_occurred: Boolean indicating if an error occurred (optional)
        - error_message: Error message if error_occurred is True (optional)
        - error_type: Type of error (optional)
        ... plus other event-specific fields
    """
    # Local import keeps this block self-contained; the module header only
    # imports ``datetime`` itself.
    from datetime import timezone

    if scheduler is None or feedback_file is None:
        return

    # Work on a copy so the defaults added below do not leak into the
    # caller's dict (which may be reused or logged elsewhere).
    record = dict(payload)
    record.setdefault("session_id", SESSION_ID)
    if "timestamp" not in record:
        # datetime.utcnow() is deprecated since Python 3.12. Dropping tzinfo
        # keeps the exact naive-UTC ISO format of previously written records.
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        record["timestamp"] = now_utc.isoformat()

    try:
        # Serialize before taking the lock so the critical section is only
        # the file append itself.
        line = json.dumps(record) + "\n"
        # Hold the scheduler's lock while appending so the write does not
        # overlap with the scheduler's own use of the folder.
        with scheduler.lock:
            with feedback_file.open("a", encoding="utf-8") as f:
                f.write(line)
        print(f"β Logged to HF dataset: {record.get('log_type', 'unknown')}")
    except Exception as e:
        # Deliberate best-effort boundary: record the failure (with
        # traceback) but never break the request that triggered the log.
        logger.exception("Failed to write query log locally: %s", e)