Spaces:
Running
Running
File size: 2,918 Bytes
fa7eb7f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
# src/hf_logging.py
"""Hugging Face dataset logging with CommitScheduler."""
import json
import logging
import uuid
from datetime import datetime, timezone
from pathlib import Path

from huggingface_hub import CommitScheduler

from src.config import HF_LOG_REPO_ID, HF_LOG_EVERY_MINUTES
# Module-level logger; output destination/level is configured by the host app.
logger = logging.getLogger(__name__)
# Generate a unique session ID for this app instance
# (stamped into every log record by log_query_event).
SESSION_ID = uuid.uuid4().hex
# Initialize scheduler & local file if logging is configured.
# When HF_LOG_REPO_ID is unset, scheduler/feedback_file stay None and
# log_query_event() is a no-op.
if HF_LOG_REPO_ID:
    # Name the file after SESSION_ID so the session_id stamped into each
    # log record can be matched to the file that contains it (previously a
    # second, unrelated uuid4 was used here, breaking that correlation).
    feedback_file = Path("query_logs") / f"queries_{SESSION_ID}.jsonl"
    feedback_folder = feedback_file.parent
    feedback_folder.mkdir(parents=True, exist_ok=True)
    # CommitScheduler periodically commits everything under feedback_folder
    # to the dataset repo on the Hub.
    scheduler = CommitScheduler(
        repo_id=HF_LOG_REPO_ID,
        repo_type="dataset",
        folder_path=feedback_folder,
        path_in_repo="data",  # files go in data/ on the dataset
        every=HF_LOG_EVERY_MINUTES,  # in minutes
    )
    # Print initialization info
    print("\n" + "=" * 80)
    print("HF DATASET LOGGING INITIALIZED")
    print("=" * 80)
    print(f"Session ID: {SESSION_ID}")
    print(f"Repository: {HF_LOG_REPO_ID}")
    print(f"Local directory: {feedback_folder.absolute()}")
    print(f"Local file: {feedback_file.name}")
    print(f"Commit frequency: every {HF_LOG_EVERY_MINUTES} minutes")
    print("=" * 80 + "\n")
else:
    scheduler = None
    feedback_file = None
    print("HF dataset logging disabled (HF_LOG_REPO_ID not set)")
def log_query_event(payload: dict) -> None:
    """Append one JSON log line that CommitScheduler will push to the Hub.

    payload must be JSON-serializable. session_id and timestamp are added to
    the written record if missing; the caller's dict is NOT mutated.
    No-op if HF_LOG_REPO_ID is not configured.

    Expected payload fields:
        - log_type: Type of event (aql_query, zilliz_query_stats, click_event)
        - request_id: Unique ID for the request/search (optional, but recommended)
        - session_id: Session ID (auto-added if missing)
        - timestamp: ISO timestamp (auto-added if missing)
        - error_occurred: Boolean indicating if an error occurred (optional)
        - error_message: Error message if error_occurred is True (optional)
        - error_type: Type of error (optional)
        ... plus other event-specific fields
    """
    if scheduler is None or feedback_file is None:
        return
    # Shallow copy so we never mutate the caller's dict as a side effect.
    record = dict(payload)
    record.setdefault("session_id", SESSION_ID)
    # Timezone-aware UTC; datetime.utcnow() is deprecated and returns a
    # naive datetime.
    record.setdefault("timestamp", datetime.now(timezone.utc).isoformat())
    try:
        # scheduler.lock prevents the CommitScheduler's background commit
        # from reading a half-written line.
        with scheduler.lock:
            with feedback_file.open("a", encoding="utf-8") as f:
                f.write(json.dumps(record, ensure_ascii=False))
                f.write("\n")
        # NOTE(review): "β" looks like a mangled glyph (probably "✓") —
        # confirm the intended character; kept as-is to preserve output.
        print(f"β Logged to HF dataset: {record.get('log_type', 'unknown')}")
    except Exception:
        # Best-effort logging: never let a log failure break the request
        # path. logger.exception records the traceback.
        logger.exception("Failed to write query log locally")
|