File size: 2,918 Bytes
fa7eb7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# src/hf_logging.py
"""Hugging Face dataset logging with CommitScheduler."""

import json
import logging
import uuid
from datetime import datetime, timezone
from pathlib import Path

from huggingface_hub import CommitScheduler

from src.config import HF_LOG_REPO_ID, HF_LOG_EVERY_MINUTES

logger = logging.getLogger(__name__)

# Unique ID for this app instance. Also used to name the local log file so
# the uploaded file on the Hub can be correlated with the session_id stored
# inside each JSON record.
SESSION_ID = uuid.uuid4().hex

# Initialize scheduler & local file if logging is configured
if HF_LOG_REPO_ID:
    # Previously the filename used a second, unrelated uuid4(), which made it
    # impossible to match a pushed file to the SESSION_ID printed below and
    # embedded in every log line. Reuse SESSION_ID instead.
    feedback_file = Path("query_logs") / f"queries_{SESSION_ID}.jsonl"
    feedback_folder = feedback_file.parent
    feedback_folder.mkdir(parents=True, exist_ok=True)

    # CommitScheduler periodically commits everything under feedback_folder
    # to the dataset repo in the background.
    scheduler = CommitScheduler(
        repo_id=HF_LOG_REPO_ID,
        repo_type="dataset",
        folder_path=feedback_folder,
        path_in_repo="data",             # files go in data/ on the dataset
        every=HF_LOG_EVERY_MINUTES,      # in minutes
    )

    # Print initialization info
    print("\n" + "="*80)
    print("HF DATASET LOGGING INITIALIZED")
    print("="*80)
    print(f"Session ID: {SESSION_ID}")
    print(f"Repository: {HF_LOG_REPO_ID}")
    print(f"Local directory: {feedback_folder.absolute()}")
    print(f"Local file: {feedback_file.name}")
    print(f"Commit frequency: every {HF_LOG_EVERY_MINUTES} minutes")
    print("="*80 + "\n")
else:
    # Logging disabled: sentinels make log_query_event() a no-op.
    scheduler = None
    feedback_file = None
    print("HF dataset logging disabled (HF_LOG_REPO_ID not set)")


def log_query_event(payload: dict) -> None:
    """Append one JSON log line that CommitScheduler will push to the Hub.

    payload must be JSON-serializable. session_id and timestamp are added if
    missing (to an internal copy — the caller's dict is never mutated).
    No-op if HF_LOG_REPO_ID is not configured.

    Expected payload fields:
        - log_type: Type of event (aql_query, zilliz_query_stats, click_event)
        - request_id: Unique ID for the request/search (optional, but recommended)
        - session_id: Session ID (auto-added if missing)
        - timestamp: ISO timestamp (auto-added if missing)
        - error_occurred: Boolean indicating if an error occurred (optional)
        - error_message: Error message if error_occurred is True (optional)
        - error_type: Type of error (optional)
        ... plus other event-specific fields
    """
    if scheduler is None or feedback_file is None:
        return

    # Shallow-copy so we never mutate the caller's dict as a side effect.
    record = dict(payload)
    record.setdefault("session_id", SESSION_ID)
    # datetime.utcnow() is deprecated (Python 3.12+) and returns a naive
    # datetime; use an explicit, timezone-aware UTC timestamp instead.
    record.setdefault("timestamp", datetime.now(timezone.utc).isoformat())

    try:
        # The scheduler's lock prevents the background commit thread from
        # pushing a half-written line.
        with scheduler.lock:
            with feedback_file.open("a", encoding="utf-8") as f:
                f.write(json.dumps(record))
                f.write("\n")
        print(f"βœ“ Logged to HF dataset: {record.get('log_type', 'unknown')}")
    except Exception as e:
        # Best-effort logging: never let a logging failure break the request.
        logger.error(f"Failed to write query log locally: {e}")