#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Lazy-loaded shared data cache for data viewer tabs.
Loads data_viewer.jsonl once on first access, not at import time.
"""
from __future__ import annotations
import json
import pandas as pd
from pathlib import Path
# Directory two levels above this module's resolved location.
BASE_DIR = Path(__file__).resolve().parent.parent

# Bucket mount point (HF Storage Bucket mounted at /data in Space runtime).
_BUCKET_DIR = Path("/data")

# Decided once, at import time: use the bucket copy of the data file when it
# exists, otherwise fall back to the copy that lives inside the repository.
if (_BUCKET_DIR / "data_viewer.jsonl").exists():
    DATA_VIEWER_FILE = _BUCKET_DIR / "data_viewer.jsonl"
else:
    DATA_VIEWER_FILE = BASE_DIR / "data" / "data_viewer.jsonl"

# The index file is always looked up in the repository's data directory.
DATA_VIEWER_INDEX_FILE = BASE_DIR / "data" / "data_viewer_index.json"

# Columns that must all be present for the loaded data to be used by get_data().
_REQUIRED_COLS = [
    "model_name",
    "id",
    "prompt",
    "article",
    "overall_score",
    "comprehensiveness_score",
    "insight_score",
    "instruction_following_score",
    "readability_score",
]

# Module-level memo slots; None means "not loaded yet".
_cache: pd.DataFrame | None = None
_index_cache: dict | None = None
def get_data() -> pd.DataFrame:
    """Return every record of the data-viewer JSONL file as a cached DataFrame.

    The file is parsed on the first call only; later calls return the same
    cached DataFrame object. Blank lines, lines that are not valid JSON, and
    lines whose JSON value is not an object are skipped.

    Returns:
        The parsed records with the ``id`` column cast to ``str``. If the file
        does not exist, yields no records, or the records lack any column
        listed in ``_REQUIRED_COLS``, an empty DataFrame with exactly those
        columns is returned (and cached) instead.
    """
    global _cache
    if _cache is not None:
        return _cache

    records = []
    if DATA_VIEWER_FILE.exists():
        with DATA_VIEWER_FILE.open(encoding="utf-8") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # `null`, `[1, 2]` or `42` parse as JSON but are not records;
                # only JSON objects may be handed to pd.DataFrame as rows.
                if isinstance(record, dict):
                    records.append(record)

    df = pd.DataFrame(records)
    if df.empty or not all(col in df.columns for col in _REQUIRED_COLS):
        _cache = pd.DataFrame(columns=_REQUIRED_COLS)
    else:
        # Ids are compared as strings by the lookup helpers in this module.
        df["id"] = df["id"].astype(str)
        _cache = df
    return _cache
def _task_sort_key(item_id: str) -> tuple:
    """Order numeric ids by integer value, then non-numeric ids alphabetically."""
    try:
        return (0, int(item_id), "")
    except ValueError:
        return (1, 0, item_id)


def get_index() -> dict:
    """Return the data-viewer index, loading or building it on first use.

    The result is cached in ``_index_cache``; later calls return the same dict.

    Resolution order:
      1. If ``DATA_VIEWER_INDEX_FILE`` exists and holds a JSON object, that
         object is returned as-is (its schema is not validated here).
      2. Otherwise the index is rebuilt by scanning ``DATA_VIEWER_FILE``.

    Returns:
        When rebuilt, a dict with two keys:
          * ``"models"``: sorted list of the distinct truthy ``model_name``
            values found in the data file.
          * ``"tasks"``: list of ``{"id": str, "prompt": str}`` dicts, one per
            distinct id, carrying the prompt of the first record seen for that
            id. Numeric ids come first in ascending integer order, followed by
            any non-numeric ids in lexicographic order.
        The rebuilt index has no ``"lookup"`` key. If the data file is missing
        both lists are empty.
    """
    global _index_cache
    if _index_cache is not None:
        return _index_cache

    if DATA_VIEWER_INDEX_FILE.exists():
        try:
            loaded = json.loads(DATA_VIEWER_INDEX_FILE.read_text(encoding="utf-8"))
        except (UnicodeDecodeError, json.JSONDecodeError):
            # Corrupt index file: fall through and rebuild from the data file.
            loaded = None
        # Callers use dict methods on the index, so anything that is not a
        # JSON object is treated like a corrupt file.
        if isinstance(loaded, dict):
            _index_cache = loaded
            return _index_cache

    models = set()
    tasks = {}
    if DATA_VIEWER_FILE.exists():
        with DATA_VIEWER_FILE.open(encoding="utf-8") as fh:
            for line in fh:
                if not line.strip():
                    continue
                try:
                    item = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if not isinstance(item, dict):
                    # Valid JSON, but not a record.
                    continue

                model = item.get("model_name")
                if model:
                    models.add(model)

                raw_id = item.get("id")
                if raw_id is None:
                    # str(None) would register a bogus task named "None".
                    continue
                item_id = str(raw_id)
                if item_id and item_id not in tasks:
                    tasks[item_id] = item.get("prompt") or ""

    _index_cache = {
        "models": sorted(models),
        "tasks": [
            {"id": item_id, "prompt": tasks[item_id]}
            for item_id in sorted(tasks, key=_task_sort_key)
        ],
    }
    return _index_cache
def get_entry(model_name: str, item_id: str) -> dict | None:
    """Return the record for one (model, task id) pair, or None.

    A fast path is tried first: if the index returned by ``get_index()`` has a
    ``"lookup"`` entry under the key ``"<model_name>\\t<item_id>"``, it is read
    as an ``(offset, length)`` pair, that byte range of the data file is
    parsed, and the result is accepted only if its ``model_name`` and ``id``
    match the request. Any failure on the fast path (stale or malformed index
    entry, undecodable bytes, invalid JSON, mismatching record) falls back to
    a sequential scan of the whole file.

    Args:
        model_name: Value to match against each record's ``model_name``.
        item_id: Task id; compared as a string against ``str(record["id"])``.

    Returns:
        The first matching record as a dict, or None when either argument is
        falsy, the data file does not exist, or no record matches.
    """
    if not model_name or not item_id or not DATA_VIEWER_FILE.exists():
        return None
    item_id = str(item_id)

    location = get_index().get("lookup", {}).get(f"{model_name}\t{item_id}")
    if location:
        with DATA_VIEWER_FILE.open("rb") as fh:
            try:
                offset, length = location
                fh.seek(offset)
                item = json.loads(fh.read(length).decode("utf-8"))
            except (OSError, TypeError, ValueError):
                # ValueError covers both UnicodeDecodeError (offset lands
                # inside a multi-byte character) and json.JSONDecodeError;
                # TypeError/OSError cover a malformed location. In every case
                # the index is not trusted and the scan below takes over.
                item = None
        if (
            isinstance(item, dict)
            and item.get("model_name") == model_name
            and str(item.get("id")) == item_id
        ):
            return item

    with DATA_VIEWER_FILE.open(encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            try:
                item = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(item, dict):
                # Valid JSON, but not a record.
                continue
            if item.get("model_name") == model_name and str(item.get("id")) == item_id:
                return item
    return None
def get_entries_for_task(item_id: str, model_names: set[str]) -> dict[str, dict]:
    """Return the records of one task for several models, keyed by model name.

    A fast path is tried first: for every requested model the index returned
    by ``get_index()`` is consulted under ``"lookup"`` with the key
    ``"<model>\\t<item_id>"``. Each hit is read as an ``(offset, length)`` byte
    range of the data file and accepted only if the parsed record's
    ``model_name`` and ``id`` match. If every model that had an index entry
    yields a valid record, that result is returned directly; models without an
    index entry are then simply absent from the result. If any indexed read
    fails or mismatches, the whole file is scanned sequentially instead.

    Args:
        item_id: Task id; compared as a string against ``str(record["id"])``.
        model_names: Models whose records are wanted.

    Returns:
        Mapping of model name to its record. Empty when either argument is
        falsy or the data file does not exist; models with no matching record
        are omitted.
    """
    if not item_id or not model_names or not DATA_VIEWER_FILE.exists():
        return {}
    item_id = str(item_id)

    lookup = get_index().get("lookup", {})
    locations = {}
    for model in model_names:
        location = lookup.get(f"{model}\t{item_id}")
        if location:
            locations[model] = location

    if locations:
        found = {}
        with DATA_VIEWER_FILE.open("rb") as fh:
            for model, location in locations.items():
                try:
                    offset, length = location
                    fh.seek(offset)
                    item = json.loads(fh.read(length).decode("utf-8"))
                except (OSError, TypeError, ValueError):
                    # ValueError covers UnicodeDecodeError (offset inside a
                    # multi-byte character) and json.JSONDecodeError;
                    # TypeError/OSError cover a malformed location. The entry
                    # is skipped, which forces the full scan below.
                    continue
                if (
                    isinstance(item, dict)
                    and item.get("model_name") == model
                    and str(item.get("id")) == item_id
                ):
                    found[model] = item
        if len(found) == len(locations):
            return found

    # Slow path: the index was missing, stale, or inconsistent.
    found = {}
    with DATA_VIEWER_FILE.open(encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            try:
                item = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(item, dict):
                # Valid JSON, but not a record.
                continue
            model = item.get("model_name")
            if str(item.get("id")) == item_id and model in model_names:
                found[model] = item
                # Stop reading as soon as every requested model is covered.
                if len(found) == len(model_names):
                    break
    return found
|