"""
Hugging Face Hub storage for patient evaluations.

Saves evaluation data to a Hugging Face dataset or model repository.
"""
import csv
import io
import json
import os
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple

try:
    from huggingface_hub import HfApi, login, whoami
    from huggingface_hub.utils import HfHubHTTPError
    HF_AVAILABLE = True
except ImportError:
    HF_AVAILABLE = False
class HuggingFaceStorage:
    """Store patient evaluations in a Hugging Face Hub repository.

    Each evaluation is uploaded as its own JSON file, and a master CSV at the
    repo root accumulates one row per evaluation. The public methods report
    their outcome as ``(success, message)`` tuples and log progress with
    ``print``.
    """

    # Environment variables consulted for an access token, in priority order.
    TOKEN_ENV_VARS = (
        "HF_TOKEN",
        "HUGGINGFACEHUB_API_TOKEN",
        "HUGGING_FACE_HUB_TOKEN",
        "HUGGINGFACE_HUB_TOKEN",
    )

    # Master CSV location (always written to the repo root) and the
    # alternative location that is also checked when reading it.
    CSV_FILENAME = "patient_evaluations_master.csv"
    CSV_FALLBACK_PATH = "data/test/patient_evaluations_master.csv"

    # Column order of the master CSV.
    CSV_HEADER = (
        'timestamp', 'patient_id', 'expert_name', 'overall_rating',
        'clinical_accuracy', 'completeness_coverage', 'clinical_relevance',
        'clarity_structure', 'reasoning_risk', 'actionability',
        'hallucination', 'critical_omission', 'feedback',
        'hallucination_comments', 'critical_omission_comments',
    )

    # Dataset card uploaded when the repo has no README.md yet.
    README_CONTENT = (
        "---\n"
        "license: apache-2.0\n"
        "---\n"
        "# Patient Evaluations Dataset\n"
        "This dataset contains clinician evaluations of AI-generated patient summaries.\n"
        "## Dataset Structure\n"
        "The dataset contains a CSV file (`patient_evaluations_master.csv`) with evaluation data.\n"
        "## Columns\n"
        "- `timestamp`: Evaluation timestamp\n"
        "- `patient_id`: Patient identifier\n"
        "- `expert_name`: Clinician identifier\n"
        "- `overall_rating`: Overall quality rating (1-10)\n"
        "- `clinical_accuracy`: Clinical accuracy rating (1-10)\n"
        "- `completeness_coverage`: Completeness/coverage rating (1-10)\n"
        "- `clinical_relevance`: Clinical relevance rating (1-10)\n"
        "- `clarity_structure`: Clarity and structure rating (1-10)\n"
        "- `reasoning_risk`: Reasoning/risk stratification rating (1-10)\n"
        "- `actionability`: Actionability rating (1-10)\n"
        "- `hallucination`: Hallucination severity (1-10)\n"
        "- `critical_omission`: Critical omission severity (1-10)\n"
        "- `feedback`: Overall feedback text\n"
        "- `hallucination_comments`: Comments about hallucinations\n"
        "- `critical_omission_comments`: Comments about critical omissions\n"
    )

    def __init__(self, repo_id: Optional[str] = None, repo_type: str = "dataset"):
        """
        Initialize Hugging Face storage.

        Args:
            repo_id: Hugging Face repo ID (e.g., "username/dataset-name").
                If None, it is taken from HF_EVAL_REPO_ID or derived from
                SPACE_ID (see ``_get_repo_id``).
            repo_type: "dataset" or "model" (dataset is recommended for
                structured data).
        """
        self.repo_id = repo_id or self._get_repo_id()
        self.repo_type = repo_type
        self._token = self._get_token()
        # Hand the token to the client as well, so calls made without a prior
        # login() (e.g. update_csv_master on its own) are still authenticated.
        self.api = HfApi(token=self._token) if HF_AVAILABLE else None

    def _get_token(self) -> Optional[str]:
        """Return the first non-empty token among TOKEN_ENV_VARS, else None."""
        for name in self.TOKEN_ENV_VARS:
            value = os.getenv(name)
            if value:
                return value
        return None

    def _get_repo_id(self) -> Optional[str]:
        """Resolve the repo ID from HF_EVAL_REPO_ID or from the SPACE_ID owner.

        Returns:
            HF_EVAL_REPO_ID when set; otherwise "<owner>/patient-evaluations"
            where <owner> is the part of SPACE_ID before the first "/";
            otherwise None.
        """
        repo_id = os.getenv("HF_EVAL_REPO_ID")
        if repo_id:
            return repo_id

        space_id = os.getenv("SPACE_ID")
        if space_id:
            owner, slash, _ = space_id.partition("/")
            if slash and owner:
                return f"{owner}/patient-evaluations"
        return None

    def _ensure_authenticated(self) -> bool:
        """Log in with the configured token (if any) and verify with whoami().

        Returns:
            True when whoami() succeeds; False when huggingface_hub is not
            installed, the login fails, or the identity check fails.
        """
        if not HF_AVAILABLE:
            print("[HF Auth] HF_AVAILABLE is False")
            return False

        # If a token is provided via env vars, log in programmatically.
        if self._token:
            try:
                print("[HF Auth] Attempting login with token...")
                login(token=self._token, add_to_git_credential=False)
                print("[HF Auth] Login successful")
            except Exception as e:
                print(f"[HF Auth] Warning: Could not login to Hugging Face with provided token: {e}")
                return False

        try:
            user_info = whoami()
            print(f"[HF Auth] Authenticated as: {user_info.get('name', 'unknown')}")
            return True
        except Exception as e:
            print(f"[HF Auth] Authentication check failed: {e}")
            return False

    @staticmethod
    def _http_status(error: BaseException) -> Optional[int]:
        """Return the HTTP status code attached to ``error``, if any.

        Looks for ``error.status_code`` first and then
        ``error.response.status_code``, so it works whichever of the two the
        raised exception actually carries.
        """
        status = getattr(error, "status_code", None)
        if status is None:
            status = getattr(getattr(error, "response", None), "status_code", None)
        return status

    def _ensure_repo_exists(self) -> bool:
        """Ensure the repository exists, creating it when the Hub reports 404.

        Returns:
            True if the repo is reachable or was created; False otherwise.
        """
        if not self.repo_id or not self.api:
            return False

        try:
            self.api.repo_info(self.repo_id, repo_type=self.repo_type)
            return True
        except HfHubHTTPError as e:
            # Read the status defensively: touching a missing attribute here
            # would raise from inside the handler and escape to the caller.
            if self._http_status(e) != 404:
                print(f"Warning: HF API error: {e}")
                return False
        except Exception as e:
            print(f"Warning: Error checking HF repo: {e}")
            return False

        # 404: the repo does not exist yet, try to create it.
        # NOTE(review): no `private` flag is passed, so the repo gets the
        # Hub's default visibility - confirm that is intended for this data.
        try:
            self.api.create_repo(
                repo_id=self.repo_id,
                repo_type=self.repo_type,
                exist_ok=False,
            )
            return True
        except Exception as create_error:
            print(f"Warning: Could not create HF repo: {create_error}")
            return False

    @staticmethod
    def _default_filename(evaluation: Dict) -> str:
        """Build "patient_eval_<patient_id>_<timestamp>.json" for an evaluation.

        Missing or empty timestamps fall back to the current time. Values are
        coerced to ``str``; ":" in the timestamp and path separators in either
        part are replaced with "-" so the result is a single file name.
        """
        patient_id = str(evaluation.get("patient_id", "unknown"))
        timestamp = str(evaluation.get("timestamp") or datetime.now().isoformat())
        for separator in ("/", "\\"):
            patient_id = patient_id.replace(separator, "-")
            timestamp = timestamp.replace(separator, "-")
        timestamp = timestamp.replace(":", "-")
        return f"patient_eval_{patient_id}_{timestamp}.json"

    def _upload_text(self, text: str, path_in_repo: str,
                     commit_message: Optional[str] = None) -> None:
        """Upload ``text`` (UTF-8 encoded) to ``path_in_repo`` in the repo.

        The content is sent as in-memory bytes, so no temporary file is
        created and nothing needs cleaning up if the upload fails.
        ``commit_message`` is only forwarded when given.
        """
        upload_kwargs = {
            "path_or_fileobj": text.encode("utf-8"),
            "path_in_repo": path_in_repo,
            "repo_id": self.repo_id,
            "repo_type": self.repo_type,
        }
        if commit_message is not None:
            upload_kwargs["commit_message"] = commit_message
        self.api.upload_file(**upload_kwargs)

    def _download_if_present(self, path_in_repo: str) -> Optional[str]:
        """Download ``path_in_repo`` and return its local path.

        Returns:
            The local file path, or None when the Hub answers 404.

        Raises:
            Exception: any other download failure is re-raised, so callers
                never mistake an outage for a missing file.
        """
        try:
            return self.api.hf_hub_download(
                repo_id=self.repo_id,
                filename=path_in_repo,
                repo_type=self.repo_type,
                cache_dir=tempfile.gettempdir(),
            )
        except Exception as e:
            if self._http_status(e) == 404:
                return None
            raise

    @classmethod
    def _with_valid_header(cls, rows: List[List[str]]) -> List[List[str]]:
        """Make sure ``rows`` starts with a usable header row.

        An empty list gets the expected header. A first row that differs from
        the expected header is reported, and replaced only when it has fewer
        columns (old format); existing data rows are left as they are.
        """
        expected_header = list(cls.CSV_HEADER)
        if not rows:
            return [expected_header]
        if rows[0] != expected_header:
            print(f"[HF CSV] WARNING: Header mismatch! Existing: {rows[0]}")
            print(f"[HF CSV] Expected: {expected_header}")
            # Update header if it's the old format
            if len(rows[0]) < len(expected_header):
                print("[HF CSV] Updating header to new format")
                rows[0] = expected_header
        return rows

    def _load_master_rows(self) -> List[List[str]]:
        """Return the current master CSV rows, header first.

        Checks the repo root and then ``data/test/``. When the file is in
        neither place a header-only list is returned. Download errors other
        than 404 propagate to the caller.
        """
        csv_path = self._download_if_present(self.CSV_FILENAME)
        if csv_path is None:
            csv_path = self._download_if_present(self.CSV_FALLBACK_PATH)
            if csv_path is not None:
                print("[HF CSV] Found CSV in data/test/ directory")
        if csv_path is None:
            print("[HF CSV] CSV file not found, creating new one.")
            return [list(self.CSV_HEADER)]

        print(f"[HF CSV] Downloaded existing CSV from {csv_path}")
        # newline='' lets the csv module handle line endings itself, keeping
        # newlines embedded in quoted fields intact.
        with open(csv_path, 'r', newline='', encoding='utf-8') as f:
            rows = list(csv.reader(f))
        print(f"[HF CSV] Loaded {len(rows)} rows from existing CSV (including header)")
        if len(rows) > 1:
            print(f"[HF CSV] Existing data rows: {len(rows) - 1}")
        return self._with_valid_header(rows)

    def _ensure_readme(self) -> None:
        """Upload the default dataset card if the repo has no README.md.

        Best effort: any failure is logged and otherwise ignored. An existing
        README is never overwritten, including when its presence cannot be
        determined.
        """
        try:
            if self._download_if_present("README.md") is not None:
                print("[HF CSV] README.md already exists")
                return
            self._upload_text(
                self.README_CONTENT,
                "README.md",
                commit_message="Add README.md for dataset card",
            )
            print("[HF CSV] Created README.md for dataset card")
        except Exception as e:
            print(f"[HF CSV] Warning: Could not create/update README.md: {e}")

    def save_evaluation(self, evaluation: Dict, filename: Optional[str] = None) -> Tuple[bool, str]:
        """
        Save a single evaluation to Hugging Face Hub as a JSON file.

        Args:
            evaluation: Evaluation data dictionary (must be JSON serializable).
            filename: Optional path in the repo (generated from the patient ID
                and timestamp if not provided).

        Returns:
            (success: bool, message: str)
        """
        print(f"[HF Save] Starting save_evaluation, repo_id={self.repo_id}")
        if self.api is None or not HF_AVAILABLE:
            print("[HF Save] HF_AVAILABLE is False")
            return False, "huggingface_hub not available. Install with: pip install huggingface_hub"
        if not self._ensure_authenticated():
            print("[HF Save] Authentication failed")
            return False, "Not authenticated with Hugging Face. Please login or set HF_TOKEN."
        if not self.repo_id:
            print("[HF Save] repo_id is None")
            return False, "No Hugging Face repo ID configured. Set HF_EVAL_REPO_ID environment variable."
        if not self._ensure_repo_exists():
            print(f"[HF Save] Repo existence check failed for {self.repo_id}")
            return False, f"Could not access or create Hugging Face repo: {self.repo_id}"
        print(f"[HF Save] All checks passed, proceeding with upload to {self.repo_id}")

        try:
            if not filename:
                filename = self._default_filename(evaluation)
            payload = json.dumps(evaluation, ensure_ascii=False, indent=2)
            print(f"[HF Save] Uploading file {filename} to {self.repo_id}...")
            self._upload_text(payload, filename)
            print(f"[HF Save] Upload successful: {filename}")
            return True, f"Saved to Hugging Face: {self.repo_id}/{filename}"
        except Exception as e:
            return False, f"Error saving to Hugging Face: {str(e)}"

    def update_csv_master(self, new_row: List) -> Tuple[bool, str]:
        """
        Append a new evaluation row to the master CSV and re-upload it.

        The existing CSV is downloaded, extended and uploaded again to the
        repo root. If the existing file cannot be read for any reason other
        than "not found", nothing is uploaded, so earlier rows are never
        replaced by a header-only file. Afterwards a default README.md is
        added when the repo has none (best effort).

        NOTE(review): the download/append/upload sequence is not atomic;
        concurrent writers could overwrite each other's rows.

        Args:
            new_row: List of values for the CSV row, in CSV_HEADER order.

        Returns:
            (success: bool, message: str)
        """
        # self.api is only set when huggingface_hub could be imported.
        if not self.api or not self.repo_id:
            return False, "Hugging Face storage not available"

        try:
            csv_data = self._load_master_rows()
            csv_data.append(new_row)
            print(f"[HF CSV] Total rows after append: {len(csv_data)} (including header)")

            buffer = io.StringIO(newline="")
            csv.writer(buffer).writerows(csv_data)

            print(f"[HF CSV] Uploading CSV ({len(csv_data)} rows) to {self.repo_id}...")
            self._upload_text(
                buffer.getvalue(),
                self.CSV_FILENAME,
                commit_message=f"Add evaluation: {new_row[1] if len(new_row) > 1 else 'new'}",
            )
            print(f"[HF CSV] CSV upload successful ({len(csv_data)} rows total)")
        except Exception as e:
            return False, f"Error updating CSV: {str(e)}"

        # The dataset card is optional and never affects the result.
        self._ensure_readme()
        return True, f"Updated CSV in Hugging Face: {self.repo_id}/{self.CSV_FILENAME}"
# Module-wide storage singleton; created on the first get_hf_storage() call.
_hf_storage = None


def get_hf_storage() -> Optional[HuggingFaceStorage]:
    """Return the shared HuggingFaceStorage, constructing it on first use."""
    global _hf_storage
    if _hf_storage is not None:
        return _hf_storage
    _hf_storage = HuggingFaceStorage()
    return _hf_storage
def save_to_huggingface(evaluation: Dict, csv_row: Optional[List] = None) -> Tuple[bool, str]:
    """
    Save an evaluation as JSON and, optionally, append it to the master CSV.

    The JSON save decides the overall result: a failed CSV update is only
    mentioned in the message, while a failed JSON save is reported as a
    failure without the CSV outcome. The CSV update is attempted whenever
    ``csv_row`` is non-empty, regardless of the JSON result.

    Args:
        evaluation: Evaluation data dictionary
        csv_row: Optional CSV row to append to master CSV

    Returns:
        (success: bool, message: str)
    """
    if not HF_AVAILABLE:
        return False, "huggingface_hub not available. Install with: pip install huggingface_hub"

    storage = get_hf_storage()
    if not storage:
        return False, "Hugging Face storage not initialized"

    json_ok, json_msg = storage.save_evaluation(evaluation)

    # Without a CSV row the JSON outcome is the whole answer.
    if not csv_row:
        return json_ok, json_msg

    csv_ok, csv_msg = storage.update_csv_master(csv_row)
    if not json_ok:
        return False, f"JSON save failed: {json_msg}"
    if csv_ok:
        return True, f"{json_msg}; {csv_msg}"
    return True, f"{json_msg} (CSV update failed: {csv_msg})"
|