Spaces:

JesseLiu
/

chatbot-mimic-notes

Sleeping

Jesse Liu

Sync latest app + data (clinician groups, jsonl)

a25d70c 4 months ago

15.9 kB

	"""
	Hugging Face Hub storage for patient evaluations.
	Saves evaluation data to Hugging Face Dataset or Repository.
	"""

	import os
	import json
	import csv
	import tempfile
	from datetime import datetime
	from typing import Dict, List, Optional, Tuple
	from pathlib import Path

	# huggingface_hub is an optional dependency: record whether it could be
	# imported so the rest of the module can degrade gracefully without it.
	try:
	    from huggingface_hub import HfApi, login, whoami
	    from huggingface_hub.utils import HfHubHTTPError
	    HF_AVAILABLE = True
	except ImportError:
	    HF_AVAILABLE = False
	    # Inert fallbacks so the names referenced elsewhere in this module
	    # (including ``except HfHubHTTPError`` clauses) always resolve.
	    HfApi = None
	    login = None
	    whoami = None

	    class HfHubHTTPError(Exception):
	        """Placeholder used when huggingface_hub is not installed."""


	class HuggingFaceStorage:
	    """Store patient evaluations in a Hugging Face Hub repository.

	    Every evaluation is uploaded as its own JSON file; evaluations can also
	    be appended to a master CSV kept at the root of the same repository.
	    """

	    # Name of the aggregated CSV stored at the repo root.
	    CSV_FILENAME = "patient_evaluations_master.csv"

	    # Expected column order of the master CSV.
	    CSV_HEADER = [
	        'timestamp', 'patient_id', 'expert_name', 'overall_rating',
	        'clinical_accuracy', 'completeness_coverage', 'clinical_relevance', 'clarity_structure',
	        'reasoning_risk', 'actionability', 'hallucination', 'critical_omission',
	        'feedback', 'hallucination_comments', 'critical_omission_comments',
	    ]

	    def __init__(self, repo_id: Optional[str] = None, repo_type: str = "dataset"):
	        """
	        Initialize Hugging Face storage.

	        Args:
	            repo_id: Hugging Face repo ID (e.g., "username/dataset-name").
	                If None, it is resolved from HF_EVAL_REPO_ID or SPACE_ID.
	            repo_type: "dataset" or "model" (dataset is recommended for structured data)
	        """
	        self.repo_id = repo_id or self._get_repo_id()
	        self.repo_type = repo_type
	        self.api = HfApi() if HF_AVAILABLE else None
	        self._token = self._get_token()

	    # ------------------------------------------------------------------
	    # Small helpers
	    # ------------------------------------------------------------------

	    @staticmethod
	    def _http_status(err: BaseException) -> Optional[int]:
	        """Return the HTTP status code carried by ``err``, or None.

	        The status is looked up on ``err.response.status_code`` first and on
	        ``err.status_code`` as a fallback, so callers never hit an
	        AttributeError while already handling an exception.
	        """
	        response = getattr(err, "response", None)
	        status = getattr(response, "status_code", None)
	        if status is None:
	            status = getattr(err, "status_code", None)
	        return status

	    @staticmethod
	    def _remove_temp_file(path: Optional[str]) -> None:
	        """Delete ``path`` if it is set and exists; never raises."""
	        if not path or not os.path.exists(path):
	            return
	        try:
	            os.unlink(path)
	        except OSError as e:
	            print(f"[HF Storage] Warning: could not remove temp file {path}: {e}")

	    @staticmethod
	    def _default_filename(evaluation: Dict) -> str:
	        """Build ``patient_eval_<patient_id>_<timestamp>.json`` for an evaluation.

	        A missing or empty timestamp falls back to the current time. Colons
	        become dashes and path separators become underscores so the result
	        is a single file name rather than a nested repo path.
	        """
	        patient_id = str(evaluation.get("patient_id", "unknown"))
	        timestamp = str(evaluation.get("timestamp") or datetime.now().isoformat())
	        timestamp = timestamp.replace(":", "-")
	        separators = str.maketrans({"/": "_", "\\": "_"})
	        return (
	            f"patient_eval_{patient_id.translate(separators)}"
	            f"_{timestamp.translate(separators)}.json"
	        )

	    def _get_token(self) -> Optional[str]:
	        """Get HF token from common environment variable names."""
	        return (
	            os.getenv("HF_TOKEN")
	            or os.getenv("HUGGINGFACEHUB_API_TOKEN")
	            or os.getenv("HUGGING_FACE_HUB_TOKEN")
	            or os.getenv("HUGGINGFACE_HUB_TOKEN")
	        )

	    def _get_repo_id(self) -> Optional[str]:
	        """Try to get repo ID from environment or Space name."""
	        # An explicit setting always wins.
	        repo_id = os.getenv("HF_EVAL_REPO_ID")
	        if repo_id:
	            return repo_id

	        # Otherwise derive "<owner>/patient-evaluations" from the Space ID.
	        space_id = os.getenv("SPACE_ID")
	        if space_id:
	            username = space_id.split("/")[0] if "/" in space_id else None
	            if username:
	                return f"{username}/patient-evaluations"

	        return None

	    # ------------------------------------------------------------------
	    # Authentication / repository checks
	    # ------------------------------------------------------------------

	    def _ensure_authenticated(self) -> bool:
	        """Check if user is authenticated with Hugging Face."""
	        if not HF_AVAILABLE:
	            print("[HF Auth] HF_AVAILABLE is False")
	            return False

	        # If a token is provided via env vars, log in programmatically
	        if self._token:
	            try:
	                print("[HF Auth] Attempting login with token...")
	                login(token=self._token, add_to_git_credential=False)
	                print("[HF Auth] Login successful")
	            except Exception as e:
	                print(f"[HF Auth] Warning: Could not login to Hugging Face with provided token: {e}")
	                return False

	        try:
	            user_info = whoami()
	            print(f"[HF Auth] Authenticated as: {user_info.get('name', 'unknown')}")
	            return True
	        except Exception as e:
	            print(f"[HF Auth] Authentication check failed: {e}")
	            return False

	    def _ensure_repo_exists(self) -> bool:
	        """Ensure the repository exists, creating it when the Hub reports 404.

	        Returns:
	            True if the repo exists or was created, False otherwise.
	        """
	        if not self.repo_id or not self.api:
	            return False

	        try:
	            self.api.repo_info(self.repo_id, repo_type=self.repo_type)
	            return True
	        except HfHubHTTPError as e:
	            # Only a 404 means "missing"; anything else is a real API error.
	            if self._http_status(e) != 404:
	                print(f"Warning: HF API error: {e}")
	                return False
	        except Exception as e:
	            print(f"Warning: Error checking HF repo: {e}")
	            return False

	        # Repo doesn't exist, try to create it.
	        # NOTE(review): no `private=` is passed, so the Hub's default
	        # visibility applies - confirm that is acceptable for this data.
	        try:
	            self.api.create_repo(
	                repo_id=self.repo_id,
	                repo_type=self.repo_type,
	                # Tolerate another worker creating the repo in the meantime.
	                exist_ok=True,
	            )
	            return True
	        except Exception as create_error:
	            print(f"Warning: Could not create HF repo: {create_error}")
	            return False

	    # ------------------------------------------------------------------
	    # JSON upload
	    # ------------------------------------------------------------------

	    def save_evaluation(self, evaluation: Dict, filename: Optional[str] = None) -> Tuple[bool, str]:
	        """
	        Save a single evaluation to Hugging Face Hub.

	        Args:
	            evaluation: Evaluation data dictionary
	            filename: Optional filename (will generate if not provided)

	        Returns:
	            (success: bool, message: str)
	        """
	        print(f"[HF Save] Starting save_evaluation, repo_id={self.repo_id}")
	        if not HF_AVAILABLE:
	            print("[HF Save] HF_AVAILABLE is False")
	            return False, "huggingface_hub not available. Install with: pip install huggingface_hub"

	        if not self._ensure_authenticated():
	            print("[HF Save] Authentication failed")
	            return False, "Not authenticated with Hugging Face. Please login or set HF_TOKEN."

	        if not self.repo_id:
	            print("[HF Save] repo_id is None")
	            return False, "No Hugging Face repo ID configured. Set HF_EVAL_REPO_ID environment variable."

	        if not self._ensure_repo_exists():
	            print(f"[HF Save] Repo existence check failed for {self.repo_id}")
	            return False, f"Could not access or create Hugging Face repo: {self.repo_id}"

	        print(f"[HF Save] All checks passed, proceeding with upload to {self.repo_id}")

	        temp_path = None
	        try:
	            if not filename:
	                filename = self._default_filename(evaluation)

	            with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False, encoding='utf-8') as f:
	                # Record the path before writing so a serialization error
	                # still gets the file cleaned up.
	                temp_path = f.name
	                json.dump(evaluation, f, ensure_ascii=False, indent=2)

	            print(f"[HF Save] Uploading file {filename} to {self.repo_id}...")
	            self.api.upload_file(
	                path_or_fileobj=temp_path,
	                path_in_repo=filename,
	                repo_id=self.repo_id,
	                repo_type=self.repo_type
	            )
	            print(f"[HF Save] Upload successful: {filename}")

	            return True, f"Saved to Hugging Face: {self.repo_id}/{filename}"

	        except Exception as e:
	            return False, f"Error saving to Hugging Face: {str(e)}"
	        finally:
	            self._remove_temp_file(temp_path)

	    # ------------------------------------------------------------------
	    # Master CSV
	    # ------------------------------------------------------------------

	    def _download_master_csv(self, csv_filename: str) -> Optional[List[List[str]]]:
	        """Download and parse the existing master CSV.

	        The repo root is tried first, then ``data/test/``.

	        Returns:
	            The parsed rows (header included), or None when the Hub answers
	            404 for both locations.

	        Raises:
	            Exception: any download failure other than an HTTP 404, so a
	                transient error is never mistaken for "no CSV yet".
	        """
	        for path_in_repo in (csv_filename, f"data/test/{csv_filename}"):
	            try:
	                csv_path = self.api.hf_hub_download(
	                    repo_id=self.repo_id,
	                    filename=path_in_repo,
	                    repo_type=self.repo_type,
	                    cache_dir=tempfile.gettempdir()
	                )
	            except HfHubHTTPError as err:
	                if self._http_status(err) != 404:
	                    raise
	                continue

	            if path_in_repo != csv_filename:
	                print(f"[HF CSV] Found CSV in data/test/ directory")
	            print(f"[HF CSV] Downloaded existing CSV from {csv_path}")
	            # newline='' lets the csv module handle embedded line breaks.
	            with open(csv_path, 'r', newline='', encoding='utf-8') as f:
	                rows = list(csv.reader(f))
	            print(f"[HF CSV] Loaded {len(rows)} rows from existing CSV (including header)")
	            if len(rows) > 1:
	                print(f"[HF CSV] Existing data rows: {len(rows) - 1}")
	            return rows

	        return None

	    def _reconcile_header(self, csv_data: List[List[str]]) -> None:
	        """Warn on a header mismatch; replace the header if it is shorter.

	        ``csv_data`` is modified in place. Data rows are left untouched.
	        """
	        expected_header = list(self.CSV_HEADER)
	        if not csv_data or csv_data[0] == expected_header:
	            return

	        print(f"[HF CSV] WARNING: Header mismatch! Existing: {csv_data[0]}")
	        print(f"[HF CSV] Expected: {expected_header}")
	        # Update header if it's the old (shorter) format
	        if len(csv_data[0]) < len(expected_header):
	            print(f"[HF CSV] Updating header to new format")
	            csv_data[0] = expected_header

	    def _ensure_dataset_card(self) -> None:
	        """Best-effort upload of a README.md when the repo has none.

	        A README is only created when the Hub answers 404 for it; any other
	        failure is logged and skipped so an existing card is not replaced.
	        """
	        readme_path = None
	        try:
	            try:
	                self.api.hf_hub_download(
	                    repo_id=self.repo_id,
	                    filename="README.md",
	                    repo_type=self.repo_type,
	                    cache_dir=tempfile.gettempdir()
	                )
	                print(f"[HF CSV] README.md already exists")
	                return
	            except HfHubHTTPError as err:
	                if self._http_status(err) != 404:
	                    raise

	            # README doesn't exist, create one
	            readme_content = """---
	license: apache-2.0
	---

	# Patient Evaluations Dataset

	This dataset contains clinician evaluations of AI-generated patient summaries.

	## Dataset Structure

	The dataset contains a CSV file (`patient_evaluations_master.csv`) with evaluation data.

	## Columns

	- `timestamp`: Evaluation timestamp
	- `patient_id`: Patient identifier
	- `expert_name`: Clinician identifier
	- `overall_rating`: Overall quality rating (1-10)
	- `clinical_accuracy`: Clinical accuracy rating (1-10)
	- `completeness_coverage`: Completeness/coverage rating (1-10)
	- `clinical_relevance`: Clinical relevance rating (1-10)
	- `clarity_structure`: Clarity and structure rating (1-10)
	- `reasoning_risk`: Reasoning/risk stratification rating (1-10)
	- `actionability`: Actionability rating (1-10)
	- `hallucination`: Hallucination severity (1-10)
	- `critical_omission`: Critical omission severity (1-10)
	- `feedback`: Overall feedback text
	- `hallucination_comments`: Comments about hallucinations
	- `critical_omission_comments`: Comments about critical omissions
	"""
	            with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False, encoding='utf-8') as f:
	                readme_path = f.name
	                f.write(readme_content)

	            self.api.upload_file(
	                path_or_fileobj=readme_path,
	                path_in_repo="README.md",
	                repo_id=self.repo_id,
	                repo_type=self.repo_type,
	                commit_message="Add README.md for dataset card"
	            )
	            print(f"[HF CSV] Created README.md for dataset card")
	        except Exception as e:
	            print(f"[HF CSV] Warning: Could not create/update README.md: {e}")
	        finally:
	            self._remove_temp_file(readme_path)

	    def update_csv_master(self, new_row: List) -> Tuple[bool, str]:
	        """
	        Update the master CSV file with a new evaluation row.

	        The existing CSV is downloaded, the row is appended and the whole
	        file is uploaded again. If the existing CSV cannot be downloaded for
	        any reason other than "not found", the update is aborted instead of
	        uploading a fresh file over it.

	        NOTE(review): this is a download/append/upload cycle with no locking
	        visible here, so concurrent writers may overwrite each other.

	        Args:
	            new_row: List of values for the CSV row

	        Returns:
	            (success: bool, message: str)
	        """
	        if not HF_AVAILABLE or not self.api or not self.repo_id:
	            return False, "Hugging Face storage not available"

	        csv_filename = self.CSV_FILENAME
	        temp_path = None

	        try:
	            existing_rows = self._download_master_csv(csv_filename)
	            if existing_rows:
	                csv_data = existing_rows
	                self._reconcile_header(csv_data)
	            else:
	                # Missing (None) or empty file: start from the header.
	                print(f"[HF CSV] CSV file not found or empty, creating new one.")
	                csv_data = [list(self.CSV_HEADER)]

	            # Append new row
	            csv_data.append(new_row)
	            print(f"[HF CSV] Total rows after append: {len(csv_data)} (including header)")

	            # Write to temp file
	            with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False,
	                                             newline='', encoding='utf-8') as f:
	                temp_path = f.name
	                csv.writer(f).writerows(csv_data)

	            # Upload to Hugging Face
	            print(f"[HF CSV] Uploading CSV ({len(csv_data)} rows) to {self.repo_id}...")
	            self.api.upload_file(
	                path_or_fileobj=temp_path,
	                path_in_repo=csv_filename,
	                repo_id=self.repo_id,
	                repo_type=self.repo_type,
	                commit_message=f"Add evaluation: {new_row[1] if len(new_row) > 1 else 'new'}"
	            )
	            print(f"[HF CSV] CSV upload successful ({len(csv_data)} rows total)")

	            # Also try to upload a README.md if it doesn't exist (for dataset card)
	            self._ensure_dataset_card()

	            return True, f"Updated CSV in Hugging Face: {self.repo_id}/{csv_filename}"

	        except Exception as e:
	            return False, f"Error updating CSV: {str(e)}"
	        finally:
	            self._remove_temp_file(temp_path)


	# Module-wide storage singleton, built on first request.
	_hf_storage = None

	def get_hf_storage() -> Optional[HuggingFaceStorage]:
	    """Return the shared HuggingFaceStorage, constructing it on first use."""
	    global _hf_storage
	    if _hf_storage is not None:
	        return _hf_storage
	    _hf_storage = HuggingFaceStorage()
	    return _hf_storage

	def save_to_huggingface(evaluation: Dict, csv_row: Optional[List] = None) -> Tuple[bool, str]:
	    """
	    Save an evaluation as JSON and, when a row is given, append it to the master CSV.

	    The overall result follows the JSON upload: a failed CSV update is only
	    reported in the message, while a failed JSON upload makes the call fail.

	    Args:
	        evaluation: Evaluation data dictionary
	        csv_row: Optional CSV row to append to master CSV

	    Returns:
	        (success: bool, message: str)
	    """
	    if not HF_AVAILABLE:
	        return False, "huggingface_hub not available. Install with: pip install huggingface_hub"

	    storage = get_hf_storage()
	    if not storage:
	        return False, "Hugging Face storage not initialized"

	    json_ok, json_msg = storage.save_evaluation(evaluation)

	    # Without a CSV row the JSON outcome is the whole answer.
	    if not csv_row:
	        return json_ok, json_msg

	    # The CSV update is attempted regardless of the JSON outcome.
	    csv_ok, csv_msg = storage.update_csv_master(csv_row)

	    if not json_ok:
	        return False, f"JSON save failed: {json_msg}"
	    if csv_ok:
	        return True, f"{json_msg}; {csv_msg}"
	    return True, f"{json_msg} (CSV update failed: {csv_msg})"