# NOTE(review): removed HF Spaces page-scrape artifact ("Spaces: Sleeping Sleeping") — not part of the source file.
"""
Hugging Face Hub storage for patient evaluations.

Saves evaluation data to a Hugging Face Dataset or Repository.
"""
import csv
import json
import os
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple
# Optional dependency: the module degrades gracefully (HF_AVAILABLE = False)
# when huggingface_hub is not installed.
try:
    from huggingface_hub import HfApi, login, whoami
    from huggingface_hub.utils import HfHubHTTPError
except ImportError:
    HF_AVAILABLE = False
else:
    HF_AVAILABLE = True
class HuggingFaceStorage:
    """Store patient evaluations (per-evaluation JSON files plus a master CSV) in a Hugging Face Hub repo."""

    # Canonical column order for the master CSV. Single source of truth:
    # used both when creating a fresh CSV and when validating/upgrading an
    # existing header (the original code duplicated this literal twice).
    CSV_HEADER = ['timestamp', 'patient_id', 'expert_name', 'overall_rating',
                  'clinical_accuracy', 'completeness_coverage', 'clinical_relevance', 'clarity_structure',
                  'reasoning_risk', 'actionability', 'hallucination', 'critical_omission',
                  'feedback', 'hallucination_comments', 'critical_omission_comments']

    def __init__(self, repo_id: Optional[str] = None, repo_type: str = "dataset"):
        """
        Initialize Hugging Face storage.

        Args:
            repo_id: Hugging Face repo ID (e.g., "username/dataset-name").
                If None, falls back to the HF_EVAL_REPO_ID environment
                variable or a name derived from the current Space.
            repo_type: "dataset" or "model" (dataset is recommended for
                structured data).
        """
        self.repo_id = repo_id or self._get_repo_id()
        self.repo_type = repo_type
        self.api = HfApi() if HF_AVAILABLE else None
        self._token = self._get_token()

    def _get_token(self) -> Optional[str]:
        """Get HF token from common environment variable names (first match wins)."""
        return (
            os.getenv("HF_TOKEN")
            or os.getenv("HUGGINGFACEHUB_API_TOKEN")
            or os.getenv("HUGGING_FACE_HUB_TOKEN")
            or os.getenv("HUGGINGFACE_HUB_TOKEN")
        )

    def _get_repo_id(self) -> Optional[str]:
        """Try to get repo ID from environment or Space name.

        Returns:
            The repo ID string, or None if it cannot be determined.
        """
        # Explicit configuration wins.
        repo_id = os.getenv("HF_EVAL_REPO_ID")
        if repo_id:
            return repo_id
        # If running in a HF Space, SPACE_ID looks like "username/space-name";
        # derive a per-user dataset name from the username part.
        space_id = os.getenv("SPACE_ID")
        if space_id:
            username = space_id.split("/")[0] if "/" in space_id else None
            if username:
                return f"{username}/patient-evaluations"
        return None

    def _ensure_authenticated(self) -> bool:
        """Check (and, when a token is available, establish) HF authentication.

        Returns:
            True if `whoami()` succeeds, False otherwise.
        """
        if not HF_AVAILABLE:
            print("[HF Auth] HF_AVAILABLE is False")
            return False
        # If a token is provided via env vars, log in programmatically.
        if self._token:
            try:
                print("[HF Auth] Attempting login with token...")
                login(token=self._token, add_to_git_credential=False)
                print("[HF Auth] Login successful")
            except Exception as e:
                print(f"[HF Auth] Warning: Could not login to Hugging Face with provided token: {e}")
                return False
        try:
            user_info = whoami()
            print(f"[HF Auth] Authenticated as: {user_info.get('name', 'unknown')}")
            return True
        except Exception as e:
            print(f"[HF Auth] Authentication check failed: {e}")
            return False

    def _ensure_repo_exists(self) -> bool:
        """Ensure the repository exists, creating it if it doesn't.

        Returns:
            True when the repo is reachable or was created, False otherwise.
        """
        if not self.repo_id or not self.api:
            return False
        try:
            self.api.repo_info(self.repo_id, repo_type=self.repo_type)
            return True
        except HfHubHTTPError as e:
            # BUG FIX: HfHubHTTPError does not expose `.status_code` directly;
            # the original `e.status_code` raised AttributeError and escaped
            # this method. The code lives on the underlying requests response.
            status = getattr(getattr(e, "response", None), "status_code", None)
            if status == 404:
                # Repo doesn't exist — try to create it. exist_ok=True guards
                # against a concurrent create by another worker.
                try:
                    self.api.create_repo(
                        repo_id=self.repo_id,
                        repo_type=self.repo_type,
                        exist_ok=True
                    )
                    return True
                except Exception as create_error:
                    print(f"Warning: Could not create HF repo: {create_error}")
                    return False
            print(f"Warning: HF API error: {e}")
            return False
        except Exception as e:
            print(f"Warning: Error checking HF repo: {e}")
            return False

    def save_evaluation(self, evaluation: Dict, filename: Optional[str] = None) -> Tuple[bool, str]:
        """
        Save a single evaluation to Hugging Face Hub as a JSON file.

        Args:
            evaluation: Evaluation data dictionary.
            filename: Optional filename (generated from patient_id and
                timestamp if not provided).

        Returns:
            (success: bool, message: str)
        """
        print(f"[HF Save] Starting save_evaluation, repo_id={self.repo_id}")
        if not HF_AVAILABLE:
            print("[HF Save] HF_AVAILABLE is False")
            return False, "huggingface_hub not available. Install with: pip install huggingface_hub"
        if not self._ensure_authenticated():
            print("[HF Save] Authentication failed")
            return False, "Not authenticated with Hugging Face. Please login or set HF_TOKEN."
        if not self.repo_id:
            print("[HF Save] repo_id is None")
            return False, "No Hugging Face repo ID configured. Set HF_EVAL_REPO_ID environment variable."
        if not self._ensure_repo_exists():
            print(f"[HF Save] Repo existence check failed for {self.repo_id}")
            return False, f"Could not access or create Hugging Face repo: {self.repo_id}"
        print(f"[HF Save] All checks passed, proceeding with upload to {self.repo_id}")

        # Generate filename if not provided.
        if not filename:
            patient_id = evaluation.get("patient_id", "unknown")
            # ":" is not filename-safe; sanitize ISO timestamps.
            timestamp = evaluation.get("timestamp", datetime.now().isoformat()).replace(":", "-")
            filename = f"patient_eval_{patient_id}_{timestamp}.json"

        temp_path = None
        try:
            with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False, encoding='utf-8') as f:
                json.dump(evaluation, f, ensure_ascii=False, indent=2)
                temp_path = f.name
            # BUG FIX: log lines and the success message previously emitted the
            # literal "(unknown)" instead of the actual filename.
            print(f"[HF Save] Uploading file {filename} to {self.repo_id}...")
            self.api.upload_file(
                path_or_fileobj=temp_path,
                path_in_repo=filename,
                repo_id=self.repo_id,
                repo_type=self.repo_type
            )
            print(f"[HF Save] Upload successful: {filename}")
            return True, f"Saved to Hugging Face: {self.repo_id}/{filename}"
        except Exception as e:
            return False, f"Error saving to Hugging Face: {str(e)}"
        finally:
            # Always remove the temp file, on success and on every error path.
            if temp_path and os.path.exists(temp_path):
                os.unlink(temp_path)

    def _download_master_csv(self, csv_filename: str) -> str:
        """Download the master CSV, checking the repo root, then data/test/.

        Returns:
            The local cached path. Raises if the file exists in neither place.
        """
        try:
            return self.api.hf_hub_download(
                repo_id=self.repo_id,
                filename=csv_filename,
                repo_type=self.repo_type,
                cache_dir=tempfile.gettempdir()
            )
        except Exception:
            # Fall back to data/test/ (in case the CSV lives inside a split);
            # a failure here propagates to the caller.
            path = self.api.hf_hub_download(
                repo_id=self.repo_id,
                filename=f"data/test/{csv_filename}",
                repo_type=self.repo_type,
                cache_dir=tempfile.gettempdir()
            )
            print(f"[HF CSV] Found CSV in data/test/ directory")
            return path

    def _load_master_rows(self, csv_filename: str) -> Tuple[List[List[str]], bool]:
        """Read the existing master CSV, or start a fresh one with a header row.

        Returns:
            (rows, existed) — rows always begins with a header row.
        """
        try:
            csv_path = self._download_master_csv(csv_filename)
            print(f"[HF CSV] Downloaded existing CSV from {csv_path}")
            with open(csv_path, 'r', encoding='utf-8') as f:
                rows = list(csv.reader(f))
            print(f"[HF CSV] Loaded {len(rows)} rows from existing CSV (including header)")
            if len(rows) > 1:
                print(f"[HF CSV] Existing data rows: {len(rows) - 1}")
            return rows, True
        except Exception as e:
            # CSV doesn't exist yet — start with just the header.
            print(f"[HF CSV] CSV file not found, creating new one. Error: {e}")
            return [list(self.CSV_HEADER)], False

    def _ensure_dataset_card(self) -> None:
        """Best-effort: upload a README.md dataset card if the repo lacks one."""
        try:
            try:
                self.api.hf_hub_download(
                    repo_id=self.repo_id,
                    filename="README.md",
                    repo_type=self.repo_type,
                    cache_dir=tempfile.gettempdir()
                )
                print(f"[HF CSV] README.md already exists")
                return
            except Exception:
                pass  # README doesn't exist — create one below.
            readme_content = """---
license: apache-2.0
---
# Patient Evaluations Dataset
This dataset contains clinician evaluations of AI-generated patient summaries.
## Dataset Structure
The dataset contains a CSV file (`patient_evaluations_master.csv`) with evaluation data.
## Columns
- `timestamp`: Evaluation timestamp
- `patient_id`: Patient identifier
- `expert_name`: Clinician identifier
- `overall_rating`: Overall quality rating (1-10)
- `clinical_accuracy`: Clinical accuracy rating (1-10)
- `completeness_coverage`: Completeness/coverage rating (1-10)
- `clinical_relevance`: Clinical relevance rating (1-10)
- `clarity_structure`: Clarity and structure rating (1-10)
- `reasoning_risk`: Reasoning/risk stratification rating (1-10)
- `actionability`: Actionability rating (1-10)
- `hallucination`: Hallucination severity (1-10)
- `critical_omission`: Critical omission severity (1-10)
- `feedback`: Overall feedback text
- `hallucination_comments`: Comments about hallucinations
- `critical_omission_comments`: Comments about critical omissions
"""
            readme_path = None
            with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False, encoding='utf-8') as f:
                f.write(readme_content)
                readme_path = f.name
            try:
                self.api.upload_file(
                    path_or_fileobj=readme_path,
                    path_in_repo="README.md",
                    repo_id=self.repo_id,
                    repo_type=self.repo_type,
                    commit_message="Add README.md for dataset card"
                )
            finally:
                os.unlink(readme_path)
            print(f"[HF CSV] Created README.md for dataset card")
        except Exception as e:
            print(f"[HF CSV] Warning: Could not create/update README.md: {e}")

    def update_csv_master(self, new_row: List) -> Tuple[bool, str]:
        """
        Update the master CSV file with a new evaluation row.

        Args:
            new_row: List of values for the CSV row (see CSV_HEADER for order).

        Returns:
            (success: bool, message: str)
        """
        if not HF_AVAILABLE or not self.api or not self.repo_id:
            return False, "Hugging Face storage not available"
        csv_filename = "patient_evaluations_master.csv"
        temp_path = None
        try:
            csv_data, csv_exists = self._load_master_rows(csv_filename)
            if csv_exists and csv_data and csv_data[0] != self.CSV_HEADER:
                print(f"[HF CSV] WARNING: Header mismatch! Existing: {csv_data[0]}")
                print(f"[HF CSV] Expected: {self.CSV_HEADER}")
                # Upgrade headers written by an older version with fewer columns.
                if len(csv_data[0]) < len(self.CSV_HEADER):
                    print(f"[HF CSV] Updating header to new format")
                    csv_data[0] = list(self.CSV_HEADER)
            csv_data.append(new_row)
            print(f"[HF CSV] Total rows after append: {len(csv_data)} (including header)")
            # Write the full CSV to a temp file, then upload it.
            with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False,
                                             newline='', encoding='utf-8') as f:
                csv.writer(f).writerows(csv_data)
                temp_path = f.name
            print(f"[HF CSV] Uploading CSV ({len(csv_data)} rows) to {self.repo_id}...")
            self.api.upload_file(
                path_or_fileobj=temp_path,
                path_in_repo=csv_filename,
                repo_id=self.repo_id,
                repo_type=self.repo_type,
                commit_message=f"Add evaluation: {new_row[1] if len(new_row) > 1 else 'new'}"
            )
            print(f"[HF CSV] CSV upload successful ({len(csv_data)} rows total)")
            # Dataset card creation is best-effort and never fails the save.
            self._ensure_dataset_card()
            return True, f"Updated CSV in Hugging Face: {self.repo_id}/{csv_filename}"
        except Exception as e:
            return False, f"Error updating CSV: {str(e)}"
        finally:
            # Always remove the temp CSV, on success and on error.
            if temp_path and os.path.exists(temp_path):
                os.unlink(temp_path)
# Lazily-created module-wide storage instance (see get_hf_storage).
_hf_storage = None


def get_hf_storage() -> Optional[HuggingFaceStorage]:
    """Return the shared HuggingFaceStorage singleton, creating it on first call."""
    global _hf_storage
    if _hf_storage is None:
        _hf_storage = HuggingFaceStorage()
    return _hf_storage
def save_to_huggingface(evaluation: Dict, csv_row: Optional[List] = None) -> Tuple[bool, str]:
    """
    Convenience function to save evaluation to Hugging Face.

    Args:
        evaluation: Evaluation data dictionary
        csv_row: Optional CSV row to append to master CSV

    Returns:
        (success: bool, message: str)
    """
    # Guard clauses: bail out early when the hub client is unusable.
    if not HF_AVAILABLE:
        return False, "huggingface_hub not available. Install with: pip install huggingface_hub"
    storage = get_hf_storage()
    if not storage:
        return False, "Hugging Face storage not initialized"

    # Always attempt the JSON save first.
    json_ok, json_msg = storage.save_evaluation(evaluation)
    if not csv_row:
        return json_ok, json_msg

    # A CSV row was supplied — append it and combine the outcomes.
    csv_ok, csv_msg = storage.update_csv_master(csv_row)
    if not json_ok:
        return False, f"JSON save failed: {json_msg}"
    if csv_ok:
        return True, f"{json_msg}; {csv_msg}"
    return True, f"{json_msg} (CSV update failed: {csv_msg})"