File size: 15,906 Bytes
6a1e37a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8d68c0
 
 
 
 
 
 
 
 
 
6a1e37a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7d08b83
6a1e37a
 
f8d68c0
 
 
7d08b83
f8d68c0
7d08b83
f8d68c0
7d08b83
f8d68c0
 
6a1e37a
7d08b83
 
6a1e37a
7d08b83
 
6a1e37a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7d08b83
6a1e37a
7d08b83
6a1e37a
 
 
7d08b83
6a1e37a
 
 
7d08b83
6a1e37a
 
 
7d08b83
6a1e37a
 
7d08b83
 
6a1e37a
 
 
 
 
 
 
 
 
 
 
 
 
7d08b83
6a1e37a
 
 
 
 
 
7d08b83
6a1e37a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
511ed70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6a1e37a
 
 
511ed70
 
 
6a1e37a
511ed70
6a1e37a
511ed70
6a1e37a
511ed70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6a1e37a
 
 
511ed70
6a1e37a
 
 
 
 
 
 
 
 
511ed70
6a1e37a
 
 
 
 
 
 
511ed70
6a1e37a
a25d70c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6a1e37a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a33f6ae
 
 
6a1e37a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
"""
Hugging Face Hub storage for patient evaluations.
Saves evaluation data to Hugging Face Dataset or Repository.
"""

import os
import json
import csv
import tempfile
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from pathlib import Path

try:
    from huggingface_hub import HfApi, login, whoami
    from huggingface_hub.utils import HfHubHTTPError
    HF_AVAILABLE = True
except ImportError:
    HF_AVAILABLE = False


class HuggingFaceStorage:
    """Store patient evaluations in a Hugging Face Hub repository.

    Each evaluation is uploaded as its own JSON file, and a cumulative
    master CSV at the repo root can be extended one row at a time.
    ``save_evaluation`` and ``update_csv_master`` report Hub failures as
    ``(False, message)`` tuples.
    """

    # Name of the cumulative CSV kept at the root of the repo.
    CSV_FILENAME = "patient_evaluations_master.csv"

    # Column layout written as the first row of the master CSV.
    CSV_HEADER = (
        'timestamp', 'patient_id', 'expert_name', 'overall_rating',
        'clinical_accuracy', 'completeness_coverage', 'clinical_relevance', 'clarity_structure',
        'reasoning_risk', 'actionability', 'hallucination', 'critical_omission',
        'feedback', 'hallucination_comments', 'critical_omission_comments',
    )

    # Dataset card uploaded when the repo has no README.md yet.
    _README_CONTENT = """---
license: apache-2.0
---

# Patient Evaluations Dataset

This dataset contains clinician evaluations of AI-generated patient summaries.

## Dataset Structure

The dataset contains a CSV file (`patient_evaluations_master.csv`) with evaluation data.

## Columns

- `timestamp`: Evaluation timestamp
- `patient_id`: Patient identifier
- `expert_name`: Clinician identifier
- `overall_rating`: Overall quality rating (1-10)
- `clinical_accuracy`: Clinical accuracy rating (1-10)
- `completeness_coverage`: Completeness/coverage rating (1-10)
- `clinical_relevance`: Clinical relevance rating (1-10)
- `clarity_structure`: Clarity and structure rating (1-10)
- `reasoning_risk`: Reasoning/risk stratification rating (1-10)
- `actionability`: Actionability rating (1-10)
- `hallucination`: Hallucination severity (1-10)
- `critical_omission`: Critical omission severity (1-10)
- `feedback`: Overall feedback text
- `hallucination_comments`: Comments about hallucinations
- `critical_omission_comments`: Comments about critical omissions
"""

    def __init__(self, repo_id: Optional[str] = None, repo_type: str = "dataset"):
        """
        Initialize Hugging Face storage.

        Args:
            repo_id: Hugging Face repo ID (e.g., "username/dataset-name")
                   If None, will try to use environment variable or Space name
            repo_type: "dataset" or "model" (dataset is recommended for structured data)
        """
        self.repo_id = repo_id or self._get_repo_id()
        self.repo_type = repo_type
        # No client is built when the optional huggingface_hub import failed.
        self.api = HfApi() if HF_AVAILABLE else None
        self._token = self._get_token()

    def _get_token(self) -> Optional[str]:
        """Return the first non-empty HF token among the known environment variables."""
        return (
            os.getenv("HF_TOKEN")
            or os.getenv("HUGGINGFACEHUB_API_TOKEN")
            or os.getenv("HUGGING_FACE_HUB_TOKEN")
            or os.getenv("HUGGINGFACE_HUB_TOKEN")
        )

    def _get_repo_id(self) -> Optional[str]:
        """Resolve the repo ID from HF_EVAL_REPO_ID, else derive it from SPACE_ID.

        Returns:
            The configured repo ID, or "<owner>/patient-evaluations" when only
            SPACE_ID ("<owner>/<space>") is set, or None when neither helps.
        """
        repo_id = os.getenv("HF_EVAL_REPO_ID")
        if repo_id:
            return repo_id

        # Inside a Space, reuse the Space owner's namespace for the dataset.
        space_id = os.getenv("SPACE_ID")
        if space_id:
            owner, separator, _ = space_id.partition("/")
            if separator and owner:
                return f"{owner}/patient-evaluations"

        return None

    @staticmethod
    def _http_status(error: BaseException) -> Optional[int]:
        """Return the HTTP status code attached to *error*, or None.

        Looks for ``error.status_code`` first and then for
        ``error.response.status_code``, so it works for exceptions that only
        carry the HTTP response object as well as for ones that carry neither.
        """
        status = getattr(error, "status_code", None)
        if status is None:
            status = getattr(getattr(error, "response", None), "status_code", None)
        return status

    @staticmethod
    def _remove_quietly(path: Optional[str]) -> None:
        """Delete *path* if it is set and exists; never raise.

        Used from ``finally`` blocks, where an exception would otherwise
        replace the result being returned.
        """
        if not path:
            return
        try:
            if os.path.exists(path):
                os.unlink(path)
        except OSError as e:
            print(f"[HF] Warning: could not remove temp file {path}: {e}")

    @staticmethod
    def _default_filename(evaluation: Dict) -> str:
        """Build "patient_eval_<patient_id>_<timestamp>.json" for *evaluation*.

        A missing or empty timestamp falls back to the current time, and ":"
        is replaced with "-" in the timestamp part.
        """
        patient_id = evaluation.get("patient_id", "unknown")
        timestamp = evaluation.get("timestamp") or datetime.now().isoformat()
        safe_timestamp = str(timestamp).replace(":", "-")
        return f"patient_eval_{patient_id}_{safe_timestamp}.json"

    @classmethod
    def _merge_csv_rows(cls, existing_rows: Optional[List[List]], new_row: List) -> List[List]:
        """Return the master CSV rows with *new_row* appended.

        Args:
            existing_rows: Rows read from the current CSV (header first), or
                None/empty when there is no usable CSV yet.
            new_row: Values for the row to append.

        Returns:
            A new list of rows; *existing_rows* itself is not modified.
        """
        header = list(cls.CSV_HEADER)

        # No file, or a file with no rows at all: start from a fresh header.
        if not existing_rows:
            return [header, new_row]

        rows = list(existing_rows)
        if rows[0] != header:
            print(f"[HF CSV] WARNING: Header mismatch! Existing: {rows[0]}")
            print(f"[HF CSV] Expected: {header}")
            # Only a shorter (older) header is upgraded; other mismatches are kept.
            if len(rows[0]) < len(header):
                print("[HF CSV] Updating header to new format")
                rows[0] = header

        rows.append(new_row)
        return rows

    def _ensure_authenticated(self) -> bool:
        """Log in with the env token (if any) and confirm via whoami().

        Returns:
            True when whoami() succeeds, False otherwise.
        """
        if not HF_AVAILABLE:
            print("[HF Auth] HF_AVAILABLE is False")
            return False

        # If a token is provided via env vars, log in programmatically
        if self._token:
            try:
                print("[HF Auth] Attempting login with token...")
                login(token=self._token, add_to_git_credential=False)
                print("[HF Auth] Login successful")
            except Exception as e:
                print(f"[HF Auth] Warning: Could not login to Hugging Face with provided token: {e}")
                return False

        try:
            user_info = whoami()
            print(f"[HF Auth] Authenticated as: {user_info.get('name', 'unknown')}")
            return True
        except Exception as e:
            print(f"[HF Auth] Authentication check failed: {e}")
            return False

    def _ensure_repo_exists(self) -> bool:
        """Ensure the repository exists, creating it when the Hub answers 404.

        Returns:
            True when the repo is reachable or was created, False otherwise.
        """
        if not self.repo_id or not self.api:
            return False

        try:
            self.api.repo_info(self.repo_id, repo_type=self.repo_type)
            return True
        except HfHubHTTPError as e:
            # Read the status defensively: accessing a missing attribute here
            # would raise from inside the handler and escape to the caller.
            if self._http_status(e) != 404:
                print(f"Warning: HF API error: {e}")
                return False
        except Exception as e:
            print(f"Warning: Error checking HF repo: {e}")
            return False

        # Only reached on a 404: the repo doesn't exist, try to create it.
        # NOTE(review): no `private=` argument is passed, so the Hub's default
        # visibility applies - confirm that is acceptable for patient data.
        try:
            self.api.create_repo(
                repo_id=self.repo_id,
                repo_type=self.repo_type,
                exist_ok=False
            )
            return True
        except Exception as create_error:
            print(f"Warning: Could not create HF repo: {create_error}")
            return False

    def save_evaluation(self, evaluation: Dict, filename: Optional[str] = None) -> Tuple[bool, str]:
        """
        Save a single evaluation to Hugging Face Hub as a JSON file.

        Args:
            evaluation: Evaluation data dictionary
            filename: Optional filename (will generate if not provided)

        Returns:
            (success: bool, message: str)
        """
        print(f"[HF Save] Starting save_evaluation, repo_id={self.repo_id}")
        if not HF_AVAILABLE:
            print("[HF Save] HF_AVAILABLE is False")
            return False, "huggingface_hub not available. Install with: pip install huggingface_hub"

        if not self._ensure_authenticated():
            print("[HF Save] Authentication failed")
            return False, "Not authenticated with Hugging Face. Please login or set HF_TOKEN."

        if not self.repo_id:
            print("[HF Save] repo_id is None")
            return False, "No Hugging Face repo ID configured. Set HF_EVAL_REPO_ID environment variable."

        if not self._ensure_repo_exists():
            print(f"[HF Save] Repo existence check failed for {self.repo_id}")
            return False, f"Could not access or create Hugging Face repo: {self.repo_id}"

        print(f"[HF Save] All checks passed, proceeding with upload to {self.repo_id}")

        if not filename:
            filename = self._default_filename(evaluation)

        temp_path = None
        try:
            with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False, encoding='utf-8') as f:
                # Record the path before writing so a failed dump is still cleaned up.
                temp_path = f.name
                json.dump(evaluation, f, ensure_ascii=False, indent=2)

            print(f"[HF Save] Uploading file {filename} to {self.repo_id}...")
            self.api.upload_file(
                path_or_fileobj=temp_path,
                path_in_repo=filename,
                repo_id=self.repo_id,
                repo_type=self.repo_type
            )
            print(f"[HF Save] Upload successful: {filename}")

            return True, f"Saved to Hugging Face: {self.repo_id}/{filename}"

        except Exception as e:
            return False, f"Error saving to Hugging Face: {str(e)}"
        finally:
            self._remove_quietly(temp_path)

    def _download_master_csv(self) -> Optional[List[List]]:
        """Fetch and parse the current master CSV.

        Looks at the repo root first and then under ``data/test/``.

        Returns:
            The parsed rows (header first), or None when the Hub answered 404
            for both locations.

        Raises:
            Exception: Any download failure other than a 404 is re-raised, so
                a transient error is never mistaken for "no CSV yet".
        """
        root_name = self.CSV_FILENAME
        not_found = None

        for candidate in (root_name, f"data/test/{root_name}"):
            try:
                csv_path = self.api.hf_hub_download(
                    repo_id=self.repo_id,
                    filename=candidate,
                    repo_type=self.repo_type,
                    cache_dir=tempfile.gettempdir()
                )
            except Exception as e:
                if self._http_status(e) != 404:
                    raise
                not_found = e
                continue

            if candidate != root_name:
                print("[HF CSV] Found CSV in data/test/ directory")
            print(f"[HF CSV] Downloaded existing CSV from {csv_path}")

            # newline='' lets the csv module handle line endings inside quoted fields.
            with open(csv_path, 'r', encoding='utf-8', newline='') as f:
                rows = list(csv.reader(f))

            print(f"[HF CSV] Loaded {len(rows)} rows from existing CSV (including header)")
            if len(rows) > 1:
                print(f"[HF CSV] Existing data rows: {len(rows) - 1}")
            return rows

        print(f"[HF CSV] CSV file not found, creating new one. Error: {not_found}")
        return None

    def _ensure_readme(self) -> None:
        """Upload the default dataset card when the repo has no README.md.

        Best effort: every failure is logged as a warning and swallowed.
        """
        readme_path = None
        try:
            try:
                self.api.hf_hub_download(
                    repo_id=self.repo_id,
                    filename="README.md",
                    repo_type=self.repo_type,
                    cache_dir=tempfile.gettempdir()
                )
                print("[HF CSV] README.md already exists")
                return
            except Exception as e:
                # Only a definite 404 means "no README"; don't overwrite on other errors.
                if self._http_status(e) != 404:
                    raise

            with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False, encoding='utf-8') as f:
                readme_path = f.name
                f.write(self._README_CONTENT)

            self.api.upload_file(
                path_or_fileobj=readme_path,
                path_in_repo="README.md",
                repo_id=self.repo_id,
                repo_type=self.repo_type,
                commit_message="Add README.md for dataset card"
            )
            print("[HF CSV] Created README.md for dataset card")
        except Exception as e:
            print(f"[HF CSV] Warning: Could not create/update README.md: {e}")
        finally:
            self._remove_quietly(readme_path)

    def update_csv_master(self, new_row: List) -> Tuple[bool, str]:
        """
        Update the master CSV file with a new evaluation row.

        The existing CSV is downloaded, the row is appended, and the result is
        uploaded to the repo root. A new CSV is started only when the Hub
        reports the file as missing (404); any other download error aborts the
        update so existing rows are never overwritten.

        Args:
            new_row: List of values for the CSV row

        Returns:
            (success: bool, message: str)
        """
        if not HF_AVAILABLE or not self.api or not self.repo_id:
            return False, "Hugging Face storage not available"

        csv_filename = self.CSV_FILENAME
        temp_path = None

        try:
            csv_data = self._merge_csv_rows(self._download_master_csv(), new_row)
            print(f"[HF CSV] Total rows after append: {len(csv_data)} (including header)")

            with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False,
                                             newline='', encoding='utf-8') as f:
                # Record the path before writing so a failed write is still cleaned up.
                temp_path = f.name
                csv.writer(f).writerows(csv_data)

            print(f"[HF CSV] Uploading CSV ({len(csv_data)} rows) to {self.repo_id}...")
            self.api.upload_file(
                path_or_fileobj=temp_path,
                path_in_repo=csv_filename,
                repo_id=self.repo_id,
                repo_type=self.repo_type,
                commit_message=f"Add evaluation: {new_row[1] if len(new_row) > 1 else 'new'}"
            )
            print(f"[HF CSV] CSV upload successful ({len(csv_data)} rows total)")

            # Also make sure the repo has a dataset card (best effort).
            self._ensure_readme()

            return True, f"Updated CSV in Hugging Face: {self.repo_id}/{csv_filename}"

        except Exception as e:
            return False, f"Error updating CSV: {str(e)}"
        finally:
            self._remove_quietly(temp_path)


# Module-wide storage singleton; stays None until first requested
_hf_storage = None

def get_hf_storage() -> Optional[HuggingFaceStorage]:
    """Return the shared HuggingFaceStorage, building it on first use."""
    global _hf_storage
    if _hf_storage is not None:
        return _hf_storage
    # First call: construct with defaults (repo ID and token come from the environment).
    _hf_storage = HuggingFaceStorage()
    return _hf_storage

def save_to_huggingface(evaluation: Dict, csv_row: Optional[List] = None) -> Tuple[bool, str]:
    """
    Save an evaluation as JSON and, when a row is given, extend the master CSV.

    The overall result follows the JSON upload: a failed CSV update is only
    mentioned in the message, while a failed JSON upload makes the call fail.

    Args:
        evaluation: Evaluation data dictionary
        csv_row: Optional CSV row to append to master CSV

    Returns:
        (success: bool, message: str)
    """
    if not HF_AVAILABLE:
        return False, "huggingface_hub not available. Install with: pip install huggingface_hub"

    storage = get_hf_storage()
    if not storage:
        return False, "Hugging Face storage not initialized"

    json_ok, json_msg = storage.save_evaluation(evaluation)

    # Without a row there is nothing more to do; pass the JSON outcome through.
    if not csv_row:
        return json_ok, json_msg

    # The CSV update is attempted regardless of how the JSON upload went.
    csv_ok, csv_msg = storage.update_csv_master(csv_row)

    if not json_ok:
        return False, f"JSON save failed: {json_msg}"
    if csv_ok:
        return True, f"{json_msg}; {csv_msg}"
    return True, f"{json_msg} (CSV update failed: {csv_msg})"