Spaces:

ltg
/

fluency-annotation

Sleeping

App Files Files Community

davda54 committed on Aug 13, 2025

Commit

90fa8f8

verified ·

1 Parent(s): 086869b

Update app.py

Browse files

Files changed (1) hide show

app.py +161 -192

app.py CHANGED Viewed

@@ -9,7 +9,8 @@ from typing import Dict, List, Tuple
 import hashlib
 import itertools
 from datasets import load_dataset, Dataset, DatasetDict
-from huggingface_hub import HfApi, create_repo, repo_exists
 import threading
 from collections.abc import Iterable
@@ -218,8 +219,8 @@ TODO
 """
 # Configuration for the output dataset
-OUTPUT_DATASET_NAME = "ltg/fluency-annotations"  # Change to your desired dataset name
-OUTPUT_DATASET_PRIVATE = True  # Keep the annotations dataset private
 HF_TOKEN = os.environ.get("HF_TOKEN")
@@ -229,6 +230,83 @@ MODEL_NAMES = ["mistral-Nemo", "translated-SFT", "on-policy-RL"]
 # Create all pairwise comparisons
 MODEL_PAIRS = list(itertools.combinations(MODEL_NAMES, 2))
 def load_dataset_samples():
     """Load and prepare dataset samples with pairwise comparisons"""
     try:
@@ -293,99 +371,17 @@ DATASET_SAMPLES = load_dataset_samples()
 class AnnotationManager:
     def __init__(self):
-        self.annotations = {}  # Store annotations by user_id
-        self.user_states = {}  # Track each user's progress
-        self.annotation_cache = []  # Cache for batch uploads
-        self.lock = threading.Lock()  # Thread safety for annotations
-        # Initialize or load existing annotations dataset
-        self.init_annotations_dataset()
-    def init_annotations_dataset(self):
-        """Initialize or load existing annotations from HuggingFace"""
-        try:
-            if HF_TOKEN:
-                api = HfApi(token=HF_TOKEN)
-                # Check if dataset exists, if not create it
-                if not repo_exists(OUTPUT_DATASET_NAME, repo_type="dataset", token=HF_TOKEN):
-                    print(f"Creating new dataset: {OUTPUT_DATASET_NAME}")
-                    create_repo(
-                        OUTPUT_DATASET_NAME,
-                        repo_type="dataset",
-                        private=OUTPUT_DATASET_PRIVATE,
-                        token=HF_TOKEN
-                    )
-                    # Create empty dataset structure
-                    self.push_empty_dataset()
-                else:
-                    # Load existing annotations
-                    print(f"Loading existing annotations from {OUTPUT_DATASET_NAME}")
-                    self.load_existing_annotations()
-            else:
-                print("Warning: No HF_TOKEN found. Annotations will only be saved locally.")
-        except Exception as e:
-            print(f"Error initializing annotations dataset: {e}")
-            print("Continuing with local-only mode")
-    def push_empty_dataset(self):
-        """Create and push empty dataset structure"""
-        try:
-            empty_data = {
-                "user_id": [],
-                "sample_id": [],
-                "original_id": [],
-                "model_a": [],
-                "model_b": [],
-                "choice": [],
-                "prompt": [],
-                "response_a": [],
-                "response_b": [],
-                "dataset": [],
-                "timestamp": []
             }
-            dataset = Dataset.from_dict(empty_data)
-            dataset.push_to_hub(OUTPUT_DATASET_NAME, token=HF_TOKEN, private=OUTPUT_DATASET_PRIVATE)
-            print(f"Created empty dataset at {OUTPUT_DATASET_NAME}")
-        except Exception as e:
-            print(f"Error creating empty dataset: {e}")
-    def load_existing_annotations(self):
-        """Load existing annotations from HuggingFace dataset"""
-        try:
-            dataset = load_dataset(OUTPUT_DATASET_NAME, split="train", token=HF_TOKEN)
-            # Rebuild annotations dictionary from dataset
-            for item in dataset:
-                user_id = item["user_id"]
-                if user_id not in self.annotations:
-                    self.annotations[user_id] = []
-                # Add to user's annotations
-                self.annotations[user_id].append({
-                    "user_id": user_id,
-                    "sample_id": item["sample_id"],
-                    "choice": item["choice"],
-                    "model_a": item.get("model_a", ""),
-                    "model_b": item.get("model_b", ""),
-                    "timestamp": item["timestamp"]
-                })
-                # Update user state
-                if user_id not in self.user_states:
-                    self.user_states[user_id] = {
-                        "current_index": 0,
-                        "annotations": []
-                    }
-                if item["sample_id"] not in self.user_states[user_id]["annotations"]:
-                    self.user_states[user_id]["annotations"].append(item["sample_id"])
-            print(f"Loaded {len(dataset)} existing annotations")
-        except Exception as e:
-            print(f"Error loading existing annotations: {e}")
-            print("Starting with empty annotations")
     def get_user_seed(self, user_id: str) -> int:
         """Generate consistent seed for user"""
@@ -396,26 +392,38 @@ class AnnotationManager:
         seed = self.get_user_seed(user_id)
         samples = DATASET_SAMPLES.copy()
         random.Random(seed).shuffle(samples)
         return samples
     def get_next_sample(self, user_id: str) -> Tuple[Dict, int, int]:
         """Get next unannotated sample for user"""
         if user_id not in self.user_states:
-            self.user_states[user_id] = {
-                "current_index": 0,
-                "annotations": []
-            }
         samples = self.get_user_samples(user_id)
         state = self.user_states[user_id]
-        # Count already annotated
-        annotated_count = len(state["annotations"])
         # Find next unannotated sample
-        for i, sample in enumerate(samples):
             if not self.is_annotated(user_id, sample["id"]):
-                return sample, annotated_count + 1, len(samples)
         # All samples annotated
         return None, len(samples), len(samples)
@@ -427,91 +435,49 @@ class AnnotationManager:
         return any(ann["sample_id"] == sample_id for ann in self.annotations[user_id])
     def save_annotation(self, user_id: str, sample_id: str, choice: str,
-                       sample_data: Dict = None):
-        """Save user's annotation locally and to HuggingFace"""
-        with self.lock:
-            if user_id not in self.annotations:
-                self.annotations[user_id] = []
-            annotation = {
-                "user_id": user_id,
-                "sample_id": sample_id,
-                "choice": choice,
-                "timestamp": datetime.now().isoformat()
             }
-            # Add sample data if provided
-            if sample_data:
-                annotation.update({
-                    "original_id": sample_data.get("original_id", ""),
-                    "model_a": sample_data.get("model_a", ""),
-                    "model_b": sample_data.get("model_b", ""),
-                    "prompt": sample_data.get("prompt", ""),
-                    "response_a": sample_data.get("response_a", ""),
-                    "response_b": sample_data.get("response_b", ""),
-                    "dataset": sample_data.get("dataset", "")
-                })
-            self.annotations[user_id].append(annotation)
-            # Update user state
-            if user_id in self.user_states:
-                if sample_id not in self.user_states[user_id]["annotations"]:
-                    self.user_states[user_id]["annotations"].append(sample_id)
-                self.user_states[user_id]["current_index"] += 1
-            print(f"Saved annotation locally: {annotation['sample_id']} by {user_id}")
-            # Save to HuggingFace asynchronously
-            if HF_TOKEN:
-                thread = threading.Thread(
-                    target=self.push_annotation_to_hub,
-                    args=(annotation,)
-                )
-                thread.daemon = True
-                thread.start()
-    def push_annotation_to_hub(self, annotation: Dict):
-        """Push single annotation to HuggingFace dataset"""
-        try:
-            # Load current dataset
-            dataset = load_dataset(OUTPUT_DATASET_NAME, split="train", token=HF_TOKEN)
-            # Convert to dict
-            data_dict = dataset.to_dict()
-            # Ensure all keys exist
-            required_keys = ["user_id", "sample_id", "original_id", "model_a",
-                           "model_b", "choice", "prompt", "response_a",
-                           "response_b", "dataset", "timestamp"]
-            for key in required_keys:
-                if key not in data_dict:
-                    data_dict[key] = []
-                # Append new annotation data
-                data_dict[key].append(annotation.get(key, ""))
-            # Create new dataset and push
-            updated_dataset = Dataset.from_dict(data_dict)
-            updated_dataset.push_to_hub(
-                OUTPUT_DATASET_NAME,
-                token=HF_TOKEN,
-                private=OUTPUT_DATASET_PRIVATE
-            )
-            print(f"Successfully pushed annotation to hub: {annotation['sample_id']}")
-        except Exception as e:
-            print(f"Error pushing annotation to hub: {e}")
-            # Add to cache for batch upload later
-            self.annotation_cache.append(annotation)
     def get_user_progress(self, user_id: str) -> Dict:
         """Get user's annotation progress"""
-        if user_id not in self.user_states:
             return {"completed": 0, "total": len(DATASET_SAMPLES)}
-        completed = len(self.user_states[user_id]["annotations"])
         return {"completed": completed, "total": len(DATASET_SAMPLES)}
@@ -540,13 +506,16 @@ def login(user_id: str) -> Tuple:
             gr.update(visible=True),  # login_interface
             gr.update(visible=False),  # annotation_interface
             user_id,  # user_state
-            gr.update(value=f"All samples completed for user: {user_id}"),  # login_status
             gr.update(),  # prompt
             gr.update(),  # response_a
             gr.update(),  # response_b
             gr.update()   # progress
         )
     return (
         gr.update(visible=False),  # login_interface
         gr.update(visible=True),   # annotation_interface
@@ -555,8 +524,7 @@ def login(user_id: str) -> Tuple:
         gr.update(value=sample["prompt"]),  # prompt
         gr.update(value=sample["response_a"]),  # response_a
         gr.update(value=sample["response_b"]),  # response_b
-        gr.update(value=f"Progress: {current}/{total} | Comparing: {sample.get('model_a', 'A')} vs {sample.get('model_b', 'B')}")  # progress
-        # gr.update(value=f"Progress: {current}/{total}")  # progress
     )
 def annotate(choice: str, user_id: str) -> Tuple:
@@ -579,13 +547,15 @@ def annotate(choice: str, user_id: str) -> Tuple:
             "b_better": "B is more fluent",
             "equal": "Equally fluent"
         }
-        # Save with full sample data for HuggingFace dataset
         manager.save_annotation(
-            user_id,
-            sample["id"],
-            choice_map[choice],
-            sample_data=sample  # Pass the full sample data
         )
     # Get next sample
@@ -607,8 +577,7 @@ def annotate(choice: str, user_id: str) -> Tuple:
         gr.update(value=next_sample["prompt"]),  # prompt
         gr.update(value=next_sample["response_a"]),  # response_a
         gr.update(value=next_sample["response_b"]),  # response_b
-        gr.update(value=f"Progress: {current}/{total} | Comparing: {sample.get('model_a', 'A')} vs {sample.get('model_b', 'B')}"),
-        # gr.update(value=f"Progress: {current}/{total}{model_info}"),  # progress
         gr.update(value="Annotation saved!", visible=True)  # status
     )

 import hashlib
 import itertools
 from datasets import load_dataset, Dataset, DatasetDict
+from huggingface_hub import HfApi, create_repo, repo_exists, Repository
+import shutil
 import threading
 from collections.abc import Iterable
 """
 # Configuration for the output dataset
+ANNOTATIONS_REPO = "ltg/fluency-annotations"  # Change to your repo name
+ANNOTATIONS_FILE = "train.jsonl"
 HF_TOKEN = os.environ.get("HF_TOKEN")
 # Create all pairwise comparisons
 MODEL_PAIRS = list(itertools.combinations(MODEL_NAMES, 2))
+# Initialize repository
+def init_repository():
+    """Initialize or clone the repository"""
+    try:
+        repo = Repository(
+            local_dir=DATA_DIR,
+            clone_from=ANNOTATIONS_REPO,
+            use_auth_token=HF_TOKEN,
+            repo_type="dataset"
+        )
+        repo.git_pull()
+        return repo
+    except Exception as e:
+        print(f"Error initializing repository: {e}")
+        # Create local directory if repo doesn't exist
+        os.makedirs(DATA_DIR, exist_ok=True)
+        return None
+# Initialize on startup
+annotation_repo = init_repository()
+def load_existing_annotations():
+    """Load existing annotations from the jsonl file"""
+    annotations = {}
+    if os.path.exists(ANNOTATIONS_FILE):
+        try:
+            with open(ANNOTATIONS_FILE, "r") as f:
+                for line in f:
+                    if line.strip():
+                        ann = json.loads(line)
+                        user_id = ann.get("user_id")
+                        if user_id:
+                            if user_id not in annotations:
+                                annotations[user_id] = []
+                            annotations[user_id].append(ann)
+            print(f"Loaded {sum(len(v) for v in annotations.values())} existing annotations")
+        except Exception as e:
+            print(f"Error loading annotations: {e}")
+    return annotations
+def save_annotation_to_file(annotation_data):
+    """Save a single annotation to the jsonl file and push to hub"""
+    global annotation_repo
+    try:
+        # Pull latest changes
+        if annotation_repo:
+            annotation_repo.git_pull()
+        # Append to jsonl file
+        with open(ANNOTATIONS_FILE, "a") as f:
+            line = json.dumps(annotation_data, ensure_ascii=False)
+            f.write(f"{line}\n")
+        # Push to hub asynchronously
+        if annotation_repo:
+            annotation_repo.push_to_hub(blocking=False, commit_message="Add annotation")
+    except Exception as e:
+        print(f"Error saving annotation: {e}")
+        # Try to reinitialize repository
+        try:
+            shutil.rmtree(DATA_DIR)
+            annotation_repo = init_repository()
+            # Retry saving
+            with open(ANNOTATIONS_FILE, "a") as f:
+                line = json.dumps(annotation_data, ensure_ascii=False)
+                f.write(f"{line}\n")
+            if annotation_repo:
+                annotation_repo.push_to_hub(blocking=False, commit_message="Add annotation")
+        except Exception as e2:
+            print(f"Failed to save annotation after retry: {e2}")
 def load_dataset_samples():
     """Load and prepare dataset samples with pairwise comparisons"""
     try:
 class AnnotationManager:
     def __init__(self):
+        # Load existing annotations from file
+        self.annotations = load_existing_annotations()
+        self.user_states = {}
+        # Rebuild user states from loaded annotations
+        for user_id, user_annotations in self.annotations.items():
+            annotated_ids = [ann["sample_id"] for ann in user_annotations]
+            self.user_states[user_id] = {
+                "current_index": 0,
+                "annotations": annotated_ids
             }
     def get_user_seed(self, user_id: str) -> int:
         """Generate consistent seed for user"""
         seed = self.get_user_seed(user_id)
         samples = DATASET_SAMPLES.copy()
         random.Random(seed).shuffle(samples)
+        samples = [
+            sample if random.Random(seed + i).randint(0, 1) == 0 else swap_sample(sample)
+            for i, sample in enumerate(samples)
+        ]
         return samples
     def get_next_sample(self, user_id: str) -> Tuple[Dict, int, int]:
         """Get next unannotated sample for user"""
         if user_id not in self.user_states:
+            # Check if user has existing annotations
+            if user_id in self.annotations:
+                annotated_ids = [ann["sample_id"] for ann in self.annotations[user_id]]
+                self.user_states[user_id] = {
+                    "current_index": 0,
+                    "annotations": annotated_ids
+                }
+            else:
+                self.user_states[user_id] = {
+                    "current_index": 0,
+                    "annotations": []
+                }
         samples = self.get_user_samples(user_id)
         state = self.user_states[user_id]
+        # Count total annotations for this user
+        total_annotated = len(state["annotations"])
         # Find next unannotated sample
+        for idx, sample in enumerate(samples):
             if not self.is_annotated(user_id, sample["id"]):
+                return sample, total_annotated + 1, len(samples)
         # All samples annotated
         return None, len(samples), len(samples)
         return any(ann["sample_id"] == sample_id for ann in self.annotations[user_id])
     def save_annotation(self, user_id: str, sample_id: str, choice: str,
+                        model_a: str = None, model_b: str = None,
+                        original_id: str = None, dataset_name: str = None):
+        """Save user's annotation and persist to file"""
+        if user_id not in self.annotations:
+            self.annotations[user_id] = []
+        annotation = {
+            "user_id": user_id,
+            "sample_id": sample_id,
+            "original_sample_id": original_id,
+            "dataset": dataset_name,
+            "model_a": model_a,
+            "model_b": model_b,
+            "choice": choice,
+            "timestamp": datetime.now().isoformat()
+        }
+        # Save to memory
+        self.annotations[user_id].append(annotation)
+        # Update user state
+        if user_id in self.user_states:
+            self.user_states[user_id]["annotations"].append(sample_id)
+        else:
+            self.user_states[user_id] = {
+                "current_index": 0,
+                "annotations": [sample_id]
             }
+        # Save to file asynchronously
+        threading.Thread(
+            target=save_annotation_to_file,
+            args=(annotation,)
+        ).start()
+        print(f"Saved annotation: {annotation}")
     def get_user_progress(self, user_id: str) -> Dict:
         """Get user's annotation progress"""
+        if user_id not in self.annotations:
             return {"completed": 0, "total": len(DATASET_SAMPLES)}
+        completed = len(self.annotations[user_id])
         return {"completed": completed, "total": len(DATASET_SAMPLES)}
             gr.update(visible=True),  # login_interface
             gr.update(visible=False),  # annotation_interface
             user_id,  # user_state
+            gr.update(value=f"All {total} samples completed for user: {user_id}! 🎉"),  # login_status
             gr.update(),  # prompt
             gr.update(),  # response_a
             gr.update(),  # response_b
             gr.update()   # progress
         )
+    # Show which models are being compared
+    model_info = f" | Comparing: {sample.get('model_a', 'A')} vs {sample.get('model_b', 'B')}"
     return (
         gr.update(visible=False),  # login_interface
         gr.update(visible=True),   # annotation_interface
         gr.update(value=sample["prompt"]),  # prompt
         gr.update(value=sample["response_a"]),  # response_a
         gr.update(value=sample["response_b"]),  # response_b
+        gr.update(value=f"Progress: {current}/{total}{model_info}")  # progress
     )
 def annotate(choice: str, user_id: str) -> Tuple:
             "b_better": "B is more fluent",
             "equal": "Equally fluent"
         }
+        # Save with all metadata
         manager.save_annotation(
+            user_id=user_id,
+            sample_id=sample["id"],
+            choice=choice_map[choice],
+            model_a=sample.get("model_a"),
+            model_b=sample.get("model_b"),
+            original_id=sample.get("original_id"),
+            dataset_name=sample.get("dataset")
         )
     # Get next sample
         gr.update(value=next_sample["prompt"]),  # prompt
         gr.update(value=next_sample["response_a"]),  # response_a
         gr.update(value=next_sample["response_b"]),  # response_b
+        gr.update(value=f"Progress: {current}/{total}{model_info}"),  # progress
         gr.update(value="Annotation saved!", visible=True)  # status
     )