Spaces:

GeorgeIbrahim
/

Data_Collection

Sleeping

App Files Community

GeorgeIbrahim committed on Nov 8, 2024

Commit

df87278

1 Parent(s): 0a274b3

updates

Browse files

Files changed (1) hide show

app.py +41 -29

app.py CHANGED Viewed

@@ -25,22 +25,20 @@ try:
     dataset = load_dataset(dataset_name, split="train")
     print("Loaded existing dataset:", dataset)
     print("Dataset features:", dataset.features)  # Check if 'split' is part of features
     # Check if the 'split' column exists; if not, add it
     if 'split' not in dataset.column_names:
-        split_values = []
-        for example in dataset:
-            match = re.search(r'_(\d+)\.', example["image_id"])
-            image_id = match.group(1).lstrip('0')
-            if image_id in results:
-                split_values.append("dev")
-            else:
-                split_values.append("train")
         dataset = dataset.add_column("split", split_values)
         print("Added 'split' column to dataset.")
     else:
         print("'split' column already exists.")
@@ -98,35 +96,46 @@ def get_caption_for_image_id(image_path):
     print("Caption not found for image_id:", image_id)  # Debugging line
     return None
 # Function to get a random image that hasn’t been fully annotated
 def get_next_image(session_data):
     with lock:
-        # Available images filter based on the 'split' column instead of checking filename
         available_images = [
             img for img in image_files
             if img not in annotation_counts or
-               (dataset[annotation_counts[img]]["split"] == "dev" and annotation_counts.get(img, 0) < 2) or
-               (dataset[annotation_counts[img]]["split"] != "dev" and annotation_counts.get(img, 0) == 0)
         ]
-        print("Available images before shuffle:", available_images)  # Debugging line
-        # Shuffle available images to randomize the order
-        random.shuffle(available_images)
-        print("Available images after shuffle:", available_images)  # Debugging line
-        # Check if the user already has an image and assign a new one if they don't
         if session_data["current_image"] is None and available_images:
-            # Assign a new random image to the user from shuffled available images
-            session_data["current_image"] = available_images[0]  # Take the first from shuffled list
-            print("Current image_id:", session_data["current_image"])  # Print the current image_id
     return os.path.join(image_folder, session_data["current_image"]) if session_data["current_image"] else None
-# Function to save the annotation to Hugging Face dataset and fetch the next image
 def save_annotation(caption, session_data):
-    global dataset, annotation_counts  # Declare global dataset and annotation_counts at the start of the function
     if session_data["current_image"] is None:
         return gr.update(visible=False), gr.update(value="All images have been annotated!"), gr.update(value="")
@@ -168,8 +177,11 @@ def save_annotation(caption, session_data):
         dataset.push_to_hub(dataset_name)
         print("Pushed updated dataset")
-        # Clear user's current image if the validation image has been annotated twice
-        if ("val" not in image_id) or (annotation_count + 1 >= 2):
             session_data["current_image"] = None
     # Fetch the next image
@@ -181,7 +193,7 @@ def save_annotation(caption, session_data):
     else:
         return gr.update(visible=False), gr.update(value="All images have been annotated!"), gr.update(value="")
 def initialize_interface(session_data):
     next_image = get_next_image(session_data)
     if next_image:

     dataset = load_dataset(dataset_name, split="train")
     print("Loaded existing dataset:", dataset)
     print("Dataset features:", dataset.features)  # Check if 'split' is part of features
     # Check if the 'split' column exists; if not, add it
     if 'split' not in dataset.column_names:
+        # Define the 'split' values based on `image_id`
+        print(results)
+        print(example["image_id"] in results)
+        split_values = [
+            "dev" if example["image_id"] in results else "train"
+            for example in dataset
+        ]
+        # Add 'split' column to the dataset
         dataset = dataset.add_column("split", split_values)
         print("Added 'split' column to dataset.")
     else:
         print("'split' column already exists.")
     print("Caption not found for image_id:", image_id)  # Debugging line
     return None
+shown_counts = {}
 # Function to get a random image that hasn’t been fully annotated
 def get_next_image(session_data):
     with lock:
+        # Filter available images based on annotation counts and split
         available_images = [
             img for img in image_files
             if img not in annotation_counts or
+               ("val" in img and annotation_counts.get(img, 0) < 2) or
+               ("val" not in img and annotation_counts.get(img, 0) == 0)
         ]
+        print("Available images:", available_images)  # Debugging line
+        # Select an image to show based on split type
         if session_data["current_image"] is None and available_images:
+            random.shuffle(available_images)  # Shuffle for randomness
+            for img in available_images:
+                image_id = re.search(r'_(\d+)\.', img).group(1).lstrip('0')  # Extract image ID
+                split = "dev" if image_id in results else "train"
+                # Show 'dev' images twice
+                if split == "dev":
+                    if shown_counts.get(img, 0) < 2:
+                        shown_counts[img] = shown_counts.get(img, 0) + 1
+                        session_data["current_image"] = img
+                        print("Selected 'dev' image_id:", session_data["current_image"])
+                        break
+                # Show 'train' images once
+                else:
+                    session_data["current_image"] = img
+                    print("Selected 'train' image_id:", session_data["current_image"])
+                    break
     return os.path.join(image_folder, session_data["current_image"]) if session_data["current_image"] else None
+# Update save_annotation function to reset the shown count if 'dev' image is shown twice
 def save_annotation(caption, session_data):
+    global dataset, annotation_counts, shown_counts  # Include shown_counts
     if session_data["current_image"] is None:
         return gr.update(visible=False), gr.update(value="All images have been annotated!"), gr.update(value="")
         dataset.push_to_hub(dataset_name)
         print("Pushed updated dataset")
+        # Reset shown count if the 'dev' image has been shown twice
+        if split == "dev" and shown_counts.get(image_id, 0) >= 2:
+            shown_counts[image_id] = 0  # Reset count for 'dev' images shown twice
+            session_data["current_image"] = None
+        elif split == "train":
             session_data["current_image"] = None
     # Fetch the next image
     else:
         return gr.update(visible=False), gr.update(value="All images have been annotated!"), gr.update(value="")
 def initialize_interface(session_data):
     next_image = get_next_image(session_data)
     if next_image: