Spaces:

GeorgeIbrahim
/

Data_Collection

Sleeping

App Files Community

GeorgeIbrahim committed on Nov 8, 2024

Commit

bd6fc64

1 Parent(s): 54a81cc

updates

Browse files

Files changed (1) hide show

app.py +14 -8

app.py CHANGED Viewed

@@ -15,7 +15,7 @@ else:
     print("HUGGINGFACE_TOKEN environment variable not set.")
 dataset_name = "GeorgeIbrahim/EGYCOCO"  # Replace with your dataset name
-# Load or create the dataset
 try:
     dataset = load_dataset(dataset_name, split="train")
     print("Loaded existing dataset:", dataset)
@@ -34,9 +34,10 @@ except Exception as e:
     features = Features({
         'image_id': Value(dtype='string'),
         'caption': Value(dtype='string'),
-        'annotation_count': Value(dtype='int32')  # Add annotation count feature
     })
-    dataset = Dataset.from_dict({'image_id': [], 'caption': [], 'annotation_count': []}, features=features)
     annotation_counts = {}
     dataset.push_to_hub(dataset_name)  # Push the empty dataset to Hugging Face
@@ -48,12 +49,12 @@ lock = threading.Lock()
 with open('nearest_neighbors_with_captions.json', 'r') as f:
     results = json.load(f)
-# Populate the annotation counts based on validation and neighbor distinction
 for image_id, data in results.items():
-    # Mark validation image to require two annotations
     annotation_counts[image_id] = annotation_counts.get(image_id, 0)
-    # Mark each nearest neighbor to require only one annotation
     for neighbor in data["nearest_neighbors"]:
         neighbor_id = neighbor["image_id"]
         annotation_counts[neighbor_id] = annotation_counts.get(neighbor_id, 0)
@@ -113,14 +114,19 @@ def save_annotation(caption, session_data):
         annotation_count = annotation_counts.get(image_id, 0)
         new_data = Dataset.from_dict({
             "image_id": [image_id],
             "caption": [caption],
-            "annotation_count": [annotation_count + 1]
         }, features=Features({
             'image_id': Value(dtype='string'),
             'caption': Value(dtype='string'),
-            'annotation_count': Value(dtype='int32')
         }))
         annotation_counts[image_id] = annotation_count + 1

     print("HUGGINGFACE_TOKEN environment variable not set.")
 dataset_name = "GeorgeIbrahim/EGYCOCO"  # Replace with your dataset name
+# Load or create the dataset with a new 'split' column
 try:
     dataset = load_dataset(dataset_name, split="train")
     print("Loaded existing dataset:", dataset)
     features = Features({
         'image_id': Value(dtype='string'),
         'caption': Value(dtype='string'),
+        'annotation_count': Value(dtype='int32'),  # Add annotation count feature
+        'split': Value(dtype='string')  # Add split column to mark as "dev" or "train"
     })
+    dataset = Dataset.from_dict({'image_id': [], 'caption': [], 'annotation_count': [], 'split': []}, features=features)
     annotation_counts = {}
     dataset.push_to_hub(dataset_name)  # Push the empty dataset to Hugging Face
 with open('nearest_neighbors_with_captions.json', 'r') as f:
     results = json.load(f)
+# Populate the annotation counts and set split types based on validation and neighbor distinction
 for image_id, data in results.items():
+    # Mark validation image to require two annotations and set split as "dev"
     annotation_counts[image_id] = annotation_counts.get(image_id, 0)
+    # Mark each nearest neighbor to require only one annotation and set split as "train"
     for neighbor in data["nearest_neighbors"]:
         neighbor_id = neighbor["image_id"]
         annotation_counts[neighbor_id] = annotation_counts.get(neighbor_id, 0)
         annotation_count = annotation_counts.get(image_id, 0)
+        # Determine the split type based on whether it's in the validation set
+        split_type = "dev" if image_id in results else "train"
         new_data = Dataset.from_dict({
             "image_id": [image_id],
             "caption": [caption],
+            "annotation_count": [annotation_count + 1],
+            "split": [split_type]
         }, features=Features({
             'image_id': Value(dtype='string'),
             'caption': Value(dtype='string'),
+            'annotation_count': Value(dtype='int32'),
+            'split': Value(dtype='string')
         }))
         annotation_counts[image_id] = annotation_count + 1