GeorgeIbrahim committed on
Commit
7de933f
·
1 Parent(s): 8be8093
Files changed (1) hide show
  1. app.py +12 -8
app.py CHANGED
@@ -15,6 +15,9 @@ else:
15
  print("HUGGINGFACE_TOKEN environment variable not set.")
16
  dataset_name = "GeorgeIbrahim/EGYCOCO" # Replace with your dataset name
17
 
 
 
 
18
  # Load or create the dataset
19
  try:
20
  dataset = load_dataset(dataset_name, split="train")
@@ -25,6 +28,7 @@ try:
25
  for example in dataset:
26
  image_id = example["image_id"]
27
  count = example["annotation_count"]
 
28
  if image_id not in annotation_counts or count > annotation_counts[image_id]:
29
  annotation_counts[image_id] = count
30
 
@@ -35,9 +39,10 @@ except Exception as e:
35
  features = Features({
36
  'image_id': Value(dtype='string'),
37
  'caption': Value(dtype='string'),
38
- 'annotation_count': Value(dtype='int32') # Add annotation count feature
 
39
  })
40
- dataset = Dataset.from_dict({'image_id': [], 'caption': [], 'annotation_count': []}, features=features)
41
  annotation_counts = {}
42
  dataset.push_to_hub(dataset_name) # Push the empty dataset to Hugging Face
43
 
@@ -45,9 +50,6 @@ image_folder = "images"
45
  image_files = [f for f in os.listdir(image_folder) if f.endswith(('.png', '.jpg', '.jpeg'))]
46
  lock = threading.Lock()
47
 
48
- with open('nearest_neighbors_with_captions.json', 'r') as f:
49
- results = json.load(f)
50
-
51
 
52
  def get_caption_for_image_id(image_path):
53
  """
@@ -107,7 +109,7 @@ def save_annotation(caption, session_data):
107
 
108
  with lock:
109
  image_id = session_data["current_image"]
110
-
111
  # Save caption or "skipped" based on user input
112
  if caption.strip().lower() == "skip":
113
  caption = "skipped"
@@ -119,11 +121,13 @@ def save_annotation(caption, session_data):
119
  new_data = Dataset.from_dict({
120
  "image_id": [image_id],
121
  "caption": [caption],
122
- "annotation_count": [annotation_count + 1] # Increment the annotation count
 
123
  }, features=Features({
124
  'image_id': Value(dtype='string'),
125
  'caption': Value(dtype='string'),
126
- 'annotation_count': Value(dtype='int32') # Ensure int32 type
 
127
  }))
128
 
129
  # Update the annotation count in the dictionary
 
15
  print("HUGGINGFACE_TOKEN environment variable not set.")
16
  dataset_name = "GeorgeIbrahim/EGYCOCO" # Replace with your dataset name
17
 
18
+ with open('nearest_neighbors_with_captions.json', 'r') as f:
19
+ results = json.load(f)
20
+
21
  # Load or create the dataset
22
  try:
23
  dataset = load_dataset(dataset_name, split="train")
 
28
  for example in dataset:
29
  image_id = example["image_id"]
30
  count = example["annotation_count"]
31
+
32
  if image_id not in annotation_counts or count > annotation_counts[image_id]:
33
  annotation_counts[image_id] = count
34
 
 
39
  features = Features({
40
  'image_id': Value(dtype='string'),
41
  'caption': Value(dtype='string'),
42
+ 'annotation_count': Value(dtype='int32'),
43
+ 'split': Value(dtype='string')
44
  })
45
+ dataset = Dataset.from_dict({'image_id': [], 'caption': [], 'annotation_count': [], 'split': []}, features=features)
46
  annotation_counts = {}
47
  dataset.push_to_hub(dataset_name) # Push the empty dataset to Hugging Face
48
 
 
50
  image_files = [f for f in os.listdir(image_folder) if f.endswith(('.png', '.jpg', '.jpeg'))]
51
  lock = threading.Lock()
52
 
 
 
 
53
 
54
  def get_caption_for_image_id(image_path):
55
  """
 
109
 
110
  with lock:
111
  image_id = session_data["current_image"]
112
+ split = "dev" if image_id in results else "train"
113
  # Save caption or "skipped" based on user input
114
  if caption.strip().lower() == "skip":
115
  caption = "skipped"
 
121
  new_data = Dataset.from_dict({
122
  "image_id": [image_id],
123
  "caption": [caption],
124
+ "annotation_count": [annotation_count + 1],
125
+ "split": [split]
126
  }, features=Features({
127
  'image_id': Value(dtype='string'),
128
  'caption': Value(dtype='string'),
129
+ 'annotation_count': Value(dtype='int32'),
130
+ 'split': Value(dtype='string')
131
  }))
132
 
133
  # Update the annotation count in the dictionary