GeorgeIbrahim committed on
Commit
52084ff
·
1 Parent(s): 33c417d
Files changed (1) hide show
  1. app.py +54 -39
app.py CHANGED
@@ -13,47 +13,61 @@ if token:
13
  login(token=token)
14
  else:
15
  print("HUGGINGFACE_TOKEN environment variable not set.")
16
- dataset_name = "GeorgeIbrahim/EGYCOCO" # Replace with your dataset name
17
 
18
- # Define the updated features including the new 'split' column
19
- features = Features({
20
- 'image_id': Value(dtype='string'),
21
- 'caption': Value(dtype='string'),
22
- 'annotation_count': Value(dtype='int32'), # Annotation count
23
- 'split': Value(dtype='string') # New 'split' column
24
- })
25
 
26
- # Load the dataset or create it if it doesn’t exist, with updated features
27
  try:
28
- dataset = load_dataset(dataset_name, split="train").cast(features)
29
- print("Loaded existing dataset with updated features:", dataset)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
- # Initialize annotation counts
32
- annotation_counts = {}
33
  for example in dataset:
34
  image_id = example["image_id"]
35
- count = example["annotation_count"]
36
- annotation_counts[image_id] = count
 
 
 
 
 
 
 
 
 
 
37
  except Exception as e:
38
- print(f"Error loading dataset: {e}")
39
- # Create an empty dataset if it doesn't exist
40
- dataset = Dataset.from_dict({'image_id': [], 'caption': [], 'annotation_count': [], 'split': []}, features=features)
41
- annotation_counts = {}
42
- dataset.push_to_hub(dataset_name) # Push the empty dataset to Hugging Face if it's new
43
-
44
- # Load the nearest neighbors JSON file
45
- with open('nearest_neighbors_with_captions.json', 'r') as f:
46
- results = json.load(f)
47
-
48
- # Populate the annotation counts and set split types based on validation and neighbor distinction
49
- for image_id, data in results.items():
50
- # Mark validation image to require two annotations and set split as "dev"
51
- annotation_counts[image_id] = annotation_counts.get(image_id, 0)
52
-
53
- # Mark each nearest neighbor to require only one annotation and set split as "train"
54
- for neighbor in data["nearest_neighbors"]:
55
- neighbor_id = neighbor["image_id"]
56
- annotation_counts[neighbor_id] = annotation_counts.get(neighbor_id, 0)
57
 
58
  def get_caption_for_image_id(image_path):
59
  """
@@ -77,7 +91,6 @@ def get_caption_for_image_id(image_path):
77
  print("Caption not found for image_id:", image_id)
78
  return None
79
 
80
-
81
  # Function to get a random image that hasn’t been fully annotated
82
  def get_next_image(session_data):
83
  with lock:
@@ -94,7 +107,6 @@ def get_next_image(session_data):
94
 
95
  return os.path.join(image_folder, session_data["current_image"]) if session_data["current_image"] else None
96
 
97
-
98
  # Function to save the annotation to Hugging Face dataset and fetch the next image
99
  def save_annotation(caption, session_data):
100
  global dataset, annotation_counts
@@ -118,7 +130,12 @@ def save_annotation(caption, session_data):
118
  "caption": [caption],
119
  "annotation_count": [annotation_count + 1],
120
  "split": [split_type]
121
- }, features=features)
 
 
 
 
 
122
 
123
  annotation_counts[image_id] = annotation_count + 1
124
 
@@ -137,7 +154,6 @@ def save_annotation(caption, session_data):
137
  else:
138
  return gr.update(visible=False), gr.update(value="All images have been annotated!"), gr.update(value="")
139
 
140
-
141
  def initialize_interface(session_data):
142
  next_image = get_next_image(session_data)
143
  if next_image:
@@ -147,7 +163,6 @@ def initialize_interface(session_data):
147
  else:
148
  return gr.update(visible=False), gr.update(value="All images have been annotated!")
149
 
150
-
151
  with gr.Blocks() as demo:
152
  gr.Markdown("# Image Captioning Tool")
153
  gr.Markdown("Please provide your caption in Egyptian Arabic 'Masri'")
 
13
  login(token=token)
14
  else:
15
  print("HUGGINGFACE_TOKEN environment variable not set.")
 
16
 
17
+ dataset_name = "GeorgeIbrahim/EGYCOCO" # Replace with your dataset name
 
 
 
 
 
 
18
 
19
+ # Load or create the dataset with a new 'split' column
20
  try:
21
+ dataset = load_dataset(dataset_name, split="train")
22
+ print("Loaded existing dataset:", dataset)
23
+
24
+ # Load the nearest neighbors JSON file
25
+ with open('nearest_neighbors_with_captions.json', 'r') as f:
26
+ results = json.load(f)
27
+
28
+ # Define the new features with the added 'split' column
29
+ features = Features({
30
+ 'image_id': Value(dtype='string'),
31
+ 'caption': Value(dtype='string'),
32
+ 'annotation_count': Value(dtype='int32'),
33
+ 'split': Value(dtype='string') # New 'split' column
34
+ })
35
+
36
+ # Populate the 'split' column based on whether image_id is in results
37
+ updated_data = {
38
+ 'image_id': [],
39
+ 'caption': [],
40
+ 'annotation_count': [],
41
+ 'split': []
42
+ }
43
 
 
 
44
  for example in dataset:
45
  image_id = example["image_id"]
46
+ updated_data['image_id'].append(image_id)
47
+ updated_data['caption'].append(example["caption"])
48
+ updated_data['annotation_count'].append(example["annotation_count"])
49
+ # Determine the split type based on whether it's in the validation set
50
+ split_type = "dev" if image_id in results else "train"
51
+ updated_data['split'].append(split_type)
52
+
53
+ # Create a new dataset with updated features and push to the hub
54
+ updated_dataset = Dataset.from_dict(updated_data, features=features)
55
+ updated_dataset.push_to_hub(dataset_name)
56
+ print("Dataset updated with 'split' column and pushed to Hugging Face.")
57
+
58
  except Exception as e:
59
+ print(f"Error loading or updating dataset: {e}")
60
+
61
+ image_folder = "images"
62
+ image_files = [f for f in os.listdir(image_folder) if f.endswith(('.png', '.jpg', '.jpeg'))]
63
+ lock = threading.Lock()
64
+
65
+ # Initialize annotation counts
66
+ annotation_counts = {}
67
+ for example in dataset:
68
+ image_id = example["image_id"]
69
+ count = example["annotation_count"]
70
+ annotation_counts[image_id] = count
 
 
 
 
 
 
 
71
 
72
  def get_caption_for_image_id(image_path):
73
  """
 
91
  print("Caption not found for image_id:", image_id)
92
  return None
93
 
 
94
  # Function to get a random image that hasn’t been fully annotated
95
  def get_next_image(session_data):
96
  with lock:
 
107
 
108
  return os.path.join(image_folder, session_data["current_image"]) if session_data["current_image"] else None
109
 
 
110
  # Function to save the annotation to Hugging Face dataset and fetch the next image
111
  def save_annotation(caption, session_data):
112
  global dataset, annotation_counts
 
130
  "caption": [caption],
131
  "annotation_count": [annotation_count + 1],
132
  "split": [split_type]
133
+ }, features=Features({
134
+ 'image_id': Value(dtype='string'),
135
+ 'caption': Value(dtype='string'),
136
+ 'annotation_count': Value(dtype='int32'),
137
+ 'split': Value(dtype='string')
138
+ }))
139
 
140
  annotation_counts[image_id] = annotation_count + 1
141
 
 
154
  else:
155
  return gr.update(visible=False), gr.update(value="All images have been annotated!"), gr.update(value="")
156
 
 
157
  def initialize_interface(session_data):
158
  next_image = get_next_image(session_data)
159
  if next_image:
 
163
  else:
164
  return gr.update(visible=False), gr.update(value="All images have been annotated!")
165
 
 
166
  with gr.Blocks() as demo:
167
  gr.Markdown("# Image Captioning Tool")
168
  gr.Markdown("Please provide your caption in Egyptian Arabic 'Masri'")