Spaces:

GeorgeIbrahim
/

Data_Collection

Sleeping

App Files Files Community

GeorgeIbrahim committed on Nov 8, 2024

Commit

8be8093

1 Parent(s): 52084ff

updates

Browse files

Files changed (1) hide show

app.py +60 -62

app.py CHANGED Viewed

@@ -13,103 +13,94 @@ if token:
     login(token=token)
 else:
     print("HUGGINGFACE_TOKEN environment variable not set.")
 dataset_name = "GeorgeIbrahim/EGYCOCO"  # Replace with your dataset name
-# Load or create the dataset with a new 'split' column
 try:
     dataset = load_dataset(dataset_name, split="train")
     print("Loaded existing dataset:", dataset)
-    # Load the nearest neighbors JSON file
-    with open('nearest_neighbors_with_captions.json', 'r') as f:
-        results = json.load(f)
-    # Define the new features with the added 'split' column
     features = Features({
         'image_id': Value(dtype='string'),
         'caption': Value(dtype='string'),
-        'annotation_count': Value(dtype='int32'),
-        'split': Value(dtype='string')  # New 'split' column
     })
-    # Populate the 'split' column based on whether image_id is in results
-    updated_data = {
-        'image_id': [],
-        'caption': [],
-        'annotation_count': [],
-        'split': []
-    }
-    for example in dataset:
-        image_id = example["image_id"]
-        updated_data['image_id'].append(image_id)
-        updated_data['caption'].append(example["caption"])
-        updated_data['annotation_count'].append(example["annotation_count"])
-        # Determine the split type based on whether it's in the validation set
-        split_type = "dev" if image_id in results else "train"
-        updated_data['split'].append(split_type)
-    # Create a new dataset with updated features and push to the hub
-    updated_dataset = Dataset.from_dict(updated_data, features=features)
-    updated_dataset.push_to_hub(dataset_name)
-    print("Dataset updated with 'split' column and pushed to Hugging Face.")
-except Exception as e:
-    print(f"Error loading or updating dataset: {e}")
 image_folder = "images"
 image_files = [f for f in os.listdir(image_folder) if f.endswith(('.png', '.jpg', '.jpeg'))]
 lock = threading.Lock()
-# Initialize annotation counts
-annotation_counts = {}
-for example in dataset:
-    image_id = example["image_id"]
-    count = example["annotation_count"]
-    annotation_counts[image_id] = count
 def get_caption_for_image_id(image_path):
     """
     Retrieve the caption for a given image_id from the JSON data.
     """
     match = re.search(r'_(\d+)\.', image_path)
     if match:
-        image_id = match.group(1).lstrip('0')
-        print("Searching for image_id:", image_id)
         if image_id in results:
-            print("Found caption in results:", results[image_id]["caption"])
             return results[image_id]["caption"]
         for test_image_data in results.values():
             for neighbor in test_image_data["nearest_neighbors"]:
                 if neighbor["image_id"] == image_id:
-                    print("Found caption in nearest neighbors:", neighbor["caption"])
                     return neighbor["caption"]
-    print("Caption not found for image_id:", image_id)
     return None
 # Function to get a random image that hasn’t been fully annotated
 def get_next_image(session_data):
     with lock:
         available_images = [
             img for img in image_files
-            if img not in annotation_counts or annotation_counts.get(img, 0) < (2 if img in results else 1)
         ]
-        print("Available images:", available_images)
         if session_data["current_image"] is None and available_images:
             session_data["current_image"] = random.choice(available_images)
-            print("Current image_id:", session_data["current_image"])
     return os.path.join(image_folder, session_data["current_image"]) if session_data["current_image"] else None
 # Function to save the annotation to Hugging Face dataset and fetch the next image
 def save_annotation(caption, session_data):
-    global dataset, annotation_counts
     if session_data["current_image"] is None:
         return gr.update(visible=False), gr.update(value="All images have been annotated!"), gr.update(value="")
@@ -117,66 +108,73 @@ def save_annotation(caption, session_data):
     with lock:
         image_id = session_data["current_image"]
         if caption.strip().lower() == "skip":
             caption = "skipped"
         annotation_count = annotation_counts.get(image_id, 0)
-        # Determine the split type based on whether it's in the validation set
-        split_type = "dev" if image_id in results else "train"
         new_data = Dataset.from_dict({
             "image_id": [image_id],
             "caption": [caption],
-            "annotation_count": [annotation_count + 1],
-            "split": [split_type]
         }, features=Features({
             'image_id': Value(dtype='string'),
             'caption': Value(dtype='string'),
-            'annotation_count': Value(dtype='int32'),
-            'split': Value(dtype='string')
         }))
         annotation_counts[image_id] = annotation_count + 1
         dataset = concatenate_datasets([dataset, new_data])
         dataset.push_to_hub(dataset_name)
         print("Pushed updated dataset")
-        if annotation_counts[image_id] >= (2 if image_id in results else 1):
             session_data["current_image"] = None
     next_image = get_next_image(session_data)
     if next_image:
-        next_caption = get_caption_for_image_id(os.path.basename(next_image))
-        print("Next image_id:", os.path.basename(next_image))
         return gr.update(value=next_image), gr.update(value=""), gr.update(value=next_caption or "")
     else:
         return gr.update(visible=False), gr.update(value="All images have been annotated!"), gr.update(value="")
 def initialize_interface(session_data):
     next_image = get_next_image(session_data)
     if next_image:
-        next_caption = get_caption_for_image_id(os.path.basename(next_image))
-        print("Initial image_id:", os.path.basename(next_image))
         return gr.update(value=next_image), gr.update(value=next_caption or "")
     else:
         return gr.update(visible=False), gr.update(value="All images have been annotated!")
 with gr.Blocks() as demo:
     gr.Markdown("# Image Captioning Tool")
     gr.Markdown("Please provide your caption in Egyptian Arabic 'Masri'")
-    session_data = gr.State({"current_image": None})
     with gr.Row():
         image = gr.Image()
         caption = gr.Textbox(placeholder="Enter caption here...")
-        existing_caption = gr.Textbox(label="Existing Caption", interactive=False)
         submit = gr.Button("Submit")
     submit.click(fn=save_annotation, inputs=[caption, session_data], outputs=[image, caption, existing_caption])
     demo.load(fn=initialize_interface, inputs=session_data, outputs=[image, existing_caption])
 demo.launch(share=True)

     login(token=token)
 else:
     print("HUGGINGFACE_TOKEN environment variable not set.")
 dataset_name = "GeorgeIbrahim/EGYCOCO"  # Replace with your dataset name
+# Load or create the dataset
 # Startup step: binds the module-level names `dataset` and `annotation_counts`
 # that get_next_image() and save_annotation() use later.
 try:
     dataset = load_dataset(dataset_name, split="train")
     print("Loaded existing dataset:", dataset)
+    # Create a dictionary to keep track of the highest annotation count for each image
+    annotation_counts = {}
+    for example in dataset:
+        image_id = example["image_id"]
+        count = example["annotation_count"]
         # Keep only the largest count seen for this image_id across all rows.
+        if image_id not in annotation_counts or count > annotation_counts[image_id]:
+            annotation_counts[image_id] = count
+    print("Annotation counts:", annotation_counts)
+except Exception as e:
     # NOTE(review): this handler runs for any exception raised in the try body,
     # not only for a missing dataset (for example, a row without an
     # "annotation_count" field), and it then pushes an empty dataset to
     # `dataset_name`. Confirm this cannot replace annotations that already
     # exist on the Hub.
+    print(f"Error loading dataset: {e}")
+    # Create an empty dataset if it doesn't exist
     features = Features({
         'image_id': Value(dtype='string'),
         'caption': Value(dtype='string'),
+        'annotation_count': Value(dtype='int32')  # Add annotation count feature
     })
+    dataset = Dataset.from_dict({'image_id': [], 'caption': [], 'annotation_count': []}, features=features)
+    annotation_counts = {}
+    dataset.push_to_hub(dataset_name)  # Push the empty dataset to Hugging Face
 image_folder = "images"
 image_files = [f for f in os.listdir(image_folder) if f.endswith(('.png', '.jpg', '.jpeg'))]
 lock = threading.Lock()
+with open('nearest_neighbors_with_captions.json', 'r') as f:
+    results = json.load(f)
 def get_caption_for_image_id(image_path):
     """
     Retrieve the caption for a given image_id from the JSON data.
     """
+    # Extract the numeric part of the image ID
     match = re.search(r'_(\d+)\.', image_path)
     if match:
+        image_id = match.group(1).lstrip('0')  # Remove leading zeros
+        print("Searching for image_id:", image_id)  # Debugging line
+        # Check if image_id is a test image
         if image_id in results:
+            print("Found caption in results:", results[image_id]["caption"])  # Debugging line
             return results[image_id]["caption"]
+        # If image_id is not a test image, search in nearest neighbors
         for test_image_data in results.values():
             for neighbor in test_image_data["nearest_neighbors"]:
                 if neighbor["image_id"] == image_id:
+                    print("Found caption in nearest neighbors:", neighbor["caption"])  # Debugging line
                     return neighbor["caption"]
+    # Return None if the image_id is not found
+    print("Caption not found for image_id:", image_id)  # Debugging line
     return None
 # Function to get a random image that hasn’t been fully annotated
 def get_next_image(session_data):
     with lock:
+        # Available images filter
         available_images = [
             img for img in image_files
+            if img not in annotation_counts or
+               ("val" in img and annotation_counts.get(img, 0) < 2) or
+               ("val" not in img and annotation_counts.get(img, 0) == 0)
         ]
+        print("Available images:", available_images)  # Debugging line
+        # Check if the user already has an image
         if session_data["current_image"] is None and available_images:
+            # Assign a new random image to the user
             session_data["current_image"] = random.choice(available_images)
+            print("Current image_id:", session_data["current_image"])  # Print the current image_id
     return os.path.join(image_folder, session_data["current_image"]) if session_data["current_image"] else None
 # Function to save the annotation to Hugging Face dataset and fetch the next image
 def save_annotation(caption, session_data):
+    global dataset, annotation_counts  # Declare global dataset and annotation_counts at the start of the function
     if session_data["current_image"] is None:
         return gr.update(visible=False), gr.update(value="All images have been annotated!"), gr.update(value="")
     with lock:
         image_id = session_data["current_image"]
+        # Save caption or "skipped" based on user input
         if caption.strip().lower() == "skip":
             caption = "skipped"
+        # Get current annotation count
         annotation_count = annotation_counts.get(image_id, 0)
+        # Add the new annotation as a new row to the dataset
         new_data = Dataset.from_dict({
             "image_id": [image_id],
             "caption": [caption],
+            "annotation_count": [annotation_count + 1]  # Increment the annotation count
         }, features=Features({
             'image_id': Value(dtype='string'),
             'caption': Value(dtype='string'),
+            'annotation_count': Value(dtype='int32')  # Ensure int32 type
         }))
+        # Update the annotation count in the dictionary
         annotation_counts[image_id] = annotation_count + 1
+        # Concatenate with the existing dataset and push the updated dataset to Hugging Face
         dataset = concatenate_datasets([dataset, new_data])
         dataset.push_to_hub(dataset_name)
         print("Pushed updated dataset")
+        # Clear user's current image if the validation image has been annotated twice
+        if ("val" not in image_id) or (annotation_count + 1 >= 2):
             session_data["current_image"] = None
+    # Fetch the next image
     next_image = get_next_image(session_data)
     if next_image:
+        next_caption = get_caption_for_image_id(os.path.basename(next_image))  # Retrieve the caption for the new image
+        print("Next image_id:", os.path.basename(next_image))  # Debugging line
         return gr.update(value=next_image), gr.update(value=""), gr.update(value=next_caption or "")
     else:
         return gr.update(visible=False), gr.update(value="All images have been annotated!"), gr.update(value="")
 def initialize_interface(session_data):
     next_image = get_next_image(session_data)
     if next_image:
+        next_caption = get_caption_for_image_id(os.path.basename(next_image))  # Retrieve caption for initial image
+        print("Initial image_id:", os.path.basename(next_image))  # Print the initial image_id
         return gr.update(value=next_image), gr.update(value=next_caption or "")
     else:
         return gr.update(visible=False), gr.update(value="All images have been annotated!")
+# Build the Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("# Image Captioning Tool")
     gr.Markdown("Please provide your caption in Egyptian Arabic 'Masri'")
+    session_data = gr.State({"current_image": None})  # Session-specific state
     # One row: the image, the caption input, the read-only existing caption, and Submit.
     with gr.Row():
         image = gr.Image()
         caption = gr.Textbox(placeholder="Enter caption here...")
+        existing_caption = gr.Textbox(label="Existing Caption", interactive=False)  # Display existing caption
         submit = gr.Button("Submit")
+    # Define actions for buttons
     # save_annotation receives the typed caption and the session state; its
     # three return values go to the image, the caption input, and the
     # existing-caption box, in that order.
     submit.click(fn=save_annotation, inputs=[caption, session_data], outputs=[image, caption, existing_caption])
+    # Load initial image
     # initialize_interface returns two updates: the image and the existing-caption box.
     demo.load(fn=initialize_interface, inputs=session_data, outputs=[image, existing_caption])
 # Start the app; share=True asks Gradio to also create a public share link.
 demo.launch(share=True)