Spaces:

GeorgeIbrahim
/

Data_Collection

Sleeping

App Files Files Community

GeorgeSherif committed on Nov 7, 2024

Commit

179fca4

1 Parent(s): d34d332

updates

Browse files

Files changed (1) hide show

app.py +62 -56

app.py CHANGED Viewed

@@ -4,6 +4,8 @@ import threading
 import random
 from datasets import load_dataset, Dataset, Features, Value, concatenate_datasets
 from huggingface_hub import login
 # Authenticate with Hugging Face
 token = os.getenv("HUGGINGFACE_TOKEN")
@@ -13,62 +15,70 @@ else:
     print("HUGGINGFACE_TOKEN environment variable not set.")
 dataset_name = "GeorgeIbrahim/EGYCOCO"  # Replace with your dataset name
-# Load the existing dataset or create it if not available
 try:
     dataset = load_dataset(dataset_name, split="train")
     print("Loaded existing dataset:", dataset)
 except Exception as e:
-    print("Failed to load dataset:", e)
-    dataset = None
-# Check if "annotation_count" exists, if not, add it
-if dataset is not None:
-    if "annotation_count" not in dataset.column_names:
-        # Define the updated features with annotation_count added
-        features = dataset.features.copy()
-        features["annotation_count"] = Value(dtype="int32")
-        # Update dataset with new feature, initializing annotation_count based on existing annotations
-        dataset = dataset.map(
-            lambda row: {"annotation_count": 1 if "val" in row["image_id"] else 0},
-            features=features
-        )
-        # Push the updated dataset with the new feature to Hugging Face Hub
-        dataset.push_to_hub(dataset_name)
-        print("Updated dataset with annotation_count and pushed to Hub")
 image_folder = "images"
 image_files = [f for f in os.listdir(image_folder) if f.endswith(('.png', '.jpg', '.jpeg'))]
 lock = threading.Lock()
-# Function to get a random image that hasn’t been fully annotated
 def get_next_image(session_data):
     with lock:
-        # Retrieve set of annotated images with counts
-        annotated_images = {item["image_id"]: item["annotation_count"] for item in dataset}
-        # Available images filter
-        available_images = [
-            img for img in image_files
-            if img not in annotated_images or
-               ("val" in img and annotated_images[img] < 2) or
-               ("val" not in img and annotated_images[img] == 0)
-        ]
         # Check if the user already has an image
         if session_data["current_image"] is None and available_images:
             # Assign a new random image to the user
             session_data["current_image"] = random.choice(available_images)
     return os.path.join(image_folder, session_data["current_image"]) if session_data["current_image"] else None
-# Function to save the annotation to the Hugging Face dataset and fetch the next image
 def save_annotation(caption, session_data):
-    global dataset  # Declare global at the start of the function
     if session_data["current_image"] is None:
-        return gr.update(visible=False), gr.update(value="All images have been annotated!")
     with lock:
         image_id = session_data["current_image"]
@@ -79,7 +89,8 @@ def save_annotation(caption, session_data):
         # Check if image is already in dataset to update count
         existing_image = dataset.filter(lambda x: x["image_id"] == image_id)
-        if len(existing_image):
             annotation_count = existing_image[0]["annotation_count"]
         else:
             annotation_count = 0
@@ -88,56 +99,51 @@ def save_annotation(caption, session_data):
         new_data = Dataset.from_dict({
             "image_id": [image_id],
             "caption": [caption],
-            "annotation_count": [annotation_count + 1]
         })
         dataset = concatenate_datasets([dataset, new_data])
-        # Save updated dataset to Hugging Face
         dataset.push_to_hub(dataset_name)
         print("Pushed updated dataset")
-        # Clear user's current image if the validation image has been annotated five times
-        if ("val" not in image_id) or (annotation_count + 1 >= 5):
-            session_data["current_image"] = None
     # Fetch the next image
     next_image = get_next_image(session_data)
     if next_image:
-        return gr.update(value=next_image), gr.update(value="")
     else:
-        return gr.update(visible=False), gr.update(value="All images have been annotated!")
-# Function to skip the current image
-def skip_image(session_data):
-    return save_annotation("skip", session_data)
-# Function to initialize the interface
 def initialize_interface(session_data):
     next_image = get_next_image(session_data)
     if next_image:
-        return gr.update(value=next_image), gr.update(value="")
     else:
         return gr.update(visible=False), gr.update(value="All images have been annotated!")
 # Build the Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("# Image Captioning Tool")
-    gr.Markdown("Please provide a caption for each image displayed. Click 'Submit' after writing your caption, or type 'skip' if you don’t want to annotate this image.")
     session_data = gr.State({"current_image": None})  # Session-specific state
     with gr.Row():
         image = gr.Image()
         caption = gr.Textbox(placeholder="Enter caption here...")
         submit = gr.Button("Submit")
-        skip = gr.Button("Skip")  # Skip button
     # Define actions for buttons
-    submit.click(fn=save_annotation, inputs=[caption, session_data], outputs=[image, caption])
-    skip.click(fn=skip_image, inputs=session_data, outputs=[image, caption])
     # Load initial image
-    demo.load(fn=initialize_interface, inputs=session_data, outputs=[image, caption])
 demo.launch(share=True)

 import random
 from datasets import load_dataset, Dataset, Features, Value, concatenate_datasets
 from huggingface_hub import login
+import json
+import re
 # Authenticate with Hugging Face
 token = os.getenv("HUGGINGFACE_TOKEN")
     print("HUGGINGFACE_TOKEN environment variable not set.")
 dataset_name = "GeorgeIbrahim/EGYCOCO"  # Replace with your dataset name
+# Load or create the dataset
 try:
     dataset = load_dataset(dataset_name, split="train")
     print("Loaded existing dataset:", dataset)
 except Exception as e:
+    # Create an empty dataset if it doesn't exist
+    features = Features({
+        'image_id': Value(dtype='string'),
+        'caption': Value(dtype='string'),
+        'annotation_count': Value(dtype='int32')  # Add annotation count feature
+    })
+    dataset = Dataset.from_dict({'image_id': [], 'caption': [], 'annotation_count': []}, features=features)
+    dataset.push_to_hub(dataset_name)  # Push the empty dataset to Hugging Face
 image_folder = "images"
 image_files = [f for f in os.listdir(image_folder) if f.endswith(('.png', '.jpg', '.jpeg'))]
 lock = threading.Lock()
+with open('nearest_neighbors_with_captions.json', 'r') as f:
+    results = json.load(f)
+def get_caption_for_image_id(image_path):
+    """
+    Retrieve the caption for a given image_id from the JSON data.
+    """
+    # Extract the numeric part of the image ID
+    match = re.search(r'_(\d+)\.', image_path)
+    if match:
+        image_id = match.group(1).lstrip('0')  # Remove leading zeros
+        print("Searching for image_id:", image_id)  # Debugging line
+        # Check if image_id is a test image
+        if image_id in results:
+            print("Found caption in results:", results[image_id]["caption"])  # Debugging line
+            return results[image_id]["caption"]
+        # If image_id is not a test image, search in nearest neighbors
+        for test_image_data in results.values():
+            for neighbor in test_image_data["nearest_neighbors"]:
+                if neighbor["image_id"] == image_id:
+                    print("Found caption in nearest neighbors:", neighbor["caption"])  # Debugging line
+                    return neighbor["caption"]
+    # Return None if the image_id is not found
+    print("Caption not found for image_id:", image_id)  # Debugging line
+    return None
+# Function to get a random image that hasn’t been annotated or skipped
 def get_next_image(session_data):
     with lock:
+        annotated_images = set(dataset["image_id"])  # Set of annotated images
+        available_images = [img for img in image_files if img not in annotated_images]
         # Check if the user already has an image
         if session_data["current_image"] is None and available_images:
             # Assign a new random image to the user
             session_data["current_image"] = random.choice(available_images)
     return os.path.join(image_folder, session_data["current_image"]) if session_data["current_image"] else None
+# Function to save the annotation to Hugging Face dataset and fetch the next image
 def save_annotation(caption, session_data):
+    global dataset  # Declare global dataset at the start of the function
     if session_data["current_image"] is None:
+        return gr.update(visible=False), gr.update(value="All images have been annotated!"), gr.update(value="")
     with lock:
         image_id = session_data["current_image"]
         # Check if image is already in dataset to update count
         existing_image = dataset.filter(lambda x: x["image_id"] == image_id)
+        if len(existing_image) > 0:
+            # Get current annotation count
             annotation_count = existing_image[0]["annotation_count"]
         else:
             annotation_count = 0
         new_data = Dataset.from_dict({
             "image_id": [image_id],
             "caption": [caption],
+            "annotation_count": [annotation_count + 1]  # Increment the annotation count
         })
+        # Concatenate with the existing dataset and push the updated dataset to Hugging Face
         dataset = concatenate_datasets([dataset, new_data])
         dataset.push_to_hub(dataset_name)
         print("Pushed updated dataset")
+        # Clear user's current image so they get a new one next time
+        session_data["current_image"] = None
     # Fetch the next image
     next_image = get_next_image(session_data)
     if next_image:
+        next_caption = get_caption_for_image_id(os.path.basename(next_image))  # Retrieve the caption for the new image
+        return gr.update(value=next_image), gr.update(value=""), gr.update(value=next_caption or "")
     else:
+        return gr.update(visible=False), gr.update(value="All images have been annotated!"), gr.update(value="")
 def initialize_interface(session_data):
     next_image = get_next_image(session_data)
     if next_image:
+        next_caption = get_caption_for_image_id(os.path.basename(next_image))  # Retrieve caption for initial image
+        print(next_caption)
+        return gr.update(value=next_image), gr.update(value=next_caption or "")
     else:
         return gr.update(visible=False), gr.update(value="All images have been annotated!")
 # Build the Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("# Image Captioning Tool")
+    gr.Markdown("Please provide your caption in Egyptian Arabic 'Masri'")
     session_data = gr.State({"current_image": None})  # Session-specific state
     with gr.Row():
         image = gr.Image()
         caption = gr.Textbox(placeholder="Enter caption here...")
+        existing_caption = gr.Textbox(label="Existing Caption", interactive=False)  # Display existing caption
         submit = gr.Button("Submit")
     # Define actions for buttons
+    submit.click(fn=save_annotation, inputs=[caption, session_data], outputs=[image, caption, existing_caption])
     # Load initial image
+    demo.load(fn=initialize_interface, inputs=session_data, outputs=[image, existing_caption])
 demo.launch(share=True)