GeorgeSherif committed on
Commit
077d427
·
1 Parent(s): b98ca58
Files changed (1) hide show
  1. app.py +15 -28
app.py CHANGED
@@ -4,6 +4,7 @@ import threading
4
  import random
5
  from datasets import load_dataset, Dataset, Features, Value, concatenate_datasets
6
  from huggingface_hub import login
 
7
  # Authenticate with Hugging Face
8
  token = os.getenv("HUGGINGFACE_TOKEN")
9
  if token:
@@ -12,40 +13,27 @@ else:
12
  print("HUGGINGFACE_TOKEN environment variable not set.")
13
  dataset_name = "GeorgeIbrahim/EGYCOCO" # Replace with your dataset name
14
 
15
- # Load or create the dataset with train and val splits
16
  try:
17
- dataset = load_dataset(dataset_name)
18
  print("Loaded existing dataset:", dataset)
19
  except Exception as e:
20
- # Create empty datasets for train and val splits if they don't exist
21
  features = Features({
22
  'image_id': Value(dtype='string'),
23
  'caption': Value(dtype='string'),
24
  })
25
- train_dataset = Dataset.from_dict({'image_id': [], 'caption': []}, features=features)
26
- val_dataset = Dataset.from_dict({'image_id': [], 'caption': []}, features=features)
27
- dataset = {"train": train_dataset, "val": val_dataset}
28
- # Push empty splits to Hugging Face
29
- dataset["train"].push_to_hub(f"{dataset_name}", split="train")
30
- dataset["val"].push_to_hub(f"{dataset_name}", split="val")
31
-
32
- image_folder = "test"
33
  image_files = [f for f in os.listdir(image_folder) if f.endswith(('.png', '.jpg', '.jpeg'))]
34
  lock = threading.Lock()
35
 
36
- # Function to get the appropriate split from the image ID
37
- def get_split_from_image_id(image_id):
38
- if "train" in image_id:
39
- return "train"
40
- elif "val" in image_id:
41
- return "val"
42
- else:
43
- raise ValueError("Image ID does not contain a valid split identifier (train/val).")
44
-
45
  # Function to get a random image that hasn’t been annotated or skipped
46
  def get_next_image(session_data):
47
  with lock:
48
- annotated_images = set(dataset["train"]["image_id"]) | set(dataset["val"]["image_id"]) # Combine annotated image IDs from both splits
49
  available_images = [img for img in image_files if img not in annotated_images]
50
  # Check if the user already has an image
51
  if session_data["current_image"] is None and available_images:
@@ -53,27 +41,26 @@ def get_next_image(session_data):
53
  session_data["current_image"] = random.choice(available_images)
54
  return os.path.join(image_folder, session_data["current_image"]) if session_data["current_image"] else None
55
 
56
- # Function to save the annotation to the correct split and fetch the next image
57
  def save_annotation(caption, session_data):
58
  if session_data["current_image"] is None:
59
  return gr.update(visible=False), gr.update(value="All images have been annotated!")
60
 
61
  with lock:
62
  image_id = session_data["current_image"]
63
- split = get_split_from_image_id(image_id)
64
 
65
  # Save caption or "skipped" based on user input
66
  if caption.strip().lower() == "skip":
67
  caption = "skipped"
68
 
69
- # Add the new annotation as a new row to the appropriate split
70
  new_data = Dataset.from_dict({"image_id": [image_id], "caption": [caption]})
71
  global dataset
72
- dataset[split] = concatenate_datasets([dataset[split], new_data])
73
 
74
- # Save updated split to Hugging Face
75
- dataset[split].push_to_hub(dataset_name, split=split)
76
- print(f"Pushed updated dataset for split: {split}")
77
 
78
  # Clear user's current image so they get a new one next time
79
  session_data["current_image"] = None
 
4
  import random
5
  from datasets import load_dataset, Dataset, Features, Value, concatenate_datasets
6
  from huggingface_hub import login
7
+
8
  # Authenticate with Hugging Face
9
  token = os.getenv("HUGGINGFACE_TOKEN")
10
  if token:
 
13
  print("HUGGINGFACE_TOKEN environment variable not set.")
14
  dataset_name = "GeorgeIbrahim/EGYCOCO" # Replace with your dataset name
15
 
16
+ # Load or create the dataset
17
  try:
18
+ dataset = load_dataset(dataset_name, split="train")
19
  print("Loaded existing dataset:", dataset)
20
  except Exception as e:
21
+ # Create an empty dataset if it doesn't exist
22
  features = Features({
23
  'image_id': Value(dtype='string'),
24
  'caption': Value(dtype='string'),
25
  })
26
+ dataset = Dataset.from_dict({'image_id': [], 'caption': []}, features=features)
27
+ dataset.push_to_hub(dataset_name) # Push the empty dataset to Hugging Face
28
+
29
+ image_folder = "images"
 
 
 
 
30
  image_files = [f for f in os.listdir(image_folder) if f.endswith(('.png', '.jpg', '.jpeg'))]
31
  lock = threading.Lock()
32
 
 
 
 
 
 
 
 
 
 
33
  # Function to get a random image that hasn’t been annotated or skipped
34
  def get_next_image(session_data):
35
  with lock:
36
+ annotated_images = set(dataset["image_id"]) # Set of annotated images
37
  available_images = [img for img in image_files if img not in annotated_images]
38
  # Check if the user already has an image
39
  if session_data["current_image"] is None and available_images:
 
41
  session_data["current_image"] = random.choice(available_images)
42
  return os.path.join(image_folder, session_data["current_image"]) if session_data["current_image"] else None
43
 
44
+ # Function to save the annotation to Hugging Face dataset and fetch the next image
45
  def save_annotation(caption, session_data):
46
  if session_data["current_image"] is None:
47
  return gr.update(visible=False), gr.update(value="All images have been annotated!")
48
 
49
  with lock:
50
  image_id = session_data["current_image"]
 
51
 
52
  # Save caption or "skipped" based on user input
53
  if caption.strip().lower() == "skip":
54
  caption = "skipped"
55
 
56
+ # Add the new annotation as a new row to the dataset
57
  new_data = Dataset.from_dict({"image_id": [image_id], "caption": [caption]})
58
  global dataset
59
+ dataset = concatenate_datasets([dataset, new_data])
60
 
61
+ # Save updated dataset to Hugging Face
62
+ dataset.push_to_hub(dataset_name)
63
+ print("Pushed updated dataset")
64
 
65
  # Clear user's current image so they get a new one next time
66
  session_data["current_image"] = None