GeorgeIbrahim committed on
Commit
52084ff
·
1 Parent(s): 33c417d
Files changed (1) hide show
  1. app.py +54 -39
app.py CHANGED
@@ -13,47 +13,61 @@ if token:
13
  login(token=token)
14
  else:
15
  print("HUGGINGFACE_TOKEN environment variable not set.")
16
- dataset_name = "GeorgeIbrahim/EGYCOCO" # Replace with your dataset name
17
 
18
- # Define the updated features including the new 'split' column
19
- features = Features({
20
- 'image_id': Value(dtype='string'),
21
- 'caption': Value(dtype='string'),
22
- 'annotation_count': Value(dtype='int32'), # Annotation count
23
- 'split': Value(dtype='string') # New 'split' column
24
- })
25
 
26
- # Load the dataset or create it if it doesn’t exist, with updated features
27
  try:
28
- dataset = load_dataset(dataset_name, split="train").cast(features)
29
- print("Loaded existing dataset with updated features:", dataset)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
- # Initialize annotation counts
32
- annotation_counts = {}
33
  for example in dataset:
34
  image_id = example["image_id"]
35
- count = example["annotation_count"]
36
- annotation_counts[image_id] = count
 
 
 
 
 
 
 
 
 
 
37
  except Exception as e:
38
- print(f"Error loading dataset: {e}")
39
- # Create an empty dataset if it doesn't exist
40
- dataset = Dataset.from_dict({'image_id': [], 'caption': [], 'annotation_count': [], 'split': []}, features=features)
41
- annotation_counts = {}
42
- dataset.push_to_hub(dataset_name) # Push the empty dataset to Hugging Face if it's new
43
-
44
- # Load the nearest neighbors JSON file
45
- with open('nearest_neighbors_with_captions.json', 'r') as f:
46
- results = json.load(f)
47
-
48
- # Populate the annotation counts and set split types based on validation and neighbor distinction
49
- for image_id, data in results.items():
50
- # Mark validation image to require two annotations and set split as "dev"
51
- annotation_counts[image_id] = annotation_counts.get(image_id, 0)
52
-
53
- # Mark each nearest neighbor to require only one annotation and set split as "train"
54
- for neighbor in data["nearest_neighbors"]:
55
- neighbor_id = neighbor["image_id"]
56
- annotation_counts[neighbor_id] = annotation_counts.get(neighbor_id, 0)
57
 
58
  def get_caption_for_image_id(image_path):
59
  """
@@ -77,7 +91,6 @@ def get_caption_for_image_id(image_path):
77
  print("Caption not found for image_id:", image_id)
78
  return None
79
 
80
-
81
  # Function to get a random image that hasn’t been fully annotated
82
  def get_next_image(session_data):
83
  with lock:
@@ -94,7 +107,6 @@ def get_next_image(session_data):
94
 
95
  return os.path.join(image_folder, session_data["current_image"]) if session_data["current_image"] else None
96
 
97
-
98
  # Function to save the annotation to Hugging Face dataset and fetch the next image
99
  def save_annotation(caption, session_data):
100
  global dataset, annotation_counts
@@ -118,7 +130,12 @@ def save_annotation(caption, session_data):
118
  "caption": [caption],
119
  "annotation_count": [annotation_count + 1],
120
  "split": [split_type]
121
- }, features=features)
 
 
 
 
 
122
 
123
  annotation_counts[image_id] = annotation_count + 1
124
 
@@ -137,7 +154,6 @@ def save_annotation(caption, session_data):
137
  else:
138
  return gr.update(visible=False), gr.update(value="All images have been annotated!"), gr.update(value="")
139
 
140
-
141
  def initialize_interface(session_data):
142
  next_image = get_next_image(session_data)
143
  if next_image:
@@ -147,7 +163,6 @@ def initialize_interface(session_data):
147
  else:
148
  return gr.update(visible=False), gr.update(value="All images have been annotated!")
149
 
150
-
151
  with gr.Blocks() as demo:
152
  gr.Markdown("# Image Captioning Tool")
153
  gr.Markdown("Please provide your caption in Egyptian Arabic 'Masri'")
 
13
  login(token=token)
14
  else:
15
  print("HUGGINGFACE_TOKEN environment variable not set.")
 
16
 
17
+ dataset_name = "GeorgeIbrahim/EGYCOCO" # Replace with your dataset name
 
 
 
 
 
 
18
 
19
+ # Load or create the dataset with a new 'split' column
20
  try:
21
+ dataset = load_dataset(dataset_name, split="train")
22
+ print("Loaded existing dataset:", dataset)
23
+
24
+ # Load the nearest neighbors JSON file
25
+ with open('nearest_neighbors_with_captions.json', 'r') as f:
26
+ results = json.load(f)
27
+
28
+ # Define the new features with the added 'split' column
29
+ features = Features({
30
+ 'image_id': Value(dtype='string'),
31
+ 'caption': Value(dtype='string'),
32
+ 'annotation_count': Value(dtype='int32'),
33
+ 'split': Value(dtype='string') # New 'split' column
34
+ })
35
+
36
+ # Populate the 'split' column based on whether image_id is in results
37
+ updated_data = {
38
+ 'image_id': [],
39
+ 'caption': [],
40
+ 'annotation_count': [],
41
+ 'split': []
42
+ }
43
 
 
 
44
  for example in dataset:
45
  image_id = example["image_id"]
46
+ updated_data['image_id'].append(image_id)
47
+ updated_data['caption'].append(example["caption"])
48
+ updated_data['annotation_count'].append(example["annotation_count"])
49
+ # Determine the split type based on whether it's in the validation set
50
+ split_type = "dev" if image_id in results else "train"
51
+ updated_data['split'].append(split_type)
52
+
53
+ # Create a new dataset with updated features and push to the hub
54
+ updated_dataset = Dataset.from_dict(updated_data, features=features)
55
+ updated_dataset.push_to_hub(dataset_name)
56
+ print("Dataset updated with 'split' column and pushed to Hugging Face.")
57
+
58
  except Exception as e:
59
+ print(f"Error loading or updating dataset: {e}")
60
+
61
+ image_folder = "images"
62
+ image_files = [f for f in os.listdir(image_folder) if f.endswith(('.png', '.jpg', '.jpeg'))]
63
+ lock = threading.Lock()
64
+
65
+ # Initialize annotation counts
66
+ annotation_counts = {}
67
+ for example in dataset:
68
+ image_id = example["image_id"]
69
+ count = example["annotation_count"]
70
+ annotation_counts[image_id] = count
 
 
 
 
 
 
 
71
 
72
  def get_caption_for_image_id(image_path):
73
  """
 
91
  print("Caption not found for image_id:", image_id)
92
  return None
93
 
 
94
  # Function to get a random image that hasn’t been fully annotated
95
  def get_next_image(session_data):
96
  with lock:
 
107
 
108
  return os.path.join(image_folder, session_data["current_image"]) if session_data["current_image"] else None
109
 
 
110
  # Function to save the annotation to Hugging Face dataset and fetch the next image
111
  def save_annotation(caption, session_data):
112
  global dataset, annotation_counts
 
130
  "caption": [caption],
131
  "annotation_count": [annotation_count + 1],
132
  "split": [split_type]
133
+ }, features=Features({
134
+ 'image_id': Value(dtype='string'),
135
+ 'caption': Value(dtype='string'),
136
+ 'annotation_count': Value(dtype='int32'),
137
+ 'split': Value(dtype='string')
138
+ }))
139
 
140
  annotation_counts[image_id] = annotation_count + 1
141
 
 
154
  else:
155
  return gr.update(visible=False), gr.update(value="All images have been annotated!"), gr.update(value="")
156
 
 
157
  def initialize_interface(session_data):
158
  next_image = get_next_image(session_data)
159
  if next_image:
 
163
  else:
164
  return gr.update(visible=False), gr.update(value="All images have been annotated!")
165
 
 
166
  with gr.Blocks() as demo:
167
  gr.Markdown("# Image Captioning Tool")
168
  gr.Markdown("Please provide your caption in Egyptian Arabic 'Masri'")