GeorgeIbrahim committed on
Commit
7971cd8
·
1 Parent(s): c5ec5d8
Files changed (1) hide show
  1. app.py +19 -4
app.py CHANGED
@@ -22,7 +22,22 @@ with open('nearest_neighbors_with_captions.json', 'r') as f:
22
  try:
23
  dataset = load_dataset(dataset_name, split="train")
24
  print("Loaded existing dataset:", dataset)
 
 
 
 
 
 
 
 
 
25
 
 
 
 
 
 
 
26
  # Create a dictionary to keep track of the highest annotation count for each image
27
  annotation_counts = {}
28
  for example in dataset:
@@ -40,9 +55,9 @@ except Exception as e:
40
  'image_id': Value(dtype='string'),
41
  'caption': Value(dtype='string'),
42
  'annotation_count': Value(dtype='int32'),
43
- 'spl': Value(dtype='string')
44
  })
45
- dataset = Dataset.from_dict({'image_id': [], 'caption': [], 'annotation_count': [], 'spl': []}, features=features)
46
  annotation_counts = {}
47
  dataset.push_to_hub(dataset_name) # Push the empty dataset to Hugging Face
48
 
@@ -122,12 +137,12 @@ def save_annotation(caption, session_data):
122
  "image_id": [image_id],
123
  "caption": [caption],
124
  "annotation_count": [annotation_count + 1],
125
- "spl": [split]
126
  }, features=Features({
127
  'image_id': Value(dtype='string'),
128
  'caption': Value(dtype='string'),
129
  'annotation_count': Value(dtype='int32'),
130
- 'spl': Value(dtype='string')
131
  }))
132
 
133
  # Update the annotation count in the dictionary
 
22
  try:
23
  dataset = load_dataset(dataset_name, split="train")
24
  print("Loaded existing dataset:", dataset)
25
+ print("Dataset features:", dataset.features) # Check if 'split' is part of features
26
+
27
+ # Check if the 'split' column exists; if not, add it
28
+ if 'split' not in dataset.column_names:
29
+ # Define the 'split' values based on `image_id`
30
+ split_values = [
31
+ "dev" if example["image_id"] in results else "train"
32
+ for example in dataset
33
+ ]
34
 
35
+ # Add 'split' column to the dataset
36
+ dataset = dataset.add_column("split", split_values)
37
+ print("Added 'split' column to dataset.")
38
+ else:
39
+ print("'split' column already exists.")
40
+
41
  # Create a dictionary to keep track of the highest annotation count for each image
42
  annotation_counts = {}
43
  for example in dataset:
 
55
  'image_id': Value(dtype='string'),
56
  'caption': Value(dtype='string'),
57
  'annotation_count': Value(dtype='int32'),
58
+ 'split': Value(dtype='string')
59
  })
60
+ dataset = Dataset.from_dict({'image_id': [], 'caption': [], 'annotation_count': [], 'split': []}, features=features)
61
  annotation_counts = {}
62
  dataset.push_to_hub(dataset_name) # Push the empty dataset to Hugging Face
63
 
 
137
  "image_id": [image_id],
138
  "caption": [caption],
139
  "annotation_count": [annotation_count + 1],
140
+ "split": [split]
141
  }, features=Features({
142
  'image_id': Value(dtype='string'),
143
  'caption': Value(dtype='string'),
144
  'annotation_count': Value(dtype='int32'),
145
+ 'split': Value(dtype='string')
146
  }))
147
 
148
  # Update the annotation count in the dictionary