GeorgeIbrahim committed on
Commit
bd6fc64
·
1 Parent(s): 54a81cc
Files changed (1) hide show
  1. app.py +14 -8
app.py CHANGED
@@ -15,7 +15,7 @@ else:
15
  print("HUGGINGFACE_TOKEN environment variable not set.")
16
  dataset_name = "GeorgeIbrahim/EGYCOCO" # Replace with your dataset name
17
 
18
- # Load or create the dataset
19
  try:
20
  dataset = load_dataset(dataset_name, split="train")
21
  print("Loaded existing dataset:", dataset)
@@ -34,9 +34,10 @@ except Exception as e:
34
  features = Features({
35
  'image_id': Value(dtype='string'),
36
  'caption': Value(dtype='string'),
37
- 'annotation_count': Value(dtype='int32') # Add annotation count feature
 
38
  })
39
- dataset = Dataset.from_dict({'image_id': [], 'caption': [], 'annotation_count': []}, features=features)
40
  annotation_counts = {}
41
  dataset.push_to_hub(dataset_name) # Push the empty dataset to Hugging Face
42
 
@@ -48,12 +49,12 @@ lock = threading.Lock()
48
  with open('nearest_neighbors_with_captions.json', 'r') as f:
49
  results = json.load(f)
50
 
51
- # Populate the annotation counts based on validation and neighbor distinction
52
  for image_id, data in results.items():
53
- # Mark validation image to require two annotations
54
  annotation_counts[image_id] = annotation_counts.get(image_id, 0)
55
 
56
- # Mark each nearest neighbor to require only one annotation
57
  for neighbor in data["nearest_neighbors"]:
58
  neighbor_id = neighbor["image_id"]
59
  annotation_counts[neighbor_id] = annotation_counts.get(neighbor_id, 0)
@@ -113,14 +114,19 @@ def save_annotation(caption, session_data):
113
 
114
  annotation_count = annotation_counts.get(image_id, 0)
115
 
 
 
 
116
  new_data = Dataset.from_dict({
117
  "image_id": [image_id],
118
  "caption": [caption],
119
- "annotation_count": [annotation_count + 1]
 
120
  }, features=Features({
121
  'image_id': Value(dtype='string'),
122
  'caption': Value(dtype='string'),
123
- 'annotation_count': Value(dtype='int32')
 
124
  }))
125
 
126
  annotation_counts[image_id] = annotation_count + 1
 
15
  print("HUGGINGFACE_TOKEN environment variable not set.")
16
  dataset_name = "GeorgeIbrahim/EGYCOCO" # Replace with your dataset name
17
 
18
+ # Load or create the dataset with a new 'split' column
19
  try:
20
  dataset = load_dataset(dataset_name, split="train")
21
  print("Loaded existing dataset:", dataset)
 
34
  features = Features({
35
  'image_id': Value(dtype='string'),
36
  'caption': Value(dtype='string'),
37
+ 'annotation_count': Value(dtype='int32'), # Add annotation count feature
38
+ 'split': Value(dtype='string') # Add split column to mark as "dev" or "train"
39
  })
40
+ dataset = Dataset.from_dict({'image_id': [], 'caption': [], 'annotation_count': [], 'split': []}, features=features)
41
  annotation_counts = {}
42
  dataset.push_to_hub(dataset_name) # Push the empty dataset to Hugging Face
43
 
 
49
  with open('nearest_neighbors_with_captions.json', 'r') as f:
50
  results = json.load(f)
51
 
52
+ # Populate the annotation counts and set split types based on validation and neighbor distinction
53
  for image_id, data in results.items():
54
+ # Mark validation image to require two annotations and set split as "dev"
55
  annotation_counts[image_id] = annotation_counts.get(image_id, 0)
56
 
57
+ # Mark each nearest neighbor to require only one annotation and set split as "train"
58
  for neighbor in data["nearest_neighbors"]:
59
  neighbor_id = neighbor["image_id"]
60
  annotation_counts[neighbor_id] = annotation_counts.get(neighbor_id, 0)
 
114
 
115
  annotation_count = annotation_counts.get(image_id, 0)
116
 
117
+ # Determine the split type based on whether it's in the validation set
118
+ split_type = "dev" if image_id in results else "train"
119
+
120
  new_data = Dataset.from_dict({
121
  "image_id": [image_id],
122
  "caption": [caption],
123
+ "annotation_count": [annotation_count + 1],
124
+ "split": [split_type]
125
  }, features=Features({
126
  'image_id': Value(dtype='string'),
127
  'caption': Value(dtype='string'),
128
+ 'annotation_count': Value(dtype='int32'),
129
+ 'split': Value(dtype='string')
130
  }))
131
 
132
  annotation_counts[image_id] = annotation_count + 1