Spaces:
Sleeping
Sleeping
Commit
·
bd6fc64
1
Parent(s):
54a81cc
updates
Browse files
app.py
CHANGED
|
@@ -15,7 +15,7 @@ else:
|
|
| 15 |
print("HUGGINGFACE_TOKEN environment variable not set.")
|
| 16 |
dataset_name = "GeorgeIbrahim/EGYCOCO" # Replace with your dataset name
|
| 17 |
|
| 18 |
-
# Load or create the dataset
|
| 19 |
try:
|
| 20 |
dataset = load_dataset(dataset_name, split="train")
|
| 21 |
print("Loaded existing dataset:", dataset)
|
|
@@ -34,9 +34,10 @@ except Exception as e:
|
|
| 34 |
features = Features({
|
| 35 |
'image_id': Value(dtype='string'),
|
| 36 |
'caption': Value(dtype='string'),
|
| 37 |
-
'annotation_count': Value(dtype='int32') # Add annotation count feature
|
|
|
|
| 38 |
})
|
| 39 |
-
dataset = Dataset.from_dict({'image_id': [], 'caption': [], 'annotation_count': []}, features=features)
|
| 40 |
annotation_counts = {}
|
| 41 |
dataset.push_to_hub(dataset_name) # Push the empty dataset to Hugging Face
|
| 42 |
|
|
@@ -48,12 +49,12 @@ lock = threading.Lock()
|
|
| 48 |
with open('nearest_neighbors_with_captions.json', 'r') as f:
|
| 49 |
results = json.load(f)
|
| 50 |
|
| 51 |
-
# Populate the annotation counts based on validation and neighbor distinction
|
| 52 |
for image_id, data in results.items():
|
| 53 |
-
# Mark validation image to require two annotations
|
| 54 |
annotation_counts[image_id] = annotation_counts.get(image_id, 0)
|
| 55 |
|
| 56 |
-
# Mark each nearest neighbor to require only one annotation
|
| 57 |
for neighbor in data["nearest_neighbors"]:
|
| 58 |
neighbor_id = neighbor["image_id"]
|
| 59 |
annotation_counts[neighbor_id] = annotation_counts.get(neighbor_id, 0)
|
|
@@ -113,14 +114,19 @@ def save_annotation(caption, session_data):
|
|
| 113 |
|
| 114 |
annotation_count = annotation_counts.get(image_id, 0)
|
| 115 |
|
|
|
|
|
|
|
|
|
|
| 116 |
new_data = Dataset.from_dict({
|
| 117 |
"image_id": [image_id],
|
| 118 |
"caption": [caption],
|
| 119 |
-
"annotation_count": [annotation_count + 1]
|
|
|
|
| 120 |
}, features=Features({
|
| 121 |
'image_id': Value(dtype='string'),
|
| 122 |
'caption': Value(dtype='string'),
|
| 123 |
-
'annotation_count': Value(dtype='int32')
|
|
|
|
| 124 |
}))
|
| 125 |
|
| 126 |
annotation_counts[image_id] = annotation_count + 1
|
|
|
|
| 15 |
print("HUGGINGFACE_TOKEN environment variable not set.")
|
| 16 |
dataset_name = "GeorgeIbrahim/EGYCOCO" # Replace with your dataset name
|
| 17 |
|
| 18 |
+
# Load or create the dataset with a new 'split' column
|
| 19 |
try:
|
| 20 |
dataset = load_dataset(dataset_name, split="train")
|
| 21 |
print("Loaded existing dataset:", dataset)
|
|
|
|
| 34 |
features = Features({
|
| 35 |
'image_id': Value(dtype='string'),
|
| 36 |
'caption': Value(dtype='string'),
|
| 37 |
+
'annotation_count': Value(dtype='int32'), # Add annotation count feature
|
| 38 |
+
'split': Value(dtype='string') # Add split column to mark as "dev" or "train"
|
| 39 |
})
|
| 40 |
+
dataset = Dataset.from_dict({'image_id': [], 'caption': [], 'annotation_count': [], 'split': []}, features=features)
|
| 41 |
annotation_counts = {}
|
| 42 |
dataset.push_to_hub(dataset_name) # Push the empty dataset to Hugging Face
|
| 43 |
|
|
|
|
| 49 |
with open('nearest_neighbors_with_captions.json', 'r') as f:
|
| 50 |
results = json.load(f)
|
| 51 |
|
| 52 |
+
# Initialize annotation counts for every validation image and its nearest neighbors; the split type ("dev" or "train") is not stored here but derived at save time from membership in `results`
|
| 53 |
for image_id, data in results.items():
|
| 54 |
+
# Validation image (intended to receive two annotations): start its count at 0 if not already tracked; its "dev" split is assigned when an annotation is saved
|
| 55 |
annotation_counts[image_id] = annotation_counts.get(image_id, 0)
|
| 56 |
|
| 57 |
+
# Nearest neighbors (intended to receive one annotation each): start each count at 0 if not already tracked; the "train" split is assigned when an annotation is saved
|
| 58 |
for neighbor in data["nearest_neighbors"]:
|
| 59 |
neighbor_id = neighbor["image_id"]
|
| 60 |
annotation_counts[neighbor_id] = annotation_counts.get(neighbor_id, 0)
|
|
|
|
| 114 |
|
| 115 |
annotation_count = annotation_counts.get(image_id, 0)
|
| 116 |
|
| 117 |
+
# Determine the split type based on whether it's in the validation set
|
| 118 |
+
split_type = "dev" if image_id in results else "train"
|
| 119 |
+
|
| 120 |
new_data = Dataset.from_dict({
|
| 121 |
"image_id": [image_id],
|
| 122 |
"caption": [caption],
|
| 123 |
+
"annotation_count": [annotation_count + 1],
|
| 124 |
+
"split": [split_type]
|
| 125 |
}, features=Features({
|
| 126 |
'image_id': Value(dtype='string'),
|
| 127 |
'caption': Value(dtype='string'),
|
| 128 |
+
'annotation_count': Value(dtype='int32'),
|
| 129 |
+
'split': Value(dtype='string')
|
| 130 |
}))
|
| 131 |
|
| 132 |
annotation_counts[image_id] = annotation_count + 1
|