alyex committed on
Commit
93de1ca
·
verified ·
1 Parent(s): 00721fe

hieroglyph_annotator_gradio.py

Browse files
Files changed (1) hide show
  1. app.py +853 -0
app.py ADDED
@@ -0,0 +1,853 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import os
4
+ import pickle
5
+ from PIL import Image
6
+ import shutil
7
+ import time
8
+ import datasets
9
+ import tempfile
10
+ from pathlib import Path
11
+ import hashlib
12
+
13
# Hugging Face Spaces configuration.
# The access token comes from the environment (configure it under Space Secrets).
HF_TOKEN = os.environ.get("HF_TOKEN")
HF_DATASET_REPO = "alyex/karnak_data"
HF_CACHE_DIR = os.path.join(tempfile.gettempdir(), "karnak_cache")
HF_ANNOTATIONS_DIR = "/data/annotations"  # persistent storage in Spaces
HF_HARD_NEGATIVES_DIR = "/data/hard_negatives"  # persistent storage

# Create every working directory up front so later writes cannot fail on a
# missing parent.
for _required_dir in (HF_CACHE_DIR, HF_ANNOTATIONS_DIR, HF_HARD_NEGATIVES_DIR):
    os.makedirs(_required_dir, exist_ok=True)

# Pickled manifest cache lives alongside the other cached files.
MANIFEST_FILE = os.path.join(HF_CACHE_DIR, "manifest.pkl")
27
+
28
# Load kiu_segment_metadata.json. The Hugging Face dataset repo is tried first
# (with the token when one is configured); on any failure a copy in the
# working directory is used instead. KIU_SEGMENT_METADATA stays an empty dict
# when neither source is available.
KIU_SEGMENT_METADATA = {}
try:
    from huggingface_hub import hf_hub_download

    _download_kwargs = {
        "repo_id": HF_DATASET_REPO,
        "filename": "kiu_segment_metadata.json",
        # HF_DATASET_REPO is loaded with datasets.load_dataset elsewhere in
        # this file, so it is a dataset repo; hf_hub_download defaults to
        # model repos unless told otherwise.
        "repo_type": "dataset",
        "local_dir": HF_CACHE_DIR,
    }
    if HF_TOKEN:
        _download_kwargs["token"] = HF_TOKEN

    METADATA_FILE = hf_hub_download(**_download_kwargs)
    with open(METADATA_FILE, 'r', encoding='utf-8') as f:
        KIU_SEGMENT_METADATA = json.load(f)
except Exception as e:
    # Startup must not abort because metadata is missing: report and fall back.
    print(f"Could not load metadata from HF: {e}")
    # Fallback to local file if in repo
    try:
        with open("kiu_segment_metadata.json", 'r', encoding='utf-8') as f:
            KIU_SEGMENT_METADATA = json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError for a corrupt local file.
        print("Warning: Could not load kiu_segment_metadata.json")
63
+
64
class AnnotationState:
    """Annotation session state: dataset handle, manifest, cursor and history.

    Constructing an instance immediately loads the Hugging Face dataset and
    then the manifest (from the pickle cache when it exists, otherwise built
    from the dataset).
    """

    def __init__(self):
        self.manifest = []           # one dict per 'Line' instance
        self.current_index = 0       # cursor into self.manifest
        self.history = []            # undo snapshots, appended by callers
        self.dataset = None          # result of datasets.load_dataset, or None
        self.dataset_cache = {}      # not used by the methods of this class
        self.annotations_cache = {}  # not used by the methods of this class

        # Load dataset
        self.load_dataset()
        # Load manifest
        self.load_manifest()

    def load_dataset(self):
        """Load the dataset named by HF_DATASET_REPO into ``self.dataset``.

        Streaming mode is tried first; if that raises, a regular load is
        attempted. When both fail the error is printed and ``self.dataset``
        is set to None.
        """
        try:
            print("Loading dataset from Hugging Face...")
            # NOTE(review): trust_remote_code=True allows the dataset repo to
            # run its own loading script; keep only for repos you control.
            load_kwargs = {
                'path': HF_DATASET_REPO,
                'trust_remote_code': True,
            }

            # Only add token if available
            if HF_TOKEN:
                load_kwargs['token'] = HF_TOKEN
                print("Using HF_TOKEN for authentication")

            # Try loading with streaming first (more memory efficient)
            try:
                self.dataset = datasets.load_dataset(streaming=True, **load_kwargs)
                print("✅ Dataset loaded with streaming")
            except Exception as e:
                print(f"Streaming failed, trying without: {e}")
                self.dataset = datasets.load_dataset(**load_kwargs)
                print("✅ Dataset loaded without streaming")

        except Exception as e:
            print(f"❌ Error loading dataset: {e}")
            print("💡 Make sure HF_TOKEN is set in Space secrets if dataset is private")
            self.dataset = None

    def load_manifest(self):
        """Load the manifest from MANIFEST_FILE, or build it from the dataset.

        A cache that cannot be read is discarded and the manifest is rebuilt.
        """
        if not os.path.exists(MANIFEST_FILE):
            print("❌ Manifest not found. Building from dataset...")
            self.build_manifest_from_dataset()
            return

        try:
            # NOTE(review): pickle.load executes arbitrary code from the file;
            # this is only acceptable because the cache is written by this app.
            with open(MANIFEST_FILE, 'rb') as f:
                self.manifest = pickle.load(f)
            print(f"✅ Loaded {len(self.manifest)} instances from cache")

            # Find first unannotated
            self.current_index = self.find_first_unannotated()
            print(f"✅ Starting at index {self.current_index}")
        except Exception as e:
            print(f"❌ Error loading manifest: {e}")
            self.manifest = []
            self.build_manifest_from_dataset()

    def build_manifest_from_dataset(self):
        """Build ``self.manifest`` from the loaded dataset and cache it.

        The 'train' split is used when present, otherwise the first split.
        Each dataset item contributes one manifest entry per instance whose
        'class' is 'Line'; an item without an 'instances' list is treated as a
        single 'Line' instance built from the item's own fields. Errors are
        printed (with traceback) rather than raised.
        """
        if not self.dataset:
            print("❌ No dataset available")
            return

        print("Building manifest from dataset...")
        self.manifest = []

        try:
            # Try to load train split first
            splits = list(self.dataset.keys())
            split = 'train' if 'train' in splits else splits[0] if splits else None

            if not split:
                print("❌ No splits found in dataset")
                return

            print(f"Using split: {split}")

            # Peek at the first item to report the dataset structure
            sample_item = next(iter(self.dataset[split]), None)
            if not sample_item:
                print("❌ Dataset is empty")
                return

            print(f"Sample item keys: {list(sample_item.keys())}")

            # Process dataset
            for idx, item in enumerate(self.dataset[split]):
                instances = item.get('instances', [])
                kiu_id = item.get('kiu_id', idx)

                # If no instances key, create a single instance
                if not instances:
                    instances = [{
                        'instance_id': f"instance_{idx}",
                        'class': 'Line',
                        'crop_coords': item.get('crop_coords', [0, 0, 100, 100]),
                        'direction': item.get('direction', ''),
                        'annotated': item.get('annotated', False),
                        'line_number': item.get('line_number'),
                    }]

                for instance in instances:
                    if instance.get('class') != 'Line':
                        continue
                    self.manifest.append({
                        'kiu_id': kiu_id,
                        'instance_id': instance.get('instance_id', f"instance_{len(self.manifest)}"),
                        'crop_coords': instance.get('crop_coords', [0, 0, 100, 100]),
                        'direction': instance.get('direction', ''),
                        'annotated': instance.get('annotated', False),
                        'line_number': instance.get('line_number'),
                        'dataset_item': item,  # Store full dataset item
                        'split': split,
                        'idx': len(self.manifest),
                    })

            print(f"✅ Built manifest with {len(self.manifest)} instances")

            # Save manifest to cache
            with open(MANIFEST_FILE, 'wb') as f:
                pickle.dump(self.manifest, f)

            self.current_index = self.find_first_unannotated()

        except Exception as e:
            print(f"❌ Error building manifest: {e}")
            import traceback
            traceback.print_exc()

    def needs_annotation(self, item):
        """Return True if ``item`` is unannotated or flagged for TTB reclassification."""
        if not item.get('annotated', False):
            return True
        return item.get('direction') == 'TTB_NEEDS_RECLASSIFICATION'

    def find_first_unannotated(self):
        """Return the index of the first item needing annotation, or 0 if none."""
        return next(
            (idx for idx, item in enumerate(self.manifest) if self.needs_annotation(item)),
            0,
        )

    def find_next_unannotated(self, from_index=None):
        """Return the next index after ``from_index`` that needs annotation.

        The search starts after ``from_index`` (default: the current index),
        wraps around to the beginning, and returns the start index itself when
        nothing else needs annotation. Returns 0 for an empty manifest.
        """
        if not self.manifest:
            return 0

        start_idx = from_index if from_index is not None else self.current_index

        # Search forward
        for idx in range(start_idx + 1, len(self.manifest)):
            if self.needs_annotation(self.manifest[idx]):
                return idx

        # Wrap around
        for idx in range(0, start_idx):
            if self.needs_annotation(self.manifest[idx]):
                return idx

        return start_idx

    def get_statistics(self):
        """Return progress totals and per-direction counts for the manifest."""
        from collections import Counter  # local import: not in the module header

        if not self.manifest:
            return {'total': 0, 'remaining': 0, 'processed': 0, 'progress_pct': 0}

        total = len(self.manifest)
        remaining = sum(1 for item in self.manifest if self.needs_annotation(item))
        processed = total - remaining

        # One pass over the manifest instead of one pass per direction label.
        directions = Counter(
            direction
            for direction in (item.get('direction') for item in self.manifest)
            if isinstance(direction, str)
        )

        return {
            'total': total,
            'remaining': remaining,
            'processed': processed,
            'progress_pct': processed / total * 100,
            'ltr': directions['LTR'],
            'rtl': directions['RTL'],
            'ttb_ltr': directions['TTB_LTR'],
            'ttb_rtl': directions['TTB_RTL'],
            'skip': directions['Skip'],
            'unclear': directions['Unclear'],
            'ttb_reclass': directions['TTB_NEEDS_RECLASSIFICATION'],
            'hard_negative': directions['HardNegative'],
        }

    def get_annotation_file_path(self, item):
        """Return the JSON annotation path for ``item`` under HF_ANNOTATIONS_DIR."""
        kiu_id = str(item['kiu_id']).zfill(5)
        instance_id = item['instance_id']

        # Create a unique filename
        filename = f"KIU_{kiu_id}_instance_{instance_id}.json"
        return os.path.join(HF_ANNOTATIONS_DIR, filename)

    @staticmethod
    def _data_hash(item):
        """Return an 8-character MD5 prefix of the item's 'dataset_item' payload.

        Values that JSON cannot encode (images, bytes, ...) are replaced by
        their type name so hashing never raises on them and stays stable
        between runs.
        """
        payload = json.dumps(
            item.get('dataset_item', {}),
            sort_keys=True,
            default=lambda obj: type(obj).__name__,
        )
        return hashlib.md5(payload.encode()).hexdigest()[:8]

    def load_annotation(self, item):
        """Return the stored annotation for ``item``, or defaults from the manifest.

        An unreadable or corrupt annotation file is reported and treated as
        missing.
        """
        annotation_file = self.get_annotation_file_path(item)

        if os.path.exists(annotation_file):
            try:
                with open(annotation_file, 'r', encoding='utf-8') as f:
                    return json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers json.JSONDecodeError
                print(f"Could not read annotation {annotation_file}: {e}")

        # Return default if not exists
        return {
            'kiu_id': item['kiu_id'],
            'instance_id': item['instance_id'],
            'direction': item.get('direction', ''),
            'annotated': item.get('annotated', False),
            'line_number': item.get('line_number'),
            'original_data_hash': self._data_hash(item),
        }

    def save_annotation(self, item, direction, line_number):
        """Write the annotation for ``item`` as JSON and return the file path.

        ``line_number`` is stored as given when it is non-blank, otherwise as
        None.
        """
        annotation_file = self.get_annotation_file_path(item)

        # str() so a numeric line number does not crash on .strip()
        has_line_number = bool(line_number) and bool(str(line_number).strip())

        annotation_data = {
            'kiu_id': item['kiu_id'],
            'instance_id': item['instance_id'],
            'direction': direction,
            'annotated': True,
            'line_number': line_number if has_line_number else None,
            'timestamp': time.time(),
            'original_data_hash': self._data_hash(item),
        }

        with open(annotation_file, 'w', encoding='utf-8') as f:
            json.dump(annotation_data, f, indent=2, ensure_ascii=False)

        return annotation_file
328
+
329
+ # Global state
330
+ state = AnnotationState()
331
+
332
def get_artifact_image(kiu_id, crop_coords):
    """Return the cropped image for the manifest item at the current index.

    Args:
        kiu_id: Accepted for the callers' benefit but not used for lookup;
            the image always comes from ``state.manifest[state.current_index]``.
        crop_coords: Crop box handed to ``crop_and_resize``.

    Returns:
        None when no dataset is loaded. Otherwise the result of
        ``crop_and_resize`` for the first image found in the item's
        'dataset_item' (which may itself be None for an invalid crop box), or
        a placeholder image when the item holds no usable image.
    """
    import io  # local import: the module header does not import io

    if not state.dataset:
        return None

    try:
        # Get current item from manifest
        if state.manifest and state.current_index < len(state.manifest):
            item = state.manifest[state.current_index]
            dataset_item = item.get('dataset_item', {})

            # Preferred location: a PIL image under the 'image' key
            img = dataset_item.get('image') if 'image' in dataset_item else None
            if isinstance(img, Image.Image):
                return crop_and_resize(img, crop_coords)

            # Otherwise accept a PIL image stored under any other key
            for value in dataset_item.values():
                if isinstance(value, Image.Image):
                    return crop_and_resize(value, crop_coords)

            # Raw encoded bytes
            if 'image_bytes' in dataset_item:
                try:
                    img = Image.open(io.BytesIO(dataset_item['image_bytes']))
                    return crop_and_resize(img, crop_coords)
                except Exception as e:
                    # Undecodable bytes: report and keep looking
                    print(f"Error decoding image bytes: {e}")

            # Try to get from images dict
            if 'images' in dataset_item and isinstance(dataset_item['images'], dict):
                for img in dataset_item['images'].values():
                    if isinstance(img, Image.Image):
                        return crop_and_resize(img, crop_coords)

    except Exception as e:
        print(f"Error getting artifact image: {e}")

    # Return a placeholder if no image found
    return create_placeholder_image(crop_coords)
373
+
374
def crop_and_resize(img, crop_coords):
    """Crop ``img`` to ``crop_coords`` and cap the result at 800 px tall.

    The box ``(x1, y1, x2, y2)`` is clamped to the image bounds. A crop taller
    than 800 px is scaled down to exactly 800 px high, keeping aspect ratio.
    Returns None for an empty box or when any step raises (the error is
    printed).
    """
    max_height = 800
    try:
        left, top, right, bottom = (int(value) for value in crop_coords)
        left, top = max(0, left), max(0, top)
        right, bottom = min(img.width, right), min(img.height, bottom)

        # Nothing left to crop once the box is clamped to the image
        if right <= left or bottom <= top:
            return None

        region = img.crop((left, top, right, bottom))
        if region.height > max_height:
            scale = max_height / region.height
            scaled_width = int(region.width * scale)
            region = region.resize((scaled_width, max_height), Image.LANCZOS)
        return region
    except Exception as e:
        print(f"Error cropping image: {e}")
    return None
391
+
392
def create_placeholder_image(crop_coords):
    """Create a grey "Image not available" placeholder sized like the crop box.

    Args:
        crop_coords: ``(x1, y1, x2, y2)``; the placeholder is at least
            100x100 pixels.

    Returns:
        A PIL image, or None when the coordinates are invalid or drawing
        fails (the error is printed).
    """
    try:
        x1, y1, x2, y2 = map(int, crop_coords)
        width = max(100, x2 - x1)
        height = max(100, y2 - y1)

        from PIL import ImageDraw, ImageFont

        # Create a simple placeholder
        img = Image.new('RGB', (width, height), color=(240, 240, 240))
        draw = ImageDraw.Draw(img)

        # Arial is not installed everywhere; fall back to Pillow's built-in font
        try:
            font = ImageFont.truetype("arial.ttf", 20)
        except (OSError, ImportError):
            font = ImageFont.load_default()

        text = "Image not available"
        text_bbox = draw.textbbox((0, 0), text, font=font)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]

        # Centre the text
        position = ((width - text_width) // 2, (height - text_height) // 2)
        draw.text(position, text, fill=(100, 100, 100), font=font)

        return img
    except Exception as e:
        # Best effort: a missing placeholder must not break the UI callback
        print(f"Error creating placeholder image: {e}")
        return None
423
+
424
def load_current_instance():
    """Build the four UI values for the manifest item at the current index.

    Returns:
        ``(image, reference_html, info_markdown, line_number_text)``. When
        there is no item at the current index the tuple is
        ``(None, "", "No items available", "")``.
    """
    has_item = bool(state.manifest) and state.current_index < len(state.manifest)
    if not has_item:
        return None, "", "No items available", ""

    item = state.manifest[state.current_index]
    kiu_id = str(item['kiu_id'])
    padded_kiu = kiu_id.zfill(5)

    # Image first, so any loading messages are printed before the rest
    cropped = get_artifact_image(kiu_id, item['crop_coords'])

    # Reference panel
    html_content = f"""
<div style="padding: 10px; background: #f5f5f5; border-radius: 5px;">
<strong>KIU {padded_kiu}</strong> -
<span style="color: #0066cc;">Instance {item['instance_id']}</span>
<br>
<small>Coordinates: {item['crop_coords']}</small>
</div>
"""

    # Stored annotation (or manifest defaults) and overall progress
    annotation = state.load_annotation(item)
    direction = annotation.get('direction', '')
    annotated = annotation.get('annotated', False)
    stats = state.get_statistics()
    line_num = annotation.get('line_number')

    # Labels for directions that are not shown as a plain check mark
    special_labels = (
        ('TTB_NEEDS_RECLASSIFICATION', "⚠️ NEEDS RECLASS"),
        ('Skip', "⏭️ Skipped"),
        ('Unclear', "❓ Unclear"),
        ('HardNegative', "🚫 Hard Negative"),
    )
    if not annotated:
        status = "⏳ Pending"
    else:
        status = next(
            (label for key, label in special_labels if direction == key),
            f"✅ {direction}",
        )

    line_info = ""
    if line_num:
        line_info = f" | Line #{line_num}"

    info = f"""
**{state.current_index + 1} / {stats['total']}** | KIU {padded_kiu} | Instance {item['instance_id']}{line_info} | {status}

**Progress:** {stats['processed']}/{stats['total']} ({stats['progress_pct']:.1f}%) | **Left:** {stats['remaining']}

**Counts:** LTR: {stats['ltr']} | RTL: {stats['rtl']} | TTB+LTR: {stats['ttb_ltr']} | TTB+RTL: {stats['ttb_rtl']}
Skipped: {stats['skip']} | Unclear: {stats['unclear']} | Needs Reclass: {stats['ttb_reclass']} | Hard Negatives: {stats['hard_negative']}
"""

    line_number_value = str(line_num) if line_num else ""

    return cropped, html_content, info, line_number_value
485
+
486
def save_annotation_to_state(direction, line_number):
    """Persist an annotation for the current item and update the manifest.

    Args:
        direction: Direction label to store.
        line_number: Optional line number from the UI; a numeric value is
            also stored on the manifest item as an int.

    Returns:
        True when the annotation file and the manifest cache were written,
        False when there is no current item or any step failed (the error is
        printed).
    """
    if not state.manifest or state.current_index >= len(state.manifest):
        return False

    item = state.manifest[state.current_index]

    try:
        # Save to persistent storage
        annotation_file = state.save_annotation(item, direction, line_number)

        # Update manifest
        item['direction'] = direction
        item['annotated'] = True

        # str() so a numeric line number does not crash on .strip()
        line_text = str(line_number).strip() if line_number is not None else ""
        if line_text:
            try:
                item['line_number'] = int(line_text)
            except ValueError:
                # Non-numeric text stays in the JSON annotation only
                pass

        # Write the cache to a temporary file and swap it in, so an
        # interrupted write cannot leave a truncated manifest behind.
        tmp_manifest = MANIFEST_FILE + ".tmp"
        with open(tmp_manifest, 'wb') as f:
            pickle.dump(state.manifest, f)
        os.replace(tmp_manifest, MANIFEST_FILE)

        print(f"✅ Annotation saved to {annotation_file}")
        return True
    except Exception as e:
        print(f"Error saving: {e}")
        return False
515
+
516
def annotate_and_next(direction, line_number):
    """Annotate the current item and move to the next one needing annotation.

    The pre-annotation state of the item is pushed onto ``state.history``
    only when the save succeeded. On a failed save the cursor stays on the
    current item, so the annotation is not silently lost.

    Returns:
        The tuple produced by ``load_current_instance`` or, when there is no
        current item, ``(None, "", "No items available", "")``.
    """
    if not state.manifest or state.current_index >= len(state.manifest):
        return None, "", "No items available", ""

    # Snapshot before saving: the save mutates the manifest item in place
    snapshot = {
        'index': state.current_index,
        'item': state.manifest[state.current_index].copy(),
    }

    if save_annotation_to_state(direction, line_number):
        state.history.append(snapshot)
        # Jump to next unannotated
        state.current_index = state.find_next_unannotated()

    return load_current_instance()
535
+
536
def undo_last():
    """Revert the most recent history entry and show that item again.

    Pops the last snapshot, moves the cursor back to its index, deletes the
    item's annotation file if present, restores the manifest entry from the
    snapshot and rewrites the manifest cache. Errors while reverting are
    printed. With no manifest or no history, only the current item is
    re-rendered.
    """
    if state.manifest and state.history:
        entry = state.history.pop()
        restored_index = entry['index']
        state.current_index = restored_index

        current = state.manifest[restored_index]

        try:
            # Remove the annotation written by the step being undone
            path = state.get_annotation_file_path(current)
            if os.path.exists(path):
                os.remove(path)

            # Put the snapshot back and persist the manifest
            state.manifest[restored_index] = entry['item']
            with open(MANIFEST_FILE, 'wb') as cache:
                pickle.dump(state.manifest, cache)

        except Exception as exc:
            print(f"Error undoing: {exc}")

    return load_current_instance()
564
+
565
def flag_hard_negative(line_number):
    """Flag the current item as a hard negative and move to the next item.

    Writes the crop (as JPEG) and a metadata.json into a per-instance
    directory under HF_HARD_NEGATIVES_DIR, records the item as
    'HardNegative', then advances to the next item needing annotation.
    Errors are printed; the cursor still advances.

    Args:
        line_number: Optional line number forwarded to the annotation.

    Returns:
        The tuple produced by ``load_current_instance``.
    """
    if not state.manifest or state.current_index >= len(state.manifest):
        return load_current_instance()

    item = state.manifest[state.current_index]

    try:
        # Create hard negative directory
        instance_dir_name = f"KIU_{str(item['kiu_id']).zfill(5)}_instance_{item['instance_id']}"
        instance_dir = os.path.join(HF_HARD_NEGATIVES_DIR, instance_dir_name)
        os.makedirs(instance_dir, exist_ok=True)

        # Get artifact image
        cropped = get_artifact_image(item['kiu_id'], item['crop_coords'])
        if cropped is not None:
            # JPEG cannot store alpha or palette modes; without this an RGBA
            # or P image raises and aborts the whole flag operation.
            if cropped.mode != "RGB":
                cropped = cropped.convert("RGB")
            cropped.save(os.path.join(instance_dir, "hard_negative.jpg"), "JPEG")

        # Save metadata
        metadata = {
            'kiu_id': item['kiu_id'],
            'instance_id': item['instance_id'],
            'crop_coords': item['crop_coords'],
            'flagged_as_hard_negative': True,
            'timestamp': time.time(),
        }

        with open(os.path.join(instance_dir, "metadata.json"), 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)

        # A successful save already updates and caches the manifest; only
        # repeat that here when the save reported failure.
        if not save_annotation_to_state("HardNegative", line_number):
            item['direction'] = 'HardNegative'
            item['annotated'] = True
            with open(MANIFEST_FILE, 'wb') as f:
                pickle.dump(state.manifest, f)

    except Exception as e:
        print(f"Error flagging: {e}")

    # Move to next unannotated
    state.current_index = state.find_next_unannotated()

    return load_current_instance()
614
+
615
+ # Navigation functions
616
def go_back():
    """Step to the previous manifest item (stopping at the first) and render it."""
    if state.manifest:
        previous_index = state.current_index - 1
        state.current_index = max(0, previous_index)
        return load_current_instance()
    return None, "", "No items available", ""
621
+
622
def go_forward():
    """Step to the next manifest item (stopping at the last) and render it."""
    if state.manifest:
        last_index = len(state.manifest) - 1
        state.current_index = min(last_index, state.current_index + 1)
        return load_current_instance()
    return None, "", "No items available", ""
627
+
628
def jump_to_next_unannotated():
    """Move the cursor to the next item needing annotation and render it."""
    if state.manifest:
        state.current_index = state.find_next_unannotated()
        return load_current_instance()
    return None, "", "No items available", ""
634
+
635
def jump_to_index(target):
    """Jump to the 1-based manifest position ``target`` and render it.

    Input that is not an integer, or that is out of range, leaves the cursor
    where it is.

    Args:
        target: 1-based index, as text or a number.
    """
    if not state.manifest:
        return None, "", "No items available", ""
    try:
        idx = int(target) - 1
    except (TypeError, ValueError):
        # Blank or non-numeric input: stay on the current item
        idx = -1
    if 0 <= idx < len(state.manifest):
        state.current_index = idx
    return load_current_instance()
645
+
646
def jump_to_kiu(kiu_id):
    """Jump to the first manifest item whose zero-padded KIU id matches ``kiu_id``.

    The cursor is left unchanged when no item matches.
    """
    if not state.manifest:
        return None, "", "No items available", ""

    wanted = kiu_id.strip().zfill(5)
    match = next(
        (
            position
            for position, entry in enumerate(state.manifest)
            if str(entry['kiu_id']).zfill(5) == wanted
        ),
        None,
    )
    if match is not None:
        state.current_index = match
    return load_current_instance()
655
+
656
def jump_to_old_ttb():
    """Jump to the next item marked TTB_NEEDS_RECLASSIFICATION and render it.

    Items after the cursor are checked first, then the items before it; the
    cursor stays put when none is found.
    """
    if not state.manifest:
        return None, "", "No items available", ""

    here = state.current_index
    # Forward from the cursor, then wrap around to the start
    search_order = list(range(here + 1, len(state.manifest))) + list(range(0, here))

    for position in search_order:
        if state.manifest[position].get('direction') == 'TTB_NEEDS_RECLASSIFICATION':
            state.current_index = position
            break

    return load_current_instance()
672
+
673
def export_annotations():
    """Combine all per-instance annotation files into ``all_annotations.json``.

    Every ``*.json`` file in HF_ANNOTATIONS_DIR is read except the export
    file itself; without that exclusion each export would embed the previous
    export as an extra list element. Files that cannot be read or parsed are
    reported and skipped. Files are processed in sorted name order.

    Returns:
        A Markdown status line with the number of exported annotations.
    """
    export_name = "all_annotations.json"
    annotations = []

    # Scan annotation directory
    for filename in sorted(os.listdir(HF_ANNOTATIONS_DIR)):
        if not filename.endswith('.json') or filename == export_name:
            continue
        filepath = os.path.join(HF_ANNOTATIONS_DIR, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                annotations.append(json.load(f))
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError
            print(f"Skipping unreadable annotation file {filepath}: {e}")

    # Create export file
    export_file = os.path.join(HF_ANNOTATIONS_DIR, export_name)
    with open(export_file, 'w', encoding='utf-8') as f:
        json.dump(annotations, f, indent=2, ensure_ascii=False)

    return f"✅ Exported {len(annotations)} annotations to `{export_file}`"
693
+
694
+ # Build Gradio Interface
695
# Page layout and event wiring. Every navigation/annotation callback returns
# the same four values, which refresh the image, the reference panel, the
# info text and the line-number box.
with gr.Blocks(title="⚡ Hieroglyph Annotation Tool - Hugging Face") as app:

    gr.HTML("""
<style>
.gr-button {
margin: 2px !important;
min-width: 80px !important;
height: 36px !important;
}
.gradio-container {
max-width: 1400px !important;
}
h1 {
color: #2c3e50;
margin-bottom: 10px;
}
.info-box {
background: #f8f9fa;
padding: 10px;
border-radius: 5px;
border-left: 4px solid #007bff;
margin-bottom: 10px;
}
.space-info {
background: #e3f2fd;
padding: 10px;
border-radius: 5px;
margin-bottom: 15px;
}
</style>
""")

    gr.Markdown("# ⚡ Hieroglyph Direction Annotation - Hugging Face Space")

    # Display authentication status
    if HF_TOKEN:
        auth_status = "🔒 Private (HF_TOKEN set)"
    else:
        auth_status = "🔓 Public (no HF_TOKEN)"

    gr.HTML(f"""
<div class="space-info">
<strong>📊 Space Configuration:</strong><br>
• Dataset: <code>{HF_DATASET_REPO}</code><br>
• Authentication: {auth_status}<br>
• Annotations stored in: <code>{HF_ANNOTATIONS_DIR}</code><br>
• Total instances in manifest: {len(state.manifest) if state.manifest else 0}<br>
{'• <span style="color: green;">✓ HF_TOKEN is set</span>' if HF_TOKEN else '• <span style="color: orange;">⚠ No HF_TOKEN set (dataset must be public)</span>'}
</div>
""")

    with gr.Row():
        with gr.Column(scale=2):
            image_display = gr.Image(label="Line Instance", type="pil", height=500)

            line_number_input = gr.Textbox(
                label="Line Number (optional)",
                placeholder="1, 2, 3...",
                max_lines=1,
            )

            with gr.Row():
                ltr_btn = gr.Button("➡️ LTR", variant="primary")
                rtl_btn = gr.Button("⬅️ RTL", variant="primary")
                ttb_ltr_btn = gr.Button("⬇️➡️ TTB+LTR", variant="primary")
                ttb_rtl_btn = gr.Button("⬇️⬅️ TTB+RTL", variant="primary")

            with gr.Row():
                skip_btn = gr.Button("⏭️ Skip", variant="secondary")
                unclear_btn = gr.Button("❓ Unclear", variant="secondary")
                hard_neg_btn = gr.Button("🚫 Bad", variant="secondary")

        with gr.Column(scale=1):
            html_display = gr.HTML(label="Reference")

    info_display = gr.Markdown()

    with gr.Row():
        back_btn = gr.Button("⬅️ Back")
        forward_btn = gr.Button("➡️ Forward")
        undo_btn = gr.Button("↩️ Undo")
        next_unann_btn = gr.Button("⏭️ Next Unannotated", variant="primary")
        export_btn = gr.Button("💾 Export", variant="secondary")

    with gr.Row():
        jump_input = gr.Textbox(label="Jump to Index", placeholder="123", scale=2)
        jump_btn = gr.Button("Go", scale=1)
        kiu_input = gr.Textbox(label="Find KIU", placeholder="00001", scale=2)
        kiu_btn = gr.Button("Find", scale=1)
        old_ttb_btn = gr.Button("Find TTB Reclass", scale=1, variant="secondary")

    export_output = gr.Markdown()

    # Event handlers
    display_outputs = [image_display, html_display, info_display, line_number_input]

    def _annotate_as(label):
        """Return a click handler that annotates the current item as ``label``."""
        return lambda ln: annotate_and_next(label, ln)

    # Direction buttons differ only in the label they store
    for _button, _label in (
        (ltr_btn, "LTR"),
        (rtl_btn, "RTL"),
        (ttb_ltr_btn, "TTB_LTR"),
        (ttb_rtl_btn, "TTB_RTL"),
        (skip_btn, "Skip"),
        (unclear_btn, "Unclear"),
    ):
        _button.click(
            _annotate_as(_label),
            inputs=[line_number_input],
            outputs=display_outputs,
        )

    hard_neg_btn.click(
        flag_hard_negative,
        inputs=[line_number_input],
        outputs=display_outputs,
    )

    back_btn.click(go_back, outputs=display_outputs)
    forward_btn.click(go_forward, outputs=display_outputs)
    undo_btn.click(undo_last, outputs=display_outputs)
    next_unann_btn.click(jump_to_next_unannotated, outputs=display_outputs)
    export_btn.click(export_annotations, outputs=[export_output])

    jump_btn.click(jump_to_index, inputs=[jump_input], outputs=display_outputs)
    kiu_btn.click(jump_to_kiu, inputs=[kiu_input], outputs=display_outputs)
    old_ttb_btn.click(jump_to_old_ttb, outputs=display_outputs)

    # Load initial state
    app.load(load_current_instance, outputs=display_outputs)
834
+
835
# Script entry point: print a startup summary, then serve the app.
if __name__ == "__main__":
    stats = state.get_statistics()
    token_available = 'Yes' if HF_TOKEN else 'No'

    print("\n" + "="*80)
    print("⚡ Hieroglyph Annotation Tool - Hugging Face Space")
    print("="*80)
    print(f"📊 Statistics:")
    print(f" Total instances: {stats['total']:,}")
    print(f" Remaining: {stats['remaining']:,}")
    print(f" Progress: {stats['progress_pct']:.1f}%")
    print(f" Annotations dir: {HF_ANNOTATIONS_DIR}")
    print(f" HF_TOKEN available: {token_available}")
    print("="*80)

    # Host and port for Hugging Face Spaces
    launch_options = {
        'show_error': True,
        'server_name': "0.0.0.0",
        'server_port': 7860,
    }
    app.launch(**launch_options)