alyex committed on
Commit
7ae51cc
·
verified ·
1 Parent(s): 32cf313

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +645 -176
app.py CHANGED
@@ -1,4 +1,3 @@
1
- # app_fixed.py
2
  import gradio as gr
3
  import json
4
  import os
@@ -6,242 +5,712 @@ import pickle
6
  from PIL import Image
7
  import tempfile
8
  import time
9
- from pathlib import Path
10
- import hashlib
11
  import io
12
  import shutil
13
- import rarfile
14
 
15
- # Hugging Face imports
16
  from datasets import load_dataset
 
17
 
18
  # ========== CONFIGURATION ==========
19
  HF_DATASET_REPO = "alyex/karnak-data-app"
20
 
21
- # Directories
22
- HF_CACHE_DIR = os.path.join(tempfile.gettempdir(), "karnak_cache")
23
  HF_ANNOTATIONS_DIR = "/data/annotations"
24
  HF_HARD_NEGATIVES_DIR = "/data/hard_negatives"
25
  SEGMENT_EXTRACT_DIR = "/data/kiu_segment_metadata"
26
 
27
- os.makedirs(HF_CACHE_DIR, exist_ok=True)
 
 
28
  os.makedirs(HF_ANNOTATIONS_DIR, exist_ok=True)
29
  os.makedirs(HF_HARD_NEGATIVES_DIR, exist_ok=True)
30
  os.makedirs(SEGMENT_EXTRACT_DIR, exist_ok=True)
 
31
 
32
- MANIFEST_FILE = os.path.join(HF_CACHE_DIR, "manifest.pkl")
33
 
34
- class AnnotationState:
 
 
 
35
  def __init__(self):
36
- self.manifest = []
37
- self.current_index = 0
38
- self.history = []
39
- self.karnak_dataset = None
40
  self.segment_data = {}
41
- self.kiu_lookup = {} # Fast lookup dictionary
42
 
43
- # Load datasets
44
- self.load_datasets()
45
- # Load manifest
46
- self.load_manifest()
47
 
48
- def load_datasets(self):
49
- """Load datasets with fast lookup"""
50
- print("Loading Karnak dataset...")
51
  try:
52
- self.karnak_dataset = load_dataset(HF_DATASET_REPO, streaming=True)
53
- print(f"✅ Karnak dataset loaded")
54
-
55
- # Build fast lookup for first N items
56
- self.build_kiu_lookup(2000) # Lookup first 2000 KIUs
57
-
58
  except Exception as e:
59
- print(f"❌ Error loading dataset: {e}")
60
- self.karnak_dataset = None
61
-
62
- print("Loading segment metadata from local...")
63
- self.load_segment_metadata_local()
64
 
65
- def build_kiu_lookup(self, limit=2000):
66
- """Build fast lookup dictionary"""
67
- if not self.karnak_dataset:
68
- return
69
 
70
- print(f"Building KIU lookup table (first {limit} items)...")
71
- self.kiu_lookup = {}
72
-
73
- count = 0
74
- try:
75
- for item in self.karnak_dataset["train"]:
76
- kiu_id = str(item["kiu_id"]).strip()
77
- # Normalize: remove leading zeros
78
- kiu_id_norm = kiu_id.lstrip('0') or '0'
79
-
80
- # Store in lookup
81
- self.kiu_lookup[kiu_id_norm] = item
82
-
83
- count += 1
84
- if count >= limit:
85
- break
86
-
87
- print(f"✅ Built lookup for {count} KIUs")
88
-
89
- except Exception as e:
90
- print(f"Lookup building stopped: {e}")
91
-
92
- def load_segment_metadata_local(self):
93
- """Load from local RAR file"""
94
- RAR_FILENAME = "kiu_segment_metadata.rar"
95
-
96
- if not os.path.exists(RAR_FILENAME):
97
- print(f"❌ {RAR_FILENAME} not found")
98
  return
99
 
100
- print(f"✅ Found {RAR_FILENAME}")
101
-
102
- # Check cache
103
- json_path = os.path.join(SEGMENT_EXTRACT_DIR, "kiu_segment_metadata.json")
104
- if os.path.exists(json_path):
105
- print(f"Loading cached metadata...")
106
- try:
107
- with open(json_path, 'r') as f:
108
- self.segment_data = json.load(f)
109
- print(f"✅ Loaded {len(self.segment_data)} entries")
110
- return
111
- except:
112
- pass
113
-
114
  # Extract from RAR
 
 
 
 
 
115
  try:
116
- print(f"Extracting {RAR_FILENAME}...")
117
- with rarfile.RarFile(RAR_FILENAME) as rf:
118
- # Find and extract JSON
119
  for file_info in rf.infolist():
120
- if file_info.filename.endswith('.json'):
121
- rf.extract(file_info, SEGMENT_EXTRACT_DIR)
122
- extracted_path = os.path.join(SEGMENT_EXTRACT_DIR, file_info.filename)
123
-
124
- # Load data
125
- with open(extracted_path, 'r') as f:
126
- self.segment_data = json.load(f)
127
-
128
- print(f"✅ Loaded {len(self.segment_data)} entries")
129
-
130
- # Cache as standard name
131
- if extracted_path != json_path:
132
- shutil.move(extracted_path, json_path)
133
-
134
  break
135
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  except Exception as e:
137
- print(f"❌ Extraction error: {e}")
 
 
138
 
139
- def load_manifest(self):
140
- """Build manifest, filtering for KIUs we have in dataset"""
141
  if not self.segment_data:
142
- print("❌ No segment data")
143
- self.manifest = []
144
  return
145
 
146
- print("Building manifest (checking for available KIUs)...")
147
  self.manifest = []
148
 
149
- available_count = 0
150
- unavailable_count = 0
151
-
152
  for key, segment_info in self.segment_data.items():
153
  if not isinstance(segment_info, dict):
154
  continue
155
 
156
- # Get KIU ID from segment metadata
157
- kiu_id = str(segment_info.get('kiu_id', ''))
158
- kiu_id_norm = kiu_id.lstrip('0') or '0'
 
 
 
 
159
 
160
- # Check if we have this KIU in our lookup
161
- if kiu_id_norm in self.kiu_lookup:
162
- available_count += 1
163
- instances = segment_info.get('instances', [])
164
-
165
- for instance in instances:
166
- if instance.get('class') == 'Line':
167
- manifest_item = {
168
- 'kiu_id': kiu_id.zfill(5), # Store with leading zeros
169
- 'kiu_id_norm': kiu_id_norm, # Normalized for lookup
170
- 'instance_id': instance.get('instance_id'),
171
- 'crop_coords': instance.get('crop_coords', [0, 0, 100, 100]),
172
- 'direction': instance.get('direction', ''),
173
- 'annotated': instance.get('annotated', False),
174
- 'line_number': instance.get('line_number'),
175
- 'segment_key': key
176
- }
177
- self.manifest.append(manifest_item)
178
- else:
179
- unavailable_count += 1
180
 
181
- print(f"✅ Manifest: {len(self.manifest)} instances from {available_count} available KIUs")
182
- print(f" Skipped: {unavailable_count} KIUs not in dataset")
183
 
184
- if self.manifest:
185
- # Save to cache
186
- with open(MANIFEST_FILE, 'wb') as f:
187
- pickle.dump(self.manifest, f)
188
-
189
- self.current_index = self.find_first_unannotated()
190
 
191
  def get_kiu_data(self, kiu_id):
192
- """Fast lookup using dictionary"""
193
- if not self.kiu_lookup:
 
 
 
194
  return None, None
195
 
196
- # Normalize KIU ID
197
- kiu_id_norm = str(kiu_id).strip().lstrip('0') or '0'
198
 
199
- if kiu_id_norm in self.kiu_lookup:
200
- item = self.kiu_lookup[kiu_id_norm]
 
 
 
 
 
 
 
 
 
 
 
201
 
202
- # Convert image
203
- image = None
204
- if item.get("image"):
205
- try:
206
- image = Image.open(io.BytesIO(item["image"]))
207
- except Exception as e:
208
- print(f"Image conversion error for KIU {kiu_id}: {e}")
209
 
210
- return item.get("html", ""), image
 
 
 
 
 
 
 
 
 
 
 
211
 
212
- # Fallback: search in dataset
213
- return self.get_kiu_data_slow(kiu_id)
214
 
215
- def get_kiu_data_slow(self, kiu_id):
216
- """Fallback: search through dataset"""
217
- if not self.karnak_dataset:
218
- return None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
- kiu_id_norm = str(kiu_id).strip().lstrip('0') or '0'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
  try:
223
- for item in self.karnak_dataset["train"]:
224
- item_id = str(item["kiu_id"]).strip().lstrip('0') or '0'
225
- if item_id == kiu_id_norm:
226
- image = None
227
- if item.get("image"):
228
- try:
229
- image = Image.open(io.BytesIO(item["image"]))
230
- except:
231
- pass
232
- return item.get("html", ""), image
233
  except:
234
  pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
 
236
- return None, None
 
237
 
238
- # [Keep all your other methods: needs_annotation, find_first_unannotated, etc.]
239
- # ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
- # [Keep the rest of your app code: load_current_instance, navigation functions, Gradio interface]
242
- # ...
243
 
244
- # Initialize state
245
- state = AnnotationState()
246
 
247
- # Rest of your app code remains the same...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import json
3
  import os
 
5
  from PIL import Image
6
  import tempfile
7
  import time
 
 
8
  import io
9
  import shutil
10
+ import html
11
 
 
12
  from datasets import load_dataset
13
+ import rarfile
14
 
15
  # ========== CONFIGURATION ==========
16
  HF_DATASET_REPO = "alyex/karnak-data-app"
17
 
18
+ # Persistent directories (survive Space restarts)
 
19
  HF_ANNOTATIONS_DIR = "/data/annotations"
20
  HF_HARD_NEGATIVES_DIR = "/data/hard_negatives"
21
  SEGMENT_EXTRACT_DIR = "/data/kiu_segment_metadata"
22
 
23
+ # Temporary cache (doesn't need to persist)
24
+ CACHE_DIR = os.path.join(tempfile.gettempdir(), "karnak_cache")
25
+
26
  os.makedirs(HF_ANNOTATIONS_DIR, exist_ok=True)
27
  os.makedirs(HF_HARD_NEGATIVES_DIR, exist_ok=True)
28
  os.makedirs(SEGMENT_EXTRACT_DIR, exist_ok=True)
29
+ os.makedirs(CACHE_DIR, exist_ok=True)
30
 
31
+ MANIFEST_FILE = os.path.join(CACHE_DIR, "manifest.pkl")
32
 
33
+
34
+ class DataManager:
35
+ """Handles all data loading and caching"""
36
+
37
  def __init__(self):
38
+ self.dataset = None
 
 
 
39
  self.segment_data = {}
40
+ self.manifest = []
41
 
42
+ self._load_segment_metadata()
43
+ self._build_manifest()
44
+ self._init_streaming_dataset()
 
45
 
46
+ def _init_streaming_dataset(self):
47
+ """Initialize streaming dataset"""
48
+ print("Initializing streaming dataset...")
49
  try:
50
+ self.dataset = load_dataset(HF_DATASET_REPO, split="train", streaming=True)
51
+ print(f"✅ Streaming dataset initialized")
 
 
 
 
52
  except Exception as e:
53
+ print(f"❌ Failed to load dataset: {e}")
54
+ self.dataset = None
 
 
 
55
 
56
+ def _load_segment_metadata(self):
57
+ """Load segment metadata from RAR file"""
58
+ RAR_FILE = "kiu_segment_metadata.rar"
59
+ JSON_FILE = os.path.join(SEGMENT_EXTRACT_DIR, "kiu_segment_metadata.json")
60
 
61
+ # Check if already extracted
62
+ if os.path.exists(JSON_FILE):
63
+ print(f"Loading cached segment metadata...")
64
+ with open(JSON_FILE, 'r') as f:
65
+ self.segment_data = json.load(f)
66
+ print(f"✅ Loaded {len(self.segment_data)} segment entries")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  return
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  # Extract from RAR
70
+ if not os.path.exists(RAR_FILE):
71
+ print(f"❌ {RAR_FILE} not found in Space root")
72
+ return
73
+
74
+ print(f"Extracting {RAR_FILE}...")
75
  try:
76
+ with rarfile.RarFile(RAR_FILE) as rf:
77
+ # Find JSON file
78
+ json_in_rar = None
79
  for file_info in rf.infolist():
80
+ if file_info.filename.endswith('.json') and 'segment_metadata' in file_info.filename:
81
+ json_in_rar = file_info.filename
 
 
 
 
 
 
 
 
 
 
 
 
82
  break
83
+
84
+ if not json_in_rar:
85
+ print("❌ No JSON file found in RAR")
86
+ return
87
+
88
+ # Extract
89
+ rf.extract(json_in_rar, SEGMENT_EXTRACT_DIR)
90
+ extracted = os.path.join(SEGMENT_EXTRACT_DIR, json_in_rar)
91
+
92
+ # Load
93
+ with open(extracted, 'r') as f:
94
+ self.segment_data = json.load(f)
95
+
96
+ # Rename to standard location
97
+ if extracted != JSON_FILE:
98
+ shutil.move(extracted, JSON_FILE)
99
+
100
+ print(f"✅ Extracted and loaded {len(self.segment_data)} entries")
101
+
102
  except Exception as e:
103
+ print(f"❌ Error extracting RAR: {e}")
104
+ import traceback
105
+ traceback.print_exc()
106
 
107
+ def _build_manifest(self):
108
+ """Build manifest of all line instances"""
109
  if not self.segment_data:
110
+ print("❌ No segment data - cannot build manifest")
 
111
  return
112
 
113
+ print("Building manifest...")
114
  self.manifest = []
115
 
 
 
 
116
  for key, segment_info in self.segment_data.items():
117
  if not isinstance(segment_info, dict):
118
  continue
119
 
120
+ # Extract KIU ID
121
+ kiu_id = segment_info.get('kiu_id')
122
+ if not kiu_id and 'KIU_' in key:
123
+ try:
124
+ kiu_id = int(key.split('_')[1])
125
+ except:
126
+ continue
127
 
128
+ # Process line instances
129
+ instances = segment_info.get('instances', [])
130
+ for instance in instances:
131
+ if instance.get('class') == 'Line':
132
+ self.manifest.append({
133
+ 'kiu_id': str(kiu_id).zfill(5),
134
+ 'instance_id': instance.get('instance_id'),
135
+ 'crop_coords': instance.get('crop_coords', [0, 0, 100, 100]),
136
+ 'direction': instance.get('direction', ''),
137
+ 'annotated': instance.get('annotated', False),
138
+ 'line_number': instance.get('line_number'),
139
+ 'segment_key': key
140
+ })
 
 
 
 
 
 
 
141
 
142
+ print(f"✅ Built manifest with {len(self.manifest)} line instances")
 
143
 
144
+ # Save manifest
145
+ with open(MANIFEST_FILE, 'wb') as f:
146
+ pickle.dump(self.manifest, f)
 
 
 
147
 
148
  def get_kiu_data(self, kiu_id):
149
+ """
150
+ Stream through dataset to find KIU
151
+ Returns: (html_content, image_pil) or (None, None)
152
+ """
153
+ if not self.dataset:
154
  return None, None
155
 
156
+ target_kiu = str(kiu_id).zfill(5)
 
157
 
158
+ try:
159
+ # Stream through dataset
160
+ for item in self.dataset:
161
+ if item['kiu_id'] == target_kiu:
162
+ html_content = item['html']
163
+
164
+ # Convert bytes to PIL Image
165
+ try:
166
+ image_pil = Image.open(io.BytesIO(item['image']))
167
+ return html_content, image_pil
168
+ except Exception as e:
169
+ print(f"❌ Error decoding image bytes for KIU {target_kiu}: {e}")
170
+ return html_content, None
171
 
172
+ print(f"❌ KIU {target_kiu} not found in dataset")
173
+ return None, None
 
 
 
 
 
174
 
175
+ except Exception as e:
176
+ print(f"❌ Error streaming dataset: {e}")
177
+ return None, None
178
+
179
+
180
+ class AnnotationApp:
181
+ """Main annotation application"""
182
+
183
+ def __init__(self):
184
+ self.data_mgr = DataManager()
185
+ self.current_index = 0
186
+ self.history = []
187
 
188
+ # Jump to first unannotated
189
+ self.current_index = self._find_first_unannotated()
190
 
191
+ def _needs_annotation(self, item):
192
+ """Check if item needs annotation"""
193
+ if not item.get('annotated', False):
194
+ return True
195
+ if item.get('direction') == 'TTB_NEEDS_RECLASSIFICATION':
196
+ return True
197
+ return False
198
+
199
+ def _find_first_unannotated(self):
200
+ """Find first unannotated item"""
201
+ for idx, item in enumerate(self.data_mgr.manifest):
202
+ if self._needs_annotation(item):
203
+ return idx
204
+ return 0
205
+
206
+ def _find_next_unannotated(self, from_index=None):
207
+ """Find next unannotated item"""
208
+ if not self.data_mgr.manifest:
209
+ return 0
210
+
211
+ start = from_index if from_index is not None else self.current_index
212
+
213
+ # Search forward
214
+ for idx in range(start + 1, len(self.data_mgr.manifest)):
215
+ if self._needs_annotation(self.data_mgr.manifest[idx]):
216
+ return idx
217
+
218
+ # Wrap around
219
+ for idx in range(0, start):
220
+ if self._needs_annotation(self.data_mgr.manifest[idx]):
221
+ return idx
222
+
223
+ return start
224
+
225
+ def get_statistics(self):
226
+ """Calculate annotation statistics"""
227
+ manifest = self.data_mgr.manifest
228
+ if not manifest:
229
+ return {
230
+ 'total': 0, 'remaining': 0, 'processed': 0, 'progress_pct': 0,
231
+ 'ltr': 0, 'rtl': 0, 'ttb_ltr': 0, 'ttb_rtl': 0,
232
+ 'skip': 0, 'unclear': 0, 'ttb_reclass': 0
233
+ }
234
+
235
+ total = len(manifest)
236
+ remaining = sum(1 for item in manifest if self._needs_annotation(item))
237
+ processed = total - remaining
238
+ progress = (processed / total * 100) if total > 0 else 0
239
+
240
+ return {
241
+ 'total': total,
242
+ 'remaining': remaining,
243
+ 'processed': processed,
244
+ 'progress_pct': progress,
245
+ 'ltr': sum(1 for i in manifest if i.get('direction') == 'LTR'),
246
+ 'rtl': sum(1 for i in manifest if i.get('direction') == 'RTL'),
247
+ 'ttb_ltr': sum(1 for i in manifest if i.get('direction') == 'TTB_LTR'),
248
+ 'ttb_rtl': sum(1 for i in manifest if i.get('direction') == 'TTB_RTL'),
249
+ 'skip': sum(1 for i in manifest if i.get('direction') == 'Skip'),
250
+ 'unclear': sum(1 for i in manifest if i.get('direction') == 'Unclear'),
251
+ 'ttb_reclass': sum(1 for i in manifest if i.get('direction') == 'TTB_NEEDS_RECLASSIFICATION')
252
+ }
253
+
254
+ def _get_annotation_path(self, item):
255
+ """Get annotation file path"""
256
+ filename = f"KIU_{item['kiu_id']}_instance_{item['instance_id']}.json"
257
+ return os.path.join(HF_ANNOTATIONS_DIR, filename)
258
+
259
+ def _save_annotation(self, item, direction, line_number):
260
+ """Save annotation to disk"""
261
+ annotation_file = self._get_annotation_path(item)
262
+
263
+ data = {
264
+ 'kiu_id': item['kiu_id'],
265
+ 'instance_id': item['instance_id'],
266
+ 'direction': direction,
267
+ 'annotated': True,
268
+ 'line_number': int(line_number) if line_number and line_number.strip() else None,
269
+ 'timestamp': time.time(),
270
+ 'crop_coords': item['crop_coords']
271
+ }
272
+
273
+ with open(annotation_file, 'w', encoding='utf-8') as f:
274
+ json.dump(data, f, indent=2, ensure_ascii=False)
275
+
276
+ return annotation_file
277
+
278
+ def load_current(self):
279
+ """
280
+ Load and display current instance
281
+ Returns: (image, html, info, line_number)
282
+ """
283
+ manifest = self.data_mgr.manifest
284
+
285
+ if not manifest or self.current_index >= len(manifest):
286
+ return (
287
+ None,
288
+ "<div style='padding: 20px; background: #fff3cd;'>⚠️ No items available</div>",
289
+ "No items",
290
+ ""
291
+ )
292
+
293
+ item = manifest[self.current_index]
294
+ kiu_id = item['kiu_id']
295
+
296
+ print(f"\n{'='*60}")
297
+ print(f"Loading: Index {self.current_index + 1}/{len(manifest)}")
298
+ print(f"KIU: {kiu_id}, Instance: {item['instance_id']}")
299
+
300
+ # Get data from dataset (streaming)
301
+ html_content, full_image = self.data_mgr.get_kiu_data(kiu_id)
302
+
303
+ # Crop image
304
+ cropped_image = None
305
+ if full_image:
306
+ try:
307
+ x1, y1, x2, y2 = map(int, item['crop_coords'])
308
+
309
+ # Clamp to image bounds
310
+ x1 = max(0, min(x1, full_image.width))
311
+ y1 = max(0, min(y1, full_image.height))
312
+ x2 = max(0, min(x2, full_image.width))
313
+ y2 = max(0, min(y2, full_image.height))
314
+
315
+ if x2 > x1 and y2 > y1:
316
+ cropped_image = full_image.crop((x1, y1, x2, y2))
317
+ print(f"✅ Cropped: {cropped_image.size}")
318
+
319
+ # Resize if too tall
320
+ if cropped_image.height > 800:
321
+ ratio = 800 / cropped_image.height
322
+ new_w = int(cropped_image.width * ratio)
323
+ cropped_image = cropped_image.resize((new_w, 800), Image.LANCZOS)
324
+ else:
325
+ print(f"❌ Invalid crop coords")
326
+
327
+ except Exception as e:
328
+ print(f"❌ Crop error: {e}")
329
+ else:
330
+ print(f"❌ No image for KIU {kiu_id}")
331
 
332
+ # Format HTML
333
+ if html_content and html_content.strip():
334
+ escaped = html.escape(html_content[:5000])
335
+ if len(html_content) > 5000:
336
+ escaped += "\n\n... (truncated)"
337
+
338
+ html_display = f"""
339
+ <div style="padding: 12px; background: #f8f9fa; border-radius: 8px; border: 1px solid #dee2e6;">
340
+ <div style="font-weight: bold; color: #495057; margin-bottom: 8px;">
341
+ 📄 KIU {kiu_id} Reference HTML
342
+ </div>
343
+ <div style="max-height: 400px; overflow-y: auto; background: white;
344
+ padding: 10px; border-radius: 4px; font-family: 'Courier New', monospace;
345
+ font-size: 11px; line-height: 1.5; white-space: pre-wrap; word-wrap: break-word;">
346
+ {escaped}
347
+ </div>
348
+ </div>
349
+ """
350
+ else:
351
+ html_display = f"""
352
+ <div style='padding: 20px; background: #fff3cd; border-radius: 8px; border: 1px solid #ffc107;'>
353
+ ⚠️ No HTML content available for KIU {kiu_id}
354
+ </div>
355
+ """
356
+
357
+ # Load existing annotation
358
+ annotation_file = self._get_annotation_path(item)
359
+ existing_line_num = ""
360
+ if os.path.exists(annotation_file):
361
+ try:
362
+ with open(annotation_file, 'r') as f:
363
+ ann = json.load(f)
364
+ if ann.get('line_number'):
365
+ existing_line_num = str(ann['line_number'])
366
+ except:
367
+ pass
368
+
369
+ # Build info display
370
+ stats = self.get_statistics()
371
+ direction = item.get('direction', '')
372
+ annotated = item.get('annotated', False)
373
+
374
+ # Status indicator
375
+ if annotated:
376
+ if direction in ['LTR', 'RTL', 'TTB_LTR', 'TTB_RTL']:
377
+ status = f"✅ {direction}"
378
+ elif direction == 'TTB_NEEDS_RECLASSIFICATION':
379
+ status = "⚠️ NEEDS RECLASS"
380
+ elif direction == 'Skip':
381
+ status = "⏭️ Skipped"
382
+ elif direction == 'Unclear':
383
+ status = "❓ Unclear"
384
+ else:
385
+ status = f"✅ {direction}"
386
+ else:
387
+ status = "⏳ Pending"
388
+
389
+ line_info = f" | Line #{existing_line_num}" if existing_line_num else ""
390
+
391
+ info = f"""
392
+ **Instance {self.current_index + 1} / {stats['total']}** | KIU {kiu_id} | Instance {item['instance_id']}{line_info} | {status}
393
+
394
+ **Progress:** {stats['processed']}/{stats['total']} ({stats['progress_pct']:.1f}%) | **Remaining:** {stats['remaining']}
395
+
396
+ **Direction Counts:**
397
+ LTR: {stats['ltr']} | RTL: {stats['rtl']} | TTB+LTR: {stats['ttb_ltr']} | TTB+RTL: {stats['ttb_rtl']}
398
+ Skipped: {stats['skip']} | Unclear: {stats['unclear']} | Needs Reclass: {stats['ttb_reclass']}
399
+ """
400
+
401
+ print(f"{'='*60}\n")
402
+
403
+ return cropped_image, html_display, info, existing_line_num
404
+
405
+ def annotate_and_next(self, direction, line_number):
406
+ """Save annotation and move to next unannotated"""
407
+ manifest = self.data_mgr.manifest
408
+ if not manifest:
409
+ return self.load_current()
410
+
411
+ # Save to history for undo
412
+ self.history.append({
413
+ 'index': self.current_index,
414
+ 'item': manifest[self.current_index].copy()
415
+ })
416
+
417
+ # Save annotation
418
+ item = manifest[self.current_index]
419
+ self._save_annotation(item, direction, line_number)
420
+
421
+ # Update manifest
422
+ item['direction'] = direction
423
+ item['annotated'] = True
424
+ if line_number and line_number.strip():
425
+ try:
426
+ item['line_number'] = int(line_number.strip())
427
+ except:
428
+ pass
429
+
430
+ # Save manifest
431
+ with open(MANIFEST_FILE, 'wb') as f:
432
+ pickle.dump(manifest, f)
433
+
434
+ print(f"✅ Saved: {direction}")
435
+
436
+ # Move to next unannotated
437
+ self.current_index = self._find_next_unannotated()
438
+
439
+ return self.load_current()
440
+
441
+ def undo_last(self):
442
+ """Undo last annotation"""
443
+ if not self.history:
444
+ print("⚠️ Nothing to undo")
445
+ return self.load_current()
446
+
447
+ last = self.history.pop()
448
+ self.current_index = last['index']
449
+
450
+ item = self.data_mgr.manifest[self.current_index]
451
+
452
+ # Delete annotation file
453
+ annotation_file = self._get_annotation_path(item)
454
+ if os.path.exists(annotation_file):
455
+ os.remove(annotation_file)
456
+
457
+ # Restore manifest item
458
+ self.data_mgr.manifest[self.current_index] = last['item']
459
+
460
+ # Save manifest
461
+ with open(MANIFEST_FILE, 'wb') as f:
462
+ pickle.dump(self.data_mgr.manifest, f)
463
+
464
+ print("✅ Undo successful")
465
+ return self.load_current()
466
+
467
+ def flag_hard_negative(self, line_number):
468
+ """Flag as hard negative and save image"""
469
+ manifest = self.data_mgr.manifest
470
+ if not manifest:
471
+ return self.load_current()
472
+
473
+ item = manifest[self.current_index]
474
+
475
+ # Create directory
476
+ dir_name = f"KIU_{item['kiu_id']}_instance_{item['instance_id']}"
477
+ dir_path = os.path.join(HF_HARD_NEGATIVES_DIR, dir_name)
478
+ os.makedirs(dir_path, exist_ok=True)
479
+
480
+ # Get and save cropped image
481
+ html_content, full_image = self.data_mgr.get_kiu_data(item['kiu_id'])
482
+ if full_image:
483
+ try:
484
+ x1, y1, x2, y2 = map(int, item['crop_coords'])
485
+ x1 = max(0, min(x1, full_image.width))
486
+ y1 = max(0, min(y1, full_image.height))
487
+ x2 = max(0, min(x2, full_image.width))
488
+ y2 = max(0, min(y2, full_image.height))
489
+
490
+ if x2 > x1 and y2 > y1:
491
+ cropped = full_image.crop((x1, y1, x2, y2))
492
+ cropped.save(os.path.join(dir_path, "hard_negative.jpg"), "JPEG")
493
+ except Exception as e:
494
+ print(f"❌ Error saving hard negative image: {e}")
495
+
496
+ # Save metadata
497
+ metadata = {
498
+ 'kiu_id': item['kiu_id'],
499
+ 'instance_id': item['instance_id'],
500
+ 'crop_coords': item['crop_coords'],
501
+ 'flagged_as_hard_negative': True,
502
+ 'timestamp': time.time()
503
+ }
504
 
505
+ with open(os.path.join(dir_path, "metadata.json"), 'w') as f:
506
+ json.dump(metadata, f, indent=2)
507
+
508
+ # Save as annotation
509
+ self._save_annotation(item, "HardNegative", line_number)
510
+
511
+ item['direction'] = 'HardNegative'
512
+ item['annotated'] = True
513
+
514
+ with open(MANIFEST_FILE, 'wb') as f:
515
+ pickle.dump(manifest, f)
516
+
517
+ print("✅ Flagged as hard negative")
518
+
519
+ # Move to next
520
+ self.current_index = self._find_next_unannotated()
521
+ return self.load_current()
522
+
523
+ # Navigation methods
524
+ def go_back(self):
525
+ self.current_index = max(0, self.current_index - 1)
526
+ return self.load_current()
527
+
528
+ def go_forward(self):
529
+ manifest = self.data_mgr.manifest
530
+ self.current_index = min(len(manifest) - 1, self.current_index + 1)
531
+ return self.load_current()
532
+
533
+ def jump_to_next_unannotated(self):
534
+ self.current_index = self._find_next_unannotated()
535
+ return self.load_current()
536
+
537
+ def jump_to_index(self, target):
538
  try:
539
+ idx = int(target) - 1
540
+ if 0 <= idx < len(self.data_mgr.manifest):
541
+ self.current_index = idx
 
 
 
 
 
 
 
542
  except:
543
  pass
544
+ return self.load_current()
545
+
546
+ def jump_to_kiu(self, kiu_id):
547
+ target = kiu_id.strip().zfill(5)
548
+ for idx, item in enumerate(self.data_mgr.manifest):
549
+ if item['kiu_id'] == target:
550
+ self.current_index = idx
551
+ break
552
+ return self.load_current()
553
+
554
+ def jump_to_ttb_reclass(self):
555
+ """Find next TTB needing reclassification"""
556
+ manifest = self.data_mgr.manifest
557
+
558
+ # Search forward
559
+ for idx in range(self.current_index + 1, len(manifest)):
560
+ if manifest[idx].get('direction') == 'TTB_NEEDS_RECLASSIFICATION':
561
+ self.current_index = idx
562
+ return self.load_current()
563
+
564
+ # Wrap around
565
+ for idx in range(0, self.current_index):
566
+ if manifest[idx].get('direction') == 'TTB_NEEDS_RECLASSIFICATION':
567
+ self.current_index = idx
568
+ return self.load_current()
569
 
570
+ print("⚠️ No TTB reclassification items found")
571
+ return self.load_current()
572
 
573
+ def export_annotations(self):
574
+ """Export all annotations to single JSON"""
575
+ annotations = []
576
+
577
+ for filename in os.listdir(HF_ANNOTATIONS_DIR):
578
+ if filename.endswith('.json') and filename != 'all_annotations.json':
579
+ filepath = os.path.join(HF_ANNOTATIONS_DIR, filename)
580
+ try:
581
+ with open(filepath, 'r') as f:
582
+ annotations.append(json.load(f))
583
+ except:
584
+ pass
585
+
586
+ export_file = os.path.join(HF_ANNOTATIONS_DIR, "all_annotations.json")
587
+ with open(export_file, 'w', encoding='utf-8') as f:
588
+ json.dump(annotations, f, indent=2, ensure_ascii=False)
589
+
590
+ return f"✅ Exported {len(annotations)} annotations to `{export_file}`"
591
 
 
 
592
 
593
+ # Initialize app
594
+ app_state = AnnotationApp()
595
 
596
+ # Build Gradio UI
597
+ with gr.Blocks(title="⚡ Hieroglyph Annotation", theme=gr.themes.Soft()) as demo:
598
+
599
+ gr.Markdown("# ⚡ Hieroglyph Direction Annotation Tool")
600
+
601
+ stats = app_state.get_statistics()
602
+
603
+ gr.HTML(f"""
604
+ <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
605
+ color: white; padding: 15px; border-radius: 10px; margin-bottom: 20px;">
606
+ <h3 style="margin: 0 0 10px 0;">📊 System Status</h3>
607
+ <div style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 10px;">
608
+ <div><strong>Dataset:</strong> {HF_DATASET_REPO}</div>
609
+ <div><strong>Total Instances:</strong> {stats['total']:,}</div>
610
+ <div><strong>Remaining:</strong> {stats['remaining']:,}</div>
611
+ </div>
612
+ <div style="margin-top: 10px; background: rgba(255,255,255,0.2);
613
+ padding: 8px; border-radius: 5px;">
614
+ <strong>Progress:</strong> {stats['progress_pct']:.1f}%
615
+ ({stats['processed']:,} / {stats['total']:,} annotated)
616
+ </div>
617
+ </div>
618
+ """)
619
+
620
+ with gr.Row():
621
+ with gr.Column(scale=2):
622
+ image_display = gr.Image(label="📸 Line Instance", type="pil", height=500)
623
+
624
+ line_number_input = gr.Textbox(
625
+ label="📝 Line Number (optional)",
626
+ placeholder="Enter line number: 1, 2, 3...",
627
+ max_lines=1
628
+ )
629
+
630
+ gr.Markdown("### 🎯 Annotation")
631
+
632
+ with gr.Row():
633
+ ltr_btn = gr.Button("➡️ LTR", variant="primary", size="lg")
634
+ rtl_btn = gr.Button("⬅️ RTL", variant="primary", size="lg")
635
+
636
+ with gr.Row():
637
+ ttb_ltr_btn = gr.Button("⬇️➡️ TTB+LTR", variant="primary")
638
+ ttb_rtl_btn = gr.Button("⬇️⬅️ TTB+RTL", variant="primary")
639
+
640
+ with gr.Row():
641
+ skip_btn = gr.Button("⏭️ Skip", variant="secondary")
642
+ unclear_btn = gr.Button("❓ Unclear", variant="secondary")
643
+ hard_neg_btn = gr.Button("🚫 Bad Detection", variant="stop")
644
+
645
+ with gr.Column(scale=1):
646
+ html_display = gr.HTML(label="📄 Reference")
647
+
648
+ info_display = gr.Markdown()
649
+
650
+ gr.Markdown("---")
651
+ gr.Markdown("### 🧭 Navigation & Controls")
652
+
653
+ with gr.Row():
654
+ back_btn = gr.Button("⬅️ Previous")
655
+ forward_btn = gr.Button("➡️ Next")
656
+ undo_btn = gr.Button("↩️ Undo Last")
657
+ next_unann_btn = gr.Button("⏭️ Next Unannotated", variant="primary")
658
+ export_btn = gr.Button("💾 Export All", variant="secondary")
659
+
660
+ with gr.Row():
661
+ with gr.Column(scale=2):
662
+ jump_input = gr.Textbox(label="Jump to Index", placeholder="e.g., 123")
663
+ with gr.Column(scale=1):
664
+ jump_btn = gr.Button("Go")
665
+ with gr.Column(scale=2):
666
+ kiu_input = gr.Textbox(label="Find KIU", placeholder="e.g., 00001")
667
+ with gr.Column(scale=1):
668
+ kiu_btn = gr.Button("Find")
669
+ with gr.Column(scale=2):
670
+ ttb_btn = gr.Button("Find TTB Reclass", variant="secondary")
671
+
672
+ export_output = gr.Markdown()
673
+
674
+ # Event handlers
675
+ outputs = [image_display, html_display, info_display, line_number_input]
676
+
677
+ ltr_btn.click(lambda ln: app_state.annotate_and_next("LTR", ln),
678
+ inputs=[line_number_input], outputs=outputs)
679
+ rtl_btn.click(lambda ln: app_state.annotate_and_next("RTL", ln),
680
+ inputs=[line_number_input], outputs=outputs)
681
+ ttb_ltr_btn.click(lambda ln: app_state.annotate_and_next("TTB_LTR", ln),
682
+ inputs=[line_number_input], outputs=outputs)
683
+ ttb_rtl_btn.click(lambda ln: app_state.annotate_and_next("TTB_RTL", ln),
684
+ inputs=[line_number_input], outputs=outputs)
685
+ skip_btn.click(lambda ln: app_state.annotate_and_next("Skip", ln),
686
+ inputs=[line_number_input], outputs=outputs)
687
+ unclear_btn.click(lambda ln: app_state.annotate_and_next("Unclear", ln),
688
+ inputs=[line_number_input], outputs=outputs)
689
+ hard_neg_btn.click(app_state.flag_hard_negative,
690
+ inputs=[line_number_input], outputs=outputs)
691
+
692
+ back_btn.click(app_state.go_back, outputs=outputs)
693
+ forward_btn.click(app_state.go_forward, outputs=outputs)
694
+ undo_btn.click(app_state.undo_last, outputs=outputs)
695
+ next_unann_btn.click(app_state.jump_to_next_unannotated, outputs=outputs)
696
+ export_btn.click(app_state.export_annotations, outputs=[export_output])
697
+
698
+ jump_btn.click(app_state.jump_to_index, inputs=[jump_input], outputs=outputs)
699
+ kiu_btn.click(app_state.jump_to_kiu, inputs=[kiu_input], outputs=outputs)
700
+ ttb_btn.click(app_state.jump_to_ttb_reclass, outputs=outputs)
701
+
702
+ # Load initial state
703
+ demo.load(app_state.load_current, outputs=outputs)
704
+
705
+ if __name__ == "__main__":
706
+ stats = app_state.get_statistics()
707
+ print("\n" + "="*80)
708
+ print("⚡ HIEROGLYPH ANNOTATION TOOL")
709
+ print("="*80)
710
+ print(f"Dataset: {HF_DATASET_REPO}")
711
+ print(f"Total Instances: {stats['total']:,}")
712
+ print(f"Remaining: {stats['remaining']:,}")
713
+ print(f"Progress: {stats['progress_pct']:.1f}%")
714
+ print("="*80 + "\n")
715
+
716
+ demo.launch(server_name="0.0.0.0", server_port=7860)