alyex committed on
Commit
32cf313
·
verified ·
1 Parent(s): df66e3b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +229 -78
app.py CHANGED
@@ -1,96 +1,247 @@
1
- # app_working.py
2
  import gradio as gr
3
- import os
4
  import json
5
- from datasets import load_dataset
 
6
  from PIL import Image
 
 
 
 
7
  import io
 
 
8
 
9
- HF_TOKEN = os.getenv("HF_TOKEN")
10
- DATASET_REPO = "alyex/karnak-data-app"
11
 
12
- # Simple state
13
- current_index = 0
14
- manifest = []
15
 
16
- def load_test_manifest():
17
- """Create a test manifest"""
18
- return [
19
- {"kiu_id": "00001", "instance_id": 0, "crop_coords": [0, 0, 100, 100]},
20
- {"kiu_id": "00002", "instance_id": 0, "crop_coords": [0, 0, 100, 100]},
21
- ]
22
 
23
- def get_current_data():
24
- """Get data for current item"""
25
- global current_index, manifest
26
-
27
- if not manifest:
28
- manifest = load_test_manifest()
29
-
30
- if current_index >= len(manifest):
31
- return None, "No items", "No data", ""
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- item = manifest[current_index]
34
- kiu_id = item["kiu_id"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
- # Try to fetch from dataset
37
- try:
38
- dataset = load_dataset(DATASET_REPO, streaming=True, token=HF_TOKEN)
39
-
40
- for data_item in dataset["train"].take(100):
41
- if data_item["kiu_id"] == kiu_id:
42
- html = data_item.get("html", "No HTML")
43
- image = None
44
-
45
- if data_item.get("image"):
46
- try:
47
- image = Image.open(io.BytesIO(data_item["image"]))
48
- # Apply crop
49
- x1, y1, x2, y2 = item["crop_coords"]
50
- image = image.crop((x1, y1, x2, y2))
51
- except:
52
- pass
53
 
54
- html_display = f"<div style='padding:10px'>{html[:1000]}...</div>"
55
- info = f"KIU {kiu_id} | Instance {item['instance_id']}"
56
 
57
- return image, html_display, info, ""
 
 
 
 
 
 
 
58
 
59
- except Exception as e:
60
- return None, f"Error: {str(e)}", f"KIU {kiu_id}", ""
61
-
62
- return None, "Not found in dataset", f"KIU {kiu_id}", ""
63
-
64
- with gr.Blocks(title="⚑ Working App") as demo:
65
- gr.Markdown("# ⚑ Hieroglyph Annotation")
66
-
67
- image_display = gr.Image(label="Line Instance")
68
- html_display = gr.HTML(label="Reference")
69
- info_display = gr.Markdown()
70
- line_input = gr.Textbox(label="Line Number")
71
-
72
- next_btn = gr.Button("Next")
73
- prev_btn = gr.Button("Previous")
74
-
75
- def update_display():
76
- return get_current_data()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
- def next_item():
79
- global current_index
80
- if current_index < len(manifest) - 1:
81
- current_index += 1
82
- return get_current_data()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
- def prev_item():
85
- global current_index
86
- if current_index > 0:
87
- current_index -= 1
88
- return get_current_data()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- # Initial load
91
- demo.load(update_display, outputs=[image_display, html_display, info_display, line_input])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
- next_btn.click(next_item, outputs=[image_display, html_display, info_display, line_input])
94
- prev_btn.click(prev_item, outputs=[image_display, html_display, info_display, line_input])
 
 
 
 
 
 
95
 
96
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
+ # app_fixed.py
2
  import gradio as gr
 
3
  import json
4
+ import os
5
+ import pickle
6
  from PIL import Image
7
+ import tempfile
8
+ import time
9
+ from pathlib import Path
10
+ import hashlib
11
  import io
12
+ import shutil
13
+ import rarfile
14
 
15
+ # Hugging Face imports
16
+ from datasets import load_dataset
17
 
18
+ # ========== CONFIGURATION ==========
19
+ HF_DATASET_REPO = "alyex/karnak-data-app"
 
20
 
21
+ # Directories
22
+ HF_CACHE_DIR = os.path.join(tempfile.gettempdir(), "karnak_cache")
23
+ HF_ANNOTATIONS_DIR = "/data/annotations"
24
+ HF_HARD_NEGATIVES_DIR = "/data/hard_negatives"
25
+ SEGMENT_EXTRACT_DIR = "/data/kiu_segment_metadata"
 
26
 
27
+ os.makedirs(HF_CACHE_DIR, exist_ok=True)
28
+ os.makedirs(HF_ANNOTATIONS_DIR, exist_ok=True)
29
+ os.makedirs(HF_HARD_NEGATIVES_DIR, exist_ok=True)
30
+ os.makedirs(SEGMENT_EXTRACT_DIR, exist_ok=True)
31
+
32
+ MANIFEST_FILE = os.path.join(HF_CACHE_DIR, "manifest.pkl")
33
+
34
+ class AnnotationState:
35
+ def __init__(self):
36
+ self.manifest = []
37
+ self.current_index = 0
38
+ self.history = []
39
+ self.karnak_dataset = None
40
+ self.segment_data = {}
41
+ self.kiu_lookup = {} # Fast lookup dictionary
42
+
43
+ # Load datasets
44
+ self.load_datasets()
45
+ # Load manifest
46
+ self.load_manifest()
47
 
48
+ def load_datasets(self):
49
+ """Load datasets with fast lookup"""
50
+ print("Loading Karnak dataset...")
51
+ try:
52
+ self.karnak_dataset = load_dataset(HF_DATASET_REPO, streaming=True)
53
+ print(f"✅ Karnak dataset loaded")
54
+
55
+ # Build fast lookup for first N items
56
+ self.build_kiu_lookup(2000) # Lookup first 2000 KIUs
57
+
58
+ except Exception as e:
59
+ print(f"❌ Error loading dataset: {e}")
60
+ self.karnak_dataset = None
61
+
62
+ print("Loading segment metadata from local...")
63
+ self.load_segment_metadata_local()
64
 
65
+ def build_kiu_lookup(self, limit=2000):
66
+ """Build fast lookup dictionary"""
67
+ if not self.karnak_dataset:
68
+ return
69
+
70
+ print(f"Building KIU lookup table (first {limit} items)...")
71
+ self.kiu_lookup = {}
72
+
73
+ count = 0
74
+ try:
75
+ for item in self.karnak_dataset["train"]:
76
+ kiu_id = str(item["kiu_id"]).strip()
77
+ # Normalize: remove leading zeros
78
+ kiu_id_norm = kiu_id.lstrip('0') or '0'
 
 
 
79
 
80
+ # Store in lookup
81
+ self.kiu_lookup[kiu_id_norm] = item
82
 
83
+ count += 1
84
+ if count >= limit:
85
+ break
86
+
87
+ print(f"✅ Built lookup for {count} KIUs")
88
+
89
+ except Exception as e:
90
+ print(f"Lookup building stopped: {e}")
91
 
92
+ def load_segment_metadata_local(self):
93
+ """Load from local RAR file"""
94
+ RAR_FILENAME = "kiu_segment_metadata.rar"
95
+
96
+ if not os.path.exists(RAR_FILENAME):
97
+ print(f"❌ {RAR_FILENAME} not found")
98
+ return
99
+
100
+ print(f"✅ Found {RAR_FILENAME}")
101
+
102
+ # Check cache
103
+ json_path = os.path.join(SEGMENT_EXTRACT_DIR, "kiu_segment_metadata.json")
104
+ if os.path.exists(json_path):
105
+ print(f"Loading cached metadata...")
106
+ try:
107
+ with open(json_path, 'r') as f:
108
+ self.segment_data = json.load(f)
109
+ print(f"✅ Loaded {len(self.segment_data)} entries")
110
+ return
111
+ except:
112
+ pass
113
+
114
+ # Extract from RAR
115
+ try:
116
+ print(f"Extracting {RAR_FILENAME}...")
117
+ with rarfile.RarFile(RAR_FILENAME) as rf:
118
+ # Find and extract JSON
119
+ for file_info in rf.infolist():
120
+ if file_info.filename.endswith('.json'):
121
+ rf.extract(file_info, SEGMENT_EXTRACT_DIR)
122
+ extracted_path = os.path.join(SEGMENT_EXTRACT_DIR, file_info.filename)
123
+
124
+ # Load data
125
+ with open(extracted_path, 'r') as f:
126
+ self.segment_data = json.load(f)
127
+
128
+ print(f"✅ Loaded {len(self.segment_data)} entries")
129
+
130
+ # Cache as standard name
131
+ if extracted_path != json_path:
132
+ shutil.move(extracted_path, json_path)
133
+
134
+ break
135
+
136
+ except Exception as e:
137
+ print(f"❌ Extraction error: {e}")
138
 
139
+ def load_manifest(self):
140
+ """Build manifest, filtering for KIUs we have in dataset"""
141
+ if not self.segment_data:
142
+ print("❌ No segment data")
143
+ self.manifest = []
144
+ return
145
+
146
+ print("Building manifest (checking for available KIUs)...")
147
+ self.manifest = []
148
+
149
+ available_count = 0
150
+ unavailable_count = 0
151
+
152
+ for key, segment_info in self.segment_data.items():
153
+ if not isinstance(segment_info, dict):
154
+ continue
155
+
156
+ # Get KIU ID from segment metadata
157
+ kiu_id = str(segment_info.get('kiu_id', ''))
158
+ kiu_id_norm = kiu_id.lstrip('0') or '0'
159
+
160
+ # Check if we have this KIU in our lookup
161
+ if kiu_id_norm in self.kiu_lookup:
162
+ available_count += 1
163
+ instances = segment_info.get('instances', [])
164
+
165
+ for instance in instances:
166
+ if instance.get('class') == 'Line':
167
+ manifest_item = {
168
+ 'kiu_id': kiu_id.zfill(5), # Store with leading zeros
169
+ 'kiu_id_norm': kiu_id_norm, # Normalized for lookup
170
+ 'instance_id': instance.get('instance_id'),
171
+ 'crop_coords': instance.get('crop_coords', [0, 0, 100, 100]),
172
+ 'direction': instance.get('direction', ''),
173
+ 'annotated': instance.get('annotated', False),
174
+ 'line_number': instance.get('line_number'),
175
+ 'segment_key': key
176
+ }
177
+ self.manifest.append(manifest_item)
178
+ else:
179
+ unavailable_count += 1
180
+
181
+ print(f"✅ Manifest: {len(self.manifest)} instances from {available_count} available KIUs")
182
+ print(f" Skipped: {unavailable_count} KIUs not in dataset")
183
+
184
+ if self.manifest:
185
+ # Save to cache
186
+ with open(MANIFEST_FILE, 'wb') as f:
187
+ pickle.dump(self.manifest, f)
188
+
189
+ self.current_index = self.find_first_unannotated()
190
 
191
+ def get_kiu_data(self, kiu_id):
192
+ """Fast lookup using dictionary"""
193
+ if not self.kiu_lookup:
194
+ return None, None
195
+
196
+ # Normalize KIU ID
197
+ kiu_id_norm = str(kiu_id).strip().lstrip('0') or '0'
198
+
199
+ if kiu_id_norm in self.kiu_lookup:
200
+ item = self.kiu_lookup[kiu_id_norm]
201
+
202
+ # Convert image
203
+ image = None
204
+ if item.get("image"):
205
+ try:
206
+ image = Image.open(io.BytesIO(item["image"]))
207
+ except Exception as e:
208
+ print(f"Image conversion error for KIU {kiu_id}: {e}")
209
+
210
+ return item.get("html", ""), image
211
+
212
+ # Fallback: search in dataset
213
+ return self.get_kiu_data_slow(kiu_id)
214
 
215
+ def get_kiu_data_slow(self, kiu_id):
216
+ """Fallback: search through dataset"""
217
+ if not self.karnak_dataset:
218
+ return None, None
219
+
220
+ kiu_id_norm = str(kiu_id).strip().lstrip('0') or '0'
221
+
222
+ try:
223
+ for item in self.karnak_dataset["train"]:
224
+ item_id = str(item["kiu_id"]).strip().lstrip('0') or '0'
225
+ if item_id == kiu_id_norm:
226
+ image = None
227
+ if item.get("image"):
228
+ try:
229
+ image = Image.open(io.BytesIO(item["image"]))
230
+ except:
231
+ pass
232
+ return item.get("html", ""), image
233
+ except:
234
+ pass
235
+
236
+ return None, None
237
 
238
+ # [Keep all your other methods: needs_annotation, find_first_unannotated, etc.]
239
+ # ...
240
+
241
+ # [Keep the rest of your app code: load_current_instance, navigation functions, Gradio interface]
242
+ # ...
243
+
244
+ # Initialize state
245
+ state = AnnotationState()
246
 
247
+ # Rest of your app code remains the same...