Spaces:
Sleeping
Sleeping
$P@D$3RV£R committed on
Commit ·
e13f368
1
Parent(s): 63981c2
Configure app to load data from HuggingFace dataset instead of local filesystem
Browse files- app.py +214 -21
- requirements.txt +3 -0
app.py
CHANGED
|
@@ -5,7 +5,13 @@ import argparse
|
|
| 5 |
from flask import Flask, redirect, url_for, request
|
| 6 |
from flask import render_template
|
| 7 |
from flask import send_file
|
| 8 |
-
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
app = Flask(__name__)
|
| 11 |
app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 0
|
|
@@ -360,28 +366,179 @@ def label(temp_id):
|
|
| 360 |
|
| 361 |
@app.route('/image/<path:f>')
|
| 362 |
def images(f):
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
folder_sets = []
|
| 381 |
required_suffixes = ['sr_int_full.png', '-tr_line.png', '-tr_int_full.png']
|
| 382 |
|
| 383 |
-
for (dirpath, dirnames, filenames) in walk(
|
| 384 |
-
if dirpath ==
|
| 385 |
continue
|
| 386 |
|
| 387 |
# Find ALL images with required suffixes in this folder and group by file ID prefix
|
|
@@ -389,7 +546,7 @@ if __name__ == "__main__":
|
|
| 389 |
for filename in filenames:
|
| 390 |
for suffix in required_suffixes:
|
| 391 |
if filename.endswith(suffix):
|
| 392 |
-
relative_path = os.path.relpath(os.path.join(dirpath, filename),
|
| 393 |
found_images[suffix].append(relative_path)
|
| 394 |
|
| 395 |
# Group images by their file ID prefix (everything before the first '-')
|
|
@@ -428,8 +585,44 @@ if __name__ == "__main__":
|
|
| 428 |
'image_sets': valid_image_sets
|
| 429 |
})
|
| 430 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 431 |
if not folder_sets:
|
| 432 |
print("No folders found with all three required image types (sr_int_full.png, -tr_line.png, -tr_int_full.png)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
exit()
|
| 434 |
|
| 435 |
app.config["FOLDER_SETS"] = folder_sets
|
|
|
|
| 5 |
from flask import Flask, redirect, url_for, request
|
| 6 |
from flask import render_template
|
| 7 |
from flask import send_file
|
| 8 |
+
import os
|
| 9 |
+
from datasets import load_dataset
|
| 10 |
+
from huggingface_hub import hf_hub_download
|
| 11 |
+
from io import BytesIO
|
| 12 |
+
from PIL import Image
|
| 13 |
+
import tempfile
|
| 14 |
+
import shutil
|
| 15 |
|
| 16 |
app = Flask(__name__)
|
| 17 |
app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 0
|
|
|
|
| 366 |
|
| 367 |
def _resolve_dataset_path(requested, dataset_files):
    """Map a requested image path onto a path listed in the dataset index.

    An exact match wins. Otherwise the first indexed path that ends with
    ``/<requested>`` (i.e. matches on a path-component boundary) is used, so
    that ``a.png`` can never resolve to an unrelated ``data.png``. When
    nothing matches, the requested path is returned unchanged and the
    download attempt decides whether it exists.
    """
    if requested in dataset_files:
        return requested
    tail = "/" + requested
    for candidate in dataset_files:
        if candidate.endswith(tail):
            return candidate
    return requested


def _download_dataset_file(dataset_name, file_path, cache_dir):
    """Fetch one dataset file from the Hub and return its local path, or None."""
    # Imported here so a missing huggingface_hub surfaces as an exception the
    # caller already handles by falling back to the local filesystem.
    from huggingface_hub import hf_hub_download

    try:
        local_path = hf_hub_download(
            repo_id=dataset_name,
            filename=file_path,
            repo_type="dataset",
            cache_dir=cache_dir,
        )
    except Exception as download_error:
        print(f"Error downloading file {file_path}: {download_error}")
    else:
        return local_path if os.path.exists(local_path) else None

    # Second attempt: reuse a flattened copy from an earlier request, or
    # download into the Hub's default cache and keep a flattened copy.
    try:
        cache_file = os.path.join(
            cache_dir or tempfile.gettempdir(), file_path.replace('/', '_')
        )
        if os.path.exists(cache_file):
            return cache_file
        local_path = hf_hub_download(
            repo_id=dataset_name,
            filename=file_path,
            repo_type="dataset",
        )
        os.makedirs(os.path.dirname(cache_file), exist_ok=True)
        shutil.copy2(local_path, cache_file)
        return local_path
    except Exception as e2:
        print(f"Alternative download also failed: {e2}")
    return None


def _safe_local_path(base_dir, requested):
    """Return the file ``requested`` inside ``base_dir``, or None if it escapes or is missing."""
    root = os.path.abspath(base_dir)
    # abspath collapses ".." lexically; an absolute ``requested`` makes join()
    # discard ``root`` entirely, which the containment check below rejects.
    target = os.path.abspath(os.path.join(root, requested))
    try:
        contained = os.path.commonpath([root, target]) == root
    except ValueError:
        # Raised when the two paths cannot be compared (e.g. different drives).
        return None
    if contained and os.path.isfile(target):
        return target
    return None


@app.route('/image/<path:f>')
def images(f):
    """Serve the image ``f`` from the HuggingFace dataset or the local images directory.

    When ``USE_HF_DATASET`` is set in the app config, the file is resolved
    against ``HF_DATASET_FILES`` and downloaded from ``HF_DATASET_NAME``.
    Any failure on that path is logged and falls through to the local
    ``IMAGES`` directory. ``f`` comes straight from the URL, so the local
    lookup is confined to that directory.

    Returns:
        The file response, or ``("Image not found", 404)`` when the image
        cannot be served from either source.
    """
    if app.config.get("USE_HF_DATASET", False):
        try:
            dataset_name = app.config.get("HF_DATASET_NAME", "0001AMA/multimodal_data_annotator_dataset")
            cache_dir = app.config.get("CACHE_DIR", None)
            file_path = _resolve_dataset_path(f, app.config.get("HF_DATASET_FILES", {}))

            local_path = _download_dataset_file(dataset_name, file_path, cache_dir)
            if local_path is not None:
                return send_file(local_path)
        except Exception as e:
            # Best effort: report the problem and try the local filesystem.
            print(f"Error loading image from dataset: {e}")
            import traceback
            traceback.print_exc()

    # Fallback to local file system
    images_dir = app.config.get('IMAGES', '')
    if images_dir:
        local_file = _safe_local_path(images_dir, f)
        if local_file is not None:
            return send_file(local_file)

    return "Image not found", 404
|
| 438 |
+
|
| 439 |
+
def load_from_huggingface_dataset(dataset_name="0001AMA/multimodal_data_annotator_dataset"):
    """Load and process images from HuggingFace dataset

    Lists the files of the dataset repository, keeps the PNG files and groups
    them into per-folder image sets (see ``_group_image_sets``).

    Side effects on ``app.config``: ``CACHE_DIR`` is set to a directory under
    the system temp dir; on success ``HF_DATASET_FILES`` (a mapping of every
    PNG path to itself) and ``HF_DATASET_NAME`` are set as well.

    Args:
        dataset_name: Repository id of the dataset on the HuggingFace Hub.

    Returns:
        A list of ``{'folder': ..., 'image_sets': [...]}`` dicts, or an empty
        list when anything goes wrong (the error is printed with a traceback).
    """
    print(f"Loading dataset from HuggingFace: {dataset_name}")

    try:
        from huggingface_hub import list_repo_files

        # List all files in the dataset repository
        print("Listing files in dataset repository...")
        repo_files = list_repo_files(repo_id=dataset_name, repo_type="dataset")
        print(f"Found {len(repo_files)} files in repository")

        # Filter PNG files only
        png_files = [f for f in repo_files if f.endswith('.png')]
        print(f"Found {len(png_files)} PNG files")

        # Create a cache directory for images
        cache_dir = os.path.join(tempfile.gettempdir(), "hf_dataset_cache")
        os.makedirs(cache_dir, exist_ok=True)
        app.config["CACHE_DIR"] = cache_dir

        required_suffixes = ['sr_int_full.png', '-tr_line.png', '-tr_int_full.png']
        folder_sets = _group_image_sets(png_files, required_suffixes)

        # Store file list for image serving
        app.config["HF_DATASET_FILES"] = {f: f for f in png_files}
        app.config["HF_DATASET_NAME"] = dataset_name

        print(f"Successfully processed {len(folder_sets)} folders with valid image sets")
        return folder_sets

    except Exception as e:
        # Boundary handler: callers treat an empty result as "nothing to show".
        print(f"Error loading HuggingFace dataset: {e}")
        import traceback
        traceback.print_exc()
        return []


def _group_image_sets(png_files, required_suffixes):
    """Group repository PNG paths into complete image sets per top-level folder.

    A file takes part when it lives inside a folder, its name ends with one
    of ``required_suffixes`` and its name contains a ``-``; the text before
    the first ``-`` is its file id. Only file ids that have all required
    suffixes produce an image set, and only folders with at least one image
    set are returned.
    """
    # {folder_name: {file_id: {suffix: file_path}}}
    folder_files = {}

    for file_path in png_files:
        path_parts = file_path.split('/')
        if len(path_parts) < 2:
            # Files at the repository root do not belong to any folder.
            continue

        folder_name = path_parts[0]
        filename = path_parts[-1]

        matched_suffix = next(
            (suffix for suffix in required_suffixes if filename.endswith(suffix)),
            None,
        )
        if not matched_suffix or '-' not in filename:
            continue

        # File ID prefix is everything before the first '-'
        file_id = filename.split('-')[0]
        by_id = folder_files.setdefault(folder_name, {})
        by_id.setdefault(file_id, {})[matched_suffix] = file_path

    folder_sets = []
    for folder_name, file_ids in folder_files.items():
        valid_image_sets = []
        for file_id, images in file_ids.items():
            # Keep the file id only when all required suffixes are present
            if all(suffix in images for suffix in required_suffixes):
                valid_image_sets.append({
                    'file_id': file_id,
                    'sr_int_full': images['sr_int_full.png'],
                    'tr_line': images['-tr_line.png'],
                    'tr_int_full': images['-tr_int_full.png']
                })
                print(f"DEBUG: Created valid image set for file_id '{file_id}' in folder '{folder_name}'")

        if valid_image_sets:
            folder_sets.append({
                'folder': folder_name,
                'image_sets': valid_image_sets
            })
            print(f"DEBUG: Added folder '{folder_name}' with {len(valid_image_sets)} image sets")

    return folder_sets
|
| 534 |
+
|
| 535 |
+
def load_from_local_directory(directory):
|
| 536 |
+
"""Load and process images from local directory (original method)"""
|
| 537 |
folder_sets = []
|
| 538 |
required_suffixes = ['sr_int_full.png', '-tr_line.png', '-tr_int_full.png']
|
| 539 |
|
| 540 |
+
for (dirpath, dirnames, filenames) in walk(directory):
|
| 541 |
+
if dirpath == directory: # Skip root directory
|
| 542 |
continue
|
| 543 |
|
| 544 |
# Find ALL images with required suffixes in this folder and group by file ID prefix
|
|
|
|
| 546 |
for filename in filenames:
|
| 547 |
for suffix in required_suffixes:
|
| 548 |
if filename.endswith(suffix):
|
| 549 |
+
relative_path = os.path.relpath(os.path.join(dirpath, filename), directory)
|
| 550 |
found_images[suffix].append(relative_path)
|
| 551 |
|
| 552 |
# Group images by their file ID prefix (everything before the first '-')
|
|
|
|
| 585 |
'image_sets': valid_image_sets
|
| 586 |
})
|
| 587 |
|
| 588 |
+
return folder_sets
|
| 589 |
+
|
| 590 |
+
if __name__ == "__main__":
|
| 591 |
+
parser = argparse.ArgumentParser()
|
| 592 |
+
parser.add_argument('--dir', type=str, default=None, help='specify the images directory (optional, uses HF dataset if not provided)')
|
| 593 |
+
parser.add_argument("--out")
|
| 594 |
+
args = parser.parse_args()
|
| 595 |
+
|
| 596 |
+
app.config["LABELS"] = []
|
| 597 |
+
app.config["CLASS_TO_ID"] = {} # Maps class names to IDs
|
| 598 |
+
app.config["NEXT_CLASS_ID"] = 1 # Next available class ID
|
| 599 |
+
|
| 600 |
+
# Check if running on HuggingFace Spaces or if no local directory specified
|
| 601 |
+
is_hf_space = os.getenv("SPACE_ID") is not None
|
| 602 |
+
use_hf_dataset = args.dir is None or is_hf_space
|
| 603 |
+
|
| 604 |
+
if use_hf_dataset:
|
| 605 |
+
print("===== Application Startup at " + str(os.popen('date').read().strip()) + " =====")
|
| 606 |
+
print("Loading from HuggingFace dataset...")
|
| 607 |
+
app.config["USE_HF_DATASET"] = True
|
| 608 |
+
folder_sets = load_from_huggingface_dataset("0001AMA/multimodal_data_annotator_dataset")
|
| 609 |
+
app.config["IMAGES"] = "" # Not using local directory
|
| 610 |
+
else:
|
| 611 |
+
print("Loading from local directory...")
|
| 612 |
+
app.config["USE_HF_DATASET"] = False
|
| 613 |
+
directory = args.dir
|
| 614 |
+
if directory[-1] != "/":
|
| 615 |
+
directory += "/"
|
| 616 |
+
app.config["IMAGES"] = directory
|
| 617 |
+
folder_sets = load_from_local_directory(directory)
|
| 618 |
+
|
| 619 |
if not folder_sets:
|
| 620 |
print("No folders found with all three required image types (sr_int_full.png, -tr_line.png, -tr_int_full.png)")
|
| 621 |
+
if use_hf_dataset:
|
| 622 |
+
print("This may be due to:")
|
| 623 |
+
print("1. Dataset not fully uploaded yet")
|
| 624 |
+
print("2. Dataset structure doesn't match expected format")
|
| 625 |
+
print("3. Network issues loading the dataset")
|
| 626 |
exit()
|
| 627 |
|
| 628 |
app.config["FOLDER_SETS"] = folder_sets
|
requirements.txt
CHANGED
|
@@ -1,2 +1,5 @@
|
|
| 1 |
flask
|
|
|
|
|
|
|
|
|
|
| 2 |
|
|
|
|
| 1 |
flask
|
| 2 |
+
datasets
|
| 3 |
+
huggingface_hub
|
| 4 |
+
Pillow
|
| 5 |
|