$P@D$3RV£R committed on
Commit
e13f368
·
1 Parent(s): 63981c2

Configure app to load data from HuggingFace dataset instead of local filesystem

Browse files
Files changed (2) hide show
  1. app.py +214 -21
  2. requirements.txt +3 -0
app.py CHANGED
@@ -5,7 +5,13 @@ import argparse
5
  from flask import Flask, redirect, url_for, request
6
  from flask import render_template
7
  from flask import send_file
8
- import os
 
 
 
 
 
 
9
 
10
  app = Flask(__name__)
11
  app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 0
@@ -360,28 +366,179 @@ def label(temp_id):
360
 
361
  @app.route('/image/<path:f>')
362
  def images(f):
363
- images = app.config['IMAGES']
364
- return send_file(os.path.join(images, f))
365
-
366
- if __name__ == "__main__":
367
- parser = argparse.ArgumentParser()
368
- parser.add_argument('--dir', type=str, default='/Users/pd3rvr/Documents/Data_out', help='specify the images directory')
369
- parser.add_argument("--out")
370
- args = parser.parse_args()
371
- directory = args.dir
372
- if directory[-1] != "/":
373
- directory += "/"
374
- app.config["IMAGES"] = directory
375
- app.config["LABELS"] = []
376
- app.config["CLASS_TO_ID"] = {} # Maps class names to IDs
377
- app.config["NEXT_CLASS_ID"] = 1 # Next available class ID
378
-
379
- # Collect folders with ALL images of the three specific types
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
  folder_sets = []
381
  required_suffixes = ['sr_int_full.png', '-tr_line.png', '-tr_int_full.png']
382
 
383
- for (dirpath, dirnames, filenames) in walk(app.config["IMAGES"]):
384
- if dirpath == app.config["IMAGES"]: # Skip root directory
385
  continue
386
 
387
  # Find ALL images with required suffixes in this folder and group by file ID prefix
@@ -389,7 +546,7 @@ if __name__ == "__main__":
389
  for filename in filenames:
390
  for suffix in required_suffixes:
391
  if filename.endswith(suffix):
392
- relative_path = os.path.relpath(os.path.join(dirpath, filename), app.config["IMAGES"])
393
  found_images[suffix].append(relative_path)
394
 
395
  # Group images by their file ID prefix (everything before the first '-')
@@ -428,8 +585,44 @@ if __name__ == "__main__":
428
  'image_sets': valid_image_sets
429
  })
430
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
431
  if not folder_sets:
432
  print("No folders found with all three required image types (sr_int_full.png, -tr_line.png, -tr_int_full.png)")
 
 
 
 
 
433
  exit()
434
 
435
  app.config["FOLDER_SETS"] = folder_sets
 
5
  from flask import Flask, redirect, url_for, request
6
  from flask import render_template
7
  from flask import send_file
8
+ import os
9
+ from datasets import load_dataset
10
+ from huggingface_hub import hf_hub_download
11
+ from io import BytesIO
12
+ from PIL import Image
13
+ import tempfile
14
+ import shutil
15
 
16
  app = Flask(__name__)
17
  app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 0
 
366
 
def _resolve_dataset_path(requested, known_paths):
    """Map a requested image path onto a path listed in the dataset index.

    Args:
        requested: Path taken from the request URL.
        known_paths: Container of repository-relative paths (the keys of
            ``app.config["HF_DATASET_FILES"]``).

    Returns:
        ``requested`` when it is listed verbatim; otherwise the first listed
        path that ends with ``"/" + requested``; otherwise ``requested``
        unchanged so the caller can still attempt the download.
    """
    if requested in known_paths:
        return requested
    # Match only on a path-component boundary. A plain substring test would
    # let "a.png" resolve to "dir/xa.png" or any path merely containing it.
    boundary_suffix = "/" + requested
    for candidate in known_paths:
        if candidate.endswith(boundary_suffix):
            return candidate
    return requested


def _safe_local_path(root, requested):
    """Return the real path of ``requested`` under ``root``, or None.

    None is returned when ``root`` is empty, when the resolved path falls
    outside ``root`` (e.g. via ``..`` segments or symlinks), or when it is
    not an existing regular file.
    """
    if not root:
        return None
    base = os.path.realpath(root)
    candidate = os.path.realpath(os.path.join(base, requested))
    try:
        contained = os.path.commonpath([base, candidate]) == base
    except ValueError:
        # commonpath raises when the paths cannot be compared
        # (for example different drives on Windows).
        contained = False
    if contained and os.path.isfile(candidate):
        return candidate
    return None


@app.route('/image/<path:f>')
def images(f):
    """Serve the image identified by the URL path ``f``.

    When ``app.config["USE_HF_DATASET"]`` is truthy the file is fetched with
    ``hf_hub_download`` from the dataset named in
    ``app.config["HF_DATASET_NAME"]``. If that download raises, a second
    download without ``cache_dir`` is attempted and the result is copied to a
    flattened file name under the cache (or temp) directory. If nothing could
    be served that way, the file is looked up under ``app.config["IMAGES"]``.

    Returns:
        The ``send_file`` response for the image, or the tuple
        ``("Image not found", 404)`` when no source could provide it.
    """
    if app.config.get("USE_HF_DATASET", False):
        try:
            from huggingface_hub import hf_hub_download

            dataset_name = app.config.get("HF_DATASET_NAME", "0001AMA/multimodal_data_annotator_dataset")
            cache_dir = app.config.get("CACHE_DIR", None)
            file_path = _resolve_dataset_path(f, app.config.get("HF_DATASET_FILES", {}))

            try:
                local_path = hf_hub_download(
                    repo_id=dataset_name,
                    filename=file_path,
                    repo_type="dataset",
                    cache_dir=cache_dir
                )
                if os.path.exists(local_path):
                    return send_file(local_path)
            except Exception as download_error:
                print(f"Error downloading file {file_path}: {download_error}")
                # Best-effort retry: download without cache_dir and keep a
                # flattened copy so later requests can skip the download.
                try:
                    cache_file = os.path.join(cache_dir or tempfile.gettempdir(), file_path.replace('/', '_'))
                    if not os.path.exists(cache_file):
                        local_path = hf_hub_download(
                            repo_id=dataset_name,
                            filename=file_path,
                            repo_type="dataset"
                        )
                        os.makedirs(os.path.dirname(cache_file), exist_ok=True)
                        shutil.copy2(local_path, cache_file)
                    return send_file(cache_file)
                except Exception as e2:
                    print(f"Alternative download also failed: {e2}")

        except Exception as e:
            # Deliberately broad: any failure here falls through to the
            # local-directory lookup below instead of failing the request.
            print(f"Error loading image from dataset: {e}")
            import traceback
            traceback.print_exc()

    # Fallback to the local file system. The path comes from the URL, so it
    # must be confined to the configured images directory before serving.
    local_file = _safe_local_path(app.config.get('IMAGES', ''), f)
    if local_file is not None:
        return send_file(local_file)

    return "Image not found", 404
438
+
def load_from_huggingface_dataset(dataset_name="0001AMA/multimodal_data_annotator_dataset"):
    """Build the folder/image-set index from a HuggingFace dataset repository.

    Lists the repository's files, keeps the ``.png`` entries, and groups the
    ones ending in a required suffix by top-level folder and by file ID (the
    part of the file name before the first ``-``). A file ID counts as a
    valid image set only when all three required suffixes are present.

    Side effects: creates ``<tempdir>/hf_dataset_cache`` and stores
    ``CACHE_DIR``, ``HF_DATASET_FILES`` and ``HF_DATASET_NAME`` in
    ``app.config``.

    Args:
        dataset_name: Repository id of the dataset to read.

    Returns:
        A list of ``{'folder': ..., 'image_sets': [...]}`` dicts, or an empty
        list if any exception occurred while loading.
    """
    print(f"Loading dataset from HuggingFace: {dataset_name}")

    try:
        from huggingface_hub import list_repo_files, hf_hub_download

        print("Listing files in dataset repository...")
        all_files = list_repo_files(repo_id=dataset_name, repo_type="dataset")
        print(f"Found {len(all_files)} files in repository")

        png_files = [name for name in all_files if name.endswith('.png')]
        print(f"Found {len(png_files)} PNG files")

        # Directory the image route uses as its download cache.
        cache_dir = os.path.join(tempfile.gettempdir(), "hf_dataset_cache")
        os.makedirs(cache_dir, exist_ok=True)
        app.config["CACHE_DIR"] = cache_dir

        required_suffixes = ['sr_int_full.png', '-tr_line.png', '-tr_int_full.png']

        # grouped[folder][file_id][suffix] -> repository path
        grouped = {}
        for repo_path in png_files:
            parts = repo_path.split('/')
            if len(parts) < 2:
                # Files at the repository root belong to no folder.
                continue
            top_folder, basename = parts[0], parts[-1]

            suffix_hit = next(
                (s for s in required_suffixes if basename.endswith(s)),
                None,
            )
            if suffix_hit is None:
                continue

            # File ID is everything before the first '-'; names without a
            # '-' are ignored.
            prefix, dash, _rest = basename.partition('-')
            if not dash:
                continue

            grouped.setdefault(top_folder, {}).setdefault(prefix, {})[suffix_hit] = repo_path

        folder_sets = []
        for top_folder, by_id in grouped.items():
            complete_sets = []
            for prefix, found in by_id.items():
                if any(s not in found for s in required_suffixes):
                    continue
                complete_sets.append({
                    'file_id': prefix,
                    'sr_int_full': found['sr_int_full.png'],
                    'tr_line': found['-tr_line.png'],
                    'tr_int_full': found['-tr_int_full.png']
                })
                print(f"DEBUG: Created valid image set for file_id '{prefix}' in folder '{top_folder}'")

            if not complete_sets:
                continue
            folder_sets.append({
                'folder': top_folder,
                'image_sets': complete_sets
            })
            print(f"DEBUG: Added folder '{top_folder}' with {len(complete_sets)} image sets")

        # Index of every PNG path, consulted by the image-serving route.
        app.config["HF_DATASET_FILES"] = dict(zip(png_files, png_files))
        app.config["HF_DATASET_NAME"] = dataset_name

        print(f"Successfully processed {len(folder_sets)} folders with valid image sets")
        return folder_sets

    except Exception as e:
        print(f"Error loading HuggingFace dataset: {e}")
        import traceback
        traceback.print_exc()
        return []
534
+
535
+ def load_from_local_directory(directory):
536
+ """Load and process images from local directory (original method)"""
537
  folder_sets = []
538
  required_suffixes = ['sr_int_full.png', '-tr_line.png', '-tr_int_full.png']
539
 
540
+ for (dirpath, dirnames, filenames) in walk(directory):
541
+ if dirpath == directory: # Skip root directory
542
  continue
543
 
544
  # Find ALL images with required suffixes in this folder and group by file ID prefix
 
546
  for filename in filenames:
547
  for suffix in required_suffixes:
548
  if filename.endswith(suffix):
549
+ relative_path = os.path.relpath(os.path.join(dirpath, filename), directory)
550
  found_images[suffix].append(relative_path)
551
 
552
  # Group images by their file ID prefix (everything before the first '-')
 
585
  'image_sets': valid_image_sets
586
  })
587
 
588
+ return folder_sets
589
+
590
+ if __name__ == "__main__":
591
+ parser = argparse.ArgumentParser()
592
+ parser.add_argument('--dir', type=str, default=None, help='specify the images directory (optional, uses HF dataset if not provided)')
593
+ parser.add_argument("--out")
594
+ args = parser.parse_args()
595
+
596
+ app.config["LABELS"] = []
597
+ app.config["CLASS_TO_ID"] = {} # Maps class names to IDs
598
+ app.config["NEXT_CLASS_ID"] = 1 # Next available class ID
599
+
600
+ # Check if running on HuggingFace Spaces or if no local directory specified
601
+ is_hf_space = os.getenv("SPACE_ID") is not None
602
+ use_hf_dataset = args.dir is None or is_hf_space
603
+
604
+ if use_hf_dataset:
605
+ print("===== Application Startup at " + str(os.popen('date').read().strip()) + " =====")
606
+ print("Loading from HuggingFace dataset...")
607
+ app.config["USE_HF_DATASET"] = True
608
+ folder_sets = load_from_huggingface_dataset("0001AMA/multimodal_data_annotator_dataset")
609
+ app.config["IMAGES"] = "" # Not using local directory
610
+ else:
611
+ print("Loading from local directory...")
612
+ app.config["USE_HF_DATASET"] = False
613
+ directory = args.dir
614
+ if directory[-1] != "/":
615
+ directory += "/"
616
+ app.config["IMAGES"] = directory
617
+ folder_sets = load_from_local_directory(directory)
618
+
619
  if not folder_sets:
620
  print("No folders found with all three required image types (sr_int_full.png, -tr_line.png, -tr_int_full.png)")
621
+ if use_hf_dataset:
622
+ print("This may be due to:")
623
+ print("1. Dataset not fully uploaded yet")
624
+ print("2. Dataset structure doesn't match expected format")
625
+ print("3. Network issues loading the dataset")
626
  exit()
627
 
628
  app.config["FOLDER_SETS"] = folder_sets
requirements.txt CHANGED
@@ -1,2 +1,5 @@
1
  flask
 
 
 
2
 
 
1
  flask
2
+ datasets
3
+ huggingface_hub
4
+ Pillow
5