Spaces:

latterworks
/

steer

Runtime error

App Files Files Community

latterworks committed on Mar 18, 2025

Commit

fad02fc

verified ·

1 Parent(s): 13d168d

Update app.py

Browse files

Files changed (1) hide show

app.py +338 -243

app.py CHANGED Viewed

@@ -1,9 +1,3 @@
-"""
-EXIF Extraction Pipeline - HuggingFace Space Implementation
-Provides a full-stack solution for extracting EXIF metadata from images and
-pushing directly to a linked HuggingFace dataset repository.
-"""
 import os
 import io
 import json
@@ -15,35 +9,57 @@ from datetime import datetime
 import threading
 import queue
 import gradio as gr
-from PIL import Image, ExifTags, UnidentifiedImageError
 import pandas as pd
-from huggingface_hub import HfApi, upload_file, create_repo, Repository, hf_hub_download
-from datasets import Dataset, load_dataset, concatenate_datasets
-# Configuration variables
 HF_USERNAME = os.environ.get("HF_USERNAME", "latterworks")
-HF_TOKEN = os.environ.get("HF_TOKEN", None)  # Will use Spaces runtime token if not provided
 DATASET_NAME = os.environ.get("DATASET_NAME", "geo-metadata")
 DATASET_REPO = f"{HF_USERNAME}/{DATASET_NAME}"
-SPACE_ID = os.environ.get("SPACE_ID", f"{HF_USERNAME}/exif-extractor")
-REPO_MOUNTED = os.environ.get("REPO_MOUNTED", "true").lower() in ("true", "1", "t")
-LOCAL_STORAGE_PATH = Path("data")
 METADATA_FILE = LOCAL_STORAGE_PATH / "metadata.jsonl"
 MAX_BATCH_SIZE = 25
 SUPPORTED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.heic', '.tiff', '.tif', '.bmp', '.webp']
-# Initialize storage
-LOCAL_STORAGE_PATH.mkdir(exist_ok=True, parents=True)
-# Processing queue for background tasks
 process_queue = queue.Queue()
 upload_queue = queue.Queue()
-# ========== EXIF Extraction Core ==========
 def convert_to_degrees(value):
-    """Convert GPS coordinates to decimal degrees"""
     try:
         d, m, s = value
         return d + (m / 60.0) + (s / 3600.0)
@@ -51,7 +67,7 @@ def convert_to_degrees(value):
         return value
 def extract_gps_info(gps_info):
-    """Extract and format GPS metadata from EXIF"""
     if not gps_info or not isinstance(gps_info, dict):
         return None
@@ -60,12 +76,9 @@ def extract_gps_info(gps_info):
         tag_name = ExifTags.GPSTAGS.get(key, key)
         gps_data[tag_name] = val
-    # Convert GPS coordinates to decimal format
     if 'GPSLatitude' in gps_data and 'GPSLongitude' in gps_data:
         lat = convert_to_degrees(gps_data['GPSLatitude'])
         lon = convert_to_degrees(gps_data['GPSLongitude'])
-        # Apply reference direction
         if gps_data.get('GPSLatitudeRef') == 'S':
             lat = -lat
         if gps_data.get('GPSLongitudeRef') == 'W':
@@ -73,58 +86,41 @@ def extract_gps_info(gps_info):
         gps_data['Latitude'] = lat
         gps_data['Longitude'] = lon
     return gps_data
 def make_serializable(value):
-    """Convert non-serializable objects to JSON-serializable types"""
-    # Handle PIL IFDRational objects
     if hasattr(value, 'numerator') and hasattr(value, 'denominator'):
         try:
             return float(value.numerator) / float(value.denominator)
-        except (TypeError, ValueError, ZeroDivisionError):
             return str(value)
-    # Handle rational numbers as tuples
     elif isinstance(value, tuple) and len(value) == 2:
         try:
             return float(value[0]) / float(value[1])
-        except (TypeError, ValueError, ZeroDivisionError):
             return str(value)
-    # Handle compound types recursively
     elif isinstance(value, (list, tuple)):
-        return [make_serializable(item) for item in value]
     elif isinstance(value, dict):
         return {k: make_serializable(v) for k, v in value.items()}
-    # Handle binary data
     elif isinstance(value, bytes):
         try:
             return value.decode('utf-8')
         except UnicodeDecodeError:
             return str(value)
-    # Check JSON serializability
     try:
         json.dumps(value)
         return value
-    except (TypeError, OverflowError):
         return str(value)
 def extract_metadata(image_path_or_obj, original_filename=None):
     """
-    Extract EXIF and metadata from an image file or PIL Image object
-    Args:
-        image_path_or_obj: Path object, string path, or PIL Image object
-        original_filename: Original filename if image_path_or_obj is a PIL Image
-    Returns:
-        Dict containing image metadata
     """
     try:
-        # Handle different input types
         if isinstance(image_path_or_obj, Image.Image):
             image = image_path_or_obj
             file_name = original_filename or "unknown.jpg"
@@ -137,21 +133,17 @@ def extract_metadata(image_path_or_obj, original_filename=None):
             file_size = image_path.stat().st_size
             file_extension = image_path.suffix.lower()
-        # Basic image metadata
         metadata = {
             "file_name": file_name,
             "format": image.format,
             "size": list(image.size),
             "mode": image.mode,
             "extraction_timestamp": datetime.now().isoformat(),
         }
         if file_size:
             metadata["file_size"] = file_size
-        metadata["file_extension"] = file_extension
-        # Extract EXIF data with error handling
         try:
             exif_data = image._getexif()
         except Exception as e:
@@ -162,8 +154,6 @@ def extract_metadata(image_path_or_obj, original_filename=None):
             for tag_id, value in exif_data.items():
                 try:
                     tag_name = ExifTags.TAGS.get(tag_id, f"tag_{tag_id}")
-                    # Extract GPS info
                     if tag_name == "GPSInfo":
                         gps_info = extract_gps_info(value)
                         if gps_info:
@@ -175,16 +165,20 @@ def extract_metadata(image_path_or_obj, original_filename=None):
         else:
             metadata["exif"] = "No EXIF data available"
-        # Validate serialization before returning
         try:
             json.dumps(metadata)
-        except (TypeError, OverflowError) as e:
-            # Filter out problematic entries as last resort
-            basic_metadata = {k: v for k, v in metadata.items()
-                          if k in ["file_name", "format", "size", "mode", "file_size", "file_extension"]}
-            basic_metadata["serialization_error"] = "Some metadata fields were removed due to JSON issues"
             return basic_metadata
         return metadata
     except Exception as e:
@@ -194,13 +188,10 @@ def extract_metadata(image_path_or_obj, original_filename=None):
             "extraction_timestamp": datetime.now().isoformat()
         }
-# ========== HuggingFace Integration ==========
 def save_metadata_to_jsonl(metadata_list, append=True):
-    """Save metadata to JSONL file with error handling"""
     mode = 'a' if append and METADATA_FILE.exists() else 'w'
     success_count = 0
     with open(METADATA_FILE, mode) as f:
         for entry in metadata_list:
             try:
@@ -209,18 +200,16 @@ def save_metadata_to_jsonl(metadata_list, append=True):
                 success_count += 1
             except Exception as e:
                 print(f"Failed to serialize entry: {e}")
-                # Write simplified version as fallback
-                simplified = {"file_name": entry.get("file_name", "unknown"),
-                             "error": "Serialization failed"}
                 f.write(json.dumps(simplified) + '\n')
     return success_count, len(metadata_list)
 def read_metadata_jsonl():
-    """Read metadata from JSONL file"""
     if not METADATA_FILE.exists():
         return []
     metadata_list = []
     with open(METADATA_FILE, 'r') as f:
         for line in f:
@@ -230,75 +219,56 @@ def read_metadata_jsonl():
                 continue
     return metadata_list
 def push_to_hub(metadata_list=None, create_if_not_exists=True):
-    """Push metadata to HuggingFace Hub as a dataset"""
     api = HfApi(token=HF_TOKEN)
     try:
         if metadata_list is None:
             metadata_list = read_metadata_jsonl()
         if not metadata_list:
             return "No metadata to push", "warning"
-        # Check if repository exists and create if needed
         repo_exists = True
         try:
             api.repo_info(repo_id=DATASET_REPO, repo_type="dataset")
         except Exception:
             repo_exists = False
             if create_if_not_exists:
-                create_repo(
-                    repo_id=DATASET_REPO,
-                    repo_type="dataset",
-                    token=HF_TOKEN,
-                    private=False
-                )
             else:
-                return f"Dataset repository {DATASET_REPO} doesn't exist", "error"
-        # Check if we need to merge with existing data
         existing_metadata = []
         if repo_exists:
             try:
-                # Attempt to download existing metadata
-                try:
-                    existing_file = hf_hub_download(
-                        repo_id=DATASET_REPO,
-                        filename="metadata.jsonl",
-                        repo_type="dataset",
-                        token=HF_TOKEN
-                    )
-                    # Parse existing metadata
-                    with open(existing_file, 'r') as f:
-                        for line in f:
-                            try:
-                                existing_metadata.append(json.loads(line))
-                            except json.JSONDecodeError:
-                                continue
-                except Exception as e:
-                    print(f"No existing metadata found: {e}")
             except Exception as e:
-                print(f"Error fetching existing metadata: {e}")
-        # Merge new metadata with existing (avoiding duplicates by filename)
         if existing_metadata:
             existing_filenames = {item.get("file_name") for item in existing_metadata}
-            unique_new_items = [item for item in metadata_list
-                               if item.get("file_name") not in existing_filenames]
-            combined_metadata = existing_metadata + unique_new_items
-            print(f"Combining {len(existing_metadata)} existing entries with {len(unique_new_items)} new entries")
         else:
             combined_metadata = metadata_list
-        # Save temporary JSONL for upload
         temp_file = Path(tempfile.mktemp(suffix=".jsonl"))
         with open(temp_file, 'w') as f:
             for entry in combined_metadata:
                 f.write(json.dumps(entry) + '\n')
-        # Push to Hub with explicit API version compatibility
         api.upload_file(
             path_or_fileobj=str(temp_file),
             path_in_repo="metadata.jsonl",
@@ -307,32 +277,27 @@ def push_to_hub(metadata_list=None, create_if_not_exists=True):
             token=HF_TOKEN
         )
-        # Create dataset card if needed
         readme_path = LOCAL_STORAGE_PATH / "README.md"
         if not readme_path.exists():
             with open(readme_path, 'w') as f:
-                f.write(f"# EXIF Metadata Dataset\n\n"
-                        f"This dataset contains EXIF metadata extracted from images.\n\n"
-                        f"Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
-                        f"Total entries: {len(combined_metadata)}")
-        # Update timestamp in README
         try:
             with open(readme_path, 'r') as f:
                 readme_content = f.read()
-            # Handle both cases: update existing timestamp or add one
-            if "Last updated:" in readme_content:
-                updated_readme = readme_content.replace(
-                    "Last updated:",
-                    f"Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\nTotal entries: {len(combined_metadata)}"
-                )
-            else:
-                updated_readme = readme_content + f"\n\nLast updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\nTotal entries: {len(combined_metadata)}"
             with open(readme_path, 'w') as f:
                 f.write(updated_readme)
             api.upload_file(
                 path_or_fileobj=str(readme_path),
                 path_in_repo="README.md",
@@ -343,216 +308,346 @@ def push_to_hub(metadata_list=None, create_if_not_exists=True):
         except Exception as e:
             print(f"Error updating README: {e}")
-        return f"Successfully pushed {len(metadata_list)} metadata entries to {DATASET_REPO}", "success"
     except Exception as e:
-        return f"Error pushing to Hub: {str(e)}", "error"
-# ========== Background Processing ==========
 def process_worker():
-    """Background worker to process images in the queue"""
     while True:
         try:
             task = process_queue.get()
-            if task is None:  # Sentinel to stop the thread
                 break
             file_path, original_filename = task
             metadata = extract_metadata(file_path, original_filename)
-            # Save to JSONL
             success, total = save_metadata_to_jsonl([metadata])
-            # Add to upload queue
             if success:
                 upload_queue.put(metadata)
             process_queue.task_done()
         except Exception as e:
             print(f"Error in process worker: {e}")
             process_queue.task_done()
 def upload_worker():
-    """Background worker to batch upload metadata to Hub"""
     batch = []
     last_upload_time = time.time()
     while True:
         try:
-            # Wait for item with timeout
             try:
-                metadata = upload_queue.get(timeout=60)  # 1 minute timeout
             except queue.Empty:
-                # If timeout and we have items, upload them
-                if batch and (time.time() - last_upload_time) > 300:  # 5 minutes passed
                     push_to_hub(batch)
                     batch = []
                     last_upload_time = time.time()
                 continue
-            if metadata is None:  # Sentinel to stop the thread
                 break
             batch.append(metadata)
             upload_queue.task_done()
-            # If batch size reached, upload
             if len(batch) >= MAX_BATCH_SIZE:
                 push_to_hub(batch)
                 batch = []
                 last_upload_time = time.time()
         except Exception as e:
             print(f"Error in upload worker: {e}")
             if metadata:
                 upload_queue.task_done()
-# Start worker threads
 process_thread = threading.Thread(target=process_worker, daemon=True)
 process_thread.start()
 upload_thread = threading.Thread(target=upload_worker, daemon=True)
 upload_thread.start()
-# ========== Gradio Interface ==========
 def process_uploaded_files(files):
-    """Process uploaded files and extract metadata"""
     if not files:
         return "No files uploaded", "warning"
     processed = 0
     metadata_list = []
     for file in files:
         try:
-            # Handle both Gradio v3.x and v4.x file objects
             if hasattr(file, 'name'):
-                # Gradio v3.x
                 file_path = Path(file.name)
                 file_name = file_path.name
             else:
-                # Gradio v4.x returns a tuple (path, orig_name)
                 file_path = Path(file)
                 file_name = file_path.name
             if file_path.suffix.lower() not in SUPPORTED_EXTENSIONS:
                 continue
             metadata = extract_metadata(file_path, file_name)
             metadata_list.append(metadata)
             processed += 1
-            # Queue for background processing if needed
             process_queue.put((file_path, file_name))
         except Exception as e:
-            print(f"Error processing {getattr(file, 'name', str(file))}: {e}")
     if metadata_list:
         success, total = save_metadata_to_jsonl(metadata_list)
-        return f"Processed {processed} files. {success}/{total} metadata entries saved successfully.", "success"
     else:
-        return f"No valid image files found among {len(files)} uploaded files", "warning"
 def view_metadata():
-    """Display current metadata as a DataFrame"""
     metadata_list = read_metadata_jsonl()
     if not metadata_list:
         return "No metadata available", pd.DataFrame()
-    # Create a flattened version for display
     display_data = []
     for entry in metadata_list:
-        display_row = {
             "filename": entry.get("file_name", "unknown"),
-            "width": entry.get("size", [0, 0])[0] if isinstance(entry.get("size"), list) else None,
-            "height": entry.get("size", [0, 0])[1] if isinstance(entry.get("size"), list) else None,
             "format": entry.get("format"),
             "has_gps": "Yes" if entry.get("gps_info") else "No"
         }
-        # Extract GPS coordinates if available
         if entry.get("gps_info"):
             gps = entry["gps_info"]
-            display_row["latitude"] = gps.get("Latitude")
-            display_row["longitude"] = gps.get("Longitude")
-        display_data.append(display_row)
     df = pd.DataFrame(display_data)
-    return f"Found {len(metadata_list)} metadata entries", df
 def manual_push_to_hub():
-    """Manually trigger push to Hub"""
     return push_to_hub()
 with gr.Blocks(title="EXIF Extraction Pipeline") as app:
-    gr.Markdown("""
     # EXIF Metadata Extraction Pipeline
-    Upload images to extract EXIF metadata including GPS coordinates and publish to HuggingFace Hub.
-    **Current configuration:**
-    * Dataset repo: {repo}
-    * Local storage: {storage}
-    * Supported formats: {formats}
-    """.format(
-        repo=DATASET_REPO,
-        storage=LOCAL_STORAGE_PATH,
-        formats=", ".join(SUPPORTED_EXTENSIONS)
-    ))
     with gr.Tabs():
         with gr.TabItem("Upload Images"):
-            with gr.Row():
-                file_input = gr.File(file_count="multiple", label="Upload Images")
-            with gr.Row():
-                submit_btn = gr.Button("Process Images")
-                output_status = gr.Textbox(label="Status")
-            submit_btn.click(
-                fn=process_uploaded_files,
-                inputs=[file_input],
-                outputs=[output_status]
-            )
         with gr.TabItem("View Metadata"):
-            with gr.Row():
-                refresh_btn = gr.Button("Refresh Metadata")
-            with gr.Row():
-                view_status = gr.Textbox(label="Status")
-            with gr.Row():
-                results_df = gr.DataFrame(label="Metadata Overview")
-            refresh_btn.click(
-                fn=view_metadata,
-                inputs=[],
-                outputs=[view_status, results_df]
-            )
-            # Auto-load metadata on tab selection
-            app.load(
-                fn=view_metadata,
-                inputs=[],
-                outputs=[view_status, results_df]
-            )
         with gr.TabItem("Hub Management"):
-            with gr.Row():
-                push_btn = gr.Button("Push to HuggingFace Hub")
-                push_status = gr.Textbox(label="Status")
-            push_btn.click(
-                fn=manual_push_to_hub,
-                inputs=[],
-                outputs=[push_status]
-            )
-# Initialize application
 if __name__ == "__main__":
-    app.launch()

 import os
 import io
 import json
 import threading
 import queue
+# ====================== Additional Imports ======================
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch.utils.data import Dataset, DataLoader
+from torchvision import transforms
+from PIL import Image, ExifTags
+from tqdm import tqdm
 import gradio as gr
 import pandas as pd
+# Hugging Face Hub
+from huggingface_hub import (
+    hf_hub_download,
+    login,
+    whoami,
+    create_repo,
+    HfApi,
+    InferenceClient,
+)
+# ====================== Configuration & Paths ======================
 HF_USERNAME = os.environ.get("HF_USERNAME", "latterworks")
+HF_TOKEN = os.environ.get("HF_TOKEN", None)  # If not provided, use default Spaces token
 DATASET_NAME = os.environ.get("DATASET_NAME", "geo-metadata")
 DATASET_REPO = f"{HF_USERNAME}/{DATASET_NAME}"
+# Relative local paths
+LOCAL_STORAGE_PATH = Path("./data")
+LOCAL_STORAGE_PATH.mkdir(exist_ok=True, parents=True)
 METADATA_FILE = LOCAL_STORAGE_PATH / "metadata.jsonl"
+IMAGES_DIR = Path("./images")  # place your images here
+IMAGES_DIR.mkdir(exist_ok=True, parents=True)
+# We’ll store checkpoints here:
+CHECKPOINTS_DIR = Path("./checkpoints")
+CHECKPOINTS_DIR.mkdir(exist_ok=True, parents=True)
+CHECKPOINT_PATH = CHECKPOINTS_DIR / "last_checkpoint.pth"
 MAX_BATCH_SIZE = 25
 SUPPORTED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.heic', '.tiff', '.tif', '.bmp', '.webp']
+# ====================== Queues and Threads ======================
 process_queue = queue.Queue()
 upload_queue = queue.Queue()
+# ====================== EXIF Extraction Core ======================
 def convert_to_degrees(value):
+    """Convert GPS coords to decimal degrees."""
     try:
         d, m, s = value
         return d + (m / 60.0) + (s / 3600.0)
         return value
 def extract_gps_info(gps_info):
+    """Extract and format GPS metadata from EXIF."""
     if not gps_info or not isinstance(gps_info, dict):
         return None
         tag_name = ExifTags.GPSTAGS.get(key, key)
         gps_data[tag_name] = val
     if 'GPSLatitude' in gps_data and 'GPSLongitude' in gps_data:
         lat = convert_to_degrees(gps_data['GPSLatitude'])
         lon = convert_to_degrees(gps_data['GPSLongitude'])
         if gps_data.get('GPSLatitudeRef') == 'S':
             lat = -lat
         if gps_data.get('GPSLongitudeRef') == 'W':
         gps_data['Latitude'] = lat
         gps_data['Longitude'] = lon
     return gps_data
 def make_serializable(value):
     """Convert ``value`` into something ``json.dumps`` accepts.

     Objects exposing ``numerator`` and ``denominator`` and 2-tuples are
     reduced to a float quotient; other lists/tuples and dicts are converted
     element by element; bytes are decoded as UTF-8. Anything that cannot be
     converted is returned as ``str(value)``.

     Args:
         value: Arbitrary object to convert.

     Returns:
         A JSON-serializable equivalent of ``value``.
     """
     # Handle only the errors the conversions themselves can produce. A bare
     # ``except:`` would also swallow KeyboardInterrupt and SystemExit.
     arithmetic_errors = (TypeError, ValueError, ZeroDivisionError, OverflowError)
     if hasattr(value, 'numerator') and hasattr(value, 'denominator'):
         # Built-in ints expose both attributes as well, so they also leave
         # this branch as floats.
         try:
             return float(value.numerator) / float(value.denominator)
         except arithmetic_errors:
             return str(value)
     elif isinstance(value, tuple) and len(value) == 2:
         # Every 2-tuple is treated as a (numerator, denominator) pair; when
         # the division fails the tuple is stringified as a whole.
         try:
             return float(value[0]) / float(value[1])
         except arithmetic_errors:
             return str(value)
     elif isinstance(value, (list, tuple)):
         return [make_serializable(v) for v in value]
     elif isinstance(value, dict):
         # Only the values are converted; keys are passed through untouched.
         return {k: make_serializable(v) for k, v in value.items()}
     elif isinstance(value, bytes):
         try:
             return value.decode('utf-8')
         except UnicodeDecodeError:
             return str(value)
     # Final fallback: keep the value if json can encode it, else stringify.
     try:
         json.dumps(value)
         return value
     except (TypeError, ValueError, OverflowError, RecursionError):
         return str(value)
 def extract_metadata(image_path_or_obj, original_filename=None):
     """
+    Extract EXIF & metadata from a file or PIL Image.
     """
     try:
         if isinstance(image_path_or_obj, Image.Image):
             image = image_path_or_obj
             file_name = original_filename or "unknown.jpg"
             file_size = image_path.stat().st_size
             file_extension = image_path.suffix.lower()
         metadata = {
             "file_name": file_name,
             "format": image.format,
             "size": list(image.size),
             "mode": image.mode,
             "extraction_timestamp": datetime.now().isoformat(),
+            "file_extension": file_extension
         }
         if file_size:
             metadata["file_size"] = file_size
         try:
             exif_data = image._getexif()
         except Exception as e:
             for tag_id, value in exif_data.items():
                 try:
                     tag_name = ExifTags.TAGS.get(tag_id, f"tag_{tag_id}")
                     if tag_name == "GPSInfo":
                         gps_info = extract_gps_info(value)
                         if gps_info:
         else:
             metadata["exif"] = "No EXIF data available"
+        # Validate serializability
         try:
             json.dumps(metadata)
+        except:
+            # fallback
+            basic_metadata = {
+                "file_name": metadata.get("file_name", "unknown"),
+                "format": metadata.get("format", None),
+                "size": metadata.get("size", None),
+                "mode": metadata.get("mode", None),
+                "file_extension": metadata.get("file_extension", None),
+            }
+            basic_metadata["serialization_error"] = "Some metadata were removed."
             return basic_metadata
         return metadata
     except Exception as e:
             "extraction_timestamp": datetime.now().isoformat()
         }
+# ====================== Save/Load JSONL ======================
 def save_metadata_to_jsonl(metadata_list, append=True):
     mode = 'a' if append and METADATA_FILE.exists() else 'w'
     success_count = 0
     with open(METADATA_FILE, mode) as f:
         for entry in metadata_list:
             try:
                 success_count += 1
             except Exception as e:
                 print(f"Failed to serialize entry: {e}")
+                simplified = {
+                    "file_name": entry.get("file_name", "unknown"),
+                    "error": "Serialization failed"
+                }
                 f.write(json.dumps(simplified) + '\n')
     return success_count, len(metadata_list)
 def read_metadata_jsonl():
     if not METADATA_FILE.exists():
         return []
     metadata_list = []
     with open(METADATA_FILE, 'r') as f:
         for line in f:
                 continue
     return metadata_list
+# ====================== Pushing to HuggingFace Hub ======================
 def push_to_hub(metadata_list=None, create_if_not_exists=True):
     api = HfApi(token=HF_TOKEN)
     try:
         if metadata_list is None:
             metadata_list = read_metadata_jsonl()
         if not metadata_list:
             return "No metadata to push", "warning"
         repo_exists = True
         try:
             api.repo_info(repo_id=DATASET_REPO, repo_type="dataset")
         except Exception:
             repo_exists = False
             if create_if_not_exists:
+                create_repo(repo_id=DATASET_REPO, repo_type="dataset", token=HF_TOKEN, private=False)
             else:
+                return f"Dataset repo {DATASET_REPO} doesn't exist.", "error"
         existing_metadata = []
         if repo_exists:
             try:
+                existing_file = hf_hub_download(
+                    repo_id=DATASET_REPO,
+                    filename="metadata.jsonl",
+                    repo_type="dataset",
+                    token=HF_TOKEN
+                )
+                with open(existing_file, 'r') as f:
+                    for line in f:
+                        try:
+                            existing_metadata.append(json.loads(line))
+                        except:
+                            pass
             except Exception as e:
+                print(f"No existing metadata found or error reading: {e}")
         if existing_metadata:
             existing_filenames = {item.get("file_name") for item in existing_metadata}
+            unique_new = [item for item in metadata_list
+                          if item.get("file_name") not in existing_filenames]
+            combined_metadata = existing_metadata + unique_new
         else:
             combined_metadata = metadata_list
         temp_file = Path(tempfile.mktemp(suffix=".jsonl"))
         with open(temp_file, 'w') as f:
             for entry in combined_metadata:
                 f.write(json.dumps(entry) + '\n')
         api.upload_file(
             path_or_fileobj=str(temp_file),
             path_in_repo="metadata.jsonl",
             token=HF_TOKEN
         )
         readme_path = LOCAL_STORAGE_PATH / "README.md"
         if not readme_path.exists():
             with open(readme_path, 'w') as f:
+                f.write(
+                    f"# EXIF Metadata Dataset\n\n"
+                    f"This dataset contains EXIF metadata.\n\n"
+                    f"Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
+                    f"Total entries: {len(combined_metadata)}"
+                )
         try:
             with open(readme_path, 'r') as f:
                 readme_content = f.read()
+            updated_readme = (
+                f"# EXIF Metadata Dataset\n\n"
+                f"This dataset contains EXIF metadata.\n\n"
+                f"Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
+                f"Total entries: {len(combined_metadata)}"
+            )
             with open(readme_path, 'w') as f:
                 f.write(updated_readme)
             api.upload_file(
                 path_or_fileobj=str(readme_path),
                 path_in_repo="README.md",
         except Exception as e:
             print(f"Error updating README: {e}")
+        return f"Successfully pushed {len(metadata_list)} entries to {DATASET_REPO}", "success"
     except Exception as e:
+        return f"Error pushing to Hub: {e}", "error"
+# ====================== Background Processing Threads ======================
 def process_worker():
     """Consume ``process_queue`` until a ``None`` sentinel arrives.

     Each task is a ``(file_path, original_filename)`` pair. Its metadata is
     extracted with ``extract_metadata``, saved with
     ``save_metadata_to_jsonl`` and, when that call reports at least one
     written entry, put on ``upload_queue``. Errors are printed and the loop
     moves on to the next task.
     """
     while True:
         task = process_queue.get()
         try:
             if task is None:
                 break
             file_path, original_filename = task
             metadata = extract_metadata(file_path, original_filename)
             success, _total = save_metadata_to_jsonl([metadata])
             if success:
                 upload_queue.put(metadata)
         except Exception as e:
             print(f"Error in process worker: {e}")
         finally:
             # Acknowledge every dequeued item exactly once, the sentinel
             # included, so process_queue.join() cannot block forever.
             process_queue.task_done()
 def upload_worker():
     """Collect metadata from ``upload_queue`` and push it in batches.

     A batch is pushed with ``push_to_hub`` when it reaches
     ``MAX_BATCH_SIZE`` entries, or when a 60-second wait on the queue times
     out and more than 300 seconds have passed since the last push. A
     ``None`` item stops the worker; entries still waiting in the batch are
     pushed first. Errors are printed and the loop keeps running.
     """
     batch = []
     last_upload_time = time.time()
     while True:
         stop_requested = False
         try:
             try:
                 metadata = upload_queue.get(timeout=60)
             except queue.Empty:
                 if batch and (time.time() - last_upload_time) > 300:
                     push_to_hub(batch)
                     batch = []
                     last_upload_time = time.time()
                 continue
             # Acknowledge immediately after a successful get(), exactly once
             # per item, so the error path never has to guess whether an item
             # was fetched or already acknowledged.
             upload_queue.task_done()
             if metadata is None:
                 stop_requested = True
             else:
                 batch.append(metadata)
             if batch and (stop_requested or len(batch) >= MAX_BATCH_SIZE):
                 push_to_hub(batch)
                 batch = []
                 last_upload_time = time.time()
         except Exception as e:
             print(f"Error in upload worker: {e}")
         # Checked outside the try block so a failed final push still stops
         # the worker once the sentinel has been seen.
         if stop_requested:
             break
 # Start both background workers at import time. They are daemon threads, so
 # they do not keep the interpreter alive on exit.
 # process_thread runs process_worker.
 process_thread = threading.Thread(target=process_worker, daemon=True)
 process_thread.start()
 # upload_thread runs upload_worker.
 upload_thread = threading.Thread(target=upload_worker, daemon=True)
 upload_thread.start()
+# ====================== Gradio App ======================
 def process_uploaded_files(files):
     """Extract metadata from uploaded files and save it locally.

     Files whose extension is not in ``SUPPORTED_EXTENSIONS`` are skipped.
     Metadata is extracted once per file, saved in a single
     ``save_metadata_to_jsonl`` call, and then put on ``upload_queue``.

     Args:
         files: Uploaded items. Each is either an object whose ``name``
             attribute holds the file path, or a value ``Path`` accepts.

     Returns:
         A ``(message, level)`` tuple; ``level`` is ``"success"`` or
         ``"warning"``.
     """
     if not files:
         return "No files uploaded", "warning"
     processed = 0
     metadata_list = []
     for file in files:
         try:
             # Objects exposing ``.name`` carry the path there; anything else
             # is treated as the path itself.
             file_path = Path(file.name) if hasattr(file, 'name') else Path(file)
             if file_path.suffix.lower() not in SUPPORTED_EXTENSIONS:
                 continue
             metadata_list.append(extract_metadata(file_path, file_path.name))
             processed += 1
         except Exception as e:
             # Describe the upload from ``file`` itself: ``file_path`` is
             # unbound, or left over from the previous file, when the failure
             # happened before it was assigned.
             print(f"Error processing {getattr(file, 'name', str(file))}: {e}")
     if not metadata_list:
         return f"No valid image files among the {len(files)} uploaded.", "warning"
     success, total = save_metadata_to_jsonl(metadata_list)
     if success:
         # Hand the extracted entries straight to the uploader. Re-queueing
         # the paths on process_queue made the worker extract and save every
         # file a second time, duplicating each entry in the JSONL file.
         for metadata in metadata_list:
             upload_queue.put(metadata)
     return (f"Processed {processed} files. "
             f"{success}/{total} metadata entries saved."), "success"
 def view_metadata():
     """Summarise the stored metadata records for the overview table.

     Returns:
         A ``(status_message, DataFrame)`` pair. The frame has one row per
         record with filename, width, height, format and a GPS flag; rows
         with GPS data also carry latitude and longitude.
     """
     records = read_metadata_jsonl()
     if not records:
         return "No metadata available", pd.DataFrame()

     def _summarise(record):
         # Width/height are only filled when "size" is a two-element list.
         summary = {
             "filename": record.get("file_name", "unknown"),
             "width": None,
             "height": None,
             "format": record.get("format"),
             "has_gps": "Yes" if record.get("gps_info") else "No",
         }
         dims = record.get("size")
         if isinstance(dims, list) and len(dims) == 2:
             summary["width"], summary["height"] = dims
         gps = record.get("gps_info")
         if gps:
             summary["latitude"] = gps.get("Latitude")
             summary["longitude"] = gps.get("Longitude")
         return summary

     frame = pd.DataFrame([_summarise(record) for record in records])
     return f"Found {len(records)} entries", frame
 def manual_push_to_hub():
     """Run ``push_to_hub`` with no arguments and return its result unchanged."""
     outcome = push_to_hub()
     return outcome
 with gr.Blocks(title="EXIF Extraction Pipeline") as app:
     gr.Markdown(f"""
     # EXIF Metadata Extraction Pipeline
     **Local storage**: `./data`
     **Images directory**: `./images`
     **Checkpoints**: `./checkpoints`
     **Supported formats**: {", ".join(SUPPORTED_EXTENSIONS)}
     Upload images to extract EXIF metadata (including GPS) and push to HuggingFace Hub.
     """)
     with gr.Tabs():
         with gr.TabItem("Upload Images"):
             def _upload_status(files):
                 # process_uploaded_files returns (message, level), but only
                 # one Textbox is wired as output; hand it just the message so
                 # the box does not display the tuple itself.
                 message, _level = process_uploaded_files(files)
                 return message
             file_input = gr.File(file_count="multiple", label="Upload Images")
             submit_btn = gr.Button("Process Images")
             output_status = gr.Textbox(label="Status")
             submit_btn.click(fn=_upload_status, inputs=[file_input], outputs=[output_status])
         with gr.TabItem("View Metadata"):
             refresh_btn = gr.Button("Refresh Metadata")
             view_status = gr.Textbox(label="Status")
             results_df = gr.DataFrame(label="Metadata Overview")
             refresh_btn.click(fn=view_metadata, inputs=[], outputs=[view_status, results_df])
             # Populate the table once when the page first loads.
             app.load(fn=view_metadata, inputs=[], outputs=[view_status, results_df])
         with gr.TabItem("Hub Management"):
             push_btn = gr.Button("Push to HuggingFace Hub")
             push_status = gr.Textbox(label="Status")
             push_btn.click(fn=manual_push_to_hub, inputs=[], outputs=[push_status])
+# ====================== PyTorch: Using GPS Data ======================
def load_exif_gps_metadata(metadata_file=METADATA_FILE):
    """Build a ``{file_name: (latitude, longitude)}`` map from a JSONL file.

    Each line of ``metadata_file`` is parsed as one JSON object. A record is
    kept only when its ``gps_info`` contains both ``Latitude`` and
    ``Longitude``. Malformed or incomplete records are skipped.

    Args:
        metadata_file: Path of the JSONL metadata file.

    Returns:
        The mapping described above; empty when the file does not exist.
    """
    gps_map = {}
    if not os.path.exists(metadata_file):
        return gps_map
    with open(metadata_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
                gps_info = entry.get("gps_info")
                if gps_info and "Latitude" in gps_info and "Longitude" in gps_info:
                    lat = gps_info["Latitude"]
                    lon = gps_info["Longitude"]
                    gps_map[entry["file_name"]] = (lat, lon)
            except (ValueError, KeyError, TypeError, AttributeError):
                # Best effort: skip lines that are not valid JSON (ValueError
                # covers JSONDecodeError) or not shaped like a metadata
                # record, without hiding unrelated failures such as
                # KeyboardInterrupt.
                continue
    return gps_map
class GPSImageDataset(Dataset):
    """Dataset of images paired with their ``(latitude, longitude)``.

    Only files in ``images_dir`` whose name is a key of ``gps_map`` are
    included. File names are sorted so that an index refers to the same
    image on every run.

    Args:
        images_dir: Directory holding the image files.
        gps_map: Mapping of file name to ``(latitude, longitude)``.
        transform: Optional callable applied to the RGB image.
    """

    def __init__(self, images_dir, gps_map, transform=None):
        self.images_dir = Path(images_dir)
        self.transform = transform
        self.gps_map = gps_map
        # Keep only files that have matching GPS metadata; sort because the
        # order returned by os.listdir is arbitrary.
        self.file_names = sorted(
            fn for fn in os.listdir(self.images_dir) if fn in gps_map
        )

    def __len__(self):
        return len(self.file_names)

    def __getitem__(self, idx):
        file_name = self.file_names[idx]
        img_path = self.images_dir / file_name
        # Close the underlying file as soon as the RGB copy exists instead of
        # leaving the handle open until garbage collection.
        with Image.open(img_path) as img:
            image = img.convert("RGB")
        if self.transform:
            image = self.transform(image)
        lat, lon = self.gps_map[file_name]
        gps_tensor = torch.tensor([lat, lon], dtype=torch.float)
        return image, gps_tensor
def train_one_epoch(
    train_dataloader, model, optimizer, epoch, batch_size, device,
    scheduler=None, criterion=nn.CrossEntropyLoss()
):
    """Run one pass over ``train_dataloader``, updating ``model`` in place.

    Args:
        train_dataloader: Iterable yielding ``(imgs, gps)`` batches.
        model: Module providing ``get_gps_queue()``,
            ``dequeue_and_enqueue(gps)`` and ``__call__(imgs, gps_all)``.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch number, used only for progress output.
        batch_size: Nominal batch size of the dataloader.
        device: Device the batches and targets are moved to.
        scheduler: Optional LR scheduler, stepped once after the epoch.
        criterion: Loss applied to ``(logits, targets)``.
    """
    print(f"\nStarting Epoch {epoch} ...")
    bar = tqdm(train_dataloader, total=len(train_dataloader))
    # Placeholder targets (for demonstration only): sample i gets class i.
    targets_img_gps = torch.arange(0, batch_size).long().to(device)
    for imgs, gps in bar:
        imgs, gps = imgs.to(device), gps.to(device)
        gps_queue = model.get_gps_queue()
        optimizer.zero_grad()
        gps_all = torch.cat([gps, gps_queue], dim=0)
        model.dequeue_and_enqueue(gps)
        logits_img_gps = model(imgs, gps_all)
        # The final batch can be smaller than batch_size; the targets must
        # match the number of rows actually produced for this batch.
        n = imgs.size(0)
        if n == batch_size:
            targets = targets_img_gps
        else:
            targets = torch.arange(n, device=device)
        loss = criterion(logits_img_gps, targets)
        loss.backward()
        optimizer.step()
        bar.set_description(f"Epoch {epoch} loss: {loss.item():.5f}")
    if scheduler is not None:
        scheduler.step()
+# ====================== Checkpoint Helpers ======================
def save_checkpoint(model, optimizer, epoch, path=CHECKPOINT_PATH):
    """Write the epoch number plus model and optimizer state dicts to ``path``."""
    state = dict(
        epoch=epoch,
        model_state=model.state_dict(),
        optimizer_state=optimizer.state_dict(),
    )
    torch.save(state, path)
    print(f"[Checkpoint] Saved at epoch={epoch} -> {path}")
def load_checkpoint(model, optimizer, path=CHECKPOINT_PATH, device="cpu"):
    """Restore model and optimizer state from ``path`` if it exists.

    Returns:
        The epoch stored in the checkpoint, or ``0`` when no file is found.
    """
    if os.path.exists(path):
        state = torch.load(path, map_location=device)
        model.load_state_dict(state["model_state"])
        optimizer.load_state_dict(state["optimizer_state"])
        last_epoch = state["epoch"]
        print(f"[Checkpoint] Loaded from {path} (epoch={last_epoch})")
        return last_epoch
    print(f"No checkpoint found at {path}. Starting fresh.")
    return 0
+# ====================== Continuous Trainer ======================
def continuous_train(
    train_dataloader,
    model,
    optimizer,
    device,
    start_epoch=1,
    max_epochs=5,
    scheduler=None
):
    """Resume from the checkpoint if one exists, then train through ``max_epochs``.

    A checkpoint is saved after every epoch. When a checkpoint was loaded,
    training continues at the epoch after the stored one (e.g. stored epoch 3
    with ``max_epochs=5`` runs epochs 4 and 5). When none was loaded,
    training begins at ``start_epoch``. Nothing runs if the first epoch is
    already past ``max_epochs``.

    Args:
        train_dataloader: Dataloader passed to ``train_one_epoch``; its
            ``batch_size`` attribute is forwarded as well.
        model: Model to train.
        optimizer: Optimizer to step.
        device: Device used for loading the checkpoint and for training.
        start_epoch: First epoch to run when no checkpoint is available.
        max_epochs: Last epoch to run (inclusive).
        scheduler: Optional LR scheduler forwarded to ``train_one_epoch``.
    """
    loaded_epoch = load_checkpoint(model, optimizer, path=CHECKPOINT_PATH, device=device)
    # load_checkpoint returns 0 when there is nothing to resume from; only
    # then does the caller-supplied start_epoch apply.
    first_epoch = loaded_epoch + 1 if loaded_epoch > 0 else start_epoch
    for current_epoch in range(first_epoch, max_epochs + 1):
        train_one_epoch(
            train_dataloader=train_dataloader,
            model=model,
            optimizer=optimizer,
            epoch=current_epoch,
            batch_size=train_dataloader.batch_size,
            device=device,
            scheduler=scheduler
        )
        # Save after each epoch so an interrupted run can resume here.
        save_checkpoint(model, optimizer, current_epoch, CHECKPOINT_PATH)
class ExampleGPSModel(nn.Module):
    """Toy model combining an image embedding with an averaged GPS embedding.

    The image branch is conv -> ReLU -> flatten -> linear. The first linear
    layer is sized for 3x224x224 inputs. All GPS rows passed to ``forward``
    are embedded and averaged into one vector that is appended to every
    image embedding. The output has 10 logits per image.

    Args:
        gps_queue_len: Number of ``(lat, lon)`` rows kept in the GPS queue.
    """

    def __init__(self, gps_queue_len=10):
        super().__init__()
        self.conv = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.flatten = nn.Flatten()
        self.fc_img = nn.Linear(16 * 224 * 224, 32)
        self.fc_gps = nn.Linear(2, 32)
        self.fc_out = nn.Linear(64, 10)
        self.gps_queue_len = gps_queue_len
        # Registered as a buffer so it follows the module across .to(device);
        # non-persistent so state_dict (and thus checkpoints) are unchanged.
        self.register_buffer(
            "_gps_queue",
            torch.zeros((gps_queue_len, 2), dtype=torch.float),
            persistent=False,
        )

    def forward(self, imgs, gps_all):
        x = self.conv(imgs)
        x = F.relu(x)
        x = self.flatten(x)
        x = self.fc_img(x)
        g = self.fc_gps(gps_all)
        # Average all GPS embeddings
        if g.dim() == 2:
            g = g.mean(dim=0, keepdim=True)
        combined = torch.cat([x, g.repeat(x.size(0), 1)], dim=1)
        out = self.fc_out(combined)
        return out

    def get_gps_queue(self):
        """Return the current GPS queue tensor."""
        return self._gps_queue

    @torch.no_grad()
    def dequeue_and_enqueue(self, new_gps):
        """Drop the oldest rows and append ``new_gps`` at the end of the queue.

        If ``new_gps`` has more rows than the queue holds, only its last
        ``gps_queue_len`` rows are kept. An empty batch is a no-op.
        """
        queue = self._gps_queue
        new_gps = new_gps.detach().to(device=queue.device, dtype=queue.dtype)
        overflow = max(0, new_gps.shape[0] - self.gps_queue_len)
        new_gps = new_gps[overflow:]
        count = new_gps.shape[0]
        if count == 0:
            return
        rolled = torch.roll(queue, shifts=-count, dims=0)
        rolled[-count:] = new_gps
        self._gps_queue = rolled
 if __name__ == "__main__":
     # ========== Example usage: build dataset/dataloader ==========
     gps_map = load_exif_gps_metadata(METADATA_FILE)
     transform = transforms.Compose([
         transforms.Resize((224, 224)),
         transforms.ToTensor(),
     ])
     # Training is optional: with no GPS metadata or no images directory
     # there is nothing to train on, and the app must still come up.
     train_dataset = None
     if gps_map and os.path.isdir(IMAGES_DIR):
         train_dataset = GPSImageDataset(IMAGES_DIR, gps_map, transform=transform)
     if train_dataset is not None and len(train_dataset) > 0:
         train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
         # ========== Create model & optimizer ==========
         device = "cuda" if torch.cuda.is_available() else "cpu"
         model = ExampleGPSModel().to(device)
         optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
         # ========== Continuous training example (5 epochs) ==========
         continuous_train(
             train_dataloader=train_dataloader,
             model=model,
             optimizer=optimizer,
             device=device,
             start_epoch=1,   # not used if there's a checkpoint
             max_epochs=5
         )
         print("Done training. Launching Gradio app...")
     else:
         # NOTE(review): a shuffling DataLoader over an empty dataset is
         # expected to fail at construction; skipping avoids that path.
         print("No GPS-tagged images found; skipping training. Launching Gradio app...")
     # ========== Launch Gradio ==========
     app.launch(server_name="0.0.0.0", server_port=7860)