support control dataset

- ui/src/app/api/hf-hub/route.ts +99 -28
- ui/src/app/api/hf-jobs/route.ts +108 -50
- ui/src/components/HFJobsWorkflow.tsx +123 -6
ui/src/app/api/hf-hub/route.ts
CHANGED

@@ -2,11 +2,12 @@ import { NextRequest, NextResponse } from 'next/server';
 import { whoAmI, createRepo, uploadFiles, datasetInfo } from '@huggingface/hub';
 import { readdir, stat } from 'fs/promises';
 import path from 'path';
+import { pathToFileURL } from 'url';
 
 export async function POST(request: NextRequest) {
   try {
     const body = await request.json();
-    const { action, token, namespace, datasetName, datasetPath, datasetId } = body;
+    const { action, token, namespace, datasetName, datasetPath, datasetId, artifacts, manifest } = body;
 
     if (!token) {
       return NextResponse.json({ error: 'HF token is required' }, { status: 400 });

@@ -46,43 +47,113 @@ export async function POST(request: NextRequest) {
 
       case 'uploadDataset':
         try {
-          if (!namespace || !datasetName
+          if (!namespace || !datasetName) {
             return NextResponse.json({ error: 'Missing required parameters' }, { status: 400 });
           }
 
           const repoId = `datasets/${namespace}/${datasetName}`;
-            return NextResponse.json({ error: 'Dataset path does not exist' }, { status: 400 });
+          const structuredArtifacts = Array.isArray(artifacts) ? artifacts : [];
+          const hasStructuredArtifacts = structuredArtifacts.length > 0;
+
+          if (!hasStructuredArtifacts && !datasetPath) {
+            return NextResponse.json({ error: 'Dataset path could not be resolved' }, { status: 400 });
           }
 
+          const filesToUpload: { path: string; content: any }[] = [];
+          const uploadedPaths = new Set<string>();
+
+          const normalizeRepoPath = (value: string) => value.replace(/\\/g, '/').replace(/^\/+/, '');
+
+          const addUploadContent = (repoFilePath: string, content: any) => {
+            const normalizedRepoPath = normalizeRepoPath(repoFilePath);
+            if (!normalizedRepoPath || uploadedPaths.has(normalizedRepoPath)) {
+              return;
+            }
+            uploadedPaths.add(normalizedRepoPath);
+            filesToUpload.push({ path: normalizedRepoPath, content });
+          };
+
+          const addUploadFile = (absolutePath: string, repoFilePath: string) => {
+            addUploadContent(repoFilePath, pathToFileURL(absolutePath));
+          };
+
+          const walkDirectory = async (basePath: string, repoPrefix: string) => {
+            const entries = await readdir(basePath, { withFileTypes: true });
+            for (const entry of entries) {
+              const entryPath = path.join(basePath, entry.name);
+              if (entry.isDirectory()) {
+                const nextPrefix = repoPrefix ? `${repoPrefix}/${entry.name}` : entry.name;
+                await walkDirectory(entryPath, nextPrefix);
+              } else if (entry.isFile()) {
+                const repoFilePath = repoPrefix ? `${repoPrefix}/${entry.name}` : entry.name;
+                addUploadFile(entryPath, repoFilePath);
+              }
+            }
+          };
+
+          const processArtifact = async (localPath: string, repoPath: string) => {
+            const resolvedPath = path.resolve(localPath);
+            let stats;
+            try {
+              stats = await stat(resolvedPath);
+            } catch {
+              throw new Error(`Dataset path does not exist: ${localPath}`);
+            }
+
+            const normalizedRepoPrefix = repoPath ? normalizeRepoPath(repoPath) : '';
+
+            if (stats.isDirectory()) {
+              await walkDirectory(resolvedPath, normalizedRepoPrefix);
+            } else if (stats.isFile()) {
+              let destination = normalizedRepoPrefix;
+              if (!destination || destination.endsWith('/')) {
+                destination = `${destination}${path.basename(resolvedPath)}`;
+              } else if (!path.posix.extname(destination)) {
+                destination = `${destination}/${path.basename(resolvedPath)}`;
+              }
+              addUploadFile(resolvedPath, destination);
+            } else {
+              throw new Error(`Unsupported artifact type for path: ${localPath}`);
+            }
+          };
+
+          if (hasStructuredArtifacts) {
+            for (const artifact of structuredArtifacts) {
+              if (!artifact?.localPath || !artifact?.repoPath) {
+                continue;
+              }
+              await processArtifact(artifact.localPath, artifact.repoPath);
             }
+          } else {
+            const resolvedDatasetPath = path.resolve(datasetPath);
+            let datasetStats;
+            try {
+              datasetStats = await stat(resolvedDatasetPath);
+            } catch {
+              return NextResponse.json({ error: 'Dataset path does not exist' }, { status: 400 });
+            }
+
+            if (!datasetStats.isDirectory()) {
+              return NextResponse.json({ error: 'Dataset path must be a directory' }, { status: 400 });
+            }
+
+            await walkDirectory(resolvedDatasetPath, '');
+          }
+
+          if (manifest) {
+            addUploadContent('manifest.json', Buffer.from(JSON.stringify(manifest, null, 2), 'utf-8'));
           }
 
-          if (filesToUpload.length
-              repo: repoId,
-              accessToken: token,
-              files: filesToUpload,
-            });
+          if (filesToUpload.length === 0) {
+            return NextResponse.json({ error: 'No files found to upload for dataset' }, { status: 400 });
           }
 
+          await uploadFiles({
+            repo: repoId,
+            accessToken: token,
+            files: filesToUpload,
+          });
+
           return NextResponse.json({ success: true, repoId });
         } catch (error: any) {
           console.error('Upload error:', error);

@@ -162,4 +233,4 @@ export async function POST(request: NextRequest) {
     console.error('HF Hub API error:', error);
     return NextResponse.json({ error: error.message || 'Internal server error' }, { status: 500 });
   }
-}
+}
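Reviewer note: the `uploadDataset` action now accepts two request shapes: the legacy flat `datasetPath` (walked recursively as before) and a new structured form, where the client sends an `artifacts` array of local-to-repo path mappings plus a `manifest` that the route serializes into `manifest.json` at the repo root. A minimal sketch of the structured body; the token, namespace, and paths below are illustrative placeholders, not values from this commit:

const uploadDatasetBody = {
  action: 'uploadDataset',
  token: 'hf_xxx',                  // caller's HF access token (placeholder)
  namespace: 'my-user',             // hypothetical namespace
  datasetName: 'my-control-dataset',
  // Each artifact maps a local file or directory to a path inside the dataset repo.
  artifacts: [
    { localPath: '/data/train/images', repoPath: 'datasets/dataset_0/images' },
    { localPath: '/data/train/control', repoPath: 'datasets/dataset_0/control' },
  ],
  // The manifest mirrors the repo layout so the training job can re-resolve paths.
  manifest: {
    datasets: [{ folder_path: 'datasets/dataset_0/images', control_path: 'datasets/dataset_0/control' }],
    samples: [{ index: 0, ctrl_img: 'samples/ctrl/sample_0.png' }],
  },
};

When `artifacts` is empty or absent, the route falls back to walking `datasetPath`, so existing callers keep working.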
ui/src/app/api/hf-jobs/route.ts
CHANGED

@@ -205,46 +205,41 @@ def normalize_repo_id(dataset_repo: str) -> str:
 def copy_dataset_files(source_dir: str, local_path: str):
     print(f"Collecting data files from {source_dir}")
 
-                print(f"Error copying text file {txt_file}: {txt_error}")
-                continue
-
-    print(f"Prepared {len(image_files)} images and {captions_to_copy} captions in {local_path}")
-    return len(image_files), captions_to_copy
+    image_exts = {'.jpg', '.jpeg', '.png', '.webp', '.bmp'}
+    copied_images = 0
+    copied_captions = 0
+
+    for root, _, files in os.walk(source_dir):
+        for file_name in files:
+            ext = os.path.splitext(file_name)[1].lower()
+            src_path = os.path.join(root, file_name)
+            rel_path = os.path.relpath(src_path, source_dir)
+            dest_path = os.path.join(local_path, rel_path)
+
+            dest_dir = os.path.dirname(dest_path)
+            if dest_dir and not os.path.exists(dest_dir):
+                os.makedirs(dest_dir, exist_ok=True)
+
+            if ext in image_exts:
+                try:
+                    shutil.copy2(src_path, dest_path)
+                    copied_images += 1
+                except Exception as img_error:
+                    print(f"Error copying image {src_path}: {img_error}")
+            elif ext == '.txt':
+                try:
+                    shutil.copy2(src_path, dest_path)
+                    copied_captions += 1
+                except Exception as txt_error:
+                    print(f"Error copying text file {src_path}: {txt_error}")
+            else:
+                try:
+                    shutil.copy2(src_path, dest_path)
+                except Exception as other_error:
+                    print(f"Error copying file {src_path}: {other_error}")
+
+    print(f"Prepared {copied_images} images and {copied_captions} captions in {local_path}")
+    return copied_images, copied_captions
 
 
 def download_dataset(dataset_repo: str, local_path: str):

@@ -324,18 +319,81 @@ def create_config(dataset_path: str, output_path: str):
     config_str = config_str.replace('true', 'True').replace('false', 'False').replace('null', 'None')
     config = eval(config_str)
 
+    def resolve_manifest_value(value):
+        if value is None:
+            return None
+        if isinstance(value, list):
+            resolved_list = [resolve_manifest_value(v) for v in value]
+            return [v for v in resolved_list if v is not None]
+
+        if not isinstance(value, str) or value.strip() == "":
+            return None
+
+        normalized = value.replace("\\", "/")
+        parts = [part for part in normalized.split("/") if part not in ("", ".")]
+        return os.path.normpath(os.path.join(dataset_path, *parts))
+
+    manifest_path = os.path.join(dataset_path, "manifest.json")
+    manifest_data = None
+    if os.path.isfile(manifest_path):
+        try:
+            with open(manifest_path, "r", encoding="utf-8") as manifest_file:
+                manifest_data = json.load(manifest_file)
+        except Exception as manifest_error:
+            print(f"Failed to load dataset manifest: {manifest_error}")
+            manifest_data = None
+
+    process_config = config["config"]["process"][0]
+
+    datasets_config = process_config.get("datasets", [])
+    if manifest_data and isinstance(manifest_data, dict) and "datasets" in manifest_data:
+        manifest_datasets = manifest_data.get("datasets", [])
+        for idx, dataset_cfg in enumerate(datasets_config):
+            manifest_entry = manifest_datasets[idx] if idx < len(manifest_datasets) else {}
+            if isinstance(manifest_entry, dict):
+                for key, value in manifest_entry.items():
+                    resolved_value = resolve_manifest_value(value)
+                    if resolved_value is not None and resolved_value != []:
+                        dataset_cfg[key] = resolved_value
+                        if key == "folder_path":
+                            dataset_cfg["dataset_path"] = resolved_value
+
+            if "folder_path" not in dataset_cfg or not dataset_cfg["folder_path"]:
+                dataset_cfg["folder_path"] = dataset_path
+                dataset_cfg["dataset_path"] = dataset_path
+    else:
+        for dataset_cfg in datasets_config:
+            dataset_cfg["folder_path"] = dataset_path
+            dataset_cfg["dataset_path"] = dataset_path
+
+    samples_config = process_config.get("sample", {}).get("samples", [])
+    if manifest_data and isinstance(manifest_data, dict):
+        manifest_samples = manifest_data.get("samples", [])
+        for sample_entry in manifest_samples:
+            if not isinstance(sample_entry, dict):
+                continue
+            index = sample_entry.get("index")
+            ctrl_img_rel = sample_entry.get("ctrl_img")
+            if (
+                isinstance(index, int)
+                and 0 <= index < len(samples_config)
+                and ctrl_img_rel is not None
+            ):
+                resolved_ctrl_img = resolve_manifest_value(ctrl_img_rel)
+                if resolved_ctrl_img:
+                    samples_config[index]["ctrl_img"] = resolved_ctrl_img
+
+    # Update training folder for cloud environment
+    process_config["training_folder"] = output_path
+
     # Remove sqlite_db_path as it's not needed for cloud training
-    if "sqlite_db_path" in
-        del
+    if "sqlite_db_path" in process_config:
+        del process_config["sqlite_db_path"]
+
     # Also change trainer type from ui_trainer to standard trainer to avoid UI dependencies
-    if
+    if process_config["type"] == "ui_trainer":
+        process_config["type"] = "sd_trainer"
+
     return config
 
 def upload_results(output_path: str, model_name: str, namespace: str, token: str, config: dict):
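Reviewer note on the manifest contract: `create_config` loads `manifest.json` from the downloaded dataset root and rewrites each manifest-relative path to an absolute one by joining it onto `dataset_path` (backslashes normalized to `/`, empty and `.` segments dropped); a resolved `folder_path` is also mirrored into `dataset_path` on the same dataset entry. A sketch of a hypothetical manifest and the absolute paths it would resolve to, assuming the dataset was downloaded to `/data/dataset`; all values are illustrative:

// Hypothetical manifest.json content; resolution results shown in comments.
const manifest = {
  datasets: [
    {
      folder_path: 'datasets/dataset_0/images',   // -> /data/dataset/datasets/dataset_0/images
      control_path: 'datasets/dataset_0/control', // -> /data/dataset/datasets/dataset_0/control
    },
  ],
  samples: [
    { index: 0, ctrl_img: 'samples/ctrl/sample_0.png' }, // -> /data/dataset/samples/ctrl/sample_0.png
  ],
};

Values that are missing, empty, or non-string resolve to None and leave the existing config value untouched, and a dataset entry that ends up without a usable folder_path falls back to the dataset root.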
ui/src/components/HFJobsWorkflow.tsx
CHANGED

@@ -6,6 +6,123 @@ import { SelectInput, TextInput, Checkbox } from '@/components/formInputs';
 import Card from '@/components/Card';
 import { apiClient } from '@/utils/api';
 import { JobConfig } from '@/types';
+
+type DatasetUploadArtifact = {
+  localPath: string;
+  repoPath: string;
+};
+
+type DatasetManifest = {
+  datasets: any[];
+  samples: any[];
+};
+
+type DatasetUploadPlan = {
+  artifacts: DatasetUploadArtifact[];
+  manifest: DatasetManifest;
+};
+
+const ensurePosixPath = (value: string) => value.replace(/\\/g, '/').replace(/^\/+/, '');
+
+const buildDatasetUploadPlan = (jobConfig: JobConfig): DatasetUploadPlan => {
+  const datasetEntries = jobConfig?.config?.process?.[0]?.datasets ?? [];
+  const sampleEntries = jobConfig?.config?.process?.[0]?.sample?.samples ?? [];
+
+  const artifactMap = new Map<string, DatasetUploadArtifact>();
+  const manifestDatasets: any[] = [];
+  const manifestSamples: any[] = [];
+
+  const recordArtifact = (localPath: string | null | undefined, repoPath: string) => {
+    if (!localPath) {
+      return;
+    }
+    const trimmedLocalPath = localPath.trim();
+    if (trimmedLocalPath === '') {
+      return;
+    }
+    const normalizedRepoPath = ensurePosixPath(repoPath);
+    if (!artifactMap.has(normalizedRepoPath)) {
+      artifactMap.set(normalizedRepoPath, {
+        localPath: trimmedLocalPath,
+        repoPath: normalizedRepoPath,
+      });
+    }
+  };
+
+  const pathFieldMappings: Record<string, string> = {
+    control_path: 'control',
+    inpaint_path: 'inpaint',
+    mask_path: 'mask',
+    unconditional_path: 'unconditional',
+    clip_image_path: 'clip_images',
+  };
+
+  datasetEntries.forEach((dataset, index) => {
+    const datasetPrefix = `datasets/dataset_${index}`;
+    const manifestEntry: Record<string, any> = {};
+
+    const folderPath = (dataset as any).folder_path as string | null | undefined;
+    if (folderPath && folderPath.trim() !== '') {
+      const repoPath = `${datasetPrefix}/images`;
+      recordArtifact(folderPath, repoPath);
+      manifestEntry.folder_path = ensurePosixPath(repoPath);
+    }
+
+    Object.entries(pathFieldMappings).forEach(([field, suffix]) => {
+      const rawValue = (dataset as any)[field];
+      if (rawValue === null || rawValue === undefined) {
+        return;
+      }
+
+      const values = Array.isArray(rawValue) ? rawValue : [rawValue];
+      const normalizedValues = values
+        .map(value => (typeof value === 'string' ? value.trim() : value))
+        .filter(value => typeof value === 'string' && value !== '') as string[];
+
+      if (normalizedValues.length === 0) {
+        return;
+      }
+
+      if (normalizedValues.length === 1) {
+        const repoPath = `${datasetPrefix}/${suffix}`;
+        recordArtifact(normalizedValues[0], repoPath);
+        manifestEntry[field] = ensurePosixPath(repoPath);
+      } else {
+        const repoLocations = normalizedValues.map((value, idx) => {
+          const repoPath = `${datasetPrefix}/${suffix}_${idx}`;
+          recordArtifact(value, repoPath);
+          return ensurePosixPath(repoPath);
+        });
+        manifestEntry[field] = repoLocations;
+      }
+    });
+
+    manifestDatasets.push(manifestEntry);
+  });
+
+  sampleEntries.forEach((sample, index) => {
+    const ctrlImg = (sample as any)?.ctrl_img as string | undefined;
+    if (!ctrlImg || ctrlImg.trim() === '') {
+      return;
+    }
+
+    const trimmedCtrlImg = ctrlImg.trim();
+    const extensionMatch = trimmedCtrlImg.match(/\.([a-zA-Z0-9]+)$/);
+    const extension = extensionMatch ? extensionMatch[0].toLowerCase() : '.png';
+    const repoPath = ensurePosixPath(`samples/ctrl/sample_${index}${extension}`);
+
+    recordArtifact(trimmedCtrlImg, repoPath);
+    manifestSamples.push({ index, ctrl_img: repoPath });
+  });
+
+  return {
+    artifacts: Array.from(artifactMap.values()),
+    manifest: {
+      datasets: manifestDatasets,
+      samples: manifestSamples,
+    },
+  };
+};
 import useSettings from '@/hooks/useSettings';
 import { upsertJob } from '@/utils/storage/jobStorage';
 import { useAuth } from '@/contexts/AuthContext';

@@ -126,19 +243,19 @@ export default function HFJobsWorkflow({ jobConfig, onComplete }: HFJobsWorkflow
         throw new Error('Failed to create dataset repository');
       }
 
-      if (!
-        throw new Error('Dataset path could not be resolved. Please ensure the dataset
+      const uploadPlan = buildDatasetUploadPlan(jobConfig);
+
+      if (!uploadPlan.artifacts || uploadPlan.artifacts.length === 0) {
+        throw new Error('Dataset path could not be resolved. Please ensure the dataset folders exist on the host.');
       }
 
-      // Upload dataset files
       const uploadResponse = await apiClient.post('/api/hf-hub', {
         action: 'uploadDataset',
        token: effectiveToken,
         namespace: resolvedNamespace,
         datasetName,
+        artifacts: uploadPlan.artifacts,
+        manifest: uploadPlan.manifest,
       });
 
       if (uploadResponse.data.success) {
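Reviewer note: with the `DatasetUploadPlan` types and `buildDatasetUploadPlan` from the hunk above in scope, a job config containing one dataset (a `folder_path` plus a single `control_path`) and one sample with a `ctrl_img` would produce a plan like this sketch; the local paths are illustrative placeholders:

const expectedPlan: DatasetUploadPlan = {
  artifacts: [
    { localPath: '/home/user/ds/images', repoPath: 'datasets/dataset_0/images' },
    { localPath: '/home/user/ds/control', repoPath: 'datasets/dataset_0/control' },
  ],
  manifest: {
    datasets: [{ folder_path: 'datasets/dataset_0/images', control_path: 'datasets/dataset_0/control' }],
    samples: [{ index: 0, ctrl_img: 'samples/ctrl/sample_0.png' }],
  },
};

List-valued fields such as multiple control paths get numbered repo suffixes (control_0, control_1, ...) and a list in the manifest, and duplicate repo paths are deduplicated through the artifact map.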
|