File size: 23,042 Bytes
c438f1b
 
 
 
 
 
 
 
 
 
 
7528710
c438f1b
 
 
 
f9583f1
0a5b850
 
 
 
 
129c04a
 
2489bda
 
 
129c04a
 
c438f1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a20e4c3
 
 
 
 
 
951cf8d
a20e4c3
 
 
2489bda
c438f1b
 
 
 
 
a20e4c3
c438f1b
a20e4c3
 
 
 
 
c438f1b
 
 
f9583f1
 
 
c438f1b
f9583f1
 
 
d0141ad
f9583f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c438f1b
 
 
a20e4c3
 
 
951cf8d
c438f1b
 
a20e4c3
 
 
 
951cf8d
a20e4c3
c438f1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0a5b850
7528710
c438f1b
0a5b850
 
7528710
 
 
 
 
 
0a5b850
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129c04a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0a5b850
 
129c04a
 
 
 
 
0a5b850
 
129c04a
 
 
 
 
0a5b850
 
 
 
 
 
 
 
7528710
0a5b850
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7528710
0a5b850
 
 
 
 
 
 
 
 
7528710
0a5b850
 
 
 
 
 
 
 
 
 
 
 
 
c438f1b
0a5b850
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7528710
0a5b850
2489bda
0a5b850
7528710
0a5b850
 
 
 
 
 
 
 
c438f1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2489bda
c438f1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2489bda
 
c438f1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a20e4c3
 
 
 
 
 
 
 
 
 
 
c438f1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2489bda
c438f1b
2489bda
c438f1b
 
 
 
 
 
7528710
 
 
 
 
 
92609ba
7528710
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
"""
HuggingFace to ModelScope Migration Tool

This Gradio app enables migration of models and datasets from HuggingFace to ModelScope.
"""

import os
import shutil
import tempfile
from pathlib import Path
from typing import Tuple, Optional
import argparse

import gradio as gr
from huggingface_hub import snapshot_download, HfApi
from modelscope.hub.api import HubApi
from modelscope.hub.constants import Licenses, ModelVisibility, DatasetVisibility
import sys
import io
import threading
import queue
import time
import re

# Set ModelScope domain to use the international site
os.environ.setdefault("MODELSCOPE_DOMAIN", "modelscope.ai")

# Regex to match ANSI escape codes (like [A, [2K, etc.)
ANSI_ESCAPE = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')


class MigrationTool:
    """Handles migration of models and datasets between HuggingFace and ModelScope.

    Workflow: download a snapshot of the source repo into a temp directory,
    create the destination repo on ModelScope if needed, upload the folder,
    then remove the temp directory.
    """

    def __init__(self):
        # Path of the temporary download directory created by download_from_hf;
        # None when no migration is in flight (cleanup() resets it).
        self.temp_dir = None

    def download_from_hf(
        self,
        repo_id: str,
        repo_type: str = "model",
        token: Optional[str] = None
    ) -> Tuple[bool, str, Optional[str]]:
        """Download a repository from HuggingFace.

        Args:
            repo_id: HuggingFace repository ID (e.g., 'username/repo-name')
            repo_type: Type of repository ('model' or 'dataset')
            token: HuggingFace authentication token

        Returns:
            Tuple of (success, message, local_path). local_path is None on failure.
        """
        try:
            # Fresh temp dir per run; remembered on self so cleanup() can remove it
            # even if the upload step fails later.
            self.temp_dir = tempfile.mkdtemp(prefix="hf2ms_")

            # Materialize real files (no symlinks) so the folder can be uploaded as-is.
            local_path = snapshot_download(
                repo_id=repo_id,
                repo_type=repo_type,
                local_dir=self.temp_dir,
                local_dir_use_symlinks=False,
                token=token
            )

            return True, f"βœ“ Successfully downloaded {repo_type} from HuggingFace", local_path
        except Exception as e:
            # Errors are reported as a (False, message, None) tuple rather than raised,
            # so the UI can show them inline.
            return False, f"βœ— Download failed: {str(e)}", None

    def upload_to_ms(
        self,
        local_path: str,
        repo_id: str,
        token: str,
        repo_type: str = "model",
        visibility: str = "public",
        license_type: str = "apache-2.0",
        chinese_name: Optional[str] = None
    ) -> Tuple[bool, str]:
        """Upload a repository to ModelScope.

        Args:
            local_path: Local path to the repository
            repo_id: ModelScope repository ID (e.g., 'username/repo-name')
            token: ModelScope authentication token
            repo_type: Type of repository ('model' or 'dataset')
            visibility: Repository visibility ('public' or 'private')
            license_type: License type
            chinese_name: Optional Chinese name for the repository

        Returns:
            Tuple of (success, message)
        """
        try:
            # Clean and validate token
            token = token.strip()
            if not token:
                return False, "βœ— ModelScope token is empty"

            # Create HubApi instance and login explicitly
            api = HubApi()
            try:
                api.login(token)
            except Exception as login_error:
                return False, f"βœ— ModelScope Login failed: {str(login_error)}\n\nπŸ’‘ Tip: Ensure you are using an 'SDK Token' from https://www.modelscope.ai/my/myaccesstoken. The token usually starts with 'ms-'."

            # Map UI license names to ModelScope SDK constants; "other" means
            # "don't pass a license" (None is filtered out below).
            license_map = {
                "apache-2.0": Licenses.APACHE_V2,
                "mit": Licenses.MIT,
                "gpl-2.0": Licenses.GPL_V2,
                "gpl-3.0": Licenses.GPL_V3,
                "lgpl-2.1": Licenses.LGPL_V2_1,
                "lgpl-3.0": Licenses.LGPL_V3,
                "afl-3.0": Licenses.AFL_V3,
                "ecl-2.0": Licenses.ECL_V2,
                "other": None,
            }
            lic = license_map.get(license_type.lower(), Licenses.APACHE_V2)

            # Check if repository exists
            repo_exists = api.repo_exists(repo_id=repo_id, repo_type=repo_type, token=token)

            # Create repository if it doesn't exist
            # Important: We must create with the correct visibility BEFORE upload_folder,
            # because upload_folder will create a public repo by default if repo doesn't exist
            if not repo_exists:
                try:
                    if repo_type == "model":
                        # Determine visibility for models (1=private, 5=public)
                        vis = ModelVisibility.PUBLIC if visibility == "public" else ModelVisibility.PRIVATE
                        # Build parameters, only include license if not None
                        create_params = {
                            "model_id": repo_id,
                            "visibility": vis,
                            "token": token,
                        }
                        if lic is not None:
                            create_params["license"] = lic
                        if chinese_name:
                            create_params["chinese_name"] = chinese_name
                        api.create_model(**create_params)
                    else:
                        # Determine visibility for datasets (1=private, 5=public)
                        vis = DatasetVisibility.PUBLIC if visibility == "public" else DatasetVisibility.PRIVATE
                        # For datasets, need to split repo_id into namespace and name
                        parts = repo_id.split('/')
                        if len(parts) != 2:
                            return False, f"βœ— Invalid dataset ID format: {repo_id}. Must be 'namespace/name'"
                        namespace, dataset_name = parts
                        # Build parameters, only include license if not None
                        create_params = {
                            "dataset_name": dataset_name,
                            "namespace": namespace,
                            "visibility": vis,
                        }
                        if lic is not None:
                            create_params["license"] = lic
                        if chinese_name:
                            create_params["chinese_name"] = chinese_name
                        api.create_dataset(**create_params)
                except Exception as create_error:
                    error_msg = str(create_error)
                    # Only ignore if repo already exists (race condition)
                    if "already exists" not in error_msg.lower():
                        return False, f"βœ— Failed to create repository: {error_msg}"

            # Push the model/dataset
            if repo_type == "model":
                api.upload_folder(
                    repo_id=repo_id,
                    folder_path=local_path,
                    token=token,
                )
            else:
                # For datasets, use upload_folder with repo_type='dataset'
                api.upload_folder(
                    repo_id=repo_id,
                    folder_path=local_path,
                    token=token,
                    repo_type="dataset"
                )

            return True, f"βœ“ Successfully uploaded {repo_type} to ModelScope"
        except Exception as e:
            return False, f"βœ— Upload failed: {str(e)}"

    def cleanup(self):
        """Clean up temporary files.

        Best-effort: a failure to delete only prints a warning (and keeps
        self.temp_dir set), it never raises.
        """
        if self.temp_dir and os.path.exists(self.temp_dir):
            try:
                shutil.rmtree(self.temp_dir)
                self.temp_dir = None
            except Exception as e:
                print(f"Warning: Failed to clean up temporary directory: {e}")

    def migrate(
        self,
        hf_token: str,
        ms_token: str,
        hf_repo_id: str,
        ms_repo_id: str,
        repo_type: str,
        visibility: str,
        license_type: str,
        chinese_name: Optional[str] = None,
        progress=None
    ) -> str:
        """Perform the complete migration process with real-time console log capture.

        This is a generator: it yields the accumulated log text repeatedly so a
        Gradio Textbox can stream progress. The actual download/upload runs in a
        background thread while stdout/stderr are tee'd into a queue.

        Args:
            hf_token / ms_token: authentication tokens for both hubs.
            hf_repo_id / ms_repo_id: source and destination repository IDs.
            repo_type: 'model' or 'dataset'.
            visibility: 'public' or 'private' for the destination repo.
            license_type: license name as shown in the UI dropdown.
            chinese_name: optional Chinese display name for the destination repo.
            progress: optional Gradio progress tracker; None in CLI mode.

        Yields:
            The full status text (all captured log lines joined by newlines).
        """

        # If no progress tracker is provided (CLI mode), we just skip progress updates
        # Gradio will pass its own tracker when called from the UI
        def update_progress(val, desc=""):
            if progress:
                progress(val, desc=desc)

        log_queue = queue.Queue()
        output_lines = []

        # Helper to capture console output and send it to the queue
        class StreamToQueue(io.StringIO):
            def __init__(self, original_stream, q):
                super().__init__()
                self.original_stream = original_stream
                self.q = q
            def write(self, s):
                if s:
                    # Write to original stream (console) and our queue (Gradio)
                    self.original_stream.write(s)
                    self.original_stream.flush()
                    self.q.put(s)
            def flush(self):
                self.original_stream.flush()

        def update_output():
            """Gather all pending messages from the queue and return the full status.

            Returns (joined_text, new_data_flag) where new_data_flag tells the
            caller whether anything changed since the last call.
            """
            new_data = False
            while not log_queue.empty():
                try:
                    raw_msg = log_queue.get_nowait()
                    # 1. Strip ANSI escape codes (those [A, [m, etc.)
                    msg = ANSI_ESCAPE.sub('', raw_msg)
                    if not msg:
                        continue

                    # 2. Process the message line by line
                    # We handle \r by treating it as a signal to potentially overwrite the last line
                    # We handle \n as a signal to start a new line
                    for line in msg.replace('\r', '\n').split('\n'):
                        clean_line = line.strip()
                        if not clean_line:
                            continue

                        # 3. Smart Progress Bar Handling
                        # Identify if this line is a progress bar update
                        # Progress bars usually look like: "Label: 45%|### | ..."
                        is_progress = '%' in clean_line and '|' in clean_line and ('[' in clean_line or ']' in clean_line)

                        if is_progress:
                            # Extract the label (everything before the progress bar/percentage)
                            # This helps us identify WHICH progress bar to update
                            label = clean_line.split('|')[0].split('%')[0].strip()
                            # If the label ends with a number (like '45'), try to get the text before it
                            label = re.sub(r'\d+$', '', label).strip()

                            found = False
                            # Look at the last few lines to see if we're updating an existing bar
                            # We only look back ~10 lines to keep it fast
                            for i in range(len(output_lines) - 1, max(-1, len(output_lines) - 11), -1):
                                if label and label in output_lines[i] and ('%' in output_lines[i] or '|' in output_lines[i]):
                                    output_lines[i] = clean_line
                                    found = True
                                    new_data = True
                                    break

                            if not found:
                                output_lines.append(clean_line)
                                new_data = True
                        else:
                            # Regular log message
                            output_lines.append(clean_line)
                            new_data = True

                except queue.Empty:
                    break

            # Keep the output box from growing infinitely (last 1000 lines)
            if len(output_lines) > 1000:
                output_lines[:] = output_lines[-1000:]

            return "\n".join(output_lines), new_data

        # Thread-safe migration execution storage
        results = {"success": False, "message": "", "finished": False}

        def run_migration():
            try:
                # Clean inputs
                update_progress(0, desc="Validating inputs...")
                nonlocal hf_token, ms_token, hf_repo_id, ms_repo_id
                hf_token = hf_token.strip() if hf_token else ""
                ms_token = ms_token.strip() if ms_token else ""
                hf_repo_id = hf_repo_id.strip() if hf_repo_id else ""
                ms_repo_id = ms_repo_id.strip() if ms_repo_id else ""

                if not hf_token or not ms_token or not hf_repo_id or not ms_repo_id:
                    results["message"] = "βœ— Error: All tokens and repository IDs are required"
                    results["finished"] = True
                    return

                if "/" not in ms_repo_id:
                    results["message"] = "βœ— Error: ModelScope Repo ID must include your namespace (e.g., 'username/repo-name')"
                    results["finished"] = True
                    return

                # 1. Download
                update_progress(0.1, desc="Downloading from HuggingFace...")
                print(f"⬇️  Starting download from HuggingFace: {hf_repo_id}...")
                success, msg, local_path = self.download_from_hf(hf_repo_id, repo_type, hf_token)
                print(msg)
                if not success:
                    results["message"] = msg
                    results["finished"] = True
                    return

                # 2. Upload
                update_progress(0.4, desc="Uploading to ModelScope...")
                print(f"\n⬆️  Starting upload to ModelScope: {ms_repo_id}...")
                success, msg = self.upload_to_ms(
                    local_path,
                    ms_repo_id,
                    ms_token,
                    repo_type,
                    visibility,
                    license_type,
                    chinese_name
                )
                print(msg)
                results["success"] = success
                results["message"] = msg

            except Exception as e:
                results["message"] = f"βœ— Unexpected error: {str(e)}"
            finally:
                print("\n🧹 Cleaning up temporary files...")
                self.cleanup()
                print("βœ“ Cleanup complete")
                results["finished"] = True

        # Redirect stdout and stderr to our queue
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = StreamToQueue(sys.stdout, log_queue)
        sys.stderr = StreamToQueue(sys.stderr, log_queue)

        try:
            # Start the migration in a background thread
            thread = threading.Thread(target=run_migration)
            thread.start()

            # Continuously yield updates until the migration thread completes
            while not results["finished"]:
                current_status, updated = update_output()
                if updated:
                    yield current_status
                time.sleep(0.1)

            # Wait for the worker to fully exit before we restore the streams,
            # so no late print() from it hits a restored/closed stream.
            thread.join()

            # Final capture of any remaining logs
            final_status, _ = update_output()

            # Append final results
            if results["success"]:
                update_progress(1.0, desc="Completed")
                final_status += f"\n\nβœ… Migration completed successfully!"
                # Bug fix: datasets live under /datasets/, not /models/, on ModelScope.
                repo_path = "models" if repo_type == "model" else "datasets"
                final_status += f"\nYour {repo_type} is available at: https://www.modelscope.ai/{repo_path}/{ms_repo_id}"
            else:
                update_progress(1.0, desc="Failed")
                final_status += f"\n\n❌ Migration failed: {results['message']}"

            yield final_status

        finally:
            # CRITICAL: Restore original streams so we don't break the whole app
            sys.stdout = old_stdout
            sys.stderr = old_stderr


def create_interface():
    """Create the Gradio interface.

    Builds a Blocks layout (auth tokens, repo details, license/name options),
    wires the migrate button to MigrationTool.migrate — a generator, so the
    status Textbox streams incremental log output — and returns the app.
    """

    # One shared tool instance for all clicks; per-run state (temp dir, logs)
    # is managed inside migrate() itself.
    migration_tool = MigrationTool()

    with gr.Blocks(title="HuggingFace to ModelScope Migration Tool") as app:
        # Header / usage instructions shown at the top of the page.
        gr.Markdown("""
        # πŸš€ HuggingFace to ModelScope Migration Tool

        Easily migrate your models and datasets from HuggingFace to ModelScope.

        ## πŸ“‹ Instructions:
        1. Get your **HuggingFace token** from: https://huggingface.co/settings/tokens
        2. Get your **ModelScope SDK token** from: https://www.modelscope.ai/my/myaccesstoken
        3. Fill in the repository details below
        4. Click "Start Migration"
        """)

        # Row 1: credentials (left) and repo type/visibility (right).
        with gr.Row():
            with gr.Column():
                gr.Markdown("### πŸ”‘ Authentication")
                hf_token = gr.Textbox(
                    label="HuggingFace Token",
                    type="password",
                    placeholder="hf_...",
                    info="Your HuggingFace access token"
                )
                ms_token = gr.Textbox(
                    label="ModelScope Token",
                    type="password",
                    placeholder="ms_...",
                    info="Your SDK token from modelscope.ai (usually starts with 'ms-')"
                )

            with gr.Column():
                gr.Markdown("### πŸ“¦ Repository Details")
                repo_type = gr.Radio(
                    choices=["model", "dataset"],
                    label="Repository Type",
                    value="model",
                    info="Select what you want to migrate"
                )
                visibility = gr.Radio(
                    choices=["public", "private"],
                    label="Visibility",
                    value="public",
                    info="Visibility of the repository on ModelScope"
                )

        # Row 2: source and destination repository IDs.
        with gr.Row():
            with gr.Column():
                hf_repo_id = gr.Textbox(
                    label="Source HuggingFace Repo ID",
                    placeholder="username/repo-name",
                    info="e.g., bert-base-uncased or username/my-model"
                )

            with gr.Column():
                ms_repo_id = gr.Textbox(
                    label="Destination ModelScope Repo ID",
                    placeholder="username/repo-name",
                    info="Your ModelScope username/repo-name"
                )

        # Row 3: license choice (must match the keys of upload_to_ms's license_map)
        # and the optional Chinese display name.
        with gr.Row():
            with gr.Column():
                license_type = gr.Dropdown(
                    choices=[
                        "apache-2.0", 
                        "mit", 
                        "gpl-2.0", 
                        "gpl-3.0", 
                        "lgpl-2.1", 
                        "lgpl-3.0", 
                        "afl-3.0", 
                        "ecl-2.0",
                        "other"
                    ],
                    label="License",
                    value="apache-2.0",
                    info="License for the repository"
                )

            with gr.Column():
                chinese_name = gr.Textbox(
                    label="Chinese Name (Optional)",
                    placeholder="ζ¨‘εž‹δΈ­ζ–‡εη§°",
                    info="Optional Chinese name for the repository"
                )

        migrate_btn = gr.Button("πŸš€ Start Migration", variant="primary", size="lg")

        # Streaming status display; each yield from migrate() replaces its content.
        output = gr.Textbox(
            label="Migration Status",
            lines=15,
            interactive=False
        )

        # Input order here must match migrate()'s positional parameters.
        migrate_btn.click(
            fn=migration_tool.migrate,
            inputs=[
                hf_token,
                ms_token,
                hf_repo_id,
                ms_repo_id,
                repo_type,
                visibility,
                license_type,
                chinese_name
            ],
            outputs=output
        )

        gr.Markdown("""
        ---
        ### πŸ“ Notes:
        - Large models may take some time to download and upload
        - Make sure you have enough disk space for temporary storage
        - Private repositories require appropriate token permissions
        - The tool will create the repository on ModelScope if it doesn't exist

        ### πŸ”— Resources:
        - [HuggingFace Hub](https://huggingface.co/)
        - [ModelScope](https://www.modelscope.ai/)
        - [HuggingFace Token Settings](https://huggingface.co/settings/tokens)
        - [ModelScope Token Settings](https://www.modelscope.ai/my/myaccesstoken)
        """)

    return app


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="HuggingFace to ModelScope Migration Tool")

    # Toggle between the Gradio UI (default) and a headless CLI run.
    parser.add_argument("--cli", action="store_true", help="Run in CLI mode instead of Gradio UI")

    # Server settings, used only when launching the Gradio app.
    parser.add_argument("--host", type=str, default="0.0.0.0", help="Host for Gradio app")
    parser.add_argument("--port", type=int, default=7860, help="Port for Gradio app")
    parser.add_argument("--share", action="store_true", help="Create a public link for Gradio app")

    # Migration settings, consumed only together with --cli.
    parser.add_argument("--hf-token", type=str, help="HuggingFace access token")
    parser.add_argument("--ms-token", type=str, help="ModelScope access token")
    parser.add_argument("--hf-repo", type=str, help="Source HuggingFace repo ID")
    parser.add_argument("--ms-repo", type=str, help="Destination ModelScope repo ID")
    parser.add_argument("--type", type=str, choices=["model", "dataset"], default="model", help="Repository type")
    parser.add_argument("--visibility", type=str, choices=["public", "private"], default="public", help="Repo visibility")
    parser.add_argument("--license", type=str, default="apache-2.0", help="Repo license")
    parser.add_argument("--chinese-name", type=str, help="Optional Chinese name for the repo")

    args = parser.parse_args()

    if not args.cli:
        # UI mode: build the Blocks app and serve it.
        create_interface().launch(
            server_name=args.host,
            server_port=args.port,
            share=args.share
        )
    else:
        mandatory = (args.hf_token, args.ms_token, args.hf_repo, args.ms_repo)
        if not all(mandatory):
            print("βœ— Error: CLI mode requires --hf-token, --ms-token, --hf-repo, and --ms-repo")
            sys.exit(1)

        tool = MigrationTool()
        print(f"πŸš€ Starting CLI Migration: {args.hf_repo} -> {args.ms_repo}")

        # migrate() is a generator that streams status text; in CLI mode we
        # simply drain it and remember the most recent snapshot.
        final_report = ""
        for snapshot in tool.migrate(
            args.hf_token,
            args.ms_token,
            args.hf_repo,
            args.ms_repo,
            args.type,
            args.visibility,
            args.license,
            args.chinese_name
        ):
            final_report = snapshot

        print("\n" + "=" * 50)
        print("Final Status:")
        print(final_report)