File size: 13,258 Bytes
119fd59
 
 
 
 
 
 
 
5a75fec
119fd59
 
 
 
 
 
 
 
5a75fec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119fd59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a75fec
119fd59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a75fec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119fd59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
#!/usr/bin/env python3
"""
Sync BitTransformerLM repository to HuggingFace Hub for OS launch.
Uploads all cleaned documentation and code with proper commit message.
"""

import os
import logging
import re
from pathlib import Path
from huggingface_hub import HfApi, login
from typing import Optional, List

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def scan_for_secrets(file_path: Path) -> List[str]:
    """Scan a file for potential secrets and tokens.

    The file is read as UTF-8 text (undecodable bytes are dropped) and checked
    against a fixed set of regular expressions covering well-known credential
    formats plus generic ``key = value`` style assignments. All patterns are
    applied case-insensitively.

    Args:
        file_path: Path of the file to scan.

    Returns:
        One human-readable finding per match, formatted as
        ``"<type> found at line <n>: <redacted preview>"``. The list is empty
        when nothing matched, or when the file could not be scanned (a warning
        is logged in that case).
    """
    secrets_found: List[str] = []

    # Patterns for common secrets
    secret_patterns = {
        'HuggingFace Token': r'hf_[A-Za-z0-9_]{30,}',
        'OpenAI API Key': r'sk-[A-Za-z0-9]{48}',
        'GitHub Token': r'gh[pousr]_[A-Za-z0-9_]{36,}',
        'AWS Access Key': r'AKIA[0-9A-Z]{16}',
        'Generic API Key': r'["\']?[Aa]pi[_-]?[Kk]ey["\']?\s*[:=]\s*["\']?[A-Za-z0-9_\-]{20,}["\']?',
        'Generic Token': r'["\']?[Tt]oken["\']?\s*[:=]\s*["\']?[A-Za-z0-9_\-]{20,}["\']?',
        'Generic Secret': r'["\']?[Ss]ecret["\']?\s*[:=]\s*["\']?[A-Za-z0-9_\-]{20,}["\']?',
    }

    def _redact(matched_text: str) -> str:
        # Findings are written to logs by the caller, so never echo the
        # credential itself: keep a short prefix to help locate it, mask the rest.
        return f"{matched_text[:4]}...[redacted, {len(matched_text)} chars]"

    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()

        for secret_type, pattern in secret_patterns.items():
            for match in re.finditer(pattern, content, re.IGNORECASE):
                # 1-based line number of the first character of the match.
                line_num = content.count('\n', 0, match.start()) + 1
                secrets_found.append(
                    f"{secret_type} found at line {line_num}: {_redact(match.group())}"
                )

    except Exception as e:
        # Best effort: an unreadable file is reported but does not abort the scan.
        logger.warning(f"Could not scan {file_path} for secrets: {e}")

    return secrets_found


def get_files_to_sync(repo_root: Path) -> List[Path]:
    """Get the exact list of files that will be synced to HuggingFace.

    Files are collected by globbing ``repo_root`` with a fixed set of include
    patterns and then dropping anything that matches an exclude rule.

    Exclude rules come in two flavours:

    * ``"<dir>/**"`` excludes every file that has a directory named ``<dir>``
      anywhere among its parent directories, at any depth.
    * Any other pattern is matched against the relative path with
      ``PurePath.match`` (right-anchored glob, e.g. ``"*.pyc"``).

    Args:
        repo_root: Root directory of the repository to scan.

    Returns:
        Sorted list of unique file paths (each located under ``repo_root``).
    """
    # Files and directories to upload (excluding unnecessary files)
    include_patterns = [
        # Core code
        "bit_transformer/**/*.py",
        "tests/**/*.py",
        "scripts/**/*.py",  # Organized scripts
        "scripts/**/*.md",  # Script documentation

        # All root level files (filtered by type)
        "*.py",
        "*.md",
        "*.txt",
        "*.toml",
        "*.sh",
        "Dockerfile",

        # License files
        "LICENSE/**/*",
    ]

    # Files to exclude
    exclude_patterns = [
        "__pycache__/**",
        "*.pyc",
        ".git/**",
        ".pytest_cache/**",
        ".ipynb_checkpoints/**",
        "weights/**",
        "checkpoints/**",  # Contains potentially sensitive configs
        "*.log",
        "*.pt",  # Model weights
        "*.zip",  # Backup files
        # Temporary or generated files
        "*-checkpoint.*",
        "*.tmp",
        "*.swp",
        # OS files
        ".DS_Store",
        "Thumbs.db",
    ]

    # PurePath.match() is right-anchored and treats "**" like a single "*",
    # so "__pycache__/**" would only catch files sitting directly inside the
    # directory. Directory rules are therefore checked against the path's
    # components instead, which covers arbitrarily deep nesting.
    dir_suffix = "/**"
    excluded_dirs = {
        pattern[:-len(dir_suffix)]
        for pattern in exclude_patterns
        if pattern.endswith(dir_suffix)
    }
    excluded_globs = [
        pattern for pattern in exclude_patterns
        if not pattern.endswith(dir_suffix)
    ]

    # A set guards against a file being listed twice should include patterns
    # ever overlap.
    files_to_upload = set()
    for pattern in include_patterns:
        for file_path in repo_root.glob(pattern):
            if not file_path.is_file():
                continue

            relative_path = file_path.relative_to(repo_root)
            if excluded_dirs.intersection(relative_path.parts[:-1]):
                continue
            if any(relative_path.match(exclude) for exclude in excluded_globs):
                continue

            files_to_upload.add(file_path)

    return sorted(files_to_upload)


def preview_sync(repo_root: Path = None) -> None:
    """Print the files that a sync would upload, without uploading anything.

    Args:
        repo_root: Repository root to inspect. When omitted, the directory
            three levels above this file is used.
    """
    root = Path(__file__).parent.parent.parent if repo_root is None else repo_root

    candidates = get_files_to_sync(root)

    # Header: where we are looking and how many files were selected.
    print(f"\nπŸ“ Repository root: {root}")
    print(f"πŸ“¦ Files to sync: {len(candidates)}")
    print("\nπŸ“‹ File list:")

    # One line per file: path relative to the root plus its size on disk.
    for entry in candidates:
        shown_path = entry.relative_to(root)
        size_in_bytes = entry.stat().st_size
        print(f"  {shown_path} ({size_in_bytes:,} bytes)")

    # Footer: combined size in bytes and in MiB-scaled units.
    total_size = sum(map(lambda item: item.stat().st_size, candidates))
    total_mb = total_size / 1024 / 1024
    print(f"\nπŸ“Š Total size: {total_size:,} bytes ({total_mb:.2f} MB)")


def sync_repository_to_hf(
    repo_id: str = "WCNegentropy/BitTransformerLM",
    token: Optional[str] = None,
    commit_message: str = "πŸš€ Refined BitTransformerLM: Organized codebase with best practices",
    preview_only: bool = False
) -> bool:
    """
    Sync the entire cleaned BitTransformerLM repository to HuggingFace Hub.

    The function authenticates, collects the files selected by
    ``get_files_to_sync``, aborts if ``scan_for_secrets`` reports anything,
    and then uploads. The upload is first attempted as a single
    ``upload_folder`` call from a temporary staging directory; if that raises,
    it falls back to uploading the files one by one.

    Args:
        repo_id: HuggingFace repository ID
        token: HF token (defaults to HF_TOKEN environment variable)
        commit_message: Commit message for the upload
        preview_only: If True, stop after authentication and the secret scan,
            print the file list via ``preview_sync`` and upload nothing

    Returns:
        True when the preview was shown or every selected file was uploaded;
        False when no token is available, secrets were detected, any file
        failed to upload, or an unexpected error occurred.
    """

    # Get token from environment if not provided
    if token is None:
        token = os.environ.get('HF_TOKEN')
        if not token:
            logger.error("HF_TOKEN environment variable not set and no token provided")
            return False

    try:
        # Login to HuggingFace
        login(token=token)
        api = HfApi()
        logger.info("Successfully authenticated with HuggingFace Hub")

        # Get the repository root directory (go up from scripts/tools/)
        repo_root = Path(__file__).parent.parent.parent
        logger.info(f"Repository root: {repo_root}")

        # Get files to sync using the centralized function
        files_to_upload = get_files_to_sync(repo_root)
        total_files = len(files_to_upload)
        logger.info(f"Found {total_files} files to upload")

        # CRITICAL SECURITY CHECK: Scan all files for secrets
        logger.info("πŸ” Scanning files for secrets and tokens...")
        all_secrets = []
        for file_path in files_to_upload:
            secrets = scan_for_secrets(file_path)
            if secrets:
                relative_path = file_path.relative_to(repo_root)
                all_secrets.extend([f"{relative_path}: {secret}" for secret in secrets])

        if all_secrets:
            logger.error("🚨 SECURITY ALERT: Secrets detected in files!")
            logger.error("The following secrets were found and MUST be removed before sync:")
            for secret in all_secrets:
                logger.error(f"  - {secret}")
            logger.error("❌ SYNC ABORTED for security reasons!")
            logger.error("Please remove all secrets and use environment variables instead.")
            return False

        logger.info("βœ… Security scan passed - no secrets detected")

        # If preview only, just show the files and return
        if preview_only:
            preview_sync(repo_root)
            return True

        # Use upload_folder for exact sync - this will mirror the entire directory
        logger.info("Syncing entire repository structure to HuggingFace...")

        try:
            # First, let's create a temporary directory with only the files we want
            import tempfile
            import shutil

            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = Path(temp_dir)

                # Copy all files we want to upload to temp directory
                for file_path in files_to_upload:
                    relative_path = file_path.relative_to(repo_root)
                    dest_path = temp_path / relative_path
                    dest_path.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(file_path, dest_path)

                logger.info(f"Prepared {total_files} files for upload")

                # Upload the entire folder structure - this ensures exact mirroring
                api.upload_folder(
                    folder_path=str(temp_path),
                    repo_id=repo_id,
                    repo_type="model",
                    commit_message=commit_message,
                    commit_description="""
BitTransformerLM refined with ML engineering best practices:

βœ… **Organized Codebase Structure**
- Cleaned up 30+ scattered scripts into organized directories
- Standardized imports and docstring formatting
- Consolidated configuration management
- Professional package metadata

βœ… **Enhanced Developer Experience**
- Comprehensive CLI interface with standardized arguments
- Type-safe configuration system with presets
- Improved error handling and logging
- Better modular organization

βœ… **Production Quality**
- PyProject.toml with proper dependencies and tooling
- Consistent code formatting and documentation
- Maintainable directory structure
- Ready for serious development and research

The bit-native transformer architecture with reversible layers, safety telemetry,
and distributed training capabilities is now properly packaged for research use.
                    """.strip(),
                    delete_patterns=["*"]  # This ensures old files are removed
                )

                uploaded_count = total_files

        except Exception as folder_error:
            logger.error(f"Failed to upload folder: {folder_error}")
            logger.info("Falling back to individual file upload...")

            # Fallback to individual file upload
            uploaded_count = 0
            for file_path in files_to_upload:
                # Resolve the display path outside the try so a failure is
                # always reported against the file actually being uploaded.
                relative_path = file_path.relative_to(repo_root)
                try:
                    logger.info(f"Uploading: {relative_path}")

                    api.upload_file(
                        path_or_fileobj=str(file_path),
                        path_in_repo=str(relative_path),
                        repo_id=repo_id,
                        repo_type="model",
                        commit_message=commit_message,
                    )

                    uploaded_count += 1
                    if uploaded_count % 10 == 0:
                        logger.info(f"Progress: {uploaded_count}/{total_files} files uploaded")

                except Exception as file_error:
                    logger.warning(f"Failed to upload {relative_path}: {file_error}")
                    continue

        # A partial upload is a failed sync: do not report success to the caller.
        if uploaded_count < total_files:
            logger.error(
                f"Only {uploaded_count}/{total_files} files were uploaded; sync is incomplete"
            )
            return False

        logger.info(f"βœ… Successfully uploaded {uploaded_count}/{total_files} files")
        logger.info(f"πŸŽ‰ Repository synced to: https://huggingface.co/{repo_id}")

        return True

    except Exception as e:
        logger.error(f"❌ Failed to sync repository: {e}")
        return False

def create_release_info() -> Path:
    """Create a release information file for the OS launch.

    Writes ``RELEASE_INFO.md`` into the directory containing this script,
    overwriting any existing file of that name.

    Returns:
        Path of the file that was written.
    """
    release_info = """# BitTransformerLM v0.1.0 - Experimental Research Release

**Release Date:** August 2025  
**Status:** Open Source Research Implementation  
**License:** AGPLv3 + Commercial Licensing Available  

## What's Included

This release provides a complete experimental framework for bit-native language modeling research:

- **Core Architecture:** 57 Python files implementing bit-native transformer with reversible layers
- **Safety Systems:** Real-time K/C/S telemetry and monitoring
- **Research Tools:** Interactive dashboard, distributed training, comprehensive testing
- **Documentation:** Professional model card, research status, and validation reports

## Important Notes

⚠️ **Experimental Status:** This is research code requiring rigorous baseline validation  
⚠️ **Not Production Ready:** Needs extensive evaluation vs standard transformers  
⚠️ **Research Use Only:** Intended for academic investigation and experimentation  

## Licensing

- **Open Source:** AGPLv3 for research and open source use
- **Commercial:** Contact contact@wcnegentropy.com for commercial licensing

## Next Steps

The research community is invited to:
1. Conduct rigorous baseline comparisons vs standard transformers
2. Evaluate on established language modeling benchmarks  
3. Validate (or refute) claimed memory efficiency benefits
4. Share findings openly to advance the field

**Research responsibly. Validate rigorously. Share openly.**
"""

    release_file = Path(__file__).parent / "RELEASE_INFO.md"
    # The text contains non-ASCII characters; an explicit encoding avoids a
    # UnicodeEncodeError on platforms whose default encoding is not UTF-8.
    with open(release_file, 'w', encoding='utf-8') as f:
        f.write(release_info)

    logger.info("Created RELEASE_INFO.md")
    return release_file

if __name__ == "__main__":
    # Command-line entry point: either preview the file selection or write the
    # release notes and run the real sync.
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Sync BitTransformerLM to HuggingFace Hub")
    parser.add_argument("--preview", action="store_true", help="Preview files without uploading")
    parser.add_argument("--repo-id", default="WCNegentropy/BitTransformerLM", help="HuggingFace repo ID")
    parser.add_argument("--token", help="HuggingFace token (or set HF_TOKEN env var)")
    args = parser.parse_args()

    if args.preview:
        print("πŸ” Preview mode: showing files that would be synced...")
        preview_sync()
        print("\nβœ… Use --token YOUR_TOKEN to perform actual sync")
    else:
        # Create release info file
        create_release_info()

        # Sync to HuggingFace
        success = sync_repository_to_hf(
            repo_id=args.repo_id,
            token=args.token
        )

        if success:
            print("\nπŸš€ BitTransformerLM Sync Complete!")
            print(f"πŸ“ Repository: https://huggingface.co/{args.repo_id}")
            print("\nRefined codebase with ML engineering best practices is now live! ✨")
        else:
            print("\n❌ Sync failed. Please check logs and try again.")
            # Signal the failure to the shell / CI instead of exiting with 0.
            sys.exit(1)