"""
Sync the BitTransformerLM repository to the HuggingFace Hub for the OS launch.

Uploads all cleaned documentation and code with a proper commit message.
"""
| |
|
import fnmatch
import logging
import os
from pathlib import Path
from typing import List, Optional

from huggingface_hub import HfApi, login
| |
|
| | |
# Configure root logging once at import time: timestamped INFO-level records.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)  # module-level logger (standard convention)
| |
|
def get_files_to_sync(repo_root: Path) -> List[Path]:
    """Return the sorted, de-duplicated list of files to sync to HuggingFace.

    Args:
        repo_root: Repository root directory to scan.

    Returns:
        Sorted list of file paths (under ``repo_root``) matching the include
        patterns and not caught by any exclude rule.
    """
    include_patterns = [
        # Package, test and script sources.
        "bit_transformer/**/*.py",
        "tests/**/*.py",
        "scripts/**/*.py",
        "scripts/**/*.md",
        # Top-level project files.
        "*.py",
        "*.md",
        "*.txt",
        "*.toml",
        "*.sh",
        "Dockerfile",
        # License directory contents.
        "LICENSE/**/*",
    ]

    # Directories whose contents must never be uploaded.  Checked against every
    # component of the relative path because Path.match() does not expand "**"
    # recursively (before Python 3.13 it behaves like "*"), so patterns such as
    # ".git/**" would silently miss deeply nested files.
    exclude_dirs = {
        "__pycache__",
        ".git",
        ".pytest_cache",
        ".ipynb_checkpoints",
        "weights",
        "checkpoints",
    }

    # Filename patterns to skip regardless of location.
    exclude_name_patterns = [
        "*.pyc",
        "*.log",
        "*.pt",
        "*.zip",
        "*-checkpoint.*",
        "*.tmp",
        "*.swp",
        ".DS_Store",
        "Thumbs.db",
    ]

    # A set, because a file may match more than one include pattern; the
    # original list-based version could return duplicates.
    files_to_upload = set()
    for pattern in include_patterns:
        for file_path in repo_root.glob(pattern):
            if not file_path.is_file():
                continue
            relative_path = file_path.relative_to(repo_root)
            if any(part in exclude_dirs for part in relative_path.parts):
                continue
            if any(fnmatch.fnmatch(relative_path.name, pat)
                   for pat in exclude_name_patterns):
                continue
            files_to_upload.add(file_path)

    return sorted(files_to_upload)
| |
|
| |
|
def preview_sync(repo_root: Optional[Path] = None) -> None:
    """Print the files that would be synced without uploading anything.

    Args:
        repo_root: Repository root to scan; defaults to three directories
            above this script (the project root).
    """
    if repo_root is None:
        # Script lives three levels below the project root.
        repo_root = Path(__file__).parent.parent.parent

    files_to_upload = get_files_to_sync(repo_root)

    print(f"\n📁 Repository root: {repo_root}")
    print(f"📦 Files to sync: {len(files_to_upload)}")
    print("\n📋 File list:")

    for file_path in files_to_upload:
        relative_path = file_path.relative_to(repo_root)
        file_size = file_path.stat().st_size
        print(f"  {relative_path} ({file_size:,} bytes)")

    total_size = sum(f.stat().st_size for f in files_to_upload)
    print(f"\n📊 Total size: {total_size:,} bytes ({total_size/1024/1024:.2f} MB)")
| |
|
| |
|
def sync_repository_to_hf(
    repo_id: str = "WCNegentropy/BitTransformerLM",
    token: Optional[str] = None,
    commit_message: str = "🚀 Refined BitTransformerLM: Organized codebase with best practices",
    preview_only: bool = False,
) -> bool:
    """
    Sync the entire cleaned BitTransformerLM repository to HuggingFace Hub.

    Args:
        repo_id: HuggingFace repository ID
        token: HF token (defaults to HF_TOKEN environment variable)
        commit_message: Commit message for the upload
        preview_only: If True, only print the file list and skip the upload

    Returns:
        True on success (or completed preview), False on any failure.
    """
    # Resolve the auth token; fail fast when none is available.
    if token is None:
        token = os.environ.get('HF_TOKEN')
        if not token:
            logger.error("HF_TOKEN environment variable not set and no token provided")
            return False

    try:
        login(token=token)
        api = HfApi()
        logger.info("Successfully authenticated with HuggingFace Hub")

        # Script lives three levels below the project root.
        repo_root = Path(__file__).parent.parent.parent
        logger.info(f"Repository root: {repo_root}")

        files_to_upload = get_files_to_sync(repo_root)
        logger.info(f"Found {len(files_to_upload)} files to upload")

        if preview_only:
            preview_sync(repo_root)
            return True

        logger.info("Syncing entire repository structure to HuggingFace...")

        try:
            # Stage the selected files in a temporary directory so that
            # upload_folder pushes exactly this set; delete_patterns=["*"]
            # makes the remote repo mirror it (stale files are removed).
            import tempfile
            import shutil

            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = Path(temp_dir)

                for file_path in files_to_upload:
                    relative_path = file_path.relative_to(repo_root)
                    dest_path = temp_path / relative_path
                    dest_path.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(file_path, dest_path)

                logger.info(f"Prepared {len(files_to_upload)} files for upload")

                api.upload_folder(
                    folder_path=str(temp_path),
                    repo_id=repo_id,
                    repo_type="model",
                    commit_message=commit_message,
                    commit_description="""
BitTransformerLM refined with ML engineering best practices:

✅ **Organized Codebase Structure**
- Cleaned up 30+ scattered scripts into organized directories
- Standardized imports and docstring formatting
- Consolidated configuration management
- Professional package metadata

✅ **Enhanced Developer Experience**
- Comprehensive CLI interface with standardized arguments
- Type-safe configuration system with presets
- Improved error handling and logging
- Better modular organization

✅ **Production Quality**
- PyProject.toml with proper dependencies and tooling
- Consistent code formatting and documentation
- Maintainable directory structure
- Ready for serious development and research

The bit-native transformer architecture with reversible layers, safety telemetry,
and distributed training capabilities is now properly packaged for research use.
""".strip(),
                    delete_patterns=["*"],
                )

            uploaded_count = len(files_to_upload)

        except Exception as e:
            logger.error(f"Failed to upload folder: {e}")
            logger.info("Falling back to individual file upload...")

            uploaded_count = 0
            for file_path in files_to_upload:
                # Computed outside the try so the warning below can always
                # name the file (previously it could hit an unbound name).
                relative_path = file_path.relative_to(repo_root)
                try:
                    logger.info(f"Uploading: {relative_path}")

                    api.upload_file(
                        path_or_fileobj=str(file_path),
                        path_in_repo=str(relative_path),
                        repo_id=repo_id,
                        repo_type="model",
                        commit_message=commit_message,
                    )

                    uploaded_count += 1
                    if uploaded_count % 10 == 0:
                        logger.info(f"Progress: {uploaded_count}/{len(files_to_upload)} files uploaded")

                except Exception as e:
                    logger.warning(f"Failed to upload {relative_path}: {e}")
                    continue

        logger.info(f"✅ Successfully uploaded {uploaded_count}/{len(files_to_upload)} files")
        logger.info(f"🌐 Repository synced to: https://huggingface.co/{repo_id}")

        return True

    except Exception as e:
        logger.error(f"❌ Failed to sync repository: {e}")
        return False
| |
|
def create_release_info() -> Path:
    """Write RELEASE_INFO.md next to this script for the OS launch.

    Returns:
        Path to the newly written RELEASE_INFO.md file.
    """
    release_info = """# BitTransformerLM v0.1.0 - Experimental Research Release

**Release Date:** August 2025
**Status:** Open Source Research Implementation
**License:** AGPLv3 + Commercial Licensing Available

## What's Included

This release provides a complete experimental framework for bit-native language modeling research:

- **Core Architecture:** 57 Python files implementing bit-native transformer with reversible layers
- **Safety Systems:** Real-time K/C/S telemetry and monitoring
- **Research Tools:** Interactive dashboard, distributed training, comprehensive testing
- **Documentation:** Professional model card, research status, and validation reports

## Important Notes

⚠️ **Experimental Status:** This is research code requiring rigorous baseline validation
⚠️ **Not Production Ready:** Needs extensive evaluation vs standard transformers
⚠️ **Research Use Only:** Intended for academic investigation and experimentation

## Licensing

- **Open Source:** AGPLv3 for research and open source use
- **Commercial:** Contact contact@wcnegentropy.com for commercial licensing

## Next Steps

The research community is invited to:
1. Conduct rigorous baseline comparisons vs standard transformers
2. Evaluate on established language modeling benchmarks
3. Validate (or refute) claimed memory efficiency benefits
4. Share findings openly to advance the field

**Research responsibly. Validate rigorously. Share openly.**
"""

    release_file = Path(__file__).parent / "RELEASE_INFO.md"
    # Explicit UTF-8: the content contains non-ASCII symbols, and the platform
    # default encoding (e.g. cp1252 on Windows) would raise UnicodeEncodeError.
    with open(release_file, 'w', encoding='utf-8') as f:
        f.write(release_info)

    logger.info("Created RELEASE_INFO.md")
    return release_file
| |
|
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Sync BitTransformerLM to HuggingFace Hub")
    parser.add_argument("--preview", action="store_true", help="Preview files without uploading")
    parser.add_argument("--repo-id", default="WCNegentropy/BitTransformerLM", help="HuggingFace repo ID")
    parser.add_argument("--token", help="HuggingFace token (or set HF_TOKEN env var)")
    args = parser.parse_args()

    if args.preview:
        # Dry run: list the files only; nothing is uploaded.
        print("🔍 Preview mode: showing files that would be synced...")
        preview_sync()
        print("\n✅ Use --token YOUR_TOKEN to perform actual sync")
    else:
        # Write RELEASE_INFO.md first so it is part of the uploaded file set.
        create_release_info()

        success = sync_repository_to_hf(
            repo_id=args.repo_id,
            token=args.token,
        )

        if success:
            print(f"\n🎉 BitTransformerLM Sync Complete!")
            print(f"🌐 Repository: https://huggingface.co/{args.repo_id}")
            print("\nRefined codebase with ML engineering best practices is now live! ✨")
        else:
            print("\n❌ Sync failed. Please check logs and try again.")