#!/usr/bin/env python3
"""
Sync BitTransformerLM repository to HuggingFace Hub for OS launch.
Uploads all cleaned documentation and code with a proper commit message.
"""
import os
import logging
import re
from pathlib import Path
from typing import List, Optional

from huggingface_hub import HfApi, login

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
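
# Typical invocation (the token value is a placeholder, and the script filename is
# an assumption; the repo_root computation below expects this file to live under
# scripts/tools/):
#
#     export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
#     python scripts/tools/sync_to_hf.py --preview   # dry run: list files only
#     python scripts/tools/sync_to_hf.py             # full sync to the Hub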

def scan_for_secrets(file_path: Path) -> List[str]:
    """Scan a file for potential secrets and tokens."""
    secrets_found = []

    # Patterns for common secrets
    secret_patterns = {
        'HuggingFace Token': r'hf_[A-Za-z0-9_]{30,}',
        'OpenAI API Key': r'sk-[A-Za-z0-9]{48}',
        'GitHub Token': r'gh[pousr]_[A-Za-z0-9_]{36,}',
        'AWS Access Key': r'AKIA[0-9A-Z]{16}',
        'Generic API Key': r'["\']?[Aa]pi[_-]?[Kk]ey["\']?\s*[:=]\s*["\']?[A-Za-z0-9_\-]{20,}["\']?',
        'Generic Token': r'["\']?[Tt]oken["\']?\s*[:=]\s*["\']?[A-Za-z0-9_\-]{20,}["\']?',
        'Generic Secret': r'["\']?[Ss]ecret["\']?\s*[:=]\s*["\']?[A-Za-z0-9_\-]{20,}["\']?',
    }

    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
        for secret_type, pattern in secret_patterns.items():
            matches = re.finditer(pattern, content, re.IGNORECASE)
            for match in matches:
                line_num = content[:match.start()].count('\n') + 1
                secrets_found.append(f"{secret_type} found at line {line_num}: {match.group()[:50]}...")
    except Exception as e:
        logger.warning(f"Could not scan {file_path} for secrets: {e}")

    return secrets_found
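
# Example (hypothetical path and deliberately fake credential) showing how the
# scanner reports matches, including the line-number bookkeeping above:
#
#     from pathlib import Path
#     demo = Path("/tmp/secret_demo.py")
#     demo.write_text("token = 'hf_" + "A" * 32 + "'")
#     print(scan_for_secrets(demo))
#     # Both the 'HuggingFace Token' and 'Generic Token' patterns fire on line 1.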

def get_files_to_sync(repo_root: Path) -> List[Path]:
    """Get the exact list of files that will be synced to HuggingFace."""
    # Files and directories to upload (excluding unnecessary files)
    include_patterns = [
        # Core code
        "bit_transformer/**/*.py",
        "tests/**/*.py",
        "scripts/**/*.py",  # Organized scripts
        "scripts/**/*.md",  # Script documentation
        # All root-level files (filtered by type)
        "*.py",
        "*.md",
        "*.txt",
        "*.toml",
        "*.sh",
        "Dockerfile",
        # License files
        "LICENSE/**/*",
    ]

    # Files to exclude
    exclude_patterns = [
        "__pycache__/**",
        "*.pyc",
        ".git/**",
        ".pytest_cache/**",
        ".ipynb_checkpoints/**",
        "weights/**",
        "checkpoints/**",  # Contains potentially sensitive configs
        "*.log",
        "*.pt",   # Model weights
        "*.zip",  # Backup files
        # Temporary or generated files
        "*-checkpoint.*",
        "*.tmp",
        "*.swp",
        # OS files
        ".DS_Store",
        "Thumbs.db",
    ]

    # Get all files to upload
    files_to_upload = []
    for pattern in include_patterns:
        for file_path in repo_root.glob(pattern):
            if file_path.is_file():
                # Check if the file should be excluded. Note that Path.match()
                # is right-anchored and, before Python 3.13, does not treat
                # "**" recursively, so these exclusions are approximate.
                relative_path = file_path.relative_to(repo_root)
                should_exclude = any(
                    relative_path.match(exclude)
                    for exclude in exclude_patterns
                )
                if not should_exclude:
                    files_to_upload.append(file_path)

    return sorted(files_to_upload)
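
# Usage sketch (assuming the scripts/tools/ layout this script expects):
#
#     repo_root = Path(__file__).parent.parent.parent
#     for path in get_files_to_sync(repo_root):
#         print(path.relative_to(repo_root))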

def preview_sync(repo_root: Optional[Path] = None) -> None:
    """Preview which files will be synced without actually uploading."""
    if repo_root is None:
        repo_root = Path(__file__).parent.parent.parent

    files_to_upload = get_files_to_sync(repo_root)

    print(f"\n📁 Repository root: {repo_root}")
    print(f"📦 Files to sync: {len(files_to_upload)}")
    print("\n📋 File list:")
    for file_path in files_to_upload:
        relative_path = file_path.relative_to(repo_root)
        file_size = file_path.stat().st_size
        print(f"  {relative_path} ({file_size:,} bytes)")

    total_size = sum(f.stat().st_size for f in files_to_upload)
    print(f"\n📊 Total size: {total_size:,} bytes ({total_size / 1024 / 1024:.2f} MB)")

def sync_repository_to_hf(
    repo_id: str = "WCNegentropy/BitTransformerLM",
    token: Optional[str] = None,
    commit_message: str = "🚀 Refined BitTransformerLM: Organized codebase with best practices",
    preview_only: bool = False,
) -> bool:
    """
    Sync the entire cleaned BitTransformerLM repository to HuggingFace Hub.

    Args:
        repo_id: HuggingFace repository ID
        token: HF token (defaults to the HF_TOKEN environment variable)
        commit_message: Commit message for the upload
        preview_only: If True, only list the files that would be uploaded

    Returns:
        True on success, False otherwise.
    """
    # Get the token from the environment if not provided
    if token is None:
        token = os.environ.get('HF_TOKEN')
        if not token:
            logger.error("HF_TOKEN environment variable not set and no token provided")
            return False

    try:
        # Login to HuggingFace
        login(token=token)
        api = HfApi()
        logger.info("Successfully authenticated with HuggingFace Hub")

        # Get the repository root directory (go up from scripts/tools/)
        repo_root = Path(__file__).parent.parent.parent
        logger.info(f"Repository root: {repo_root}")

        # Get files to sync using the centralized function
        files_to_upload = get_files_to_sync(repo_root)
        logger.info(f"Found {len(files_to_upload)} files to upload")

        # CRITICAL SECURITY CHECK: scan all files for secrets
        logger.info("🔍 Scanning files for secrets and tokens...")
        all_secrets = []
        for file_path in files_to_upload:
            secrets = scan_for_secrets(file_path)
            if secrets:
                relative_path = file_path.relative_to(repo_root)
                all_secrets.extend([f"{relative_path}: {secret}" for secret in secrets])

        if all_secrets:
            logger.error("🚨 SECURITY ALERT: Secrets detected in files!")
            logger.error("The following secrets were found and MUST be removed before sync:")
            for secret in all_secrets:
                logger.error(f" - {secret}")
            logger.error("❌ SYNC ABORTED for security reasons!")
            logger.error("Please remove all secrets and use environment variables instead.")
            return False

        logger.info("✅ Security scan passed - no secrets detected")

        # If preview only, just show the files and return
        if preview_only:
            preview_sync(repo_root)
            return True
        # Use upload_folder for an exact sync - this will mirror the entire directory
        logger.info("Syncing entire repository structure to HuggingFace...")
        try:
            # First, create a temporary directory with only the files we want
            import tempfile
            import shutil

            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = Path(temp_dir)

                # Copy all files we want to upload into the temp directory
                for file_path in files_to_upload:
                    relative_path = file_path.relative_to(repo_root)
                    dest_path = temp_path / relative_path
                    dest_path.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(file_path, dest_path)

                logger.info(f"Prepared {len(files_to_upload)} files for upload")

                # Upload the entire folder structure - this ensures exact mirroring
                api.upload_folder(
                    folder_path=str(temp_path),
                    repo_id=repo_id,
                    repo_type="model",
                    commit_message=commit_message,
                    commit_description="""
BitTransformerLM refined with ML engineering best practices:

✅ **Organized Codebase Structure**
- Cleaned up 30+ scattered scripts into organized directories
- Standardized imports and docstring formatting
- Consolidated configuration management
- Professional package metadata

✅ **Enhanced Developer Experience**
- Comprehensive CLI interface with standardized arguments
- Type-safe configuration system with presets
- Improved error handling and logging
- Better modular organization

✅ **Production Quality**
- PyProject.toml with proper dependencies and tooling
- Consistent code formatting and documentation
- Maintainable directory structure
- Ready for serious development and research

The bit-native transformer architecture with reversible layers, safety telemetry,
and distributed training capabilities is now properly packaged for research use.
""".strip(),
                    delete_patterns=["*"],  # This ensures old files are removed
                )
            uploaded_count = len(files_to_upload)

        except Exception as e:
            logger.error(f"Failed to upload folder: {e}")
            logger.info("Falling back to individual file upload...")

            # Fall back to uploading files one at a time
            uploaded_count = 0
            for file_path in files_to_upload:
                try:
                    relative_path = file_path.relative_to(repo_root)
                    logger.info(f"Uploading: {relative_path}")
                    api.upload_file(
                        path_or_fileobj=str(file_path),
                        path_in_repo=str(relative_path),
                        repo_id=repo_id,
                        repo_type="model",
                        commit_message=commit_message,
                    )
                    uploaded_count += 1
                    if uploaded_count % 10 == 0:
                        logger.info(f"Progress: {uploaded_count}/{len(files_to_upload)} files uploaded")
                except Exception as e:
                    logger.warning(f"Failed to upload {relative_path}: {e}")
                    continue

        logger.info(f"✅ Successfully uploaded {uploaded_count}/{len(files_to_upload)} files")
        logger.info(f"🌐 Repository synced to: https://huggingface.co/{repo_id}")
        return True

    except Exception as e:
        logger.error(f"❌ Failed to sync repository: {e}")
        return False
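
# Dry-run sketch: preview_only exercises the auth and secret-scan steps without
# uploading anything (the token value below is a placeholder):
#
#     sync_repository_to_hf(token="hf_...", preview_only=True)
#
# Note that preview mode still authenticates, because login() runs before the
# preview_only branch is reached.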

def create_release_info():
    """Create a release information file for the OS launch."""
    release_info = """# BitTransformerLM v0.1.0 - Experimental Research Release

**Release Date:** August 2025
**Status:** Open Source Research Implementation
**License:** AGPLv3 + Commercial Licensing Available

## What's Included

This release provides a complete experimental framework for bit-native language modeling research:

- **Core Architecture:** 57 Python files implementing a bit-native transformer with reversible layers
- **Safety Systems:** Real-time K/C/S telemetry and monitoring
- **Research Tools:** Interactive dashboard, distributed training, comprehensive testing
- **Documentation:** Professional model card, research status, and validation reports

## Important Notes

⚠️ **Experimental Status:** This is research code requiring rigorous baseline validation
⚠️ **Not Production Ready:** Needs extensive evaluation vs. standard transformers
⚠️ **Research Use Only:** Intended for academic investigation and experimentation

## Licensing

- **Open Source:** AGPLv3 for research and open source use
- **Commercial:** Contact contact@wcnegentropy.com for commercial licensing

## Next Steps

The research community is invited to:

1. Conduct rigorous baseline comparisons vs. standard transformers
2. Evaluate on established language modeling benchmarks
3. Validate (or refute) claimed memory efficiency benefits
4. Share findings openly to advance the field

**Research responsibly. Validate rigorously. Share openly.**
"""

    release_file = Path(__file__).parent / "RELEASE_INFO.md"
    with open(release_file, 'w', encoding='utf-8') as f:
        f.write(release_info)

    logger.info("Created RELEASE_INFO.md")
    return release_file
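
# Note: RELEASE_INFO.md is written next to this script (Path(__file__).parent), so,
# assuming this file lives under scripts/tools/, the new file is picked up by the
# "scripts/**/*.md" include pattern when the sync that follows collects files.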

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Sync BitTransformerLM to HuggingFace Hub")
    parser.add_argument("--preview", action="store_true", help="Preview files without uploading")
    parser.add_argument("--repo-id", default="WCNegentropy/BitTransformerLM", help="HuggingFace repo ID")
    parser.add_argument("--token", help="HuggingFace token (or set HF_TOKEN env var)")
    args = parser.parse_args()

    if args.preview:
        print("🔍 Preview mode: showing files that would be synced...")
        preview_sync()
        print("\n✅ Use --token YOUR_TOKEN to perform the actual sync")
    else:
        # Create the release info file
        create_release_info()

        # Sync to HuggingFace
        success = sync_repository_to_hf(
            repo_id=args.repo_id,
            token=args.token,
        )

        if success:
            print("\n🎉 BitTransformerLM Sync Complete!")
            print(f"🌐 Repository: https://huggingface.co/{args.repo_id}")
            print("\nRefined codebase with ML engineering best practices is now live! ✨")
        else:
            print("\n❌ Sync failed. Please check the logs and try again.")