File size: 2,811 Bytes
198ccb0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
"""Script to set up DVC for data versioning."""
import logging
import subprocess
from pathlib import Path
from typing import Optional, Sequence
# Configure root logging at INFO so the progress messages below are shown.
# basicConfig() is a no-op when the root logger already has handlers.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Data files tracked when the caller does not pass its own list.
_DEFAULT_DATA_FILES = (
    "data/raw/ria_news.tsv",
    "data/raw/vk_news.tsv",
    "data/raw/vk_comments.tsv",
)


def _init_repository() -> None:
    """Run ``dvc init`` unless a ``.dvc`` entry already exists in the working directory."""
    if Path(".dvc").exists():
        logger.info("DVC repository already initialized")
        return

    logger.info("Initializing DVC repository...")
    # Not wrapped in try/except: a failed init propagates to the caller,
    # exactly as before.
    subprocess.run(["dvc", "init"], check=True)
    logger.info("DVC repository initialized")


def _add_remote(remote_name: str, remote_url: str) -> None:
    """Register ``remote_url`` under ``remote_name``; a failure is logged, not raised."""
    logger.info("Adding remote: %s -> %s", remote_name, remote_url)
    try:
        subprocess.run(
            ["dvc", "remote", "add", remote_name, remote_url],
            check=True,
        )
    except subprocess.CalledProcessError as e:
        # Best effort: the rest of the setup continues without the remote.
        logger.warning("Failed to add remote: %s", e)
        logger.info("You can add remote manually with: dvc remote add <name> <url>")
    else:
        logger.info("Remote '%s' added successfully", remote_name)


def _track_data_files(data_files: Sequence[str]) -> None:
    """Run ``dvc add`` for every listed file that exists; warn about the others."""
    logger.info("Adding data files to DVC...")
    for data_file in data_files:
        if not Path(data_file).exists():
            logger.warning("Data file not found: %s", data_file)
            continue
        try:
            subprocess.run(["dvc", "add", data_file], check=True)
        except subprocess.CalledProcessError as e:
            # One file failing must not prevent the remaining files from being added.
            logger.warning("Failed to add %s: %s", data_file, e)
        else:
            logger.info("Added to DVC: %s", data_file)


def setup_dvc(
    remote_name: str = "default",
    remote_url: Optional[str] = None,
    data_files: Optional[Sequence[str]] = None,
) -> None:
    """
    Set up DVC repository.

    Initializes DVC in the current working directory if no ``.dvc`` entry
    exists, optionally registers a remote, and adds the data files to DVC.
    All paths are resolved relative to the current working directory.

    Args:
        remote_name: Remote storage name
        remote_url: Remote storage URL (S3, GCS, Azure, etc.). When empty or
            None, no remote is added.
        data_files: Paths of the files to add to DVC. When None, the three
            default ``data/raw/*.tsv`` files are used. A single string is
            treated as one path.

    Raises:
        subprocess.CalledProcessError: If ``dvc init`` exits with a non-zero
            status. Failures of ``dvc remote add`` and ``dvc add`` are only
            logged as warnings.
    """
    logger.info("Setting up DVC...")

    # Initialize DVC if not already initialized
    _init_repository()

    # Add remote if provided
    if remote_url:
        _add_remote(remote_name, remote_url)

    # Add data files to DVC
    if data_files is None:
        data_files = _DEFAULT_DATA_FILES
    elif isinstance(data_files, str):
        # Iterating a bare string would yield single characters, not a path.
        data_files = [data_files]
    _track_data_files(data_files)

    logger.info("DVC setup complete!")
    logger.info("Next steps:")
    logger.info(" 1. Commit .dvc files: git add *.dvc .dvcignore")
    if remote_url:
        # The remote is added without --default, so a bare `dvc push` may fail
        # with "no remote specified"; name the remote explicitly in the hint.
        logger.info(" 2. Push data to remote: dvc push -r %s", remote_name)
    else:
        logger.info(" 2. Push data to remote: dvc push")
    logger.info(" 3. Run pipeline: dvc repro")
def _parse_cli_args():
    """Build the command-line parser for this script and parse ``sys.argv``.

    Returns:
        The parsed namespace with ``remote_name`` and ``remote_url`` attributes.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Set up DVC for data versioning")
    # Both options are optional; their defaults mirror setup_dvc()'s own defaults.
    cli.add_argument("--remote-name", type=str, default="default", help="Remote storage name")
    cli.add_argument(
        "--remote-url",
        type=str,
        default=None,
        help="Remote storage URL (S3, GCS, Azure, etc.)",
    )
    return cli.parse_args()


# Script entry point: forward the parsed options to setup_dvc().
if __name__ == "__main__":
    options = _parse_cli_args()
    setup_dvc(remote_name=options.remote_name, remote_url=options.remote_url)
|