"""Script to set up DVC for data versioning."""

import logging
import subprocess
from pathlib import Path
from typing import Optional

# Configure root logging at import time so the progress messages emitted by
# this script are visible when it is run from the command line.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def setup_dvc(remote_name: str = "default", remote_url: Optional[str] = None) -> None:
    """
    Set up DVC repository.

    Works relative to the current working directory and performs, in order:

    1. ``dvc init``, unless a ``.dvc`` path already exists.
    2. ``dvc remote add <remote_name> <remote_url>``, only when ``remote_url``
       is truthy.
    3. ``dvc add`` for each of the hard-coded raw data files that exists.

    Steps 2 and 3 are best-effort: if the ``dvc`` command exits with a
    non-zero status or cannot be launched at all, a warning is logged and
    setup continues with the next step.

    Args:
        remote_name: Remote storage name
        remote_url: Remote storage URL (S3, GCS, Azure, etc.). When ``None``
            or empty, no remote is added.

    Raises:
        subprocess.CalledProcessError: If ``dvc init`` exits with a non-zero
            status.
        OSError: If the ``dvc`` executable cannot be launched for ``dvc init``
            (for example ``FileNotFoundError`` when it is not on ``PATH``).
    """
    logger.info("Setting up DVC...")

    if not Path(".dvc").exists():
        logger.info("Initializing DVC repository...")
        # Initialization is a hard requirement for the later steps, so any
        # failure here is deliberately left to propagate to the caller.
        subprocess.run(["dvc", "init"], check=True)
        logger.info("DVC repository initialized")
    else:
        logger.info("DVC repository already initialized")

    if remote_url:
        logger.info("Adding remote: %s -> %s", remote_name, remote_url)
        # NOTE(review): the remote is added without -d/--default, so the
        # "dvc push" hint logged at the end presumably needs "-r <name>" unless
        # a default remote is configured some other way - confirm.
        try:
            subprocess.run(
                ["dvc", "remote", "add", remote_name, remote_url],
                check=True,
            )
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers the case where the dvc executable itself cannot
            # be started; treat it like any other failure of this optional step.
            logger.warning("Failed to add remote: %s", e)
            logger.info("You can add remote manually with: dvc remote add <name> <url>")
        else:
            logger.info("Remote '%s' added successfully", remote_name)

    data_files = [
        "data/raw/ria_news.tsv",
        "data/raw/vk_news.tsv",
        "data/raw/vk_comments.tsv",
    ]

    logger.info("Adding data files to DVC...")
    for data_file in data_files:
        if not Path(data_file).exists():
            logger.warning("Data file not found: %s", data_file)
            continue

        try:
            subprocess.run(["dvc", "add", data_file], check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # One file failing must not prevent the remaining files from
            # being tracked.
            logger.warning("Failed to add %s: %s", data_file, e)
        else:
            logger.info("Added to DVC: %s", data_file)

    logger.info("DVC setup complete!")
    logger.info("Next steps:")
    logger.info("  1. Commit .dvc files: git add *.dvc .dvcignore")
    logger.info("  2. Push data to remote: dvc push")
    logger.info("  3. Run pipeline: dvc repro")


if __name__ == "__main__":
    # Command-line entry point: parse the optional remote settings and run the
    # setup. argparse is only needed here, so it is imported inside the guard.
    import argparse

    parser = argparse.ArgumentParser(description="Set up DVC for data versioning")
    parser.add_argument(
        "--remote-name",
        type=str,
        default="default",
        help="Remote storage name",
    )
    parser.add_argument(
        "--remote-url",
        type=str,
        default=None,
        help="Remote storage URL (S3, GCS, Azure, etc.)",
    )

    args = parser.parse_args()

    setup_dvc(
        remote_name=args.remote_name,
        remote_url=args.remote_url,
    )