# tools/utils/oss/oss_batch_upload.py
# Uploaded by: Adinosaur
# Commit message: Upload folder using huggingface_hub
# Commit: 1c980b1 (verified)
import argparse
import os
import alibabacloud_oss_v2 as oss
def _collect_jsonl_files(directory):
    """Return ``(full_path, filename)`` pairs for ``.jsonl`` files in *directory*.

    Only regular files directly inside *directory* are considered; nested
    folders are not descended into. Entries are sorted by file name because
    ``os.listdir`` returns them in arbitrary order, which would make the
    upload order (and the log output) differ between runs.
    """
    found = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.jsonl'):
            continue
        full_path = os.path.join(directory, filename)
        # Skip directories that merely happen to be named "*.jsonl".
        if os.path.isfile(full_path):
            found.append((full_path, filename))
    return found


def _build_object_key(prefix, filename):
    """Join the target folder *prefix* and *filename* into an OSS object key."""
    # OSS object names must not start with "/", yet the --key help text
    # suggests values such as "/data"; strip slashes on both ends so that
    # "/data", "data/" and "/data/" all yield "data/<filename>".
    prefix = prefix.strip('/')
    return f"{prefix}/{filename}" if prefix else filename


def main():
    """Upload every ``.jsonl`` file of a local folder to an OSS bucket.

    Command-line arguments:
        --region     (required) region of the target bucket.
        --bucket     (required) name of the target bucket.
        --key        (required) destination folder inside the bucket; leading
                     and trailing slashes are ignored.
        --file_path  (required) local folder containing the JSONL files.
        --endpoint   (optional) custom access endpoint.

    Credentials are supplied by the SDK's
    ``EnvironmentVariableCredentialsProvider``.

    Each file is uploaded independently: a failure is reported on stdout and
    the remaining files are still attempted.

    Raises:
        ValueError: if ``--file_path`` is not an existing directory.
    """
    parser = argparse.ArgumentParser(description="批量上传JSONL文件到OSS")

    # Required arguments
    parser.add_argument('--region', required=True, help='OSS存储空间所在区域')
    parser.add_argument('--bucket', required=True, help='目标存储空间名称')
    parser.add_argument('--key', required=True, help='OSS目标文件夹路径(如:/data)')
    parser.add_argument('--file_path', required=True, help='本地包含JSONL文件的文件夹路径')

    # Optional arguments
    parser.add_argument('--endpoint', help='自定义访问端点')

    args = parser.parse_args()

    # Validate the local folder before doing any remote work.
    if not os.path.isdir(args.file_path):
        raise ValueError(f"无效的目录路径: {args.file_path}")

    jsonl_files = _collect_jsonl_files(args.file_path)
    if not jsonl_files:
        print("未找到任何JSONL文件")
        return

    # Build the OSS client configuration.
    credentials_provider = oss.credentials.EnvironmentVariableCredentialsProvider()
    cfg = oss.config.load_default()
    cfg.credentials_provider = credentials_provider
    cfg.region = args.region
    if args.endpoint:
        cfg.endpoint = args.endpoint

    # Create the client and its uploader helper.
    client = oss.Client(cfg)
    uploader = client.uploader()

    # Upload the files one by one.
    for local_path, filename in jsonl_files:
        oss_key = _build_object_key(args.key, filename)
        try:
            result = uploader.upload_file(
                oss.PutObjectRequest(
                    bucket=args.bucket,
                    key=oss_key,
                ),
                filepath=local_path
            )
            # Report the outcome of the successful upload.
            print(f" 成功上传 {filename}")
            print(f" OSS路径: {oss_key}")
            print(f" 状态码: {result.status_code}")
            print(f" 请求ID: {result.request_id}")
            print(f" ETag: {result.etag}\n")
        except Exception as e:
            # Per-file boundary: report the error and continue with the
            # next file so one bad upload does not abort the whole batch.
            print(f" 上传失败 {filename}")
            print(f" 错误信息: {str(e)}\n")
# Script entry point: run the uploader only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()