TTI / Reward /robometer /scripts /untar_processed_datasets.sh
JosephBai's picture
Upload folder using huggingface_hub
857c2e9 verified
#!/bin/bash
if [ -z "${ROBOMETER_PROCESSED_DATASETS_PATH:-$RBM_PROCESSED_DATASETS_PATH}" ]; then
echo "ROBOMETER_PROCESSED_DATASETS_PATH (or RBM_PROCESSED_DATASETS_PATH) is not set"
exit 1
fi
cd "${ROBOMETER_PROCESSED_DATASETS_PATH:-$RBM_PROCESSED_DATASETS_PATH}" || exit 1
# Track already processed archives to avoid duplicates
declare -A processed_archives
# First, handle split archives (.tar.partaa, .tar.partab, etc.)
echo "Processing split archives..."
for file in *.tar.partaa; do
if [ -f "$file" ]; then
# Get the base name without .partaa
base_name="${file%.partaa}"
echo "Extracting split archive: $base_name"
# Concatenate all parts and extract
cat "${base_name}.part"* | tar -xvf -
# remove the parts if successfully extracted
if [ $? -eq 0 ]; then
rm "${base_name}.part"*
else
echo "Failed to extract $base_name, will need to retry and remove the failed parts"
continue
fi
# Mark this base archive as processed
processed_archives["$base_name"]=1
fi
done
# Now handle split archives that look like .tar.part-aa, .tar.part-ab, etc.
echo "Processing split archives..."
for file in *.tar.part-aa; do
if [ -f "$file" ]; then
# Get the base name without .part-aa
base_name="${file%.part-aa}"
echo "Extracting split archive: $base_name"
# Concatenate all parts and extract
cat "${base_name}.part"* | tar -xvf -
# remove the parts if successfully extracted
if [ $? -eq 0 ]; then
rm "${base_name}.part"*
else
echo "Failed to extract $base_name, will need to retry and remove the failed parts"
continue
fi
# Mark this base archive as processed
processed_archives["$base_name"]=1
fi
done
# Then, handle regular tar files (skip those that were split archives)
echo "Processing regular tar files..."
for file in *.tar; do
if [ -f "$file" ]; then
# Skip if this was already processed as a split archive
if [ -z "${processed_archives[$file]}" ]; then
echo "Extracting: $file"
tar -xvf "$file"
# remove the tar file only if it was successfully extracted
if [ $? -eq 0 ]; then
processed_archives["$file"]=1
rm "$file"
else
echo "Failed to extract $file, will need to retry and remove the failed tar file"
continue
fi
fi
fi
done
# Now, some of the datasets are moved into a `processed_datasets` subdirectory, so move them out and delete
# the overall processed_datasets directory if it exists.
if [ -d "processed_datasets" ]; then
echo "Moving datasets out of processed_datasets subdirectory..."
for dir in processed_datasets/*; do
if [ -d "$dir" ]; then
mv "$dir" .
fi
done
rm -rf processed_datasets
echo "Done moving datasets out of processed_datasets subdirectory!"
fi
# print which datasets might've failed
for file in *.tar; do
if [ -z "${processed_archives[$file]}" ]; then
echo "Failed to extract $file"
fi
done
cd ..
echo "Done extracting all archives!"