File size: 3,281 Bytes
857c2e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/bin/bash
# Resolve the dataset root once: ROBOMETER_PROCESSED_DATASETS_PATH takes
# precedence, RBM_PROCESSED_DATASETS_PATH is the fallback when the former is
# unset or empty.
datasets_root="${ROBOMETER_PROCESSED_DATASETS_PATH:-$RBM_PROCESSED_DATASETS_PATH}"

# Bail out early when neither variable provides a location.
[[ -n "$datasets_root" ]] || {
    echo "ROBOMETER_PROCESSED_DATASETS_PATH (or RBM_PROCESSED_DATASETS_PATH) is not set"
    exit 1
}

# Everything that follows works on paths relative to the dataset root.
cd "$datasets_root" || exit 1

# Track already processed archives to avoid duplicates.
# Keys are archive base names (e.g. "foo.tar"); the value is always 1.
declare -A processed_archives

#######################################
# Reassemble and extract every split archive of one naming style found in the
# current directory, deleting its parts once extraction succeeded.
# Globals:
#   processed_archives (written) - base name of each extracted archive
# Arguments:
#   $1 - separator between ".part" and the split suffix:
#        ""  for foo.tar.partaa,  foo.tar.partab,  ...
#        "-" for foo.tar.part-aa, foo.tar.part-ab, ...
# Outputs:
#   Progress/failure messages and tar's verbose listing on stdout.
#######################################
extract_split_archives() {
    local sep="$1"
    local first_suffix=".part${sep}aa"
    local first_part base_name
    local -a parts

    echo "Processing split archives..."
    for first_part in *.tar"$first_suffix"; do
        # An unmatched glob stays literal, so make sure the file really exists.
        [[ -f "$first_part" ]] || continue

        # Get the base name without the .partaa / .part-aa suffix
        base_name="${first_part%"$first_suffix"}"

        # Collect only the parts that belong to this naming style. A plain
        # "${base_name}.part"* would also match the other style's parts of the
        # same base name, concatenating (and later deleting) unrelated files.
        if [[ -n "$sep" ]]; then
            parts=("${base_name}.part${sep}"*)
        else
            parts=("${base_name}.part"[!-]*)
        fi

        echo "Extracting split archive: $base_name"
        # Concatenate all parts and extract. pipefail is enabled in a subshell
        # only, so the rest of the script is unaffected; it makes a read error
        # in cat count as a failure instead of looking at tar's status alone.
        if (set -o pipefail; cat -- "${parts[@]}" | tar -xvf -); then
            # remove the parts only after a successful extraction
            rm -- "${parts[@]}"
            # Mark this base archive as processed
            processed_archives["$base_name"]=1
        else
            echo "Failed to extract $base_name, will need to retry and remove the failed parts"
        fi
    done
}

# First, handle split archives (.tar.partaa, .tar.partab, etc.)
extract_split_archives ""

# Now handle split archives that look like .tar.part-aa, .tar.part-ab, etc.
extract_split_archives "-"

# Second pass: plain .tar archives. Anything already recorded in
# processed_archives (i.e. handled as a split archive) is left alone.
echo "Processing regular tar files..."
for tarball in *.tar; do
    # Skip the literal pattern left by an unmatched glob and anything that is
    # not a regular file.
    [[ -f "$tarball" ]] || continue

    # Already handled as a split archive: nothing to do.
    [[ -z "${processed_archives[$tarball]}" ]] || continue

    echo "Extracting: $tarball"
    if tar -xvf "$tarball"; then
        # Record the success, then delete the archive that is no longer needed.
        processed_archives["$tarball"]=1
        rm "$tarball"
    else
        # Keep the archive on disk so the extraction can be retried.
        echo "Failed to extract $tarball, will need to retry and remove the failed tar file"
    fi
done

#######################################
# Some archives unpack into a `processed_datasets` subdirectory. Move every
# dataset directory found there up into the current directory, then delete the
# wrapper directory.
#
# The wrapper is deleted only when every move succeeded: `mv` fails, for
# example, when a non-empty directory of the same name already exists here,
# and deleting the wrapper in that case would destroy the unmoved dataset.
# Outputs:
#   Progress messages on stdout, failure messages on stderr.
# Returns:
#   0 if there was nothing to do or everything was moved, 1 otherwise.
#######################################
flatten_processed_datasets() {
    local wrapper="processed_datasets"
    local dir
    local failed=0

    [[ -d "$wrapper" ]] || return 0

    echo "Moving datasets out of processed_datasets subdirectory..."
    for dir in "$wrapper"/*; do
        # Only directories are moved; an unmatched glob is skipped here too.
        [[ -d "$dir" ]] || continue
        if ! mv -- "$dir" .; then
            echo "Failed to move $dir out of processed_datasets" >&2
            failed=1
        fi
    done

    if (( failed )); then
        echo "Keeping processed_datasets because some datasets could not be moved" >&2
        return 1
    fi

    rm -rf -- "$wrapper"
    echo "Done moving datasets out of processed_datasets subdirectory!"
}

flatten_processed_datasets

#######################################
# Print which archives might have failed to extract, judged by what is still
# present in the current directory.
# Globals:
#   processed_archives (read) - archives recorded as successfully handled
# Outputs:
#   One "Failed to extract <archive>" line per suspected failure on stdout.
#######################################
report_failed_archives() {
    local file

    # Plain archives: a .tar that is still present and not recorded in
    # processed_archives was not extracted.
    for file in *.tar; do
        # When no .tar is left the glob stays literal; without this check a
        # fully successful run would report "Failed to extract *.tar".
        [[ -f "$file" ]] || continue
        if [[ -z "${processed_archives[$file]}" ]]; then
            echo "Failed to extract $file"
        fi
    done

    # Split archives: their parts are deleted after a successful extraction,
    # so a leftover first part means that archive was not extracted.
    for file in *.tar.partaa *.tar.part-aa; do
        [[ -f "$file" ]] || continue
        echo "Failed to extract ${file%.part*}"
    done
}

report_failed_archives
cd ..
echo "Done extracting all archives!"