File size: 3,490 Bytes
88b6846
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import { promises as fs } from 'fs';
import path from 'path';

/**
 * Resolves the root directory under which all dataset files are stored.
 *
 * Resolution order:
 * 1. The `DATA_DIR` environment variable, when set to a non-empty value.
 * 2. `/data` when the process is detected as running on Hugging Face Spaces
 *    (the location where Spaces persistent storage is expected to be mounted).
 * 3. `<current working directory>/dataset` otherwise.
 *
 * The environment is read on every call, so the result reflects the runtime
 * environment rather than whatever was set at build time. This function does
 * not check that the chosen directory exists.
 *
 * @returns The data directory path; it is not guaranteed to exist on disk.
 */
export function getDataDir(): string {
    // An explicit override always wins.
    const override = process.env.DATA_DIR;
    if (override) {
        return override;
    }

    // Spaces detection goes through the helper so the rule is defined in one place.
    if (isHuggingFaceSpaces()) {
        return '/data';
    }

    // Local development default.
    return path.join(process.cwd(), 'dataset');
}

/**
 * Reports whether the process appears to be running on Hugging Face Spaces.
 *
 * Detection relies solely on the `SPACE_ID` environment variable, which the
 * Spaces runtime is expected to set.
 *
 * @returns `true` when `SPACE_ID` holds a non-empty value, `false` otherwise.
 */
function isHuggingFaceSpaces(): boolean {
    const spaceId = process.env.SPACE_ID;
    return Boolean(spaceId);
}

/**
 * Builds a path located inside the data directory.
 *
 * @param subPaths - Segments appended, in order, to the data directory. With
 *   no segments the data directory itself (as normalized by `path.join`) is
 *   returned.
 * @returns The joined path.
 */
export function getDataPath(...subPaths: string[]): string {
    const segments = [getDataDir(), ...subPaths];
    return path.join(...segments);
}

/**
 * Returns the audio directory, optionally narrowed to a single speaker.
 *
 * @param speakerId - Speaker sub-folder name. When omitted or empty, the
 *   top-level `audio` directory is returned. The value is used as given; it
 *   is not sanitized here.
 * @returns Path to `audio` or `audio/<speakerId>` inside the data directory.
 */
export function getAudioPath(speakerId?: string): string {
    return speakerId ? getDataPath('audio', speakerId) : getDataPath('audio');
}

/**
 * Returns the transcriptions directory, optionally narrowed to a single speaker.
 *
 * @param speakerId - Speaker sub-folder name. When omitted or empty, the
 *   top-level `transcriptions` directory is returned. The value is used as
 *   given; it is not sanitized here.
 * @returns Path to `transcriptions` or `transcriptions/<speakerId>` inside
 *   the data directory.
 */
export function getTranscriptionsPath(speakerId?: string): string {
    return speakerId
        ? getDataPath('transcriptions', speakerId)
        : getDataPath('transcriptions');
}

/**
 * Returns the location of the `metadata` folder inside the data directory.
 *
 * @returns Path to the metadata directory.
 */
export function getMetadataPath(): string {
    const metadataDir = getDataPath('metadata');
    return metadataDir;
}

/**
 * Returns the location of the `fonts` folder inside the data directory.
 *
 * @returns Path to the fonts directory.
 */
export function getFontsPath(): string {
    const fontsDir = getDataPath('fonts');
    return fontsDir;
}

/**
 * Creates `dirPath`, including any missing parent directories.
 *
 * A failure whose `code` is `'EEXIST'` is tolerated and treated as success.
 * Every other failure is logged and rethrown, including thrown values that
 * are not `Error` instances or that carry no `code`, so a failed creation is
 * never mistaken for a successful one.
 *
 * @param dirPath - Directory to create.
 * @returns A promise that resolves once the directory has been created or
 *   the `EEXIST` case has been tolerated.
 * @throws The value `fs.mkdir` rejected with, unless its `code` is `'EEXIST'`.
 */
export async function ensureDir(dirPath: string): Promise<void> {
    try {
        await fs.mkdir(dirPath, { recursive: true });
    } catch (error: unknown) {
        // Read `code` defensively: the thrown value is not guaranteed to be an Error.
        const code =
            typeof error === 'object' && error !== null && 'code' in error
                ? (error as { code?: unknown }).code
                : undefined;

        if (code === 'EEXIST') {
            return;
        }

        console.error(`Failed to create directory ${dirPath}:`, error);
        throw error;
    }
}

/**
 * Converts arbitrary text into a single safe path segment.
 *
 * Processing steps, in order:
 * 1. Every `..` sequence is removed.
 * 2. Path separators and other reserved filename characters become `_`.
 * 3. Anything outside `a-z`, `A-Z`, `0-9`, `_` and `-` becomes `_`.
 * 4. The result is cut to at most `maxLength` characters.
 * 5. Leading and trailing underscores are stripped.
 *
 * @param input - Raw text, for example a user-supplied identifier.
 * @param maxLength - Maximum length applied before underscores are trimmed.
 * @returns The sanitized segment, or `'unknown'` when `input` is not a
 *   non-empty string or nothing remains after sanitizing.
 */
export function sanitizePath(input: string, maxLength: number = 50): string {
    const fallback = 'unknown';

    if (typeof input !== 'string' || input === '') {
        return fallback;
    }

    const withoutTraversal = input.replace(/\.\./g, '');
    const withoutReserved = withoutTraversal.replace(/[\/\\:*?"<>|]/g, '_');
    const safeCharsOnly = withoutReserved.replace(/[^a-zA-Z0-9_-]/g, '_');
    const truncated = safeCharsOnly.substring(0, maxLength);
    const trimmed = truncated.replace(/^_+|_+$/g, '');

    return trimmed === '' ? fallback : trimmed;
}

/**
 * Creates the data directory layout used by the application.
 *
 * The data root and its `audio`, `transcriptions`, `metadata` and `fonts`
 * sub-folders are created one after another, then the root location is
 * logged. A failure from `ensureDir` aborts the sequence and propagates to
 * the caller.
 *
 * @returns A promise that resolves once every directory has been ensured.
 */
export async function initializeDataDirs(): Promise<void> {
    // Resolve every target up front, root first.
    const rootDir = getDataPath();
    const audioDir = getAudioPath();
    const transcriptionsDir = getTranscriptionsPath();
    const metadataDir = getMetadataPath();
    const fontsDir = getFontsPath();

    for (const target of [rootDir, audioDir, transcriptionsDir, metadataDir, fontsDir]) {
        await ensureDir(target);
    }

    console.log(`[DataPath] Initialized data directories at: ${getDataDir()}`);
}