Spaces:

sachin1801
/

splicing-predictor

Running

File size: 9,846 Bytes

923926c

/**
 * File parsing utilities for CSV and FASTA files.
 * Also parses multi-sequence text input and validates individual sequences.
 * All helpers are exposed to other scripts through `window.FileParser`.
 */

/**
 * Parse FASTA format text.
 * Format: >Name\nSEQUENCE\n>Name2\nSEQUENCE2
 *
 * Every line is trimmed before it is inspected, so CRLF line endings and
 * indented lines are accepted. A line starting with '>' opens a new record;
 * all following non-empty lines (whitespace removed) are concatenated into
 * that record's sequence, which is returned upper-cased. Records without any
 * sequence lines are skipped.
 *
 * Records with no usable name (a bare '>' header, or sequence lines that
 * appear before the first header) are named `Seq_<n>`, where <n> is the
 * 1-based position of the record in the returned array.
 *
 * @param {string} text - FASTA formatted text
 * @returns {Array<{name: string, sequence: string}>}
 */
function parseFasta(text) {
    const sequences = [];
    let currentName = '';
    let currentSeq = '';

    // Emit the record collected so far; header-only records are dropped.
    const flushRecord = () => {
        if (!currentSeq) {
            return;
        }
        sequences.push({
            // Positional fallback keeps generated names 1-based and gap-free.
            name: currentName || `Seq_${sequences.length + 1}`,
            sequence: currentSeq.toUpperCase()
        });
    };

    for (const rawLine of text.trim().split('\n')) {
        const line = rawLine.trim();
        if (line.startsWith('>')) {
            // A new header closes the previous record.
            flushRecord();
            currentName = line.substring(1).trim();
            currentSeq = '';
        } else if (line) {
            // Sequence data may be wrapped over several lines.
            currentSeq += line.replace(/\s/g, '');
        }
    }

    // The last record has no following header to trigger the flush.
    flushRecord();

    return sequences;
}

/**
 * Parse header-less input where every non-empty line is one sequence.
 * Each line is trimmed, upper-cased and stripped of whitespace; lines that
 * end up empty are ignored. Results are named Seq_1, Seq_2, ... in order.
 * @param {string} text - Plain text with one sequence per line
 * @returns {Array<{name: string, sequence: string}>}
 */
function parsePlainSequences(text) {
    return text
        .trim()
        .split('\n')
        .map((rawLine) => rawLine.trim().toUpperCase().replace(/\s/g, ''))
        .filter((residues) => residues.length > 0)
        // Numbering follows the position among the kept (non-empty) lines.
        .map((residues, position) => ({
            name: `Seq_${position + 1}`,
            sequence: residues
        }));
}

/**
 * Auto-detect format and parse multi-sequence text.
 * If any line starts with '>' (ignoring surrounding whitespace), the input is
 * treated as FASTA; otherwise every line is treated as a plain sequence.
 * A '>' that appears in the middle of a line does not switch the format.
 * Blank or whitespace-only input yields an empty array.
 * @param {string} text - Text to parse
 * @returns {Array<{name: string, sequence: string}>}
 */
function parseMultiSequenceText(text) {
    const trimmed = text.trim();

    if (!trimmed) {
        return [];
    }

    // Check per line, the same way the FASTA parser recognises a header;
    // a substring test would misroute plain input containing a stray '>'.
    const hasFastaHeader = trimmed
        .split('\n')
        .some((line) => line.trim().startsWith('>'));

    return hasFastaHeader ? parseFasta(trimmed) : parsePlainSequences(trimmed);
}

/**
 * Detect the delimiter used in a CSV file.
 * Counts tabs, semicolons and commas on the first non-blank line and returns
 * the most frequent one. Ties are resolved in the order tab, semicolon,
 * comma; when none of them occurs (or the text has no non-blank line) the
 * result is ','. Delimiters inside quoted fields are counted as well.
 * @param {string} text - CSV text content
 * @returns {string} Detected delimiter: ',', ';', or '\t'
 */
function detectDelimiter(text) {
    // Leading blank lines carry no delimiter information, so skip them
    // instead of always falling back to ','.
    const firstLine = text.split('\n').find((line) => line.trim() !== '') || '';

    const countOf = (pattern) => (firstLine.match(pattern) || []).length;
    const tabCount = countOf(/\t/g);
    const semicolonCount = countOf(/;/g);
    const commaCount = countOf(/,/g);

    // Return the most common delimiter
    if (tabCount > 0 && tabCount >= semicolonCount && tabCount >= commaCount) {
        return '\t';
    }
    if (semicolonCount > 0 && semicolonCount >= commaCount) {
        return ';';
    }
    return ',';
}

/**
 * Detect if the first row is a header row.
 *
 * A row is reported as a header when at least one cell contains one of the
 * known header words (case-insensitive substring match) and no cell looks
 * like actual sequence data. The second condition matters because data rows
 * often carry names such as "Seq_1" or "exon_3" that contain a header word;
 * without it the first data row would be mistaken for a header.
 *
 * A cell "looks like sequence data" when, with whitespace removed, it is
 * longer than 20 characters and consists only of A, C, G, T, U or N. Rows
 * whose sequences are shorter than that are still judged by the header words
 * alone.
 *
 * @param {Array<string>} row - First row of CSV
 * @returns {boolean} True if row looks like a header
 */
function detectHeader(row) {
    if (!row || row.length === 0) return false;

    // Common header patterns (case insensitive)
    const headerPatterns = ['name', 'sequence', 'seq', 'id', 'exon', 'header'];

    // Tolerate missing cells so that a ragged row cannot throw here.
    const cells = row.map((cell) => String(cell == null ? '' : cell).trim());

    const looksLikeSequence = (cell) => {
        const compact = cell.replace(/\s/g, '').toUpperCase();
        return compact.length > 20 && /^[ACGTUN]+$/.test(compact);
    };

    // Real sequence data anywhere in the row means this is a data row,
    // whatever the name column happens to contain.
    if (cells.some(looksLikeSequence)) {
        return false;
    }

    return cells.some((cell) => {
        const lowerCell = cell.toLowerCase();
        return headerPatterns.some((pattern) => lowerCell.includes(pattern));
    });
}

/**
 * Parse CSV content.
 * Supports:
 * - 2 columns: name, sequence (with or without header)
 * - 1 column: sequence only (with or without header)
 * - Auto-detects delimiter (comma, semicolon, tab)
 * - Double-quoted fields; a delimiter inside quotes is kept as text and a
 *   doubled quote ("") inside a quoted field yields one literal quote
 *
 * Surrounding blank lines are ignored, also for delimiter detection. The
 * column layout is taken from the first row. Sequences are upper-cased and
 * stripped of whitespace; rows with an empty sequence are skipped. Missing
 * names are replaced by `Seq_<n>`, <n> being the 1-based output position.
 * Fields that span several lines are not supported (input is split on '\n').
 *
 * @param {string} text - CSV file content
 * @returns {Array<{name: string, sequence: string}>}
 */
function parseCSV(text) {
    const content = text.trim();
    if (!content) {
        return [];
    }

    // Detect on the trimmed content so the row inspected for delimiters is
    // the same first row that is parsed below.
    const delimiter = detectDelimiter(content);

    // Split one line into trimmed cells, honouring double quotes.
    const splitLine = (line) => {
        const cells = [];
        let current = '';
        let inQuotes = false;

        for (let i = 0; i < line.length; i++) {
            const char = line[i];
            if (char === '"') {
                if (inQuotes && line[i + 1] === '"') {
                    // Escaped quote inside a quoted field.
                    current += '"';
                    i++;
                } else {
                    inQuotes = !inQuotes;
                }
            } else if (char === delimiter && !inQuotes) {
                cells.push(current.trim());
                current = '';
            } else {
                current += char;
            }
        }
        // trim() also removes the '\r' left behind by CRLF line endings.
        cells.push(current.trim());
        return cells;
    };

    const rows = content.split('\n').map(splitLine);

    // Detect if first row is header
    const hasHeader = detectHeader(rows[0]);
    const dataRows = hasHeader ? rows.slice(1) : rows;

    // Determine format based on number of columns
    const numCols = rows[0].length;
    const normalize = (cell) => cell.trim().toUpperCase().replace(/\s/g, '');

    const sequences = [];
    for (const row of dataRows) {
        if (row.length === 1 && !row[0]) {
            continue; // Skip empty rows
        }

        if (numCols >= 2 && row.length >= 2) {
            // Two or more columns: assume name, sequence
            const name = row[0] || `Seq_${sequences.length + 1}`;
            const seq = normalize(row[1]);
            if (seq) {
                sequences.push({ name, sequence: seq });
            }
        } else {
            // Single column (or a short row in a wider file): sequence only
            const seq = normalize(row[0]);
            if (seq) {
                sequences.push({
                    name: `Seq_${sequences.length + 1}`,
                    sequence: seq
                });
            }
        }
    }

    return sequences;
}

/**
 * Read and parse a file (CSV or FASTA).
 *
 * The parser is chosen from the (case-insensitive) file name and the content:
 * - `.fasta`, `.fa`, `.fna`, or any content whose first non-whitespace
 *   character is '>': parsed as FASTA.
 * - `.csv`, `.tsv`, `.txt`: parsed as CSV; if that yields nothing (or a
 *   single record while the text contains '>'), FASTA parsing is tried.
 * - any other name: parsed as CSV.
 *
 * @param {File} file - File object to parse
 * @returns {Promise<Array<{name: string, sequence: string}>>} Resolves with
 *   the parsed records; rejects when the file cannot be read or a parser
 *   throws.
 */
function parseFile(file) {
    return new Promise((resolve, reject) => {
        const reader = new FileReader();

        reader.onload = (event) => {
            // An exception escaping this handler would leave the promise
            // pending forever, so convert it into a rejection.
            try {
                const text = event.target.result;
                const fileName = file.name.toLowerCase();
                const hasExtension = (extensions) =>
                    extensions.some((extension) => fileName.endsWith(extension));
                const startsWithFastaHeader = text.trim().startsWith('>');

                let sequences;
                if (hasExtension(['.fasta', '.fa', '.fna']) || startsWithFastaHeader) {
                    // Content that opens with a '>' header is FASTA even when
                    // it was saved as .txt/.csv/.tsv.
                    sequences = parseFasta(text);
                } else {
                    sequences = parseCSV(text);
                    // If CSV parsing resulted in weird sequences, try FASTA
                    const csvLooksWrong =
                        sequences.length === 0 ||
                        (sequences.length === 1 && text.includes('>'));
                    if (hasExtension(['.csv', '.tsv', '.txt']) && csvLooksWrong) {
                        sequences = parseFasta(text);
                    }
                }

                resolve(sequences);
            } catch (error) {
                reject(error);
            }
        };

        reader.onerror = () => {
            reject(new Error('Failed to read file'));
        };

        reader.readAsText(file);
    });
}

/**
 * Check one sequence for length and alphabet.
 * The input is upper-cased, every U is rewritten to T and all whitespace is
 * removed before checking. The length check runs first; only a sequence of
 * the right length is then checked for characters other than A, C, G, T.
 * @param {string} sequence - Sequence to validate
 * @param {number} expectedLength - Expected length (default 70)
 * @returns {{valid: boolean, error: string}} `error` is '' when valid
 */
function validateSequence(sequence, expectedLength = 70) {
    const upperCased = sequence.toUpperCase();
    const asDna = upperCased.replace(/U/g, 'T');
    const normalized = asDna.replace(/\s/g, '');

    const hasExpectedLength = normalized.length === expectedLength;
    if (!hasExpectedLength) {
        const error = `Must be exactly ${expectedLength} nucleotides (got ${normalized.length})`;
        return { valid: false, error };
    }

    const offenders = normalized.match(/[^ACGT]/g);
    if (offenders === null) {
        return { valid: true, error: '' };
    }

    // Report each offending character once, in order of first appearance.
    const distinctOffenders = Array.from(new Set(offenders));
    return {
        valid: false,
        error: `Contains invalid characters: ${distinctOffenders.join(', ')}`
    };
}

/**
 * Parse multi-sequence text and validate every parsed record.
 * Each record returned by the parser is annotated in place with a `valid`
 * flag and an `error` message taken from the validation result.
 * @param {string} text - Text input to parse
 * @param {number} expectedLength - Expected sequence length
 * @returns {{sequences: Array<{name: string, sequence: string, valid: boolean, error: string}>, validCount: number, invalidCount: number}}
 */
function parseAndValidate(text, expectedLength = 70) {
    const sequences = parseMultiSequenceText(text);

    sequences.forEach((record) => {
        const { valid, error } = validateSequence(record.sequence, expectedLength);
        record.valid = valid;
        record.error = error;
    });

    // Every record is either valid or invalid, so one tally determines the other.
    const validCount = sequences.filter((record) => record.valid).length;
    const invalidCount = sequences.length - validCount;

    return { sequences, validCount, invalidCount };
}

// Publish the parsing and validation helpers as a single namespace object on
// `window`, so other scripts on the page can call e.g. FileParser.parseFile().
window.FileParser = {
    parseFasta: parseFasta,
    parsePlainSequences: parsePlainSequences,
    parseMultiSequenceText: parseMultiSequenceText,
    parseCSV: parseCSV,
    parseFile: parseFile,
    detectDelimiter: detectDelimiter,
    detectHeader: detectHeader,
    validateSequence: validateSequence,
    parseAndValidate: parseAndValidate
};