Spaces:

sachin1801
/

splicing-predictor

Running

App Files Files Community

splicing-predictor / webapp /static /js /file-parser.js

sachin1801

added api routes, changes to input pages, db creation + token mapping, history page created, batch support for csv + fasta files

923926c about 1 month ago

history blame contribute delete

9.85 kB

	/**
	* File parsing utilities for CSV and FASTA files.
	* Also handles multi-sequence text input parsing.
	*/

	/**
	 * Parse FASTA format text.
	 * Format: >Name\nSEQUENCE\n>Name2\nSEQUENCE2
	 *
	 * Sequence data may span several lines; the lines are concatenated, stripped
	 * of whitespace and upper-cased. Headers that are not followed by any
	 * sequence data are dropped. Records without a usable name (empty header, or
	 * sequence data appearing before the first header) are named `Seq_<n>`,
	 * where n is the 1-based position of the record in the input.
	 *
	 * @param {string} text - FASTA formatted text
	 * @returns {Array<{name: string, sequence: string}>}
	 */
	function parseFasta(text) {
	  const sequences = [];
	  let currentName = '';
	  let currentSeq = '';
	  let seqCount = 0;

	  // Store the record being accumulated, if it has any sequence data.
	  const flush = () => {
	    if (currentSeq) {
	      sequences.push({
	        name: currentName || `Seq_${seqCount}`,
	        sequence: currentSeq.toUpperCase(),
	      });
	    }
	  };

	  // Accept \n, \r\n and lone \r line endings.
	  for (const line of text.trim().split(/\r\n?|\n/)) {
	    const trimmedLine = line.trim();
	    if (trimmedLine.startsWith('>')) {
	      // A new header closes the previous record.
	      flush();
	      seqCount++;
	      // Extract name from header (remove > and trim)
	      currentName = trimmedLine.substring(1).trim() || `Seq_${seqCount}`;
	      currentSeq = '';
	    } else if (trimmedLine) {
	      // Sequence data before any header is record number 1, not 0.
	      if (seqCount === 0) {
	        seqCount = 1;
	      }
	      // Append to current sequence (remove inner whitespace)
	      currentSeq += trimmedLine.replace(/\s/g, '');
	    }
	  }

	  // Don't forget the last sequence
	  flush();

	  return sequences;
	}

	/**
	 * Parse header-less input where every non-blank line is one sequence.
	 * Each line is upper-cased and stripped of all whitespace; blank lines are
	 * skipped and the remaining sequences are named Seq_1, Seq_2, ... in order.
	 * @param {string} text - Plain text with one sequence per line
	 * @returns {Array<{name: string, sequence: string}>}
	 */
	function parsePlainSequences(text) {
	  return text
	    .trim()
	    .split('\n')
	    .map((rawLine) => rawLine.trim().toUpperCase().replace(/\s/g, ''))
	    .filter((cleaned) => cleaned.length > 0)
	    .map((cleaned, position) => ({
	      name: `Seq_${position + 1}`,
	      sequence: cleaned,
	    }));
	}

	/**
	 * Auto-detect format and parse multi-sequence text.
	 * If any line starts with '>' (ignoring leading spaces/tabs) the text is
	 * parsed as FASTA, otherwise as plain sequences (one per line).
	 * A '>' that appears in the middle of a line does not trigger FASTA mode, so
	 * a stray character cannot merge all plain lines into one FASTA record.
	 * @param {string} text - Text to parse
	 * @returns {Array<{name: string, sequence: string}>} Empty array for blank input
	 */
	function parseMultiSequenceText(text) {
	  const trimmed = text.trim();

	  if (!trimmed) {
	    return [];
	  }

	  // Multiline flag: ^ anchors at the start of every line, not just the text.
	  const hasFastaHeader = /^[ \t]*>/m.test(trimmed);

	  return hasFastaHeader ? parseFasta(trimmed) : parsePlainSequences(trimmed);
	}

	/**
	 * Detect the delimiter used in a CSV file.
	 *
	 * Looks at the first line that has visible content (leading blank lines are
	 * skipped) and counts tabs, semicolons and commas that sit outside
	 * double-quoted fields. The most frequent candidate wins; ties are resolved
	 * in the order tab, semicolon, comma, and comma is the fallback when no
	 * candidate occurs at all.
	 *
	 * @param {string} text - CSV text content
	 * @returns {string} Detected delimiter: ',', ';', or '\t'
	 */
	function detectDelimiter(text) {
	  const firstLine = text.split(/\r\n?|\n/).find((line) => line.trim() !== '') || '';

	  let tabCount = 0;
	  let semicolonCount = 0;
	  let commaCount = 0;
	  let inQuotes = false;

	  // Delimiter characters inside quoted fields are data, not separators.
	  for (const char of firstLine) {
	    if (char === '"') {
	      inQuotes = !inQuotes;
	    } else if (!inQuotes) {
	      if (char === '\t') {
	        tabCount++;
	      } else if (char === ';') {
	        semicolonCount++;
	      } else if (char === ',') {
	        commaCount++;
	      }
	    }
	  }

	  // Return the most common delimiter
	  if (tabCount > 0 && tabCount >= semicolonCount && tabCount >= commaCount) {
	    return '\t';
	  }
	  if (semicolonCount > 0 && semicolonCount >= commaCount) {
	    return ';';
	  }
	  return ',';
	}

	/**
	 * Detect if the first row is a header row.
	 *
	 * A row counts as a header when at least one cell contains a common header
	 * keyword (case-insensitive substring match) and no cell looks like actual
	 * sequence data. The sequence check comes first so that data rows whose name
	 * column happens to contain a keyword (e.g. "Seq_1", "exon_12", "plasmid_7")
	 * are not mistaken for a header and dropped.
	 *
	 * @param {Array<string>} row - First row of CSV
	 * @returns {boolean} True if row looks like a header
	 */
	function detectHeader(row) {
	  if (!row || row.length === 0) return false;

	  // Common header patterns (case insensitive)
	  const headerPatterns = ['name', 'sequence', 'seq', 'id', 'exon', 'header'];

	  const cells = row.map((cell) => (cell == null ? '' : String(cell).trim()));

	  // A long run of nucleotide letters means this row carries data.
	  const holdsSequence = cells.some(
	    (cell) => cell.length > 20 && /^[ACGTUN]+$/i.test(cell)
	  );
	  if (holdsSequence) {
	    return false;
	  }

	  return cells.some((cell) => {
	    const lowerCell = cell.toLowerCase();
	    return headerPatterns.some((pattern) => lowerCell.includes(pattern));
	  });
	}

	/**
	 * Parse CSV content.
	 * Supports:
	 * - 2+ columns: first column is the name, second the sequence
	 *   (with or without header)
	 * - 1 column: sequence only (with or without header)
	 * - Auto-detects delimiter (comma, semicolon, tab)
	 * - Double-quoted fields, including "" as an escaped quote
	 * - \n, \r\n and \r line endings
	 *
	 * Sequences are upper-cased and stripped of whitespace. Rows without a
	 * sequence are skipped, and rows with an empty name are named `Seq_<n>`.
	 * The column layout is decided from the first row.
	 *
	 * @param {string} text - CSV file content
	 * @returns {Array<{name: string, sequence: string}>}
	 */
	function parseCSV(text) {
	  const trimmed = text.trim();
	  if (!trimmed) {
	    return [];
	  }

	  // Detect on the trimmed text so the delimiter is taken from the same first
	  // line the rows are built from, even when the file starts with blank lines.
	  const delimiter = detectDelimiter(trimmed);

	  // Split one line into trimmed cells, honouring quoted fields.
	  const splitRow = (line) => {
	    const cells = [];
	    let current = '';
	    let inQuotes = false;

	    for (let i = 0; i < line.length; i++) {
	      const char = line[i];
	      if (char === '"') {
	        if (inQuotes && line[i + 1] === '"') {
	          // "" inside a quoted field is a literal quote character.
	          current += '"';
	          i++;
	        } else {
	          inQuotes = !inQuotes;
	        }
	      } else if (char === delimiter && !inQuotes) {
	        cells.push(current.trim());
	        current = '';
	      } else {
	        current += char;
	      }
	    }
	    cells.push(current.trim());
	    return cells;
	  };

	  const rows = trimmed.split(/\r\n?|\n/).map(splitRow);

	  // Detect if first row is header
	  const hasHeader = detectHeader(rows[0]);
	  const dataRows = hasHeader ? rows.slice(1) : rows;

	  // Determine format based on number of columns in the first row
	  const numCols = rows[0].length;

	  const sequences = [];
	  for (const row of dataRows) {
	    if (row.length === 1 && !row[0]) {
	      continue; // Skip empty rows
	    }

	    const isNamedRow = numCols >= 2 && row.length >= 2;
	    const rawSequence = isNamedRow ? row[1] : row[0];
	    const sequence = rawSequence.toUpperCase().replace(/\s/g, '');
	    if (!sequence) {
	      continue;
	    }

	    const fallbackName = `Seq_${sequences.length + 1}`;
	    const name = isNamedRow ? row[0] || fallbackName : fallbackName;
	    sequences.push({ name, sequence });
	  }

	  return sequences;
	}

	/**
	 * Read and parse a file (CSV or FASTA).
	 *
	 * Format selection:
	 * - .fasta / .fa / .fna, or any content whose first non-blank character is
	 *   '>', is parsed as FASTA. The content check matters for FASTA saved as
	 *   .txt/.csv, which the CSV parser would otherwise turn into bogus rows.
	 * - Everything else is parsed as CSV. For .csv / .tsv / .txt the FASTA parser
	 *   is tried as a fallback when CSV yields nothing, or yields a single record
	 *   while the text contains '>'.
	 *
	 * @param {File} file - File object to parse
	 * @returns {Promise<Array<{name: string, sequence: string}>>} Rejects when the
	 *   file cannot be read or when parsing throws.
	 */
	async function parseFile(file) {
	  return new Promise((resolve, reject) => {
	    const reader = new FileReader();

	    reader.onload = (event) => {
	      // An exception thrown inside this handler would otherwise be lost and
	      // leave the promise pending forever, so convert it into a rejection.
	      try {
	        const text = event.target.result;
	        const fileName = file.name.toLowerCase();
	        const hasExtension = (extensions) => extensions.some((ext) => fileName.endsWith(ext));
	        const startsWithHeader = text.trim().startsWith('>');

	        let sequences;
	        if (hasExtension(['.fasta', '.fa', '.fna']) || startsWithHeader) {
	          sequences = parseFasta(text);
	        } else {
	          sequences = parseCSV(text);
	          // If CSV parsing resulted in weird sequences, try FASTA
	          const looksMisparsed =
	            sequences.length === 0 || (sequences.length === 1 && text.includes('>'));
	          if (looksMisparsed && hasExtension(['.csv', '.tsv', '.txt'])) {
	            sequences = parseFasta(text);
	          }
	        }

	        resolve(sequences);
	      } catch (err) {
	        reject(err);
	      }
	    };

	    reader.onerror = () => {
	      reject(new Error('Failed to read file'));
	    };

	    reader.readAsText(file);
	  });
	}

	/**
	 * Check one sequence for length and alphabet.
	 * The input is upper-cased, U is mapped to T and all whitespace is removed
	 * before checking. Length is checked first; only a sequence of the right
	 * length is then checked for characters outside A, C, G, T.
	 * @param {string} sequence - Sequence to validate
	 * @param {number} expectedLength - Expected length (default 70)
	 * @returns {{valid: boolean, error: string}} error is '' when valid
	 */
	function validateSequence(sequence, expectedLength = 70) {
	  const normalized = sequence.toUpperCase().replace(/U/g, 'T').replace(/\s/g, '');
	  const actualLength = normalized.length;

	  if (actualLength !== expectedLength) {
	    const lengthMessage = `Must be exactly ${expectedLength} nucleotides (got ${actualLength})`;
	    return { valid: false, error: lengthMessage };
	  }

	  const offenders = normalized.match(/[^ACGT]/g);
	  if (!offenders) {
	    return { valid: true, error: '' };
	  }

	  // Report each offending character once, in order of first appearance.
	  const distinct = Array.from(new Set(offenders));
	  return {
	    valid: false,
	    error: `Contains invalid characters: ${distinct.join(', ')}`,
	  };
	}

	/**
	 * Parse multi-sequence text and validate every parsed record.
	 * Each record returned by parseMultiSequenceText is annotated in place with
	 * `valid` and `error` taken from validateSequence, and the valid and invalid
	 * records are counted.
	 * @param {string} text - Text input to parse
	 * @param {number} expectedLength - Expected sequence length
	 * @returns {{sequences: Array<{name: string, sequence: string, valid: boolean, error: string}>, validCount: number, invalidCount: number}}
	 */
	function parseAndValidate(text, expectedLength = 70) {
	  const sequences = parseMultiSequenceText(text);
	  const tally = { validCount: 0, invalidCount: 0 };

	  sequences.forEach((record) => {
	    const outcome = validateSequence(record.sequence, expectedLength);
	    record.valid = outcome.valid;
	    record.error = outcome.error;
	    tally[outcome.valid ? 'validCount' : 'invalidCount'] += 1;
	  });

	  return {
	    sequences,
	    validCount: tally.validCount,
	    invalidCount: tally.invalidCount,
	  };
	}

	// Publish the parsing helpers as a single FileParser object on window so
	// other scripts on the page can call them.
	window.FileParser = {
	  parseFasta: parseFasta,
	  parsePlainSequences: parsePlainSequences,
	  parseMultiSequenceText: parseMultiSequenceText,
	  parseCSV: parseCSV,
	  parseFile: parseFile,
	  detectDelimiter: detectDelimiter,
	  detectHeader: detectHeader,
	  validateSequence: validateSequence,
	  parseAndValidate: parseAndValidate,
	};