sachin1801
added api routes, changes to input pages, db creation + token mapping, history page created, batch support for csv + fasta files
923926c
/**
* File parsing utilities for CSV and FASTA files.
* Also handles multi-sequence text input parsing.
*/
/**
 * Parse FASTA formatted text into named sequence records.
 * Format: >Name\nSEQUENCE\n>Name2\nSEQUENCE2
 *
 * Rules applied while reading:
 * - A line that starts with '>' (after trimming) opens a new record; the rest
 *   of that line is the record name.
 * - Every following non-empty line is appended to the record with all
 *   whitespace removed; the final sequence is upper-cased.
 * - A record that never receives sequence data is dropped.
 * - A header with an empty name is called `Seq_<n>`, where n is the 1-based
 *   position of that record in the input (dropped records still count).
 * - Sequence lines found before the first header form an implicit first
 *   record named `Seq_1`.
 *
 * @param {string} text - FASTA formatted text
 * @returns {Array<{name: string, sequence: string}>}
 */
function parseFasta(text) {
  const records = [];
  let recordName = '';
  let residues = '';
  let recordCount = 0;

  // Store the record collected so far, unless it has no sequence data.
  const flushRecord = () => {
    if (residues) {
      records.push({ name: recordName, sequence: residues.toUpperCase() });
    }
  };

  for (const rawLine of text.trim().split('\n')) {
    // Trimming also removes the '\r' left behind by Windows line endings.
    const line = rawLine.trim();

    if (line.startsWith('>')) {
      flushRecord();
      recordCount++;
      recordName = line.substring(1).trim() || `Seq_${recordCount}`;
      residues = '';
    } else if (line) {
      if (recordCount === 0) {
        // Data before any header: count it as record 1 so it is not
        // labelled Seq_0 and later unnamed records keep distinct numbers.
        recordCount = 1;
        recordName = `Seq_${recordCount}`;
      }
      residues += line.replace(/\s/g, '');
    }
  }

  // The last record has no following header to trigger a flush.
  flushRecord();
  return records;
}
/**
 * Parse header-less input where every non-blank line is one sequence.
 * Each line is trimmed, upper-cased and stripped of inner whitespace; lines
 * that end up empty are ignored. Names are generated as Seq_1, Seq_2, ...
 * in the order the kept sequences appear.
 * @param {string} text - Plain text with one sequence per line
 * @returns {Array<{name: string, sequence: string}>}
 */
function parsePlainSequences(text) {
  return text
    .trim()
    .split('\n')
    .map((line) => line.trim().toUpperCase().replace(/\s/g, ''))
    .filter((residues) => residues.length > 0)
    .map((residues, position) => ({
      name: `Seq_${position + 1}`,
      sequence: residues,
    }));
}
/**
 * Auto-detect format and parse multi-sequence text.
 * If any line starts with '>' (ignoring leading whitespace) the text is
 * handed to parseFasta; otherwise it is handed to parsePlainSequences.
 * A '>' that appears in the middle of a line does not count as a FASTA
 * header, so such input is still parsed line by line.
 * Blank or whitespace-only input yields an empty array.
 * @param {string} text - Text to parse
 * @returns {Array<{name: string, sequence: string}>}
 */
function parseMultiSequenceText(text) {
  const trimmed = text.trim();
  if (!trimmed) {
    return [];
  }

  // With the `m` flag, `^` matches at the start of every line, so this is
  // true only when some line begins with '>'.
  const hasFastaHeader = /^\s*>/m.test(trimmed);

  return hasFastaHeader ? parseFasta(trimmed) : parsePlainSequences(trimmed);
}
/**
 * Detect the delimiter used in a CSV file.
 * Counts tabs, semicolons and commas on the first non-blank line and returns
 * the most frequent one. Ties are resolved in the order tab, semicolon,
 * comma; when none of them occurs, ',' is returned.
 * Leading blank lines are skipped so that the line inspected here is the
 * first line that actually carries data.
 * @param {string} text - CSV text content
 * @returns {string} Detected delimiter: ',', ';', or '\t'
 */
function detectDelimiter(text) {
  const sampleLine = text.split('\n').find((line) => line.trim() !== '') || '';
  const countMatches = (pattern) => (sampleLine.match(pattern) || []).length;

  const tabCount = countMatches(/\t/g);
  const semicolonCount = countMatches(/;/g);
  const commaCount = countMatches(/,/g);

  if (tabCount > 0 && tabCount >= semicolonCount && tabCount >= commaCount) {
    return '\t';
  }
  if (semicolonCount > 0 && semicolonCount >= commaCount) {
    return ';';
  }
  return ',';
}
/**
 * Detect if the first row is a header row.
 *
 * Decision order:
 * 1. An empty or missing row is not a header.
 * 2. If any cell looks like a nucleotide sequence (only A/C/G/T/U, more than
 *    20 characters) the row is data, not a header. This check runs first so
 *    that a data row whose name contains a header keyword (for example
 *    "Seq_1") is not mistaken for a header and dropped.
 * 3. Otherwise the row is a header when any cell contains one of the common
 *    header keywords (case-insensitive substring match).
 *
 * NOTE(review): step 3 is a substring match, so a data row with a short
 * sequence and a name such as "plasmid_1" is still classified as a header.
 *
 * @param {Array<string>} row - First row of CSV
 * @returns {boolean} True if row looks like a header
 */
function detectHeader(row) {
  if (!row || row.length === 0) {
    return false;
  }

  const cells = row.map((cell) => String(cell == null ? '' : cell).trim());

  const looksLikeSequence = (cell) =>
    cell.length > 20 && /^[ACGTU]+$/.test(cell.toUpperCase());
  if (cells.some(looksLikeSequence)) {
    return false;
  }

  // Common header patterns (compared case-insensitively)
  const headerPatterns = ['name', 'sequence', 'seq', 'id', 'exon', 'header'];
  return cells.some((cell) => {
    const lowerCell = cell.toLowerCase();
    return headerPatterns.some((pattern) => lowerCell.includes(pattern));
  });
}
/**
 * Parse CSV content.
 * Supports:
 * - 2 columns: name, sequence (with or without header)
 * - 1 column: sequence only (with or without header)
 * - Auto-detects delimiter (comma, semicolon, tab)
 * - Double-quoted fields, including delimiters and doubled quotes ("")
 *   inside them
 *
 * The column layout is decided from the first row: when it has two or more
 * cells, column 0 is the name and column 1 the sequence; rows with fewer
 * cells fall back to "sequence only". Sequences are upper-cased with all
 * whitespace removed; rows whose sequence is empty are skipped. Missing
 * names are generated as `Seq_<n>` from the number of sequences kept so far.
 *
 * @param {string} text - CSV file content
 * @returns {Array<{name: string, sequence: string}>}
 */
function parseCSV(text) {
  const trimmed = text.trim();
  if (!trimmed) {
    return [];
  }

  // Detect on the trimmed text so the delimiter is taken from the same first
  // line that becomes rows[0], even when the file starts with blank lines.
  const delimiter = detectDelimiter(trimmed);
  const rows = trimmed.split('\n').map((line) => splitCsvLine(line, delimiter));

  const hasHeader = detectHeader(rows[0]);
  const dataRows = hasHeader ? rows.slice(1) : rows;
  const numCols = rows[0].length;

  const sequences = [];
  for (const row of dataRows) {
    // A blank line splits into a single empty cell.
    if (row.length === 1 && !row[0]) {
      continue;
    }

    const hasNameColumn = numCols >= 2 && row.length >= 2;
    const rawSequence = hasNameColumn ? row[1] : row[0];
    const sequence = rawSequence.toUpperCase().replace(/\s/g, '');
    if (!sequence) {
      continue;
    }

    const generatedName = `Seq_${sequences.length + 1}`;
    const name = hasNameColumn ? row[0] || generatedName : generatedName;
    sequences.push({ name, sequence });
  }
  return sequences;
}

/**
 * Split one CSV line into trimmed cells, honouring double-quoted fields.
 * Inside quotes the delimiter is literal and a doubled quote ("") stands for
 * one quote character; the surrounding quotes themselves are not kept.
 * @param {string} line - Single line of CSV text
 * @param {string} delimiter - Field separator
 * @returns {Array<string>} Cell values, always at least one element
 */
function splitCsvLine(line, delimiter) {
  const cells = [];
  let field = '';
  let inQuotes = false;

  for (let i = 0; i < line.length; i++) {
    const char = line[i];
    if (char === '"') {
      if (inQuotes && line[i + 1] === '"') {
        // Escaped quote: keep one '"' and skip its twin.
        field += '"';
        i++;
      } else {
        inQuotes = !inQuotes;
      }
    } else if (char === delimiter && !inQuotes) {
      cells.push(field.trim());
      field = '';
    } else {
      field += char;
    }
  }

  cells.push(field.trim());
  return cells;
}
/**
 * Read and parse a file (CSV or FASTA).
 *
 * Format selection:
 * - `.fasta`, `.fa`, `.fna`, or any content whose first non-whitespace
 *   character is '>' is parsed as FASTA. Checking the content first keeps a
 *   multi-record FASTA saved as `.txt`/`.csv`/`.tsv` from being read as
 *   single-column CSV with the header lines turned into sequences.
 * - Everything else is parsed as CSV. For `.csv`, `.tsv` and `.txt` the
 *   result is re-parsed as FASTA when CSV yields nothing, or yields a single
 *   sequence while the text contains '>'.
 *
 * The promise rejects with Error('Failed to read file') when the reader
 * reports an error, and with the thrown error when parsing throws, so it
 * never stays pending.
 *
 * @param {File} file - File object to parse
 * @returns {Promise<Array<{name: string, sequence: string}>>}
 */
async function parseFile(file) {
  const fastaExtensions = ['.fasta', '.fa', '.fna'];
  const tabularExtensions = ['.csv', '.tsv', '.txt'];

  return new Promise((resolve, reject) => {
    const reader = new FileReader();

    reader.onload = (event) => {
      // Exceptions thrown inside this callback would otherwise be lost and
      // leave the promise unsettled.
      try {
        const text = event.target.result;
        const fileName = file.name.toLowerCase();
        const hasExtension = (extensions) =>
          extensions.some((extension) => fileName.endsWith(extension));

        let sequences;
        if (hasExtension(fastaExtensions) || text.trim().startsWith('>')) {
          sequences = parseFasta(text);
        } else {
          sequences = parseCSV(text);
          const csvLooksWrong =
            sequences.length === 0 || (sequences.length === 1 && text.includes('>'));
          if (hasExtension(tabularExtensions) && csvLooksWrong) {
            sequences = parseFasta(text);
          }
        }
        resolve(sequences);
      } catch (error) {
        reject(error);
      }
    };

    reader.onerror = () => {
      reject(new Error('Failed to read file'));
    };

    reader.readAsText(file);
  });
}
/**
 * Check one sequence against the length and alphabet rules.
 * The input is first normalised: upper-cased, every U replaced by T, and all
 * whitespace removed. The length is checked before the alphabet, so a
 * sequence that fails both reports only the length problem.
 * @param {string} sequence - Sequence to validate
 * @param {number} expectedLength - Expected length (default 70)
 * @returns {{valid: boolean, error: string}} `error` is '' when valid
 */
function validateSequence(sequence, expectedLength = 70) {
  const normalized = sequence.toUpperCase().replace(/U/g, 'T').replace(/\s/g, '');
  const failure = (error) => ({ valid: false, error });

  if (normalized.length !== expectedLength) {
    return failure(
      `Must be exactly ${expectedLength} nucleotides (got ${normalized.length})`
    );
  }

  const offending = normalized.match(/[^ACGT]/g);
  if (offending === null) {
    return { valid: true, error: '' };
  }

  // Report each offending character once, in order of first appearance.
  const distinct = Array.from(new Set(offending));
  return failure(`Contains invalid characters: ${distinct.join(', ')}`);
}
/**
 * Parse multi-sequence text and validate every parsed record.
 * Each record returned by parseMultiSequenceText is annotated in place with
 * the `valid` and `error` fields produced by validateSequence.
 * @param {string} text - Text input to parse
 * @param {number} expectedLength - Expected sequence length
 * @returns {{sequences: Array<{name: string, sequence: string, valid: boolean, error: string}>, validCount: number, invalidCount: number}}
 */
function parseAndValidate(text, expectedLength = 70) {
  const sequences = parseMultiSequenceText(text);
  let validCount = 0;

  sequences.forEach((record) => {
    const { valid, error } = validateSequence(record.sequence, expectedLength);
    record.valid = valid;
    record.error = error;
    if (valid) {
      validCount += 1;
    }
  });

  // Every record is either valid or invalid, so the rest are the failures.
  const invalidCount = sequences.length - validCount;
  return { sequences, validCount, invalidCount };
}
// Export functions for use in other scripts.
// All of them are attached to one object, `window.FileParser`, so they are
// reached as FileParser.<functionName>. This statement requires a global
// `window` object to exist when the script runs.
window.FileParser = {
// Text and file parsers
parseFasta,
parsePlainSequences,
parseMultiSequenceText,
parseCSV,
parseFile,
// CSV heuristics
detectDelimiter,
detectHeader,
// Validation
validateSequence,
parseAndValidate,
};