fhueni's picture
feat: add imdb and ag news dataset to datasets
3a94de3
raw
history blame
7.4 kB
/**
 * DatasetLoader fetches a CSV dataset and turns every record into a
 * `{id, prompt, groundTruth}` item that is ready for processing.
 */
export class DatasetLoader {
    /**
     * Create a loader and immediately start loading the given dataset.
     *
     * Loading is asynchronous: `_dataset` stays `null` until the fetch has
     * completed successfully.
     *
     * @param {string} [datasetName='boolq_validation'] - Name of the CSV dataset to load, without file extension
     */
    constructor(datasetName = 'boolq_validation') {
        this.running = false;
        this._dataset = null;
        this._datasetName = datasetName;
        // Not awaited on purpose: loadDataset() handles its own errors, so the
        // returned promise never rejects.
        this.loadDataset(this._datasetName);
    }

    /**
     * Load the dataset from `./dataset/<name>.csv` and store the prepared items
     * in `this._dataset`.
     *
     * Commas and line breaks that appear inside a quoted field are treated as
     * part of the field, not as delimiters. The first record (header) is
     * dropped and blank records are skipped.
     *
     * Errors (missing file, unsupported dataset name, malformed record) are
     * logged with `console.error`; in that case the returned promise resolves
     * to `undefined` instead of rejecting.
     *
     * @param {string} name - Name of the csv dataset to load without file extension
     * @returns {Promise<Array<{id: *, prompt: string, groundTruth: *}>|undefined>}
     */
    loadDataset(name) {
        const path = `./dataset/${name}.csv`;

        return fetch(path)
            .then(response => {
                if (!response.ok) {
                    throw new Error(`Dataset file not found: ${path}`);
                }
                return response.text();
            })
            .then(data => {
                // Split quote-aware so that a quoted field spanning several
                // physical lines stays inside a single record.
                const records = this._splitCSVRecords(data);

                // drop header
                records.shift();

                this._dataset = records
                    .filter(record => record.trim().length > 0)
                    .map(record => {
                        const {id, full_prompt, answer} = this._loadRecord(name, record);
                        return {id, prompt: full_prompt, groundTruth: answer};
                    });

                console.log(`✅ Dataset '${name}' loaded with ${this._dataset.length} items.`);
                console.log(this._dataset.slice(0, 2)); // log first 2 items for verification
                return this._dataset;
            })
            .catch(error => {
                // Best effort: report the problem and resolve with undefined.
                console.error(error);
            });
    }

    /**
     * Split raw CSV text into records.
     *
     * All `\r\n` and `\r` line endings are normalised to `\n` first. A `\n` only
     * ends a record when it is outside of double quotes. An escaped quote (`""`)
     * toggles the quote state twice and therefore leaves it unchanged. If a
     * quote is never closed, the remainder of the text becomes one record.
     *
     * @param {string} text - Complete content of a CSV file
     * @returns {string[]} Raw records, still CSV encoded (header and blank records included)
     * @private
     */
    _splitCSVRecords(text) {
        const normalized = text.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
        const records = [];
        let start = 0;
        let inQuotes = false;

        for (let i = 0; i < normalized.length; i++) {
            const ch = normalized[i];
            if (ch === '"') {
                inQuotes = !inQuotes;
            } else if (ch === '\n' && !inQuotes) {
                records.push(normalized.slice(start, i));
                start = i + 1;
            }
        }
        records.push(normalized.slice(start));

        return records;
    }

    /**
     * Dispatch a raw CSV record to the loader that belongs to the dataset.
     *
     * @param {string} name - Name of the dataset the record comes from
     * @param {string} record - A single raw CSV record
     * @returns {{full_prompt: string, answer: *, id: *}}
     * @throws {Error} If the dataset name is not supported
     * @private
     */
    _loadRecord(name, record) {
        switch (name) {
            case 'boolq_validation':
                return this._loadBoolQLine(record);
            case 'spam_ham_dataset':
                return this._loadSpamHamLine(record);
            case 'imdb_dataset':
                return this._loadIMDBLine(record);
            case 'ag_news_test':
                return this._loadAGNewsLine(record);
            default:
                throw new Error(`DatasetLoader: Unsupported dataset name '${name}'`);
        }
    }

    /**
     * Load a single line from the BoolQ dataset and prepare the prompt.
     *
     * The fields are read in the order id, question, answer, context.
     *
     * @param {string} line - A single line from the BoolQ CSV dataset
     * @returns {{full_prompt: string, answer: *, id: *}}
     * @private
     */
    _loadBoolQLine(line) {
        // parse line into fields handling quoted commas
        const [id, question, answer, context] = this._parseCSVLine(line);

        // set the prompt
        const full_prompt = [
            `Question: ${question}`,
            `Context: ${context}`,
            'Instructions: Answer with ONLY the word "true" or "false". Do not provide any explanation or additional text.',
            'Answer:',
        ].join('\n');

        return {id, full_prompt, answer};
    }

    /**
     * Load a single line from the SpamHam dataset and prepare the prompt.
     *
     * The fields are read in the order id, text, answer. The answer is
     * converted to the string 'true' when the label is "spam" (case
     * insensitive) and to 'false' otherwise.
     *
     * @param {string} line - A single line from the SpamHam CSV dataset
     * @returns {{full_prompt: string, answer: (string), id: *}}
     * @private
     */
    _loadSpamHamLine(line) {
        const [id, text, label] = this._parseCSVLine(line);

        // convert answer to string boolean
        const answer = (label.toLowerCase() === 'spam') ? 'true' : 'false';

        // set the prompt
        const full_prompt = [
            'Task: Determine whether the following message is spam or not.',
            'Instructions: Answer with ONLY the word "true" or "false". Do not provide any explanation or additional text.',
            `Message: ${text}`,
            'Answer:',
        ].join('\n');

        return {id, full_prompt, answer};
    }

    /**
     * Load a single line from the IMDB dataset and prepare the prompt.
     *
     * The fields are read in the order id, review, answer.
     *
     * @param {string} line - A single line from the IMDB CSV dataset
     * @returns {{full_prompt: string, answer: *, id: *}}
     * @private
     */
    _loadIMDBLine(line) {
        const [id, review, answer] = this._parseCSVLine(line);

        // set the prompt
        const full_prompt = [
            'Task: Determine whether the sentiment of the following review is positive or negative.',
            'Instructions: Answer with ONLY the word "positive" or "negative". Do not provide any explanation or additional text.',
            `Review: ${review}`,
            'Sentiment:',
        ].join('\n');

        return {id, full_prompt, answer};
    }

    /**
     * Load a single line from the AG News dataset and prepare the prompt.
     *
     * The fields are read in the order id, answer, title, description.
     *
     * @param {string} line - A single line from the AG News CSV dataset
     * @returns {{full_prompt: string, answer: *, id: *}}
     * @private
     */
    _loadAGNewsLine(line) {
        const [id, answer, title, description] = this._parseCSVLine(line);

        // set the prompt (the trailing empty entry keeps the final line break)
        const full_prompt = [
            'Task: Determine whether the following news article belong to world, sports, business or Sci/Tech category.',
            'Categories: World (1), Sports (2), Business (3), Sci/Tech (4).',
            'Instructions: Answer with ONLY the id (1,2,3 or 4) of the class. Do not provide any explanation or additional text.',
            `News Title: ${title}`,
            `News Description: ${description}`,
            '',
        ].join('\n');

        return {id, full_prompt, answer};
    }

    /**
     * Parse a single CSV record into fields, handling quoted fields with commas.
     *
     * Inside quotes every character (including commas and line breaks) belongs
     * to the field; a doubled quote (`""`) yields one literal quote. The
     * surrounding quotes themselves are not part of the returned value.
     *
     * @param {string} line - A single record from a CSV file
     * @returns {string[]} The decoded fields; always contains at least one entry
     * @private
     */
    _parseCSVLine(line) {
        // inline CSV parse with quotes support
        const fields = [];
        let cur = '';
        let inQuotes = false;

        for (let i = 0; i < line.length; i++) {
            const ch = line[i];

            if (inQuotes) { // if we are in a quote we just look for the quote ending
                if (ch === '"') {
                    // escaped quote ""
                    if (i + 1 < line.length && line[i + 1] === '"') {
                        cur += '"';
                        i++;
                    } else {
                        inQuotes = false;
                    }
                } else {
                    cur += ch;
                }
            } else { // only if we are not in a quote we count the comma as a delimiter
                if (ch === ',') {
                    fields.push(cur);
                    cur = '';
                } else if (ch === '"') {
                    inQuotes = true;
                } else {
                    cur += ch;
                }
            }
        }
        fields.push(cur);

        return fields;
    }
}