File size: 7,399 Bytes
9b487db 3a94de3 9b487db 3a94de3 9b487db 3a94de3 9b487db | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 |
/**
 * DatasetLoader fetches a CSV dataset and prepares it for processing.
 *
 * Every CSV record is converted into an item of the shape
 * `{id, prompt, groundTruth}`, where `prompt` is a dataset-specific
 * instruction text built from the record's fields.
 */
export class DatasetLoader {
  /**
   * Create a loader and immediately start loading the given dataset.
   *
   * The load is asynchronous and is not awaited here: `_dataset` stays `null`
   * until the fetch has completed. Callers that need the items should await
   * their own call to `loadDataset`.
   *
   * @param {string} datasetName - Name of the CSV dataset (without file extension)
   */
  constructor(datasetName = 'boolq_validation') {
    // NOTE(review): `running` is not read anywhere in this class; presumably it is used by callers - confirm.
    this.running = false;
    this._dataset = null;
    this._datasetName = datasetName;
    // loadDataset logs and swallows its own errors, so this promise never rejects
    this.loadDataset(this._datasetName);
  }

  /**
   * Load the dataset from the CSV file `./dataset/<name>.csv`.
   *
   * The first record is treated as the header and dropped, blank records are
   * skipped. Commas and line breaks inside a quoted field are part of the field
   * and are not interpreted as delimiters.
   *
   * Errors (missing file, unsupported dataset name, network failure) are logged
   * with `console.error` and not re-thrown; in that case the returned promise
   * resolves to `undefined` and `_dataset` keeps its previous value.
   *
   * @param {string} name - Name of the csv dataset to load without file extension
   * @returns {Promise<Array<{id: string, prompt: string, groundTruth: string}>|undefined>}
   * @private
   */
  loadDataset(name) {
    const path = `./dataset/${name}.csv`;
    return fetch(path)
      .then(response => {
        if (!response.ok) {
          throw new Error(`Dataset file not found: ${path}`);
        }
        return response.text();
      })
      .then(data => {
        // resolve the parser once per load instead of once per record
        const parseLine = this._getLineParser(name);
        const records = this._splitCSVRecords(data);
        // drop header
        records.shift();
        this._dataset = records
          .filter(record => record.trim().length > 0)
          .map(record => {
            const {id, full_prompt, answer} = parseLine(record);
            return {id, prompt: full_prompt, groundTruth: answer};
          });
        console.log(`✅ Dataset '${name}' loaded with ${this._dataset.length} items.`);
        console.log(this._dataset.slice(0, 2)); // log first 2 items for verification
        return this._dataset;
      })
      .catch(error => {
        // best effort: report the problem but never reject (the constructor does not await)
        console.error(error);
      });
  }

  /**
   * Select the record parser that belongs to a dataset name.
   *
   * @param {string} name - Name of the dataset
   * @returns {function(string): {id: string, full_prompt: string, answer: string}}
   * @throws {Error} If the dataset name is not supported
   * @private
   */
  _getLineParser(name) {
    switch (name) {
      case 'boolq_validation':
        return line => this._loadBoolQLine(line);
      case 'spam_ham_dataset':
        return line => this._loadSpamHamLine(line);
      case 'imdb_dataset':
        return line => this._loadIMDBLine(line);
      case 'ag_news_test':
        return line => this._loadAGNewsLine(line);
      default:
        throw new Error(`DatasetLoader: Unsupported dataset name '${name}'`);
    }
  }

  /**
   * Split raw CSV text into records.
   *
   * All line endings (`\r\n` and `\r`) are normalized to `\n` first. A `\n` only
   * terminates a record when it is outside of a quoted field, so quoted fields
   * may span several lines. Quote state is tracked by toggling on every `"`,
   * which matches `_parseCSVLine` (an escaped `""` toggles twice and therefore
   * leaves the state unchanged).
   *
   * Like `String.prototype.split`, text ending with a line break yields a
   * trailing empty record.
   *
   * @param {string} data - Complete content of a CSV file
   * @returns {string[]} One string per CSV record, without the terminating line break
   * @private
   */
  _splitCSVRecords(data) {
    const text = data.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
    const records = [];
    let start = 0;
    let inQuotes = false;
    for (let i = 0; i < text.length; i++) {
      const ch = text[i];
      if (ch === '"') {
        inQuotes = !inQuotes;
      } else if (ch === '\n' && !inQuotes) {
        records.push(text.slice(start, i));
        start = i + 1;
      }
    }
    records.push(text.slice(start));
    return records;
  }

  /**
   * Load a single line from the BoolQ dataset and prepare the prompt
   *
   * Expected field order: id, question, answer, context.
   *
   * @param {string} line - A single record from the BoolQ CSV dataset
   * @returns {{full_prompt: string, answer: *, id: *}}
   * @private
   */
  _loadBoolQLine(line) {
    // parse line into fields handling quoted commas
    const [id, question, answer, context] = this._parseCSVLine(line);
    // set the prompt
    const full_prompt = [
      `Question: ${question}`,
      `Context: ${context}`,
      'Instructions: Answer with ONLY the word "true" or "false". Do not provide any explanation or additional text.',
      'Answer:',
    ].join('\n');
    return {id, full_prompt, answer};
  }

  /**
   * Load a single line from the SpamHam dataset and prepare the prompt
   *
   * Expected field order: id, text, answer. The label is mapped to a string
   * boolean: 'spam' (case-insensitive) becomes 'true', anything else 'false'.
   *
   * @param {string} line - A single record from the SpamHam CSV dataset
   * @returns {{full_prompt: string, answer: (string), id: *}}
   * @private
   */
  _loadSpamHamLine(line) {
    const [id, text, label] = this._parseCSVLine(line);
    // convert answer to string boolean
    const answer = (label.toLowerCase() === 'spam') ? 'true' : 'false';
    // set the prompt
    const full_prompt = [
      'Task: Determine whether the following message is spam or not.',
      'Instructions: Answer with ONLY the word "true" or "false". Do not provide any explanation or additional text.',
      `Message: ${text}`,
      'Answer:',
    ].join('\n');
    return {id, full_prompt, answer};
  }

  /**
   * Load a single line from the IMDB dataset and prepare the prompt
   *
   * Expected field order: id, review, answer.
   *
   * @param {string} line - A single record from the IMDB CSV dataset
   * @returns {{full_prompt: string, answer: *, id: *}}
   * @private
   */
  _loadIMDBLine(line) {
    const [id, review, answer] = this._parseCSVLine(line);
    // set the prompt
    const full_prompt = [
      'Task: Determine whether the sentiment of the following review is positive or negative.',
      'Instructions: Answer with ONLY the word "positive" or "negative". Do not provide any explanation or additional text.',
      `Review: ${review}`,
      'Sentiment:',
    ].join('\n');
    return {id, full_prompt, answer};
  }

  /**
   * Load a single line from the AG News dataset and prepare the prompt
   *
   * Expected field order: id, answer, title, description.
   *
   * @param {string} line - A single record from the AG News CSV dataset
   * @returns {{full_prompt: string, answer: *, id: *}}
   * @private
   */
  _loadAGNewsLine(line) {
    const [id, answer, title, description] = this._parseCSVLine(line);
    // set the prompt (the trailing empty entry keeps the final line break of the prompt)
    const full_prompt = [
      'Task: Determine whether the following news article belong to world, sports, business or Sci/Tech category.',
      'Categories: World (1), Sports (2), Business (3), Sci/Tech (4).',
      'Instructions: Answer with ONLY the id (1,2,3 or 4) of the class. Do not provide any explanation or additional text.',
      `News Title: ${title}`,
      `News Description: ${description}`,
      '',
    ].join('\n');
    return {id, full_prompt, answer};
  }

  /**
   * Parse a single CSV record into fields, handling quoted fields with commas
   *
   * Inside quotes every character (including commas and line breaks) belongs to
   * the field; a doubled quote `""` yields one literal quote character. The
   * surrounding quote characters themselves are not part of the field value.
   *
   * @param {string} line - A single record from a CSV file
   * @returns {string[]} The field values in order of appearance
   * @private
   */
  _parseCSVLine(line) {
    const fields = [];
    let cur = '';
    let inQuotes = false;
    for (let i = 0; i < line.length; i++) {
      const ch = line[i];
      if (inQuotes) { // if we are in a quote we just look for the quote ending
        if (ch !== '"') {
          cur += ch;
        } else if (i + 1 < line.length && line[i + 1] === '"') {
          // escaped quote ""
          cur += '"';
          i++;
        } else {
          inQuotes = false;
        }
      } else if (ch === ',') { // only outside of a quote the comma counts as a delimiter
        fields.push(cur);
        cur = '';
      } else if (ch === '"') {
        inQuotes = true;
      } else {
        cur += ch;
      }
    }
    fields.push(cur);
    return fields;
  }
}
|