|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Loads a CSV dataset from `./dataset/<name>.csv` and converts every data row
 * into an `{ id, prompt, groundTruth }` item.
 *
 * Supported dataset names: 'boolq_validation', 'spam_ham_dataset',
 * 'imdb_dataset' and 'ag_news_test'.
 */
export class DatasetLoader {
  /**
   * Creates the loader and immediately starts loading `datasetName`.
   *
   * @param {string} [datasetName='boolq_validation'] - Name of the dataset to load.
   */
  constructor(datasetName = 'boolq_validation') {
    // Initialised to false; this class never reads or changes it afterwards.
    // NOTE(review): presumably toggled by external code - confirm.
    this.running = false;
    this._dataset = null;
    this._datasetName = datasetName;

    // A constructor cannot be awaited, so the load promise is kept on the
    // instance instead of being dropped. It never rejects (see loadDataset).
    this.ready = this.loadDataset(this._datasetName);
  }

  /**
   * Fetches `./dataset/<name>.csv`, drops the header record, skips blank
   * records and stores the converted items in `this._dataset`.
   *
   * Failures (unsupported name, non-OK response, malformed row, network
   * error) are logged with `console.error`; the returned promise then
   * resolves to `undefined` and `this._dataset` is left untouched.
   *
   * @param {string} name - Dataset name (also used to build the file path).
   * @returns {Promise<Array<{id: string, prompt: string, groundTruth: string}>|undefined>}
   *   The loaded items, or `undefined` when loading failed.
   */
  async loadDataset(name) {
    const path = `./dataset/${name}.csv`;

    try {
      // Resolve the row builder first so an unsupported name fails before any
      // request is made, and fails even when the file has no data rows.
      const buildRow = this._selectRowLoader(name);

      const response = await fetch(path);
      if (!response.ok) {
        throw new Error(`Dataset file not found: ${path}`);
      }
      const data = await response.text();

      // Normalise CRLF / CR line endings, then split into records without
      // breaking quoted fields that contain line breaks.
      const normalized = data.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
      const records = this._splitCSVRecords(normalized);

      // The first record is the header.
      records.shift();

      this._dataset = records
        .filter((record) => record.trim().length > 0)
        .map((record) => {
          const { id, full_prompt, answer } = buildRow(record);
          return { id, prompt: full_prompt, groundTruth: answer };
        });

      console.log(`✅ Dataset '${name}' loaded with ${this._dataset.length} items.`);
      console.log(this._dataset.slice(0, 2));
      return this._dataset;
    } catch (error) {
      // Best effort by design: report the problem and resolve to undefined.
      console.error(error);
      return undefined;
    }
  }

  /**
   * Returns the function that turns one raw CSV record of dataset `name`
   * into `{ id, full_prompt, answer }`.
   *
   * @param {string} name - Dataset name.
   * @returns {(record: string) => {id: string, full_prompt: string, answer: string}}
   * @throws {Error} When `name` is not one of the supported datasets.
   */
  _selectRowLoader(name) {
    switch (name) {
      case 'boolq_validation':
        return (record) => this._loadBoolQLine(record);
      case 'spam_ham_dataset':
        return (record) => this._loadSpamHamLine(record);
      case 'imdb_dataset':
        return (record) => this._loadIMDBLine(record);
      case 'ag_news_test':
        return (record) => this._loadAGNewsLine(record);
      default:
        throw new Error(`DatasetLoader: Unsupported dataset name '${name}'`);
    }
  }

  /**
   * Splits CSV text into raw records on every '\n' that is outside a quoted
   * field. Quotes and escapes are left in place for `_parseCSVLine`.
   *
   * Quote state is tracked by parity: an escaped quote (`""`) toggles the
   * state twice, so it matches the state machine in `_parseCSVLine`. An
   * unbalanced quote therefore absorbs the rest of the text into one record.
   *
   * @param {string} text - CSV text using '\n' line endings.
   * @returns {string[]} Raw records, including empty ones.
   */
  _splitCSVRecords(text) {
    const records = [];
    let start = 0;
    let inQuotes = false;

    for (let i = 0; i < text.length; i++) {
      const ch = text[i];
      if (ch === '"') {
        inQuotes = !inQuotes;
      } else if (ch === '\n' && !inQuotes) {
        records.push(text.slice(start, i));
        start = i + 1;
      }
    }
    records.push(text.slice(start));
    return records;
  }

  /**
   * Builds the true/false question prompt for one BoolQ record.
   * Columns are read in the order: id, question, answer, context.
   *
   * @param {string} line - Raw CSV record.
   * @returns {{id: string, full_prompt: string, answer: string}}
   */
  _loadBoolQLine(line) {
    const [id, question, answer, context] = this._parseCSVLine(line);

    const full_prompt = [
      `Question: ${question}`,
      `Context: ${context}`,
      'Instructions: Answer with ONLY the word "true" or "false". Do not provide any explanation or additional text.',
      'Answer:',
    ].join('\n');

    return { id, full_prompt, answer };
  }

  /**
   * Builds the spam-detection prompt for one spam/ham record.
   * Columns are read in the order: id, text, answer. The label is mapped to
   * 'true' when it equals 'spam' (case-insensitive) and to 'false' otherwise.
   *
   * @param {string} line - Raw CSV record.
   * @returns {{id: string, full_prompt: string, answer: string}}
   */
  _loadSpamHamLine(line) {
    const [id, text, label] = this._parseCSVLine(line);

    const answer = label.toLowerCase() === 'spam' ? 'true' : 'false';

    const full_prompt = [
      'Task: Determine whether the following message is spam or not.',
      'Instructions: Answer with ONLY the word "true" or "false". Do not provide any explanation or additional text.',
      `Message: ${text}`,
      'Answer:',
    ].join('\n');

    return { id, full_prompt, answer };
  }

  /**
   * Builds the sentiment prompt for one IMDB record.
   * Columns are read in the order: id, review, answer.
   *
   * @param {string} line - Raw CSV record.
   * @returns {{id: string, full_prompt: string, answer: string}}
   */
  _loadIMDBLine(line) {
    const [id, review, answer] = this._parseCSVLine(line);

    const full_prompt = [
      'Task: Determine whether the sentiment of the following review is positive or negative.',
      'Instructions: Answer with ONLY the word "positive" or "negative". Do not provide any explanation or additional text.',
      `Review: ${review}`,
      'Sentiment:',
    ].join('\n');

    return { id, full_prompt, answer };
  }

  /**
   * Builds the topic-classification prompt for one AG News record.
   * Columns are read in the order: id, answer, title, description.
   *
   * @param {string} line - Raw CSV record.
   * @returns {{id: string, full_prompt: string, answer: string}}
   */
  _loadAGNewsLine(line) {
    const [id, answer, title, description] = this._parseCSVLine(line);

    // The trailing empty entry keeps the prompt ending with a newline.
    const full_prompt = [
      'Task: Determine whether the following news article belong to world, sports, business or Sci/Tech category.',
      'Categories: World (1), Sports (2), Business (3), Sci/Tech (4).',
      'Instructions: Answer with ONLY the id (1,2,3 or 4) of the class. Do not provide any explanation or additional text.',
      `News Title: ${title}`,
      `News Description: ${description}`,
      '',
    ].join('\n');

    return { id, full_prompt, answer };
  }

  /**
   * Splits one CSV record into its fields.
   *
   * Commas separate fields unless they appear inside double quotes; inside a
   * quoted section `""` produces a literal `"`. The surrounding quotes are
   * not part of the returned values.
   *
   * @param {string} line - Raw CSV record.
   * @returns {string[]} Field values; always at least one entry.
   */
  _parseCSVLine(line) {
    const fields = [];
    let cur = '';
    let inQuotes = false;

    for (let i = 0; i < line.length; i++) {
      const ch = line[i];

      if (inQuotes) {
        if (ch !== '"') {
          cur += ch;
        } else if (line[i + 1] === '"') {
          // Escaped quote: emit one quote and skip its partner.
          cur += '"';
          i++;
        } else {
          inQuotes = false;
        }
      } else if (ch === ',') {
        fields.push(cur);
        cur = '';
      } else if (ch === '"') {
        inQuotes = true;
      } else {
        cur += ch;
      }
    }

    fields.push(cur);
    return fields;
  }
}
|
|
|
|
|
|