Spaces:

fhueni
/

on-device-vs-cloud-llm-inference

Running

File size: 7,399 Bytes


/**
 * DatasetLoader fetches a CSV dataset and converts every record into an item of
 * the shape `{id, prompt, groundTruth}`.
 *
 * Loading is started by the constructor and runs asynchronously: `_dataset` is
 * `null` until the load has finished, and stays `null` if the load fails.
 */
export class DatasetLoader {
    /**
     * @param {string} datasetName - Name of the csv dataset to load without file extension
     */
    constructor(datasetName = 'boolq_validation') {
        this.running = false;
        this._dataset = null;
        this._datasetName = datasetName;

        // Fire-and-forget: loadDataset() logs failures itself instead of rejecting.
        this.loadDataset(this._datasetName);
    }

    /**
     * Load the dataset from CSV file based on the given name.
     *
     * The file is split into records with quote awareness: neither a comma nor a
     * line break inside a quoted field is interpreted as a delimiter. The first
     * record (header) is dropped and blank records are skipped.
     *
     * @param {string} name - Name of the csv dataset to load without file extension
     * @returns {Promise<Array<{id: *, prompt: string, groundTruth: *}>|undefined>}
     *          Resolves with the loaded items (also stored in `_dataset`). On any
     *          failure (file not found, unsupported name, malformed row) the error
     *          is logged with `console.error` and the promise resolves with `undefined`.
     * @private
     */
    loadDataset(name) {
        const path = `./dataset/${name}.csv`;

        return fetch(path)
            .then(response => {
                if (!response.ok) {
                    throw new Error(`Dataset file not found: ${path}`);
                }
                return response.text();
            })
            .then(data => {
                // resolve the row loader once instead of once per record
                const loadLine = this._getLineLoader(name);

                const records = this._splitCSVRecords(data);
                // drop header
                records.shift();

                this._dataset = records
                    .filter(record => record.trim().length > 0)
                    .map(record => {
                        const {id, full_prompt, answer} = loadLine(record);
                        return {id, prompt: full_prompt, groundTruth: answer};
                    });

                console.log(`✅ Dataset '${name}' loaded with ${this._dataset.length} items.`);
                console.log(this._dataset.slice(0, 2)); // log first 2 items for verification
                return this._dataset;
            })
            .catch(error => {
                // Best-effort load: report the problem and let the promise resolve with undefined.
                console.error(error);
            });
    }


    /**
     * Select the row loader that belongs to the given dataset name
     *
     * @param {string} name - Name of the csv dataset without file extension
     * @returns {function(string): {id: *, full_prompt: string, answer: *}}
     * @throws {Error} If the dataset name is not supported
     * @private
     */
    _getLineLoader(name) {
        switch (name) {
            case 'boolq_validation':
                return line => this._loadBoolQLine(line);
            case 'spam_ham_dataset':
                return line => this._loadSpamHamLine(line);
            case 'imdb_dataset':
                return line => this._loadIMDBLine(line);
            case 'ag_news_test':
                return line => this._loadAGNewsLine(line);
            default:
                throw new Error(`DatasetLoader: Unsupported dataset name '${name}'`);
        }
    }


    /**
     * Split raw CSV text into records. Line endings (\r\n and \r) are normalized
     * to \n first; a \n only ends a record when it is outside of a quoted field.
     * The records are returned unparsed (quotes are kept) so that they can be
     * passed to `_parseCSVLine`.
     *
     * Note: an unbalanced quote makes everything after it part of the same record.
     *
     * @param {string} data - Complete content of a CSV file
     * @returns {string[]} One raw string per record; a trailing line break yields a final empty string
     * @private
     */
    _splitCSVRecords(data) {
        const text = data.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
        const records = [];
        let start = 0;
        let inQuotes = false;

        for (let i = 0; i < text.length; i++) {
            const ch = text[i];
            if (ch === '"') {
                // an escaped quote "" toggles twice, so the quoted state is unchanged
                inQuotes = !inQuotes;
            } else if (ch === '\n' && !inQuotes) {
                records.push(text.slice(start, i));
                start = i + 1;
            }
        }
        records.push(text.slice(start));
        return records;
    }


    /**
     * Load a single line from the BoolQ dataset and prepare the prompt
     *
     * @param line - A single record from the BoolQ CSV dataset (id, question, answer, context)
     * @returns {{full_prompt: string, answer: *, id: *}}
     * @private
     */
    _loadBoolQLine(line) {
        // parse line into fields handling quoted commas
        const [id, question, answer, context] = this._parseCSVLine(line);

        // set the prompt
        const full_prompt = `Question: ${question}
                                        Context: ${context}
                                        Instructions: Answer with ONLY the word "true" or "false". Do not provide any explanation or additional text.
                                        Answer:`;

        return {id, full_prompt, answer};
    }


    /**
     * Load a single line from the SpamHam dataset and prepare the prompt
     *
     * @param line - A single record from the SpamHam CSV dataset (id, text, answer)
     * @returns {{full_prompt: string, answer: (string), id: *}} answer is 'true' for spam, otherwise 'false'
     * @private
     */
    _loadSpamHamLine(line) {
        let [id, text, answer] = this._parseCSVLine(line);

        // convert answer to string boolean
        answer = (answer.toLowerCase() === 'spam') ? 'true' : 'false';

        // set the prompt
        const full_prompt = `Task: Determine whether the following message is spam or not.
                                        Instructions: Answer with ONLY the word "true" or "false". Do not provide any explanation or additional text.
                                        Message: ${text}
                                        Answer:`;

        return {id, full_prompt, answer};
    }


    /**
     * Load a single line from the IMDB dataset and prepare the prompt
     *
     * @param line - A single record from the IMDB CSV dataset (id, review, answer)
     * @returns {{full_prompt: string, answer: *, id: *}}
     * @private
     */
    _loadIMDBLine(line) {
        const [id, review, answer] = this._parseCSVLine(line);

        // set the prompt
        const full_prompt = `Task: Determine whether the sentiment of the following review is positive or negative.
                                        Instructions: Answer with ONLY the word "positive" or "negative". Do not provide any explanation or additional text.
                                        Review: ${review}
                                        Sentiment:`;

        return {id, full_prompt, answer};
    }


    /**
     * Load a single line from the AG News dataset and prepare the prompt
     *
     * @param line - A single record from the AG News CSV dataset (id, answer, title, description)
     * @returns {{full_prompt: string, answer: *, id: *}}
     * @private
     */
    _loadAGNewsLine(line) {
        const [id, answer, title, description] = this._parseCSVLine(line);

        // set the prompt
        const full_prompt = `Task: Determine whether the following news article belong to world, sports, business or Sci/Tech category.
                                        Categories: World (1), Sports (2), Business (3), Sci/Tech (4).
                                        Instructions: Answer with ONLY the id (1,2,3 or 4) of the class. Do not provide any explanation or additional text.
                                        News Title: ${title}
                                        News Description: ${description}
                                        `;

        return {id, full_prompt, answer};
    }


    /**
     * Parse a single CSV record into fields, handling quoted fields with commas.
     * Inside a quoted field every character (including commas and line breaks) is
     * kept, and a doubled quote "" is read as one literal quote.
     *
     * @param {string} line - A single record from a CSV file
     * @returns {string[]} The field values with the surrounding quotes removed
     * @private
     */
    _parseCSVLine(line) {

        // inline CSV parse with quotes support
        const fields = [];
        let cur = '';
        let inQuotes = false;

        for (let i = 0; i < line.length; i++) {
            const ch = line[i];
            if (inQuotes) { // if we are in a quote we just look for the quote ending
                if (ch === '"') {
                    // escaped quote ""
                    if (i + 1 < line.length && line[i + 1] === '"') {
                        cur += '"';
                        i++;
                    } else {
                        inQuotes = false;
                    }
                } else {
                    cur += ch;
                }
            } else {   // only if we are not in a quote we count the comma as a delimiter
                if (ch === ',') {
                    fields.push(cur);
                    cur = '';
                } else if (ch === '"') {
                    inQuotes = true;
                } else {
                    cur += ch;
                }
            }
        }
        fields.push(cur);
        return fields;
    }
}