Spaces:

Nitishkumar-ai
/

arena-learning

No application file

arena-learning / studyArena /lib /export /html-parser /lexer.ts

Nitish kumar

Upload folder using huggingface_hub

c20f20c verified 18 days ago

6.38 kB

	import type { Token } from './types';
	import { childlessTags } from './tags';

	/** Mutable cursor state shared by the `lex*` functions in this file. */
	interface State {
	// The complete input string being tokenized.
	str: string;
	// Index into `str` of the next character to be consumed.
	position: number;
	// Tokens emitted so far, in emission order; the lex functions push onto it.
	tokens: Token[];
	}

	/**
	 * Places the cursor at the absolute index `end`.
	 * Implemented as a relative move so the update goes through `movePositopn`.
	 */
	const jumpPosition = (state: State, end: number) => {
	  movePositopn(state, end - state.position);
	};

	/** Shifts the cursor by `len` characters relative to its current position. */
	const movePositopn = (state: State, len: number) => {
	  state.position += len;
	};

	/**
	 * Finds where the text run starting at `index` ends, i.e. the index of the
	 * next `<` that looks like the start of markup.
	 *
	 * A `<` counts as markup only when it is immediately followed by `/`, `!` or
	 * an ASCII letter/digit; any other `<` (for example in `a < b`, or a `<` that
	 * is the last character of the input) is treated as ordinary text.
	 *
	 * @param str   The full input being lexed.
	 * @param index Offset in `str` at which to start searching.
	 * @returns The index of the markup-opening `<`, or -1 when there is none.
	 */
	const findTextEnd = (str: string, index: number): number => {
	  // Non-global regex: `test` keeps no state between calls, so it is safe to reuse.
	  const nameStart = /[A-Za-z0-9]/;
	  let from = index;
	  for (;;) {
	    const textEnd = str.indexOf('<', from);
	    if (textEnd === -1) return -1;

	    const char = str.charAt(textEnd + 1);
	    if (char === '/' || char === '!' || nameStart.test(char)) {
	      return textEnd;
	    }
	    // A lone `<` is plain text; keep scanning right after it.
	    from = textEnd + 1;
	  }
	};

	/**
	 * Consumes the run of plain text at the cursor and emits it as a `text` token.
	 *
	 * Nothing is emitted (and the cursor does not move) when markup starts right
	 * at the cursor. When no further markup exists, the rest of the input is
	 * taken as text.
	 */
	const lexText = (state: State) => {
	  const source = state.str;
	  const begin = state.position;
	  const found = findTextEnd(source, begin);
	  if (found === begin) return;

	  // -1 means "no more markup": the text extends to the end of the input.
	  const stop = found === -1 ? source.length : found;
	  const content = source.slice(begin, stop);
	  jumpPosition(state, stop);
	  state.tokens.push({ type: 'text', content });
	};

	/**
	 * Consumes a comment whose opening `<!--` starts at the cursor and emits a
	 * `comment` token holding the text between the delimiters.
	 *
	 * An unterminated comment swallows the remainder of the input.
	 */
	const lexComment = (state: State) => {
	  const source = state.str;

	  // Step over the four characters of the opening `<!--`.
	  movePositopn(state, 4);
	  const bodyStart = state.position;
	  const closer = source.indexOf('-->', bodyStart);
	  const terminated = closer !== -1;
	  const bodyEnd = terminated ? closer : source.length;
	  // Resume after the three characters of `-->` when it exists.
	  const resumeAt = terminated ? closer + 3 : source.length;

	  const content = source.slice(bodyStart, bodyEnd);
	  jumpPosition(state, resumeAt);
	  state.tokens.push({ type: 'comment', content });
	};

	/**
	 * Reads the tag name at the cursor and emits it as a `tag` token.
	 *
	 * Any whitespace, `/` or `>` in front of the name is skipped first; the name
	 * then runs up to the next whitespace, `/` or `>` (or the end of the input).
	 * The cursor is left on the first character after the name.
	 *
	 * @returns The tag name exactly as written (case is preserved); the empty
	 *          string when the input ends before a name starts.
	 */
	const lexTagName = (state: State) => {
	  const { str } = state;
	  const len = str.length;
	  // Whitespace, `/` and `>` can never be part of a tag name.
	  const isTagChar = (char: string) => !(/\s/.test(char) || char === '/' || char === '>');

	  let start = state.position;
	  while (start < len && !isTagChar(str.charAt(start))) start++;

	  // The first name character is already known to be valid, so scanning resumes
	  // one past it. When the input ran out before any name, stay put instead of
	  // stepping the cursor beyond the end of `str`.
	  let end = start < len ? start + 1 : start;
	  while (end < len && isTagChar(str.charAt(end))) end++;

	  jumpPosition(state, end);
	  const tagName = str.slice(start, end);
	  state.tokens.push({
	    type: 'tag',
	    content: tagName,
	  });
	  return tagName;
	};

	/**
	 * Consumes the attribute section of a tag and emits one `attribute` token per
	 * attribute.
	 *
	 * Phase 1 splits the input into whitespace-separated words, starting at the
	 * cursor and stopping at the first unquoted `/` or `>` (or the end of input).
	 * Whitespace, `/` and `>` inside single or double quotes do not split or stop.
	 * The cursor is left on the character that stopped the scan.
	 *
	 * Phase 2 re-joins attributes that were written with spaces around `=`:
	 *   - `name =value` and `name = value` become `name=value`
	 *   - `name= value` becomes `name=value`
	 *   - a trailing `name=` with no usable value becomes `name`
	 * Quotes are kept verbatim in the token content.
	 */
	const lexTagAttributes = (state: State) => {
	  const { str, tokens } = state;
	  const len = str.length;
	  let cursor = state.position;
	  // The quote character currently open, or null when outside quotes.
	  let quote: string | null = null;
	  let wordBegin = cursor;
	  const words: string[] = [];

	  while (cursor < len) {
	    const char = str.charAt(cursor);
	    if (quote) {
	      // Inside quotes everything is literal until the matching quote.
	      if (char === quote) quote = null;
	      cursor++;
	      continue;
	    }

	    const isTagEnd = char === '/' || char === '>';
	    if (isTagEnd) {
	      if (cursor !== wordBegin) words.push(str.slice(wordBegin, cursor));
	      break;
	    }

	    const isWordEnd = /\s/.test(char);
	    if (isWordEnd) {
	      if (cursor !== wordBegin) words.push(str.slice(wordBegin, cursor));
	      wordBegin = cursor + 1;
	      cursor++;
	      continue;
	    }

	    const isQuoteStart = char === "'" || char === '"';
	    if (isQuoteStart) quote = char;
	    cursor++;
	  }
	  jumpPosition(state, cursor);

	  const type = 'attribute';
	  for (let i = 0; i < words.length; i++) {
	    const word = words[i];

	    const isNotPair = !word.includes('=');
	    if (isNotPair) {
	      const secondWord = words[i + 1];
	      if (secondWord && secondWord.startsWith('=')) {
	        if (secondWord.length > 1) {
	          // `name` `=value`
	          tokens.push({ type, content: word + secondWord });
	          i += 1;
	          continue;
	        }
	        // `name` `=` [`value`]: the lone `=` is consumed either way.
	        const thirdWord = words[i + 2];
	        i += 1;
	        if (thirdWord) {
	          tokens.push({ type, content: word + '=' + thirdWord });
	          i += 1;
	          continue;
	        }
	        // No value followed: fall through and emit the bare name below.
	      }
	    }
	    if (word.endsWith('=')) {
	      const secondWord = words[i + 1];
	      if (secondWord && !secondWord.includes('=')) {
	        // `name=` `value`
	        tokens.push({ type, content: word + secondWord });
	        i += 1;
	        continue;
	      }

	      // Dangling `name=`: drop the `=`.
	      tokens.push({ type, content: word.slice(0, -1) });
	      continue;
	    }

	    tokens.push({ type, content: word });
	  }
	};

	/**
	 * Consumes the raw content of an element whose children must not be lexed,
	 * up to and including its matching closing tag.
	 *
	 * Each `</` candidate is lexed into a scratch state; only when its tag name
	 * matches `tagName` (case-insensitively) are the preceding text and the
	 * closing-tag tokens committed to `state`. If no `</` remains at all, the
	 * rest is handed to `lexText`.
	 */
	const lexSkipTag = (tagName: string, state: State) => {
	  const source = state.str;
	  const wanted = tagName.toLowerCase();
	  const total = source.length;

	  for (let searchFrom = state.position; searchFrom < total; ) {
	    const closeAt = source.indexOf('</', searchFrom);
	    if (closeAt === -1) {
	      lexText(state);
	      return;
	    }

	    // Lex the candidate closing tag without touching the real state yet.
	    const probe = {
	      str: source,
	      position: state.position,
	      tokens: [],
	    };
	    jumpPosition(probe, closeAt);
	    const closingName = lexTag(probe);
	    if (closingName.toLowerCase() !== wanted) {
	      // Some other closing tag inside the raw content: look past it.
	      searchFrom = probe.position;
	      continue;
	    }

	    if (closeAt !== state.position) {
	      const rawStart = state.position;
	      jumpPosition(state, closeAt);
	      state.tokens.push({
	        type: 'text',
	        content: source.slice(rawStart, closeAt),
	      });
	    }

	    state.tokens.push(...probe.tokens);
	    jumpPosition(state, probe.position);
	    return;
	  }
	};

	/**
	 * Consumes one complete tag starting at the `<` under the cursor and emits,
	 * in order: a `tag-start` token, the `tag` name token, any `attribute`
	 * tokens, and a `tag-end` token.
	 *
	 * @returns The tag name as reported by `lexTagName`.
	 */
	const lexTag = (state: State) => {
	  const source = state.str;

	  // `</` is two characters to step over, a bare `<` is one.
	  const opensAsClose = source.charAt(state.position + 1) === '/';
	  movePositopn(state, opensAsClose ? 2 : 1);
	  state.tokens.push({ type: 'tag-start', close: opensAsClose });

	  const tagName = lexTagName(state);
	  lexTagAttributes(state);

	  // `/>` is two characters to step over, a bare `>` is one.
	  const endsWithSlash = source.charAt(state.position) === '/';
	  movePositopn(state, endsWithSlash ? 2 : 1);
	  state.tokens.push({ type: 'tag-end', close: endsWithSlash });

	  return tagName;
	};

	/**
	 * Main loop: repeatedly lexes text, comments and tags until the cursor
	 * reaches the end of the input. After a tag whose lower-cased name is listed
	 * in `childlessTags`, the following content is consumed by `lexSkipTag`.
	 */
	const lex = (state: State) => {
	  const { str: source } = state;
	  const total = source.length;

	  while (state.position < total) {
	    const before = state.position;
	    lexText(state);
	    // The cursor moved, so text was consumed; go around again.
	    if (state.position !== before) continue;

	    // No text here, so the cursor sits on a `<`: comment or tag.
	    if (source.startsWith('!--', before + 1)) {
	      lexComment(state);
	      continue;
	    }

	    const tagName = lexTag(state);
	    if (childlessTags.includes(tagName.toLowerCase())) {
	      lexSkipTag(tagName, state);
	    }
	  }
	};

	/**
	 * Tokenizes an HTML string.
	 *
	 * @param str The markup to lex.
	 * @returns The tokens produced by `lex`, in source order.
	 */
	export const lexer = (str: string): Token[] => {
	  const tokens: Token[] = [];
	  // `lex` only ever pushes onto this array, so it can be returned directly.
	  const state = { str, position: 0, tokens };
	  lex(state);
	  return tokens;
	};