// Nitish kumar
// Upload folder using huggingface_hub
// c20f20c verified
import type { Token } from './types';
import { childlessTags } from './tags';
/**
 * Mutable cursor shared by every lexing routine in this file.
 */
interface State {
// The complete input string being tokenized.
str: string;
// Index into `str` of the next character that has not been consumed yet.
position: number;
// Tokens produced so far, in the order they were emitted.
tokens: Token[];
}
/**
 * Moves the cursor to the absolute index `end`.
 *
 * Implemented as a relative move so that all cursor updates go through
 * `movePositopn`.
 *
 * @param state Lexer state whose `position` is updated in place.
 * @param end   Absolute index in `state.str` the cursor should land on.
 */
const jumpPosition = (state: State, end: number) => {
  movePositopn(state, end - state.position);
};
/**
 * Shifts the cursor by `len` characters relative to its current position.
 *
 * NOTE: the identifier is a misspelling of "movePosition"; it is kept as-is
 * because other functions in this file call it by this name.
 *
 * @param state Lexer state whose `position` is updated in place.
 * @param len   Number of characters to advance (may be zero or negative).
 */
const movePositopn = (state: State, len: number) => {
  state.position += len;
};
// Characters that, directly after '<', mark the start of a tag name.
// Hoisted so the pattern is not rebuilt on every loop iteration.
const tagNameStartPattern = /[A-Za-z0-9]/;

/**
 * Finds where a run of plain text ends, i.e. the index of the next `<` that
 * opens markup.
 *
 * A `<` only counts as markup when the character right after it is `/`
 * (closing tag), `!` (comment / declaration) or an ASCII letter or digit
 * (opening tag). Any other `<` is treated as literal text and skipped.
 *
 * @param str   Input being lexed.
 * @param index Index at which to start searching.
 * @returns Index of the markup-opening `<`, or `-1` when none remains.
 */
const findTextEnd = (str: string, index: number) => {
  let searchFrom = index;
  while (true) {
    const candidate = str.indexOf('<', searchFrom);
    if (candidate === -1) {
      return -1;
    }
    const follower = str.charAt(candidate + 1);
    if (follower === '/' || follower === '!' || tagNameStartPattern.test(follower)) {
      return candidate;
    }
    // Literal '<' inside text: keep looking after it.
    searchFrom = candidate + 1;
  }
};
/**
 * Consumes a run of plain text starting at the cursor and emits it as a
 * `text` token.
 *
 * When markup begins exactly at the cursor nothing is consumed and no token
 * is emitted, which lets the caller detect that a tag or comment follows.
 * When no further markup exists, the text extends to the end of the input.
 *
 * @param state Lexer state; `position` and `tokens` are updated in place.
 */
const lexText = (state: State) => {
  const source = state.str;
  const begin = state.position;
  const boundary = findTextEnd(source, begin);
  if (boundary === begin) {
    return;
  }
  // -1 means "no more markup": the text runs to the end of the input.
  const stop = boundary === -1 ? source.length : boundary;
  const content = source.slice(begin, stop);
  jumpPosition(state, stop);
  state.tokens.push({ type: 'text', content });
};
/**
 * Consumes a comment and emits its body as a `comment` token.
 *
 * The first four characters at the cursor are skipped unconditionally (the
 * caller is expected to have found `<!--` there). The body runs up to the
 * next `-->`; an unterminated comment swallows the rest of the input.
 *
 * @param state Lexer state; `position` and `tokens` are updated in place.
 */
const lexComment = (state: State) => {
  const source = state.str;
  movePositopn(state, 4);
  const bodyStart = state.position;
  const terminator = source.indexOf('-->', bodyStart);
  const unterminated = terminator === -1;
  const bodyEnd = unterminated ? source.length : terminator;
  // Resume after the three-character terminator when there is one.
  const resumeAt = unterminated ? source.length : terminator + 3;
  const content = source.slice(bodyStart, bodyEnd);
  jumpPosition(state, resumeAt);
  state.tokens.push({ type: 'comment', content });
};
/**
 * Reads a tag name at the cursor, emits it as a `tag` token and returns it.
 *
 * Leading whitespace, `/` and `>` characters are skipped first; the name then
 * extends until the next whitespace, `/`, `>` or the end of the input.
 *
 * @param state Lexer state; `position` and `tokens` are updated in place.
 * @returns The tag name exactly as written in the input (case preserved).
 */
const lexTagName = (state: State) => {
  const { str } = state;
  const limit = str.length;
  const isNameChar = (ch: string) => ch !== '/' && ch !== '>' && !/\s/.test(ch);

  // Skip anything that cannot be part of a name.
  let nameStart = state.position;
  while (nameStart < limit && !isNameChar(str.charAt(nameStart))) {
    nameStart++;
  }

  // The character at nameStart is already known to belong to the name.
  let nameEnd = nameStart + 1;
  while (nameEnd < limit && isNameChar(str.charAt(nameEnd))) {
    nameEnd++;
  }

  jumpPosition(state, nameEnd);
  const tagName = str.slice(nameStart, nameEnd);
  state.tokens.push({ type: 'tag', content: tagName });
  return tagName;
};
/**
 * Reads the attribute section of a tag and emits one `attribute` token per
 * attribute.
 *
 * Phase 1 splits the input into whitespace-separated words, treating
 * whitespace inside single or double quotes as part of the word. Scanning
 * stops at the first unquoted `/` or `>`, where the cursor is left. A word
 * still in progress when the input runs out is not emitted.
 *
 * Phase 2 re-joins words that were split around `=`:
 *   - `name`, `=value`      -> `name=value`
 *   - `name`, `=`, `value`  -> `name=value`
 *   - `name=`, `value`      -> `name=value`
 *   - `name=` with no value -> `name`
 *
 * @param state Lexer state; `position` and `tokens` are updated in place.
 */
const lexTagAttributes = (state: State) => {
  const { str, tokens } = state;
  const limit = str.length;
  const words: string[] = [];

  // ---- Phase 1: split into words ----
  let cursor = state.position;
  let wordBegin = cursor;
  let openQuote: string | null = null;
  const flushWord = () => {
    if (cursor !== wordBegin) words.push(str.slice(wordBegin, cursor));
  };
  for (; cursor < limit; cursor++) {
    const ch = str.charAt(cursor);
    if (openQuote) {
      // Inside a quoted value only the matching quote is significant.
      if (ch === openQuote) openQuote = null;
      continue;
    }
    if (ch === '/' || ch === '>') {
      flushWord();
      break; // leave the cursor on the tag terminator
    }
    if (/\s/.test(ch)) {
      flushWord();
      wordBegin = cursor + 1;
      continue;
    }
    if (ch === "'" || ch === '"') {
      openQuote = ch;
    }
  }
  jumpPosition(state, cursor);

  // ---- Phase 2: merge words split around '=' ----
  const emit = (content: string) => {
    tokens.push({ type: 'attribute', content });
  };
  let i = 0;
  while (i < words.length) {
    const word = words[i];
    const next = words[i + 1];

    if (!word.includes('=')) {
      if (next && next.startsWith('=')) {
        if (next.length > 1) {
          // `name` `=value`
          emit(word + next);
          i += 2;
          continue;
        }
        const afterEquals = words[i + 2];
        if (afterEquals) {
          // `name` `=` `value`
          emit(word + '=' + afterEquals);
          i += 3;
          continue;
        }
        // `name` followed by a dangling `=`: keep the name, drop the `=`.
        emit(word);
        i += 2;
        continue;
      }
      emit(word);
      i += 1;
      continue;
    }

    if (word.endsWith('=')) {
      if (next && !next.includes('=')) {
        // `name=` `value`
        emit(word + next);
        i += 2;
        continue;
      }
      // `name=` without a usable value: strip the trailing `=`.
      emit(word.slice(0, -1));
      i += 1;
      continue;
    }

    emit(word);
    i += 1;
  }
};
/**
 * Lexes the body of an element whose content must not be parsed as markup.
 *
 * Searches forward for a closing tag whose name matches `tagName`
 * (case-insensitively). Everything between the cursor and that closing tag is
 * emitted as a single `text` token, followed by the closing tag's own tokens.
 * Closing tags with a different name are skipped over. If no `</` remains at
 * all, `lexText` is run once from the cursor instead.
 *
 * @param tagName Name of the element that was just opened.
 * @param state   Lexer state; `position` and `tokens` are updated in place.
 */
const lexSkipTag = (tagName: string, state: State) => {
  const { str, tokens } = state;
  const wanted = tagName.toLowerCase();
  let searchFrom = state.position;

  while (searchFrom < str.length) {
    const closeAt = str.indexOf('</', searchFrom);
    if (closeAt === -1) {
      lexText(state);
      return;
    }

    // Lex the candidate closing tag into a scratch state so nothing is
    // committed to the real token list unless the name matches.
    const probe: State = { str, position: closeAt, tokens: [] };
    const closingName = lexTag(probe);

    if (closingName.toLowerCase() === wanted) {
      if (closeAt !== state.position) {
        const rawStart = state.position;
        jumpPosition(state, closeAt);
        tokens.push({
          type: 'text',
          content: str.slice(rawStart, closeAt),
        });
      }
      tokens.push(...probe.tokens);
      jumpPosition(state, probe.position);
      return;
    }

    // Some other closing tag: resume the search after it.
    searchFrom = probe.position;
  }
};
/**
 * Lexes one complete tag starting at the cursor.
 *
 * Emits, in order: a `tag-start` token (with `close: true` when the tag opens
 * with `</`), the `tag` name token, any `attribute` tokens, and a `tag-end`
 * token (with `close: true` when the tag ends with `/`).
 *
 * @param state Lexer state; `position` and `tokens` are updated in place.
 * @returns The tag name as returned by `lexTagName`.
 */
const lexTag = (state: State) => {
  const { str } = state;

  // '<' alone is one character, '</' is two.
  const isClosingTag = str.charAt(state.position + 1) === '/';
  movePositopn(state, isClosingTag ? 2 : 1);
  state.tokens.push({ type: 'tag-start', close: isClosingTag });

  const tagName = lexTagName(state);
  lexTagAttributes(state);

  // '>' alone is one character, '/>' is two.
  const isSelfClosing = str.charAt(state.position) === '/';
  movePositopn(state, isSelfClosing ? 2 : 1);
  state.tokens.push({ type: 'tag-end', close: isSelfClosing });

  return tagName;
};
/**
 * Main lexing loop: alternates between text and markup until the whole input
 * has been consumed.
 *
 * Each iteration first tries to read text. If the cursor did not move, markup
 * starts at the cursor: either a comment (`<!--`) or a tag. After a tag whose
 * lowercased name is listed in `childlessTags`, the element body is handed to
 * `lexSkipTag`.
 *
 * @param state Lexer state; `position` and `tokens` are updated in place.
 */
const lex = (state: State) => {
  const { str } = state;
  const total = str.length;

  while (state.position < total) {
    const before = state.position;
    lexText(state);
    if (state.position !== before) {
      continue; // text was consumed; go round again
    }

    if (str.startsWith('!--', before + 1)) {
      lexComment(state);
      continue;
    }

    const tagName = lexTag(state);
    if (childlessTags.includes(tagName.toLowerCase())) {
      lexSkipTag(tagName, state);
    }
  }
};
/**
 * Tokenizes a markup string.
 *
 * @param str Input to tokenize.
 * @returns The tokens in the order they appear in the input.
 */
export const lexer = (str: string): Token[] => {
  const state: State = { str, position: 0, tokens: [] };
  lex(state);
  return state.tokens;
};