Spaces:
Running
Running
/**
 * Arabic Fuzzy Search Utility
 *
 * Handles common Arabic character confusions to improve search results,
 * similar to how Amazon and other e-commerce platforms handle Arabic text search.
 *
 * Example: searching for "سنيور" will also match "شنيور".
 */
// Sets of Arabic letters that are commonly typed in place of one another.
// Each letter appears in at most one set.
const ARABIC_SIMILAR_GROUPS = [
  // Alef and its hamza / madda / wasla forms
  ['ا', 'أ', 'إ', 'آ', 'ٱ'],
  // Seen, Sheen
  ['س', 'ش'],
  // Saad, Daad
  ['ص', 'ض'],
  // Taa (emphatic), Dhaa
  ['ط', 'ظ'],
  // Haa, Taa Marbouta
  ['ه', 'ة'],
  // Yaa, Alef Maqsoura, Yaa with Hamza
  ['ي', 'ى', 'ئ'],
  // Waw, Waw with Hamza
  ['و', 'ؤ'],
  // Daal, Thaal
  ['د', 'ذ'],
  // Taa, Thaa
  ['ت', 'ث'],
  // Kaaf, Qaaf
  ['ك', 'ق'],
  // Zay is deliberately not paired with Thaal: Thaal already belongs to the
  // Daal / Thaal set above.
  // Haa, Khaa, Jeem
  ['ح', 'خ', 'ج'],
  // Ain, Ghain
  ['ع', 'غ'],
];

// Lookup table: letter -> regex character class listing every letter of its set
// (for example 'س' -> '[سش]').
const CHAR_TO_GROUP = {};
for (const letters of ARABIC_SIMILAR_GROUPS) {
  const alternatives = `[${letters.join('')}]`;
  for (const letter of letters) {
    CHAR_TO_GROUP[letter] = alternatives;
  }
}
/**
 * Replace Arabic-script digits with their ASCII equivalents.
 *
 * Both digit sets of the Arabic Unicode block are handled:
 * Arabic-Indic digits (U+0660–U+0669, "٠".."٩") and Extended Arabic-Indic
 * digits (U+06F0–U+06F9, used for Persian and Urdu). All other characters
 * are returned unchanged.
 *
 * @param {string} text - Text that may contain Arabic-script digits
 * @returns {string} The text with those digits replaced by "0".."9";
 *   an empty string when `text` is empty, null or undefined
 *
 * @example
 * convertArabicNumerals('مقاس ٤٥')
 * // Returns: 'مقاس 45'
 */
const convertArabicNumerals = (text) => {
  if (!text) return '';
  return text.replace(/[\u0660-\u0669\u06F0-\u06F9]/g, (digit) => {
    const codeUnit = digit.charCodeAt(0);
    // Each digit set is a contiguous run of ten code points starting at its zero.
    const zero = codeUnit >= 0x06f0 ? 0x06f0 : 0x0660;
    return String(codeUnit - zero);
  });
};
/**
 * Normalize Arabic text so that spelling variants compare equal.
 *
 * Steps, in order:
 *  - remove tatweel / kashida (U+0640) and diacritics (U+064B–U+065F, U+0670)
 *  - fold Alef variants (أ إ آ ٱ) to bare Alef (ا)
 *  - fold Alef Maqsoura and Yaa with Hamza (ى ئ) to Yaa (ي)
 *  - fold Waw with Hamza (ؤ) to Waw (و)
 *  - fold Taa Marbouta (ة) to Haa (ه)
 *  - collapse whitespace runs to a single space and trim both ends
 *
 * @param {string} text - Raw text
 * @returns {string} The normalized text; an empty string when `text` is
 *   empty, null or undefined
 */
const normalizeArabic = (text) => {
  if (!text) return '';
  return text
    // Tatweel only stretches a word visually, so it is dropped together with
    // the diacritics; otherwise "كـتـاب" would never equal "كتاب".
    .replace(/[\u0640\u064B-\u065F\u0670]/g, '')
    .replace(/[أإآٱ]/g, 'ا')
    .replace(/[ىئ]/g, 'ي')
    .replace(/ؤ/g, 'و')
    .replace(/ة/g, 'ه')
    .replace(/\s+/g, ' ')
    .trim();
};
/**
 * Levenshtein (edit) distance between two strings: the minimum number of
 * single-character insertions, deletions and substitutions that turn `a`
 * into `b`. Characters are compared by index, i.e. per UTF-16 code unit.
 *
 * @param {string} a - First string
 * @param {string} b - Second string
 * @returns {number} The edit distance
 */
const levenshtein = (a, b) => {
  const rowCount = a.length + 1;
  const colCount = b.length + 1;

  // grid[r][c] holds the distance between the first r characters of `a`
  // and the first c characters of `b`.
  const grid = Array.from({ length: rowCount }, (_, r) => {
    const row = new Array(colCount).fill(0);
    row[0] = r;
    return row;
  });
  for (let c = 1; c < colCount; c += 1) {
    grid[0][c] = c;
  }

  for (let r = 1; r < rowCount; r += 1) {
    for (let c = 1; c < colCount; c += 1) {
      const deletion = grid[r - 1][c] + 1;
      const insertion = grid[r][c - 1] + 1;
      const substitution = grid[r - 1][c - 1] + (a[r - 1] === b[c - 1] ? 0 : 1);
      grid[r][c] = Math.min(deletion, insertion, substitution);
    }
  }

  return grid[rowCount - 1][colCount - 1];
};
/**
 * Score how similar two strings are, from 0 (nothing in common) to 1
 * (equal after normalization). Both inputs go through `normalizeArabic` first.
 *
 * Scoring tiers, checked in this order:
 *  - equal after normalization                  -> 1
 *  - either side empty                          -> 0
 *  - `str2` starts with `str1`                  -> 0.9 .. 1.0
 *  - one string contains the other              -> 0.8 .. 0.95
 *  - otherwise                                  -> 1 - editDistance / longerLength
 *
 * @param {string} str1 - First string (the prefix check treats it as the search term)
 * @param {string} str2 - Second string
 * @returns {number} Similarity score
 */
const getSimilarity = (str1, str2) => {
  const query = normalizeArabic(str1);
  const candidate = normalizeArabic(str2);

  if (query === candidate) return 1;
  if (query.length === 0 || candidate.length === 0) return 0;

  const longer = Math.max(query.length, candidate.length);
  const shorter = Math.min(query.length, candidate.length);
  const coverage = shorter / longer;

  // A prefix match outranks a plain containment match.
  if (candidate.startsWith(query)) {
    return 0.9 + coverage * 0.1;
  }
  if (query.includes(candidate) || candidate.includes(query)) {
    return 0.8 + coverage * 0.15;
  }

  return 1 - levenshtein(query, candidate) / longer;
};
// Letters treated as "flexible" in multi-letter terms: the Alef, Waw and Yaa
// families plus Haa / Taa Marbouta. They may be absent from the matched text.
const ARABIC_FLEXIBLE_CHARS = new Set([
  'ا',
  'أ',
  'إ',
  'آ',
  'ٱ',
  'و',
  'ؤ',
  'ي',
  'ى',
  'ئ',
  'ه',
  'ة',
]);

/**
 * Convert a search term into a fuzzy regex pattern string.
 *
 * How the pattern is built:
 *  - Arabic diacritics (U+064B–U+065F, U+0670) are removed from the term.
 *  - The pattern starts with an optional "ال" prefix followed by optional whitespace.
 *  - Each character listed in CHAR_TO_GROUP becomes the character class of its
 *    similar-letter group; any other character is regex-escaped.
 *  - Every character may repeat. In terms of two or more characters the
 *    flexible letters may also be missing, and one arbitrary character is
 *    allowed between consecutive characters.
 *  - If every character of a multi-character term is flexible, the first one
 *    is kept mandatory so the pattern can never match the empty string.
 *
 * @param {string} term - The raw search term from the user
 * @returns {string} A regex-compatible pattern string; an empty string when
 *   the term is empty, null, undefined, or consists only of diacritics
 *
 * @example
 * buildArabicFuzzyPattern('سن')
 * // Returns the pattern: (?:ال)?\s*(?:[سش])+.?(?:ن)+
 */
const buildArabicFuzzyPattern = (term) => {
  if (!term) return '';

  const cleaned = term.replace(/[\u064B-\u065F\u0670]/g, '');
  if (!cleaned) return '';

  const chars = [...cleaned];
  const isLongTerm = chars.length >= 2;
  const allFlexible = chars.every((char) => ARABIC_FLEXIBLE_CHARS.has(char));

  const pieces = chars.map((char, index) => {
    const charPattern =
      CHAR_TO_GROUP[char] || char.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // Without the mandatory first letter, an all-flexible term would produce
    // a pattern made only of optional parts, which matches every input.
    const isOptional =
      isLongTerm &&
      ARABIC_FLEXIBLE_CHARS.has(char) &&
      !(allFlexible && index === 0);
    return `(?:${charPattern})${isOptional ? '*' : '+'}`;
  });

  // '.?' between pieces tolerates one extra character between letters.
  return `(?:ال)?\\s*${pieces.join(isLongTerm ? '.?' : '')}`;
};
/**
 * Compile the fuzzy pattern for `term` into a RegExp.
 *
 * @param {string} term - The raw search term
 * @param {string} [flags='i'] - RegExp flags
 * @returns {RegExp} A RegExp built from `buildArabicFuzzyPattern(term)`
 */
const createArabicFuzzyRegex = (term, flags = 'i') =>
  new RegExp(buildArabicFuzzyPattern(term), flags);
// Matches any single character from the Unicode Arabic block (U+0600–U+06FF).
const ARABIC_BLOCK_PATTERN = /[\u0600-\u06FF]/;

/**
 * Report whether a search term has at least one character in the
 * U+0600–U+06FF range.
 *
 * @param {string} term - The search term
 * @returns {boolean} true when such a character is present
 */
const containsArabic = (term) => ARABIC_BLOCK_PATTERN.test(term);
/**
 * Build the search RegExp for a user-supplied term.
 *
 *  - No Arabic characters: case-insensitive literal (partial) match.
 *  - Arabic, at most 2 characters once diacritics are removed: the same
 *    literal match on the term as typed, because a fuzzy pattern this short
 *    would match too many products.
 *  - Arabic, longer: the fuzzy pattern from `createArabicFuzzyRegex`.
 *
 * @param {string} term - The raw search term
 * @returns {RegExp} The regular expression to search with
 */
const createSmartSearchRegex = (term) => {
  // Escapes regex metacharacters so the term is matched literally.
  const literalMatcher = () =>
    new RegExp(term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'i');

  if (!containsArabic(term)) {
    return literalMatcher();
  }

  // Diacritics are ignored only for the length check; the literal matcher
  // still uses the term exactly as typed.
  const withoutDiacritics = term.replace(/[\u064B-\u065F\u0670]/g, '');
  return withoutDiacritics.length > 2
    ? createArabicFuzzyRegex(term)
    : literalMatcher();
};
| module.exports = { | |
| normalizeArabic, | |
| convertArabicNumerals, | |
| getSimilarity, | |
| buildArabicFuzzyPattern, | |
| createArabicFuzzyRegex, | |
| containsArabic, | |
| createSmartSearchRegex, | |
| ARABIC_SIMILAR_GROUPS, | |
| }; | |