samoulla-backend / utils /arabicSearch.js
Samoulla Sync Bot
Auto-deploy Samoulla Backend: 19c8fe03f7b7b0f52ba4ad506ac1341bcb562d57
1459d1b
/**
* Arabic Fuzzy Search Utility
*
* Handles common Arabic character confusions to improve search results.
* Similar to how Amazon and other e-commerce platforms handle Arabic text search.
*
* Example: searching for "سنيور" will also match "شنيور"
*/
// Families of Arabic letters that users commonly confuse or type
// interchangeably. Each inner array is one family of equivalent letters.
const ARABIC_SIMILAR_GROUPS = [
  // Alef variants — ا أ إ آ ٱ
  ['ا', 'أ', 'إ', 'آ', 'ٱ'],
  // Seen / Sheen — س ش
  ['س', 'ش'],
  // Saad / Daad — ص ض
  ['ص', 'ض'],
  // Taa / Dhaa — ط ظ
  ['ط', 'ظ'],
  // Haa / Taa Marbouta — ه ة
  ['ه', 'ة'],
  // Yaa / Alef Maqsoura / Yaa with Hamza — ي ى ئ
  ['ي', 'ى', 'ئ'],
  // Waw / Waw with Hamza — و ؤ
  ['و', 'ؤ'],
  // Daal / Thaal — د ذ
  ['د', 'ذ'],
  // Taa / Thaa — ت ث
  ['ت', 'ث'],
  // Kaaf / Qaaf — ك ق
  ['ك', 'ق'],
  // There is deliberately no Zay / Thaal (ز ذ) family: ذ already belongs to
  // the د/ذ family above, and the lookup below keeps one class per letter.
  // Haa / Khaa / Jeem — ح خ ج
  ['ح', 'خ', 'ج'],
  // Ain / Ghain — ع غ
  ['ع', 'غ'],
];

// Lookup table: letter -> regex character class covering its whole family,
// e.g. 'س' -> '[سش]'. Letters outside every family have no entry.
const CHAR_TO_GROUP = {};
for (const family of ARABIC_SIMILAR_GROUPS) {
  const familyClass = `[${family.join('')}]`;
  for (const letter of family) {
    CHAR_TO_GROUP[letter] = familyClass;
  }
}
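/*
 * Documentation for buildArabicFuzzyPattern(), which is defined further down
 * in this file (this comment does not describe the function directly below).
 *
 * Converts an Arabic search term into a fuzzy regex pattern string.
 *
 * Common Arabic diacritics (tashkeel) are stripped first, so "سُنيور" is
 * treated like "سنيور". Each remaining character that belongs to a
 * "similar group" is replaced with a character class containing all members
 * of that group. The pattern also starts with an optional "ال" prefix, lets
 * letters repeat, makes long-vowel-like letters optional, and allows one
 * optional extra character between letters.
 *
 * @param {string} term - The raw search term from the user
 * @returns {string} A regex-compatible pattern string
 *
 * @example
 * buildArabicFuzzyPattern('سنيور')
 * // Returns: (?:ال)?\s*(?:[سش])+.?(?:ن)+.?(?:[يىئ])*.?(?:[وؤ])*.?(?:ر)+
 * // This pattern matches: سنيور, شنيور, سنيؤر, etc.
 */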
/**
 * Convert Arabic digits in a string to their ASCII equivalents.
 *
 * Handles both digit sets found in the Arabic Unicode block:
 *   - Arabic-Indic digits          ٠-٩ (U+0660–U+0669)
 *   - Extended Arabic-Indic digits ۰-۹ (U+06F0–U+06F9, produced by
 *     Persian/Urdu keyboard layouts)
 * All other characters are left untouched.
 *
 * @param {string} text - Text that may contain Arabic digits
 * @returns {string} The text with Arabic digits replaced by 0-9, or '' when
 *   `text` is falsy
 */
const convertArabicNumerals = (text) => {
  if (!text) return '';
  return text.replace(/[\u0660-\u0669\u06F0-\u06F9]/g, (digit) => {
    const code = digit.charCodeAt(0);
    // Each digit set is contiguous, so the offset from its zero is the value.
    const zero = code >= 0x06f0 ? 0x06f0 : 0x0660;
    return String(code - zero);
  });
};
/**
 * Normalize Arabic text for comparison.
 *
 * Steps, in order:
 *   1. Remove tatweel/kashida (U+0640) and diacritics (U+064B–U+065F, U+0670),
 *      so decorative elongation such as "محـــمد" compares equal to "محمد".
 *   2. Fold Alef variants (أ إ آ ٱ) to bare Alef (ا).
 *   3. Fold Alef Maqsoura and Yaa-with-Hamza (ى ئ) to Yaa (ي).
 *   4. Fold Waw-with-Hamza (ؤ) to Waw (و).
 *   5. Fold Taa Marbouta (ة) to Haa (ه).
 *   6. Collapse whitespace runs to a single space and trim both ends.
 *
 * @param {string} text - Raw text to normalize
 * @returns {string} The normalized text, or '' when `text` is falsy
 */
const normalizeArabic = (text) => {
  if (!text) return '';
  return text
    .replace(/[\u0640\u064B-\u065F\u0670]/g, '') // Strip tatweel + diacritics
    .replace(/[أإآٱ]/g, 'ا')
    .replace(/[ىئ]/g, 'ي')
    .replace(/ؤ/g, 'و')
    .replace(/ة/g, 'ه')
    .replace(/\s+/g, ' ')
    .trim();
};
/**
 * Levenshtein (edit) distance between two strings: the minimum number of
 * single-character insertions, deletions and substitutions needed to turn
 * `a` into `b`. Characters are compared by string index.
 *
 * @param {string} a - First string
 * @param {string} b - Second string
 * @returns {number} The edit distance
 */
const levenshtein = (a, b) => {
  // previousRow[j] holds the distance between the first (i - 1) characters of
  // `a` and the first j characters of `b`; it starts as the i = 0 row.
  let previousRow = Array.from({ length: b.length + 1 }, (_, j) => j);

  for (let i = 1; i <= a.length; i++) {
    // Column 0: deleting all i characters of `a`.
    const currentRow = [i];
    for (let j = 1; j <= b.length; j++) {
      const substitutionCost = a[i - 1] === b[j - 1] ? 0 : 1;
      currentRow[j] = Math.min(
        previousRow[j] + 1, // deletion
        currentRow[j - 1] + 1, // insertion
        previousRow[j - 1] + substitutionCost, // substitution / match
      );
    }
    previousRow = currentRow;
  }

  return previousRow[b.length];
};
/**
 * Score how similar two strings are, from 0 (nothing in common) to 1
 * (identical after normalization). Both inputs go through normalizeArabic
 * before being compared.
 *
 * Scoring tiers, checked in order:
 *   - identical                          -> 1
 *   - either side empty                  -> 0
 *   - str2 starts with str1              -> 0.9 .. 1.0, scaled by length ratio
 *   - one string contains the other      -> 0.8 .. 0.95, scaled by length ratio
 *   - otherwise                          -> 1 - (edit distance / longer length)
 *
 * @param {string} str1 - First string (the prefix tier checks it against str2)
 * @param {string} str2 - Second string
 * @returns {number} Similarity score between 0 and 1
 */
const getSimilarity = (str1, str2) => {
  const first = normalizeArabic(str1);
  const second = normalizeArabic(str2);

  if (first === second) return 1;
  if (!first.length || !second.length) return 0;

  const longest = Math.max(first.length, second.length);
  const shortest = Math.min(first.length, second.length);
  const lengthRatio = shortest / longest;

  // Prefix match: weighted highest after exact equality.
  if (second.startsWith(first)) {
    return 0.9 + lengthRatio * 0.1;
  }

  // Substring match in either direction.
  const oneContainsOther = first.includes(second) || second.includes(first);
  if (oneContainsOther) {
    return 0.8 + lengthRatio * 0.15;
  }

  // Fall back to edit distance, normalized by the longer string.
  return 1 - levenshtein(first, second) / longest;
};
/**
 * Convert a search term into a fuzzy regex pattern string.
 *
 * The term is first stripped of Arabic diacritics (U+064B–U+065F, U+0670).
 * The resulting pattern is assembled as:
 *   - an optional "ال" prefix followed by optional whitespace;
 *   - one non-capturing group per character, holding either the character's
 *     similar-letter class from CHAR_TO_GROUP or the regex-escaped character;
 *   - a quantifier per group: `*` for Alef/Waw/Yaa/Haa variants when the term
 *     has at least two characters, `+` otherwise;
 *   - for terms of at least two characters, a `.?` gap between consecutive
 *     groups so one extra letter in the text is tolerated.
 *
 * NOTE(review): the adjacent `*` / `+` / `.?` quantifiers can backtrack
 * heavily on long terms — consider capping term length in callers.
 *
 * @param {string} term - The raw search term
 * @returns {string} Regex source string, or '' when nothing remains after
 *   stripping diacritics
 */
const buildArabicFuzzyPattern = (term) => {
  const stripped = term.replace(/[\u064B-\u065F\u0670]/g, '');
  if (!stripped) return '';

  // Letters that may be dropped entirely (and repeated) in multi-letter terms.
  const flexibleLetters = [
    'ا',
    'أ',
    'إ',
    'آ',
    'ٱ',
    'و',
    'ؤ',
    'ي',
    'ى',
    'ئ',
    'ه',
    'ة',
  ];

  const isMultiLetter = stripped.length >= 2;
  const letters = [...stripped];
  const lastIndex = letters.length - 1;

  const segments = letters.map((letter, index) => {
    // Similar-letter class when one exists, otherwise the escaped literal.
    const body =
      CHAR_TO_GROUP[letter] || letter.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const quantifier =
      isMultiLetter && flexibleLetters.includes(letter) ? '*' : '+';
    // Optional single-character gap after every letter except the last.
    const gap = isMultiLetter && index < lastIndex ? '.?' : '';
    return `(?:${body})${quantifier}${gap}`;
  });

  return `(?:ال)?\\s*${segments.join('')}`;
};
/**
 * Build a RegExp from the fuzzy pattern produced by buildArabicFuzzyPattern.
 *
 * @param {string} term - The raw search term
 * @param {string} [flags='i'] - RegExp flags to apply
 * @returns {RegExp} The compiled fuzzy regex
 */
const createArabicFuzzyRegex = (term, flags = 'i') =>
  new RegExp(buildArabicFuzzyPattern(term), flags);
/**
 * Report whether a term contains at least one character from the Unicode
 * Arabic block (U+0600–U+06FF).
 *
 * @param {string} term - Text to inspect
 * @returns {boolean} True when any character falls in U+0600–U+06FF
 */
const containsArabic = (term) => {
  const arabicBlock = /[\u0600-\u06FF]/;
  return arabicBlock.test(term);
};
/**
 * Create a case-insensitive, partial-match search regex for a term, using
 * fuzzy matching for Arabic text.
 *
 * - Non-Arabic term: the term is regex-escaped and matched literally.
 * - Arabic term of more than two characters (diacritics not counted):
 *   delegated to createArabicFuzzyRegex.
 * - Arabic term of one or two characters: fuzzy matching is skipped because
 *   it would match too many products. The letters are matched literally,
 *   with any diacritics allowed between them, so a term typed with tashkeel
 *   matches text without it and vice versa.
 *
 * @param {string} term - The raw search term from the user
 * @returns {RegExp} A regex suitable for partial matching
 */
const createSmartSearchRegex = (term) => {
  const escapeRegex = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  // Non-Arabic text: standard case-insensitive partial match.
  if (!containsArabic(term)) {
    return new RegExp(escapeRegex(term), 'i');
  }

  // Diacritics do not count towards the length of the term.
  const cleaned = term.replace(/[\u064B-\u065F\u0670]/g, '');

  if (cleaned.length > 2) {
    return createArabicFuzzyRegex(term);
  }

  // The term consisted of diacritics only: match it literally.
  if (!cleaned) {
    return new RegExp(escapeRegex(term), 'i');
  }

  // Short term: match the bare letters, tolerating diacritics between them.
  // The class is built from literal characters (string escapes, not regex
  // escapes) so the resulting source does not depend on \u support.
  const optionalDiacritics = '[\u064B-\u065F\u0670]*';
  const pattern = [...cleaned]
    .map((letter) => escapeRegex(letter))
    .join(optionalDiacritics);
  return new RegExp(pattern, 'i');
};
// Public API of this module (CommonJS). The levenshtein helper and the
// CHAR_TO_GROUP lookup table are not exported.
module.exports = {
normalizeArabic,
convertArabicNumerals,
getSimilarity,
buildArabicFuzzyPattern,
createArabicFuzzyRegex,
containsArabic,
createSmartSearchRegex,
ARABIC_SIMILAR_GROUPS,
};