Spaces:

samoulla
/

samoulla-backend

Running

App Files Files Community

samoulla-backend / utils /arabicSearch.js

Samoulla Sync Bot

Auto-deploy Samoulla Backend: 19c8fe03f7b7b0f52ba4ad506ac1341bcb562d57

1459d1b 5 days ago

history blame contribute delete

6.67 kB

	/**
	* Arabic Fuzzy Search Utility
	*
	* Handles common Arabic character confusions to improve search results.
	* Similar to how Amazon and other e-commerce platforms handle Arabic text search.
	*
	* Example: searching for "سنيور" will also match "شنيور"
	*/

	// Sets of Arabic letters that users commonly type in place of one another.
	// Letters that share a set are treated as interchangeable when a fuzzy
	// search pattern is generated. The order inside each set is significant:
	// it determines the text of the character class built below.
	const ARABIC_SIMILAR_GROUPS = [
	  // Alef: bare, hamza above, hamza below, madda, wasla
	  ['ا', 'أ', 'إ', 'آ', 'ٱ'],
	  // Seen / Sheen
	  ['س', 'ش'],
	  // Saad / Daad
	  ['ص', 'ض'],
	  // Taa / Dhaa
	  ['ط', 'ظ'],
	  // Haa / Taa Marbouta
	  ['ه', 'ة'],
	  // Yaa / Alef Maqsoura / Yaa with Hamza
	  ['ي', 'ى', 'ئ'],
	  // Waw / Waw with Hamza
	  ['و', 'ؤ'],
	  // Daal / Thaal
	  ['د', 'ذ'],
	  // Taa / Thaa
	  ['ت', 'ث'],
	  // Kaaf / Qaaf
	  ['ك', 'ق'],
	  // Zay is intentionally left ungrouped: its usual partner (Thaal) already
	  // belongs to the Daal / Thaal set above, and a letter may only live in one set.
	  // Haa / Khaa / Jeem
	  ['ح', 'خ', 'ج'],
	  // Ain / Ghain
	  ['ع', 'غ'],
	];

	// Lookup table: letter -> regex character class (e.g. "[...]") listing every
	// member of the set that the letter belongs to.
	const CHAR_TO_GROUP = {};
	for (const members of ARABIC_SIMILAR_GROUPS) {
	  const bracketed = `[${members.join('')}]`;
	  for (const member of members) {
	    CHAR_TO_GROUP[member] = bracketed;
	  }
	}

	/**
	* Replace Arabic-script decimal digits with their ASCII equivalents.
	*
	* Handles both digit ranges of the Unicode Arabic block:
	* Arabic-Indic digits (U+0660–U+0669) and Extended Arabic-Indic digits
	* (U+06F0–U+06F9, the forms used for Persian/Urdu). Every other character
	* is passed through unchanged.
	*
	* @param {string} text - Text that may contain Arabic-script digits.
	*   Falsy input (including `''`, `null`, `undefined` and `0`) yields `''`;
	*   any other non-string value is converted with `String()` first.
	* @returns {string} The text with each Arabic-script digit replaced by '0'–'9'.
	*
	* @example
	* convertArabicNumerals('\u0661\u0662\u0663') // => '123'
	*/
	const convertArabicNumerals = (text) => {
	  if (!text) return '';
	  const value = typeof text === 'string' ? text : String(text);
	  return value.replace(/[\u0660-\u0669\u06F0-\u06F9]/g, (digit) => {
	    const code = digit.charCodeAt(0);
	    // Both ranges are contiguous, so the digit's value is its offset from
	    // the zero of whichever range it falls in.
	    const zero = code >= 0x06f0 ? 0x06f0 : 0x0660;
	    return String(code - zero);
	  });
	};

	/**
	* Reduce Arabic text to a canonical form so that spelling variants compare equal.
	*
	* Applied in order:
	*  1. remove combining marks in U+064B–U+065F plus U+0670 (tashkeel),
	*  2. fold the hamza / madda / wasla forms of Alef to bare Alef,
	*  3. fold Alef Maqsoura and Yaa-with-Hamza to Yaa,
	*  4. fold Waw-with-Hamza to Waw,
	*  5. fold Taa Marbouta to Haa,
	*  6. collapse each run of whitespace to a single space, then trim the ends.
	*
	* @param {string} text - Text to normalize; any falsy value yields `''`.
	* @returns {string} The normalized text.
	*/
	const normalizeArabic = (text) => {
	  if (!text) return '';

	  // [pattern, replacement] pairs, applied top to bottom.
	  const rewrites = [
	    [/[\u064B-\u065F\u0670]/g, ''],
	    [/[أإآٱ]/g, 'ا'],
	    [/[ىئ]/g, 'ي'],
	    [/ؤ/g, 'و'],
	    [/ة/g, 'ه'],
	    [/\s+/g, ' '],
	  ];

	  let result = text;
	  for (const [pattern, replacement] of rewrites) {
	    result = result.replace(pattern, replacement);
	  }
	  return result.trim();
	};

	/**
	* Levenshtein edit distance: the minimum number of single-element
	* insertions, deletions and substitutions needed to turn `a` into `b`.
	*
	* Elements are compared by index with `===`, so for strings the unit of
	* comparison is the UTF-16 code unit.
	*
	* @param {string} a - First sequence.
	* @param {string} b - Second sequence.
	* @returns {number} The edit distance (0 when the inputs are identical).
	*/
	const levenshtein = (a, b) => {
	  const rows = a.length;
	  const cols = b.length;

	  // Distances from the empty prefix of `a` to every prefix of `b`.
	  let previous = Array.from({ length: cols + 1 }, (_, j) => j);

	  for (let i = 1; i <= rows; i += 1) {
	    // First cell: cost of deleting the first i elements of `a`.
	    const current = [i];
	    for (let j = 1; j <= cols; j += 1) {
	      const deletion = previous[j] + 1;
	      const insertion = current[j - 1] + 1;
	      const substitution = previous[j - 1] + (a[i - 1] === b[j - 1] ? 0 : 1);
	      current[j] = Math.min(deletion, insertion, substitution);
	    }
	    previous = current;
	  }

	  return previous[cols];
	};

	/**
	* Score how closely two strings match, from 0 (nothing in common) to 1 (equal).
	*
	* Both inputs are passed through `normalizeArabic` before comparison. The
	* first rule that applies decides the score:
	*  1. normalized strings are equal                 -> 1
	*  2. either normalized string is empty            -> 0
	*  3. `str2` starts with `str1`                    -> 0.9 + 0.1  * (shorter / longer)
	*  4. either string contains the other             -> 0.8 + 0.15 * (shorter / longer)
	*  5. otherwise                                    -> 1 - levenshtein / longer
	*
	* The prefix rule is one-directional: only `str2` beginning with `str1`
	* earns the prefix bonus, so argument order matters.
	*
	* @param {string} str1 - First string (the one tested as a prefix of `str2`).
	* @param {string} str2 - Second string.
	* @returns {number} Similarity in the range [0, 1].
	*/
	const getSimilarity = (str1, str2) => {
	  const s1 = normalizeArabic(str1);
	  const s2 = normalizeArabic(str2);
	  if (s1 === s2) return 1;
	  if (s1.length === 0 || s2.length === 0) return 0;

	  const longer = Math.max(s1.length, s2.length);
	  const shorter = Math.min(s1.length, s2.length);

	  // Candidate begins with the term: strongest partial match.
	  if (s2.startsWith(s1)) {
	    return 0.9 + (shorter / longer) * 0.1;
	  }

	  // One string appears somewhere inside the other.
	  if (s1.includes(s2) || s2.includes(s1)) {
	    return 0.8 + (shorter / longer) * 0.15;
	  }

	  // No containment: fall back to edit distance scaled by the longer length.
	  return 1 - levenshtein(s1, s2) / longer;
	};

	/**
	* Turn a search term into the source text of a forgiving regular expression.
	*
	* Steps:
	*  - Combining marks in U+064B–U+065F plus U+0670 are removed; if nothing is
	*    left the function returns `''`.
	*  - The pattern opens with an optional definite article and optional
	*    whitespace: `(?:ال)?\s*`.
	*  - Each remaining character becomes one group. A character found in
	*    CHAR_TO_GROUP contributes its whole character class; any other character
	*    is regex-escaped and matched literally.
	*  - When the stripped term has length >= 2, groups for long-vowel / Alef /
	*    Haa-type letters are quantified with `*` (may be absent or repeated), and
	*    an optional wildcard `.?` is placed between consecutive groups to
	*    tolerate one extra character. All other groups use `+`.
	*
	* Note: a term of length >= 2 made up only of the "flexible" letters yields a
	* pattern whose every part is optional, so it can match the empty string.
	*
	* @param {string} term - Raw search term from the user.
	* @returns {string} Regex source text, or `''` when the term has no content.
	*
	* @example
	* buildArabicFuzzyPattern('سنيور')
	* // => '(?:ال)?\s*(?:[سش])+.?(?:ن)+.?(?:[يىئ])*.?(?:[وؤ])*.?(?:ر)+'
	*/
	const buildArabicFuzzyPattern = (term) => {
	  const stripped = term.replace(/[\u064B-\u065F\u0670]/g, '');
	  if (!stripped) return '';

	  // Letters that may be dropped or doubled in longer terms.
	  const flexibleLetters = new Set([
	    'ا',
	    'أ',
	    'إ',
	    'آ',
	    'ٱ',
	    'و',
	    'ؤ',
	    'ي',
	    'ى',
	    'ئ',
	    'ه',
	    'ة',
	  ]);

	  const fuzzy = stripped.length >= 2;
	  const letters = [...stripped];
	  const lastIndex = letters.length - 1;

	  const segments = letters.map((letter, index) => {
	    const alternatives =
	      CHAR_TO_GROUP[letter] || letter.replace(/[.*+?^${}()\|[\]\\]/g, '\\$&');
	    const quantifier = fuzzy && flexibleLetters.has(letter) ? '*' : '+';
	    // Optional single-character gap after every group except the last.
	    const gap = fuzzy && index < lastIndex ? '.?' : '';
	    return `(?:${alternatives})${quantifier}${gap}`;
	  });

	  return `(?:ال)?\\s*${segments.join('')}`;
	};

	/**
	* Compile the fuzzy pattern for `term` into a RegExp.
	*
	* @param {string} term - Raw search term, forwarded to buildArabicFuzzyPattern.
	* @param {string} [flags='i'] - Flags passed to the RegExp constructor.
	* @returns {RegExp} Regex built from the fuzzy pattern. When the pattern is
	*   the empty string, the resulting regex matches any input.
	*/
	const createArabicFuzzyRegex = (term, flags = 'i') =>
	  new RegExp(buildArabicFuzzyPattern(term), flags);

	/**
	* Report whether `term` has at least one character from the Unicode
	* Arabic block (U+0600–U+06FF).
	*
	* @param {string} term - Text to inspect.
	* @returns {boolean} `true` if any such character is present.
	*/
	const containsArabic = (term) => {
	  const arabicBlock = /[\u0600-\u06FF]/;
	  return arabicBlock.test(term);
	};

	/**
	* Build the RegExp used to search for a user-supplied term.
	*
	* - Terms without Arabic characters: case-insensitive literal substring match.
	* - Arabic terms of at most 2 characters (after removing diacritics):
	*   case-insensitive literal substring match on the diacritic-free text, so
	*   the fuzzy pattern does not over-match on very short input. If removing
	*   diacritics leaves nothing, the raw term is matched literally instead.
	* - Longer Arabic terms: fuzzy match via createArabicFuzzyRegex.
	*
	* Regex metacharacters in literal matches are always escaped.
	*
	* @param {string} term - Raw search term from the user.
	* @returns {RegExp} Regex suitable for partial matching.
	*/
	const createSmartSearchRegex = (term) => {
	  // Escape regex metacharacters so user input is matched literally.
	  const escapeLiteral = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

	  if (!containsArabic(term)) {
	    return new RegExp(escapeLiteral(term), 'i');
	  }

	  const cleaned = term.replace(/[\u064B-\u065F\u0670]/g, '');
	  if (cleaned.length <= 2) {
	    // Search with the diacritic-free text, as the fuzzy path does; otherwise
	    // a short term typed with tashkeel would only match text carrying the
	    // same marks. `|| term` avoids an empty (match-everything) pattern when
	    // the term consists solely of diacritics.
	    return new RegExp(escapeLiteral(cleaned || term), 'i');
	  }

	  return createArabicFuzzyRegex(term);
	};

	// Public API of this module (CommonJS). `levenshtein` and `CHAR_TO_GROUP`
	// are not listed here and therefore stay internal to the file.
	module.exports = {
	normalizeArabic,
	convertArabicNumerals,
	getSimilarity,
	buildArabicFuzzyPattern,
	createArabicFuzzyRegex,
	containsArabic,
	createSmartSearchRegex,
	ARABIC_SIMILAR_GROUPS,
	};