| """ |
| Note: this code is used by the bw2ar.py file. |
| """ |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """ Constants for Arabic. """ |
| import re |
# Arabic punctuation.
COMMA = u"\u060C"
SEMICOLON = u"\u061B"
QUESTION = u"\u061F"

# Letters, in code-point order (U+0621 - U+064A).  TATWEEL (U+0640), the
# elongation character, sits between GHAIN and FEH.
HAMZA = u"\u0621"
ALEF_MADDA = u"\u0622"
ALEF_HAMZA_ABOVE = u"\u0623"
WAW_HAMZA = u"\u0624"
ALEF_HAMZA_BELOW = u"\u0625"
YEH_HAMZA = u"\u0626"
ALEF = u"\u0627"
BEH = u"\u0628"
TEH_MARBUTA = u"\u0629"
TEH = u"\u062a"
THEH = u"\u062b"
JEEM = u"\u062c"
HAH = u"\u062d"
KHAH = u"\u062e"
DAL = u"\u062f"
THAL = u"\u0630"
REH = u"\u0631"
ZAIN = u"\u0632"
SEEN = u"\u0633"
SHEEN = u"\u0634"
SAD = u"\u0635"
DAD = u"\u0636"
TAH = u"\u0637"
ZAH = u"\u0638"
AIN = u"\u0639"
GHAIN = u"\u063a"
TATWEEL = u"\u0640"
FEH = u"\u0641"
QAF = u"\u0642"
KAF = u"\u0643"
LAM = u"\u0644"
MEEM = u"\u0645"
NOON = u"\u0646"
HEH = u"\u0647"
WAW = u"\u0648"
ALEF_MAKSURA = u"\u0649"
YEH = u"\u064a"

# Combining madda / hamza marks.
MADDA_ABOVE = u"\u0653"
HAMZA_ABOVE = u"\u0654"
HAMZA_BELOW = u"\u0655"

# Arabic-Indic digits.
ZERO = u"\u0660"
ONE = u"\u0661"
TWO = u"\u0662"
THREE = u"\u0663"
FOUR = u"\u0664"
FIVE = u"\u0665"
SIX = u"\u0666"
SEVEN = u"\u0667"
EIGHT = u"\u0668"
NINE = u"\u0669"

# Numeric punctuation and the five-pointed star.
PERCENT = u"\u066a"
DECIMAL = u"\u066b"
THOUSANDS = u"\u066c"
STAR = u"\u066d"

# Superscript (dagger) alef and alef wasla.
MINI_ALEF = u"\u0670"
ALEF_WASLA = u"\u0671"

# Miscellaneous.
FULL_STOP = u"\u06d4"
BYTE_ORDER_MARK = u"\ufeff"
|
|
| |
# Doubled-vowel (tanween) marks.
FATHATAN = u"\u064b"
DAMMATAN = u"\u064c"
KASRATAN = u"\u064d"

# Short-vowel marks.
FATHA = u"\u064e"
DAMMA = u"\u064f"
KASRA = u"\u0650"

# Gemination (shadda) and vowel-absence (sukun) marks.
SHADDA = u"\u0651"
SUKUN = u"\u0652"
|
|
| |
# Lam-alef ligatures as single presentation-form code points.
LAM_ALEF = u"\ufefb"
LAM_ALEF_HAMZA_ABOVE = u"\ufef7"
LAM_ALEF_HAMZA_BELOW = u"\ufef9"
LAM_ALEF_MADDA_ABOVE = u"\ufef5"

# The same combinations spelled as two letters: LAM (U+0644) followed by
# the matching alef variant.
SIMPLE_LAM_ALEF = u"\u0644\u0627"
SIMPLE_LAM_ALEF_HAMZA_ABOVE = u"\u0644\u0623"
SIMPLE_LAM_ALEF_HAMZA_BELOW = u"\u0644\u0625"
SIMPLE_LAM_ALEF_MADDA_ABOVE = u"\u0644\u0622"
|
|
|
|
# Each pattern below is a single character class built from the named
# constants, so it matches exactly one occurrence of any listed character.

# Tanween, short-vowel, sukun and shadda marks.
HARAKAT_PAT = re.compile(
    u"["
    + FATHATAN + DAMMATAN + KASRATAN
    + FATHA + DAMMA + KASRA
    + SUKUN + SHADDA
    + u"]"
)

# Hamza carried on waw or yeh.
HAMZAT_PAT = re.compile(u"[" + WAW_HAMZA + YEH_HAMZA + u"]")

# Alef variants with madda/hamza, plus the combining hamza marks.
ALEFAT_PAT = re.compile(
    u"["
    + ALEF_MADDA + ALEF_HAMZA_ABOVE + ALEF_HAMZA_BELOW
    + HAMZA_ABOVE + HAMZA_BELOW
    + u"]"
)

# Single-code-point lam-alef ligatures.
LAMALEFAT_PAT = re.compile(
    u"["
    + LAM_ALEF
    + LAM_ALEF_HAMZA_ABOVE
    + LAM_ALEF_HAMZA_BELOW
    + LAM_ALEF_MADDA_ABOVE
    + u"]"
)
|
|
def strip_tashkeel(text):
    """Return ``text`` with harakat and alef wasla characters removed.

    Every character matched by ``HARAKAT_PAT`` (fathatan, dammatan,
    kasratan, fatha, damma, kasra, sukun, shadda) is deleted, and so is
    every ALEF WASLA (U+0671).

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without those characters.
    """
    # HARAKAT_PAT already contains FATHA (U+064E), so a separate pass for
    # that mark would never find anything left to remove.
    text = HARAKAT_PAT.sub('', text)
    # NOTE(review): U+0671 is a letter rather than a combining mark, so this
    # deletes the alef itself, not just the wasla sign -- confirm intended.
    return re.sub(u"[" + ALEF_WASLA + u"]", "", text, flags=re.UNICODE)
|
|
def strip_tatweel(text):
    """Return ``text`` with every TATWEEL (U+0640) character deleted.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without tatweel characters.
    """
    tatweel_class = u'[' + TATWEEL + u']'
    return re.sub(tatweel_class, '', text)
|
|
| |
def remove_non_arabic(text):
    """Reduce ``text`` to Arabic letters separated by single spaces.

    Args:
        text: The string to clean.

    Returns:
        The words of ``text`` joined by one space each, containing only
        characters in U+0621 - U+063A and U+0641 - U+064A.  Leading and
        trailing whitespace is dropped.
    """
    # Diacritics and tatweel are deleted first: they fall outside the kept
    # ranges, so otherwise they would become spaces and split words apart.
    cleaned = strip_tatweel(strip_tashkeel(text))
    # Everything that is neither a kept letter nor a space turns into a space.
    spaced = re.sub(u"[^\u0621-\u063A\u0641-\u064A ]", " ", cleaned, flags=re.UNICODE)
    # split() + join collapses runs of spaces and trims both ends.
    words = spaced.split()
    return ' '.join(words)
|
|
|
|