File size: 2,939 Bytes
21baa2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80


'''
Created on 11/05/2010

@Created by: Muhammad Altabba
'''
from Models.Lexicon.RootsAndPatternsRepository import *;
from Models.Lexicon.SpecialWords.StandAloneParticle import *;
from Models.Lexicon.SpecialWords.ProperNoun import *;

from Controllers.TextEntities.TextEncapsulator import *;
from Controllers.TextEntities.Word import *;
from Controllers.Tokenization.Tokenizer import *;
from Controllers.Normalization.Normalizer import *;
from Controllers.Morphology.AffixParser import *;
from Controllers.Morphology.MorphologicalAnalyzer import *;

import codecs;
import io;
import os;
from os.path import join, getsize;



# Clitic transducer definitions handed to the analyzer's loader.
procliticsXmlFile = '../../Data/MorphologyTransducers/Proclitics.xml'
encliticsXmlFile = '../../Data/MorphologyTransducers/Enclitics.xml'

# NOTE(review): this is the same path as procliticsXmlFile and the name is not
# referenced anywhere in the visible script -- possibly a copy-paste leftover;
# confirm which file (if any) was intended.
compoundNounsXmlFile = '../../Data/MorphologyTransducers/Proclitics.xml'

# Tagging rule sets handed to the analyzer's loader.
prematureTaggingRulesXmlFile = '../../Data/TaggingRepository/PrematureTaggingRules.xml'
overdueTaggingRulesXmlFile = '../../Data/TaggingRepository/OverdueTaggingRules.xml'

# Location of the AlKhalil database and the roots folder name passed with it.
baseDirectoryOfAlKhalil = 'D:/temp/AlKhalil_1/db/'
rootsFolder = 'roots2'


# Create the single analyzer object reused for every input file and load its
# resources once, before any file is processed.
text = TextEncapsulator()
text.LoadFromFiles(
    baseDirectoryOfAlKhalil,
    rootsFolder,
    procliticsXmlFile,
    encliticsXmlFile,
    prematureTaggingRulesXmlFile,
    overdueTaggingRulesXmlFile,
)

# Top directory whose sub-directories hold the '.txt' files to analyze.
base = 'D:/temp/Latifa2/'




def _analyze_file(directory_path, file_name):
    '''
    Run the analysis pipeline on one text file and save the rendered result.

    The file is read as UTF-8, pushed through the shared ``text`` object
    (Tokenize, Normalize(2), ParseClitics, PatternMatchingSimpleStem), and the
    output of ``RenderTextSimpleStem`` is written as UTF-8 to a sibling file
    whose name has the '.txt' extension replaced by '-Qutuf.txt'.

    @param directory_path: directory that contains the input file.
    @param file_name: name of the input file; expected to end with '.txt'.
    '''
    print('\tStart parsing file: ['+file_name+']')

    # 'with' guarantees the handle is released even if decoding fails.
    with codecs.open(os.path.join(directory_path, file_name), 'r', 'utf-8') as reader:
        content = reader.read()

    text.String = content
    text.Tokenize()
    text.Normalize(2)

    text.ParseClitics()

    print('\tProcessing...')

    text.PatternMatchingSimpleStem()

    print('\tWriting...')

    # Render into memory first so that no output file is created (or
    # truncated) when rendering raises.
    with io.StringIO() as xmlStreamWriter:
        text.RenderTextSimpleStem(xmlStreamWriter)
        rendered = xmlStreamWriter.getvalue()

    # Swap only the trailing extension; str.replace would also rewrite any
    # '.txt' occurring earlier in the name.
    output_name = file_name[:-len('.txt')] + '-Qutuf.txt'
    with codecs.open(os.path.join(directory_path, output_name), 'w', 'utf-8') as writer:
        writer.write(rendered)

    print ('\tEnd parsing file: ['+file_name+']')
    print('------------------------------------------------------')


# Process every immediate sub-directory of `base`; each one is walked
# recursively by the inner os.walk. Files located directly in `base` are not
# processed (same as before).
for root, dirs, files in os.walk(base):
    for directory in dirs:
        print('Start parsing directory: ['+directory+']')
        # os.path.join instead of `root+dir`: plain concatenation only
        # produced a valid path when `root` happened to end with a separator.
        for subroot, subdirs, subfiles in os.walk(os.path.join(root, directory)):
            for file in subfiles:
                # Only plain '.txt' inputs: names containing '-' are skipped,
                # which also excludes the '-Qutuf.txt' files written below.
                if not file.endswith('.txt') or '-' in file:
                    continue
                # Restrict the run to files whose name contains 'Edu'.
                if 'Edu' not in file:
                    continue
                _analyze_file(subroot, file)
    # Stop after the top level: the inner walk already descends into nested
    # directories, so continuing the outer walk would visit them again.
    break