| | |
| |
|
| | import os |
| | import re |
| | from glob import glob |
| | from functions import * |
| |
|
| | |
| | |
class database(object):
    """Accumulator for an aligned French/Alsatian parallel corpus."""

    def __init__(self):
        """Start with empty frequency tables and an empty pair list."""
        # Frequency tables, one pair per language:
        # sentence -> occurrence count, word -> occurrence count.
        self.count_sentences_als = {}
        self.count_words_als = {}
        self.count_sentences_fr = {}
        self.count_words_fr = {}
        # Aligned sentence pairs, each stored as {'fr': ..., 'als': ...}.
        self.db = []
|
| | |
| | def count_sentences_words_als(self, line): |
| | """ |
| | Fill up the Alsacien dictionary of counts of sentences and words |
| | """ |
| | |
| | if line in self.count_sentences_als.keys(): |
| | self.count_sentences_als[line] = self.count_sentences_als[line] + 1 |
| | else: |
| | self.count_sentences_als[line] = 1 |
| |
|
| | |
| | for word in line.split(): |
| | if word in self.count_words_als.keys(): |
| | self.count_words_als[word] = self.count_words_als[word] + 1 |
| | else: |
| | self.count_words_als[word] = 1 |
| |
|
| | |
| | def count_sentences_words_fr(self, line): |
| | """ |
| | Fill up the French dictionary of counts of sentences and words |
| | """ |
| | |
| | if line in self.count_sentences_fr.keys(): |
| | self.count_sentences_fr[line] = self.count_sentences_fr[line] + 1 |
| | else: |
| | self.count_sentences_fr[line] = 1 |
| |
|
| | |
| | for word in line.split(): |
| | if word in self.count_words_fr.keys(): |
| | self.count_words_fr[word] = self.count_words_fr[word] + 1 |
| | else: |
| | self.count_words_fr[word] = 1 |
| |
|
| | |
| | def get_data_alsaimmer(self, display=False): |
| | """ |
| | Function to read the xml files from www.alsa-immer.eu |
| | and extract the database |
| | """ |
| | for filename in glob("/content/drive/MyDrive/www.alsa-immer.eu/*xml") + glob("/content/drive/MyDrive/www.alsa-immer.eu/*/*xml") : |
| | try: |
| | fic = open(filename, 'r', encoding="utf-8") |
| | line_als = fic.readline() |
| | except UnicodeDecodeError: |
| | fic = open(filename, 'r', encoding='ISO-8859-1') |
| | line_als = fic.readline() |
| | try: |
| | while True: |
| | if not len(line_als): |
| | raise EOFError |
| |
|
| | if "<als>" in line_als: |
| | if "<fr>" in line_als: |
| | line_fr = line_als |
| | else: |
| | line_fr = fic.readline() |
| | while "<fr>" not in line_fr: |
| | line_fr = fic.readline() |
| | if not len(line_fr): |
| | raise EOFError |
| | |
| | |
| | line_als_clean = extract_between_tags(line_als, "als") |
| | line_fr_clean = extract_between_tags(line_fr, "fr") |
| |
|
| | if len(line_als_clean) and len(line_fr_clean): |
| | |
| | self.count_sentences_words_als(line_als_clean) |
| | |
| | self.count_sentences_words_fr(line_fr_clean) |
| |
|
| | |
| | self.db.append({'fr':line_fr_clean, 'als':line_als_clean}) |
| | |
| |
|
| | |
| | line_als = fic.readline() |
| | except EOFError: |
| | fic.close() |
| |
|
| | if display: |
| | print("Alsacien : %d sentences, %d words"%(sum(self.count_sentences_als.values()), sum(self.count_words_als.values()))) |
| | print("Francais : %d sentences, %d words"%(sum(self.count_sentences_fr.values()), sum(self.count_words_fr.values()))) |
| |
|
| | |
| | def get_data_alsatext(self, display=False): |
| | """ |
| | Script to read the file www.alsatext.eu/cours_grammaire.php |
| | and extract the database. |
| | """ |
| |
|
| | filename="/content/drive/MyDrive/www.alsatext.eu/cours_grammaire.php" |
| | fic = open(filename, 'rt', encoding='utf8') |
| | try: |
| | while True: |
| | line = fic.readline() |
| | if not len(line): |
| | raise EOFError |
| |
|
| | if "<ex_als>" in line and "<ex_fr>" in line: |
| | line_als = extract_between_tags(line, 'ex_als') |
| | line_fr = extract_between_tags(line, 'ex_fr') |
| |
|
| | |
| | line_als_clean = clean_line(remove_html_tags(line_als)) |
| | line_fr_clean = clean_line(remove_html_tags(line_fr)) |
| |
|
| | |
| | self.count_sentences_words_als(line_als_clean) |
| | |
| | self.count_sentences_words_fr(line_fr_clean) |
| |
|
| | |
| | self.db.append({'fr':line_fr_clean, 'als':line_als_clean}) |
| | except EOFError: |
| | fic.close() |
| |
|
| | if display: |
| | print("Alsacien : %d sentences, %d words"%(sum(self.count_sentences_als.values()), sum(self.count_words_als.values()))) |
| | print("Francais : %d sentences, %d words"%(sum(self.count_sentences_fr.values()), sum(self.count_words_fr.values()))) |
| |
|
| | |
| | def get_data_motsAlsacienMulhouse(self, display=False): |
| | """ |
| | Script to extract data from mots_alsacien_Mulhouse.csv |
| | """ |
| | filename="mots_alsacien_Mulhouse.csv" |
| | fic=open("/content/drive/MyDrive/%s"%filename, 'rt', encoding="iso-8859-1") |
| | fic.readline() |
| | try: |
| | while True: |
| | line = fic.readline() |
| | if not len(line): |
| | raise EOFError |
| |
|
| | |
| | self.count_sentences_words_als(line.split(";")[0]) |
| | |
| | self.count_sentences_words_fr(line.split(";")[1]) |
| |
|
| | |
| | self.db.append({'fr':line.split(";")[1], 'als':line.split(";")[0]}) |
| | |
| | except EOFError: |
| | fic.close() |
| |
|
| | if display: |
| | print("Alsacien : %d sentences, %d words"%(sum(self.count_sentences_als.values()), sum(self.count_words_als.values()))) |
| | print("Francais : %d sentences, %d words"%(sum(self.count_sentences_fr.values()), sum(self.count_words_fr.values()))) |
| |
|
| |
|
| | |
| | def get_data_alignments(self, display=False): |
| | """ |
| | Script to extract data from alignments.csv |
| | """ |
| | filename="alignments.csv" |
| | fic=open("/content/drive/MyDrive/%s"%filename, 'rt') |
| | fic.readline() |
| | try: |
| | while True: |
| | line = fic.readline() |
| | if not len(line): |
| | raise EOFError |
| |
|
| | als = line.split('\t')[0].split()[0].split(';')[0] |
| | fr = line.split('\t')[2].split()[0].split(';')[0] |
| |
|
| | |
| | self.count_sentences_words_als(als) |
| | |
| | self.count_sentences_words_fr(fr) |
| |
|
| | |
| | self.db.append({'fr':fr, 'als':als}) |
| | except EOFError: |
| | fic.close() |
| |
|
| | if display: |
| | print("Alsacien : %d sentences, %d words"%(sum(self.count_sentences_als.values()), sum(self.count_words_als.values()))) |
| | print("Francais : %d sentences, %d words"%(sum(self.count_sentences_fr.values()), sum(self.count_words_fr.values()))) |
| |
|
| |
|
| |
|
| | |
    def get_data_lexique(self, display=False):
        """
        Read the 'lexique_*.pdf' and extract the data.

        The PDF is first converted with the external `pdftohtml` command;
        the resulting '<name>s.html' content page is then scanned line by
        line for French/Alsatian entry pairs.

        NOTE(review): this looks like work-in-progress debug code -- it
        prints each pair, caps extraction at N>5, stops after the first
        pair (`break`), and on that path the file handle is never closed.

        Parameters
        ----------
        display : bool
            If True, print the accumulated sentence and word totals.
        """
        filename="../lexique_artisans"
        # Convert the PDF to HTML, then remove the pdftohtml by-products,
        # keeping only the '<name>s.html' content page opened below.
        os.system('pdftohtml -q %s.pdf'%filename)
        os.system('rm %s-* %s_* %s.html'%(filename, filename, filename))

        fic=open("%ss.html"%filename, 'rt')
        start = False  # becomes True once <body> has been seen
        N = 0          # number of pairs extracted so far
        try:
            while True:
                line = fic.readline()
                # EOFError doubles as the loop-exit signal; the N>5 cap
                # looks like a debugging limit -- TODO confirm.
                if not len(line) or N>5:
                    raise EOFError

                if '<body>' in line:
                    start = True
                if start:
                    # A French headword line ends with ' <br/>'; its
                    # Alsatian translation follows in <i>...</i> markup.
                    if " <br/>\n" in line and len(clean_html(line.split(' <br/>')[0])):
                        line_fr = clean_html(line.split(' <br/>')[0])
                        line_als = line
                        # NOTE(review): clean_html() presumably strips the
                        # markup, so the ' </i><br/>' terminator may never
                        # appear in line_als -- confirm this loop can end.
                        while " </i><br/>\n" not in line_als:
                            line_als = clean_html(extract_between_tags(fic.readline(), 'i'))

                        self.db.append({'fr':line_fr, 'als':line_als})
                        print({'fr':line_fr, 'als':line_als})  # debug output
                        N = N +1

                        self.count_sentences_words_als(line_als)
                        self.count_sentences_words_fr(line_fr)
                        # Exits the outer while after the first pair,
                        # bypassing the except branch that closes fic.
                        break
        except EOFError:
            fic.close()

        if display:
            print("Alsacien : %d sentences, %d words"%(sum(self.count_sentences_als.values()), sum(self.count_words_als.values())))
            print("Francais : %d sentences, %d words"%(sum(self.count_sentences_fr.values()), sum(self.count_words_fr.values())))
| |
|
| |
|
| | |
    def create_db(self):
        """
        Create a dictionary for each sentence:
        {'fr': 'ksjdfdk', 'als':'rtefv'}
        """
        # NOTE(review): not implemented -- the docstring is the entire
        # body; the get_data_* methods currently populate self.db instead.
| | |
| |
|
| |
|
| |
|
| |
|