#!/usr/bin/env python3
"""Build a French / Alsatian parallel database from several local data sources."""
import os
import re
from glob import glob

# NOTE(review): extract_between_tags, remove_html_tags, clean_line and clean_html
# are not defined in this file; presumably they come from this star import.
from functions import *

# --------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------


class database(object):
    """
    Collect French / Alsatian sentence pairs together with occurrence counts.

    Attributes:
        count_sentences_fr / count_sentences_als: dict mapping a sentence to the
            number of times it was added.
        count_words_fr / count_words_als: dict mapping a whitespace-separated
            word to the number of times it was seen.
        db: list of ``{'fr': ..., 'als': ...}`` dictionaries, one per pair.
    """

    # NOTE(review): the four tag markers below are reconstructed. They are
    # inferred from the tag names passed to extract_between_tags(); confirm
    # them against the real input files before relying on them.
    _TAG_ALS = "<als>"
    _TAG_FR = "<fr>"
    _TAG_EX_ALS = "<ex_als>"
    _TAG_EX_FR = "<ex_fr>"
    # NOTE(review): the marker after which get_data_lexique() starts parsing is
    # unknown. An empty marker is contained in every line, so parsing starts at
    # the first line. Replace it with the real marker once known.
    _LEXIQUE_START_MARKER = ""
    # Line-break tag used to detect the end of a sentence in the lexique HTML.
    _LINE_BREAK = "<br/>"

    def __init__(self):
        """Create an empty database with empty sentence and word counters."""
        self.count_sentences_fr = {}
        self.count_sentences_als = {}
        self.count_words_fr = {}
        self.count_words_als = {}
        self.db = []

    # ----------------------------------------------------------------------------------------------
    @staticmethod
    def _tally(sentence_counts, word_counts, line):
        """Increment the count of `line` and of each whitespace-separated word in it."""
        sentence_counts[line] = sentence_counts.get(line, 0) + 1
        for word in line.split():
            word_counts[word] = word_counts.get(word, 0) + 1

    def _add_pair(self, fr, als):
        """Update the counters of both languages and append the pair to `self.db`."""
        self.count_sentences_words_als(als)
        self.count_sentences_words_fr(fr)
        self.db.append({'fr': fr, 'als': als})

    def _print_counts(self):
        """Print the cumulative number of sentences and words per language."""
        print("Alsacien : %d sentences, %d words" % (sum(self.count_sentences_als.values()),
                                                     sum(self.count_words_als.values())))
        print("Francais : %d sentences, %d words" % (sum(self.count_sentences_fr.values()),
                                                     sum(self.count_words_fr.values())))

    @staticmethod
    def _read_lines(filename):
        """
        Return all lines of `filename`, decoded as UTF-8 or, failing that, ISO-8859-1.

        The whole file is decoded before anything is processed, so a decoding
        error in the middle of a file triggers the fallback instead of aborting
        after a part of the file has already been added to the database.
        """
        try:
            with open(filename, 'r', encoding="utf-8") as fic:
                return fic.readlines()
        except UnicodeDecodeError:
            with open(filename, 'r', encoding='ISO-8859-1') as fic:
                return fic.readlines()

    @staticmethod
    def _first_token(field):
        """
        Return the first whitespace-separated token of `field`, cut at the first ';'.

        Returns None when `field` contains no token at all.
        """
        tokens = field.split()
        if not tokens:
            return None
        return tokens[0].split(';')[0]

    # ----------------------------------------------------------------------------------------------
    def count_sentences_words_als(self, line):
        """
        Fill up the Alsacien dictionary of counts of sentences and words

        `line` is counted once as a sentence, and each of its
        whitespace-separated words is counted once.
        """
        self._tally(self.count_sentences_als, self.count_words_als, line)

    # ----------------------------------------------------------------------------------------------
    def count_sentences_words_fr(self, line):
        """
        Fill up the French dictionary of counts of sentences and words

        `line` is counted once as a sentence, and each of its
        whitespace-separated words is counted once.
        """
        self._tally(self.count_sentences_fr, self.count_words_fr, line)

    # ----------------------------------------------------------------------------------------------
    def get_data_alsaimmer(self, display=False):
        """
        Function to read the xml files from www.alsa-immer.eu and extract the database

        For every line holding an Alsatian entry, the French entry is taken
        from the same line or from the next line that holds one. The pair is
        stored only when both cleaned texts are non-empty.

        If `display` is true, the cumulative counts are printed at the end.
        """
        filenames = (glob("/content/drive/MyDrive/www.alsa-immer.eu/*xml")
                     + glob("/content/drive/MyDrive/www.alsa-immer.eu/*/*xml"))
        for filename in filenames:
            lines = iter(self._read_lines(filename))
            for line_als in lines:
                if self._TAG_ALS not in line_als:
                    continue
                if self._TAG_FR in line_als:
                    line_fr = line_als
                else:
                    # most of the time : alsacien followed by french.
                    # Sometimes on the same line
                    line_fr = next((candidate for candidate in lines
                                    if self._TAG_FR in candidate), None)
                    if line_fr is None:
                        break  # end of file reached without a French entry
                # Remove HTML tags:
                line_als_clean = extract_between_tags(line_als, "als")
                line_fr_clean = extract_between_tags(line_fr, "fr")
                if line_als_clean and line_fr_clean:  # if data for both languages
                    self._add_pair(fr=line_fr_clean, als=line_als_clean)
        if display:
            self._print_counts()

    # ----------------------------------------------------------------------------------------------
    def get_data_alsatext(self, display=False):
        """
        Script to read the file www.alsatext.eu/cours_grammaire.php and extract the database.

        Only lines holding both an Alsatian and a French example are used.

        If `display` is true, the cumulative counts are printed at the end.
        """
        filename = "/content/drive/MyDrive/www.alsatext.eu/cours_grammaire.php"
        with open(filename, 'rt', encoding='utf8') as fic:
            for line in fic:
                if self._TAG_EX_ALS not in line or self._TAG_EX_FR not in line:
                    continue
                line_als = extract_between_tags(line, 'ex_als')
                line_fr = extract_between_tags(line, 'ex_fr')
                # Clean data by removing HTML tags and other things:
                line_als_clean = clean_line(remove_html_tags(line_als))
                line_fr_clean = clean_line(remove_html_tags(line_fr))
                self._add_pair(fr=line_fr_clean, als=line_als_clean)
        if display:
            self._print_counts()

    # ----------------------------------------------------------------------------------------------
    def get_data_motsAlsacienMulhouse(self, display=False):
        """
        Script to extract data from mots_alsacien_Mulhouse.csv

        The first ';'-separated field of a row is used as the Alsatian text and
        the second one as the French text. The line terminator is removed
        before splitting, and rows with fewer than two fields are skipped.

        If `display` is true, the cumulative counts are printed at the end.
        """
        filename = "mots_alsacien_Mulhouse.csv"
        with open("/content/drive/MyDrive/%s" % filename, 'rt', encoding="iso-8859-1") as fic:
            fic.readline()  # read header
            for line in fic:
                fields = line.rstrip("\r\n").split(";")
                if len(fields) < 2:
                    continue  # blank or malformed row
                self._add_pair(fr=fields[1], als=fields[0])
        if display:
            self._print_counts()

    # ----------------------------------------------------------------------------------------------
    def get_data_alignments(self, display=False):
        """
        Script to extract data from alignments.csv

        Rows are tab-separated. The first token of the first column is used as
        the Alsatian word and the first token of the third column as the French
        word, each cut at the first ';'. Rows with fewer than three columns or
        with an empty column are skipped.

        If `display` is true, the cumulative counts are printed at the end.
        """
        filename = "alignments.csv"
        # The platform default encoding is used here (no explicit encoding).
        with open("/content/drive/MyDrive/%s" % filename, 'rt') as fic:
            fic.readline()  # read header
            for line in fic:
                columns = line.split('\t')
                if len(columns) < 3:
                    continue  # blank or malformed row
                als = self._first_token(columns[0])
                fr = self._first_token(columns[2])
                if als is None or fr is None:
                    continue  # one of the columns holds no token
                self._add_pair(fr=fr, als=als)
        if display:
            self._print_counts()

    # ----------------------------------------------------------------------------------------------
    def get_data_lexique(self, display=False):
        """
        Read the 'lexique_*.pdf' and extract the data
        Create <filename>.html using `pdftohtml <filename>.pdf` command

        Work in progress: the extraction stops after the first pair found, and
        each stored pair is also printed.

        If `display` is true, the cumulative counts are printed at the end.
        """
        filename = "../lexique_artisans"
        os.system('pdftohtml -q %s.pdf' % filename)
        os.system('rm %s-* %s_* %s.html' % (filename, filename, filename))
        end_of_sentence = self._LINE_BREAK + "\n"
        start = False
        N = 0
        with open("%ss.html" % filename, 'rt') as fic:
            while True:
                line = fic.readline()
                if not line or N > 5:
                    break
                if self._LEXIQUE_START_MARKER in line:
                    start = True  # start processing the file only after the marker
                if not start or end_of_sentence not in line:
                    continue
                # FR sentences with "<br/>" at the end of the sentence :
                line_fr = clean_html(line.split(self._LINE_BREAK)[0])
                if not line_fr:
                    continue
                line_als = line
                # NOTE(review): `line` always contains the end-of-sentence
                # marker at this point, so this loop body never runs as written.
                while end_of_sentence not in line_als:
                    raw = fic.readline()
                    if not raw:
                        break  # end of file: do not loop forever
                    line_als = clean_html(extract_between_tags(raw, 'i'))
                # FIXME Does not work well. No way to distinguish the languages !!
                # How to extract in the pdf depending on the color text ?
                # Fill the database:
                self.db.append({'fr': line_fr, 'als': line_als})
                print({'fr': line_fr, 'als': line_als})
                N = N + 1
                self.count_sentences_words_als(line_als)
                self.count_sentences_words_fr(line_fr)
                # NOTE(review): placement of this `break` is inferred (the
                # original indentation is unknown); it stops after the first pair.
                break
                # TODO
                # - same for titles, in bold
        if display:
            self._print_counts()

    # ----------------------------------------------------------------------------------------------
    def create_db(self):
        """
        Create a dictionary for each sentence: {'fr': 'ksjdfdk', 'als':'rtefv'}
        """