Spaces:
Running
Running
| from nltk import word_tokenize | |
| import wordninja | |
def get_average_words_per_line(lines):
    """Return the mean number of NLTK tokens per line.

    Args:
        lines: A sized collection of strings (``len(lines)`` must work).

    Returns:
        The total token count over all lines, as produced by
        ``word_tokenize``, divided by the number of lines. ``0.0`` is
        returned for an empty collection instead of dividing by zero.
    """
    line_count = len(lines)
    if line_count == 0:
        # No lines means no meaningful average; avoid ZeroDivisionError.
        return 0.0
    # Use the builtin ``sum`` rather than shadowing it with a local counter.
    total_tokens = sum(len(word_tokenize(line)) for line in lines)
    return total_tokens / line_count
def get_average_line_len(lines):
    """Return the mean length, in characters, of the given lines.

    Args:
        lines: A sized collection of strings (``len(lines)`` must work).

    Returns:
        The sum of ``len(line)`` over all lines divided by the number of
        lines. ``0.0`` is returned for an empty collection instead of
        dividing by zero.
    """
    line_count = len(lines)
    if line_count == 0:
        # No lines means no meaningful average; avoid ZeroDivisionError.
        return 0.0
    # Use the builtin ``sum`` rather than shadowing it with a local counter.
    return sum(len(line) for line in lines) / line_count
def percentage_difference(value1: float, value2: float) -> float:
    """Return the symmetric percentage difference between two values.

    The absolute difference is expressed as a percentage of the mean of
    the two values: ``|value1 - value2| / ((value1 + value2) / 2) * 100``.

    Args:
        value1: First value.
        value2: Second value.

    Returns:
        The percentage difference. ``0.0`` when the values are equal,
        including when both are zero.

    Raises:
        ZeroDivisionError: If the values differ but sum to zero
            (e.g. ``3`` and ``-3``), where the ratio is undefined.
    """
    if value1 == value2:
        # Equal values never differ; returning early also avoids dividing
        # by a zero mean when both values are 0.
        return 0.0
    mean_value = (value1 + value2) / 2
    return (abs(value1 - value2) / mean_value) * 100
def recover_text(line: str, *, threshold: float = 150) -> str:
    """Re-insert spaces into a line whose words appear to be run together.

    The line is tokenised with ``word_tokenize`` and segmented with
    ``wordninja.split``. If the two counts differ by more than
    ``threshold`` percent (as measured by ``percentage_difference``), the
    wordninja segments joined by single spaces are returned; otherwise the
    line is returned unchanged.

    Args:
        line: The text line to inspect.
        threshold: Percentage difference above which the wordninja
            segmentation replaces the line. With the default of 150 the
            replacement happens only when one count is more than seven
            times the other.

    Returns:
        The space-joined segmentation, or ``line`` itself when the counts
        are close enough, the line is blank, or no segments were found.
    """
    if not line.strip():
        # Blank lines have nothing to recover; both counts would be zero,
        # which makes the percentage comparison undefined.
        return line

    # Segment once and reuse the result for both the test and the output.
    segments = wordninja.split(line)
    if not segments:
        # Joining an empty segmentation would return "" and silently drop
        # the line's content, so keep the original text instead.
        # NOTE(review): presumably happens for lines without alphanumeric
        # characters - confirm against wordninja's behaviour.
        return line

    token_count = len(word_tokenize(line))
    if percentage_difference(token_count, len(segments)) > threshold:
        return " ".join(segments)
    return line