data files
Browse files- conv_text_to_csv.py +57 -0
- sun_also_rises.txt +0 -0
- sunalsorises.csv +0 -0
conv_text_to_csv.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""Convert a plain-text file into a one-column CSV of period-terminated sentences."""

import re
from typing import Iterable, List

import pandas as pd

filename = '/Users/steve/sun_also_rises.txt'
output_filename = '/Users/steve/sunalsorises.csv'

# Compiled once: applied to every input line to collapse runs of spaces.
_MULTI_SPACE = re.compile(' +')


def split_sentences(lines: Iterable[str], verbose: bool = False) -> List[str]:
    """Split lines of text into sentences, joining sentences that wrap across lines.

    Each line has runs of spaces collapsed to a single space and newline
    characters removed, and is then split on ``'.'``.  Text that is not yet
    terminated by a period is carried over and prefixed to the next sentence
    that is completed on a later line.

    Only ``'.'`` is treated as a sentence terminator.  Empty pieces produced
    by consecutive periods are discarded, so ``"..."`` contributes a single
    period.  Text that follows the last period of the input is never emitted.

    Args:
        lines: Iterable of text lines (for example an open text file).
        verbose: When true, print one diagnostic line per input line.

    Returns:
        The completed sentences, each ending in ``'.'`` and with leading
        whitespace stripped, in input order.
    """
    completed: List[str] = []
    fragment = ''  # unfinished sentence text carried over from earlier lines

    for i, raw_line in enumerate(lines):
        line = _MULTI_SPACE.sub(' ', raw_line).replace('\n', '')
        # Drop empty pieces (e.g. the one after a trailing '.').
        pieces = [piece for piece in line.split('.') if piece]

        if verbose:
            print(f"\n line {i}: {line}, len(line) = {len(line)}, numsent = {len(pieces)}")

        # Don't bother with lines of 0 or 1 characters, or lines made only of periods.
        if len(line) <= 1 or not pieces:
            continue

        ends_with_period = line.endswith('.')

        # Every piece is a finished sentence when the line ends with a period;
        # otherwise the last piece is still open and must wait for a later line.
        # (Previously the last piece of a multi-sentence line ending in '.'
        # was silently dropped.)
        finished = pieces if ends_with_period else pieces[:-1]
        for piece in finished:
            completed.append((fragment + piece + '.').lstrip())
            fragment = ''

        if not ends_with_period:
            fragment += pieces[-1] + ' '

    return completed


def main() -> None:
    """Read ``filename``, split it into sentences and write them to ``output_filename``."""
    # Explicit encoding so the result does not depend on the platform locale.
    with open(filename, "r", encoding="utf-8") as myfile:
        sentences = split_sentences(myfile, verbose=True)

    df = pd.DataFrame({'sentence': sentences})
    df.to_csv(output_filename, index=False)


if __name__ == "__main__":
    main()
|
sun_also_rises.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sunalsorises.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|