geinitz committed on
Commit
5a53e95
·
1 Parent(s): d567d76

data files

Browse files
Files changed (3) hide show
  1. conv_text_to_csv.py +57 -0
  2. sun_also_rises.txt +0 -0
  3. sunalsorises.csv +0 -0
conv_text_to_csv.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Split a plain-text book into sentences and save them as a one-column CSV.

The input text is read line by line.  A sentence is taken to be any run of
text terminated by a period; text that is not yet terminated at the end of a
line is carried over and prefixed to the next line's first sentence.
"""

import pandas as pd
import re

filename = '/Users/steve/sun_also_rises.txt'
output_filename = '/Users/steve/sunalsorises.csv'

# Compiled once: collapses any run of spaces into a single space.
_MULTI_SPACE = re.compile(r' +')


def split_sentences(lines, verbose=False):
    """Return the period-terminated sentences found in *lines*.

    Args:
        lines: iterable of text lines (a trailing newline on each is fine).
        verbose: when true, print a per-line debug summary.

    Returns:
        A list of sentences in reading order.  Each sentence ends with '.',
        except possibly the last entry, which holds any unterminated text
        left over at the end of the input so that nothing is lost.

    Note:
        Only '.' is treated as a sentence terminator, so abbreviations such
        as "Mr." are split like sentence ends.
    """
    found = []
    sentence_fragment = ''  # unterminated text carried over from prior lines

    for i, raw_line in enumerate(lines):
        line = _MULTI_SPACE.sub(' ', raw_line).replace('\n', '')
        # Drop empty pieces (e.g. the one after a line-ending '.').
        sentences = [sent for sent in line.split('.') if sent]
        if verbose:
            print(f"\n line {i}: {line}, len(line) = {len(line)}, numsent = {len(sentences)}")

        # Don't bother with lines of 0 or 1 characters, or lines made only of periods.
        if len(line) <= 1 or not sentences:
            continue

        ends_with_period = line.endswith('.')

        # Every piece is a complete sentence when the line ends with a period;
        # otherwise the last piece is still open and must be carried over.
        # (Previously the last sentence of a multi-sentence line that ended
        # with a period was neither emitted nor carried, and so was dropped.)
        complete = sentences if ends_with_period else sentences[:-1]
        for sent in complete:
            found.append((sentence_fragment + sent + '.').lstrip())
            sentence_fragment = ''

        if not ends_with_period:
            sentence_fragment += sentences[-1] + ' '

    # Flush text that never reached a period instead of silently discarding it.
    leftover = sentence_fragment.strip()
    if leftover:
        found.append(leftover)

    return found


def main():
    """Read the source text, split it into sentences, and write the CSV."""
    # The with-statement closes the file; no explicit close() is needed.
    with open(filename, "r") as myfile:
        lines = myfile.readlines()

    sentences = split_sentences(lines, verbose=True)

    df = pd.DataFrame({'sentence': sentences})
    df.to_csv(output_filename, index=False)


if __name__ == "__main__":
    main()
sun_also_rises.txt ADDED
The diff for this file is too large to render. See raw diff
 
sunalsorises.csv ADDED
The diff for this file is too large to render. See raw diff