hiDenorIYamano committed on
Commit
cd646ca
·
1 Parent(s): 4b01a04

ホワイトリスト追加、最大行数追加

Browse files
Files changed (1) hide show
  1. app.py +29 -8
app.py CHANGED
@@ -8,6 +8,9 @@ def split_japanese_line(line, max_length):
8
  if len(line) < max_length:
9
  return line
10
 
 
 
 
11
  m = MeCab.Tagger()
12
  nodes = m.parse(line).split("\n")
13
 
@@ -18,13 +21,24 @@ def split_japanese_line(line, max_length):
18
  lines = []
19
  current_line = ""
20
 
21
- for idx, word in enumerate(words):
 
 
22
  # If adding the next word exceeds the max_length or if the word is a particle, break the line
23
- if len(word_features) > idx - 1:
24
- if word_features[idx - 1] == "助詞-格助詞":
 
 
 
 
25
  lines.append(current_line)
26
  current_line = ""
27
- current_line += word
 
 
 
 
 
28
 
29
  # Append the last line if it exists
30
  if current_line:
@@ -32,13 +46,20 @@ def split_japanese_line(line, max_length):
32
 
33
  # Merge lines to ensure each line is less than or equal to max_length
34
  merged_lines = []
35
- temp_line = ""
36
  for line in lines:
37
- if len(temp_line + line) <= max_length:
 
 
 
 
38
  temp_line += line
39
  else:
40
- merged_lines.append(temp_line)
41
- temp_line = line
 
 
 
42
  if temp_line:
43
  merged_lines.append(temp_line)
44
 
 
8
  if len(line) < max_length:
9
  return line
10
 
11
+ max_line = 2
12
+ line = re.sub(r'[、。]', '', line)
13
+
14
  m = MeCab.Tagger()
15
  nodes = m.parse(line).split("\n")
16
 
 
21
  lines = []
22
  current_line = ""
23
 
24
+ idx = 0
25
+ while idx < len(words):
26
+ word = words[idx]
27
  # If adding the next word exceeds the max_length or if the word is a particle, break the line
28
+ if idx + 1 < len(words) and any(
29
+ whitelisted_word == word + words[idx + 1] for whitelisted_word in ["という", "っていう", "になって"]):
30
+ current_line += word + words[idx + 1]
31
+ idx += 2 # Increment to skip next word
32
+ elif len(word_features) > idx - 1:
33
+ if word_features[idx - 1] in ["助詞-格助詞", "助詞-副助詞", "助詞-終助詞"]:
34
  lines.append(current_line)
35
  current_line = ""
36
+ current_line += word
37
+ idx += 1
38
+ else:
39
+ current_line += word
40
+ idx += 1
41
+
42
 
43
  # Append the last line if it exists
44
  if current_line:
 
46
 
47
  # Merge lines to ensure each line is less than or equal to max_length
48
  merged_lines = []
49
+ temp_line = ''
50
  for line in lines:
51
+ if len(temp_line) <= 12:
52
+ temp_line += line
53
+ elif len(temp_line + line) <= max_length:
54
+ temp_line += line
55
+ elif len(merged_lines) >= max_line - 1:
56
  temp_line += line
57
  else:
58
+ if temp_line == '':
59
+ merged_lines.append(line)
60
+ else:
61
+ merged_lines.append(temp_line)
62
+ temp_line = line
63
  if temp_line:
64
  merged_lines.append(temp_line)
65