Spaces:
Runtime error
Runtime error
Commit ·
cd646ca
1
Parent(s): 4b01a04
ホワイトリスト追加、最大行数追加
Browse files
app.py
CHANGED
|
@@ -8,6 +8,9 @@ def split_japanese_line(line, max_length):
|
|
| 8 |
if len(line) < max_length:
|
| 9 |
return line
|
| 10 |
|
|
|
|
|
|
|
|
|
|
| 11 |
m = MeCab.Tagger()
|
| 12 |
nodes = m.parse(line).split("\n")
|
| 13 |
|
|
@@ -18,13 +21,24 @@ def split_japanese_line(line, max_length):
|
|
| 18 |
lines = []
|
| 19 |
current_line = ""
|
| 20 |
|
| 21 |
-
|
|
|
|
|
|
|
| 22 |
# If adding the next word exceeds the max_length or if the word is a particle, break the line
|
| 23 |
-
if len(
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
lines.append(current_line)
|
| 26 |
current_line = ""
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
# Append the last line if it exists
|
| 30 |
if current_line:
|
|
@@ -32,13 +46,20 @@ def split_japanese_line(line, max_length):
|
|
| 32 |
|
| 33 |
# Merge lines to ensure each line is less than or equal to max_length
|
| 34 |
merged_lines = []
|
| 35 |
-
temp_line =
|
| 36 |
for line in lines:
|
| 37 |
-
if len(temp_line
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
temp_line += line
|
| 39 |
else:
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
| 42 |
if temp_line:
|
| 43 |
merged_lines.append(temp_line)
|
| 44 |
|
|
|
|
| 8 |
if len(line) < max_length:
|
| 9 |
return line
|
| 10 |
|
| 11 |
+
max_line = 2
|
| 12 |
+
line = re.sub(r'[、。]', '', line)
|
| 13 |
+
|
| 14 |
m = MeCab.Tagger()
|
| 15 |
nodes = m.parse(line).split("\n")
|
| 16 |
|
|
|
|
| 21 |
lines = []
|
| 22 |
current_line = ""
|
| 23 |
|
| 24 |
+
idx = 0
|
| 25 |
+
while idx < len(words):
|
| 26 |
+
word = words[idx]
|
| 27 |
# If adding the next word exceeds the max_length or if the word is a particle, break the line
|
| 28 |
+
if idx + 1 < len(words) and any(
|
| 29 |
+
whitelisted_word == word + words[idx + 1] for whitelisted_word in ["という", "っていう", "になって"]):
|
| 30 |
+
current_line += word + words[idx + 1]
|
| 31 |
+
idx += 2 # Increment to skip next word
|
| 32 |
+
elif len(word_features) > idx - 1:
|
| 33 |
+
if word_features[idx - 1] in ["助詞-格助詞", "助詞-副助詞", "助詞-終助詞"]:
|
| 34 |
lines.append(current_line)
|
| 35 |
current_line = ""
|
| 36 |
+
current_line += word
|
| 37 |
+
idx += 1
|
| 38 |
+
else:
|
| 39 |
+
current_line += word
|
| 40 |
+
idx += 1
|
| 41 |
+
|
| 42 |
|
| 43 |
# Append the last line if it exists
|
| 44 |
if current_line:
|
|
|
|
| 46 |
|
| 47 |
# Merge lines to ensure each line is less than or equal to max_length
|
| 48 |
merged_lines = []
|
| 49 |
+
temp_line = ''
|
| 50 |
for line in lines:
|
| 51 |
+
if len(temp_line) <= 12:
|
| 52 |
+
temp_line += line
|
| 53 |
+
elif len(temp_line + line) <= max_length:
|
| 54 |
+
temp_line += line
|
| 55 |
+
elif len(merged_lines) >= max_line - 1:
|
| 56 |
temp_line += line
|
| 57 |
else:
|
| 58 |
+
if temp_line == '':
|
| 59 |
+
merged_lines.append(line)
|
| 60 |
+
else:
|
| 61 |
+
merged_lines.append(temp_line)
|
| 62 |
+
temp_line = line
|
| 63 |
if temp_line:
|
| 64 |
merged_lines.append(temp_line)
|
| 65 |
|