Spaces:
Runtime error
Runtime error
modify app.py
Browse files
app.py
CHANGED
|
@@ -68,7 +68,7 @@ class MGTHuman(datasets.GeneratorBasedBuilder):
|
|
| 68 |
else:
|
| 69 |
return text
|
| 70 |
|
| 71 |
-
def get_text_by_index(self, filepath, index):
|
| 72 |
count = 0
|
| 73 |
with open(filepath, 'r') as f:
|
| 74 |
data = json.load(f)
|
|
@@ -76,7 +76,9 @@ class MGTHuman(datasets.GeneratorBasedBuilder):
|
|
| 76 |
if not row["text"].strip():
|
| 77 |
continue
|
| 78 |
if count == index:
|
| 79 |
-
text =
|
|
|
|
|
|
|
| 80 |
return text
|
| 81 |
count += 1
|
| 82 |
return "Index 超出范围,请输入有效的数字。"
|
|
@@ -124,9 +126,12 @@ if uploaded_folder:
|
|
| 124 |
|
| 125 |
# 输入序号查看文本
|
| 126 |
index_to_view = st.number_input("输入要查看的文本序号", min_value=0, max_value=total_entries - 1, step=1)
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
if st.button("显示文本"):
|
| 129 |
-
text = mgt_human.get_text_by_index(file_to_display, index=index_to_view)
|
| 130 |
st.write("对应的文本内容为:", text)
|
| 131 |
else:
|
| 132 |
st.write("未找到任何 JSON 文件,请检查 ZIP 文件结构。")
|
|
|
|
| 68 |
else:
|
| 69 |
return text
|
| 70 |
|
| 71 |
+
def get_text_by_index(self, filepath, index, cut_tokens=False, max_tokens=2048):
|
| 72 |
count = 0
|
| 73 |
with open(filepath, 'r') as f:
|
| 74 |
data = json.load(f)
|
|
|
|
| 76 |
if not row["text"].strip():
|
| 77 |
continue
|
| 78 |
if count == index:
|
| 79 |
+
text = row["text"]
|
| 80 |
+
if cut_tokens:
|
| 81 |
+
text = self.truncate_text(text, max_tokens)
|
| 82 |
return text
|
| 83 |
count += 1
|
| 84 |
return "Index 超出范围,请输入有效的数字。"
|
|
|
|
| 126 |
|
| 127 |
# 输入序号查看文本
|
| 128 |
index_to_view = st.number_input("输入要查看的文本序号", min_value=0, max_value=total_entries - 1, step=1)
|
| 129 |
+
|
| 130 |
+
# 添加复选框以选择是否切割文本
|
| 131 |
+
cut_tokens = st.checkbox("是否对文本进行token切割", value=False)
|
| 132 |
|
| 133 |
if st.button("显示文本"):
|
| 134 |
+
text = mgt_human.get_text_by_index(file_to_display, index=index_to_view, cut_tokens=cut_tokens)
|
| 135 |
st.write("对应的文本内容为:", text)
|
| 136 |
else:
|
| 137 |
st.write("未找到任何 JSON 文件,请检查 ZIP 文件结构。")
|