Spaces:
Runtime error
Runtime error
modify app.py
Browse files
app.py
CHANGED
|
@@ -49,6 +49,16 @@ class MGTHuman(datasets.GeneratorBasedBuilder):
|
|
| 49 |
return text
|
| 50 |
count += 1
|
| 51 |
return "Index 超出范围,请输入有效的数字。"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
# Streamlit UI
|
| 54 |
st.title("MGTHuman Dataset Viewer")
|
|
@@ -65,30 +75,33 @@ if uploaded_folder:
|
|
| 65 |
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
| 66 |
zip_ref.extractall(folder_path)
|
| 67 |
|
| 68 |
-
#
|
| 69 |
category = {}
|
| 70 |
-
for json_file in folder_path.
|
| 71 |
domain = json_file.stem.split('_task3')[0]
|
| 72 |
category.setdefault(domain, []).append(str(json_file))
|
| 73 |
|
| 74 |
-
# 显示可用的 domain
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
# 输入序号查看文本
|
| 81 |
-
index_to_view = st.number_input("输入要查看的文本序号", min_value=0, step=1)
|
| 82 |
-
|
| 83 |
-
if st.button("显示文本"):
|
| 84 |
-
# 选择第一个文件进行展示
|
| 85 |
file_to_display = category[selected_domain][0]
|
| 86 |
mgt_human = MGTHuman(name=selected_domain)
|
| 87 |
-
|
| 88 |
-
st.write("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
# 清理上传文件的临时目录
|
| 91 |
if st.button("清除文件"):
|
| 92 |
import shutil
|
| 93 |
shutil.rmtree("temp")
|
| 94 |
-
st.write("临时文件已清除。")
|
|
|
|
| 49 |
return text
|
| 50 |
count += 1
|
| 51 |
return "Index 超出范围,请输入有效的数字。"
|
| 52 |
+
|
| 53 |
+
def count_entries(self, filepath):
    """Return how many rows in the JSON file have non-blank text.

    The result is used by the UI to derive the valid index range.

    Args:
        filepath: Path to a JSON file whose top-level value is an iterable
            of rows, each providing a string under the "text" key.

    Returns:
        The number of rows whose "text" is not empty after stripping
        surrounding whitespace.
    """
    # Decode explicitly as UTF-8 so reading does not depend on the
    # platform's locale default encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return sum(1 for row in data if row["text"].strip())
|
| 62 |
|
| 63 |
# Streamlit UI
|
| 64 |
st.title("MGTHuman Dataset Viewer")
|
|
|
|
| 75 |
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
| 76 |
zip_ref.extractall(folder_path)
|
| 77 |
|
| 78 |
+
# Recursively collect every JSON file under the extracted folder and group
# the paths by domain (the part of the file stem before '_task3').
category = {}
for json_file in folder_path.rglob("*.json"):
    domain = json_file.stem.split('_task3')[0]
    category.setdefault(domain, []).append(str(json_file))

if category:
    # Let the user pick one of the discovered domains.
    selected_domain = st.selectbox("选择数据种类", options=list(category.keys()))

    # Only the first file of the selected domain is inspected; its entry
    # count determines the index range offered to the user.
    file_to_display = category[selected_domain][0]
    mgt_human = MGTHuman(name=selected_domain)
    total_entries = mgt_human.count_entries(file_to_display)

    if total_entries > 0:
        st.write(f"可用的索引范围: 0 到 {total_entries - 1}")

        # Ask for the index of the text to display.
        index_to_view = st.number_input("输入要查看的文本序号", min_value=0, max_value=total_entries - 1, step=1)

        if st.button("显示文本"):
            text = mgt_human.get_text_by_index(file_to_display, index=index_to_view)
            st.write("对应的文本内容为:", text)
    else:
        # With no entries, max_value would be -1 (< min_value), which
        # st.number_input rejects; report the empty file instead.
        st.write("该文件中没有可显示的文本条目。")
else:
    st.write("未找到任何 JSON 文件,请检查 ZIP 文件结构。")
|
| 102 |
|
| 103 |
# Remove the temporary directory that holds the uploaded files.
if st.button("清除文件"):
    import shutil
    try:
        shutil.rmtree("temp")
    except FileNotFoundError:
        # The directory may not exist (e.g. it was already cleared);
        # there is nothing left to remove, so the outcome is the same.
        pass
    st.write("临时文件已清除。")
|