Spaces:

NghiTran1009
/

annotator-tool

Sleeping

App Files Community

NghiTran1009 committed on Mar 27, 2025

Commit

c6f00b2

verified ·

1 Parent(s): 9d7548a

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -11

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import streamlit as st
 import pandas as pd
 import joblib
 # Load mô hình CRF đã train
 @st.cache_resource
@@ -16,15 +17,22 @@ def extract_features(sentence):
 st.title("📝 Preprocessing tool")
 # Upload file
-uploaded_file = st.file_uploader("📤 Tải lên file CSV chứa dữ liệu cần gán nhãn", type=["csv"])
 if uploaded_file:
-    df_test = pd.read_csv(uploaded_file)
     # Xóa cột Chunk nếu tồn tại
     if "Chunk" in df_test.columns:
         df_test = df_test.drop(columns=["Chunk"])
     sentences = [group for _, group in df_test.groupby("Sentence_ID")]
     # Gán nhãn chunking
@@ -37,11 +45,18 @@ if uploaded_file:
     # Ghép lại thành dataframe
     df_chunked = pd.concat(all_sentences)
-    # Xuất file CSV
-    output_file = "chunked_data.csv"
-    df_chunked.to_csv(output_file, index=False, encoding="utf-8")
-    # Nút tải xuống
-    with open(output_file, "rb") as file:
-        st.download_button("📥 Tải xuống dữ liệu đã gán nhãn", file, output_file)

 import streamlit as st
 import pandas as pd
 import joblib
+import io
 # Load mô hình CRF đã train
 @st.cache_resource
 st.title("📝 Preprocessing tool")
 # Upload file
+uploaded_file = st.file_uploader("📤 Tải lên file CSV hoặc Excel chứa dữ liệu cần gán nhãn", type=["csv", "xlsx"])
 if uploaded_file:
+    # Kiểm tra định dạng file
+    file_type = uploaded_file.name.split(".")[-1]
+    if file_type == "csv":
+        df_test = pd.read_csv(uploaded_file)
+    else:  # file_type == "xlsx"
+        df_test = pd.read_excel(uploaded_file)
     # Xóa cột Chunk nếu tồn tại
     if "Chunk" in df_test.columns:
         df_test = df_test.drop(columns=["Chunk"])
+    # Nhóm theo câu
     sentences = [group for _, group in df_test.groupby("Sentence_ID")]
     # Gán nhãn chunking
     # Ghép lại thành dataframe
     df_chunked = pd.concat(all_sentences)
+    # Tạo file CSV để tải xuống
+    csv_buffer = io.StringIO()
+    df_chunked.to_csv(csv_buffer, index=False, encoding="utf-8")
+    csv_data = csv_buffer.getvalue()
+    # Tạo file Excel để tải xuống
+    excel_buffer = io.BytesIO()
+    with pd.ExcelWriter(excel_buffer, engine="xlsxwriter") as writer:
+        df_chunked.to_excel(writer, index=False, sheet_name="Chunked Data")
+    excel_data = excel_buffer.getvalue()
+    # Hiển thị nút tải xuống
+    st.download_button("📥 Tải xuống dữ liệu đã gán nhãn (CSV)", csv_data, "chunked_data.csv", "text/csv")
+    st.download_button("📥 Tải xuống dữ liệu đã gán nhãn (Excel)", excel_data, "chunked_data.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")