"""Streamlit app: upload a CSV/XLSX of tokenized sentences and chunk-label it with a CRF model."""

import streamlit as st
import pandas as pd
import joblib
import io
import os


@st.cache_resource
def load_model():
    """Load the pre-trained CRF chunking model once and cache it for the app's lifetime."""
    return joblib.load("chunking_crf_model.pkl")


crf = load_model()


def extract_features(sentence):
    """Build the per-token feature dicts the CRF expects for one sentence.

    Parameters
    ----------
    sentence : pandas.DataFrame
        Must contain "Token" and "POS" columns (one row per token).

    Returns
    -------
    list[dict]
        One ``{"word": ..., "pos": ...}`` dict per token, in row order.
    """
    return [
        {"word": token, "pos": pos}
        for token, pos in zip(sentence["Token"], sentence["POS"])
    ]


st.title("📝 Preprocessing tool")

uploaded_file = st.file_uploader(
    "📤 Tải lên file CSV hoặc Excel chứa dữ liệu cần gán nhãn",
    type=["csv", "xlsx"],
)

if uploaded_file:
    file_type = uploaded_file.name.split(".")[-1]
    try:
        if file_type == "csv":
            df_test = pd.read_csv(uploaded_file)
        else:
            df_test = pd.read_excel(uploaded_file, engine="openpyxl")

        required_columns = {"Sentence_ID", "Token", "POS"}
        if not required_columns.issubset(df_test.columns):
            # sorted() gives a deterministic column order in the message
            # (joining a set directly is order-unstable across runs).
            st.error(f"⚠️ File phải chứa các cột: {', '.join(sorted(required_columns))}")
        else:
            # Drop any pre-existing predictions so we write fresh labels.
            if "Chunk" in df_test.columns:
                df_test = df_test.drop(columns=["Chunk"])

            sentences = [group.copy() for _, group in df_test.groupby("Sentence_ID")]

            # Batch all sentences into a single predict() call instead of one
            # model invocation per sentence — identical output, much faster.
            X_test = [extract_features(sentence) for sentence in sentences]
            y_pred = crf.predict(X_test)
            for sentence, labels in zip(sentences, y_pred):
                sentence.loc[:, "Chunk"] = labels

            df_chunked = pd.concat(sentences)

            # CSV export. Note: `encoding` is ignored (and deprecated) when
            # to_csv writes to a text buffer, so it is intentionally omitted;
            # getvalue() already returns a str.
            csv_buffer = io.StringIO()
            df_chunked.to_csv(csv_buffer, index=False)
            csv_data = csv_buffer.getvalue()

            # Excel export via an in-memory bytes buffer.
            excel_buffer = io.BytesIO()
            with pd.ExcelWriter(excel_buffer, engine="openpyxl") as writer:
                df_chunked.to_excel(writer, index=False, sheet_name="Chunked Data")
            excel_data = excel_buffer.getvalue()

            # Name the downloads after the uploaded file's base name.
            file_base_name = os.path.splitext(uploaded_file.name)[0]
            csv_filename = f"{file_base_name}_chunked.csv"
            excel_filename = f"{file_base_name}_chunked.xlsx"

            st.download_button(
                "📥 Tải xuống dữ liệu đã gán nhãn (CSV)",
                csv_data,
                csv_filename,
                "text/csv",
            )
            st.download_button(
                "📥 Tải xuống dữ liệu đã gán nhãn (Excel)",
                excel_data,
                excel_filename,
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            )
    except Exception as e:
        # Top-level UI boundary: surface any parse/predict failure to the user.
        st.error(f"❌ Lỗi khi xử lý file: {e}")