Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import joblib | |
| import io | |
| import os | |
def load_model():
    """Load the trained CRF chunking model from ``chunking_crf_model.pkl``.

    The path is relative, so it is resolved against the current working
    directory of the process running the app.

    Returns:
        The object deserialized by ``joblib.load``.
    """
    return joblib.load("chunking_crf_model.pkl")


# Streamlit re-executes this script on every user interaction; caching keeps
# the model in memory instead of unpickling it again on each rerun. The
# hasattr guard keeps the app working on Streamlit releases that predate
# st.cache_resource (the loader is then simply called uncached, as before).
if hasattr(st, "cache_resource"):
    load_model = st.cache_resource(load_model)

# Module-level model instance used by the prediction loop further down.
crf = load_model()
def extract_features(sentence):
    """Build one feature dict per token of a sentence.

    Args:
        sentence: Mapping-like object (e.g. a DataFrame or dict) whose
            ``"Token"`` and ``"POS"`` entries are iterables walked in
            lockstep; iteration stops at the shorter of the two.

    Returns:
        A list of ``{"word": ..., "pos": ...}`` dicts, in token order.
    """
    features = []
    for word, tag in zip(sentence["Token"], sentence["POS"]):
        features.append({"word": word, "pos": tag})
    return features
# --- Streamlit page: upload a token table, predict chunk labels, offer downloads ---
st.title("📝 Preprocessing tool")

uploaded_file = st.file_uploader(
    "📤 Tải lên file CSV hoặc Excel chứa dữ liệu cần gán nhãn",
    type=["csv", "xlsx"],
)

if uploaded_file:
    # Lower-case the extension so a name such as "DATA.CSV" is still read as
    # CSV instead of falling through to the Excel reader.
    file_type = uploaded_file.name.split(".")[-1].lower()
    try:
        if file_type == "csv":
            df_test = pd.read_csv(uploaded_file)
        else:
            df_test = pd.read_excel(uploaded_file, engine="openpyxl")

        # A tuple (not a set) keeps the column order in the error message
        # stable from one run to the next.
        required_columns = ("Sentence_ID", "Token", "POS")
        if not set(required_columns).issubset(df_test.columns):
            st.error(f"⚠️ File phải chứa các cột: {', '.join(required_columns)}")
        else:
            # Discard any existing labels; they are re-predicted below.
            if "Chunk" in df_test.columns:
                df_test = df_test.drop(columns=["Chunk"])

            # sort=False keeps sentences in order of first appearance, so the
            # output follows the uploaded file instead of being re-sorted by
            # Sentence_ID (which would put e.g. "s10" before "s2").
            # NOTE(review): groupby's default NaN handling leaves out rows
            # whose Sentence_ID is missing — confirm that is acceptable.
            all_sentences = []
            for _, group in df_test.groupby("Sentence_ID", sort=False):
                sentence = group.copy()
                # The model is called with a batch of one sequence; [0]
                # unwraps the labels predicted for this sentence.
                y_pred = crf.predict([extract_features(sentence)])[0]
                sentence["Chunk"] = y_pred
                all_sentences.append(sentence)
            df_chunked = pd.concat(all_sentences)

            # to_csv without a target returns the CSV text directly.
            csv_data = df_chunked.to_csv(index=False)

            excel_buffer = io.BytesIO()
            with pd.ExcelWriter(excel_buffer, engine="openpyxl") as writer:
                df_chunked.to_excel(writer, index=False, sheet_name="Chunked Data")
            # Read the bytes only after the writer has been closed.
            excel_data = excel_buffer.getvalue()

            # Name the downloads after the uploaded file's stem.
            file_base_name = os.path.splitext(uploaded_file.name)[0]
            csv_filename = f"{file_base_name}_chunked.csv"
            excel_filename = f"{file_base_name}_chunked.xlsx"

            st.download_button(
                "📥 Tải xuống dữ liệu đã gán nhãn (CSV)",
                csv_data,
                csv_filename,
                "text/csv",
            )
            st.download_button(
                "📥 Tải xuống dữ liệu đã gán nhãn (Excel)",
                excel_data,
                excel_filename,
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            )
    except Exception as e:
        # Top-level UI boundary: show any read/predict/export failure to the
        # user instead of crashing the page.
        st.error(f"❌ Lỗi khi xử lý file: {e}")