File size: 2,491 Bytes
aeb7075
 
 
c6f00b2
8f43a80
aeb7075
6813e8f
 
 
aeb7075
6813e8f
aeb7075
 
 
 
9d7548a
f5b15ce
c6f00b2
f5b15ce
 
c6f00b2
 
4ba1af3
 
 
8f43a80
 
4ba1af3
 
 
 
 
 
 
 
 
 
 
 
 
 
8f43a80
4ba1af3
 
 
 
 
 
 
 
 
 
 
 
 
8f43a80
 
 
 
 
 
4ba1af3
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import streamlit as st
import pandas as pd
import joblib
import io
import os

@st.cache_resource
def load_model():
    """Load and return the object stored in ``chunking_crf_model.pkl``.

    The path is relative, so it is resolved against the process's current
    working directory. The function is wrapped in ``st.cache_resource``.

    NOTE: ``joblib.load`` unpickles the file, so the model file must come
    from a trusted source.
    """
    model = joblib.load("chunking_crf_model.pkl")
    return model

# Module-level handle to the object returned by load_model(); the upload
# handler further down calls crf.predict(...) on it for every sentence.
crf = load_model()

def extract_features(sentence):
    """Build one feature dict per token of a sentence.

    Args:
        sentence: Any object indexable by ``"Token"`` and ``"POS"`` that
            yields two iterables (the caller in this file passes a
            per-sentence DataFrame slice).

    Returns:
        A list of ``{"word": token, "pos": pos}`` dicts, in token order.
        The two columns are walked in lockstep, so the result is as long as
        the shorter of the two.
    """
    features = []
    for word, tag in zip(sentence["Token"], sentence["POS"]):
        features.append({"word": word, "pos": tag})
    return features

# ---------------------------------------------------------------------------
# Page body: upload a token table (CSV/XLSX), predict a "Chunk" label for
# every token with the loaded CRF model, and offer the labelled table for
# download as CSV and Excel.
# ---------------------------------------------------------------------------
st.title("📝 Preprocessing tool")

uploaded_file = st.file_uploader("📤 Tải lên file CSV hoặc Excel chứa dữ liệu cần gán nhãn", type=["csv", "xlsx"])

if uploaded_file:
    # Lower-case the extension so that e.g. "DATA.CSV" takes the CSV reader
    # instead of falling through to the Excel reader.
    file_type = uploaded_file.name.split(".")[-1].lower()

    # Top-level UI boundary: any failure while reading or labelling the file
    # is reported to the user instead of crashing the page.
    try:
        if file_type == "csv":
            df_test = pd.read_csv(uploaded_file)
        else:
            df_test = pd.read_excel(uploaded_file, engine="openpyxl")

        # A tuple (not a set) keeps the column order in the error message
        # stable from run to run.
        required_columns = ("Sentence_ID", "Token", "POS")
        if not set(required_columns).issubset(df_test.columns):
            st.error(f"⚠️ File phải chứa các cột: {', '.join(required_columns)}")
        else:
            # Any pre-existing labels are discarded and replaced by predictions.
            if "Chunk" in df_test.columns:
                df_test = df_test.drop(columns=["Chunk"])

            all_sentences = []
            # sort=False keeps sentences in the order they first appear in the
            # upload rather than re-sorting them by Sentence_ID (which would
            # reorder string IDs lexicographically, e.g. "10" before "2").
            # NOTE(review): rows whose Sentence_ID is missing are not part of
            # any group under pandas' default dropna behaviour — confirm this
            # is acceptable.
            for _, group in df_test.groupby("Sentence_ID", sort=False):
                sentence = group.copy()  # work on a copy, not a groupby view
                X_test = [extract_features(sentence)]
                y_pred = crf.predict(X_test)[0]  # one sequence in, one out
                sentence.loc[:, "Chunk"] = y_pred
                all_sentences.append(sentence)

            if not all_sentences:
                # pd.concat([]) would raise "No objects to concatenate";
                # tell the user plainly that there was nothing to label.
                st.warning("⚠️ File không chứa dòng dữ liệu nào để gán nhãn.")
            else:
                df_chunked = pd.concat(all_sentences)

                # CSV export, built in memory as text.
                csv_buffer = io.StringIO()
                df_chunked.to_csv(csv_buffer, index=False, encoding="utf-8")
                csv_data = csv_buffer.getvalue()

                # Excel export, built in memory as bytes; the writer must be
                # closed (end of the with-block) before reading the buffer.
                excel_buffer = io.BytesIO()
                with pd.ExcelWriter(excel_buffer, engine="openpyxl") as writer:
                    df_chunked.to_excel(writer, index=False, sheet_name="Chunked Data")
                excel_data = excel_buffer.getvalue()

                # Download names reuse the uploaded file's stem.
                file_base_name = os.path.splitext(uploaded_file.name)[0]
                csv_filename = f"{file_base_name}_chunked.csv"
                excel_filename = f"{file_base_name}_chunked.xlsx"

                st.download_button(
                    "📥 Tải xuống dữ liệu đã gán nhãn (CSV)",
                    data=csv_data,
                    file_name=csv_filename,
                    mime="text/csv",
                )
                st.download_button(
                    "📥 Tải xuống dữ liệu đã gán nhãn (Excel)",
                    data=excel_data,
                    file_name=excel_filename,
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                )

    except Exception as e:
        st.error(f"❌ Lỗi khi xử lý file: {e}")