Spaces:

NghiTran1009
/

annotator-tool

Sleeping

App Files Community

NghiTran1009 committed on Mar 26, 2025

Commit

f5b15ce

verified ·

1 Parent(s): da4e37a

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -56

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import streamlit as st
 import pandas as pd
 import joblib
-import os  # Fix lỗi FileNotFoundError
 # Load mô hình CRF đã train
 @st.cache_resource
@@ -14,58 +13,30 @@ crf = load_model()
 def extract_features(sentence):
     """Build one CRF feature dict per token of a sentence.

     Pairs the entries of ``sentence["Token"]`` and ``sentence["POS"]``
     positionally (stopping at the shorter of the two) and returns a list
     of ``{"word": ..., "pos": ...}`` dicts in token order.
     """
     tokens = sentence["Token"]
     tags = sentence["POS"]
     features = []
     for word, tag in zip(tokens, tags):
         features.append({"word": word, "pos": tag})
     return features
-# Load dữ liệu test
-df_test = pd.read_csv("vi-chunk-test.csv")
-# Nhóm dữ liệu theo câu
-sentences = [group for _, group in df_test.groupby("Sentence_ID")]
-# Giao diện Annotator
-st.title("📝 Tool Annotator - Chỉnh sửa dữ liệu chunking")
-# Chọn câu để annotate
-sentence_id = st.selectbox("Chọn câu", range(1, len(sentences) + 1))
-sentence = sentences[sentence_id - 1].copy()  # Tạo bản sao tránh ảnh hưởng dữ liệu gốc
-# Dự đoán bằng mô hình CRF
-X_test = [extract_features(sentence)]
-y_pred = crf.predict(X_test)[0]
-# Thêm nhãn dự đoán vào dataframe
-sentence["Predicted_Chunk"] = y_pred
-sentence["Is_Correct"] = sentence["Chunk"] == sentence["Predicted_Chunk"]  # Kiểm tra dự đoán
-# **Ẩn cột `Chunk` trong tất cả các bảng**
-sentence_display = sentence.drop(columns=["Chunk"])  # Bỏ hiển thị cột `Chunk`
-# Highlight lỗi: Màu đỏ nếu `Predicted_Chunk` sai
-def highlight_errors(row):
-    return ["background-color: #FF9999" if not row["Is_Correct"] else "" for _ in row]
-# Hiển thị số lượng đúng/sai
-num_correct = sentence["Is_Correct"].sum()
-num_wrong = len(sentence) - num_correct
-st.write(f"✅ **Số token đúng**: {num_correct} / {len(sentence)}")
-st.write(f"❌ **Số token sai**: {num_wrong}")
-# **🔹 Hiển thị bảng với highlight lỗi (ẨN `Chunk`)**
-st.write("🔹 **Câu gốc (Highlight lỗi màu đỏ)**")
-st.dataframe(sentence_display.style.apply(highlight_errors, axis=1))
-# **🔹 Annotator chỉnh sửa `Predicted_Chunk` (ẨN `Chunk`)**
-edited_df = st.data_editor(
-    sentence_display[["Token", "POS", "Predicted_Chunk"]],  # Chỉ hiển thị các cột cần thiết
-    num_rows="dynamic",  # Cho phép thêm hàng ở bất kỳ đâu
-    key=f"edit_table_{sentence_id}"
-)
-# Kiểm tra nếu file tồn tại trước khi tải
-if os.path.exists("corrected_data.csv"):
-    with open("corrected_data.csv", "rb") as file:
-        st.download_button("📥 Tải xuống corrected_data.csv", file, "corrected_data.csv")
-# Lưu lại dữ liệu chỉnh sửa
-if st.button("Lưu chỉnh sửa"):
-    sentence.update(edited_df)  # Cập nhật lại dữ liệu chỉnh sửa vào dataframe gốc
-    sentence.to_csv("corrected_data.csv", index=False, encoding="utf-8", columns=["Token", "POS", "Predicted_Chunk"])  # Chỉ lưu các cột cần thiết
-    st.success("✅ Dữ liệu đã được lưu thành corrected_data.csv!")

 import streamlit as st
 import pandas as pd
 import joblib
 # Load mô hình CRF đã train
 @st.cache_resource
 def extract_features(sentence):
     """Build one CRF feature dict per token of a sentence.

     Pairs the entries of ``sentence["Token"]`` and ``sentence["POS"]``
     positionally (stopping at the shorter of the two) and returns a list
     of ``{"word": ..., "pos": ...}`` dicts in token order.
     """
     tokens = sentence["Token"]
     tags = sentence["POS"]
     features = []
     for word, tag in zip(tokens, tags):
         features.append({"word": word, "pos": tag})
     return features
+st.title("📝 Tool Gán Nhãn Chunking")
+# Upload file
+uploaded_file = st.file_uploader("📤 Tải lên file CSV chứa dữ liệu cần gán nhãn", type=["csv"])
+if uploaded_file:
+    df_test = pd.read_csv(uploaded_file)
+    sentences = [group for _, group in df_test.groupby("Sentence_ID")]
+    # Gán nhãn chunking
+    all_sentences = []
+    for sentence in sentences:
+        X_test = [extract_features(sentence)]
+        y_pred = crf.predict(X_test)[0]
+        sentence["Chunk"] = y_pred  # Gán nhãn dự đoán
+        all_sentences.append(sentence)
+    # Ghép lại thành dataframe
+    df_chunked = pd.concat(all_sentences)
+    # Xuất file CSV
+    output_file = "chunked_data.csv"
+    df_chunked.to_csv(output_file, index=False, encoding="utf-8")
+    # Nút tải xuống
+    with open(output_file, "rb") as file:
+        st.download_button("📥 Tải xuống dữ liệu đã gán nhãn", file, output_file)