File size: 2,485 Bytes
0f94aaf
 
745cc73
 
87d3f3f
 
745cc73
87d3f3f
 
 
 
 
 
 
 
0f94aaf
745cc73
0f94aaf
 
745cc73
 
0f94aaf
745cc73
 
 
 
 
 
 
87d3f3f
745cc73
 
 
 
0f94aaf
 
745cc73
 
 
 
 
 
 
0f94aaf
745cc73
 
 
 
 
 
 
 
 
 
 
 
0f94aaf
e118a81
 
0f94aaf
 
745cc73
0f94aaf
745cc73
0f94aaf
 
e118a81
745cc73
 
0f94aaf
 
 
 
 
 
745cc73
0f94aaf
 
745cc73
0f94aaf
 
745cc73
0f94aaf
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import gradio as gr
import numpy as np
import joblib
import re
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer

# === Ensure the NLTK stopword corpus is available (download on first run) ===
try:
    stopwords.words("english")
except LookupError:
    # Corpus not yet on disk; fetch it once so the set below can be built.
    nltk.download("stopwords")

# English stopword set consumed by preprocess_text().
stop_words = set(stopwords.words("english"))

# === Load the sentence-embedding model (768-dim mpnet variant — verify dim against trained models) ===
st_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# === Load the trained XGBoost regressors; a dict mapping score-column name -> fitted model ===
models = joblib.load("xgb_models_all.joblib")

# === Preprocessing function ===
def preprocess_text(text: str) -> str:
    """Lowercase *text*, drop non-letter characters, and remove English stopwords.

    Returns "" for non-string or blank input.
    """
    if not isinstance(text, str) or not text.strip():
        return ""
    lowered = text.lower()
    # Normalize Windows line breaks to spaces before stripping characters.
    lowered = re.sub(r"\r\n", " ", lowered)
    letters_only = re.sub(r"[^a-z\s]", "", lowered)
    kept = (tok for tok in letters_only.split() if tok not in stop_words)
    return " ".join(kept)

# === Prediction function ===
def predict(text: str, normalize: bool = True):
    """Score *text* with every loaded model.

    Returns a tuple of (scores dict keyed by model/column name,
    embedding as a list of floats, embedding dimensionality).
    Blank or missing input yields ({}, [], 0).
    """
    text = (text or "").strip()
    if not text:
        return {}, [], 0

    # Clean the text, then embed it (optionally L2-normalized).
    cleaned = preprocess_text(text)
    embedding = st_model.encode([cleaned], normalize_embeddings=normalize)[0]

    # Append the raw character count as an extra feature, matching training.
    features = np.concatenate([embedding, [len(text)]])
    row = features.reshape(1, -1)

    # One prediction per trained model/score column.
    scores = {name: float(mdl.predict(row)[0]) for name, mdl in models.items()}

    return scores, embedding.tolist(), int(embedding.shape[0])

# === Gradio UI: wire predict() to a textbox input and JSON/number outputs ===
with gr.Blocks() as demo:
    gr.Markdown("# Essay Scoring Demo")
    gr.Markdown("Masukkan teks")

    # Input essay text plus an embedding-normalization toggle.
    with gr.Row():
        text_in = gr.Textbox(label="Input Kalimat / Essay", placeholder="Tulis di sini...", lines=5)
    normalize = gr.Checkbox(value=True, label="Normalize embedding (L2)")
    btn = gr.Button("Prediksi", variant="primary")

    # Outputs: per-model scores, the raw embedding vector, and its dimensionality.
    with gr.Row():
        pred_out = gr.JSON(label="Prediksi Skor")
    with gr.Row():
        vec_out = gr.JSON(label="Embedding Vector (list of floats)")
        dim_out = gr.Number(label="Dimensi vektor", interactive=False)

    # Clickable sample inputs shown below the form.
    gr.Examples(
        examples=[
            ["Halo dunia!"],
            ["Machine learning is fun."],
            ["This is a sample essay for IELTS task."],
        ],
        inputs=[text_in],
        label="Contoh input",
    )

    # Button click runs predict(text, normalize) -> (scores, vector, dim).
    btn.click(predict, inputs=[text_in, normalize], outputs=[pred_out, vec_out, dim_out])

# Enable request queuing (required for concurrent users on Spaces-style hosting).
demo.queue()

if __name__ == "__main__":
    demo.launch()