Extract_text / app.py
farwew's picture
Update app.py
e118a81 verified
import gradio as gr
import numpy as np
import joblib
import re
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
# === Make sure the NLTK stop-word corpus is available ===
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    # Corpus lookup failed: download it, then build the set.
    nltk.download("stopwords")
    stop_words = set(stopwords.words("english"))

# === Load SentenceTransformer ===
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
st_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# === Load trained XGBoost models ===
# predict() iterates this object with .items(), so it must be a mapping.
# NOTE: joblib.load unpickles the file; only load artifacts from a trusted
# source.
XGB_MODELS_PATH = "xgb_models_all.joblib"
models = joblib.load(XGB_MODELS_PATH)
# === Preprocessing function ===
def preprocess_text(text: str) -> str:
    """Normalize raw text into a lowercase, stop-word-free token string.

    Non-string or whitespace-only input yields an empty string. Otherwise
    the text is lowercased, CRLF pairs are replaced by a space, every
    character other than ``a``-``z`` and whitespace is removed, and tokens
    present in the module-level ``stop_words`` set are dropped.

    Returns:
        The surviving tokens joined by single spaces.
    """
    if not (isinstance(text, str) and text.strip()):
        return ""

    lowered = text.lower()
    without_crlf = re.sub(r"\r\n", " ", lowered)
    # Punctuation and digits are deleted outright (not replaced by a space),
    # so "don't" becomes "dont" before the stop-word check runs.
    letters_only = re.sub(r"[^a-z\s]", "", without_crlf)

    return " ".join(
        word for word in letters_only.split() if word not in stop_words
    )
# === Prediction function ===
def predict(text: str, normalize: bool = True):
    """Score *text* with every loaded model and return the embedding used.

    Args:
        text: Raw input. ``None`` is treated as empty; surrounding
            whitespace is stripped before anything else happens.
        normalize: Passed to ``st_model.encode`` as ``normalize_embeddings``.

    Returns:
        A 3-tuple ``(scores, embedding, dim)``: a dict mapping each key of
        ``models`` to its prediction as a float, the embedding as a list,
        and the embedding length as an int. Empty input returns
        ``({}, [], 0)`` without calling any model.
    """
    raw = (text or "").strip()
    if raw == "":
        return {}, [], 0

    # 1-2. Preprocess, then embed. encode() receives a one-item batch, so
    # take the first (only) row of its result.
    embedding = st_model.encode(
        [preprocess_text(raw)], normalize_embeddings=normalize
    )[0]

    # 3. Append the essay_length feature: the character count of the stripped
    # raw input (not of the preprocessed text). Shape it as a single row.
    features = np.concatenate([embedding, [len(raw)]]).reshape(1, -1)

    # 4. Predict with every model.
    scores = {
        target: float(estimator.predict(features)[0])
        for target, estimator in models.items()
    }
    return scores, embedding.tolist(), int(embedding.shape[0])
# === Gradio UI ===
# NOTE(review): which components sit under each gr.Row() is a best reading
# of the layout -- confirm against the intended design.
with gr.Blocks() as demo:
    gr.Markdown("# Essay Scoring Demo")
    gr.Markdown("Masukkan teks")

    # Inputs: the essay text plus the embedding-normalization toggle.
    with gr.Row():
        text_in = gr.Textbox(
            label="Input Kalimat / Essay",
            placeholder="Tulis di sini...",
            lines=5,
        )
    normalize = gr.Checkbox(value=True, label="Normalize embedding (L2)")
    btn = gr.Button("Prediksi", variant="primary")

    # Outputs: one component per element of the tuple returned by predict().
    with gr.Row():
        pred_out = gr.JSON(label="Prediksi Skor")
    with gr.Row():
        vec_out = gr.JSON(label="Embedding Vector (list of floats)")
        dim_out = gr.Number(label="Dimensi vektor", interactive=False)

    # Sample inputs wired to the text box.
    example_rows = [
        ["Halo dunia!"],
        ["Machine learning is fun."],
        ["This is a sample essay for IELTS task."],
    ]
    gr.Examples(examples=example_rows, inputs=[text_in], label="Contoh input")

    btn.click(
        fn=predict,
        inputs=[text_in, normalize],
        outputs=[pred_out, vec_out, dim_out],
    )

demo.queue()

if __name__ == "__main__":
    demo.launch()