|
|
from datasets import load_dataset |
|
|
import gradio as gr |
|
|
from sklearn.pipeline import Pipeline |
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
from sklearn.linear_model import LogisticRegression |
|
|
from sklearn.model_selection import train_test_split |
|
|
import warnings |
|
|
|
|
|
# Silence every warning for the whole process so the console shows only the
# status messages printed by this script.
# NOTE(review): this also hides warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

# Load the "train" split of the UniversalCEFR/cefr_sp_en dataset. Only the
# load itself sits in the ``try`` body, so a failure in the success message
# cannot be misreported as a dataset-loading error.
try:
    dataset = load_dataset("UniversalCEFR/cefr_sp_en")["train"]
except Exception as e:
    # Chain explicitly so the original error stays attached as the cause.
    raise RuntimeError(f"❌ Ошибка при загрузке датасета: {e}") from e
else:
    print(f"✅ Датасет успешно загружен: {len(dataset)} строк")
|
|
|
|
|
|
|
|
# Collect the texts and their CEFR labels in a single pass over the dataset,
# skipping rows whose text is empty or missing. Filling both lists inside the
# same loop keeps them index-aligned by construction and avoids iterating
# the dataset twice with a duplicated filter.
texts = []
labels = []
for item in dataset:
    if item["text"]:
        texts.append(item["text"])
        labels.append(item["cefr_level"])
|
|
|
|
|
|
|
|
# Hold out 20% of the samples for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(texts, labels, **split_options)
|
|
|
|
|
|
|
|
# Text classifier: TF-IDF features (vocabulary capped at 5000 terms) feeding
# a logistic-regression model that may run up to 1000 solver iterations.
tfidf_step = ("vectorizer", TfidfVectorizer(max_features=5000))
logreg_step = ("classifier", LogisticRegression(max_iter=1000))
model = Pipeline([tfidf_step, logreg_step])
|
|
|
|
|
# Fit the whole pipeline (vectorizer + classifier) on the training portion.
print("🔄 Обучение модели...")
model.fit(X_train, y_train)
print("✅ Модель обучена успешно")

# Report accuracy on the held-out portion. The split above produces
# X_test / y_test, but without this step they are never used and the quality
# of the model served by the UI is unknown.
test_accuracy = model.score(X_test, y_test)
print(f"📊 Точность на тестовой выборке: {test_accuracy:.2%}")
|
|
|
|
|
|
|
|
def predict_level(text):
    """Predict the CEFR level of *text* using the module-level ``model``.

    Args:
        text: The sentence entered by the user. ``None`` and empty or
            whitespace-only strings are treated as missing input.

    Returns:
        A message containing the predicted level and the probability the
        model assigns to it, or a prompt asking for non-empty text when the
        input is missing.
    """
    # Guard against None as well as blank input: calling ``.strip()`` on None
    # would raise AttributeError instead of returning the prompt.
    if text is None or not text.strip():
        return "⛔️ Введите непустой текст"

    # Run the pipeline once and derive both the label and its confidence from
    # the same probability vector instead of transforming the text twice.
    probabilities = model.predict_proba([text])[0]
    best = probabilities.argmax()
    prediction = model.classes_[best]
    confidence = probabilities[best]
    return f"📘 Уровень: {prediction}\nУверенность: {confidence:.2%}"
|
|
|
|
|
# Build the web UI: a four-line text box as input, a single text field as
# output, and ``predict_level`` as the handler connecting them.
sentence_input = gr.Textbox(lines=4, label="Введите английское предложение")
result_output = gr.Text(label="Результат")
demo = gr.Interface(
    fn=predict_level,
    inputs=sentence_input,
    outputs=result_output,
    title="CEFR Level Predictor (A1–C2)",
    description="Модель на основе датасета UniversalCEFR, предсказывает уровень знания английского языка.",
)
|
|
|
|
|
# Start the Gradio server only when this file is executed as a script, so
# that importing the module (for example from a test or another tool) does
# not start a blocking web server as a side effect.
if __name__ == "__main__":
    demo.launch()
|
|
|