Spaces:
Running
Running
update
Browse files- app.py +4 -3
- demo/__init__.py +1 -0
- demo/binary_classifier_demo.py +39 -17
- model_utils.py +40 -0
app.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
-
|
|
|
|
| 2 |
|
| 3 |
if __name__ == "__main__":
|
| 4 |
-
# Launch
|
| 5 |
-
print("Starting
|
| 6 |
binary_app.launch(show_api=False, debug=True, share=True)
|
|
|
|
import gradio as gr

from demo import binary_app

if __name__ == "__main__":
    # Entry point: start the Gradio classifier demo with a public share link.
    print("Starting AI Text Classifier demo...")
    binary_app.launch(show_api=False, debug=True, share=True)
|
demo/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .binary_classifier_demo import binary_app
|
demo/binary_classifier_demo.py
CHANGED
|
@@ -6,7 +6,7 @@ import os
|
|
| 6 |
import spaces
|
| 7 |
import gc
|
| 8 |
|
| 9 |
-
from model_utils import load_model, classify_text
|
| 10 |
from binoculars_utils import compute_scores, cleanup_model, cleanup_models
|
| 11 |
|
| 12 |
MINIMUM_TOKENS = 200
|
|
@@ -30,6 +30,14 @@ css = """
|
|
| 30 |
border-radius: 0.5rem;
|
| 31 |
font-weight: bold;
|
| 32 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
.analysis-block {
|
| 34 |
background: #f5f5f5;
|
| 35 |
padding: 15px;
|
|
@@ -46,7 +54,7 @@ css = """
|
|
| 46 |
"""
|
| 47 |
|
| 48 |
@spaces.GPU
|
| 49 |
-
def
|
| 50 |
# Check GPU status at the beginning
|
| 51 |
if torch.cuda.is_available():
|
| 52 |
print(f"Starting classification with GPU: {torch.cuda.get_device_name(0)}")
|
|
@@ -59,10 +67,13 @@ def run_binary_classifier(text, show_analysis=False):
|
|
| 59 |
return gr.Markdown(f"Текст слишком короткий. Требуется минимум {MINIMUM_TOKENS} символов."), None, None
|
| 60 |
|
| 61 |
try:
|
| 62 |
-
# Load
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
-
# Compute scores
|
| 66 |
scores = compute_scores(text, use_chat=True, use_coder=True)
|
| 67 |
|
| 68 |
# Run classification
|
|
@@ -87,7 +98,7 @@ def run_binary_classifier(text, show_analysis=False):
|
|
| 87 |
scores_str += f"- Score Coder: {scores['score_coder']:.4f}\n"
|
| 88 |
|
| 89 |
# Result markdown
|
| 90 |
-
class_style = "human-text" if predicted_class == "Human" else "ai-text"
|
| 91 |
result_md = f"""
|
| 92 |
## Результат классификации
|
| 93 |
|
|
@@ -314,7 +325,7 @@ def reset_outputs():
|
|
| 314 |
with gr.Blocks(css=css, theme=gr.themes.Base()) as binary_app:
|
| 315 |
with gr.Row():
|
| 316 |
with gr.Column(scale=3):
|
| 317 |
-
gr.HTML("<h1
|
| 318 |
|
| 319 |
with gr.Row():
|
| 320 |
with gr.Column():
|
|
@@ -322,7 +333,15 @@ with gr.Blocks(css=css, theme=gr.themes.Base()) as binary_app:
|
|
| 322 |
lines=10, label="Текст для анализа")
|
| 323 |
|
| 324 |
with gr.Row():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 325 |
analysis_checkbox = gr.Checkbox(label="Показать детальный анализ текста", value=False)
|
|
|
|
|
|
|
| 326 |
submit_button = gr.Button("Классифицировать", variant="primary")
|
| 327 |
clear_button = gr.Button("Очистить")
|
| 328 |
|
|
@@ -336,15 +355,18 @@ with gr.Blocks(css=css, theme=gr.themes.Base()) as binary_app:
|
|
| 336 |
|
| 337 |
with gr.Accordion("О модели", open=False):
|
| 338 |
gr.Markdown("""
|
| 339 |
-
### О
|
|
|
|
|
|
|
| 340 |
|
| 341 |
-
|
|
|
|
|
|
|
| 342 |
|
| 343 |
-
####
|
| 344 |
-
-
|
| 345 |
-
-
|
| 346 |
-
-
|
| 347 |
-
- Dropout: 0.3
|
| 348 |
|
| 349 |
#### Особенности:
|
| 350 |
- Используется анализ текста и оценки качества текста с помощью Binoculars
|
|
@@ -353,13 +375,13 @@ with gr.Blocks(css=css, theme=gr.themes.Base()) as binary_app:
|
|
| 353 |
|
| 354 |
#### Рекомендации:
|
| 355 |
- Для более точной классификации рекомендуется использовать тексты длиннее 200 слов
|
| 356 |
-
-
|
| 357 |
""")
|
| 358 |
|
| 359 |
# Set up event handlers
|
| 360 |
submit_button.click(
|
| 361 |
-
fn=
|
| 362 |
-
inputs=[input_text, analysis_checkbox],
|
| 363 |
outputs=[result_output, analysis_output, input_text]
|
| 364 |
)
|
| 365 |
|
|
|
|
| 6 |
import spaces
|
| 7 |
import gc
|
| 8 |
|
| 9 |
+
from model_utils import load_model, load_ternary_model, classify_text
|
| 10 |
from binoculars_utils import compute_scores, cleanup_model, cleanup_models
|
| 11 |
|
| 12 |
MINIMUM_TOKENS = 200
|
|
|
|
| 30 |
border-radius: 0.5rem;
|
| 31 |
font-weight: bold;
|
| 32 |
}
|
| 33 |
+
.rephrased-text {
|
| 34 |
+
color: black !important;
|
| 35 |
+
line-height: 1.9em;
|
| 36 |
+
padding: 0.5em;
|
| 37 |
+
background: #ffcc99;
|
| 38 |
+
border-radius: 0.5rem;
|
| 39 |
+
font-weight: bold;
|
| 40 |
+
}
|
| 41 |
.analysis-block {
|
| 42 |
background: #f5f5f5;
|
| 43 |
padding: 15px;
|
|
|
|
| 54 |
"""
|
| 55 |
|
| 56 |
@spaces.GPU
|
| 57 |
+
def run_classifier(text, mode="binary", show_analysis=False):
|
| 58 |
# Check GPU status at the beginning
|
| 59 |
if torch.cuda.is_available():
|
| 60 |
print(f"Starting classification with GPU: {torch.cuda.get_device_name(0)}")
|
|
|
|
| 67 |
return gr.Markdown(f"Текст слишком короткий. Требуется минимум {MINIMUM_TOKENS} символов."), None, None
|
| 68 |
|
| 69 |
try:
|
| 70 |
+
# Load appropriate classifier model based on mode
|
| 71 |
+
if mode == "binary":
|
| 72 |
+
model, scaler, label_encoder, imputer = load_model()
|
| 73 |
+
else: # ternary
|
| 74 |
+
model, scaler, label_encoder, imputer = load_ternary_model()
|
| 75 |
|
| 76 |
+
# Compute scores
|
| 77 |
scores = compute_scores(text, use_chat=True, use_coder=True)
|
| 78 |
|
| 79 |
# Run classification
|
|
|
|
| 98 |
scores_str += f"- Score Coder: {scores['score_coder']:.4f}\n"
|
| 99 |
|
| 100 |
# Result markdown
|
| 101 |
+
class_style = "human-text" if predicted_class == "Human" else "ai-text" if predicted_class in ["AI", "Raw AI"] else "rephrased-text"
|
| 102 |
result_md = f"""
|
| 103 |
## Результат классификации
|
| 104 |
|
|
|
|
| 325 |
with gr.Blocks(css=css, theme=gr.themes.Base()) as binary_app:
|
| 326 |
with gr.Row():
|
| 327 |
with gr.Column(scale=3):
|
| 328 |
+
gr.HTML("<h1>Классификатор AI-текста</h1>")
|
| 329 |
|
| 330 |
with gr.Row():
|
| 331 |
with gr.Column():
|
|
|
|
| 333 |
lines=10, label="Текст для анализа")
|
| 334 |
|
| 335 |
with gr.Row():
|
| 336 |
+
model_mode = gr.Radio(
|
| 337 |
+
["binary", "ternary"],
|
| 338 |
+
label="Режим классификации",
|
| 339 |
+
value="binary",
|
| 340 |
+
info="Выберите тип классификации: бинарная (человек/ИИ) или тернарная (человек/ИИ/перефразированный ИИ)"
|
| 341 |
+
)
|
| 342 |
analysis_checkbox = gr.Checkbox(label="Показать детальный анализ текста", value=False)
|
| 343 |
+
|
| 344 |
+
with gr.Row():
|
| 345 |
submit_button = gr.Button("Классифицировать", variant="primary")
|
| 346 |
clear_button = gr.Button("Очистить")
|
| 347 |
|
|
|
|
| 355 |
|
| 356 |
with gr.Accordion("О модели", open=False):
|
| 357 |
gr.Markdown("""
|
| 358 |
+
### О классификаторе AI-текста
|
| 359 |
+
|
| 360 |
+
Эта демонстрация использует нейронные сети для классификации текста в двух режимах:
|
| 361 |
|
| 362 |
+
#### Бинарная классификация:
|
| 363 |
+
- Human (Человек) - текст написан человеком
|
| 364 |
+
- AI (ИИ) - текст сгенерирован искусственным интеллектом
|
| 365 |
|
| 366 |
+
#### Тернарная классификация:
|
| 367 |
+
- Human (Человек) - текст написан человеком
|
| 368 |
+
- Raw AI (Чистый ИИ) - текст сгенерирован искусственным интеллектом без редактирования
|
| 369 |
+
- Rephrased AI (Перефразированный ИИ) - текст сгенерирован ИИ и затем отредактирован
|
|
|
|
| 370 |
|
| 371 |
#### Особенности:
|
| 372 |
- Используется анализ текста и оценки качества текста с помощью Binoculars
|
|
|
|
| 375 |
|
| 376 |
#### Рекомендации:
|
| 377 |
- Для более точной классификации рекомендуется использовать тексты длиннее 200 слов
|
| 378 |
+
- Модели обучены на русскоязычных текстах
|
| 379 |
""")
|
| 380 |
|
| 381 |
# Set up event handlers
|
| 382 |
submit_button.click(
|
| 383 |
+
fn=run_classifier,
|
| 384 |
+
inputs=[input_text, model_mode, analysis_checkbox],
|
| 385 |
outputs=[result_output, analysis_output, input_text]
|
| 386 |
)
|
| 387 |
|
model_utils.py
CHANGED
|
@@ -4,6 +4,7 @@ import joblib
|
|
| 4 |
import numpy as np
|
| 5 |
from sklearn.impute import SimpleImputer
|
| 6 |
from NN_classifier.simple_binary_classifier import Medium_Binary_Network
|
|
|
|
| 7 |
from feature_extraction import extract_features
|
| 8 |
import pandas as pd
|
| 9 |
|
|
@@ -45,6 +46,45 @@ def load_model(model_dir='models/medium_binary_classifier'):
|
|
| 45 |
|
| 46 |
return model, scaler, label_encoder, imputer
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
def classify_text(text, model, scaler, label_encoder, imputer=None, scores=None):
|
| 49 |
features_df, text_analysis = extract_features(text, scores=scores)
|
| 50 |
|
|
|
|
| 4 |
import numpy as np
|
| 5 |
from sklearn.impute import SimpleImputer
|
| 6 |
from NN_classifier.simple_binary_classifier import Medium_Binary_Network
|
| 7 |
+
from NN_classifier.neural_net_t import Neural_Network
|
| 8 |
from feature_extraction import extract_features
|
| 9 |
import pandas as pd
|
| 10 |
|
|
|
|
| 46 |
|
| 47 |
return model, scaler, label_encoder, imputer
|
| 48 |
|
| 49 |
+
def load_ternary_model(model_dir='models/neural_network'):
|
| 50 |
+
model_path = os.path.join(model_dir, 'nn_model.pt')
|
| 51 |
+
scaler_path = os.path.join(model_dir, 'scaler.joblib')
|
| 52 |
+
encoder_path = os.path.join(model_dir, 'label_encoder.joblib')
|
| 53 |
+
imputer_path = os.path.join(model_dir, 'imputer.joblib')
|
| 54 |
+
|
| 55 |
+
if not os.path.exists(model_path):
|
| 56 |
+
raise FileNotFoundError(f"Model not found at: {model_path}")
|
| 57 |
+
|
| 58 |
+
label_encoder = joblib.load(encoder_path)
|
| 59 |
+
scaler = joblib.load(scaler_path)
|
| 60 |
+
|
| 61 |
+
imputer = None
|
| 62 |
+
if os.path.exists(imputer_path):
|
| 63 |
+
imputer = joblib.load(imputer_path)
|
| 64 |
+
else:
|
| 65 |
+
print("Warning: Imputer not found, will create a new one during classification")
|
| 66 |
+
|
| 67 |
+
input_size = scaler.n_features_in_
|
| 68 |
+
num_classes = len(label_encoder.classes_)
|
| 69 |
+
|
| 70 |
+
model = Neural_Network(input_size, hidden_layers=[256, 192, 128, 64], num_classes=num_classes, dropout_rate=0.3).to(DEVICE)
|
| 71 |
+
model.load_state_dict(torch.load(model_path, map_location=DEVICE))
|
| 72 |
+
model.eval()
|
| 73 |
+
|
| 74 |
+
print(f"Loaded ternary classifier model with {num_classes} classes: {label_encoder.classes_}")
|
| 75 |
+
|
| 76 |
+
if imputer is not None:
|
| 77 |
+
try:
|
| 78 |
+
if hasattr(imputer, 'feature_names_in_'):
|
| 79 |
+
print(f"Imputer has {len(imputer.feature_names_in_)} features")
|
| 80 |
+
print(f"First few feature names: {imputer.feature_names_in_[:5]}")
|
| 81 |
+
else:
|
| 82 |
+
print("Warning: Imputer does not have feature_names_in_ attribute")
|
| 83 |
+
except Exception as e:
|
| 84 |
+
print(f"Error checking imputer: {str(e)}")
|
| 85 |
+
|
| 86 |
+
return model, scaler, label_encoder, imputer
|
| 87 |
+
|
| 88 |
def classify_text(text, model, scaler, label_encoder, imputer=None, scores=None):
|
| 89 |
features_df, text_analysis = extract_features(text, scores=scores)
|
| 90 |
|