CompactAI commited on
Commit
f52234e
·
verified ·
1 Parent(s): 2fc7a6d

Upload 13 files

Browse files
app.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AIFinder API Server
3
+ Serves classification and training endpoints for the frontend.
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import json
9
+ import joblib
10
+ import numpy as np
11
+ import torch
12
+ import torch.nn as nn
13
+ from flask import Flask, request, jsonify, send_from_directory
14
+ from flask_cors import CORS
15
+
16
+ from config import MODEL_DIR
17
+ from model import AIFinderNet
18
+ from features import FeaturePipeline
19
+
20
+ app = Flask(__name__, static_folder="static", static_url_path="")
21
+ CORS(app)
22
+
23
+ pipeline = None
24
+ provider_enc = None
25
+ net = None
26
+ device = None
27
+ checkpoint = None
28
+
29
+
30
def load_models():
    """Load feature pipeline, label encoder, and classifier into module globals.

    Populates: pipeline, provider_enc, net, device, checkpoint.
    Must be called once before any request handler runs.
    """
    global pipeline, provider_enc, net, device, checkpoint

    # sklearn feature pipeline and provider LabelEncoder persisted by training
    pipeline = joblib.load(os.path.join(MODEL_DIR, "feature_pipeline.joblib"))
    provider_enc = joblib.load(os.path.join(MODEL_DIR, "provider_enc.joblib"))

    # weights_only=True restricts unpickling to tensors/primitives (safe load)
    checkpoint = torch.load(
        os.path.join(MODEL_DIR, "classifier.pt"),
        map_location="cpu",
        weights_only=True,
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Architecture hyperparameters are stored alongside the weights
    net = AIFinderNet(
        input_dim=checkpoint["input_dim"],
        num_providers=checkpoint["num_providers"],
        hidden_dim=checkpoint["hidden_dim"],
        embed_dim=checkpoint["embed_dim"],
        dropout=checkpoint["dropout"],
    ).to(device)
    # strict=False tolerates key mismatches — NOTE(review): may silently skip weights
    net.load_state_dict(checkpoint["state_dict"], strict=False)
    net.eval()
51
+
52
+
53
@app.route("/")
def index():
    """Serve the frontend entry page from the static folder."""
    directory, page = "static", "index.html"
    return send_from_directory(directory, page)
56
+
57
+
58
@app.route("/api/providers", methods=["GET"])
def get_providers():
    """Return the alphabetically sorted list of known provider labels."""
    names = provider_enc.classes_.tolist()
    names.sort()
    return jsonify({"providers": names})
62
+
63
+
64
@app.route("/api/classify", methods=["POST"])
def classify():
    """Classify text and return provider predictions.

    Expects a JSON body {"text": str}; returns the top provider plus the
    five highest-confidence providers with confidence percentages.
    Returns 400 for a missing body or text shorter than 20 characters.
    """
    # get_json(silent=True) returns None instead of raising on a missing or
    # malformed JSON body; `request.json` would 500 in that case.
    data = request.get_json(silent=True) or {}
    text = data.get("text", "")

    # Reject non-string payloads as well as too-short text.
    if not isinstance(text, str) or len(text) < 20:
        return jsonify({"error": "Text too short (minimum 20 characters)"}), 400

    X = pipeline.transform([text])
    X_t = torch.tensor(X.toarray(), dtype=torch.float32).to(device)

    with torch.no_grad():
        prov_logits = net(X_t)

    prov_proba = torch.softmax(prov_logits.float(), dim=1)[0].cpu().numpy()

    # Top-5 providers by descending probability.
    top_prov_idxs = np.argsort(prov_proba)[::-1][:5]
    top_providers = [
        {
            "name": provider_enc.inverse_transform([i])[0],
            "confidence": float(prov_proba[i] * 100),
        }
        for i in top_prov_idxs
    ]

    return jsonify(
        {
            "provider": top_providers[0]["name"],
            "confidence": top_providers[0]["confidence"],
            "top_providers": top_providers,
        }
    )
97
+
98
+
99
@app.route("/api/correct", methods=["POST"])
def correct():
    """Fine-tune the model on a single corrected example.

    Expects a JSON body {"text": str, "correct_provider": str}. Performs
    one gradient step and stores the new weights in the in-memory
    checkpoint (persisted only via /api/save). Returns 400 for a missing
    body, missing fields, or an unknown provider label.
    """
    # Tolerate a missing/invalid JSON body instead of raising a 500.
    data = request.get_json(silent=True) or {}
    text = data.get("text", "")
    correct_provider = data.get("correct_provider", "")

    if not text or not correct_provider:
        return jsonify({"error": "Missing text or correct_provider"}), 400

    try:
        prov_idx = provider_enc.transform([correct_provider])[0]
    except ValueError as e:
        return jsonify({"error": f"Unknown provider: {e}"}), 400

    X = pipeline.transform([text])
    X_t = torch.tensor(X.toarray(), dtype=torch.float32).to(device)
    y_prov = torch.tensor([prov_idx], dtype=torch.long).to(device)

    net.train()
    # BatchNorm cannot update running stats from a batch of one — keep it in eval.
    for module in net.modules():
        if isinstance(module, nn.modules.batchnorm._BatchNorm):
            module.eval()

    # NOTE(review): a fresh optimizer per request discards AdamW moment state;
    # acceptable for isolated single-step corrections.
    optimizer = torch.optim.AdamW(net.parameters(), lr=1e-4, weight_decay=1e-4)
    optimizer.zero_grad(set_to_none=True)

    prov_criterion = nn.CrossEntropyLoss()
    prov_logits = net(X_t)
    loss = prov_criterion(prov_logits, y_prov)
    loss.backward()
    # Clip so a single outlier example cannot destabilize the weights.
    torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=1.0)
    optimizer.step()

    net.eval()

    # Keep the in-memory checkpoint in sync with the updated weights.
    checkpoint["state_dict"] = net.state_dict()

    return jsonify({"success": True, "loss": float(loss.item())})
138
+
139
+
140
@app.route("/api/save", methods=["POST"])
def save_model():
    """Save the current model state to a file for export.

    The client-supplied filename is reduced to its base name so a request
    cannot write outside MODEL_DIR via path traversal (e.g. "../../x.pt").
    """
    global checkpoint
    data = request.get_json(silent=True) or {}
    requested = data.get("filename", "aifinder_model.pt")
    # Strip any directory components; fall back to the default on empty result.
    filename = os.path.basename(str(requested)) or "aifinder_model.pt"

    save_path = os.path.join(MODEL_DIR, filename)
    torch.save(checkpoint, save_path)

    return jsonify({"success": True, "filename": filename})
151
+
152
+
153
@app.route("/models/<filename>")
def download_model(filename):
    """Serve an exported model file from MODEL_DIR.

    send_from_directory validates the name, so traversal attempts are rejected.
    """
    export_dir = MODEL_DIR
    return send_from_directory(export_dir, filename)
157
+
158
+
159
@app.route("/api/status", methods=["GET"])
def status():
    """Report whether models are loaded and which device is in use."""
    payload = {
        "loaded": net is not None,
        "device": None if device is None else str(device),
    }
    return jsonify(payload)
168
+
169
+
170
if __name__ == "__main__":
    # Load all artifacts before accepting requests, then serve on all interfaces.
    print("Loading models...")
    load_models()
    print(f"Ready on {device}")
    app.run(host="0.0.0.0", port=7860)
classify.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AIFinder Interactive Classifier
3
+ Loads trained model and provides an interactive REPL for classifying text.
4
+
5
+ Usage: python3 classify.py
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import time
11
+ import joblib
12
+ import numpy as np
13
+ import torch
14
+ import torch.nn as nn
15
+
16
+ from config import MODEL_DIR, DATASET_REGISTRY, DEEPSEEK_AM_DATASETS
17
+ from model import AIFinderNet
18
+
19
+
20
def load_models():
    """Load all model components from the model directory.

    Returns:
        Tuple (pipeline, net, provider_enc, checkpoint, device).

    Exits the process with status 1 if any artifact file is missing.
    """
    try:
        pipeline = joblib.load(os.path.join(MODEL_DIR, "feature_pipeline.joblib"))
        provider_enc = joblib.load(os.path.join(MODEL_DIR, "provider_enc.joblib"))

        # weights_only=True restricts unpickling to tensors/primitives (safe load)
        checkpoint = torch.load(
            os.path.join(MODEL_DIR, "classifier.pt"),
            map_location="cpu",
            weights_only=True,
        )
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Architecture hyperparameters are stored alongside the weights
        net = AIFinderNet(
            input_dim=checkpoint["input_dim"],
            num_providers=checkpoint["num_providers"],
            hidden_dim=checkpoint["hidden_dim"],
            embed_dim=checkpoint["embed_dim"],
            dropout=checkpoint["dropout"],
        ).to(device)
        # strict=False tolerates key mismatches — NOTE(review): may silently skip weights
        net.load_state_dict(checkpoint["state_dict"], strict=False)
        net.eval()

        return pipeline, net, provider_enc, checkpoint, device
    except FileNotFoundError:
        print(f"Error: Models not found in {MODEL_DIR}")
        print(f"Run 'python3 train.py' first to train the models.")
        sys.exit(1)
47
+
48
+
49
def classify_text(text, pipeline, net, provider_enc, device):
    """Classify a single text and return provider results."""
    start = time.time()
    features = pipeline.transform([text])
    batch = torch.tensor(features.toarray(), dtype=torch.float32).to(device)
    print(f" (featurize: {time.time() - start:.2f}s)", end="")

    with torch.no_grad():
        logits = net(batch)

    probs = torch.softmax(logits.float(), dim=1)[0].cpu().numpy()

    # Five most probable providers, best first.
    ranked = np.argsort(probs)[::-1][:5]
    top_providers = [
        (provider_enc.inverse_transform([idx])[0], probs[idx] * 100)
        for idx in ranked
    ]

    elapsed = time.time() - start
    print(f" (total classify: {elapsed:.2f}s)")

    return {
        "provider": top_providers[0][0],
        "provider_confidence": top_providers[0][1],
        "top_providers": top_providers,
    }
76
+
77
+
78
def print_results(results):
    """Pretty-print classification results as a boxed list with bar charts."""
    print()
    print(" ┌───────────────────────────────────────────────┐")
    print(
        f" │ Provider: {results['provider']} ({results['provider_confidence']:.1f}%)"
    )
    for name, conf in results["top_providers"]:
        # NaN confidences render as an empty bar rather than crashing.
        c = conf if not np.isnan(conf) else 0.0
        filled = int(c / 5)
        bar = "█" * filled + "░" * (20 - filled)
        print(f" │ {name:.<25s} {c:5.1f}% {bar}")

    print(" └───────────────────────────────────────────────┘")
    print()
92
+
93
+
94
def correct_provider(
    net,
    X_t,
    correct_provider_name,
    provider_enc,
    optimizer,
    device,
):
    """Do a backward pass to correct the provider on a single example.

    Args:
        net: AIFinderNet fine-tuned in place.
        X_t: feature tensor for the example(s), shape (batch, input_dim).
        correct_provider_name: ground-truth provider label from the user.
        provider_enc: LabelEncoder mapping provider names to class indices.
        optimizer: shared optimizer so moment state persists across corrections.
        device: torch device the tensors/model live on.

    Returns:
        True if a gradient step was taken, False if the label was unknown.
    """
    try:
        prov_idx = provider_enc.transform([correct_provider_name])[0]
    except ValueError as e:
        print(f" (label not in encoder: {e})")
        return False

    y_prov = torch.tensor([prov_idx], dtype=torch.long).to(device)

    # Remember the caller's mode so we can restore it afterwards.
    was_training = net.training
    net.train()

    # Disable batchnorm for single-sample training
    # (running stats cannot be updated from a batch of one).
    if X_t.shape[0] <= 1:
        for module in net.modules():
            if isinstance(module, nn.modules.batchnorm._BatchNorm):
                module.eval()

    optimizer.zero_grad(set_to_none=True)
    prov_criterion = nn.CrossEntropyLoss()

    prov_logits = net(X_t)
    loss = prov_criterion(prov_logits, y_prov)
    loss.backward()
    # Clip so one outlier correction cannot destabilize the weights.
    torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=1.0)
    optimizer.step()

    # Restore the caller's train/eval mode.
    if was_training:
        net.train()
    else:
        net.eval()

    print(f" ✓ Corrected → {correct_provider_name} (loss={loss.item():.4f})")
    return True
136
+
137
+
138
def prompt_correction(known_providers):
    """Ask user for the correct provider; return its name, or None to skip."""
    print(" Wrong? Enter correct provider number (or Enter to skip):")
    for i, name in enumerate(known_providers, 1):
        print(f" {i:>2d}. {name}")
    try:
        prov_choice = input(" Provider > ").strip()
    except EOFError:
        return None
    if not prov_choice:
        return None

    # Renamed from `correct_provider` to avoid shadowing the module-level function.
    chosen = None
    try:
        idx = int(prov_choice) - 1
    except ValueError:
        # Non-numeric input: case-insensitive substring match, accepted
        # only when it is unambiguous.
        matches = [m for m in known_providers if prov_choice.lower() in m.lower()]
        if len(matches) == 1:
            chosen = matches[0]
    else:
        if 0 <= idx < len(known_providers):
            chosen = known_providers[idx]

    if not chosen:
        print(" (invalid choice, skipping)")
        return None

    return chosen
165
+
166
+
167
def main():
    """Interactive REPL: classify pasted text and learn from user corrections."""
    print()
    print(" ╔═══════════════════════════════════════╗")
    print(" ║ AIFinder - AI Response Classifier ║")
    print(" ╚═══════════════════════════════════════╝")
    print()

    print(" Loading models...")
    t0 = time.time()
    pipeline, net, provider_enc, checkpoint, device = load_models()
    print(f" Models loaded in {time.time() - t0:.1f}s.")

    # Prepare online learning components.
    # One shared optimizer so AdamW moment state persists across corrections.
    optimizer = torch.optim.AdamW(net.parameters(), lr=1e-4, weight_decay=1e-4)
    known_providers = sorted(provider_enc.classes_.tolist())
    corrections_made = 0

    print()
    print(" Paste text to classify (submit with TWO empty lines).")
    print(" Type 'quit' to exit.\n")

    last_X_t = None

    while True:
        print(" ─── Paste text below ───")
        lines = []
        empty_count = 0
        # Collect pasted lines until two consecutive blank lines (or EOF).
        while True:
            try:
                line = input()
            except EOFError:
                break
            if line.strip() == "":
                empty_count += 1
                if empty_count >= 2:
                    break
                lines.append(line)
            else:
                empty_count = 0
                if line.strip().lower() == "quit":
                    # Persist accumulated corrections before exiting.
                    if corrections_made > 0:
                        print(
                            f" Saving {corrections_made} correction(s) to checkpoint..."
                        )
                        checkpoint["state_dict"] = net.state_dict()
                        torch.save(checkpoint, os.path.join(MODEL_DIR, "classifier.pt"))
                        print(" ✓ Saved.")
                    print(" Goodbye!")
                    return
                lines.append(line)

        text = "\n".join(lines).strip()
        if not text:
            print(" (empty input, try again)")
            continue

        if len(text) < 20:
            print(" (text too short, need at least 20 chars)")
            continue

        results = classify_text(text, pipeline, net, provider_enc, device)
        print_results(results)

        # Re-featurize once so the correction step reuses the same tensor.
        X = pipeline.transform([text])
        last_X_t = torch.tensor(X.toarray(), dtype=torch.float32).to(device)

        correct_prov = prompt_correction(known_providers)
        if correct_prov:
            ok = correct_provider(
                net,
                last_X_t,
                correct_prov,
                provider_enc,
                optimizer,
                device,
            )
            if ok:
                corrections_made += 1
245
+
246
+
247
# Script entry point: start the interactive classification REPL.
if __name__ == "__main__":
    main()
config.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
AIFinder Configuration
Dataset registry, label mappings, and feature parameters.
"""

import os

# --- Paths ---
# All paths are resolved relative to this file so the package is relocatable.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_DIR = os.path.join(BASE_DIR, "models")

# --- Dataset Registry ---
# Each entry: (hf_dataset_id, provider, model_name, optional_kwargs)
# optional_kwargs: subset name, split, etc.
# "max_samples" caps the number of rows drawn from large datasets.
DATASET_REGISTRY = [
    # Anthropic
    ("TeichAI/claude-4.5-opus-high-reasoning-250x", "Anthropic", "Claude 4.5 Opus", {}),
    ("TeichAI/claude-sonnet-4.5-high-reasoning-250x", "Anthropic", "Claude Sonnet 4.5", {}),
    ("Roman1111111/claude-opus-4.6-10000x", "Anthropic", "Claude Opus 4.6", {"max_samples": 1500}),

    # OpenAI
    ("TeichAI/gpt-5.2-high-reasoning-250x", "OpenAI", "GPT-5.2", {}),
    ("TeichAI/gpt-5.1-high-reasoning-1000x", "OpenAI", "GPT-5.1", {}),
    ("TeichAI/gpt-5.1-codex-max-1000x", "OpenAI", "GPT-5.1 Codex Max", {}),
    ("TeichAI/gpt-5-codex-250x", "OpenAI", "GPT-5 Codex", {}),
    ("TeichAI/gpt-5-codex-1000x", "OpenAI", "GPT-5 Codex", {}),

    # Google
    ("TeichAI/gemini-3-pro-preview-high-reasoning-1000x", "Google", "Gemini 3 Pro", {}),
    ("TeichAI/gemini-3-pro-preview-high-reasoning-250x", "Google", "Gemini 3 Pro", {}),
    ("TeichAI/gemini-2.5-flash-11000x", "Google", "Gemini 2.5 Flash", {"max_samples": 1500}),
    ("TeichAI/Gemini-3-Flash-Preview-VIBE", "Google", "Gemini 3 Flash", {}),
    ("TeichAI/gemini-3-flash-preview-1000x", "Google", "Gemini 3 Flash", {}),
    ("TeichAI/gemini-3-flash-preview-complex-1000x", "Google", "Gemini 3 Flash", {}),

    # xAI
    ("TeichAI/brainstorm-v3.1-grok-4-fast-200x", "xAI", "Grok 4 Fast", {}),
    ("TeichAI/sherlock-thinking-alpha-11000x", "xAI", "Grok 4.1 Fast", {"max_samples": 1500}),
    ("TeichAI/sherlock-dash-alpha-1000x", "xAI", "Grok 4.1 Fast", {}),
    ("TeichAI/sherlock-think-alpha-1000x", "xAI", "Grok 4.1 Fast", {}),
    ("TeichAI/grok-code-fast-1-1000x", "xAI", "Grok Code Fast 1", {}),

    # MoonshotAI
    ("TeichAI/kimi-k2-thinking-250x", "MoonshotAI", "Kimi K2", {}),
    ("TeichAI/kimi-k2-thinking-1000x", "MoonshotAI", "Kimi K2", {}),

    # Mistral
    ("TeichAI/mistral-small-creative-500x", "Mistral", "Mistral Small", {}),

    # MiniMax
    ("TeichAI/MiniMax-M2.1-Code-SFT", "MiniMax", "MiniMax M2.1", {}),
    ("TeichAI/convo-v1", "MiniMax", "MiniMax M2.1", {}),

    # StepFun
    ("TeichAI/Step-3.5-Flash-2600x", "StepFun", "Step 3.5 Flash", {"max_samples": 1500}),

    # Zhipu
    ("TeichAI/Pony-Alpha-15k", "Zhipu", "GLM-5", {"max_samples": 1500}),

    # DeepSeek (TeichAI)
    ("TeichAI/deepseek-v3.2-speciale-1000x", "DeepSeek", "DeepSeek V3.2 Speciale", {}),
    ("TeichAI/deepseek-v3.2-speciale-openr1-math-3k", "DeepSeek", "DeepSeek V3.2 Speciale", {"max_samples": 1500}),
]

# DeepSeek (a-m-team) — different format, handled separately
DEEPSEEK_AM_DATASETS = [
    ("a-m-team/AM-DeepSeek-R1-Distilled-1.4M", "DeepSeek", "DeepSeek R1", {"name": "am_0.9M_sample_1k", "max_samples": 1000}),
]

# --- All providers and models ---
PROVIDERS = [
    "Anthropic", "OpenAI", "Google", "xAI", "MoonshotAI",
    "Mistral", "MiniMax", "StepFun", "Zhipu", "DeepSeek"
]

# --- Feature parameters ---
# Passed verbatim to sklearn TfidfVectorizer (word-level n-grams).
TFIDF_WORD_PARAMS = {
    "analyzer": "word",
    "ngram_range": (1, 2),
    "max_features": 20000,
    "sublinear_tf": True,
    "min_df": 3,
}

# char_wb: character n-grams restricted to within word boundaries.
TFIDF_CHAR_PARAMS = {
    "analyzer": "char_wb",
    "ngram_range": (3, 5),
    "max_features": 20000,
    "sublinear_tf": True,
    "min_df": 3,
}

# --- Train/test split ---
TEST_SIZE = 0.2
RANDOM_STATE = 42

# --- Neural Network ---
HIDDEN_DIM = 1024
EMBED_DIM = 256
DROPOUT = 0.3
BATCH_SIZE = 2048
EPOCHS = 50
EARLY_STOP_PATIENCE = 8
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-4
+ WEIGHT_DECAY = 1e-4
data_loader.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AIFinder Data Loader
3
+ Downloads and parses HuggingFace datasets, extracts assistant responses,
4
+ and labels them with is_ai, provider, and model.
5
+ """
6
+
7
import json
import re
import time

from datasets import load_dataset
from tqdm import tqdm

from config import (
    DATASET_REGISTRY,
    DEEPSEEK_AM_DATASETS,
)
16
+
17
+
18
+ def _parse_msg(msg):
19
+ """Parse a message that may be a dict or a JSON string."""
20
+ if isinstance(msg, dict):
21
+ return msg
22
+ if isinstance(msg, str):
23
+ try:
24
+ import json
25
+
26
+ parsed = json.loads(msg)
27
+ if isinstance(parsed, dict):
28
+ return parsed
29
+ except (json.JSONDecodeError, ValueError):
30
+ pass
31
+ return {}
32
+
33
+
34
def _extract_assistant_texts_from_conversations(rows):
    """Extract assistant message content from conversation datasets.
    These have a 'conversations' or 'messages' column with list of
    {role, content} dicts (or JSON strings encoding such dicts).
    """
    def _missing(value):
        # Treat None and zero-length containers as absent.
        return value is None or (hasattr(value, "__len__") and len(value) == 0)

    texts = []
    for row in rows:
        convos = row.get("conversations")
        if _missing(convos):
            convos = row.get("messages")
        if _missing(convos):
            convos = []
        parts = [
            parsed.get("content", "")
            for parsed in map(_parse_msg, convos)
            if parsed.get("role", "") in ("assistant", "gpt", "model")
            and parsed.get("content", "")
        ]
        if parts:
            texts.append("\n\n".join(parts))
    return texts
56
+
57
+
58
+ def _extract_from_am_dataset(row):
59
+ """Extract assistant text from a-m-team format (messages list with role/content)."""
60
+ messages = row.get("messages") or row.get("conversations") or []
61
+ parts = []
62
+ for msg in messages:
63
+ role = msg.get("role", "") if isinstance(msg, dict) else ""
64
+ content = msg.get("content", "") if isinstance(msg, dict) else ""
65
+ if role == "assistant" and content:
66
+ parts.append(content)
67
+ return "\n\n".join(parts) if parts else ""
68
+
69
+
70
def load_teichai_dataset(dataset_id, provider, model_name, kwargs):
    """Load a single conversation-format dataset and return (texts, providers, models).

    Args:
        dataset_id: HuggingFace dataset id.
        provider: provider label applied to every extracted sample.
        model_name: model label applied to every extracted sample.
        kwargs: optional settings — "name" (HF config name) and
            "max_samples" (seeded random subsample cap).

    Returns:
        Three parallel lists; all empty when the dataset cannot be loaded
        or yields no usable text.
    """
    max_samples = kwargs.get("max_samples")
    load_kwargs = {}
    if "name" in kwargs:
        load_kwargs["name"] = kwargs["name"]

    try:
        ds = load_dataset(dataset_id, split="train", **load_kwargs)
        rows = list(ds)
    except Exception as e:
        # Fallback: load from auto-converted parquet via HF API
        try:
            import pandas as pd

            url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
            df = pd.read_parquet(url)
            rows = df.to_dict(orient="records")
        except Exception as e2:
            print(f" [SKIP] {dataset_id}: {e} / parquet fallback: {e2}")
            return [], [], []

    if max_samples and len(rows) > max_samples:
        import random

        # Fixed seed so subsampling is reproducible across runs.
        random.seed(42)
        rows = random.sample(rows, max_samples)

    texts = _extract_assistant_texts_from_conversations(rows)

    # Filter out empty/too-short texts
    filtered = [(t, provider, model_name) for t in texts if len(t) > 50]
    if not filtered:
        print(f" [SKIP] {dataset_id}: no valid texts extracted")
        return [], [], []

    t, p, m = zip(*filtered)
    return list(t), list(p), list(m)
108
+
109
+
110
def load_am_deepseek_dataset(dataset_id, provider, model_name, kwargs):
    """Load a-m-team DeepSeek dataset.

    Falls back to streaming mode when the standard load fails; in both
    paths the row count is capped at kwargs["max_samples"] if given.

    Returns:
        Three parallel lists (texts, providers, models).
    """
    max_samples = kwargs.get("max_samples")
    load_kwargs = {}
    if "name" in kwargs:
        load_kwargs["name"] = kwargs["name"]

    try:
        ds = load_dataset(dataset_id, split="train", **load_kwargs)
    except Exception as e1:
        # Try without name kwarg as fallback
        # NOTE(review): e1 is discarded; only the fallback error is reported.
        try:
            ds = load_dataset(dataset_id, split="train", streaming=True)
            rows = []
            for row in ds:
                rows.append(row)
                # Stop streaming as soon as the cap is reached.
                if max_samples and len(rows) >= max_samples:
                    break
        except Exception as e2:
            print(f" [SKIP] {dataset_id}: {e2}")
            return [], [], []
    else:
        rows = list(ds)
        if max_samples and len(rows) > max_samples:
            rows = rows[:max_samples]

    texts = []
    for row in rows:
        text = _extract_from_am_dataset(row)
        # Drop empty/near-empty extractions.
        if len(text) > 50:
            texts.append(text)

    providers = [provider] * len(texts)
    models = [model_name] * len(texts)
    return texts, providers, models
145
+
146
+
147
def load_all_data():
    """Load all datasets and return combined lists.

    Returns:
        texts: list of str
        providers: list of str
        models: list of str
        is_ai: list of int (1=AI, 0=Human)
    """
    all_texts = []
    all_providers = []
    all_models = []

    # TeichAI datasets
    print("Loading TeichAI datasets...")
    for dataset_id, provider, model_name, kwargs in tqdm(
        DATASET_REGISTRY, desc="TeichAI"
    ):
        t0 = time.time()
        texts, providers, models = load_teichai_dataset(
            dataset_id, provider, model_name, kwargs
        )
        elapsed = time.time() - t0
        all_texts.extend(texts)
        all_providers.extend(providers)
        all_models.extend(models)
        print(f" {dataset_id}: {len(texts)} samples ({elapsed:.1f}s)")

    # DeepSeek a-m-team datasets
    print("\nLoading DeepSeek (a-m-team) datasets...")
    for dataset_id, provider, model_name, kwargs in tqdm(
        DEEPSEEK_AM_DATASETS, desc="DeepSeek-AM"
    ):
        t0 = time.time()
        texts, providers, models = load_am_deepseek_dataset(
            dataset_id, provider, model_name, kwargs
        )
        elapsed = time.time() - t0
        all_texts.extend(texts)
        all_providers.extend(providers)
        all_models.extend(models)
        print(f" {dataset_id}: {len(texts)} samples ({elapsed:.1f}s)")

    # Build is_ai labels (all AI)
    # Every dataset here contains AI output, so every label is 1.
    is_ai = [1] * len(all_texts)

    print(f"\n=== Total: {len(all_texts)} samples ===")
    # Print per-provider counts
    from collections import Counter

    prov_counts = Counter(all_providers)
    for p, c in sorted(prov_counts.items(), key=lambda x: -x[1]):
        print(f" {p}: {c}")

    return all_texts, all_providers, all_models, is_ai
202
+
203
+
204
if __name__ == "__main__":
    # Smoke-run: download everything and print per-provider counts.
    texts, providers, models, is_ai = load_all_data()
features.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AIFinder Feature Extraction
3
+ TF-IDF pipeline + stylometric features.
4
+ Supports CoT-aware and no-CoT text preprocessing.
5
+ """
6
+
7
+ import re
8
+ import numpy as np
9
+ from scipy.sparse import hstack, csr_matrix
10
+ from sklearn.feature_extraction.text import TfidfVectorizer
11
+ from sklearn.preprocessing import MaxAbsScaler
12
+ from sklearn.base import BaseEstimator, TransformerMixin
13
+
14
+ from config import TFIDF_WORD_PARAMS, TFIDF_CHAR_PARAMS
15
+
16
+
17
+ # --- Text Preprocessing ---
18
+
19
def strip_cot(text):
    """Remove <think>...</think> blocks from text."""
    without_think = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    return without_think.strip()
22
+
23
+
24
def has_cot(text):
    """Check if text contains <think>...</think> blocks."""
    match = re.search(r"<think>.*?</think>", text, flags=re.DOTALL)
    return match is not None
27
+
28
+
29
def cot_ratio(text):
    """Ratio of thinking text to total text length."""
    blocks = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    # No think blocks (or empty input) means a ratio of zero.
    if not blocks or len(text) == 0:
        return 0.0
    total_think = sum(map(len, blocks))
    return total_think / len(text)
36
+
37
+
38
+ # --- Stylometric Features ---
39
+
40
class StylometricFeatures(BaseEstimator, TransformerMixin):
    """Extract stylometric features from text.

    Produces a fixed-order feature vector per document (see _extract's
    return). The column order is load-bearing: downstream models are
    trained against these positions, so it must never change.
    """

    def fit(self, X, y=None):
        # Stateless transformer — nothing to learn.
        return self

    def transform(self, X):
        """Transform an iterable of texts into a sparse float32 feature matrix."""
        features = []
        for text in X:
            features.append(self._extract(text))
        return csr_matrix(np.array(features, dtype=np.float32))

    def _extract(self, text):
        """Return the per-document feature list (fixed order — see return)."""
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]
        words = text.split()

        # max(..., 1) guards every density below against division by zero.
        n_chars = max(len(text), 1)
        n_words = max(len(words), 1)
        n_sentences = max(len(sentences), 1)

        # Basic stats
        avg_word_len = np.mean([len(w) for w in words]) if words else 0
        avg_sent_len = n_words / n_sentences

        # Punctuation densities (normalized per character)
        n_commas = text.count(",") / n_chars
        n_semicolons = text.count(";") / n_chars
        n_colons = text.count(":") / n_chars
        n_exclaim = text.count("!") / n_chars
        n_question = text.count("?") / n_chars
        n_ellipsis = text.count("...") / n_chars
        n_dash = (text.count("—") + text.count("--")) / n_chars

        # Markdown elements (normalized per sentence)
        n_headers = len(re.findall(r'^#{1,6}\s', text, re.MULTILINE)) / n_sentences
        n_bold = len(re.findall(r'\*\*.*?\*\*', text)) / n_sentences
        n_italic = len(re.findall(r'(?<!\*)\*(?!\*).*?(?<!\*)\*(?!\*)', text)) / n_sentences
        n_code_blocks = len(re.findall(r'```', text)) / n_sentences
        n_inline_code = len(re.findall(r'`[^`]+`', text)) / n_sentences
        n_bullet = len(re.findall(r'^[\s]*[-*+]\s', text, re.MULTILINE)) / n_sentences
        n_numbered = len(re.findall(r'^\s*\d+[.)]\s', text, re.MULTILINE)) / n_sentences

        # Vocabulary richness
        unique_words = len(set(w.lower() for w in words))
        ttr = unique_words / n_words  # type-token ratio

        # Paragraph structure (blank-line-separated)
        paragraphs = text.split("\n\n")
        n_paragraphs = len([p for p in paragraphs if p.strip()])
        avg_para_len = n_words / max(n_paragraphs, 1)

        # Special patterns characteristic of assistant-style replies
        starts_with_certainly = 1.0 if re.match(r'^(Certainly|Of course|Sure|Absolutely|Great question)', text, re.IGNORECASE) else 0.0
        has_disclaimer = 1.0 if re.search(r"(I'm an AI|as an AI|language model|I cannot|I can't help)", text, re.IGNORECASE) else 0.0

        # CoT features (present even in no-CoT mode, just will be 0)
        has_think = 1.0 if has_cot(text) else 0.0
        think_ratio = cot_ratio(text)

        return [
            avg_word_len, avg_sent_len,
            n_commas, n_semicolons, n_colons, n_exclaim, n_question,
            n_ellipsis, n_dash,
            n_headers, n_bold, n_italic, n_code_blocks, n_inline_code,
            n_bullet, n_numbered,
            ttr, n_paragraphs, avg_para_len,
            starts_with_certainly, has_disclaimer,
            has_think, think_ratio,
            n_chars, n_words,
        ]
111
+
112
+
113
+ # --- Feature Pipeline ---
114
+
115
class FeaturePipeline:
    """Combined TF-IDF + stylometric feature pipeline.

    Feature layout is [word TF-IDF | char TF-IDF | stylometric], scaled
    with MaxAbsScaler (sparse-safe). transform() may only be called after
    fit_transform() so vocabularies and scaler statistics exist.
    """

    def __init__(self):
        self.word_tfidf = TfidfVectorizer(**TFIDF_WORD_PARAMS)
        self.char_tfidf = TfidfVectorizer(**TFIDF_CHAR_PARAMS)
        self.stylo = StylometricFeatures()
        self.scaler = MaxAbsScaler()

    def fit_transform(self, texts):
        """Fit and transform texts into feature matrix."""
        import time
        print(f" Input: {len(texts)} texts")

        # Strip <think> blocks for TF-IDF so n-grams learn style, not CoT
        texts_no_cot = [strip_cot(t) for t in texts]

        t0 = time.time()
        word_features = self.word_tfidf.fit_transform(texts_no_cot)
        print(f" word tfidf: {word_features.shape[1]} features ({time.time()-t0:.1f}s)")

        t0 = time.time()
        char_features = self.char_tfidf.fit_transform(texts_no_cot)
        print(f" char tfidf: {char_features.shape[1]} features ({time.time()-t0:.1f}s)")

        # Stylometric uses original text (has_think, think_ratio still work)
        t0 = time.time()
        stylo_features = self.stylo.fit_transform(texts)
        print(f" stylometric: {stylo_features.shape[1]} features ({time.time()-t0:.1f}s)")

        combined = hstack([word_features, char_features, stylo_features])
        combined = self.scaler.fit_transform(combined)
        print(f" Combined feature matrix: {combined.shape}")
        return combined

    def transform(self, texts):
        """Transform texts into feature matrix (after fitting).

        Column order must match fit_transform exactly.
        """
        texts_no_cot = [strip_cot(t) for t in texts]
        word_features = self.word_tfidf.transform(texts_no_cot)
        char_features = self.char_tfidf.transform(texts_no_cot)
        stylo_features = self.stylo.transform(texts)
        combined = hstack([word_features, char_features, stylo_features])
        return self.scaler.transform(combined)
model.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AIFinder Neural Network
3
+ Single-headed MLP: predicts provider only.
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+
10
class AIFinderNet(nn.Module):
    """Single-headed classifier: predicts provider only.

    Attribute names (backbone, provider_head) and the Sequential layer
    indices are kept stable for state_dict compatibility.
    """

    def __init__(
        self,
        input_dim,
        num_providers,
        hidden_dim=1024,
        embed_dim=256,
        dropout=0.3,
    ):
        super().__init__()
        # Two Linear -> BatchNorm -> ReLU -> Dropout stages compress the
        # sparse feature vector into a dense embedding.
        stages = [
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, embed_dim),
            nn.BatchNorm1d(embed_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
        ]
        self.backbone = nn.Sequential(*stages)
        self.provider_head = nn.Linear(embed_dim, num_providers)

    def forward(self, x):
        """Map features (batch, input_dim) to provider logits (batch, num_providers)."""
        embedding = self.backbone(x)
        return self.provider_head(embedding)
models/aifinder_trained.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87706447863f7cae3a6295d06ecbfb35333b2f05f670d5b47133a76757b6377f
3
+ size 165033273
models/classifier.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b0f36a1dd01e6375df5980017b582ff62469b59e5cc9d37b349fc5c48aa5734
3
+ size 165211381
models/feature_pipeline.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb73af66efcc5a3be022451e0e5ed6871d4df6cf0522e9f8d338f9079a57c267
3
+ size 2094058
models/model_enc.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d7a8a7bc3087ebbd1634bc9663e9f9fe4701c4872e3bcac7fe37671eaa93f79
3
+ size 1999
models/provider_enc.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7552c92dfc1d08686d9c6f360321e8e45df52e78f2b2eb450ccf117f29aaf62d
3
+ size 727
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets>=4.0
2
+ scikit-learn>=1.5
3
+ numpy>=1.26
4
+ scipy>=1.12
5
+ joblib>=1.3
6
+ tqdm>=4.60
7
+ torch>=2.0
8
+ gradio>=5.0
9
+ pandas>=2.0
10
+ huggingface_hub>=0.23.0
11
+ flask>=3.0
12
+ flask-cors>=4.0
static/index.html ADDED
@@ -0,0 +1,742 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>AIFinder - Identify AI Responses</title>
7
+ <style>
8
+ @import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;600&family=Outfit:wght@300;400;500;600;700&display=swap');
9
+
10
+ * {
11
+ margin: 0;
12
+ padding: 0;
13
+ box-sizing: border-box;
14
+ }
15
+
16
+ :root {
17
+ --bg-primary: #0d0d0d;
18
+ --bg-secondary: #171717;
19
+ --bg-tertiary: #1f1f1f;
20
+ --bg-elevated: #262626;
21
+ --text-primary: #f5f5f5;
22
+ --text-secondary: #a3a3a3;
23
+ --text-muted: #737373;
24
+ --accent: #e85d04;
25
+ --accent-hover: #f48c06;
26
+ --accent-muted: #9c4300;
27
+ --success: #22c55e;
28
+ --success-muted: #166534;
29
+ --border: #333333;
30
+ --border-light: #404040;
31
+ }
32
+
33
+ body {
34
+ font-family: 'Outfit', -apple-system, sans-serif;
35
+ background: var(--bg-primary);
36
+ color: var(--text-primary);
37
+ min-height: 100vh;
38
+ line-height: 1.6;
39
+ }
40
+
41
+ .container {
42
+ max-width: 900px;
43
+ margin: 0 auto;
44
+ padding: 2rem 1.5rem;
45
+ }
46
+
47
+ header {
48
+ text-align: center;
49
+ margin-bottom: 3rem;
50
+ padding-top: 1rem;
51
+ }
52
+
53
+ .logo {
54
+ font-size: 2.5rem;
55
+ font-weight: 700;
56
+ letter-spacing: -0.05em;
57
+ margin-bottom: 0.5rem;
58
+ }
59
+
60
+ .logo span {
61
+ color: var(--accent);
62
+ }
63
+
64
+ .tagline {
65
+ color: var(--text-secondary);
66
+ font-size: 1rem;
67
+ font-weight: 300;
68
+ }
69
+
70
+ .card {
71
+ background: var(--bg-secondary);
72
+ border: 1px solid var(--border);
73
+ border-radius: 12px;
74
+ padding: 1.5rem;
75
+ margin-bottom: 1.5rem;
76
+ transition: border-color 0.2s ease;
77
+ }
78
+
79
+ .card:focus-within {
80
+ border-color: var(--border-light);
81
+ }
82
+
83
+ .card-label {
84
+ font-size: 0.75rem;
85
+ text-transform: uppercase;
86
+ letter-spacing: 0.1em;
87
+ color: var(--text-muted);
88
+ margin-bottom: 0.75rem;
89
+ font-weight: 500;
90
+ }
91
+
92
+ textarea {
93
+ width: 100%;
94
+ background: var(--bg-tertiary);
95
+ border: 1px solid var(--border);
96
+ border-radius: 8px;
97
+ padding: 1rem;
98
+ color: var(--text-primary);
99
+ font-family: 'JetBrains Mono', monospace;
100
+ font-size: 0.875rem;
101
+ resize: vertical;
102
+ min-height: 180px;
103
+ transition: border-color 0.2s ease;
104
+ }
105
+
106
+ textarea:focus {
107
+ outline: none;
108
+ border-color: var(--accent-muted);
109
+ }
110
+
111
+ textarea::placeholder {
112
+ color: var(--text-muted);
113
+ }
114
+
115
+ .btn {
116
+ display: inline-flex;
117
+ align-items: center;
118
+ justify-content: center;
119
+ gap: 0.5rem;
120
+ padding: 0.75rem 1.5rem;
121
+ border-radius: 8px;
122
+ font-family: 'Outfit', sans-serif;
123
+ font-size: 0.9rem;
124
+ font-weight: 500;
125
+ cursor: pointer;
126
+ transition: all 0.2s ease;
127
+ border: none;
128
+ }
129
+
130
+ .btn-primary {
131
+ background: var(--accent);
132
+ color: white;
133
+ }
134
+
135
+ .btn-primary:hover:not(:disabled) {
136
+ background: var(--accent-hover);
137
+ }
138
+
139
+ .btn-primary:disabled {
140
+ opacity: 0.5;
141
+ cursor: not-allowed;
142
+ }
143
+
144
+ .btn-secondary {
145
+ background: var(--bg-tertiary);
146
+ color: var(--text-primary);
147
+ border: 1px solid var(--border);
148
+ }
149
+
150
+ .btn-secondary:hover:not(:disabled) {
151
+ background: var(--bg-elevated);
152
+ border-color: var(--border-light);
153
+ }
154
+
155
+ .btn-group {
156
+ display: flex;
157
+ gap: 0.75rem;
158
+ flex-wrap: wrap;
159
+ }
160
+
161
+ .results {
162
+ display: none;
163
+ }
164
+
165
+ .results.visible {
166
+ display: block;
167
+ animation: fadeIn 0.3s ease;
168
+ }
169
+
170
+ @keyframes fadeIn {
171
+ from { opacity: 0; transform: translateY(10px); }
172
+ to { opacity: 1; transform: translateY(0); }
173
+ }
174
+
175
+ .result-main {
176
+ display: flex;
177
+ align-items: center;
178
+ justify-content: space-between;
179
+ padding: 1.25rem;
180
+ background: var(--bg-tertiary);
181
+ border-radius: 8px;
182
+ margin-bottom: 1rem;
183
+ }
184
+
185
+ .result-provider {
186
+ font-size: 1.5rem;
187
+ font-weight: 600;
188
+ }
189
+
190
+ .result-confidence {
191
+ font-size: 1.25rem;
192
+ font-weight: 500;
193
+ color: var(--accent);
194
+ }
195
+
196
+ .result-bar {
197
+ height: 8px;
198
+ background: var(--bg-elevated);
199
+ border-radius: 4px;
200
+ margin-bottom: 1rem;
201
+ overflow: hidden;
202
+ }
203
+
204
+ .result-bar-fill {
205
+ height: 100%;
206
+ background: var(--accent);
207
+ border-radius: 4px;
208
+ transition: width 0.5s ease;
209
+ }
210
+
211
+ .result-list {
212
+ list-style: none;
213
+ }
214
+
215
+ .result-item {
216
+ display: flex;
217
+ align-items: center;
218
+ justify-content: space-between;
219
+ padding: 0.75rem 0;
220
+ border-bottom: 1px solid var(--border);
221
+ }
222
+
223
+ .result-item:last-child {
224
+ border-bottom: none;
225
+ }
226
+
227
+ .result-name {
228
+ font-weight: 500;
229
+ }
230
+
231
+ .result-percent {
232
+ font-family: 'JetBrains Mono', monospace;
233
+ color: var(--text-secondary);
234
+ font-size: 0.875rem;
235
+ }
236
+
237
+ .correction {
238
+ display: none;
239
+ margin-top: 1.5rem;
240
+ padding-top: 1.5rem;
241
+ border-top: 1px solid var(--border);
242
+ }
243
+
244
+ .correction.visible {
245
+ display: block;
246
+ animation: fadeIn 0.3s ease;
247
+ }
248
+
249
+ .correction-title {
250
+ font-size: 0.875rem;
251
+ font-weight: 500;
252
+ margin-bottom: 0.75rem;
253
+ color: var(--text-secondary);
254
+ }
255
+
256
+ select {
257
+ width: 100%;
258
+ padding: 0.75rem 1rem;
259
+ background: var(--bg-tertiary);
260
+ border: 1px solid var(--border);
261
+ border-radius: 8px;
262
+ color: var(--text-primary);
263
+ font-family: 'Outfit', sans-serif;
264
+ font-size: 0.9rem;
265
+ margin-bottom: 0.75rem;
266
+ cursor: pointer;
267
+ }
268
+
269
+ select:focus {
270
+ outline: none;
271
+ border-color: var(--accent-muted);
272
+ }
273
+
274
+ .stats {
275
+ display: flex;
276
+ gap: 1.5rem;
277
+ margin-bottom: 1.5rem;
278
+ flex-wrap: wrap;
279
+ }
280
+
281
+ .stat {
282
+ background: var(--bg-secondary);
283
+ border: 1px solid var(--border);
284
+ border-radius: 8px;
285
+ padding: 1rem 1.25rem;
286
+ flex: 1;
287
+ min-width: 120px;
288
+ }
289
+
290
+ .stat-value {
291
+ font-size: 1.5rem;
292
+ font-weight: 600;
293
+ color: var(--accent);
294
+ }
295
+
296
+ .stat-label {
297
+ font-size: 0.75rem;
298
+ color: var(--text-muted);
299
+ text-transform: uppercase;
300
+ letter-spacing: 0.05em;
301
+ }
302
+
303
+ .actions {
304
+ display: flex;
305
+ gap: 0.75rem;
306
+ margin-top: 1rem;
307
+ }
308
+
309
+ .toast {
310
+ position: fixed;
311
+ bottom: 2rem;
312
+ right: 2rem;
313
+ background: var(--bg-elevated);
314
+ border: 1px solid var(--border);
315
+ border-radius: 8px;
316
+ padding: 1rem 1.5rem;
317
+ color: var(--text-primary);
318
+ font-size: 0.9rem;
319
+ opacity: 0;
320
+ transform: translateY(20px);
321
+ transition: all 0.3s ease;
322
+ z-index: 1000;
323
+ }
324
+
325
+ .toast.visible {
326
+ opacity: 1;
327
+ transform: translateY(0);
328
+ }
329
+
330
+ .toast.success {
331
+ border-color: var(--success-muted);
332
+ }
333
+
334
+ .footer {
335
+ text-align: center;
336
+ margin-top: 3rem;
337
+ padding: 1.5rem;
338
+ color: var(--text-muted);
339
+ font-size: 0.8rem;
340
+ }
341
+
342
+ .footer a {
343
+ color: var(--text-secondary);
344
+ text-decoration: none;
345
+ }
346
+
347
+ .footer a:hover {
348
+ color: var(--accent);
349
+ }
350
+
351
+ .loading {
352
+ display: inline-block;
353
+ width: 16px;
354
+ height: 16px;
355
+ border: 2px solid var(--text-muted);
356
+ border-top-color: var(--accent);
357
+ border-radius: 50%;
358
+ animation: spin 0.8s linear infinite;
359
+ }
360
+
361
+ @keyframes spin {
362
+ to { transform: rotate(360deg); }
363
+ }
364
+
365
+ .status-indicator {
366
+ display: inline-flex;
367
+ align-items: center;
368
+ gap: 0.5rem;
369
+ font-size: 0.8rem;
370
+ color: var(--text-muted);
371
+ margin-bottom: 1rem;
372
+ }
373
+
374
+ .status-dot {
375
+ width: 8px;
376
+ height: 8px;
377
+ border-radius: 50%;
378
+ background: var(--success);
379
+ }
380
+
381
+ .status-dot.loading {
382
+ background: var(--accent);
383
+ animation: pulse 1s ease infinite;
384
+ }
385
+
386
+ @keyframes pulse {
387
+ 0%, 100% { opacity: 1; }
388
+ 50% { opacity: 0.5; }
389
+ }
390
+
391
+ .empty-state {
392
+ text-align: center;
393
+ padding: 3rem 1rem;
394
+ color: var(--text-muted);
395
+ }
396
+
397
+ .empty-state-icon {
398
+ font-size: 3rem;
399
+ margin-bottom: 1rem;
400
+ opacity: 0.5;
401
+ }
402
+
403
+ @media (max-width: 600px) {
404
+ .container {
405
+ padding: 1rem;
406
+ }
407
+
408
+ .logo {
409
+ font-size: 2rem;
410
+ }
411
+
412
+ .btn-group {
413
+ flex-direction: column;
414
+ }
415
+
416
+ .btn {
417
+ width: 100%;
418
+ }
419
+
420
+ .result-main {
421
+ flex-direction: column;
422
+ gap: 0.5rem;
423
+ text-align: center;
424
+ }
425
+ }
426
+ </style>
427
+ </head>
428
+ <body>
429
+ <div class="container">
430
+ <header>
431
+ <div class="logo">AI<span>Finder</span></div>
432
+ <p class="tagline">Identify which AI provider generated a response</p>
433
+ </header>
434
+
435
+ <div class="status-indicator">
436
+ <span class="status-dot" id="statusDot"></span>
437
+ <span id="statusText">Connecting to API...</span>
438
+ </div>
439
+
440
+ <div class="card">
441
+ <div class="card-label">Paste AI Response</div>
442
+ <textarea id="inputText" placeholder="Paste an AI response here to identify which provider generated it..."></textarea>
443
+ </div>
444
+
445
+ <div class="btn-group">
446
+ <button class="btn btn-primary" id="classifyBtn" disabled>
447
+ <span id="classifyBtnText">Classify</span>
448
+ </button>
449
+ <button class="btn btn-secondary" id="clearBtn">Clear</button>
450
+ </div>
451
+
452
+ <div class="results" id="results">
453
+ <div class="card">
454
+ <div class="card-label">Result</div>
455
+ <div class="result-main">
456
+ <span class="result-provider" id="resultProvider">-</span>
457
+ <span class="result-confidence" id="resultConfidence">-</span>
458
+ </div>
459
+ <div class="result-bar">
460
+ <div class="result-bar-fill" id="resultBar" style="width: 0%"></div>
461
+ </div>
462
+ <ul class="result-list" id="resultList"></ul>
463
+ </div>
464
+
465
+ <div class="correction" id="correction">
466
+ <div class="correction-title">Wrong? Correct the provider to train the model:</div>
467
+ <select id="providerSelect"></select>
468
+ <button class="btn btn-primary" id="trainBtn">Train & Save</button>
469
+ </div>
470
+ </div>
471
+
472
+ <div class="stats" id="stats" style="display: none;">
473
+ <div class="stat">
474
+ <div class="stat-value" id="correctionsCount">0</div>
475
+ <div class="stat-label">Corrections</div>
476
+ </div>
477
+ <div class="stat">
478
+ <div class="stat-value" id="sessionCount">0</div>
479
+ <div class="stat-label">Session</div>
480
+ </div>
481
+ </div>
482
+
483
+ <div class="actions" id="actions" style="display: none;">
484
+ <button class="btn btn-secondary" id="exportBtn">Export Trained Model</button>
485
+ <button class="btn btn-secondary" id="resetBtn">Reset Training</button>
486
+ </div>
487
+
488
+ <div class="footer">
489
+ <p>AIFinder &mdash; Train on corrections to improve accuracy</p>
490
+ <p style="margin-top: 0.5rem;">
491
+                Want to contribute? Try it out and post your results to the
492
+                <a href="https://huggingface.co/spaces" target="_blank">HuggingFace Spaces Community</a>
493
+                if you want it merged!
494
+ </p>
495
+ </div>
496
+ </div>
497
+
498
+ <div class="toast" id="toast"></div>
499
+
500
+ <script>
501
// API base URL: talk to the dev server on port 7860 when served from
// localhost; otherwise use same-origin relative paths.
const API_BASE = window.location.hostname === 'localhost' || window.location.hostname === '127.0.0.1'
    ? 'http://localhost:7860'
    : '';

// Mutable client-side state.
let providers = [];
let correctionsCount = 0;
let sessionCorrections = 0;

// Cached element handles, one per interactive node in the page.
const byId = (id) => document.getElementById(id);
const inputText = byId('inputText');
const classifyBtn = byId('classifyBtn');
const classifyBtnText = byId('classifyBtnText');
const clearBtn = byId('clearBtn');
const results = byId('results');
const resultProvider = byId('resultProvider');
const resultConfidence = byId('resultConfidence');
const resultBar = byId('resultBar');
const resultList = byId('resultList');
const correction = byId('correction');
const providerSelect = byId('providerSelect');
const trainBtn = byId('trainBtn');
const stats = byId('stats');
const correctionsCountEl = byId('correctionsCount');
const sessionCountEl = byId('sessionCount');
const actions = byId('actions');
const exportBtn = byId('exportBtn');
const resetBtn = byId('resetBtn');
const toast = byId('toast');
const statusDot = byId('statusDot');
const statusText = byId('statusText');
530
+
531
// Flash a transient notification in the toast element for three seconds.
// A `type` of 'success' adds the success accent class; anything else is neutral.
function showToast(message, type = 'info') {
    const classes = ['toast', 'visible'];
    if (type === 'success') {
        classes.push('success');
    }
    toast.textContent = message;
    toast.className = classes.join(' ');
    setTimeout(() => toast.classList.remove('visible'), 3000);
}
538
+
539
// Poll the backend until the models report loaded, then enable the UI.
// Unreachable API => retry every 2s; reachable but still loading => every 1s.
async function checkStatus() {
    try {
        const response = await fetch(`${API_BASE}/api/status`);
        const status = await response.json();
        if (!status.loaded) {
            // Models still warming up — poll again shortly.
            setTimeout(checkStatus, 1000);
            return;
        }
        statusDot.classList.remove('loading');
        statusText.textContent = `Ready (${status.device})`;
        classifyBtn.disabled = false;
        loadProviders();
        loadStats();
    } catch (err) {
        // API unreachable — show connecting state and retry at a slower cadence.
        statusDot.classList.add('loading');
        statusText.textContent = 'Connecting to API...';
        setTimeout(checkStatus, 2000);
    }
}
558
+
559
// Fetch the list of known providers and populate the correction dropdown.
// Options are built via the Option constructor instead of innerHTML string
// interpolation, so a provider name can never be parsed as markup
// (closes a latent HTML-injection vector in the original code).
async function loadProviders() {
    const res = await fetch(`${API_BASE}/api/providers`);
    const data = await res.json();
    providers = data.providers;

    providerSelect.innerHTML = '';
    for (const p of providers) {
        // new Option(text, value) sets both the label and the submitted value.
        providerSelect.appendChild(new Option(p, p));
    }
}
568
+
569
// Restore the persisted correction counter from localStorage and, when any
// corrections exist, reveal the stats panel and the export/reset actions.
function loadStats() {
    const stored = localStorage.getItem('aifinder_corrections');
    if (stored !== null && stored !== '') {
        correctionsCount = parseInt(stored, 10);
        correctionsCountEl.textContent = correctionsCount;
        stats.style.display = 'flex';
        actions.style.display = 'flex';
    }
    sessionCountEl.textContent = sessionCorrections;
}
579
+
580
// Persist the all-time correction counter across page reloads.
function saveStats() {
    localStorage.setItem('aifinder_corrections', String(correctionsCount));
}
583
+
584
// Send the textarea contents to the classify endpoint and render the result.
// Rejects inputs under 20 characters; shows a spinner while in flight.
async function classify() {
    const text = inputText.value.trim();
    if (text.length < 20) {
        showToast('Text must be at least 20 characters');
        return;
    }

    classifyBtn.disabled = true;
    classifyBtnText.innerHTML = '<span class="loading"></span>';

    try {
        const request = {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ text })
        };
        const res = await fetch(`${API_BASE}/api/classify`, request);
        if (!res.ok) {
            throw new Error('Classification failed');
        }
        showResults(await res.json());
    } catch (err) {
        showToast('Error: ' + err.message);
    } finally {
        // Always restore the button, success or failure.
        classifyBtn.disabled = false;
        classifyBtnText.textContent = 'Classify';
    }
}
614
+
615
// Render a classification response: top provider, confidence bar, ranked
// list, and the correction form. The ranked list is built from DOM nodes
// with textContent instead of innerHTML interpolation so provider names can
// never be interpreted as HTML (closes a latent injection vector).
function showResults(data) {
    resultProvider.textContent = data.provider;
    resultConfidence.textContent = data.confidence.toFixed(1) + '%';
    resultBar.style.width = data.confidence + '%';

    resultList.innerHTML = '';
    for (const p of data.top_providers) {
        const item = document.createElement('li');
        item.className = 'result-item';

        const name = document.createElement('span');
        name.className = 'result-name';
        name.textContent = p.name;

        const percent = document.createElement('span');
        percent.className = 'result-percent';
        percent.textContent = p.confidence.toFixed(1) + '%';

        item.append(name, percent);
        resultList.appendChild(item);
    }

    // Pre-select the prediction so "correcting" defaults to confirming it.
    providerSelect.value = data.provider;

    results.classList.add('visible');
    correction.classList.add('visible');

    if (correctionsCount > 0 || sessionCorrections > 0) {
        stats.style.display = 'flex';
        actions.style.display = 'flex';
    }
}
637
+
638
// Submit the current text with a user-supplied correct provider label to the
// online-training endpoint, bump counters, and re-classify to show the
// updated prediction.
async function train() {
    const text = inputText.value.trim();
    const correctProvider = providerSelect.value;

    trainBtn.disabled = true;
    trainBtn.innerHTML = '<span class="loading"></span>';

    try {
        const request = {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ text, correct_provider: correctProvider })
        };
        const res = await fetch(`${API_BASE}/api/correct`, request);
        if (!res.ok) {
            throw new Error('Training failed');
        }

        const data = await res.json();
        correctionsCount += 1;
        sessionCorrections += 1;
        saveStats();
        correctionsCountEl.textContent = correctionsCount;
        sessionCountEl.textContent = sessionCorrections;

        showToast(`Trained! Loss: ${data.loss.toFixed(4)}`, 'success');

        stats.style.display = 'flex';
        actions.style.display = 'flex';

        // Re-run classification so the UI reflects the just-updated model.
        classify();
    } catch (err) {
        showToast('Error: ' + err.message);
    } finally {
        trainBtn.disabled = false;
        trainBtn.textContent = 'Train & Save';
    }
}
676
+
677
// Ask the backend to snapshot the trained model to disk, then trigger a
// browser download of the saved checkpoint via a synthetic anchor click.
async function exportModel() {
    exportBtn.disabled = true;
    exportBtn.innerHTML = '<span class="loading"></span>';

    try {
        const request = {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ filename: 'aifinder_trained.pt' })
        };
        const res = await fetch(`${API_BASE}/api/save`, request);
        if (!res.ok) {
            throw new Error('Save failed');
        }

        const data = await res.json();
        const anchor = document.createElement('a');
        anchor.href = `${API_BASE}/models/${data.filename}`;
        anchor.download = data.filename;
        anchor.click();

        showToast('Model exported!', 'success');
    } catch (err) {
        showToast('Error: ' + err.message);
    } finally {
        exportBtn.disabled = false;
        exportBtn.textContent = 'Export Trained Model';
    }
}
707
+
708
// Clear all correction counters (in memory and localStorage) after an
// explicit user confirmation, and hide the stats/actions panels.
function resetTraining() {
    const confirmed = confirm('Reset all training data? This cannot be undone.');
    if (!confirmed) {
        return;
    }

    correctionsCount = 0;
    sessionCorrections = 0;
    localStorage.removeItem('aifinder_corrections');
    correctionsCountEl.textContent = '0';
    sessionCountEl.textContent = '0';
    stats.style.display = 'none';
    actions.style.display = 'none';
    showToast('Training data reset');
}
722
+
723
// Wire up UI events and start the app.
classifyBtn.addEventListener('click', classify);
clearBtn.addEventListener('click', function () {
    inputText.value = '';
    results.classList.remove('visible');
    correction.classList.remove('visible');
});
trainBtn.addEventListener('click', train);
exportBtn.addEventListener('click', exportModel);
resetBtn.addEventListener('click', resetTraining);

// Ctrl+Enter inside the textarea is a shortcut for Classify.
inputText.addEventListener('keydown', function (e) {
    if (e.ctrlKey && e.key === 'Enter') {
        classify();
    }
});

// Begin polling the backend for readiness.
checkStatus();
740
+ </script>
741
+ </body>
742
+ </html>