RISHABH KUMAR committed on
Commit
162b166
·
1 Parent(s): 19f9b01

Add Quora duplicate detector Gradio app

Browse files
app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Gradio front-end for the Quora duplicate-question detector.

Intended for deployment on Hugging Face Spaces (Gradio SDK).
"""
import sys
from pathlib import Path

# Make both the project root and the streamlit-app folder importable so
# `helper` and the `src` package resolve regardless of the working directory.
ROOT = Path(__file__).resolve().parent
for _extra_path in (ROOT, ROOT / "streamlit-app"):
    sys.path.insert(0, str(_extra_path))

import nltk

# The feature-engineering pipeline needs the NLTK stopword corpus.
nltk.download("stopwords", quiet=True)

import helper

import gradio as gr
19
+
20
def predict_fn(q1: str, q2: str, model_name: str):
    """Validate the inputs, run the selected model, and format the result.

    Returns a (markdown_message, duplicate_probability) pair matching the
    Gradio outputs (result text + probability slider).
    """
    first = (q1 or "").strip()
    second = (q2 or "").strip()

    # Guard clauses: reject empty / too-short input before touching a model.
    if not (first and second):
        return "⚠️ Please enter both questions.", 0.0
    if min(len(first), len(second)) < 3:
        return "⚠️ Questions should be at least 3 characters.", 0.0

    try:
        # The dropdown label embeds the model family name.
        backend = "classical" if "Classical" in model_name else "transformer"
        pred, proba = helper.predict(first, second, backend)

        if pred:
            verdict = "**Duplicate** — These questions likely have the same meaning."
        else:
            verdict = "**Not Duplicate** — These questions appear to be different."
        return verdict, proba
    except Exception as e:
        # Surface any backend failure directly in the UI instead of crashing.
        return f"❌ Error: {str(e)}", 0.0
+
43
+
44
# ---------------------------------------------------------------------------
# Model discovery: fail fast when no artifacts are available.
# ---------------------------------------------------------------------------
available = helper.get_available_models()
if not available:
    raise RuntimeError("No models found. Add models to models/ or configure HF Hub download.")

inference_times = helper.get_inference_times()


def _with_latency(display_name: str) -> str:
    """Append the benchmarked mean latency (when known) to a model label."""
    key = "classical" if "Classical" in display_name else "transformer"
    ms = inference_times.get(key, {}).get("mean_ms", 0)
    return f"{display_name} (~{ms:.0f} ms)" if ms else display_name


model_choices_with_time = [
    _with_latency(helper.get_model_display_name(m)) for m in available
]

# ---------------------------------------------------------------------------
# UI layout.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Quora Duplicate Detector", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🔍 Quora Duplicate Question Pairs")
    gr.Markdown("Enter two questions to check if they are semantically duplicate.")

    with gr.Row():
        # Left column: the two questions, the model picker, and the trigger.
        with gr.Column(scale=2):
            q1 = gr.Textbox(
                label="Question 1",
                placeholder="e.g. What is the capital of India?",
                lines=2,
            )
            q2 = gr.Textbox(
                label="Question 2",
                placeholder="e.g. Which city is India's capital?",
                lines=2,
            )
            model_dropdown = gr.Dropdown(
                label="Model",
                choices=model_choices_with_time,
                value=model_choices_with_time[0],
            )
            check_btn = gr.Button("Check", variant="primary")
        # Right column: verdict text and probability readout.
        with gr.Column(scale=1):
            result_text = gr.Markdown(value="")
            proba_slider = gr.Slider(
                minimum=0,
                maximum=1,
                value=0,
                label="Probability of Duplicate",
                interactive=False,
            )

    with gr.Accordion("Try example pairs", open=False):
        gr.Examples(
            examples=[
                ["How do I learn Python?", "What is the best way to learn Python programming?"],
                ["What is the capital of France?", "How do I cook pasta?"],
            ],
            inputs=[q1, q2],
            label="",
        )

    # Wire the button to the prediction callback.
    check_btn.click(
        fn=predict_fn,
        inputs=[q1, q2, model_dropdown],
        outputs=[result_text, proba_slider],
    )

    gr.Markdown("---")
    with gr.Accordion("About", open=False):
        gr.Markdown("""
        This app predicts whether two Quora questions are duplicates (same meaning).

        **Models:**
        - **Classical**: Random Forest or XGBoost on 25 handcrafted features + TF-IDF
        - **DistilBERT**: Fine-tuned transformer for sentence-pair classification

        *Built for fun & learning. Results may not always be accurate — use with caution.*
        """)

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core data science
2
+ numpy>=1.24,<3
3
+ pandas>=2.0
4
+ scikit-learn>=1.3
5
+ scipy>=1.11
6
+
7
+ # NLP & text
8
+ nltk>=3.8
9
+ beautifulsoup4>=4.12
10
+ fuzzywuzzy>=0.18
11
+ python-Levenshtein>=0.21
12
+ distance>=0.1.3
13
+
14
+ # Models
15
+ xgboost>=2.0
16
+ lightgbm>=4.0
17
+
18
+ # Embeddings (Phase 2)
19
+ torch>=2.0
20
+ sentence-transformers>=2.2
21
+
22
+ # Transformer fine-tuning
23
+ transformers>=4.30
24
+ datasets>=2.14
25
+ accelerate>=0.20
26
+
27
+ # App (Gradio for HF Spaces)
28
+ gradio>=4.0
29
+ huggingface_hub>=0.20
30
+
31
+ # Visualization
32
+ matplotlib>=3.7
33
+ seaborn>=0.13
34
+ plotly>=5.18
35
+
36
+ # Progress & utils
37
+ tqdm>=4.65
38
+
39
+ # Jupyter (for notebooks)
40
+ jupyter>=1.0
41
+ ipykernel>=6.0
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (159 Bytes). View file
 
src/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (163 Bytes). View file
 
src/__pycache__/embeddings.cpython-310.pyc ADDED
Binary file (1.26 kB). View file
 
src/__pycache__/embeddings.cpython-312.pyc ADDED
Binary file (1.79 kB). View file
 
src/__pycache__/feature_engineering.cpython-310.pyc ADDED
Binary file (5.06 kB). View file
 
src/__pycache__/feature_engineering.cpython-312.pyc ADDED
Binary file (9.79 kB). View file
 
src/__pycache__/model.cpython-310.pyc ADDED
Binary file (2.71 kB). View file
 
src/__pycache__/preprocessing.cpython-310.pyc ADDED
Binary file (4.84 kB). View file
 
src/__pycache__/preprocessing.cpython-312.pyc ADDED
Binary file (7.04 kB). View file
 
src/embeddings.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sentence Transformer embeddings for semantic similarity.
3
+ Uses MPS (Apple Silicon GPU) when available.
4
+ """
5
+ import numpy as np
6
+
7
+ _embedding_model = None
8
+
9
+
10
def get_embedding_model(device: str = None):
    """Return a cached SentenceTransformer, or None when deps are missing.

    The model is built once per process and reused afterwards. When no
    device is given, Apple-Silicon MPS is preferred over CPU.
    """
    global _embedding_model
    if _embedding_model is not None:
        return _embedding_model

    try:
        import torch
        from sentence_transformers import SentenceTransformer

        if device is None:
            device = "mps" if torch.backends.mps.is_available() else "cpu"
        _embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
        return _embedding_model
    except ImportError:
        # sentence-transformers / torch not installed: caller falls back.
        return None
+
27
+
28
def embedding_cosine_similarity(q1: str, q2: str, model=None) -> float:
    """Cosine similarity between the two questions' sentence embeddings.

    Falls back to 0.0 when no embedding model is available; a tiny
    epsilon keeps the division safe for zero-norm vectors.
    """
    if model is None:
        model = get_embedding_model()
        if model is None:
            return 0.0

    vec_a, vec_b = model.encode([q1, q2])
    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b) + 1e-9
    return float(np.dot(vec_a, vec_b) / denominator)
src/feature_engineering.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Feature extraction for Quora question pairs.
"""
import distance
from fuzzywuzzy import fuzz
import numpy as np

from .preprocessing import preprocess


def _load_stopwords() -> set:
    """Load the NLTK English stopword set, downloading the corpus if absent."""
    try:
        from nltk.corpus import stopwords
        return set(stopwords.words('english'))
    except LookupError:
        import nltk
        nltk.download('stopwords', quiet=True)
        from nltk.corpus import stopwords
        return set(stopwords.words('english'))


# NLTK stopwords (no pickle dependency).
STOP_WORDS = _load_stopwords()

# Small constant added to denominators to avoid division by zero.
SAFE_DIV = 0.0001
22
+
23
+ def _common_words(q1: str, q2: str) -> int:
24
+ w1 = set(word.lower().strip() for word in q1.split())
25
+ w2 = set(word.lower().strip() for word in q2.split())
26
+ return len(w1 & w2)
27
+
28
+
29
+ def _total_words(q1: str, q2: str) -> int:
30
+ w1 = set(word.lower().strip() for word in q1.split())
31
+ w2 = set(word.lower().strip() for word in q2.split())
32
+ return len(w1) + len(w2)
33
+
34
+
35
def _fetch_token_features(q1: str, q2: str) -> list:
    """Eight token-overlap ratios/flags for the question pair.

    Layout: [word overlap over min/max, stopword overlap over min/max,
    token overlap over min/max, last-token match, first-token match].
    Returns all zeros when either question has no tokens.
    """
    features = [0.0] * 8

    tokens1, tokens2 = q1.split(), q2.split()
    if not tokens1 or not tokens2:
        return features

    # Split each token stream into content words vs. stopwords.
    words1 = {t for t in tokens1 if t not in STOP_WORDS}
    words2 = {t for t in tokens2 if t not in STOP_WORDS}
    stops1 = {t for t in tokens1 if t in STOP_WORDS}
    stops2 = {t for t in tokens2 if t in STOP_WORDS}

    shared_words = len(words1 & words2)
    shared_stops = len(stops1 & stops2)
    shared_tokens = len(set(tokens1) & set(tokens2))

    # SAFE_DIV keeps every denominator strictly positive.
    features[0] = shared_words / (min(len(words1), len(words2)) + SAFE_DIV)
    features[1] = shared_words / (max(len(words1), len(words2)) + SAFE_DIV)
    features[2] = shared_stops / (min(len(stops1), len(stops2)) + SAFE_DIV)
    features[3] = shared_stops / (max(len(stops1), len(stops2)) + SAFE_DIV)
    features[4] = shared_tokens / (min(len(tokens1), len(tokens2)) + SAFE_DIV)
    features[5] = shared_tokens / (max(len(tokens1), len(tokens2)) + SAFE_DIV)
    features[6] = int(tokens1[-1] == tokens2[-1])
    features[7] = int(tokens1[0] == tokens2[0])

    return features
63
+
64
+
65
def _fetch_length_features(q1: str, q2: str) -> list:
    """Three length features: absolute token-count difference, mean token
    count, and the longest-common-substring length normalised by the
    shorter question. Returns zeros when either question is empty."""
    features = [0.0] * 3

    tokens1, tokens2 = q1.split(), q2.split()
    if not tokens1 or not tokens2:
        return features

    features[0] = abs(len(tokens1) - len(tokens2))
    features[1] = (len(tokens1) + len(tokens2)) / 2

    # distance.lcsubstrings can yield nothing for disjoint strings;
    # guard against indexing an empty result (would raise IndexError).
    substrings = list(distance.lcsubstrings(q1, q2))
    if substrings:
        features[2] = len(substrings[0]) / (min(len(q1), len(q2)) + 1)

    return features
85
+
86
+
87
def _fetch_fuzzy_features(q1: str, q2: str) -> list:
    """Four fuzzywuzzy similarity scores (0-100) for the pair."""
    scorers = (fuzz.QRatio, fuzz.partial_ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio)
    return [scorer(q1, q2) for scorer in scorers]
94
+
95
+
96
+ def _jaccard_similarity(q1: str, q2: str) -> float:
97
+ """|intersection| / |union| of word sets."""
98
+ w1 = set(word.lower().strip() for word in q1.split())
99
+ w2 = set(word.lower().strip() for word in q2.split())
100
+ if not w1 and not w2:
101
+ return 0.0
102
+ inter = len(w1 & w2)
103
+ union = len(w1 | w2)
104
+ return inter / union if union else 0.0
105
+
106
+
107
+ def _sentence_length_ratio(q1: str, q2: str) -> float:
108
+ """min(word_count) / max(word_count)."""
109
+ n1, n2 = len(q1.split()), len(q2.split())
110
+ if max(n1, n2) == 0:
111
+ return 0.0
112
+ return min(n1, n2) / max(n1, n2)
113
+
114
+
115
def query_point_creator(
    q1: str, q2: str, vectorizer, embedding_model=None
) -> np.ndarray:
    """Build the full feature row for one question pair.

    Concatenates the handcrafted features with the bag-of-words vectors of
    both questions. `vectorizer` must be a fitted CountVectorizer or
    TfidfVectorizer. When `embedding_model` is given, a semantic
    cosine-similarity feature is appended to the handcrafted part.
    """
    q1 = preprocess(q1)
    q2 = preprocess(q2)

    # Basic length / overlap statistics.
    handcrafted = [
        len(q1),
        len(q2),
        len(q1.split()),
        len(q2.split()),
        _common_words(q1, q2),
        _total_words(q1, q2),
        round(_common_words(q1, q2) / (_total_words(q1, q2) + SAFE_DIV), 2),
    ]
    handcrafted += _fetch_token_features(q1, q2)
    handcrafted += _fetch_length_features(q1, q2)
    handcrafted += _fetch_fuzzy_features(q1, q2)
    handcrafted += [_jaccard_similarity(q1, q2), _sentence_length_ratio(q1, q2)]

    # Optional semantic similarity from a sentence-transformer model.
    if embedding_model is not None:
        from .embeddings import embedding_cosine_similarity
        handcrafted.append(embedding_cosine_similarity(q1, q2, embedding_model))

    q1_bow = vectorizer.transform([q1]).toarray()
    q2_bow = vectorizer.transform([q2]).toarray()

    # Shape (1, n_handcrafted + 2 * vocab_size).
    return np.hstack((np.array(handcrafted).reshape(1, len(handcrafted)), q1_bow, q2_bow))
src/model.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Model training and evaluation utilities.
3
+ """
4
+ import numpy as np
5
+ from sklearn.model_selection import StratifiedKFold
6
+ from sklearn.base import clone
7
+ from sklearn.metrics import (
8
+ accuracy_score,
9
+ log_loss,
10
+ precision_score,
11
+ recall_score,
12
+ f1_score,
13
+ roc_auc_score,
14
+ confusion_matrix,
15
+ )
16
+
17
+
18
def evaluate_model(model, X_test, y_test, prefix: str = ""):
    """Compute standard binary-classification metrics for `model`.

    Always includes accuracy/precision/recall/f1; log_loss and auc_roc are
    added when the model exposes predict_proba, with NaN substituted when
    a probabilistic metric cannot be computed (e.g. single-class y_test).
    """
    y_pred = model.predict(X_test)
    y_proba = None
    if hasattr(model, "predict_proba"):
        # Probability of the positive class.
        y_proba = model.predict_proba(X_test)[:, 1]

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1": f1_score(y_test, y_pred, zero_division=0),
    }

    if y_proba is not None:
        for name, scorer in (("log_loss", log_loss), ("auc_roc", roc_auc_score)):
            try:
                metrics[name] = scorer(y_test, y_proba)
            except ValueError:
                metrics[name] = float("nan")

    return metrics
44
+
45
+
46
def print_metrics(metrics: dict, prefix: str = ""):
    """Pretty-print a metrics dict, formatting finite floats to 4 dp."""
    header = f"{prefix} " if prefix else ""
    print(f"\n--- {header}Metrics ---")
    for name, value in metrics.items():
        is_finite_float = isinstance(value, float) and not np.isnan(value)
        if is_finite_float:
            print(f" {name}: {value:.4f}")
        else:
            print(f" {name}: {value}")
    print()
56
+
57
+
58
def stratified_cv_evaluate(model, X, y, n_folds: int = 5, random_state: int = 42):
    """Stratified K-fold cross-validation.

    A fresh clone of `model` is fitted on each fold; per-fold F1/AUC are
    printed as the folds complete. Returns (mean_metrics, fold_metrics),
    where NaN values are excluded from the per-metric means.
    """
    from tqdm import tqdm

    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    fold_metrics = []
    progress = tqdm(
        enumerate(splitter.split(X, y)),
        total=n_folds,
        desc="CV folds",
        unit="fold",
    )
    for fold, (train_idx, val_idx) in progress:
        fitted = clone(model)
        fitted.fit(X[train_idx], y[train_idx])
        scores = evaluate_model(fitted, X[val_idx], y[val_idx])
        fold_metrics.append(scores)
        print(f" Fold {fold + 1}: F1={scores['f1']:.4f}, AUC={scores.get('auc_roc', 0):.4f}")

    # Average each metric over the folds, skipping NaN entries.
    mean_metrics = {}
    for key in fold_metrics[0]:
        usable = [
            m[key]
            for m in fold_metrics
            if not (isinstance(m[key], float) and np.isnan(m[key]))
        ]
        mean_metrics[key] = np.mean(usable) if usable else float("nan")

    return mean_metrics, fold_metrics
src/preprocessing.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text preprocessing for Quora question pairs.
3
+ """
4
+ import re
5
+ from bs4 import BeautifulSoup
6
+
7
+ # Module-level constant (avoid recreating on every call)
8
+ CONTRACTIONS = {
9
+ "ain't": "am not",
10
+ "aren't": "are not",
11
+ "can't": "can not",
12
+ "can't've": "can not have",
13
+ "'cause": "because",
14
+ "could've": "could have",
15
+ "couldn't": "could not",
16
+ "couldn't've": "could not have",
17
+ "didn't": "did not",
18
+ "doesn't": "does not",
19
+ "don't": "do not",
20
+ "hadn't": "had not",
21
+ "hadn't've": "had not have",
22
+ "hasn't": "has not",
23
+ "haven't": "have not",
24
+ "he'd": "he would",
25
+ "he'd've": "he would have",
26
+ "he'll": "he will",
27
+ "he'll've": "he will have",
28
+ "he's": "he is",
29
+ "how'd": "how did",
30
+ "how'd'y": "how do you",
31
+ "how'll": "how will",
32
+ "how's": "how is",
33
+ "i'd": "i would",
34
+ "i'd've": "i would have",
35
+ "i'll": "i will",
36
+ "i'll've": "i will have",
37
+ "i'm": "i am",
38
+ "i've": "i have",
39
+ "isn't": "is not",
40
+ "it'd": "it would",
41
+ "it'd've": "it would have",
42
+ "it'll": "it will",
43
+ "it'll've": "it will have",
44
+ "it's": "it is",
45
+ "let's": "let us",
46
+ "ma'am": "madam",
47
+ "mayn't": "may not",
48
+ "might've": "might have",
49
+ "mightn't": "might not",
50
+ "mightn't've": "might not have",
51
+ "must've": "must have",
52
+ "mustn't": "must not",
53
+ "mustn't've": "must not have",
54
+ "needn't": "need not",
55
+ "needn't've": "need not have",
56
+ "o'clock": "of the clock",
57
+ "oughtn't": "ought not",
58
+ "oughtn't've": "ought not have",
59
+ "shan't": "shall not",
60
+ "sha'n't": "shall not",
61
+ "shan't've": "shall not have",
62
+ "she'd": "she would",
63
+ "she'd've": "she would have",
64
+ "she'll": "she will",
65
+ "she'll've": "she will have",
66
+ "she's": "she is",
67
+ "should've": "should have",
68
+ "shouldn't": "should not",
69
+ "shouldn't've": "should not have",
70
+ "so've": "so have",
71
+ "so's": "so as",
72
+ "that'd": "that would",
73
+ "that'd've": "that would have",
74
+ "that's": "that is",
75
+ "there'd": "there would",
76
+ "there'd've": "there would have",
77
+ "there's": "there is",
78
+ "they'd": "they would",
79
+ "they'd've": "they would have",
80
+ "they'll": "they will",
81
+ "they'll've": "they will have",
82
+ "they're": "they are",
83
+ "they've": "they have",
84
+ "to've": "to have",
85
+ "wasn't": "was not",
86
+ "we'd": "we would",
87
+ "we'd've": "we would have",
88
+ "we'll": "we will",
89
+ "we'll've": "we will have",
90
+ "we're": "we are",
91
+ "we've": "we have",
92
+ "weren't": "were not",
93
+ "what'll": "what will",
94
+ "what'll've": "what will have",
95
+ "what're": "what are",
96
+ "what's": "what is",
97
+ "what've": "what have",
98
+ "when's": "when is",
99
+ "when've": "when have",
100
+ "where'd": "where did",
101
+ "where's": "where is",
102
+ "where've": "where have",
103
+ "who'll": "who will",
104
+ "who'll've": "who will have",
105
+ "who's": "who is",
106
+ "who've": "who have",
107
+ "why's": "why is",
108
+ "why've": "why have",
109
+ "will've": "will have",
110
+ "won't": "will not",
111
+ "won't've": "will not have",
112
+ "would've": "would have",
113
+ "wouldn't": "would not",
114
+ "wouldn't've": "would not have",
115
+ "y'all": "you all",
116
+ "y'all'd": "you all would",
117
+ "y'all'd've": "you all would have",
118
+ "y'all're": "you all are",
119
+ "y'all've": "you all have",
120
+ "you'd": "you would",
121
+ "you'd've": "you would have",
122
+ "you'll": "you will",
123
+ "you'll've": "you will have",
124
+ "you're": "you are",
125
+ "you've": "you have",
126
+ }
127
+
128
+
129
def preprocess(q: str) -> str:
    """Normalise a raw question string for feature extraction.

    Steps, in order: lowercase & strip; spell out money/percent symbols;
    drop the '[math]' marker; abbreviate large round numbers (k/m/b);
    expand contractions; strip HTML tags; replace non-word characters
    with spaces.
    """
    q = str(q).lower().strip()

    # Spell out common symbols.
    for symbol, replacement in (
        ('%', ' percent'),
        ('$', ' dollar '),
        ('₹', ' rupee '),
        ('€', ' euro '),
        ('@', ' at '),
    ):
        q = q.replace(symbol, replacement)

    # The pattern '[math]' appears around 900 times in the whole dataset.
    q = q.replace('[math]', '')

    # Abbreviate large round numbers (1,000,000 -> 1m, etc.).
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)

    # Expand contractions word-by-word, then catch leftover suffixes.
    q = ' '.join(CONTRACTIONS.get(word, word) for word in q.split())
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")

    # Strip HTML tags (explicit parser avoids a bs4 warning).
    q = BeautifulSoup(q, "html.parser").get_text()

    # Replace every non-word character with a space and trim.
    q = re.sub(r'\W', ' ', q).strip()

    return q
streamlit-app/.DS_Store ADDED
Binary file (6.15 kB). View file
 
streamlit-app/__pycache__/helper.cpython-310.pyc ADDED
Binary file (5.22 kB). View file
 
streamlit-app/helper.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Model-loading helpers shared by the app front-ends.

Loads classical (RF/XGBoost) and transformer (DistilBERT) artifacts and
delegates feature extraction to the `src` package.
"""
import json
import pickle
import sys
from pathlib import Path
from typing import Optional, Tuple

# Make the project root importable so `src.*` resolves from this folder.
_project_root = Path(__file__).resolve().parent.parent
if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

from src.feature_engineering import query_point_creator as _query_point_creator
from src.embeddings import get_embedding_model

# Artifact locations.
_models_dir = _project_root / "models"
_app_dir = Path(__file__).resolve().parent
_transformer_dir = _models_dir / "transformer"
_inference_times_path = _models_dir / "inference_times.json"
26
+
27
+
28
def _ensure_models_from_hf():
    """Best-effort download of model artifacts from the HF Hub.

    Runs only when HF_MODEL_REPO is set and models/model.pkl is missing;
    any failure is printed and ignored so the app can still start with
    local artifacts.
    """
    import os

    repo_id = os.environ.get("HF_MODEL_REPO")
    if not repo_id or (_models_dir / "model.pkl").exists():
        return
    try:
        from huggingface_hub import snapshot_download
        _models_dir.mkdir(parents=True, exist_ok=True)
        snapshot_download(repo_id=repo_id, local_dir=str(_models_dir))
    except Exception as e:
        # Deliberate best-effort: never block startup on a download error.
        print(f"HF Hub download skipped or failed: {e}")


# Attempt the download once at import time (HF Spaces deployment).
_ensure_models_from_hf()

# Lazily populated singletons for the classical pipeline...
_classical_model = None
_classical_cv = None
_embedding_model = None

# ...and for the DistilBERT pipeline.
_transformer_model = None
_transformer_tokenizer = None
53
+
54
+
55
def _pick_artifact(filename: str):
    """Prefer models/<filename>; fall back to the copy beside this module."""
    candidate = _models_dir / filename
    return candidate if candidate.exists() else _app_dir / filename


def _get_cv_path():
    """Path to the fitted vectorizer pickle."""
    return _pick_artifact("cv.pkl")


def _get_model_path():
    """Path to the classical model pickle."""
    return _pick_artifact("model.pkl")
61
+
62
+
63
def get_available_models() -> list:
    """List the model identifiers whose artifacts are present on disk."""
    found = []
    if _get_model_path().exists() and _get_cv_path().exists():
        found.append("classical")
    if (_transformer_dir / "config.json").exists():
        found.append("transformer")
    return found


def get_inference_times() -> dict:
    """Load benchmark numbers from models/inference_times.json ({} on failure)."""
    if not _inference_times_path.exists():
        return {}
    try:
        with open(_inference_times_path) as f:
            return json.load(f)
    except Exception:
        # A malformed or unreadable benchmark file is non-fatal.
        return {}
82
+
83
+
84
def _load_classical():
    """Load (and cache) the classical model, vectorizer, and embedder.

    Artifacts are unpickled once per process and reused on subsequent
    calls. Returns (model, vectorizer, embedding_model).
    """
    global _classical_model, _classical_cv, _embedding_model
    if _classical_model is None:
        # Use context managers so the pickle file handles are closed
        # promptly — the previous `pickle.load(open(...))` leaked them.
        with open(_get_model_path(), "rb") as f:
            _classical_model = pickle.load(f)
        with open(_get_cv_path(), "rb") as f:
            _classical_cv = pickle.load(f)
        _embedding_model = get_embedding_model()
    return _classical_model, _classical_cv, _embedding_model
91
+
92
+
93
def _load_transformer():
    """Load (and cache) the fine-tuned DistilBERT model and tokenizer.

    Chooses the best available device (MPS, then CUDA, then CPU) and
    switches the model to eval mode. Returns (model, tokenizer).
    """
    global _transformer_model, _transformer_tokenizer
    if _transformer_model is None:
        import torch
        from transformers import AutoTokenizer, AutoModelForSequenceClassification

        _transformer_tokenizer = AutoTokenizer.from_pretrained(str(_transformer_dir))
        loaded = AutoModelForSequenceClassification.from_pretrained(str(_transformer_dir))
        if torch.backends.mps.is_available():
            device = "mps"
        elif torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"
        _transformer_model = loaded.to(device)
        _transformer_model.eval()
    return _transformer_model, _transformer_tokenizer
105
+
106
+
107
def query_point_creator(q1: str, q2: str):
    """Feature vector for the classical model (shared src pipeline + embeddings)."""
    _model, vectorizer, embedder = _load_classical()
    return _query_point_creator(q1, q2, vectorizer, embedding_model=embedder)
111
+
112
+
113
def predict_classical(q1: str, q2: str) -> Tuple[int, float]:
    """Classical-model prediction. Returns (label, duplicate probability)."""
    model, vectorizer, embedder = _load_classical()
    features = _query_point_creator(q1, q2, vectorizer, embedding_model=embedder)
    # Probability of the positive (duplicate) class; 0.5 decision threshold.
    proba = float(model.predict_proba(features)[0, 1])
    return int(proba >= 0.5), proba
120
+
121
+
122
def predict_transformer(q1: str, q2: str) -> Tuple[int, float]:
    """DistilBERT prediction. Returns (label, duplicate probability)."""
    import torch

    from src.preprocessing import preprocess

    model, tokenizer = _load_transformer()
    encoded = tokenizer(
        preprocess(q1),
        preprocess(q2),
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding="max_length",
    )
    # Move the input tensors onto the model's device before inference.
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    with torch.no_grad():
        logits = model(**encoded).logits
    proba = torch.softmax(logits, dim=-1)[0, 1].item()
    return (1 if proba >= 0.5 else 0), float(proba)
142
+
143
+
144
def predict(q1: str, q2: str, model_type: str) -> Tuple[int, float]:
    """Unified prediction entry point.

    model_type selects the backend: 'classical' or 'transformer'.
    Raises ValueError for anything else.
    """
    if model_type not in ("classical", "transformer"):
        raise ValueError(f"Unknown model_type: {model_type}")
    backend = predict_classical if model_type == "classical" else predict_transformer
    return backend(q1, q2)
151
+
152
+
153
def get_model_display_name(model_type: str) -> str:
    """Human-readable label for the model selector (identity for unknown types)."""
    display_names = {
        "classical": "Classical (RF/XGBoost + TF-IDF)",
        "transformer": "DistilBERT (Transformer)",
    }
    return display_names.get(model_type, model_type)