File size: 3,894 Bytes
162b166
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""
Gradio app for Quora Duplicate Question Detector.
Deploy to Hugging Face Spaces with Gradio SDK.
"""
import sys
from pathlib import Path

# Directory that contains this file; base for the import paths added below.
ROOT = Path(__file__).resolve().parent
# Put this directory and its "streamlit-app" subdirectory at the front of the
# module search path. This must run before `import helper`.
# NOTE(review): presumably helper.py lives in one of these two directories — confirm.
sys.path.insert(0, str(ROOT))
sys.path.insert(0, str(ROOT / "streamlit-app"))

import nltk
# Request the NLTK "stopwords" corpus at import time, with quiet=True.
# NOTE(review): presumably needed by helper's text preprocessing — confirm.
nltk.download("stopwords", quiet=True)

import helper

import gradio as gr


def predict_fn(q1: str, q2: str, model_name: str):
    """Classify a question pair and format the result for the UI.

    Args:
        q1: First question as entered by the user. ``None`` is treated as
            an empty string; surrounding whitespace is stripped.
        q2: Second question; handled the same way as ``q1``.
        model_name: Label selected in the model dropdown. A label containing
            ``"Classical"`` selects the ``"classical"`` model type; any other
            label selects ``"transformer"``.

    Returns:
        A ``(message, probability)`` tuple. ``message`` is Markdown text for
        the result panel. ``probability`` is the second value returned by
        ``helper.predict``, or ``0.0`` when input validation fails or the
        prediction raises.
    """
    q1_clean = (q1 or "").strip()
    q2_clean = (q2 or "").strip()

    # Guard clauses: reject missing or too-short input before touching a model.
    if not q1_clean or not q2_clean:
        return "⚠️ Please enter both questions.", 0.0
    if len(q1_clean) < 3 or len(q2_clean) < 3:
        return "⚠️ Questions should be at least 3 characters.", 0.0

    try:
        model_type = "classical" if "Classical" in model_name else "transformer"
        pred, proba = helper.predict(q1_clean, q2_clean, model_type)

        if pred:
            msg = "**Duplicate** — These questions likely have the same meaning."
        else:
            msg = "**Not Duplicate** — These questions appear to be different."

        return msg, proba
    except Exception as e:
        # UI boundary: show the failure in the result panel instead of letting
        # the exception propagate out of the event handler.
        return f"❌ Error: {e}", 0.0


# Discover the usable models; the app cannot start without at least one.
available = helper.get_available_models()
if not available:
    raise RuntimeError("No models found. Add models to models/ or configure HF Hub download.")

inference_times = helper.get_inference_times()
model_choices = list(map(helper.get_model_display_name, available))


def _label_with_latency(display_name):
    """Return the dropdown label, suffixed with mean latency when one is recorded."""
    timing_key = "transformer" if "Classical" not in display_name else "classical"
    mean_ms = inference_times.get(timing_key, {}).get("mean_ms", 0)
    if not mean_ms:
        # Missing or zero timing: show the plain display name.
        return f"{display_name}"
    return f"{display_name} (~{mean_ms:.0f} ms)"


# Labels shown in the model dropdown, in the same order as `model_choices`.
model_choices_with_time = [_label_with_latency(name) for name in model_choices]

# UI layout: inputs and model picker on the left, result on the right,
# followed by example pairs and an About section.
with gr.Blocks(title="Quora Duplicate Detector", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🔍 Quora Duplicate Question Pairs")
    gr.Markdown("Enter two questions to check if they are semantically duplicate.")

    with gr.Row():
        # Left column: the two questions, the model selector and the trigger button.
        with gr.Column(scale=2):
            q1 = gr.Textbox(
                label="Question 1",
                placeholder="e.g. What is the capital of India?",
                lines=2,
            )
            q2 = gr.Textbox(
                label="Question 2",
                placeholder="e.g. Which city is India's capital?",
                lines=2,
            )
            model_dropdown = gr.Dropdown(
                label="Model",
                choices=model_choices_with_time,
                value=model_choices_with_time[0],
            )
            check_btn = gr.Button("Check", variant="primary")
        # Right column: verdict text plus a read-only slider used as a probability gauge.
        with gr.Column(scale=1):
            result_text = gr.Markdown(value="")
            proba_slider = gr.Slider(
                minimum=0,
                maximum=1,
                value=0,
                label="Probability of Duplicate",
                interactive=False,
            )

    with gr.Accordion("Try example pairs", open=False):
        # Examples fill only the two question boxes; the model choice is left as is.
        gr.Examples(
            examples=[
                ["How do I learn Python?", "What is the best way to learn Python programming?"],
                ["What is the capital of France?", "How do I cook pasta?"],
            ],
            inputs=[q1, q2],
            label="",
        )

    # predict_fn returns (message, probability), matching the two outputs in order.
    check_btn.click(
        fn=predict_fn,
        inputs=[q1, q2, model_dropdown],
        outputs=[result_text, proba_slider],
    )

    gr.Markdown("---")
    with gr.Accordion("About", open=False):
        gr.Markdown("""
        This app predicts whether two Quora questions are duplicates (same meaning).

        **Models:**
        - **Classical**: Random Forest or XGBoost on 25 handcrafted features + TF-IDF
        - **DistilBERT**: Fine-tuned transformer for sentence-pair classification

        *Built for fun & learning. Results may not always be accurate — use with caution.*
        """)

# Launch the Gradio app; no arguments are passed, so Gradio's defaults apply.
demo.launch()