Spaces:

cmpunkmannu
/

quora-duplicate-detector

Sleeping

File size: 3,894 Bytes

162b166

"""
Gradio app for Quora Duplicate Question Detector.
Deploy to Hugging Face Spaces with Gradio SDK.
"""
import sys
from pathlib import Path

# Directory that contains this file.
ROOT = Path(__file__).resolve().parent
# Prepend this directory and its "streamlit-app" subdirectory to the import
# search path. Both inserts target index 0, so "streamlit-app" ends up being
# searched before ROOT.
sys.path.insert(0, str(ROOT))
sys.path.insert(0, str(ROOT / "streamlit-app"))

import nltk
# Fetch the NLTK "stopwords" resource at startup, in quiet mode.
# NOTE(review): presumably required by `helper` — confirm against that module.
nltk.download("stopwords", quiet=True)

# Imported only after the sys.path edits above.
# NOTE(review): presumably `helper` lives in ROOT or ROOT/"streamlit-app" — confirm.
import helper

import gradio as gr


def predict_fn(q1: str, q2: str, model_name: str):
    """Classify a question pair and build the values displayed in the UI.

    Args:
        q1: First question as entered; a falsy value is treated as empty.
        q2: Second question, handled the same way as ``q1``.
        model_name: Label of the selected model. A label containing
            ``"Classical"`` selects the ``"classical"`` model type; any other
            label selects ``"transformer"``.

    Returns:
        A ``(message, probability)`` tuple. ``message`` is the text to show
        and ``probability`` is the value reported by ``helper.predict``.
        When validation fails or anything raises during prediction, the
        message describes the problem and the probability is ``0.0``.
    """
    first, second = ((raw or "").strip() for raw in (q1, q2))

    # Validation happens before any model work: emptiness first, then length.
    if not (first and second):
        return "⚠️ Please enter both questions.", 0.0
    if min(len(first), len(second)) < 3:
        return "⚠️ Questions should be at least 3 characters.", 0.0

    try:
        if "Classical" in model_name:
            model_type = "classical"
        else:
            model_type = "transformer"

        is_duplicate, probability = helper.predict(first, second, model_type)

        if is_duplicate:
            return "**Duplicate** — These questions likely have the same meaning.", probability
        return "**Not Duplicate** — These questions appear to be different.", probability
    except Exception as exc:
        # UI boundary: report the failure in the result area instead of raising.
        return f"❌ Error: {exc!s}", 0.0


# Collect the models that can be served; the app refuses to start with none.
available = helper.get_available_models()
if not available:
    raise RuntimeError("No models found. Add models to models/ or configure HF Hub download.")

inference_times = helper.get_inference_times()
model_choices = [helper.get_model_display_name(m) for m in available]


def _label_with_latency(display_name):
    """Return the display name, suffixed with its mean latency when one is recorded."""
    timing_key = "classical" if "Classical" in display_name else "transformer"
    mean_ms = inference_times.get(timing_key, {}).get("mean_ms", 0)
    if not mean_ms:
        # Missing or zero timing: show the plain name.
        return f"{display_name}"
    return f"{display_name} (~{mean_ms:.0f} ms)"


# Dropdown labels, in the same order as ``model_choices``.
model_choices_with_time = [_label_with_latency(name) for name in model_choices]

# Question pairs offered in the "Try example pairs" accordion.
_EXAMPLE_PAIRS = [
    ["How do I learn Python?", "What is the best way to learn Python programming?"],
    ["What is the capital of France?", "How do I cook pasta?"],
]

with gr.Blocks(title="Quora Duplicate Detector", theme=gr.themes.Soft()) as demo:
    # Page heading and a one-line instruction.
    gr.Markdown("# 🔍 Quora Duplicate Question Pairs")
    gr.Markdown("Enter two questions to check if they are semantically duplicate.")

    with gr.Row():
        # First column (scale 2): the two questions, model picker and button.
        with gr.Column(scale=2):
            q1 = gr.Textbox(lines=2, label="Question 1", placeholder="e.g. What is the capital of India?")
            q2 = gr.Textbox(lines=2, label="Question 2", placeholder="e.g. Which city is India's capital?")
            model_dropdown = gr.Dropdown(
                choices=model_choices_with_time,
                value=model_choices_with_time[0],
                label="Model",
            )
            check_btn = gr.Button("Check", variant="primary")
        # Second column (scale 1): verdict text and a non-interactive probability slider.
        with gr.Column(scale=1):
            result_text = gr.Markdown(value="")
            proba_slider = gr.Slider(
                label="Probability of Duplicate", minimum=0, maximum=1, value=0, interactive=False
            )

    with gr.Accordion("Try example pairs", open=False):
        gr.Examples(examples=_EXAMPLE_PAIRS, inputs=[q1, q2], label="")

    # Clicking the button runs predict_fn on the three inputs and writes its
    # two return values to the verdict text and the slider.
    check_btn.click(fn=predict_fn, inputs=[q1, q2, model_dropdown], outputs=[result_text, proba_slider])

    gr.Markdown("---")
    with gr.Accordion("About", open=False):
        gr.Markdown("""
        This app predicts whether two Quora questions are duplicates (same meaning).

        **Models:**
        - **Classical**: Random Forest or XGBoost on 25 handcrafted features + TF-IDF
        - **DistilBERT**: Fine-tuned transformer for sentence-pair classification

        *Built for fun & learning. Results may not always be accurate — use with caution.*
        """)

# Start the app with default launch settings.
demo.launch()