File size: 7,812 Bytes
2556fd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a729e45
 
2556fd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import streamlit as st
import numpy as np
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from Levenshtein import distance as levenshtein_distance
from textdistance import jaro_winkler, damerau_levenshtein, cosine
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

# -----------------------
# 🎨 Streamlit App Layout
# -----------------------
# Top-level UI setup: page title, task selector, and the educational sidebar.
# Streamlit re-runs this script top-to-bottom on every interaction, so these
# calls execute (and re-render) on each rerun.

# Display title of the app on the web page
st.title("🔍 String Similarity & Model Calibration App")

# Sidebar for selecting the task to perform
st.sidebar.header("📌 Select an Option")
# `option` drives the top-level if/elif branch further down the script;
# the radio returns exactly one of the two strings in this list.
option = st.sidebar.radio("Choose a Task:", ["String Similarity", "Model Calibration"])

# ℹ️ PROJECT INFORMATION & EDUCATION SECTION
# -----------------------
# Static explanatory text; st.sidebar.write renders the strings as Markdown.
st.sidebar.subheader("ℹ️ About This App")
st.sidebar.write(
    """
    This app explores two key concepts:
    
    **1️⃣ String Similarity Models** 📝  
    - Compare words using different similarity algorithms.  
    - Helps with **spell checking, record linkage, and fuzzy matching**.  

    **2️⃣ Model Calibration** 📊  
    - Evaluate how well a model’s probability predictions match reality.  
    - Uses **Platt Scaling & Isotonic Regression** to improve predictions.
    """
)

# Plain-language summary of each algorithm used in the two task branches below.
st.sidebar.subheader("🧠 How It Works?")
st.sidebar.write(
    """
    - **Levenshtein Distance**: Counts how many edits are needed to turn one word into another.  
    - **Jaro-Winkler**: Focuses on shared characters, especially at the start of words.  
    - **Damerau-Levenshtein**: Similar to Levenshtein but also considers transpositions (changing letter order).  
    - **Cosine Similarity**: Treats words as vectors (arrays of numbers) and compares their angle.  
    - **Q-Gram Similarity**: Breaks words into small parts (n-grams) and compares them.  

    **Model Calibration**  
    - Checks how accurate a model’s probability predictions are.  
    - **Platt Scaling** applies logistic regression for adjustment.  
    - **Isotonic Regression** fine-tunes predictions using a flexible non-linear approach.  
    """
)

# -----------------------
# 1️⃣ STRING SIMILARITY MODELS
# -----------------------
if option == "String Similarity":
    st.header("📝 String Similarity Calculator")

    # User inputs the two words they want to compare.
    word1 = st.text_input("Enter First Word:", "AARUSH")
    word2 = st.text_input("Enter Second Word:", "AASHVI")

    def qgram_similarity(s1: str, s2: str, q: int = 2) -> float:
        """Return the q-gram similarity between two strings.

        Each string is split into overlapping character n-grams of length
        ``q``; the resulting count vectors are L1-normalized and their dot
        product is returned.

        Returns 0.0 when either string has fewer than ``q`` characters
        (including the empty string), because no q-grams can be formed —
        without this guard CountVectorizer raises an
        "empty vocabulary" ValueError.
        """
        if min(len(s1), len(s2)) < q:
            return 0.0
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(q, q))
        vectors = vectorizer.fit_transform([s1, s2])  # one row per string
        vectors = normalize(vectors, norm='l1')
        # Off-diagonal entry of the 2x2 Gram matrix = similarity of s1 vs s2.
        # `@` is the supported matmul spelling for scipy sparse arrays.
        return (vectors @ vectors.T).toarray()[0, 1]

    # When the user clicks the button, calculate similarity.
    if st.button("Compute Similarity"):
        lev_dist = levenshtein_distance(word1, word2)  # minimum insert/delete/substitute edits
        jaro_wink = jaro_winkler(word1, word2)  # similarity in [0, 1], weighted toward shared prefixes
        damerau_lev = damerau_levenshtein(word1, word2)  # like Levenshtein, plus adjacent transpositions
        cosine_sim = cosine(word1, word2)  # cosine similarity over character frequencies
        qgram_sim = qgram_similarity(word1, word2)  # safe for empty/1-char inputs (returns 0.0)

        # Display the computed similarity results.
        st.subheader("🔹 Similarity Scores")
        st.write(f"**Levenshtein Distance:** {lev_dist}")  # Shows Levenshtein distance
        st.write(f"**Jaro-Winkler Similarity:** {jaro_wink:.4f}")  # Shows Jaro-Winkler similarity
        st.write(f"**Damerau-Levenshtein Distance:** {damerau_lev}")  # Shows Damerau-Levenshtein distance
        st.write(f"**Cosine Similarity:** {cosine_sim:.4f}")  # Shows Cosine similarity
        st.write(f"**Q-Gram Similarity:** {qgram_sim:.4f}")  # Shows Q-Gram similarity

# -----------------------
# 2️⃣ MODEL CALIBRATION (RELIABILITY DIAGRAM)
# -----------------------
elif option == "Model Calibration":
    st.header("📊 Model Calibration & Reliability Diagram")

    # Generate synthetic dataset (random numbers) for model training
    np.random.seed(42)  # For reproducibility
    X = np.random.rand(1000, 5)  # Random features (1000 samples, 5 features)
    y = (X[:, 0] + X[:, 1] > 1).astype(int)  # Simple rule to generate labels (0 or 1)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Train a Logistic Regression Model
    clf = LogisticRegression()
    clf.fit(X_train, y_train)  # Train the model
    y_prob = clf.predict_proba(X_test)[:, 1]  # Get the predicted probabilities for the test set

    # Apply Calibration methods (Platt Scaling and Isotonic Regression)
    platt_scaling = CalibratedClassifierCV(clf, method='sigmoid')  # Platt Scaling method
    iso_regression = CalibratedClassifierCV(clf, method='isotonic')  # Isotonic Regression method
    platt_scaling.fit(X_train, y_train)  # Fit Platt Scaling model
    iso_regression.fit(X_train, y_train)  # Fit Isotonic Regression model

    y_prob_platt = platt_scaling.predict_proba(X_test)[:, 1]  # Get probabilities after Platt Scaling
    y_prob_iso = iso_regression.predict_proba(X_test)[:, 1]  # Get probabilities after Isotonic Regression

    # Compute Calibration Curves (shows how close the predicted probabilities are to the true values)
    prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10)
    prob_true_platt, prob_pred_platt = calibration_curve(y_test, y_prob_platt, n_bins=10)
    prob_true_iso, prob_pred_iso = calibration_curve(y_test, y_prob_iso, n_bins=10)

    # Plot the Calibration Curves
    fig, ax = plt.subplots(figsize=(8, 6))  # Create a figure for the plot
    ax.plot(prob_pred, prob_true, "s-", label="Uncalibrated Model")  # Plot uncalibrated model
    ax.plot(prob_pred_platt, prob_true_platt, "o-", label="Platt Scaling")  # Plot Platt Scaling
    ax.plot(prob_pred_iso, prob_true_iso, "d-", label="Isotonic Regression")  # Plot Isotonic Regression
    ax.plot([0, 1], [0, 1], "k--", label="Perfect Calibration")  # Plot perfect calibration line
    ax.set_xlabel("Predicted Probability")  # Label for X-axis
    ax.set_ylabel("True Probability")  # Label for Y-axis
    ax.legend()  # Show legend to differentiate lines
    ax.set_title("Calibration Curve (Reliability Diagram)")  # Title of the plot
    
    # Display the plot in Streamlit
    st.pyplot(fig)

    # -----------------------
    # 3️⃣ EVALUATE MODEL PERFORMANCE
    # -----------------------

    # Predict the labels for the test set using different models
    y_pred = clf.predict(X_test)
    y_pred_platt = platt_scaling.predict(X_test)
    y_pred_iso = iso_regression.predict(X_test)

    # Display the accuracy of each model
    st.subheader("🔹 Model Accuracy Scores:")
    st.write(f"**Uncalibrated Model Accuracy:** {accuracy_score(y_test, y_pred):.4f}")  # Accuracy of uncalibrated model
    st.write(f"**Platt Scaled Model Accuracy:** {accuracy_score(y_test, y_pred_platt):.4f}")  # Accuracy after Platt Scaling
    st.write(f"**Isotonic Regression Model Accuracy:** {accuracy_score(y_test, y_pred_iso):.4f}")  # Accuracy after Isotonic Regression