File size: 7,812 Bytes
2556fd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a729e45
 
2556fd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import streamlit as st
import numpy as np
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from Levenshtein import distance as levenshtein_distance
from textdistance import jaro_winkler, damerau_levenshtein, cosine
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

# -----------------------
# 🎨 Streamlit App Layout
# -----------------------
# Top-level UI setup: page title, task selector, and the educational sidebar.
# Streamlit re-runs this script top-to-bottom on every interaction, so these
# calls execute (and re-render) on each rerun.

# Display title of the app on the web page
st.title("🔍 String Similarity & Model Calibration App")

# Sidebar for selecting the task to perform
st.sidebar.header("📌 Select an Option")
# `option` drives the top-level if/elif branch further down the script;
# the radio returns exactly one of the two strings in this list.
option = st.sidebar.radio("Choose a Task:", ["String Similarity", "Model Calibration"])

# ℹ️ PROJECT INFORMATION & EDUCATION SECTION
# -----------------------
# Static explanatory text; st.sidebar.write renders the strings as Markdown.
st.sidebar.subheader("ℹ️ About This App")
st.sidebar.write(
    """
    This app explores two key concepts:
    
    **1️⃣ String Similarity Models** 📝  
    - Compare words using different similarity algorithms.  
    - Helps with **spell checking, record linkage, and fuzzy matching**.  

    **2️⃣ Model Calibration** 📊  
    - Evaluate how well a model’s probability predictions match reality.  
    - Uses **Platt Scaling & Isotonic Regression** to improve predictions.
    """
)

# Plain-language summary of each algorithm used in the two task branches below.
st.sidebar.subheader("🧠 How It Works?")
st.sidebar.write(
    """
    - **Levenshtein Distance**: Counts how many edits are needed to turn one word into another.  
    - **Jaro-Winkler**: Focuses on shared characters, especially at the start of words.  
    - **Damerau-Levenshtein**: Similar to Levenshtein but also considers transpositions (changing letter order).  
    - **Cosine Similarity**: Treats words as vectors (arrays of numbers) and compares their angle.  
    - **Q-Gram Similarity**: Breaks words into small parts (n-grams) and compares them.  

    **Model Calibration**  
    - Checks how accurate a model’s probability predictions are.  
    - **Platt Scaling** applies logistic regression for adjustment.  
    - **Isotonic Regression** fine-tunes predictions using a flexible non-linear approach.  
    """
)

# -----------------------
# 1️⃣ STRING SIMILARITY MODELS
# -----------------------
if option == "String Similarity":
    st.header("📝 String Similarity Calculator")

    # User inputs the two words they want to compare.
    word1 = st.text_input("Enter First Word:", "AARUSH")
    word2 = st.text_input("Enter Second Word:", "AASHVI")

    def qgram_similarity(s1: str, s2: str, q: int = 2) -> float:
        """Return the q-gram similarity between two strings.

        Each string is split into overlapping character n-grams of length
        ``q``; the resulting count vectors are L1-normalized and their dot
        product is returned.

        Returns 0.0 when either string has fewer than ``q`` characters
        (including the empty string), because no q-grams can be formed —
        without this guard CountVectorizer raises an
        "empty vocabulary" ValueError.
        """
        if min(len(s1), len(s2)) < q:
            return 0.0
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(q, q))
        vectors = vectorizer.fit_transform([s1, s2])  # one row per string
        vectors = normalize(vectors, norm='l1')
        # Off-diagonal entry of the 2x2 Gram matrix = similarity of s1 vs s2.
        # `@` is the supported matmul spelling for scipy sparse arrays.
        return (vectors @ vectors.T).toarray()[0, 1]

    # When the user clicks the button, calculate similarity.
    if st.button("Compute Similarity"):
        lev_dist = levenshtein_distance(word1, word2)  # minimum insert/delete/substitute edits
        jaro_wink = jaro_winkler(word1, word2)  # similarity in [0, 1], weighted toward shared prefixes
        damerau_lev = damerau_levenshtein(word1, word2)  # like Levenshtein, plus adjacent transpositions
        cosine_sim = cosine(word1, word2)  # cosine similarity over character frequencies
        qgram_sim = qgram_similarity(word1, word2)  # safe for empty/1-char inputs (returns 0.0)

        # Display the computed similarity results.
        st.subheader("🔹 Similarity Scores")
        st.write(f"**Levenshtein Distance:** {lev_dist}")  # Shows Levenshtein distance
        st.write(f"**Jaro-Winkler Similarity:** {jaro_wink:.4f}")  # Shows Jaro-Winkler similarity
        st.write(f"**Damerau-Levenshtein Distance:** {damerau_lev}")  # Shows Damerau-Levenshtein distance
        st.write(f"**Cosine Similarity:** {cosine_sim:.4f}")  # Shows Cosine similarity
        st.write(f"**Q-Gram Similarity:** {qgram_sim:.4f}")  # Shows Q-Gram similarity

# -----------------------
# 2️⃣ MODEL CALIBRATION (RELIABILITY DIAGRAM)
# -----------------------
elif option == "Model Calibration":
    st.header("📊 Model Calibration & Reliability Diagram")

    # Generate synthetic dataset (random numbers) for model training
    np.random.seed(42)  # For reproducibility
    X = np.random.rand(1000, 5)  # Random features (1000 samples, 5 features)
    y = (X[:, 0] + X[:, 1] > 1).astype(int)  # Simple rule to generate labels (0 or 1)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Train a Logistic Regression Model
    clf = LogisticRegression()
    clf.fit(X_train, y_train)  # Train the model
    y_prob = clf.predict_proba(X_test)[:, 1]  # Get the predicted probabilities for the test set

    # Apply Calibration methods (Platt Scaling and Isotonic Regression)
    platt_scaling = CalibratedClassifierCV(clf, method='sigmoid')  # Platt Scaling method
    iso_regression = CalibratedClassifierCV(clf, method='isotonic')  # Isotonic Regression method
    platt_scaling.fit(X_train, y_train)  # Fit Platt Scaling model
    iso_regression.fit(X_train, y_train)  # Fit Isotonic Regression model

    y_prob_platt = platt_scaling.predict_proba(X_test)[:, 1]  # Get probabilities after Platt Scaling
    y_prob_iso = iso_regression.predict_proba(X_test)[:, 1]  # Get probabilities after Isotonic Regression

    # Compute Calibration Curves (shows how close the predicted probabilities are to the true values)
    prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10)
    prob_true_platt, prob_pred_platt = calibration_curve(y_test, y_prob_platt, n_bins=10)
    prob_true_iso, prob_pred_iso = calibration_curve(y_test, y_prob_iso, n_bins=10)

    # Plot the Calibration Curves
    fig, ax = plt.subplots(figsize=(8, 6))  # Create a figure for the plot
    ax.plot(prob_pred, prob_true, "s-", label="Uncalibrated Model")  # Plot uncalibrated model
    ax.plot(prob_pred_platt, prob_true_platt, "o-", label="Platt Scaling")  # Plot Platt Scaling
    ax.plot(prob_pred_iso, prob_true_iso, "d-", label="Isotonic Regression")  # Plot Isotonic Regression
    ax.plot([0, 1], [0, 1], "k--", label="Perfect Calibration")  # Plot perfect calibration line
    ax.set_xlabel("Predicted Probability")  # Label for X-axis
    ax.set_ylabel("True Probability")  # Label for Y-axis
    ax.legend()  # Show legend to differentiate lines
    ax.set_title("Calibration Curve (Reliability Diagram)")  # Title of the plot
    
    # Display the plot in Streamlit
    st.pyplot(fig)

    # -----------------------
    # 3️⃣ EVALUATE MODEL PERFORMANCE
    # -----------------------

    # Predict the labels for the test set using different models
    y_pred = clf.predict(X_test)
    y_pred_platt = platt_scaling.predict(X_test)
    y_pred_iso = iso_regression.predict(X_test)

    # Display the accuracy of each model
    st.subheader("🔹 Model Accuracy Scores:")
    st.write(f"**Uncalibrated Model Accuracy:** {accuracy_score(y_test, y_pred):.4f}")  # Accuracy of uncalibrated model
    st.write(f"**Platt Scaled Model Accuracy:** {accuracy_score(y_test, y_pred_platt):.4f}")  # Accuracy after Platt Scaling
    st.write(f"**Isotonic Regression Model Accuracy:** {accuracy_score(y_test, y_pred_iso):.4f}")  # Accuracy after Isotonic Regression