# File size: 7,812 Bytes
# 2556fd7 a729e45 2556fd7
# (Line-number gutter 1-154 from the web file viewer omitted.)
import streamlit as st
import numpy as np
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from Levenshtein import distance as levenshtein_distance
from textdistance import jaro_winkler, damerau_levenshtein, cosine
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
# -----------------------
# 🎨 Streamlit App Layout
# -----------------------
# The sidebar copy lives in named constants so the widget calls below read as
# a short, linear description of the page.
_TASKS = ["String Similarity", "Model Calibration"]

# Markdown shown under "About This App".
_ABOUT_TEXT = """
This app explores two key concepts:
**1️⃣ String Similarity Models** 📝
- Compare words using different similarity algorithms.
- Helps with **spell checking, record linkage, and fuzzy matching**.
**2️⃣ Model Calibration** 📊
- Evaluate how well a model’s probability predictions match reality.
- Uses **Platt Scaling & Isotonic Regression** to improve predictions.
"""

# Markdown shown under "How It Works?".
_HOW_IT_WORKS_TEXT = """
- **Levenshtein Distance**: Counts how many edits are needed to turn one word into another.
- **Jaro-Winkler**: Focuses on shared characters, especially at the start of words.
- **Damerau-Levenshtein**: Similar to Levenshtein but also considers transpositions (changing letter order).
- **Cosine Similarity**: Treats words as vectors (arrays of numbers) and compares their angle.
- **Q-Gram Similarity**: Breaks words into small parts (n-grams) and compares them.
**Model Calibration**
- Checks how accurate a model’s probability predictions are.
- **Platt Scaling** applies logistic regression for adjustment.
- **Isotonic Regression** fine-tunes predictions using a flexible non-linear approach.
"""

_sidebar = st.sidebar

# Page title, then the task selector whose value drives the branches below.
st.title("🔍 String Similarity & Model Calibration App")
_sidebar.header("📌 Select an Option")
option = _sidebar.radio("Choose a Task:", _TASKS)

# ℹ️ Project information & education section
_sidebar.subheader("ℹ️ About This App")
_sidebar.write(_ABOUT_TEXT)

# Plain-language explanation of each algorithm
_sidebar.subheader("🧠 How It Works?")
_sidebar.write(_HOW_IT_WORKS_TEXT)
# -----------------------
# 1️⃣ STRING SIMILARITY MODELS
# -----------------------
def qgram_similarity(s1, s2, q=2):
    """Return the cosine similarity of two strings' character q-gram profiles.

    Both strings are split into overlapping character q-grams and counted
    with ``CountVectorizer`` (which lowercases its input by default). The
    count vectors are L2-normalized, so their dot product is a cosine
    similarity: identical strings score 1.0 and strings that share no
    q-gram score 0.0.

    Args:
        s1: First string.
        s2: Second string.
        q: Length of the character n-grams.

    Returns:
        The similarity as a float. When neither string is long enough to
        yield a single q-gram, returns 1.0 if the strings are equal
        ignoring case and 0.0 otherwise.
    """
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(q, q))

    # fit_transform raises ValueError (empty vocabulary) when no q-gram can
    # be extracted from either string, e.g. for empty or one-letter input.
    extract_qgrams = vectorizer.build_analyzer()
    if not extract_qgrams(s1) and not extract_qgrams(s2):
        return 1.0 if s1.lower() == s2.lower() else 0.0

    # L2 (not L1) normalization makes the dot product a true cosine, so a
    # word compared with itself scores exactly 1.
    profiles = normalize(vectorizer.fit_transform([s1, s2]), norm='l2')

    # `@` is a matrix product for both sparse matrices and sparse arrays.
    return float((profiles @ profiles.T).toarray()[0, 1])


if option == "String Similarity":
    st.header("📝 String Similarity Calculator")

    # The two words to compare, pre-filled with example values.
    word1 = st.text_input("Enter First Word:", "AARUSH")
    word2 = st.text_input("Enter Second Word:", "AASHVI")

    # Scores are only computed once the user clicks the button.
    if st.button("Compute Similarity"):
        lev_dist = levenshtein_distance(word1, word2)     # edit distance
        jaro_wink = jaro_winkler(word1, word2)            # Jaro-Winkler similarity
        damerau_lev = damerau_levenshtein(word1, word2)   # edits + transpositions
        cosine_sim = cosine(word1, word2)                 # cosine similarity
        qgram_sim = qgram_similarity(word1, word2)        # bigram-profile similarity

        # Display the computed similarity results
        st.subheader("🔹 Similarity Scores")
        st.write(f"**Levenshtein Distance:** {lev_dist}")
        st.write(f"**Jaro-Winkler Similarity:** {jaro_wink:.4f}")
        st.write(f"**Damerau-Levenshtein Distance:** {damerau_lev}")
        st.write(f"**Cosine Similarity:** {cosine_sim:.4f}")
        st.write(f"**Q-Gram Similarity:** {qgram_sim:.4f}")
# -----------------------
# 2️⃣ MODEL CALIBRATION (RELIABILITY DIAGRAM)
# -----------------------
elif option == "Model Calibration":
st.header("📊 Model Calibration & Reliability Diagram")
# Generate synthetic dataset (random numbers) for model training
np.random.seed(42) # For reproducibility
X = np.random.rand(1000, 5) # Random features (1000 samples, 5 features)
y = (X[:, 0] + X[:, 1] > 1).astype(int) # Simple rule to generate labels (0 or 1)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Train a Logistic Regression Model
clf = LogisticRegression()
clf.fit(X_train, y_train) # Train the model
y_prob = clf.predict_proba(X_test)[:, 1] # Get the predicted probabilities for the test set
# Apply Calibration methods (Platt Scaling and Isotonic Regression)
platt_scaling = CalibratedClassifierCV(clf, method='sigmoid') # Platt Scaling method
iso_regression = CalibratedClassifierCV(clf, method='isotonic') # Isotonic Regression method
platt_scaling.fit(X_train, y_train) # Fit Platt Scaling model
iso_regression.fit(X_train, y_train) # Fit Isotonic Regression model
y_prob_platt = platt_scaling.predict_proba(X_test)[:, 1] # Get probabilities after Platt Scaling
y_prob_iso = iso_regression.predict_proba(X_test)[:, 1] # Get probabilities after Isotonic Regression
# Compute Calibration Curves (shows how close the predicted probabilities are to the true values)
prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10)
prob_true_platt, prob_pred_platt = calibration_curve(y_test, y_prob_platt, n_bins=10)
prob_true_iso, prob_pred_iso = calibration_curve(y_test, y_prob_iso, n_bins=10)
# Plot the Calibration Curves
fig, ax = plt.subplots(figsize=(8, 6)) # Create a figure for the plot
ax.plot(prob_pred, prob_true, "s-", label="Uncalibrated Model") # Plot uncalibrated model
ax.plot(prob_pred_platt, prob_true_platt, "o-", label="Platt Scaling") # Plot Platt Scaling
ax.plot(prob_pred_iso, prob_true_iso, "d-", label="Isotonic Regression") # Plot Isotonic Regression
ax.plot([0, 1], [0, 1], "k--", label="Perfect Calibration") # Plot perfect calibration line
ax.set_xlabel("Predicted Probability") # Label for X-axis
ax.set_ylabel("True Probability") # Label for Y-axis
ax.legend() # Show legend to differentiate lines
ax.set_title("Calibration Curve (Reliability Diagram)") # Title of the plot
# Display the plot in Streamlit
st.pyplot(fig)
# -----------------------
# 3️⃣ EVALUATE MODEL PERFORMANCE
# -----------------------
# Predict the labels for the test set using different models
y_pred = clf.predict(X_test)
y_pred_platt = platt_scaling.predict(X_test)
y_pred_iso = iso_regression.predict(X_test)
# Display the accuracy of each model
st.subheader("🔹 Model Accuracy Scores:")
st.write(f"**Uncalibrated Model Accuracy:** {accuracy_score(y_test, y_pred):.4f}") # Accuracy of uncalibrated model
st.write(f"**Platt Scaled Model Accuracy:** {accuracy_score(y_test, y_pred_platt):.4f}") # Accuracy after Platt Scaling
st.write(f"**Isotonic Regression Model Accuracy:** {accuracy_score(y_test, y_pred_iso):.4f}") # Accuracy after Isotonic Regression |