# Streamlit demo app: string-similarity metrics and probability-model calibration.
import streamlit as st # Web app framework
import numpy as np # For numerical operations
import matplotlib.pyplot as plt # For plotting graphs
from sklearn.calibration import calibration_curve, CalibratedClassifierCV # Model calibration
from sklearn.linear_model import LogisticRegression # Logistic regression model
from sklearn.model_selection import train_test_split # Splitting dataset
from sklearn.metrics import accuracy_score # Evaluating model accuracy
from Levenshtein import distance as levenshtein_distance # Levenshtein distance metric
from textdistance import jaro_winkler, damerau_levenshtein, cosine # Other similarity metrics
from sklearn.feature_extraction.text import CountVectorizer # Converting text to numerical format
from sklearn.preprocessing import normalize # Normalizing numerical data
import pandas as pd # Handling data efficiently
# -----------------------
# 🎨 STREAMLIT APP LAYOUT
# Page chrome: main title, sidebar task selector, and an "about" blurb.
# `option` (set here) drives which branch of the app runs below.
# -----------------------
st.title("π String Similarity & Model Calibration App") # Main page title (emoji is mojibake in source strings; preserved as-is)
st.sidebar.header("π Select an Option") # Sidebar section header
option = st.sidebar.radio("Choose a Task:", ["String Similarity", "Model Calibration"]) # User picks which tool to show
# -----------------------
# ℹ️ INFORMATION SECTION (for non-technical users)
# -----------------------
st.sidebar.subheader("βΉοΈ About This App")
st.sidebar.write(
"""
This app provides two key functionalities:
**1οΈβ£ String Similarity** π
- Used in **spell checking, data matching, and fuzzy search**.
**2οΈβ£ Model Calibration** π
- Helps improve the **reliability of probability predictions** from ML models.
π **Project Repository:**
π [RAHULJUNEJA33/String_Similarity_Calibration-Models](https://github.com/RAHULJUNEJA33/String_Similarity_Calibration-Models)
"""
)
# -----------------------
# 1️⃣ STRING SIMILARITY MODELS
# -----------------------
if option == "String Similarity":
    st.header("π String Similarity Calculator")  # Section header

    # User inputs: two words to compare. Defaults are the classic
    # Jaro-Winkler demo pair (one transposition apart).
    word1 = st.text_input("Enter First Word:", "MARTHA")
    word2 = st.text_input("Enter Second Word:", "MARHTA")

    if st.button("Compute Similarity"):  # Compute only on explicit request
        # Edit-distance / similarity metrics from third-party libraries.
        lev_dist = levenshtein_distance(word1, word2)    # insert/delete/substitute edits
        jaro_wink = jaro_winkler(word1, word2)           # similarity in [0, 1], rewards shared prefixes
        damerau_lev = damerau_levenshtein(word1, word2)  # like Levenshtein, but transpositions count as one edit
        cosine_sim = cosine(word1, word2)                # cosine similarity over character multisets

        def qgram_similarity(s1, s2, q=2):
            """Return a q-gram overlap similarity between *s1* and *s2*.

            The score is the inner product of the L1-normalized character
            q-gram count vectors of the two strings (0.0 means no shared
            q-grams). If either string is shorter than *q*, no q-grams can
            be extracted and 0.0 is returned — without this guard,
            CountVectorizer raises ValueError("empty vocabulary") for
            short or empty text-box input.
            """
            if len(s1) < q or len(s2) < q:
                return 0.0
            vectorizer = CountVectorizer(analyzer='char', ngram_range=(q, q))  # char q-gram counts
            vectors = vectorizer.fit_transform([s1, s2])  # one row per word
            vectors = normalize(vectors, norm='l1')       # each row sums to 1
            return (vectors * vectors.T).toarray()[0, 1]  # off-diagonal entry = cross similarity

        qgram_sim = qgram_similarity(word1, word2)  # Q-Gram similarity (bigrams)

        # Display all five scores.
        st.subheader("πΉ Similarity Scores")
        st.write(f"**Levenshtein Distance:** {lev_dist}")
        st.write(f"**Jaro-Winkler Similarity:** {jaro_wink:.4f}")
        st.write(f"**Damerau-Levenshtein Distance:** {damerau_lev}")
        st.write(f"**Cosine Similarity:** {cosine_sim:.4f}")
        st.write(f"**Q-Gram Similarity:** {qgram_sim:.4f}")
# -----------------------
# 2️⃣ MODEL CALIBRATION (RELIABILITY DIAGRAM)
# -----------------------
elif option == "Model Calibration":
st.header("π Model Calibration & Reliability Diagram") # Section header
# Generate synthetic dataset (random data)
np.random.seed(42) # Set seed for reproducibility
X = np.random.rand(1000, 5) # 1000 samples, 5 random features
y = (X[:, 0] + X[:, 1] > 1).astype(int) # Classification rule (sum of first 2 features > 1)
# Split data into training and testing sets (70%-30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Train a Logistic Regression Model
clf = LogisticRegression()
clf.fit(X_train, y_train) # Fit model to training data
y_prob = clf.predict_proba(X_test)[:, 1] # Get probability scores for class 1
# Apply Model Calibration (Platt Scaling & Isotonic Regression)
platt_scaling = CalibratedClassifierCV(clf, method='sigmoid') # Platt Scaling method
iso_regression = CalibratedClassifierCV(clf, method='isotonic') # Isotonic Regression method
platt_scaling.fit(X_train, y_train) # Train calibrated models
iso_regression.fit(X_train, y_train)
y_prob_platt = platt_scaling.predict_proba(X_test)[:, 1] # Platt Scaling probabilities
y_prob_iso = iso_regression.predict_proba(X_test)[:, 1] # Isotonic Regression probabilities
# Compute Calibration Curves (actual vs. predicted probabilities)
prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10)
prob_true_platt, prob_pred_platt = calibration_curve(y_test, y_prob_platt, n_bins=10)
prob_true_iso, prob_pred_iso = calibration_curve(y_test, y_prob_iso, n_bins=10)
# Plot Calibration Curves
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(prob_pred, prob_true, "s-", label="Uncalibrated Model")
ax.plot(prob_pred_platt, prob_true_platt, "o-", label="Platt Scaling")
ax.plot(prob_pred_iso, prob_true_iso, "d-", label="Isotonic Regression")
ax.plot([0, 1], [0, 1], "k--", label="Perfect Calibration") # Ideal case
ax.set_xlabel("Predicted Probability")
ax.set_ylabel("True Probability")
ax.legend()
ax.set_title("Calibration Curve (Reliability Diagram)")
# Display plot in Streamlit
st.pyplot(fig)
# -----------------------
# 3οΈβ£ EVALUATE MODEL PERFORMANCE
# -----------------------
y_pred = clf.predict(X_test) # Predictions (uncalibrated)
y_pred_platt = platt_scaling.predict(X_test) # Predictions (Platt Scaling)
y_pred_iso = iso_regression.predict(X_test) # Predictions (Isotonic Regression)
# Display Accuracy Scores
st.subheader("πΉ Model Accuracy Scores:")
st.write(f"**Uncalibrated Model Accuracy:** {accuracy_score(y_test, y_pred):.4f}")
st.write(f"**Platt Scaled Model Accuracy:** {accuracy_score(y_test, y_pred_platt):.4f}")
st.write(f"**Isotonic Regression Model Accuracy:** {accuracy_score(y_test, y_pred_iso):.4f}") |