Update app.py
Browse files
app.py
CHANGED
|
@@ -1,56 +1,41 @@
|
|
| 1 |
-
import streamlit as st
|
| 2 |
-
import numpy as np
|
| 3 |
-
import matplotlib.pyplot as plt
|
| 4 |
-
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
|
| 5 |
-
from sklearn.linear_model import LogisticRegression
|
| 6 |
-
from sklearn.model_selection import train_test_split
|
| 7 |
-
from sklearn.metrics import accuracy_score
|
| 8 |
-
from Levenshtein import distance as levenshtein_distance
|
| 9 |
-
from textdistance import jaro_winkler, damerau_levenshtein, cosine
|
| 10 |
-
from sklearn.feature_extraction.text import CountVectorizer
|
| 11 |
-
from sklearn.preprocessing import normalize
|
| 12 |
-
import pandas as pd
|
| 13 |
|
| 14 |
# -----------------------
|
| 15 |
-
# ๐จ
|
| 16 |
# -----------------------
|
| 17 |
|
| 18 |
-
st.title("๐ String Similarity & Model Calibration App")
|
| 19 |
-
st.sidebar.header("๐ Select an Option")
|
| 20 |
-
option = st.sidebar.radio("Choose a Task:", ["String Similarity", "Model Calibration"])
|
| 21 |
|
| 22 |
# -----------------------
|
| 23 |
-
# โน๏ธ
|
| 24 |
# -----------------------
|
| 25 |
|
| 26 |
st.sidebar.subheader("โน๏ธ About This App")
|
| 27 |
st.sidebar.write(
|
| 28 |
"""
|
| 29 |
-
This app
|
| 30 |
-
|
| 31 |
-
**1๏ธโฃ String Similarity Models** ๐
|
| 32 |
-
- Compare words using different similarity algorithms.
|
| 33 |
-
- Helps with **spell checking, record linkage, and fuzzy matching**.
|
| 34 |
|
|
|
|
|
|
|
|
|
|
| 35 |
**2๏ธโฃ Model Calibration** ๐
|
| 36 |
-
-
|
| 37 |
-
- Uses **Platt Scaling & Isotonic Regression** to improve predictions.
|
| 38 |
-
"""
|
| 39 |
-
)
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
"""
|
| 44 |
-
- **Levenshtein Distance**: Counts how many edits are needed to turn one word into another.
|
| 45 |
-
- **Jaro-Winkler**: Focuses on shared characters, especially at the start of words.
|
| 46 |
-
- **Damerau-Levenshtein**: Similar to Levenshtein but also considers transpositions.
|
| 47 |
-
- **Cosine Similarity**: Treats words as vectors and compares their angle.
|
| 48 |
-
- **Q-Gram Similarity**: Breaks words into small parts (n-grams) and compares them.
|
| 49 |
-
|
| 50 |
-
**Model Calibration**
|
| 51 |
-
- Checks how accurate a modelโs probability predictions are.
|
| 52 |
-
- **Platt Scaling** applies logistic regression for adjustment.
|
| 53 |
-
- **Isotonic Regression** fine-tunes predictions using a flexible non-linear approach.
|
| 54 |
"""
|
| 55 |
)
|
| 56 |
|
|
@@ -58,27 +43,27 @@ st.sidebar.write(
|
|
| 58 |
# 1๏ธโฃ STRING SIMILARITY MODELS
|
| 59 |
# -----------------------
|
| 60 |
if option == "String Similarity":
|
| 61 |
-
st.header("๐ String Similarity Calculator")
|
| 62 |
|
| 63 |
-
# User
|
| 64 |
word1 = st.text_input("Enter First Word:", "MARTHA")
|
| 65 |
word2 = st.text_input("Enter Second Word:", "MARHTA")
|
| 66 |
|
| 67 |
-
if st.button("Compute Similarity"):
|
| 68 |
# Compute similarity metrics
|
| 69 |
-
lev_dist = levenshtein_distance(word1, word2)
|
| 70 |
-
jaro_wink = jaro_winkler(word1, word2)
|
| 71 |
-
damerau_lev = damerau_levenshtein(word1, word2)
|
| 72 |
-
cosine_sim = cosine(word1, word2)
|
| 73 |
|
| 74 |
-
# Q-Gram Similarity
|
| 75 |
def qgram_similarity(s1, s2, q=2):
|
| 76 |
-
vectorizer = CountVectorizer(analyzer='char', ngram_range=(q, q))
|
| 77 |
-
q1 = vectorizer.fit_transform([s1, s2])
|
| 78 |
-
q1 = normalize(q1, norm='l1')
|
| 79 |
-
return (q1 * q1.T).toarray()[0, 1]
|
| 80 |
|
| 81 |
-
qgram_sim = qgram_similarity(word1, word2)
|
| 82 |
|
| 83 |
# Display Results
|
| 84 |
st.subheader("๐น Similarity Scores")
|
|
@@ -88,52 +73,35 @@ if option == "String Similarity":
|
|
| 88 |
st.write(f"**Cosine Similarity:** {cosine_sim:.4f}")
|
| 89 |
st.write(f"**Q-Gram Similarity:** {qgram_sim:.4f}")
|
| 90 |
|
| 91 |
-
# -----------------------
|
| 92 |
-
# ๐ STRING SIMILARITY EXAMPLES TABLE
|
| 93 |
-
# -----------------------
|
| 94 |
-
|
| 95 |
-
st.subheader("๐ Example Word Comparisons")
|
| 96 |
-
data = {
|
| 97 |
-
"Word 1": ["MARTHA", "HOUSE", "SUNDAY", "NIGHT", "FLIGHT"],
|
| 98 |
-
"Word 2": ["MARHTA", "HORSE", "MONDAY", "KNIGHT", "FIGHT"],
|
| 99 |
-
"Levenshtein Distance": [1, 2, 2, 2, 1],
|
| 100 |
-
"Jaro-Winkler Similarity": [0.9611, 0.8375, 0.8222, 0.9444, 0.9740],
|
| 101 |
-
"Damerau-Levenshtein Distance": [1, 1, 2, 1, 1],
|
| 102 |
-
"Cosine Similarity": [0.8333, 0.7500, 0.6667, 0.8000, 0.9500],
|
| 103 |
-
"Q-Gram Similarity": [0.8571, 0.7143, 0.6667, 0.7778, 0.9231],
|
| 104 |
-
}
|
| 105 |
-
df = pd.DataFrame(data)
|
| 106 |
-
st.table(df)
|
| 107 |
-
|
| 108 |
# -----------------------
|
| 109 |
# 2๏ธโฃ MODEL CALIBRATION (RELIABILITY DIAGRAM)
|
| 110 |
# -----------------------
|
| 111 |
elif option == "Model Calibration":
|
| 112 |
-
st.header("๐ Model Calibration & Reliability Diagram")
|
| 113 |
|
| 114 |
-
# Generate synthetic dataset
|
| 115 |
-
np.random.seed(42)
|
| 116 |
-
X = np.random.rand(1000, 5) #
|
| 117 |
-
y = (X[:, 0] + X[:, 1] > 1).astype(int) #
|
| 118 |
|
| 119 |
-
# Split into
|
| 120 |
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
|
| 121 |
|
| 122 |
-
# Train Logistic Regression Model
|
| 123 |
clf = LogisticRegression()
|
| 124 |
-
clf.fit(X_train, y_train)
|
| 125 |
-
y_prob = clf.predict_proba(X_test)[:, 1] # Get probability scores
|
| 126 |
|
| 127 |
-
# Apply Calibration (Platt Scaling & Isotonic Regression)
|
| 128 |
-
platt_scaling = CalibratedClassifierCV(clf, method='sigmoid') # Platt Scaling
|
| 129 |
-
iso_regression = CalibratedClassifierCV(clf, method='isotonic') # Isotonic Regression
|
| 130 |
-
platt_scaling.fit(X_train, y_train)
|
| 131 |
iso_regression.fit(X_train, y_train)
|
| 132 |
|
| 133 |
-
y_prob_platt = platt_scaling.predict_proba(X_test)[:, 1]
|
| 134 |
-
y_prob_iso = iso_regression.predict_proba(X_test)[:, 1]
|
| 135 |
|
| 136 |
-
# Compute Calibration Curves
|
| 137 |
prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10)
|
| 138 |
prob_true_platt, prob_pred_platt = calibration_curve(y_test, y_prob_platt, n_bins=10)
|
| 139 |
prob_true_iso, prob_pred_iso = calibration_curve(y_test, y_prob_iso, n_bins=10)
|
|
@@ -143,12 +111,12 @@ elif option == "Model Calibration":
|
|
| 143 |
ax.plot(prob_pred, prob_true, "s-", label="Uncalibrated Model")
|
| 144 |
ax.plot(prob_pred_platt, prob_true_platt, "o-", label="Platt Scaling")
|
| 145 |
ax.plot(prob_pred_iso, prob_true_iso, "d-", label="Isotonic Regression")
|
| 146 |
-
ax.plot([0, 1], [0, 1], "k--", label="Perfect Calibration")
|
| 147 |
ax.set_xlabel("Predicted Probability")
|
| 148 |
ax.set_ylabel("True Probability")
|
| 149 |
ax.legend()
|
| 150 |
ax.set_title("Calibration Curve (Reliability Diagram)")
|
| 151 |
-
|
| 152 |
# Display plot in Streamlit
|
| 153 |
st.pyplot(fig)
|
| 154 |
|
|
@@ -156,10 +124,11 @@ elif option == "Model Calibration":
|
|
| 156 |
# 3๏ธโฃ EVALUATE MODEL PERFORMANCE
|
| 157 |
# -----------------------
|
| 158 |
|
| 159 |
-
y_pred = clf.predict(X_test)
|
| 160 |
-
y_pred_platt = platt_scaling.predict(X_test)
|
| 161 |
-
y_pred_iso = iso_regression.predict(X_test)
|
| 162 |
|
|
|
|
| 163 |
st.subheader("๐น Model Accuracy Scores:")
|
| 164 |
st.write(f"**Uncalibrated Model Accuracy:** {accuracy_score(y_test, y_pred):.4f}")
|
| 165 |
st.write(f"**Platt Scaled Model Accuracy:** {accuracy_score(y_test, y_pred_platt):.4f}")
|
|
|
|
| 1 |
+
import streamlit as st # Web app framework
|
| 2 |
+
import numpy as np # For numerical operations
|
| 3 |
+
import matplotlib.pyplot as plt # For plotting graphs
|
| 4 |
+
from sklearn.calibration import calibration_curve, CalibratedClassifierCV # Model calibration
|
| 5 |
+
from sklearn.linear_model import LogisticRegression # Logistic regression model
|
| 6 |
+
from sklearn.model_selection import train_test_split # Splitting dataset
|
| 7 |
+
from sklearn.metrics import accuracy_score # Evaluating model accuracy
|
| 8 |
+
from Levenshtein import distance as levenshtein_distance # Levenshtein distance metric
|
| 9 |
+
from textdistance import jaro_winkler, damerau_levenshtein, cosine # Other similarity metrics
|
| 10 |
+
from sklearn.feature_extraction.text import CountVectorizer # Converting text to numerical format
|
| 11 |
+
from sklearn.preprocessing import normalize # Normalizing numerical data
|
| 12 |
+
import pandas as pd # Handling data efficiently
|
| 13 |
|
| 14 |
# -----------------------
# ๐จ STREAMLIT APP LAYOUT
# -----------------------

st.title("๐ String Similarity & Model Calibration App")  # main page title
st.sidebar.header("๐ Select an Option")  # sidebar navigation header
option = st.sidebar.radio("Choose a Task:", ["String Similarity", "Model Calibration"])  # task selector

# -----------------------
# โน๏ธ INFORMATION SECTION (For non-technical users)
# -----------------------

st.sidebar.subheader("โน๏ธ About This App")
st.sidebar.write(
    """
    This app provides two key functionalities:

    **1๏ธโฃ String Similarity** ๐
    - Used in **spell checking, data matching, and fuzzy search**.

    **2๏ธโฃ Model Calibration** ๐
    - Helps improve the **reliability of probability predictions** from ML models.

    ๐ **Project Repository:**
    ๐ [RAHULJUNEJA33/String_Similarity_Calibration-Models](https://github.com/RAHULJUNEJA33/String_Similarity_Calibration-Models)
    """
)
|
| 41 |
|
|
|
|
| 43 |
# 1๏ธโฃ STRING SIMILARITY MODELS
# -----------------------
if option == "String Similarity":
    st.header("๐ String Similarity Calculator")  # section header

    # User inputs: the two words to compare
    word1 = st.text_input("Enter First Word:", "MARTHA")
    word2 = st.text_input("Enter Second Word:", "MARHTA")

    if st.button("Compute Similarity"):  # run the metrics only on demand
        # Compute similarity metrics
        lev_dist = levenshtein_distance(word1, word2)    # minimum number of single-char edits
        jaro_wink = jaro_winkler(word1, word2)           # prefix-weighted similarity score
        damerau_lev = damerau_levenshtein(word1, word2)  # edit distance incl. transpositions
        cosine_sim = cosine(word1, word2)                # vector-angle similarity

        # Q-gram similarity: compare character n-gram count profiles of the two words
        def qgram_similarity(s1, s2, q=2):
            vec = CountVectorizer(analyzer='char', ngram_range=(q, q))  # char q-gram counter
            profiles = vec.fit_transform([s1, s2])       # one count vector per word
            profiles = normalize(profiles, norm='l1')    # L1-normalize each profile
            # Off-diagonal entry of the Gram matrix = similarity between the two words
            return (profiles * profiles.T).toarray()[0, 1]

        qgram_sim = qgram_similarity(word1, word2)
|
| 67 |
|
| 68 |
# Display Results
|
| 69 |
st.subheader("๐น Similarity Scores")
|
|
|
|
| 73 |
st.write(f"**Cosine Similarity:** {cosine_sim:.4f}")
|
| 74 |
st.write(f"**Q-Gram Similarity:** {qgram_sim:.4f}")
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
# -----------------------
|
| 77 |
# 2๏ธโฃ MODEL CALIBRATION (RELIABILITY DIAGRAM)
|
| 78 |
# -----------------------
|
| 79 |
elif option == "Model Calibration":
|
| 80 |
+
st.header("๐ Model Calibration & Reliability Diagram") # Section header
|
| 81 |
|
| 82 |
+
# Generate synthetic dataset (random data)
|
| 83 |
+
np.random.seed(42) # Set seed for reproducibility
|
| 84 |
+
X = np.random.rand(1000, 5) # 1000 samples, 5 random features
|
| 85 |
+
y = (X[:, 0] + X[:, 1] > 1).astype(int) # Classification rule (sum of first 2 features > 1)
|
| 86 |
|
| 87 |
+
# Split data into training and testing sets (70%-30%)
|
| 88 |
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
|
| 89 |
|
| 90 |
+
# Train a Logistic Regression Model
|
| 91 |
clf = LogisticRegression()
|
| 92 |
+
clf.fit(X_train, y_train) # Fit model to training data
|
| 93 |
+
y_prob = clf.predict_proba(X_test)[:, 1] # Get probability scores for class 1
|
| 94 |
|
| 95 |
+
# Apply Model Calibration (Platt Scaling & Isotonic Regression)
|
| 96 |
+
platt_scaling = CalibratedClassifierCV(clf, method='sigmoid') # Platt Scaling method
|
| 97 |
+
iso_regression = CalibratedClassifierCV(clf, method='isotonic') # Isotonic Regression method
|
| 98 |
+
platt_scaling.fit(X_train, y_train) # Train calibrated models
|
| 99 |
iso_regression.fit(X_train, y_train)
|
| 100 |
|
| 101 |
+
y_prob_platt = platt_scaling.predict_proba(X_test)[:, 1] # Platt Scaling probabilities
|
| 102 |
+
y_prob_iso = iso_regression.predict_proba(X_test)[:, 1] # Isotonic Regression probabilities
|
| 103 |
|
| 104 |
+
# Compute Calibration Curves (actual vs. predicted probabilities)
|
| 105 |
prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10)
|
| 106 |
prob_true_platt, prob_pred_platt = calibration_curve(y_test, y_prob_platt, n_bins=10)
|
| 107 |
prob_true_iso, prob_pred_iso = calibration_curve(y_test, y_prob_iso, n_bins=10)
|
|
|
|
| 111 |
ax.plot(prob_pred, prob_true, "s-", label="Uncalibrated Model")
|
| 112 |
ax.plot(prob_pred_platt, prob_true_platt, "o-", label="Platt Scaling")
|
| 113 |
ax.plot(prob_pred_iso, prob_true_iso, "d-", label="Isotonic Regression")
|
| 114 |
+
ax.plot([0, 1], [0, 1], "k--", label="Perfect Calibration") # Ideal case
|
| 115 |
ax.set_xlabel("Predicted Probability")
|
| 116 |
ax.set_ylabel("True Probability")
|
| 117 |
ax.legend()
|
| 118 |
ax.set_title("Calibration Curve (Reliability Diagram)")
|
| 119 |
+
|
| 120 |
# Display plot in Streamlit
|
| 121 |
st.pyplot(fig)
|
| 122 |
|
|
|
|
| 124 |
# 3๏ธโฃ EVALUATE MODEL PERFORMANCE
|
| 125 |
# -----------------------
|
| 126 |
|
| 127 |
+
y_pred = clf.predict(X_test) # Predictions (uncalibrated)
|
| 128 |
+
y_pred_platt = platt_scaling.predict(X_test) # Predictions (Platt Scaling)
|
| 129 |
+
y_pred_iso = iso_regression.predict(X_test) # Predictions (Isotonic Regression)
|
| 130 |
|
| 131 |
+
# Display Accuracy Scores
|
| 132 |
st.subheader("๐น Model Accuracy Scores:")
|
| 133 |
st.write(f"**Uncalibrated Model Accuracy:** {accuracy_score(y_test, y_pred):.4f}")
|
| 134 |
st.write(f"**Platt Scaled Model Accuracy:** {accuracy_score(y_test, y_pred_platt):.4f}")
|