"""Streamlit app: string-similarity calculator and model-calibration demo.

The sidebar selects one of two pages:

* String Similarity - compares two user-supplied words with several
  distance/similarity measures.
* Model Calibration - trains a logistic regression on synthetic data,
  calibrates it with Platt scaling and isotonic regression, and shows a
  reliability diagram together with accuracy scores.
"""

import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
from Levenshtein import distance as levenshtein_distance
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from textdistance import jaro_winkler, damerau_levenshtein, cosine

# Seed shared by the synthetic data generator and the train/test split.
RANDOM_STATE = 42

# Sidebar Markdown. Kept flush-left so no line can be read as an indented
# code block.
ABOUT_TEXT = """
This app explores two key concepts:

**1️⃣ String Similarity Models** 📝
- Compare words using different similarity algorithms.
- Helps with **spell checking, record linkage, and fuzzy matching**.

**2️⃣ Model Calibration** 📊
- Evaluate how well a model’s probability predictions match reality.
- Uses **Platt Scaling & Isotonic Regression** to improve predictions.
"""

HOW_IT_WORKS_TEXT = """
- **Levenshtein Distance**: Counts how many edits are needed to turn one word into another.
- **Jaro-Winkler**: Focuses on shared characters, especially at the start of words.
- **Damerau-Levenshtein**: Similar to Levenshtein but also considers transpositions (changing letter order).
- **Cosine Similarity**: Treats words as vectors (arrays of numbers) and compares their angle.
- **Q-Gram Similarity**: Breaks words into small parts (n-grams) and compares them.

**Model Calibration**
- Checks how accurate a model’s probability predictions are.
- **Platt Scaling** applies logistic regression for adjustment.
- **Isotonic Regression** fine-tunes predictions using a flexible non-linear approach.
"""


def qgram_similarity(s1: str, s2: str, q: int = 2) -> float:
    """Return the cosine similarity of the character q-gram profiles.

    Each string is turned into a vector of character q-gram counts; the
    vectors are scaled to unit Euclidean length and their dot product is
    returned, so identical strings score 1.0 and strings without a common
    q-gram score 0.0.

    Args:
        s1: First string.
        s2: Second string.
        q: Length of the character n-grams.

    Returns:
        The similarity score, or 0.0 when neither string yields a q-gram
        (for example, both are shorter than ``q``).
    """
    # NOTE: CountVectorizer lowercases its input by default, so this score is
    # case-insensitive, unlike the other measures shown on the page.
    vectorizer = CountVectorizer(analyzer="char", ngram_range=(q, q))
    try:
        counts = vectorizer.fit_transform([s1, s2])
    except ValueError:
        # Raised for an empty vocabulary: there is nothing to compare.
        return 0.0
    # L2 (not L1) normalisation is what turns the dot product into a cosine
    # similarity; with L1 a word compared against itself scores below 1.
    profiles = normalize(counts, norm="l2")
    # "@" is always a matrix product, whichever sparse container is returned.
    return float((profiles @ profiles.T).toarray()[0, 1])


def _render_sidebar() -> str:
    """Draw the title and sidebar, and return the selected task name."""
    st.title("🔍 String Similarity & Model Calibration App")

    st.sidebar.header("📌 Select an Option")
    option = st.sidebar.radio(
        "Choose a Task:", ["String Similarity", "Model Calibration"]
    )

    st.sidebar.subheader("ℹ️ About This App")
    st.sidebar.write(ABOUT_TEXT)

    st.sidebar.subheader("🧠 How It Works?")
    st.sidebar.write(HOW_IT_WORKS_TEXT)
    return option


def _render_string_similarity() -> None:
    """Draw the page that compares two user-supplied words."""
    st.header("📝 String Similarity Calculator")

    word1 = st.text_input("Enter First Word:", "AARUSH")
    word2 = st.text_input("Enter Second Word:", "AASHVI")

    # Nothing is computed until the user asks for it.
    if not st.button("Compute Similarity"):
        return

    lev_dist = levenshtein_distance(word1, word2)
    jaro_wink = jaro_winkler(word1, word2)
    damerau_lev = damerau_levenshtein(word1, word2)
    cosine_sim = cosine(word1, word2)
    qgram_sim = qgram_similarity(word1, word2)

    st.subheader("🔹 Similarity Scores")
    st.write(f"**Levenshtein Distance:** {lev_dist}")
    st.write(f"**Jaro-Winkler Similarity:** {jaro_wink:.4f}")
    st.write(f"**Damerau-Levenshtein Distance:** {damerau_lev}")
    st.write(f"**Cosine Similarity:** {cosine_sim:.4f}")
    st.write(f"**Q-Gram Similarity:** {qgram_sim:.4f}")


def _make_dataset():
    """Return ``(X_train, X_test, y_train, y_test)`` for the synthetic task.

    Features are 1000 x 5 uniform random numbers; a sample is labelled 1
    when its first two features sum to more than 1.
    """
    # A private RandomState yields the same numbers as seeding the global
    # generator with the same value, without touching global state.
    rng = np.random.RandomState(RANDOM_STATE)
    X = rng.rand(1000, 5)
    y = (X[:, 0] + X[:, 1] > 1).astype(int)
    return train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)


def _render_model_calibration() -> None:
    """Draw the reliability diagram and accuracy scores for three models."""
    st.header("📊 Model Calibration & Reliability Diagram")

    X_train, X_test, y_train, y_test = _make_dataset()

    base_model = LogisticRegression()
    base_model.fit(X_train, y_train)

    # Both calibrators wrap the same base estimator and are fitted on the
    # training split only.
    platt_model = CalibratedClassifierCV(base_model, method="sigmoid")
    platt_model.fit(X_train, y_train)
    isotonic_model = CalibratedClassifierCV(base_model, method="isotonic")
    isotonic_model.fit(X_train, y_train)

    # (curve label, line style, accuracy label, fitted model)
    variants = [
        ("Uncalibrated Model", "s-", "Uncalibrated Model Accuracy", base_model),
        ("Platt Scaling", "o-", "Platt Scaled Model Accuracy", platt_model),
        (
            "Isotonic Regression",
            "d-",
            "Isotonic Regression Model Accuracy",
            isotonic_model,
        ),
    ]

    fig, ax = plt.subplots(figsize=(8, 6))
    for curve_label, line_style, _, model in variants:
        positive_prob = model.predict_proba(X_test)[:, 1]
        prob_true, prob_pred = calibration_curve(
            y_test, positive_prob, n_bins=10
        )
        ax.plot(prob_pred, prob_true, line_style, label=curve_label)
    ax.plot([0, 1], [0, 1], "k--", label="Perfect Calibration")
    ax.set_xlabel("Predicted Probability")
    ax.set_ylabel("True Probability")
    ax.legend()
    ax.set_title("Calibration Curve (Reliability Diagram)")

    st.pyplot(fig)
    # The script is re-executed on every interaction; without this, each
    # rerun would leave another figure registered in pyplot.
    plt.close(fig)

    st.subheader("🔹 Model Accuracy Scores:")
    for _, _, accuracy_label, model in variants:
        accuracy = accuracy_score(y_test, model.predict(X_test))
        st.write(f"**{accuracy_label}:** {accuracy:.4f}")


def main() -> None:
    """Render the sidebar, then the page for the selected task."""
    option = _render_sidebar()
    if option == "String Similarity":
        _render_string_similarity()
    elif option == "Model Calibration":
        _render_model_calibration()


# Streamlit executes this file top to bottom on every rerun, so the app is
# rendered unconditionally at module level, as before.
main()