RAHULJUNEJA33 committed on
Commit
1e6fe2e
·
verified ·
1 Parent(s): 3f24b7c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -90
app.py CHANGED
@@ -1,56 +1,41 @@
1
- import streamlit as st
2
- import numpy as np
3
- import matplotlib.pyplot as plt
4
- from sklearn.calibration import calibration_curve, CalibratedClassifierCV
5
- from sklearn.linear_model import LogisticRegression
6
- from sklearn.model_selection import train_test_split
7
- from sklearn.metrics import accuracy_score
8
- from Levenshtein import distance as levenshtein_distance
9
- from textdistance import jaro_winkler, damerau_levenshtein, cosine
10
- from sklearn.feature_extraction.text import CountVectorizer
11
- from sklearn.preprocessing import normalize
12
- import pandas as pd
13
 
14
  # -----------------------
15
- # 🎨 Streamlit App Layout
16
  # -----------------------
17
 
18
- st.title("🔍 String Similarity & Model Calibration App")
19
- st.sidebar.header("📌 Select an Option")
20
- option = st.sidebar.radio("Choose a Task:", ["String Similarity", "Model Calibration"])
21
 
22
  # -----------------------
23
- # ℹ️ PROJECT INFORMATION & EDUCATION SECTION
24
  # -----------------------
25
 
26
  st.sidebar.subheader("ℹ️ About This App")
27
  st.sidebar.write(
28
  """
29
- This app explores two key concepts:
30
-
31
- **1️⃣ String Similarity Models** 📝
32
- - Compare words using different similarity algorithms.
33
- - Helps with **spell checking, record linkage, and fuzzy matching**.
34
 
 
 
 
35
  **2️⃣ Model Calibration** 📊
36
- - Evaluate how well a model's probability predictions match reality.
37
- - Uses **Platt Scaling & Isotonic Regression** to improve predictions.
38
- """
39
- )
40
 
41
- st.sidebar.subheader("🧠 How It Works?")
42
- st.sidebar.write(
43
- """
44
- - **Levenshtein Distance**: Counts how many edits are needed to turn one word into another.
45
- - **Jaro-Winkler**: Focuses on shared characters, especially at the start of words.
46
- - **Damerau-Levenshtein**: Similar to Levenshtein but also considers transpositions.
47
- - **Cosine Similarity**: Treats words as vectors and compares their angle.
48
- - **Q-Gram Similarity**: Breaks words into small parts (n-grams) and compares them.
49
-
50
- **Model Calibration**
51
- - Checks how accurate a model's probability predictions are.
52
- - **Platt Scaling** applies logistic regression for adjustment.
53
- - **Isotonic Regression** fine-tunes predictions using a flexible non-linear approach.
54
  """
55
  )
56
 
@@ -58,27 +43,27 @@ st.sidebar.write(
58
  # 1️⃣ STRING SIMILARITY MODELS
59
  # -----------------------
60
  if option == "String Similarity":
61
- st.header("📝 String Similarity Calculator")
62
 
63
- # User input
64
  word1 = st.text_input("Enter First Word:", "MARTHA")
65
  word2 = st.text_input("Enter Second Word:", "MARHTA")
66
 
67
- if st.button("Compute Similarity"):
68
  # Compute similarity metrics
69
- lev_dist = levenshtein_distance(word1, word2)
70
- jaro_wink = jaro_winkler(word1, word2)
71
- damerau_lev = damerau_levenshtein(word1, word2)
72
- cosine_sim = cosine(word1, word2)
73
 
74
- # Q-Gram Similarity
75
  def qgram_similarity(s1, s2, q=2):
76
- vectorizer = CountVectorizer(analyzer='char', ngram_range=(q, q))
77
- q1 = vectorizer.fit_transform([s1, s2])
78
- q1 = normalize(q1, norm='l1')
79
- return (q1 * q1.T).toarray()[0, 1]
80
 
81
- qgram_sim = qgram_similarity(word1, word2)
82
 
83
  # Display Results
84
  st.subheader("🔹 Similarity Scores")
@@ -88,52 +73,35 @@ if option == "String Similarity":
88
  st.write(f"**Cosine Similarity:** {cosine_sim:.4f}")
89
  st.write(f"**Q-Gram Similarity:** {qgram_sim:.4f}")
90
 
91
- # -----------------------
92
- # 📊 STRING SIMILARITY EXAMPLES TABLE
93
- # -----------------------
94
-
95
- st.subheader("📊 Example Word Comparisons")
96
- data = {
97
- "Word 1": ["MARTHA", "HOUSE", "SUNDAY", "NIGHT", "FLIGHT"],
98
- "Word 2": ["MARHTA", "HORSE", "MONDAY", "KNIGHT", "FIGHT"],
99
- "Levenshtein Distance": [1, 2, 2, 2, 1],
100
- "Jaro-Winkler Similarity": [0.9611, 0.8375, 0.8222, 0.9444, 0.9740],
101
- "Damerau-Levenshtein Distance": [1, 1, 2, 1, 1],
102
- "Cosine Similarity": [0.8333, 0.7500, 0.6667, 0.8000, 0.9500],
103
- "Q-Gram Similarity": [0.8571, 0.7143, 0.6667, 0.7778, 0.9231],
104
- }
105
- df = pd.DataFrame(data)
106
- st.table(df)
107
-
108
  # -----------------------
109
  # 2️⃣ MODEL CALIBRATION (RELIABILITY DIAGRAM)
110
  # -----------------------
111
  elif option == "Model Calibration":
112
- st.header("📊 Model Calibration & Reliability Diagram")
113
 
114
- # Generate synthetic dataset
115
- np.random.seed(42)
116
- X = np.random.rand(1000, 5) # Random features
117
- y = (X[:, 0] + X[:, 1] > 1).astype(int) # Simple classification rule
118
 
119
- # Split into train/test
120
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
121
 
122
- # Train Logistic Regression Model
123
  clf = LogisticRegression()
124
- clf.fit(X_train, y_train)
125
- y_prob = clf.predict_proba(X_test)[:, 1] # Get probability scores
126
 
127
- # Apply Calibration (Platt Scaling & Isotonic Regression)
128
- platt_scaling = CalibratedClassifierCV(clf, method='sigmoid') # Platt Scaling
129
- iso_regression = CalibratedClassifierCV(clf, method='isotonic') # Isotonic Regression
130
- platt_scaling.fit(X_train, y_train)
131
  iso_regression.fit(X_train, y_train)
132
 
133
- y_prob_platt = platt_scaling.predict_proba(X_test)[:, 1]
134
- y_prob_iso = iso_regression.predict_proba(X_test)[:, 1]
135
 
136
- # Compute Calibration Curves
137
  prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10)
138
  prob_true_platt, prob_pred_platt = calibration_curve(y_test, y_prob_platt, n_bins=10)
139
  prob_true_iso, prob_pred_iso = calibration_curve(y_test, y_prob_iso, n_bins=10)
@@ -143,12 +111,12 @@ elif option == "Model Calibration":
143
  ax.plot(prob_pred, prob_true, "s-", label="Uncalibrated Model")
144
  ax.plot(prob_pred_platt, prob_true_platt, "o-", label="Platt Scaling")
145
  ax.plot(prob_pred_iso, prob_true_iso, "d-", label="Isotonic Regression")
146
- ax.plot([0, 1], [0, 1], "k--", label="Perfect Calibration")
147
  ax.set_xlabel("Predicted Probability")
148
  ax.set_ylabel("True Probability")
149
  ax.legend()
150
  ax.set_title("Calibration Curve (Reliability Diagram)")
151
-
152
  # Display plot in Streamlit
153
  st.pyplot(fig)
154
 
@@ -156,10 +124,11 @@ elif option == "Model Calibration":
156
  # 3️⃣ EVALUATE MODEL PERFORMANCE
157
  # -----------------------
158
 
159
- y_pred = clf.predict(X_test)
160
- y_pred_platt = platt_scaling.predict(X_test)
161
- y_pred_iso = iso_regression.predict(X_test)
162
 
 
163
  st.subheader("🔹 Model Accuracy Scores:")
164
  st.write(f"**Uncalibrated Model Accuracy:** {accuracy_score(y_test, y_pred):.4f}")
165
  st.write(f"**Platt Scaled Model Accuracy:** {accuracy_score(y_test, y_pred_platt):.4f}")
 
1
+ import streamlit as st # Web app framework
2
+ import numpy as np # For numerical operations
3
+ import matplotlib.pyplot as plt # For plotting graphs
4
+ from sklearn.calibration import calibration_curve, CalibratedClassifierCV # Model calibration
5
+ from sklearn.linear_model import LogisticRegression # Logistic regression model
6
+ from sklearn.model_selection import train_test_split # Splitting dataset
7
+ from sklearn.metrics import accuracy_score # Evaluating model accuracy
8
+ from Levenshtein import distance as levenshtein_distance # Levenshtein distance metric
9
+ from textdistance import jaro_winkler, damerau_levenshtein, cosine # Other similarity metrics
10
+ from sklearn.feature_extraction.text import CountVectorizer # Converting text to numerical format
11
+ from sklearn.preprocessing import normalize # Normalizing numerical data
12
+ import pandas as pd # Handling data efficiently
13
 
14
  # -----------------------
15
+ # 🎨 STREAMLIT APP LAYOUT
16
  # -----------------------
17
 
18
+ st.title("🔍 String Similarity & Model Calibration App") # Main title
19
+ st.sidebar.header("📌 Select an Option") # Sidebar header
20
+ option = st.sidebar.radio("Choose a Task:", ["String Similarity", "Model Calibration"]) # User selection
21
 
22
  # -----------------------
23
+ # ℹ️ INFORMATION SECTION (For non-technical users)
24
  # -----------------------
25
 
26
  st.sidebar.subheader("ℹ️ About This App")
27
  st.sidebar.write(
28
  """
29
+ This app provides two key functionalities:
 
 
 
 
30
 
31
+ **1️⃣ String Similarity** 📝
32
+ - Used in **spell checking, data matching, and fuzzy search**.
33
+
34
  **2️⃣ Model Calibration** 📊
35
+ - Helps improve the **reliability of probability predictions** from ML models.
 
 
 
36
 
37
+ 📌 **Project Repository:**
38
+ 👉 [RAHULJUNEJA33/String_Similarity_Calibration-Models](https://github.com/RAHULJUNEJA33/String_Similarity_Calibration-Models)
 
 
 
 
 
 
 
 
 
 
 
39
  """
40
  )
41
 
 
43
  # 1️⃣ STRING SIMILARITY MODELS
44
  # -----------------------
45
  if option == "String Similarity":
46
+ st.header("📝 String Similarity Calculator") # Section header
47
 
48
+ # User inputs: Two words to compare
49
  word1 = st.text_input("Enter First Word:", "MARTHA")
50
  word2 = st.text_input("Enter Second Word:", "MARHTA")
51
 
52
+ if st.button("Compute Similarity"): # Compute similarity when button is clicked
53
  # Compute similarity metrics
54
+ lev_dist = levenshtein_distance(word1, word2) # Levenshtein Distance
55
+ jaro_wink = jaro_winkler(word1, word2) # Jaro-Winkler Similarity
56
+ damerau_lev = damerau_levenshtein(word1, word2) # Damerau-Levenshtein Distance
57
+ cosine_sim = cosine(word1, word2) # Cosine Similarity
58
 
59
+ # Q-Gram Similarity Function
60
  def qgram_similarity(s1, s2, q=2):
61
+ vectorizer = CountVectorizer(analyzer='char', ngram_range=(q, q)) # Convert text into character n-grams
62
+ q1 = vectorizer.fit_transform([s1, s2]) # Transform input words into vectors
63
+ q1 = normalize(q1, norm='l1') # Normalize the vectors
64
+ return (q1 * q1.T).toarray()[0, 1] # Compute similarity score
65
 
66
+ qgram_sim = qgram_similarity(word1, word2) # Compute Q-Gram Similarity
67
 
68
  # Display Results
69
  st.subheader("🔹 Similarity Scores")
 
73
  st.write(f"**Cosine Similarity:** {cosine_sim:.4f}")
74
  st.write(f"**Q-Gram Similarity:** {qgram_sim:.4f}")
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  # -----------------------
77
  # 2️⃣ MODEL CALIBRATION (RELIABILITY DIAGRAM)
78
  # -----------------------
79
  elif option == "Model Calibration":
80
+ st.header("📊 Model Calibration & Reliability Diagram") # Section header
81
 
82
+ # Generate synthetic dataset (random data)
83
+ np.random.seed(42) # Set seed for reproducibility
84
+ X = np.random.rand(1000, 5) # 1000 samples, 5 random features
85
+ y = (X[:, 0] + X[:, 1] > 1).astype(int) # Classification rule (sum of first 2 features > 1)
86
 
87
+ # Split data into training and testing sets (70%-30%)
88
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
89
 
90
+ # Train a Logistic Regression Model
91
  clf = LogisticRegression()
92
+ clf.fit(X_train, y_train) # Fit model to training data
93
+ y_prob = clf.predict_proba(X_test)[:, 1] # Get probability scores for class 1
94
 
95
+ # Apply Model Calibration (Platt Scaling & Isotonic Regression)
96
+ platt_scaling = CalibratedClassifierCV(clf, method='sigmoid') # Platt Scaling method
97
+ iso_regression = CalibratedClassifierCV(clf, method='isotonic') # Isotonic Regression method
98
+ platt_scaling.fit(X_train, y_train) # Train calibrated models
99
  iso_regression.fit(X_train, y_train)
100
 
101
+ y_prob_platt = platt_scaling.predict_proba(X_test)[:, 1] # Platt Scaling probabilities
102
+ y_prob_iso = iso_regression.predict_proba(X_test)[:, 1] # Isotonic Regression probabilities
103
 
104
+ # Compute Calibration Curves (actual vs. predicted probabilities)
105
  prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10)
106
  prob_true_platt, prob_pred_platt = calibration_curve(y_test, y_prob_platt, n_bins=10)
107
  prob_true_iso, prob_pred_iso = calibration_curve(y_test, y_prob_iso, n_bins=10)
 
111
  ax.plot(prob_pred, prob_true, "s-", label="Uncalibrated Model")
112
  ax.plot(prob_pred_platt, prob_true_platt, "o-", label="Platt Scaling")
113
  ax.plot(prob_pred_iso, prob_true_iso, "d-", label="Isotonic Regression")
114
+ ax.plot([0, 1], [0, 1], "k--", label="Perfect Calibration") # Ideal case
115
  ax.set_xlabel("Predicted Probability")
116
  ax.set_ylabel("True Probability")
117
  ax.legend()
118
  ax.set_title("Calibration Curve (Reliability Diagram)")
119
+
120
  # Display plot in Streamlit
121
  st.pyplot(fig)
122
 
 
124
  # 3️⃣ EVALUATE MODEL PERFORMANCE
125
  # -----------------------
126
 
127
+ y_pred = clf.predict(X_test) # Predictions (uncalibrated)
128
+ y_pred_platt = platt_scaling.predict(X_test) # Predictions (Platt Scaling)
129
+ y_pred_iso = iso_regression.predict(X_test) # Predictions (Isotonic Regression)
130
 
131
+ # Display Accuracy Scores
132
  st.subheader("🔹 Model Accuracy Scores:")
133
  st.write(f"**Uncalibrated Model Accuracy:** {accuracy_score(y_test, y_pred):.4f}")
134
  st.write(f"**Platt Scaled Model Accuracy:** {accuracy_score(y_test, y_pred_platt):.4f}")