Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,6 +7,7 @@ from Levenshtein import distance as levenshtein_distance
|
|
| 7 |
import matplotlib.pyplot as plt
|
| 8 |
import seaborn as sns
|
| 9 |
|
|
|
|
| 10 |
ms = st.session_state
|
| 11 |
if "themes" not in ms:
|
| 12 |
ms.themes = {"current_theme": "light",
|
|
@@ -56,13 +57,20 @@ def read_csv_or_excel(file):
|
|
| 56 |
return pd.read_excel(file)
|
| 57 |
else:
|
| 58 |
raise ValueError("Unsupported file format. Only CSV and Excel files are supported.")
|
|
|
|
| 59 |
|
| 60 |
def find_exact_match(df1, df2, column_name):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
# Find rows with exact matches in the specified column
|
| 62 |
matches = pd.merge(df1, df2, on=column_name, how='inner')
|
| 63 |
return matches
|
| 64 |
|
| 65 |
|
|
|
|
|
|
|
| 66 |
def find_similar_texts(df1, df2, column_name, threshold=0.3):
|
| 67 |
# Find rows with similar texts in the specified column, excluding exact matches
|
| 68 |
similar_texts = []
|
|
@@ -108,6 +116,7 @@ def plot_correlation(df, column):
|
|
| 108 |
return plt.gcf() # Return the matplotlib figure
|
| 109 |
|
| 110 |
st.set_option('deprecation.showPyplotGlobalUse', False)
|
|
|
|
| 111 |
def plot_correlation_matrix(df):
|
| 112 |
# Filter for numeric columns, if the DataFrame has non-numeric columns
|
| 113 |
numeric_df = df.select_dtypes(include=['number'])
|
|
@@ -160,6 +169,8 @@ def main():
|
|
| 160 |
# Display exact matches
|
| 161 |
st.header("Exact Matches Compare")
|
| 162 |
for match in exact_matches:
|
|
|
|
|
|
|
| 163 |
st.write(f"Row {match[0]} in warehouse item stocks is exactly the same as Row {match[1]} in industry item stocks:")
|
| 164 |
st.write(f"Warehouse: {match[2]}")
|
| 165 |
st.write(f"Industry: {match[3]}")
|
|
@@ -169,6 +180,9 @@ def main():
|
|
| 169 |
# Display similar texts
|
| 170 |
st.header("Similar (but Not Same) Texts")
|
| 171 |
for text_pair in similar_texts:
|
|
|
|
|
|
|
|
|
|
| 172 |
st.write(f"Row {text_pair[0]} in warehouse item stocks is similar to Row {text_pair[1]} in industry item stocks:")
|
| 173 |
st.write(f"Warehouse: {text_pair[2]}")
|
| 174 |
st.write(f"Industry: {text_pair[3]}")
|
|
@@ -202,4 +216,4 @@ def main():
|
|
| 202 |
plot_correlation_matrix(industry_df)
|
| 203 |
|
| 204 |
if __name__ == "__main__":
|
| 205 |
-
main()
|
|
|
|
| 7 |
import matplotlib.pyplot as plt
|
| 8 |
import seaborn as sns
|
| 9 |
|
| 10 |
+
|
| 11 |
ms = st.session_state
|
| 12 |
if "themes" not in ms:
|
| 13 |
ms.themes = {"current_theme": "light",
|
|
|
|
| 57 |
return pd.read_excel(file)
|
| 58 |
else:
|
| 59 |
raise ValueError("Unsupported file format. Only CSV and Excel files are supported.")
|
| 60 |
+
|
| 61 |
|
| 62 |
def find_exact_match(df1, df2, column_name):
|
| 63 |
+
# Ensure the column for merging has the same data type
|
| 64 |
+
df1[column_name] = df1[column_name].astype(str).str.strip()
|
| 65 |
+
df2[column_name] = df2[column_name].astype(str).str.strip()
|
| 66 |
+
|
| 67 |
# Find rows with exact matches in the specified column
|
| 68 |
matches = pd.merge(df1, df2, on=column_name, how='inner')
|
| 69 |
return matches
|
| 70 |
|
| 71 |
|
| 72 |
+
|
| 73 |
+
|
| 74 |
def find_similar_texts(df1, df2, column_name, threshold=0.3):
|
| 75 |
# Find rows with similar texts in the specified column, excluding exact matches
|
| 76 |
similar_texts = []
|
|
|
|
| 116 |
return plt.gcf() # Return the matplotlib figure
|
| 117 |
|
| 118 |
st.set_option('deprecation.showPyplotGlobalUse', False)
|
| 119 |
+
|
| 120 |
def plot_correlation_matrix(df):
|
| 121 |
# Filter for numeric columns, if the DataFrame has non-numeric columns
|
| 122 |
numeric_df = df.select_dtypes(include=['number'])
|
|
|
|
| 169 |
# Display exact matches
|
| 170 |
st.header("Exact Matches Compare")
|
| 171 |
for match in exact_matches:
|
| 172 |
+
warehouse_index = text_pair[0] + 2
|
| 173 |
+
industry_index = text_pair[1] + 2
|
| 174 |
st.write(f"Row {match[0]} in warehouse item stocks is exactly the same as Row {match[1]} in industry item stocks:")
|
| 175 |
st.write(f"Warehouse: {match[2]}")
|
| 176 |
st.write(f"Industry: {match[3]}")
|
|
|
|
| 180 |
# Display similar texts
|
| 181 |
st.header("Similar (but Not Same) Texts")
|
| 182 |
for text_pair in similar_texts:
|
| 183 |
+
warehouse_index = text_pair[0] + 2
|
| 184 |
+
industry_index = text_pair[1] + 2
|
| 185 |
+
|
| 186 |
st.write(f"Row {text_pair[0]} in warehouse item stocks is similar to Row {text_pair[1]} in industry item stocks:")
|
| 187 |
st.write(f"Warehouse: {text_pair[2]}")
|
| 188 |
st.write(f"Industry: {text_pair[3]}")
|
|
|
|
| 216 |
plot_correlation_matrix(industry_df)
|
| 217 |
|
| 218 |
if __name__ == "__main__":
|
| 219 |
+
main()
|