Spaces:

Mattral
/

Excel-Match-Analysis

Sleeping

App Files Files Community

Mattral committed on Apr 26, 2024

Commit

eca35fa

verified ·

1 Parent(s): d5b07e0

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -1

app.py CHANGED Viewed

@@ -59,7 +59,39 @@ def read_csv_or_excel(file):
 def find_exact_matches(df1, df2, column_name):
     # Find rows with exact matches in the specified column
     matches = pd.merge(df1, df2, on=column_name, how='inner')
-    return matches
 def find_similar_texts(df1, df2, column_name, exact_matches, threshold=0.3):
@@ -123,6 +155,7 @@ def main():
             # Find similar texts
             similar_texts = find_similar_texts(warehouse_df, industry_df, warehouse_column, exact_matches)
             # Display results
             st.header("Exact Matches")
@@ -136,6 +169,13 @@ def main():
                 st.write(f"Industry: {text_pair[3]}")
                 st.write
 if __name__ == "__main__":
     main()

 def find_exact_matches(df1, df2, column_name):
     """Return the rows of ``df1`` and ``df2`` that share a value in ``column_name``.

     Args:
         df1: Left DataFrame.
         df2: Right DataFrame.
         column_name: Column present in both frames that is used as the join key.

     Returns:
         The inner join of ``df1`` and ``df2`` on ``column_name``.
     """
     # Hand the merged frame back to the caller; a bare ``return`` would yield
     # None and discard the result.
     return pd.merge(df1, df2, on=column_name, how='inner')
def find_similar_texts2(df1, df2, column_name, exact_matches, threshold=0.3):
    """Find row pairs whose texts in ``column_name`` are effectively the same.

    The values of both columns are converted to strings and vectorized
    together with TF-IDF. A pair is reported when its cosine similarity is 1
    (within floating-point tolerance) and its normalized Levenshtein
    similarity, ``1 - distance / max(len(a), len(b))``, is at least
    ``threshold``.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.
        column_name: Column present in both frames whose values are compared.
        exact_matches: DataFrame whose index labels mark rows to skip.
        threshold: Minimum normalized Levenshtein similarity for a pair.

    Returns:
        A list of ``(label1, label2, value1, value2)`` tuples, where the
        labels are the index labels of the rows in ``df1`` and ``df2`` and the
        values are the corresponding cells of ``column_name``.
    """
    # With no rows on either side there is nothing to pair, so skip the
    # vectorization step entirely.
    if len(df1) == 0 or len(df2) == 0:
        return []

    # NOTE(review): these are the index labels of ``exact_matches`` itself;
    # whether they line up with the labels of df1/df2 cannot be confirmed
    # from this function -- verify against the caller.
    skip_labels = set(exact_matches.index.tolist())

    values1 = df1[column_name].tolist()
    values2 = df2[column_name].tolist()
    texts1 = df1[column_name].astype(str).tolist()
    texts2 = df2[column_name].astype(str).tolist()

    # Fit on the combined corpus so both sides share one vocabulary, then
    # compare only df1 rows against df2 rows.
    tfidf_matrix = TfidfVectorizer().fit_transform(texts1 + texts2)
    split = len(texts1)
    similarity_matrix = cosine_similarity(tfidf_matrix[:split], tfidf_matrix[split:])

    # Cosine similarity is a float; compare against 1 with a small tolerance
    # instead of exact equality.
    tolerance = 1e-9

    similar_texts = []
    # Positions (not index labels) address the similarity matrix, so frames
    # with a non-default index are handled correctly.
    for pos1, label1 in enumerate(df1.index):
        if label1 in skip_labels:
            continue
        for pos2, label2 in enumerate(df2.index):
            if label2 in skip_labels:
                continue
            if abs(similarity_matrix[pos1, pos2] - 1.0) > tolerance:
                continue
            text1 = texts1[pos1]
            text2 = texts2[pos2]
            max_length = max(len(text1), len(text2))
            if max_length == 0:
                # Two empty strings are identical; avoid dividing by zero.
                similarity_score = 1.0
            else:
                distance = levenshtein_distance(text1, text2)
                similarity_score = 1 - (distance / max_length)
            if similarity_score >= threshold:
                similar_texts.append((label1, label2, values1[pos1], values2[pos2]))
    return similar_texts
 def find_similar_texts(df1, df2, column_name, exact_matches, threshold=0.3):
             # Find similar texts
             similar_texts = find_similar_texts(warehouse_df, industry_df, warehouse_column, exact_matches)
+            similar_texts2 = find_similar_texts(warehouse_df, industry_df, warehouse_column, exact_matches)
             # Display results
             st.header("Exact Matches")
                 st.write(f"Industry: {text_pair[3]}")
                 st.write
+            st.header("Exactly Same Texts")
+            for text_pair in similar_texts2:
+                st.write(f"Row {text_pair[0]} in warehouse item stocks is the same as Row {text_pair[1]} in industry item stocks:")
+                st.write(f"Warehouse: {text_pair[2]}")
+                st.write(f"Industry: {text_pair[3]}")
+                st.write
 if __name__ == "__main__":
     main()