prernajeet01 commited on
Commit
b69ea4e
·
verified ·
1 Parent(s): 2e78b4d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +270 -0
app.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.cluster import KMeans
5
+ from sklearn.preprocessing import StandardScaler
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from datetime import datetime, timedelta
8
+ import random
9
+ import string
10
+ import re
11
+
12
def generate_sample_data():
    """Build a 50-row synthetic payment DataFrame with planted near-duplicates.

    Every eighth record copies the previous record's vendor and description
    and perturbs its amount slightly, so the clustering demo has realistic
    near-duplicate pairs to discover.

    Returns:
        pd.DataFrame with columns Vendor, Amount, Date, Invoice, Description.
    """
    vendor_pool = ['ABC Corp', 'XYZ Ltd', 'Tech Solutions', 'Global Services', 'ABC Corp', 'XYZ Ltd']
    desc_pool = ['Software License', 'Consulting Services', 'Hardware Purchase', 'Monthly Subscription']

    start = datetime.now() - timedelta(days=30)
    records = []

    for idx in range(50):
        # Dict-literal evaluation order keeps the random-call sequence
        # identical to a field-by-field build.
        record = {
            'Vendor': random.choice(vendor_pool),
            'Amount': round(random.uniform(100, 5000), 2),
            'Date': (start + timedelta(days=random.randint(0, 30))).strftime('%Y-%m-%d'),
            'Invoice': f"INV-{random.randint(1000, 9999)}",
            'Description': random.choice(desc_pool),
        }

        # Plant a near-duplicate of the previous record (date and invoice
        # stay fresh; amount is nudged by up to +/-10).
        if idx > 0 and idx % 8 == 0:
            previous = records[idx - 1]
            record['Vendor'] = previous['Vendor']
            record['Amount'] = previous['Amount'] + random.uniform(-10, 10)
            record['Description'] = previous['Description']

        records.append(record)

    return pd.DataFrame(records)
43
+
44
def preprocess_data(df):
    """Build a numeric feature matrix for K-means from payment records.

    Combines three feature groups, column-wise:
      * standardized payment amount,
      * standardized day-offset from the earliest payment date,
      * TF-IDF vector (at most 10 terms, English stop words removed) of the
        concatenated vendor name and description text.

    NOTE: converts ``df['Date']`` to datetime IN PLACE; downstream display
    code relies on the column being datetime after this call.

    Args:
        df: DataFrame with at least 'Amount', 'Date', 'Vendor', 'Description'.

    Returns:
        np.ndarray of shape (len(df), 2 + n_text_terms).
    """
    # Amount feature: standardized so it is comparable to the other features.
    amounts = df['Amount'].values.reshape(-1, 1)
    scaler_amount = StandardScaler()
    amounts_scaled = scaler_amount.fit_transform(amounts)

    # Date feature: days elapsed since the earliest date, standardized.
    df['Date'] = pd.to_datetime(df['Date'])  # intentional in-place conversion
    min_date = df['Date'].min()
    date_features = (df['Date'] - min_date).dt.days.values.reshape(-1, 1)
    scaler_date = StandardScaler()
    date_features_scaled = scaler_date.fit_transform(date_features)

    # Text features: TF-IDF over vendor + description, capped at 10 terms.
    text_data = df['Vendor'].astype(str) + ' ' + df['Description'].astype(str)
    vectorizer = TfidfVectorizer(max_features=10, stop_words='english')
    text_features = vectorizer.fit_transform(text_data).toarray()

    # Stack all feature groups into a single matrix.
    return np.hstack([amounts_scaled, date_features_scaled, text_features])
70
+
71
def detect_duplicates(df, n_clusters=5):
    """Detect potential duplicate payments using K-means clustering.

    Clusters records on amount, date and text features, then scores every
    unordered pair inside a multi-member cluster: vendor match (40%),
    relative amount closeness (40%) and description word overlap / Jaccard
    (20%). Pairs scoring above 0.5 are reported as potential duplicates.

    Args:
        df: payment DataFrame with 'Vendor', 'Amount', 'Date', 'Invoice'
            and 'Description' columns.
        n_clusters: requested cluster count; capped at ``len(df)``.

    Returns:
        (result_df, message): duplicate-pair DataFrame sorted by descending
        similarity (empty when none found or on error) and a status string.
        With fewer than 2 rows, the input df is returned unchanged.
    """
    if len(df) < 2:
        return df, "Not enough data for analysis"

    try:
        # Build the numeric feature matrix (mutates df['Date'] to datetime).
        features = preprocess_data(df)

        # Never request more clusters than there are rows.
        n_clusters = min(n_clusters, len(df))
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        clusters = kmeans.fit_predict(features)

        # Attach cluster labels without touching the caller's frame further.
        df_result = df.copy()
        df_result['Cluster'] = clusters

        # Only clusters holding 2+ records can contain duplicate pairs.
        duplicate_pairs = []
        cluster_counts = pd.Series(clusters).value_counts()

        for cluster_id in cluster_counts[cluster_counts > 1].index:
            cluster_data = df_result[df_result['Cluster'] == cluster_id]

            # Score each unordered pair within the cluster.
            for i, row1 in cluster_data.iterrows():
                for j, row2 in cluster_data.iterrows():
                    if i < j:  # visit each pair once
                        vendor_match = 1 if row1['Vendor'].lower() == row2['Vendor'].lower() else 0

                        # Relative amount similarity. Guard max == 0 (both
                        # amounts zero), which previously raised
                        # ZeroDivisionError and aborted the whole analysis.
                        max_amount = max(row1['Amount'], row2['Amount'])
                        if max_amount:
                            amount_diff = abs(row1['Amount'] - row2['Amount'])
                            amount_similarity = max(0, 1 - amount_diff / max_amount)
                        else:
                            amount_similarity = 1  # both zero -> identical

                        # Jaccard similarity of description words. Guard the
                        # empty-union case (both descriptions blank).
                        words1 = set(row1['Description'].lower().split())
                        words2 = set(row2['Description'].lower().split())
                        union = words1 | words2
                        desc_similarity = len(words1 & words2) / len(union) if union else 1

                        # Weighted blend of the three component scores.
                        similarity_score = (vendor_match * 0.4 + amount_similarity * 0.4 + desc_similarity * 0.2)

                        if similarity_score > 0.5:  # threshold for potential duplicates
                            duplicate_pairs.append({
                                'Index_1': i,
                                'Index_2': j,
                                'Vendor_1': row1['Vendor'],
                                'Vendor_2': row2['Vendor'],
                                'Amount_1': row1['Amount'],
                                'Amount_2': row2['Amount'],
                                # Date may be datetime (after preprocess) or a raw string.
                                'Date_1': row1['Date'].strftime('%Y-%m-%d') if hasattr(row1['Date'], 'strftime') else row1['Date'],
                                'Date_2': row2['Date'].strftime('%Y-%m-%d') if hasattr(row2['Date'], 'strftime') else row2['Date'],
                                'Invoice_1': row1['Invoice'],
                                'Invoice_2': row2['Invoice'],
                                'Description_1': row1['Description'],
                                'Description_2': row2['Description'],
                                'Similarity_Score': round(similarity_score * 100, 2),
                                'Cluster': cluster_id
                            })

        if duplicate_pairs:
            duplicate_df = pd.DataFrame(duplicate_pairs)
            duplicate_df = duplicate_df.sort_values('Similarity_Score', ascending=False)
            return duplicate_df, f"Found {len(duplicate_pairs)} potential duplicate pairs"
        else:
            return pd.DataFrame(), "No potential duplicates found"

    except Exception as e:
        # Surface unexpected failures as a status message instead of crashing the UI.
        return pd.DataFrame(), f"Error in analysis: {str(e)}"
138
+
139
def analyze_payments(file_input, n_clusters):
    """Read an uploaded CSV/Excel file, validate it and run duplicate detection.

    Args:
        file_input: path string (as passed by ``gr.File(type="filepath")`` or
            by the sample loader) or a file-like object exposing ``.name``;
            None when nothing was uploaded.
        n_clusters: K-means cluster count forwarded to ``detect_duplicates``.

    Returns:
        (result_df, message) from ``detect_duplicates``, or an empty
        DataFrame plus an explanatory message on any validation/IO failure.
    """
    try:
        if file_input is None:
            return pd.DataFrame(), "Please upload a file or load sample data"

        # gr.File(type="filepath") passes a plain path string; older Gradio
        # versions pass a tempfile wrapper with a .name attribute. Calling
        # .name on a str raised AttributeError and broke the sample flow.
        path = file_input if isinstance(file_input, str) else file_input.name

        # Dispatch on extension; anything else is rejected.
        if path.endswith('.csv'):
            df = pd.read_csv(path)
        elif path.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(path)
        else:
            return pd.DataFrame(), "Please upload a CSV or Excel file"

        # All five columns are required by the feature pipeline.
        required_columns = ['Vendor', 'Amount', 'Date', 'Invoice', 'Description']
        missing_columns = [col for col in required_columns if col not in df.columns]

        if missing_columns:
            return pd.DataFrame(), f"Missing required columns: {', '.join(missing_columns)}"

        # Perform duplicate detection and pass its (df, message) through.
        result_df, message = detect_duplicates(df, n_clusters)

        return result_df, message

    except Exception as e:
        return pd.DataFrame(), f"Error processing file: {str(e)}"
167
+
168
def load_sample():
    """Write a fresh sample dataset to 'sample_data.csv' and return its path."""
    path = "sample_data.csv"
    # Persist to disk so the Gradio file widget can point at a real file.
    generate_sample_data().to_csv(path, index=False)
    return path
174
+
175
# Create Gradio interface.
# Two-column layout: left column handles upload, sample loading and the
# cluster-count parameter; right column shows status and the results table.
with gr.Blocks(theme=gr.themes.Soft(), title="Vendor Duplicate Analyzer") as app:

    # Header banner (static HTML).
    gr.HTML("""
    <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 10px; margin-bottom: 20px;">
        <h1 style="color: white; margin: 0; font-size: 2.5em;">🔍 Vendor Duplicate Analyzer</h1>
        <p style="color: white; margin: 10px 0 0 0; font-size: 1.2em;">Using K-means Clustering for Duplicate Detection</p>
    </div>
    """)

    with gr.Row():
        with gr.Column(scale=1):
            gr.HTML("<h3>📤 Upload Data</h3>")

            # NOTE(review): type="filepath" means handlers receive a plain
            # path string for this component, not a file object.
            file_input = gr.File(
                label="Upload Payment CSV/Excel",
                file_types=[".csv", ".xlsx", ".xls"],
                type="filepath"
            )

            gr.HTML("""
            <div style="margin: 15px 0; padding: 15px; background-color: #f8f9fa; border-radius: 8px; border-left: 4px solid #17a2b8;">
                <strong>📋 CSV Format Required</strong><br>
                Your CSV must have columns: <strong>Vendor, Amount, Date, Invoice, Description</strong>
            </div>
            """)

            with gr.Row():
                sample_btn = gr.Button("📊 Load Sample Data", variant="secondary")
                analyze_btn = gr.Button("🔍 Analyze with K-means", variant="primary")

            gr.HTML("<h3>⚙️ Parameters</h3>")
            # K for K-means; detect_duplicates additionally caps it at the row count.
            n_clusters = gr.Slider(
                minimum=2,
                maximum=10,
                value=5,
                step=1,
                label="Number of Clusters",
                info="K-means will group similar payments into this many clusters"
            )

            gr.HTML("""
            <div style="margin-top: 20px; padding: 15px; background-color: #e8f4f8; border-radius: 8px;">
                <strong>🤖 How K-means Works Here</strong>
                <ol style="margin: 10px 0 0 20px;">
                    <li>Extracts numerical features from payment records</li>
                    <li>Groups similar payments into clusters</li>
                    <li>Compares payments within each cluster</li>
                    <li>Calculates similarity scores for potential duplicates</li>
                </ol>
            </div>
            """)

        with gr.Column(scale=2):
            gr.HTML("<h3>📊 Results</h3>")

            # Status line updated by both buttons.
            status_message = gr.Textbox(
                label="Analysis Status",
                interactive=False,
                placeholder="Upload data and click 'Analyze' to begin..."
            )

            # Candidate duplicate pairs produced by detect_duplicates.
            results_table = gr.Dataframe(
                label="Potential Duplicate Pairs Found by K-means",
                interactive=False,
                wrap=True,
                column_widths=["10%", "15%", "15%", "10%", "10%", "10%", "10%", "10%", "10%"]
            )

    # Event handlers

    def handle_sample_load():
        # Writes sample_data.csv and points the file widget at it.
        sample_file = load_sample()
        return sample_file, "Sample data loaded successfully! Click 'Analyze' to detect duplicates."

    def handle_analysis(file_input, n_clusters):
        # Returns (message, dataframe) matching the outputs wired below.
        if file_input is None:
            return "Please upload a file first!", pd.DataFrame()

        result_df, message = analyze_payments(file_input, n_clusters)
        return message, result_df

    sample_btn.click(
        fn=handle_sample_load,
        outputs=[file_input, status_message]
    )

    analyze_btn.click(
        fn=handle_analysis,
        inputs=[file_input, n_clusters],
        outputs=[status_message, results_table]
    )

# Launch the app
if __name__ == "__main__":
    # share=True asks Gradio for a public tunnel URL in addition to localhost.
    app.launch(share=True)