prernajeet01 commited on
Commit
b69ea4e
·
verified ·
1 Parent(s): 2e78b4d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +270 -0
app.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.cluster import KMeans
5
+ from sklearn.preprocessing import StandardScaler
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from datetime import datetime, timedelta
8
+ import random
9
+ import string
10
+ import re
11
+
12
def generate_sample_data():
    """Build a 50-row synthetic payment DataFrame with planted near-duplicates.

    Every eighth record copies the previous record's vendor and description
    and perturbs its amount slightly, so the clustering demo has realistic
    near-duplicate pairs to discover.

    Returns:
        pd.DataFrame with columns Vendor, Amount, Date, Invoice, Description.
    """
    vendor_pool = ['ABC Corp', 'XYZ Ltd', 'Tech Solutions', 'Global Services', 'ABC Corp', 'XYZ Ltd']
    desc_pool = ['Software License', 'Consulting Services', 'Hardware Purchase', 'Monthly Subscription']

    start = datetime.now() - timedelta(days=30)
    records = []

    for idx in range(50):
        # Dict-literal evaluation order keeps the random-call sequence
        # identical to a field-by-field build.
        record = {
            'Vendor': random.choice(vendor_pool),
            'Amount': round(random.uniform(100, 5000), 2),
            'Date': (start + timedelta(days=random.randint(0, 30))).strftime('%Y-%m-%d'),
            'Invoice': f"INV-{random.randint(1000, 9999)}",
            'Description': random.choice(desc_pool),
        }

        # Plant a near-duplicate of the previous record (date and invoice
        # stay fresh; amount is nudged by up to +/-10).
        if idx > 0 and idx % 8 == 0:
            previous = records[idx - 1]
            record['Vendor'] = previous['Vendor']
            record['Amount'] = previous['Amount'] + random.uniform(-10, 10)
            record['Description'] = previous['Description']

        records.append(record)

    return pd.DataFrame(records)
43
+
44
def preprocess_data(df):
    """Build a numeric feature matrix for K-means from payment records.

    Combines three feature groups, column-wise:
      * standardized payment amount,
      * standardized day-offset from the earliest payment date,
      * TF-IDF vector (at most 10 terms, English stop words removed) of the
        concatenated vendor name and description text.

    NOTE: converts ``df['Date']`` to datetime IN PLACE; downstream display
    code relies on the column being datetime after this call.

    Args:
        df: DataFrame with at least 'Amount', 'Date', 'Vendor', 'Description'.

    Returns:
        np.ndarray of shape (len(df), 2 + n_text_terms).
    """
    # Amount feature: standardized so it is comparable to the other features.
    amounts = df['Amount'].values.reshape(-1, 1)
    scaler_amount = StandardScaler()
    amounts_scaled = scaler_amount.fit_transform(amounts)

    # Date feature: days elapsed since the earliest date, standardized.
    df['Date'] = pd.to_datetime(df['Date'])  # intentional in-place conversion
    min_date = df['Date'].min()
    date_features = (df['Date'] - min_date).dt.days.values.reshape(-1, 1)
    scaler_date = StandardScaler()
    date_features_scaled = scaler_date.fit_transform(date_features)

    # Text features: TF-IDF over vendor + description, capped at 10 terms.
    text_data = df['Vendor'].astype(str) + ' ' + df['Description'].astype(str)
    vectorizer = TfidfVectorizer(max_features=10, stop_words='english')
    text_features = vectorizer.fit_transform(text_data).toarray()

    # Stack all feature groups into a single matrix.
    return np.hstack([amounts_scaled, date_features_scaled, text_features])
70
+
71
def detect_duplicates(df, n_clusters=5):
    """Detect potential duplicate payments using K-means clustering.

    Clusters records on amount, date and text features, then scores every
    unordered pair inside a multi-member cluster: vendor match (40%),
    relative amount closeness (40%) and description word overlap / Jaccard
    (20%). Pairs scoring above 0.5 are reported as potential duplicates.

    Args:
        df: payment DataFrame with 'Vendor', 'Amount', 'Date', 'Invoice'
            and 'Description' columns.
        n_clusters: requested cluster count; capped at ``len(df)``.

    Returns:
        (result_df, message): duplicate-pair DataFrame sorted by descending
        similarity (empty when none found or on error) and a status string.
        With fewer than 2 rows, the input df is returned unchanged.
    """
    if len(df) < 2:
        return df, "Not enough data for analysis"

    try:
        # Build the numeric feature matrix (mutates df['Date'] to datetime).
        features = preprocess_data(df)

        # Never request more clusters than there are rows.
        n_clusters = min(n_clusters, len(df))
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        clusters = kmeans.fit_predict(features)

        # Attach cluster labels without touching the caller's frame further.
        df_result = df.copy()
        df_result['Cluster'] = clusters

        # Only clusters holding 2+ records can contain duplicate pairs.
        duplicate_pairs = []
        cluster_counts = pd.Series(clusters).value_counts()

        for cluster_id in cluster_counts[cluster_counts > 1].index:
            cluster_data = df_result[df_result['Cluster'] == cluster_id]

            # Score each unordered pair within the cluster.
            for i, row1 in cluster_data.iterrows():
                for j, row2 in cluster_data.iterrows():
                    if i < j:  # visit each pair once
                        vendor_match = 1 if row1['Vendor'].lower() == row2['Vendor'].lower() else 0

                        # Relative amount similarity. Guard max == 0 (both
                        # amounts zero), which previously raised
                        # ZeroDivisionError and aborted the whole analysis.
                        max_amount = max(row1['Amount'], row2['Amount'])
                        if max_amount:
                            amount_diff = abs(row1['Amount'] - row2['Amount'])
                            amount_similarity = max(0, 1 - amount_diff / max_amount)
                        else:
                            amount_similarity = 1  # both zero -> identical

                        # Jaccard similarity of description words. Guard the
                        # empty-union case (both descriptions blank).
                        words1 = set(row1['Description'].lower().split())
                        words2 = set(row2['Description'].lower().split())
                        union = words1 | words2
                        desc_similarity = len(words1 & words2) / len(union) if union else 1

                        # Weighted blend of the three component scores.
                        similarity_score = (vendor_match * 0.4 + amount_similarity * 0.4 + desc_similarity * 0.2)

                        if similarity_score > 0.5:  # threshold for potential duplicates
                            duplicate_pairs.append({
                                'Index_1': i,
                                'Index_2': j,
                                'Vendor_1': row1['Vendor'],
                                'Vendor_2': row2['Vendor'],
                                'Amount_1': row1['Amount'],
                                'Amount_2': row2['Amount'],
                                # Date may be datetime (after preprocess) or a raw string.
                                'Date_1': row1['Date'].strftime('%Y-%m-%d') if hasattr(row1['Date'], 'strftime') else row1['Date'],
                                'Date_2': row2['Date'].strftime('%Y-%m-%d') if hasattr(row2['Date'], 'strftime') else row2['Date'],
                                'Invoice_1': row1['Invoice'],
                                'Invoice_2': row2['Invoice'],
                                'Description_1': row1['Description'],
                                'Description_2': row2['Description'],
                                'Similarity_Score': round(similarity_score * 100, 2),
                                'Cluster': cluster_id
                            })

        if duplicate_pairs:
            duplicate_df = pd.DataFrame(duplicate_pairs)
            duplicate_df = duplicate_df.sort_values('Similarity_Score', ascending=False)
            return duplicate_df, f"Found {len(duplicate_pairs)} potential duplicate pairs"
        else:
            return pd.DataFrame(), "No potential duplicates found"

    except Exception as e:
        # Surface unexpected failures as a status message instead of crashing the UI.
        return pd.DataFrame(), f"Error in analysis: {str(e)}"
138
+
139
def analyze_payments(file_input, n_clusters):
    """Read an uploaded CSV/Excel file, validate it and run duplicate detection.

    Args:
        file_input: path string (as passed by ``gr.File(type="filepath")`` or
            by the sample loader) or a file-like object exposing ``.name``;
            None when nothing was uploaded.
        n_clusters: K-means cluster count forwarded to ``detect_duplicates``.

    Returns:
        (result_df, message) from ``detect_duplicates``, or an empty
        DataFrame plus an explanatory message on any validation/IO failure.
    """
    try:
        if file_input is None:
            return pd.DataFrame(), "Please upload a file or load sample data"

        # gr.File(type="filepath") passes a plain path string; older Gradio
        # versions pass a tempfile wrapper with a .name attribute. Calling
        # .name on a str raised AttributeError and broke the sample flow.
        path = file_input if isinstance(file_input, str) else file_input.name

        # Dispatch on extension; anything else is rejected.
        if path.endswith('.csv'):
            df = pd.read_csv(path)
        elif path.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(path)
        else:
            return pd.DataFrame(), "Please upload a CSV or Excel file"

        # All five columns are required by the feature pipeline.
        required_columns = ['Vendor', 'Amount', 'Date', 'Invoice', 'Description']
        missing_columns = [col for col in required_columns if col not in df.columns]

        if missing_columns:
            return pd.DataFrame(), f"Missing required columns: {', '.join(missing_columns)}"

        # Perform duplicate detection and pass its (df, message) through.
        result_df, message = detect_duplicates(df, n_clusters)

        return result_df, message

    except Exception as e:
        return pd.DataFrame(), f"Error processing file: {str(e)}"
167
+
168
def load_sample():
    """Write a fresh sample dataset to 'sample_data.csv' and return its path."""
    path = "sample_data.csv"
    # Persist to disk so the Gradio file widget can point at a real file.
    generate_sample_data().to_csv(path, index=False)
    return path
174
+
175
# Create Gradio interface.
# Two-column layout: left column handles upload, sample loading and the
# cluster-count parameter; right column shows status and the results table.
with gr.Blocks(theme=gr.themes.Soft(), title="Vendor Duplicate Analyzer") as app:

    # Header banner (static HTML).
    gr.HTML("""
    <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 10px; margin-bottom: 20px;">
        <h1 style="color: white; margin: 0; font-size: 2.5em;">🔍 Vendor Duplicate Analyzer</h1>
        <p style="color: white; margin: 10px 0 0 0; font-size: 1.2em;">Using K-means Clustering for Duplicate Detection</p>
    </div>
    """)

    with gr.Row():
        with gr.Column(scale=1):
            gr.HTML("<h3>📤 Upload Data</h3>")

            # NOTE(review): type="filepath" means handlers receive a plain
            # path string for this component, not a file object.
            file_input = gr.File(
                label="Upload Payment CSV/Excel",
                file_types=[".csv", ".xlsx", ".xls"],
                type="filepath"
            )

            gr.HTML("""
            <div style="margin: 15px 0; padding: 15px; background-color: #f8f9fa; border-radius: 8px; border-left: 4px solid #17a2b8;">
                <strong>📋 CSV Format Required</strong><br>
                Your CSV must have columns: <strong>Vendor, Amount, Date, Invoice, Description</strong>
            </div>
            """)

            with gr.Row():
                sample_btn = gr.Button("📊 Load Sample Data", variant="secondary")
                analyze_btn = gr.Button("🔍 Analyze with K-means", variant="primary")

            gr.HTML("<h3>⚙️ Parameters</h3>")
            # K for K-means; detect_duplicates additionally caps it at the row count.
            n_clusters = gr.Slider(
                minimum=2,
                maximum=10,
                value=5,
                step=1,
                label="Number of Clusters",
                info="K-means will group similar payments into this many clusters"
            )

            gr.HTML("""
            <div style="margin-top: 20px; padding: 15px; background-color: #e8f4f8; border-radius: 8px;">
                <strong>🤖 How K-means Works Here</strong>
                <ol style="margin: 10px 0 0 20px;">
                    <li>Extracts numerical features from payment records</li>
                    <li>Groups similar payments into clusters</li>
                    <li>Compares payments within each cluster</li>
                    <li>Calculates similarity scores for potential duplicates</li>
                </ol>
            </div>
            """)

        with gr.Column(scale=2):
            gr.HTML("<h3>📊 Results</h3>")

            # Status line updated by both buttons.
            status_message = gr.Textbox(
                label="Analysis Status",
                interactive=False,
                placeholder="Upload data and click 'Analyze' to begin..."
            )

            # Candidate duplicate pairs produced by detect_duplicates.
            results_table = gr.Dataframe(
                label="Potential Duplicate Pairs Found by K-means",
                interactive=False,
                wrap=True,
                column_widths=["10%", "15%", "15%", "10%", "10%", "10%", "10%", "10%", "10%"]
            )

    # Event handlers

    def handle_sample_load():
        # Writes sample_data.csv and points the file widget at it.
        sample_file = load_sample()
        return sample_file, "Sample data loaded successfully! Click 'Analyze' to detect duplicates."

    def handle_analysis(file_input, n_clusters):
        # Returns (message, dataframe) matching the outputs wired below.
        if file_input is None:
            return "Please upload a file first!", pd.DataFrame()

        result_df, message = analyze_payments(file_input, n_clusters)
        return message, result_df

    sample_btn.click(
        fn=handle_sample_load,
        outputs=[file_input, status_message]
    )

    analyze_btn.click(
        fn=handle_analysis,
        inputs=[file_input, n_clusters],
        outputs=[status_message, results_table]
    )

# Launch the app
if __name__ == "__main__":
    # share=True asks Gradio for a public tunnel URL in addition to localhost.
    app.launch(share=True)