Spaces:
Build error
Build error
| import gradio as gr | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.cluster import KMeans | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import google.generativeai as genai | |
| import os | |
| import re | |
| from datetime import datetime, timedelta | |
| import io | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
| # Configure Gemini API | |
| def setup_gemini(): | |
| """Setup Gemini API with API key from environment or Hugging Face secrets""" | |
| try: | |
| # Try to get from Hugging Face secrets first | |
| from huggingface_hub import HfApi | |
| api_key = os.getenv('GEMINI_API_KEY') | |
| if not api_key: | |
| # If running on HF Spaces, try to get from secrets | |
| try: | |
| import spaces | |
| api_key = spaces.config.GEMINI_API_KEY | |
| except: | |
| pass | |
| if api_key: | |
| genai.configure(api_key=api_key) | |
| return True | |
| else: | |
| print("Warning: GEMINI_API_KEY not found. LLM analysis will be limited.") | |
| return False | |
| except Exception as e: | |
| print(f"Error setting up Gemini: {e}") | |
| return False | |
| # Sample data generator | |
| def generate_sample_data(): | |
| """Generate sample payment data with potential duplicates""" | |
| vendors = [ | |
| "ABC Corp", "ABC Corporation", "XYZ Ltd", "XYZ Limited", | |
| "Tech Solutions Inc", "Tech Sol Inc", "Global Services", | |
| "International Trade Co", "Int'l Trade Company", "Quick Fix LLC", | |
| "QuickFix Limited", "Alpha Systems", "Beta Enterprises", | |
| "Gamma Holdings", "Delta Corp", "Epsilon Ltd" | |
| ] | |
| amounts = [1000, 1500, 2000, 2500, 3000, 5000, 7500, 10000] | |
| data = [] | |
| for i in range(50): | |
| # Create some intentional duplicates | |
| if i % 10 == 0 and i > 0: # Every 10th record, create a potential duplicate | |
| prev_record = data[i-5] | |
| vendor = prev_record['vendor_name'] | |
| amount = prev_record['amount'] + np.random.uniform(-50, 50) # Slight variation | |
| date = pd.to_datetime(prev_record['payment_date']) + timedelta(days=np.random.randint(1, 5)) | |
| else: | |
| vendor = np.random.choice(vendors) | |
| amount = np.random.choice(amounts) + np.random.uniform(-100, 100) | |
| date = datetime.now() - timedelta(days=np.random.randint(1, 365)) | |
| data.append({ | |
| 'payment_id': f"PAY_{i+1:04d}", | |
| 'vendor_name': vendor, | |
| 'amount': round(amount, 2), | |
| 'payment_date': date.strftime('%Y-%m-%d'), | |
| 'invoice_number': f"INV_{np.random.randint(1000, 9999)}", | |
| 'description': f"Payment for services - {np.random.choice(['Consulting', 'Software', 'Hardware', 'Maintenance'])}", | |
| 'payment_method': np.random.choice(['Wire Transfer', 'Check', 'ACH', 'Credit Card']) | |
| }) | |
| return pd.DataFrame(data) | |
| class VendorDuplicateAnalyzer: | |
| def __init__(self): | |
| self.scaler = StandardScaler() | |
| self.vectorizer = TfidfVectorizer(stop_words='english', max_features=100) | |
| self.gemini_available = setup_gemini() | |
| def preprocess_data(self, df): | |
| """Preprocess the data for analysis""" | |
| # Clean vendor names | |
| df['vendor_clean'] = df['vendor_name'].str.lower().str.strip() | |
| df['vendor_clean'] = df['vendor_clean'].str.replace(r'[^\w\s]', '', regex=True) | |
| df['vendor_clean'] = df['vendor_clean'].str.replace(r'\s+', ' ', regex=True) | |
| # Convert amount to numeric | |
| df['amount'] = pd.to_numeric(df['amount'], errors='coerce') | |
| # Convert date | |
| df['payment_date'] = pd.to_datetime(df['payment_date'], errors='coerce') | |
| return df | |
| def extract_features(self, df): | |
| """Extract features for clustering""" | |
| features = [] | |
| # Vendor name similarity features using TF-IDF | |
| vendor_tfidf = self.vectorizer.fit_transform(df['vendor_clean'].fillna('')) | |
| # Amount features (normalized) | |
| amount_features = self.scaler.fit_transform(df[['amount']].fillna(0)) | |
| # Date features (days since earliest date) | |
| min_date = df['payment_date'].min() | |
| date_features = (df['payment_date'] - min_date).dt.days.fillna(0).values.reshape(-1, 1) | |
| date_features = self.scaler.fit_transform(date_features) | |
| # Combine features | |
| features = np.hstack([ | |
| vendor_tfidf.toarray(), | |
| amount_features, | |
| date_features | |
| ]) | |
| return features | |
| def find_duplicates_kmeans(self, df, n_clusters=5): | |
| """Find potential duplicates using K-means clustering""" | |
| df_clean = self.preprocess_data(df.copy()) | |
| features = self.extract_features(df_clean) | |
| # Apply K-means clustering | |
| kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10) | |
| clusters = kmeans.fit_predict(features) | |
| df_clean['cluster'] = clusters | |
| # Find potential duplicates within clusters | |
| duplicate_pairs = [] | |
| for cluster_id in range(n_clusters): | |
| cluster_data = df_clean[df_clean['cluster'] == cluster_id] | |
| if len(cluster_data) > 1: | |
| # Calculate pairwise similarities within cluster | |
| cluster_indices = cluster_data.index.tolist() | |
| for i in range(len(cluster_indices)): | |
| for j in range(i + 1, len(cluster_indices)): | |
| idx1, idx2 = cluster_indices[i], cluster_indices[j] | |
| # Calculate similarity score | |
| similarity_score = self.calculate_similarity( | |
| df_clean.loc[idx1], df_clean.loc[idx2] | |
| ) | |
| if similarity_score > 0.6: # Threshold for potential duplicates | |
| duplicate_pairs.append({ | |
| 'index1': idx1, | |
| 'index2': idx2, | |
| 'similarity_score': similarity_score, | |
| 'cluster': cluster_id | |
| }) | |
| return duplicate_pairs, df_clean | |
| def calculate_similarity(self, row1, row2): | |
| """Calculate similarity between two payment records""" | |
| # Vendor name similarity (Jaccard similarity) | |
| vendor1_words = set(row1['vendor_clean'].split()) | |
| vendor2_words = set(row2['vendor_clean'].split()) | |
| if len(vendor1_words) == 0 and len(vendor2_words) == 0: | |
| vendor_sim = 1.0 | |
| elif len(vendor1_words) == 0 or len(vendor2_words) == 0: | |
| vendor_sim = 0.0 | |
| else: | |
| vendor_sim = len(vendor1_words & vendor2_words) / len(vendor1_words | vendor2_words) | |
| # Amount similarity (using relative difference) | |
| amount_diff = abs(row1['amount'] - row2['amount']) / max(row1['amount'], row2['amount'], 1e-6) | |
| amount_sim = max(0, 1 - amount_diff) | |
| # Date similarity (within 30 days gets high similarity) | |
| date_diff = abs((row1['payment_date'] - row2['payment_date']).days) | |
| date_sim = max(0, 1 - date_diff / 30) | |
| # Combined similarity (weighted average) | |
| total_sim = (vendor_sim * 0.5 + amount_sim * 0.3 + date_sim * 0.2) | |
| return total_sim | |
| def analyze_with_llm(self, duplicate_pairs, df): | |
| """Analyze duplicates using Gemini LLM""" | |
| if not self.gemini_available: | |
| return "LLM analysis not available. Please set GEMINI_API_KEY." | |
| try: | |
| model = genai.GenerativeModel('gemini-pro') | |
| analysis_text = "Analyze these potential duplicate payments:\n\n" | |
| for i, pair in enumerate(duplicate_pairs[:5]): # Limit to first 5 pairs | |
| row1 = df.iloc[pair['index1']] | |
| row2 = df.iloc[pair['index2']] | |
| analysis_text += f"Potential Duplicate Pair {i+1}:\n" | |
| analysis_text += f"Payment 1: {row1['vendor_name']} - ${row1['amount']} on {row1['payment_date']}\n" | |
| analysis_text += f"Payment 2: {row2['vendor_name']} - ${row2['amount']} on {row2['payment_date']}\n" | |
| analysis_text += f"Similarity Score: {pair['similarity_score']:.3f}\n\n" | |
| prompt = f""" | |
| {analysis_text} | |
| Please analyze these potential duplicate payments and provide: | |
| 1. Assessment of whether each pair is likely a true duplicate | |
| 2. Risk level (High/Medium/Low) for each pair | |
| 3. Recommended actions for investigation | |
| 4. Overall summary of findings | |
| Focus on practical business insights for payment processing teams. | |
| """ | |
| response = model.generate_content(prompt) | |
| return response.text | |
| except Exception as e: | |
| return f"Error in LLM analysis: {str(e)}" | |
| # Initialize analyzer | |
| analyzer = VendorDuplicateAnalyzer() | |
| def analyze_file(file, n_clusters): | |
| """Main analysis function""" | |
| try: | |
| if file is None: | |
| return "Please upload a file first.", None, "" | |
| # Read file | |
| if file.name.endswith('.csv'): | |
| df = pd.read_csv(file.name) | |
| elif file.name.endswith(('.xlsx', '.xls')): | |
| df = pd.read_excel(file.name) | |
| else: | |
| return "Unsupported file format. Please upload CSV or Excel file.", None, "" | |
| # Validate required columns | |
| required_columns = ['vendor_name', 'amount', 'payment_date'] | |
| missing_columns = [col for col in required_columns if col not in df.columns] | |
| if missing_columns: | |
| return f"Missing required columns: {missing_columns}. Required: {required_columns}", None, "" | |
| # Find duplicates | |
| duplicate_pairs, df_processed = analyzer.find_duplicates_kmeans(df, n_clusters) | |
| if not duplicate_pairs: | |
| return "No potential duplicates found.", None, "No duplicates detected in the uploaded data." | |
| # Create results DataFrame | |
| results = [] | |
| for pair in duplicate_pairs: | |
| row1 = df.iloc[pair['index1']] | |
| row2 = df.iloc[pair['index2']] | |
| results.append({ | |
| 'Pair_ID': len(results) + 1, | |
| 'Vendor_1': row1['vendor_name'], | |
| 'Amount_1': row1['amount'], | |
| 'Date_1': row1['payment_date'], | |
| 'Vendor_2': row2['vendor_name'], | |
| 'Amount_2': row2['amount'], | |
| 'Date_2': row2['payment_date'], | |
| 'Similarity_Score': f"{pair['similarity_score']:.3f}", | |
| 'Cluster': pair['cluster'] | |
| }) | |
| results_df = pd.DataFrame(results) | |
| # LLM Analysis | |
| llm_analysis = analyzer.analyze_with_llm(duplicate_pairs, df) | |
| status_message = f"Analysis complete! Found {len(duplicate_pairs)} potential duplicate pairs." | |
| return status_message, results_df, llm_analysis | |
| except Exception as e: | |
| return f"Error analyzing file: {str(e)}", None, "" | |
| def load_sample_data(): | |
| """Load sample data""" | |
| sample_df = generate_sample_data() | |
| return sample_df.to_csv(index=False), "Sample data loaded successfully!" | |
| # Create Gradio interface | |
| def create_interface(): | |
| with gr.Blocks(title="Vendor Duplicate Payment Analyzer", theme=gr.themes.Soft()) as demo: | |
| gr.HTML(""" | |
| <div style="text-align: center; padding: 20px; background: linear-gradient(45deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; margin-bottom: 20px;"> | |
| <h1>π Vendor Duplicate Payment Analyzer</h1> | |
| <p>Using K-means Clustering and AI for Duplicate Detection</p> | |
| </div> | |
| """) | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| gr.HTML("<h3>π Upload Data</h3>") | |
| file_input = gr.File( | |
| label="Upload Payment CSV/Excel", | |
| file_types=[".csv", ".xlsx", ".xls"], | |
| type="filepath" | |
| ) | |
| with gr.Row(): | |
| sample_btn = gr.Button("π Load Sample Data", variant="secondary") | |
| sample_output = gr.File(label="Sample Data", visible=False) | |
| gr.HTML("<h3>βοΈ Parameters</h3>") | |
| n_clusters = gr.Slider( | |
| minimum=2, | |
| maximum=10, | |
| value=5, | |
| step=1, | |
| label="Number of Clusters", | |
| info="K-means will group similar payments into this many clusters" | |
| ) | |
| analyze_btn = gr.Button("π Analyze with K-means", variant="primary", size="lg") | |
| with gr.Column(scale=2): | |
| gr.HTML("<h3>π Results</h3>") | |
| status_output = gr.Textbox( | |
| label="Analysis Status", | |
| value="Upload data and click 'Analyze' to begin...", | |
| interactive=False | |
| ) | |
| results_output = gr.Dataframe( | |
| label="Potential Duplicate Pairs Found by K-means", | |
| headers=["Pair_ID", "Vendor_1", "Amount_1", "Date_1", "Vendor_2", "Amount_2", "Date_2", "Similarity_Score", "Cluster"], | |
| interactive=False | |
| ) | |
| gr.HTML("<h3>π€ AI Analysis</h3>") | |
| llm_output = gr.Textbox( | |
| label="Gemini AI Analysis", | |
| lines=10, | |
| placeholder="AI analysis will appear here after processing...", | |
| interactive=False | |
| ) | |
| # Event handlers | |
| sample_btn.click( | |
| fn=load_sample_data, | |
| outputs=[sample_output, status_output] | |
| ) | |
| analyze_btn.click( | |
| fn=analyze_file, | |
| inputs=[file_input, n_clusters], | |
| outputs=[status_output, results_output, llm_output] | |
| ) | |
| # Instructions | |
| gr.HTML(""" | |
| <div style="margin-top: 20px; padding: 15px; background-color: #f8f9fa; border-radius: 8px;"> | |
| <h4>π Instructions:</h4> | |
| <ol> | |
| <li><strong>Upload Data:</strong> CSV/Excel file with columns: vendor_name, amount, payment_date</li> | |
| <li><strong>Set Parameters:</strong> Adjust number of clusters (more clusters = more granular grouping)</li> | |
| <li><strong>Analyze:</strong> Click analyze to run K-means clustering and AI analysis</li> | |
| <li><strong>Review Results:</strong> Check potential duplicates and AI recommendations</li> | |
| </ol> | |
| <h4>π§ Setup for Hugging Face Spaces:</h4> | |
| <p>Set your <code>GEMINI_API_KEY</code> in the Hugging Face Spaces secrets for AI analysis.</p> | |
| </div> | |
| """) | |
| return demo | |
| # Launch the app | |
| if __name__ == "__main__": | |
| demo = create_interface() | |
| demo.launch(share=True) |