# Hugging Face Spaces demo: fraud-detection app (Space status: Paused)
| import gradio as gr | |
| import pandas as pd | |
| import numpy as np | |
| import re | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.preprocessing import LabelEncoder | |
| from fuzzywuzzy import process | |
def load_data():
    """Generate a synthetic 1000-row transaction dataset with rule-based fraud labels."""
    np.random.seed(42)
    city_options = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
    income_options = ['Low', 'Medium', 'High']
    n_rows = 1000
    # NOTE: the draw order below must stay fixed — with the seeded RNG it
    # determines the exact values of every column.
    frame = pd.DataFrame({
        'TransactionID': range(1, n_rows + 1),
        'Amount': np.random.uniform(10, 15000, n_rows).round(2),
        'Type': np.random.choice(['Credit', 'Debit'], n_rows),
        'City': np.random.choice(city_options, n_rows),
        'Age': np.random.randint(18, 70, n_rows),
        'Income': np.random.choice(income_options, n_rows, p=[0.4, 0.4, 0.2]),
    })
    # Label fraud via three hand-crafted risk rules.
    big_spend_low_income = (frame['Amount'] > 5000) & (frame['Income'] == 'Low')
    huge_credit = (frame['Type'] == 'Credit') & (frame['Amount'] > 8000)
    nyc_young_adult = (
        (frame['City'] == 'New York')
        & frame['Age'].between(20, 35)
        & (frame['Amount'] > 6000)
    )
    frame['Fraud'] = (big_spend_low_income | huge_credit | nyc_young_adult).astype(int)
    return frame
# Build the dataset and fit one LabelEncoder per categorical column.  The
# encoders stay at module level so the inference path reuses the exact
# label-to-integer mappings learned here (in a real system they would be
# fit on training data only).
data = load_data()

le_type = LabelEncoder()
le_city = LabelEncoder()
le_income = LabelEncoder()
for column, encoder in (('Type', le_type), ('City', le_city), ('Income', le_income)):
    data[column + '_encoded'] = encoder.fit_transform(data[column])

# Train the fraud classifier on the numeric feature matrix.
features = ['Amount', 'Type_encoded', 'City_encoded', 'Age', 'Income_encoded']
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(data[features], data['Fraud'])
def process_nl_query(query):
    """Parse a free-text transaction description and return a fraud analysis.

    Extracts amount, transaction type, city, age and income level from the
    query, scores the transaction with the module-level RandomForest model,
    and returns a human-readable report string (or an error message).
    """
    try:
        # Extract the dollar amount.  Prefer an explicitly $-prefixed number
        # so other digits in the query (e.g. an age like "26-year-old") are
        # not mistaken for the amount; fall back to the first bare number.
        amount_match = re.search(r'\$(\d+(?:,\d{3})*(?:\.\d{2})?)', query)
        if amount_match is None:
            amount_match = re.search(r'(\d+(?:,\d{3})*(?:\.\d{2})?)', query)
        if amount_match is None:
            return "Error: Could not extract transaction amount."
        amount = float(amount_match.group(1).replace(',', ''))

        # Transaction type defaults to Debit unless "credit" is mentioned.
        trans_type = 'Credit' if 'credit' in query.lower() else 'Debit'

        # Fuzzy-match the city against the known set; below 70 similarity we
        # treat it as unknown rather than guessing.
        cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
        city_match = process.extractOne(query, cities)
        city = city_match[0] if city_match[1] > 70 else 'Unknown'

        # Extract age.  The year/old marker is REQUIRED here: the previous
        # fully-optional suffix made the regex match the first number in the
        # query, which is usually the dollar amount.  When no age is stated,
        # fall back to the dataset median so later comparisons never see None.
        age_match = re.search(r'(\d{1,3})[\s-]*(?:years?|yrs?)[\s-]*old',
                              query, re.IGNORECASE)
        age = int(age_match.group(1)) if age_match else int(data['Age'].median())

        # Income level keyword match (Medium when unspecified).
        income = 'Low' if 'low' in query.lower() else \
                 'High' if 'high' in query.lower() else 'Medium'

        # Encode categoricals; -1 marks labels the encoders never saw.
        city_encoded = le_city.transform([city])[0] if city in le_city.classes_ else -1
        income_encoded = le_income.transform([income])[0] if income in le_income.classes_ else -1

        # Single-row feature frame in the same column order used at fit time.
        # All columns are one-element lists for consistency (Type_encoded was
        # previously a bare scalar).
        input_df = pd.DataFrame({
            'Amount': [amount],
            'Type_encoded': [le_type.transform([trans_type])[0]],
            'City_encoded': [city_encoded],
            'Age': [age],
            'Income_encoded': [income_encoded],
        })

        proba = model.predict_proba(input_df)[0][1]  # P(fraud)
        prediction = model.predict(input_df)[0]
        # Report confidence in the *stated* prediction: previously P(fraud)
        # was shown even when the prediction read "Likely Legitimate".
        confidence = proba if prediction else 1 - proba

        # Explain which hand-crafted risk rules (mirroring load_data) fired.
        explanation = []
        if amount > 5000 and income == 'Low':
            explanation.append("High amount for low income")
        if amount > 8000 and trans_type == 'Credit':
            explanation.append("Unusually large credit transaction")
        if city == 'New York' and 20 <= age <= 35 and amount > 6000:
            explanation.append("Suspicious pattern for young adults in NYC")

        return (
            f"Transaction Details:\n"
            f"- Amount: ${amount:,.2f}\n"
            f"- Type: {trans_type}\n"
            f"- City: {city}\n"
            f"- Age: {age}\n"
            f"- Income Level: {income}\n\n"
            f"Fraud Analysis:\n"
            f"- Prediction: {'Potentially Fraudulent' if prediction else 'Likely Legitimate'}\n"
            f"- Confidence: {confidence*100:.1f}%\n"
            f"- Risk Factors: {', '.join(explanation) if explanation else 'No specific risk factors identified'}"
        )
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing Gradio.
        return f"Error processing query: {str(e)}"
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## Enhanced Fraud Detection System")

    with gr.Tab("Natural Language Query"):
        gr.Markdown("**Example:** 'Check a $6000 credit in New York for a 26-year-old with low income'")
        query_box = gr.Textbox(label="Enter your transaction query:")
        analysis_box = gr.Textbox(label="Fraud Analysis", lines=10)
        gr.Examples(
            examples=[
                "Is a $8000 credit in Chicago for a 45-year-old medium income safe?",
                "Verify a $300 debit in Phoenix for a 60-year-old high income client",
            ],
            inputs=query_box,
        )
        # Pressing Enter in the query box runs the fraud analysis.
        query_box.submit(fn=process_nl_query, inputs=query_box, outputs=analysis_box)

    with gr.Tab("Data Insights"):
        gr.Markdown("### Fraud Pattern Analysis")
        # Summary statistics restricted to the fraudulent rows.
        gr.DataFrame(data[data['Fraud'] == 1].describe())

demo.launch()