# Fraud-Chatbot / app.py
# Author: ahmzakif
# Commit: dcc0a1e (verified), "Update app.py"
"""Gradio interface for Fraud Detection Chatbot.

Wires the project's data processor, Groq LLM client, RAG vector store and
fraud analyzer into a tabbed Gradio app: expert chat, analysis by transaction
ID, analysis of manually entered transactions, and a dataset summary.
"""
import logging
import warnings
import os
# Suppress warnings for cleaner output.
# The filters are installed before the third-party and project imports below,
# so they are already active while those modules are being imported.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', message='.*LangChain.*')
# Disable ChromaDB telemetry to avoid errors.
# NOTE(review): set ahead of the project imports, presumably because chromadb
# reads this variable when it is first imported/initialized - confirm.
os.environ['ANONYMIZED_TELEMETRY'] = 'False'
import gradio as gr
from pathlib import Path
import pandas as pd
from src.data.processor import FraudDataProcessor
from src.llm.groq_client import GroqClient
from src.rag.document_loader import DocumentLoader
from src.rag.vector_store import VectorStore
from src.services.fraud_analyzer import FraudAnalyzer
from src.services.quality_scorer import ResponseQualityScorer
from src.config.config import settings
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Suppress chromadb logging
logging.getLogger('chromadb').setLevel(logging.ERROR)
logging.getLogger('chromadb.telemetry').setLevel(logging.CRITICAL)
# Initialize components globally.
# These four start as None and are assigned by initialize_system(); the
# request handlers test them against None to detect "not initialized yet".
groq_client = None
vector_store = None
fraud_analyzer = None
data_processor = None
# Unlike the placeholders above, the scorer is constructed at import time.
quality_scorer = ResponseQualityScorer()
def initialize_system():
    """Initialize the fraud detection system.

    Populates the module-level ``groq_client``, ``data_processor``,
    ``vector_store`` and ``fraud_analyzer`` objects used by the request
    handlers. RAG setup is best-effort: missing documents or a failing vector
    store are logged as warnings and the analyzer is still created with the
    current ``vector_store`` (``None`` when no store was built).

    The function is idempotent. The UI runs it from ``demo.load``, i.e. on
    every page load, so once a previous call has completed it returns
    immediately instead of rebuilding the clients and re-adding every
    document to the vector store.

    Returns:
        str: Status text for the "System Status" textbox.
    """
    global groq_client, vector_store, fraud_analyzer, data_processor

    success_message = "✅ System initialized successfully!"

    # fraud_analyzer is the last object assigned below, so a non-None value
    # means an earlier call ran to completion.
    if fraud_analyzer is not None:
        return success_message

    logger.info("Initializing Fraud Detection System...")

    # Initialize Groq client
    groq_client = GroqClient()
    logger.info("✓ Groq client initialized")

    # Initialize data processor
    data_processor = FraudDataProcessor()
    logger.info("✓ Data processor initialized")

    # Setup RAG system (any failure here only disables/limits RAG)
    try:
        document_loader = DocumentLoader(
            chunk_size=settings.chunk_size,
            chunk_overlap=settings.chunk_overlap,
        )
        all_documents = []

        # Load PDF documents
        pdf_documents = document_loader.load_pdfs_from_directory(settings.pdf_dir)
        if pdf_documents:
            all_documents.extend(pdf_documents)
            logger.info("✓ Loaded %d PDF documents", len(pdf_documents))
        else:
            logger.warning("⚠ No PDF documents found")

        # Load CSV insights
        csv_path = settings.train_data_path
        if csv_path.exists():
            try:
                csv_documents = document_loader.load_csv_insights(
                    csv_path, sample_size=1_050_000
                )
                all_documents.extend(csv_documents)
                logger.info("✓ Loaded %d CSV insight documents", len(csv_documents))
            except Exception as e:
                logger.warning("⚠ Failed to load CSV insights: %s", e)
        else:
            logger.warning("⚠ CSV file not found: %s", csv_path)

        # Add all documents to vector store
        if all_documents:
            vector_store = VectorStore()
            vector_store.add_documents(all_documents)
            logger.info(
                "✓ RAG system initialized with %d total documents",
                len(all_documents),
            )
        else:
            logger.warning("⚠ No documents loaded for RAG system")
    except Exception as e:
        logger.warning("⚠ RAG setup failed: %s", e)

    # Create fraud analyzer (assigned last; see the guard at the top)
    fraud_analyzer = FraudAnalyzer(
        groq_client=groq_client,
        vector_store=vector_store,
    )
    logger.info("✓ Fraud analyzer initialized")
    return success_message
def analyze_by_transaction_id(transaction_id: int, use_rag: bool):
    """Analyze fraud by transaction ID.

    Args:
        transaction_id: Identifier forwarded to
            ``fraud_analyzer.analyze_transaction``. It comes from a Gradio
            number input, so it may arrive as a float or as ``None`` when the
            field has been cleared.
        use_rag: Forwarded unchanged to the analyzer.

    Returns:
        str: Markdown with the transaction details and the analysis, or a
        message starting with "❌" when the system is not initialized, the ID
        is not a whole number, or the analysis fails.
    """
    if fraud_analyzer is None:
        return "❌ System not initialized. Please wait for initialization to complete."

    # Reject unusable IDs up front so the user sees a clear message rather
    # than a raw int() conversion error.
    try:
        transaction_id = int(transaction_id)
    except (TypeError, ValueError, OverflowError):
        return "❌ Error: Please enter a valid whole-number transaction ID."

    try:
        result = fraud_analyzer.analyze_transaction(
            transaction_id=transaction_id,
            use_rag=use_rag,
        )

        # Format the response
        transaction = result['transaction']
        analysis = result['analysis']

        # Blank lines between entries keep every field in its own Markdown
        # paragraph and stop the '---' rule from turning the line above it
        # into a setext heading.
        sections = [
            "### 📊 Transaction Details",
            f"**Merchant:** {transaction.get('merchant', 'N/A')}",
            f"**Category:** {transaction.get('category', 'N/A')}",
            f"**Amount:** ${transaction.get('amt', 0):.2f}",
            f"**City:** {transaction.get('city', 'N/A')}",
            f"**State:** {transaction.get('state', 'N/A')}",
            "---",
            "### 🔍 Fraud Analysis",
            f"{analysis}",
        ]
        return "\n\n".join(sections) + "\n"
    except Exception as e:
        # logger.exception keeps the traceback for debugging.
        logger.exception("Analysis failed: %s", e)
        return f"❌ Error: {str(e)}"
def analyze_by_manual_data(
    merchant: str, category: str, amount: float, city: str, state: str, use_rag: bool,
    gender: str = None, age: int = None, job: str = None, zip_code: str = None,
    city_pop: int = None, merch_lat: float = None, merch_long: float = None
):
    """Analyze fraud by manual transaction data.

    Args:
        merchant: Merchant name; one leading ``fraud_`` prefix is stripped
            before the name is sent to the analyzer.
        category: Transaction category.
        amount: Transaction amount; must be convertible to ``float``.
        city: City of the transaction.
        state: State of the transaction.
        use_rag: Forwarded unchanged to the analyzer.
        gender, age, job, zip_code, city_pop: Optional details, sent to the
            analyzer only when truthy.
        merch_lat, merch_long: Optional merchant coordinates, sent whenever
            they are not ``None`` (so ``0.0`` is kept).

    Returns:
        str: Markdown echoing the submitted data followed by the analysis, or
        a message starting with "❌" when the system is not initialized, the
        amount is unusable, or the analysis fails.
    """
    if fraud_analyzer is None:
        return "❌ System not initialized. Please wait for initialization to complete."

    # A cleared amount field arrives as None; report that plainly instead of
    # leaking a float() conversion error.
    try:
        amount_value = float(amount)
    except (TypeError, ValueError):
        return "❌ Error: Please enter a valid transaction amount."

    try:
        # Clean merchant name from prefix if present. Only a leading
        # "fraud_" is removed; occurrences elsewhere in the name are kept.
        prefix = 'fraud_'
        if merchant and merchant.startswith(prefix):
            clean_merchant = merchant[len(prefix):]
        else:
            clean_merchant = merchant

        transaction_data = {
            "merchant": clean_merchant,
            "category": category,
            "amt": amount_value,
            "city": city,
            "state": state,
        }

        # Add advanced fields if provided
        optional_fields = {
            "gender": gender,
            "age": age,
            "job": job,
            "zip": zip_code,
            "city_pop": city_pop,
        }
        transaction_data.update(
            {key: value for key, value in optional_fields.items() if value}
        )
        if merch_lat is not None:
            transaction_data["merch_lat"] = merch_lat
        if merch_long is not None:
            transaction_data["merch_long"] = merch_long

        result = fraud_analyzer.analyze_transaction(
            transaction_data=transaction_data,
            use_rag=use_rag,
        )
        analysis = result['analysis']

        # Blank lines keep each field in its own Markdown paragraph.
        header = [
            "### 📊 Transaction Details",
            f"**Merchant:** {merchant}",
            f"**Category:** {category}",
            f"**Amount:** ${amount_value:.2f}",
            f"**City:** {city}",
            f"**State:** {state}",
        ]
        response = "\n\n".join(header) + "\n"

        # Add advanced fields to display if provided
        if gender or age or job:
            response += "\n**Cardholder Info:**\n"
            if gender:
                response += f"- Gender: {gender}\n"
            if age:
                response += f"- Age: {age}\n"
            if job:
                response += f"- Job: {job}\n"
        if zip_code or city_pop:
            response += "\n**Location Details:**\n"
            if zip_code:
                response += f"- ZIP: {zip_code}\n"
            if city_pop:
                response += f"- City Population: {city_pop:,}\n"
        if merch_lat is not None or merch_long is not None:
            response += "\n**Merchant Location:**\n"
            response += f"- Coordinates: ({merch_lat}, {merch_long})\n"

        response += f"\n---\n\n### 🔍 Fraud Analysis\n\n{analysis}\n"
        return response
    except Exception as e:
        # logger.exception keeps the traceback for debugging.
        logger.exception("Analysis failed: %s", e)
        return f"❌ Error: {str(e)}"
def _format_rag_document_stats(metadatas):
    """Render Markdown counts of the RAG documents grouped by metadata type."""
    csv_labels = {
        'fraud_pattern': "Fraud Pattern Analysis",
        'merchant_profile': "Merchant Profiles",
        'location_insight': "Location Insights",
        'statistical_summary': "Statistical Summaries",
    }
    csv_counts = dict.fromkeys(csv_labels, 0)
    pdf_count = 0
    pdf_sources = set()

    for meta in metadatas:
        # A document stored without metadata shows up as None; count it as a
        # generic document instead of aborting the whole summary.
        meta = meta or {}
        doc_type = meta.get('type', 'document')
        if doc_type in csv_counts:
            csv_counts[doc_type] += 1
            continue
        # Everything that is not a CSV-derived insight counts as a PDF document
        pdf_count += 1
        source = str(meta.get('source') or '')
        if source.endswith('.pdf'):
            pdf_sources.add(source)

    text = f"**Total Documents in RAG:** {len(metadatas):,}\n\n"
    if pdf_count > 0:
        text += f"**📄 PDF Research Documents:** {pdf_count:,}\n"
        for pdf in sorted(pdf_sources):
            text += f" - {pdf}\n"
        text += "\n"

    csv_total = sum(csv_counts.values())
    if csv_total > 0:
        text += f"**📊 CSV-Derived Insights:** {csv_total:,}\n"
        for doc_type, label in csv_labels.items():
            if csv_counts[doc_type] > 0:
                text += f" - {label}: {csv_counts[doc_type]}\n"
    return text


def get_dataset_summary():
    """Get dataset summary statistics including RAG documents.

    Returns:
        str: Markdown with the transaction statistics reported by
        ``data_processor.get_transaction_summary()`` and, when a vector store
        exists, a breakdown of the stored RAG documents by type. Returns a
        message starting with "❌" when the system is not initialized or the
        transaction summary cannot be built. A failure while reading the
        vector store only replaces the breakdown with a status line.
    """
    if data_processor is None:
        return "❌ System not initialized."
    try:
        # Get transaction data summary
        summary = data_processor.get_transaction_summary()

        # Blank lines keep each figure in its own Markdown paragraph and stop
        # the '---' rule from turning the line above it into a heading.
        header = [
            "### 📊 Transaction Dataset Summary",
            f"**Total Transactions:** {summary['total_transactions']:,}",
            f"**Fraud Cases:** {summary['fraud_count']:,}",
            f"**Fraud Rate:** {summary['fraud_percentage']:.2f}%",
            f"**Average Amount:** ${summary['average_amount']:.2f}",
            "---",
            "**Top Transaction Categories:**",
        ]
        response = "\n\n".join(header) + "\n"

        # First ten entries, in the mapping's own order
        for category, count in list(summary['categories'].items())[:10]:
            response += f"\n- {category}: {count:,}"

        # Add RAG document summary if available
        if vector_store is not None:
            response += "\n\n---\n\n### 📚 RAG Knowledge Base\n\n"
            try:
                # Only metadata is needed for counting, so skip loading the
                # document texts.
                all_docs = vector_store.vector_store._collection.get(
                    include=["metadatas"]
                )
                metadatas = None
                if all_docs and 'metadatas' in all_docs:
                    metadatas = all_docs['metadatas']

                if metadatas is not None:
                    response += _format_rag_document_stats(metadatas)
                else:
                    response += "**Status:** RAG system initialized but no document metadata available."
            except Exception as e:
                logger.warning("Could not retrieve RAG document stats: %s", e)
                response += "**Status:** RAG system active (document count unavailable)"
        return response
    except Exception as e:
        # logger.exception keeps the traceback for debugging.
        logger.exception("Summary failed: %s", e)
        return f"❌ Error: {str(e)}"
def _format_transaction_context(transaction_id, transaction):
    """Render one transaction record as the Markdown block added to the prompt."""
    if transaction.get('is_fraud', 0) == 1:
        status = '🚨 FRAUD'
    else:
        status = '✅ LEGITIMATE'
    # NOTE(review): this block puts the cardholder name, address, date of
    # birth and full card number into a prompt sent to the LLM client.
    # Confirm that is acceptable for the dataset in use.
    lines = [
        f"\n\n**Transaction ID {transaction_id} Details:**",
        f"- **Transaction Number:** {transaction.get('trans_num', 'N/A')}",
        f"- **Date/Time:** {transaction.get('trans_date_trans_time', 'N/A')}",
        f"- **Merchant:** {transaction.get('merchant', 'N/A')}",
        f"- **Category:** {transaction.get('category', 'N/A')}",
        f"- **Amount:** ${transaction.get('amt', 0):.2f}",
        f"- **Location:** {transaction.get('city', 'N/A')}, {transaction.get('state', 'N/A')}",
        f"- **Merchant Coordinates:** ({transaction.get('merch_lat', 'N/A')}, {transaction.get('merch_long', 'N/A')})",
        "",
        "**Cardholder Information:**",
        f"- **Name:** {transaction.get('first', 'N/A')} {transaction.get('last', 'N/A')}",
        f"- **Gender:** {transaction.get('gender', 'N/A')}",
        f"- **Date of Birth:** {transaction.get('dob', 'N/A')}",
        f"- **Job:** {transaction.get('job', 'N/A')}",
        f"- **Street:** {transaction.get('street', 'N/A')}",
        f"- **City/State/ZIP:** {transaction.get('city', 'N/A')}, {transaction.get('state', 'N/A')} {transaction.get('zip', 'N/A')}",
        f"- **Cardholder Coordinates:** ({transaction.get('lat', 'N/A')}, {transaction.get('long', 'N/A')})",
        f"- **City Population:** {transaction.get('city_pop', 'N/A')}",
        "",
        "**Card Information:**",
        f"- **Card Number:** {transaction.get('cc_num', 'N/A')}",
        "",
        "**Fraud Status:**",
        f"- **Actual Status:** {status}",
    ]
    return "\n".join(lines) + "\n"


def _describe_source(index, metadata):
    """Build the human-readable reference line for one retrieved document."""
    metadata = metadata or {}
    # CSV-derived insight types -> (label, metadata key shown in parentheses)
    csv_kinds = {
        'fraud_pattern': ("Fraud Pattern Analysis", 'category'),
        'statistical_summary': ("Statistical Summary", 'scope'),
        'merchant_profile': ("Merchant Profile", 'merchant'),
        'location_insight': ("Location Analysis", 'state'),
    }
    doc_type = metadata.get('type', 'document')
    if doc_type in csv_kinds:
        label, detail_key = csv_kinds[doc_type]
        return f"Source {index}: CSV Data - {label} ({metadata.get(detail_key, 'N/A')})"

    # PDF document
    source_file = metadata.get('source', 'Unknown')
    page_num = metadata.get('page', 'N/A')
    if page_num != 'N/A':
        return f"Source {index}: {source_file}, Page {page_num}"
    return f"Source {index}: {source_file}"


def chat_with_fraud_expert(message: str, history: list, use_rag: bool):
    """Chat with fraud detection expert.

    Args:
        message: The user's question. When it contains "transaction <n>" or
            "transaction id <n>", that transaction's details are looked up
            and appended to the prompt.
        history: Previous chat messages as ``{"role", "content"}`` dicts;
            ``None`` is treated as empty. The list passed in is not modified.
        use_rag: When true and a vector store exists, the three most similar
            documents are appended to the prompt and listed as references.

    Returns:
        list: A new history list with the user message and the assistant
        reply (or an "❌" error reply) appended, always in role/content dict
        form. A blank message returns the history unchanged.
    """
    # Copy so the caller's list is never mutated.
    history = list(history) if history else []
    user_turn = {"role": "user", "content": message}

    # Nothing to ask: do not spend an LLM call on an empty message.
    if not message or not message.strip():
        return history

    if groq_client is None:
        # Same message format as every other return path of this function.
        return history + [
            user_turn,
            {
                "role": "assistant",
                "content": "❌ System not initialized. Please wait for initialization to complete.",
            },
        ]

    try:
        # Check if message is asking about a specific transaction ID
        import re
        transaction_query = re.search(r'transaction\s+(?:id\s+)?(\d+)', message.lower())
        transaction_context = ""
        if transaction_query and data_processor is not None:
            transaction_id = int(transaction_query.group(1))
            try:
                # NOTE(review): get_dataset_summary() calls this same method
                # without arguments and reads aggregate keys from the result.
                # Confirm that passing an ID returns one transaction record.
                transaction = data_processor.get_transaction_summary(transaction_id)
                transaction_context = _format_transaction_context(transaction_id, transaction)
                logger.info("Found transaction %s for chat query", transaction_id)
            except ValueError as e:
                # Surface the reason to the model instead of the details.
                transaction_context = f"\n\n**Note:** {str(e)}\n"
            except Exception as e:
                logger.warning("Could not fetch transaction %s: %s", transaction_id, e)

        # If RAG is enabled and vector store is available, get relevant context
        context = ""
        source_references = []
        if use_rag and vector_store is not None:
            docs = vector_store.similarity_search(message, k=3)
            if docs:
                context = "\n\nRelevant context from fraud detection documents:\n"
                for i, doc in enumerate(docs, 1):
                    # Add context with source number (first 500 characters)
                    context += f"\n[Source {i}] {doc.page_content[:500]}...\n"
                    source_references.append(_describe_source(i, doc.metadata))

        # Create prompt with transaction data and context
        full_prompt = message
        if transaction_context:
            full_prompt = f"{message}\n{transaction_context}"
        if context:
            full_prompt = f"{full_prompt}\n{context}"

        # Enhanced system message with inline citation instructions
        system_message = "\n".join([
            'You are an expert fraud detection analyst. Help users understand fraud patterns, detection methods, and transaction analysis.',
            'IMPORTANT CITATION RULES:',
            '- When using information from the provided context sources, you MUST add an inline citation immediately after the relevant sentence or paragraph.',
            '- Format citations as: [Source X] where X is the source number from the context.',
            '- Place citations at the end of sentences that use information from that source.',
            '- You can cite multiple sources in one paragraph if needed: [Source 1, Source 2]',
            '- Be specific and reference the data when using information from sources.',
            'TRANSACTION ANALYSIS:',
            '- If transaction details are provided, analyze them thoroughly.',
            '- Note: Ignore "fraud_" prefix in merchant names; it is an artifact of the synthetic dataset and NOT an indicator of fraud.',
            '- Compare transaction characteristics against known fraud patterns.',
            '- Provide a clear fraud risk assessment (Low/Medium/High).',
            '- Explain your reasoning with specific indicators.',
            'Example:',
            '"Online gaming merchants often experience higher fraud rates due to card-not-present transactions. [Source 1] The average fraud rate in this category is 5.2%. [Source 2]"',
            'Provide clear, actionable insights with proper inline citations.',
        ])

        # Get response from LLM
        response = groq_client.invoke(
            prompt=full_prompt,
            system_message=system_message,
        )

        # Score response quality
        score_result = quality_scorer.score_response(
            response=response,
            query=message,
            has_rag=use_rag and vector_store is not None,
            sources=source_references,
        )

        # Add quality score display
        response += quality_scorer.format_score_display(score_result)

        # Add source reference list at the end
        if source_references:
            response += "\n**📚 Source References:**\n"
            for ref in source_references:
                response += f"\n- {ref}"

        # Log quality score
        logger.info(
            "Response quality score: %s/100 (Grade: %s)",
            score_result['overall_score'],
            score_result['grade'],
        )
        return history + [user_turn, {"role": "assistant", "content": response}]
    except Exception as e:
        # logger.exception keeps the traceback for debugging.
        logger.exception("Chat failed: %s", e)
        return history + [
            user_turn,
            {"role": "assistant", "content": f"❌ Error: {str(e)}"},
        ]
# Create Gradio interface
def create_interface():
    """Create the Gradio interface.

    Builds a ``gr.Blocks`` app with a read-only status box and four tabs:
    expert chat, analysis by transaction ID, analysis of a manually entered
    transaction, and a dataset summary. Each tab's button is wired to the
    matching module-level handler, and ``initialize_system`` is registered as
    the page-load callback whose return value fills the status box.

    The optional "advanced" inputs start empty, so a user who never opens
    that section does not silently submit placeholder cardholder or merchant
    data (a preset gender, age, population or 0/0 coordinates).

    Returns:
        The assembled ``gr.Blocks`` instance (not yet launched).
    """
    custom_css = """
        @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
        * {
            font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif !important;
        }
        .gradio-container {
            max-width: 1200px !important;
        }
        h1, h2, h3, h4, h5, h6 {
            font-weight: 600 !important;
        }
        .markdown-text {
            font-size: 15px !important;
            line-height: 1.6 !important;
        }
        button {
            font-weight: 500 !important;
        }
    """

    with gr.Blocks(
        theme=gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="slate",
            font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
        ),
        title="Fraud Detection Chatbot",
        css=custom_css,
    ) as demo:
        gr.Markdown(
            "# 🛡️ Fraud Detection Chatbot\n\n"
            "AI-powered fraud detection system using LangChain, Groq, and RAG (Retrieval Augmented Generation)."
        )

        # System status
        with gr.Row():
            init_status = gr.Textbox(
                label="System Status",
                value="Initializing...",
                interactive=False,
            )

        # Tabs for different functionalities
        with gr.Tabs():
            # Tab 1: Chat with Expert
            with gr.Tab("💬 Chat with Fraud Expert"):
                gr.Markdown(
                    "Ask questions about fraud detection, transaction patterns, or get expert advice."
                )
                with gr.Row():
                    chat_use_rag = gr.Checkbox(
                        label="Use RAG (Enhanced with fraud detection documents + CSV data)",
                        value=True,
                    )
                # NOTE(review): chat_with_fraud_expert returns role/content
                # dicts. Confirm the installed Gradio version's Chatbot
                # accepts that format by default (some versions need
                # type="messages").
                chatbot = gr.Chatbot(
                    label="Fraud Detection Expert",
                    height=500,
                )
                with gr.Row():
                    chat_input = gr.Textbox(
                        label="Your Question",
                        placeholder="Ask about fraud detection, transaction analysis, etc...",
                        scale=4,
                    )
                    chat_submit = gr.Button("Send", variant="primary", scale=1)
                chat_clear = gr.Button("Clear Chat")

                # Chat examples
                gr.Examples(
                    examples=[
                        "What are common indicators of credit card fraud?",
                        "How can I detect unusual transaction patterns?",
                        "What are fraud patterns in grocery transactions?",
                        "Which merchants have high fraud rates?",
                        "What states have elevated fraud activity?",
                    ],
                    inputs=chat_input,
                )

            # Tab 2: Analyze by Transaction ID
            with gr.Tab("🔍 Analyze by Transaction ID"):
                gr.Markdown(
                    "Analyze a specific transaction from the dataset by its ID."
                )
                txn_id_input = gr.Number(
                    label="Transaction ID",
                    value=0,
                    precision=0,
                )
                txn_id_use_rag = gr.Checkbox(
                    label="Use RAG (Enhanced analysis)",
                    value=True,
                )
                txn_id_submit = gr.Button("Analyze Transaction", variant="primary")
                txn_id_output = gr.Markdown(label="Analysis Result")

            # Tab 3: Analyze Manual Transaction
            with gr.Tab("✍️ Analyze Manual Transaction"):
                gr.Markdown(
                    "Enter transaction details manually for fraud analysis."
                )

                # Basic Fields
                gr.Markdown("### Basic Transaction Information")
                manual_merchant = gr.Textbox(
                    label="Merchant Name",
                    placeholder="e.g., Amazon, Walmart",
                )
                manual_category = gr.Dropdown(
                    label="Category",
                    choices=[
                        "grocery_pos", "gas_transport", "misc_net",
                        "shopping_net", "shopping_pos", "entertainment",
                        "food_dining", "personal_care", "health_fitness",
                        "travel", "kids_pets", "home",
                    ],
                    value="grocery_pos",
                )
                manual_amount = gr.Number(
                    label="Amount ($)",
                    value=100.0,
                )
                manual_city = gr.Textbox(
                    label="City",
                    placeholder="e.g., Jakarta",
                )
                manual_state = gr.Textbox(
                    label="State",
                    placeholder="e.g., DKI",
                )

                # Advanced Fields (Accordion). Every input here starts empty:
                # the handler treats None/empty as "not provided", so only
                # values the user actually enters reach the analysis.
                with gr.Accordion("🔧 Advanced Fields (Optional)", open=False):
                    gr.Markdown("*Provide additional details for more accurate fraud analysis*")
                    with gr.Row():
                        manual_gender = gr.Radio(
                            label="Cardholder Gender",
                            choices=["M", "F"],
                            value=None,
                        )
                        manual_age = gr.Number(
                            label="Cardholder Age",
                            value=None,
                            precision=0,
                        )
                        manual_job = gr.Textbox(
                            label="Cardholder Job",
                            placeholder="e.g., Engineer, Teacher",
                        )
                    with gr.Row():
                        manual_zip = gr.Textbox(
                            label="ZIP Code",
                            placeholder="e.g., 12345",
                        )
                        manual_city_pop = gr.Number(
                            label="City Population",
                            value=None,
                            precision=0,
                        )
                    with gr.Row():
                        manual_merch_lat = gr.Number(
                            label="Merchant Latitude",
                            value=None,
                        )
                        manual_merch_long = gr.Number(
                            label="Merchant Longitude",
                            value=None,
                        )

                manual_use_rag = gr.Checkbox(
                    label="Use RAG (Enhanced analysis)",
                    value=True,
                )
                manual_submit = gr.Button("Analyze Transaction", variant="primary")
                manual_output = gr.Markdown(label="Analysis Result")

            # Tab 4: Dataset Summary
            with gr.Tab("📊 Dataset Summary"):
                gr.Markdown(
                    "View statistics and insights from the fraud detection dataset."
                )
                summary_button = gr.Button("Get Dataset Summary", variant="primary")
                summary_output = gr.Markdown(label="Summary")

        # Event handlers
        def chat_fn(message, history, use_rag):
            """Forward a chat turn to the module-level chat handler."""
            return chat_with_fraud_expert(message, history, use_rag)

        # The Send button and pressing Enter in the textbox behave the same:
        # run the chat handler, then clear the input box.
        chat_submit.click(
            fn=chat_fn,
            inputs=[chat_input, chatbot, chat_use_rag],
            outputs=chatbot,
        ).then(
            lambda: "",
            outputs=chat_input,
        )
        chat_input.submit(
            fn=chat_fn,
            inputs=[chat_input, chatbot, chat_use_rag],
            outputs=chatbot,
        ).then(
            lambda: "",
            outputs=chat_input,
        )
        chat_clear.click(
            lambda: [],
            outputs=chatbot,
        )

        txn_id_submit.click(
            fn=analyze_by_transaction_id,
            inputs=[txn_id_input, txn_id_use_rag],
            outputs=txn_id_output,
        )

        # Input order must match analyze_by_manual_data's parameter order.
        manual_submit.click(
            fn=analyze_by_manual_data,
            inputs=[
                manual_merchant,
                manual_category,
                manual_amount,
                manual_city,
                manual_state,
                manual_use_rag,
                manual_gender,
                manual_age,
                manual_job,
                manual_zip,
                manual_city_pop,
                manual_merch_lat,
                manual_merch_long,
            ],
            outputs=manual_output,
        )

        summary_button.click(
            fn=get_dataset_summary,
            outputs=summary_output,
        )

        # Initialize system on load
        demo.load(
            fn=initialize_system,
            outputs=init_status,
        )

    return demo
if __name__ == "__main__":
    # Script entry point: assemble the UI, then serve it on all network
    # interfaces at port 7860 with the public share link turned off.
    demo = create_interface()
    demo.launch(share=False, server_port=7860, server_name="0.0.0.0")