Spaces:

ahmzakif
/

Fraud-Chatbot

Sleeping

App Files Files Community

Fraud-Chatbot / app.py

ahmzakif

Update app.py

dcc0a1e verified about 2 months ago

raw

history blame contribute delete

29.7 kB

	"""Gradio interface for Fraud Detection Chatbot."""

	import logging
	import warnings
	import os

	# Suppress warnings for cleaner output
	warnings.filterwarnings('ignore', category=FutureWarning)
	warnings.filterwarnings('ignore', category=DeprecationWarning)
	warnings.filterwarnings('ignore', category=UserWarning)
	# `message` is a regular expression that must match from the START of the
	# warning text, so the surrounding `.*` is needed to catch "LangChain"
	# wherever it appears in the message.
	warnings.filterwarnings('ignore', message='.*LangChain.*')

	# Disable ChromaDB telemetry to avoid errors.
	# NOTE(review): kept above the project imports below; presumably the value
	# has to be in the environment before chromadb is loaded - confirm.
	os.environ['ANONYMIZED_TELEMETRY'] = 'False'

	import gradio as gr
	from pathlib import Path
	import pandas as pd

	from src.data.processor import FraudDataProcessor
	from src.llm.groq_client import GroqClient
	from src.rag.document_loader import DocumentLoader
	from src.rag.vector_store import VectorStore
	from src.services.fraud_analyzer import FraudAnalyzer
	from src.services.quality_scorer import ResponseQualityScorer
	from src.config.config import settings

	# Root logging at INFO, plus this module's own logger.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Raise the threshold of the chromadb loggers so they stay quiet.
	for _logger_name, _threshold in (
	    ('chromadb', logging.ERROR),
	    ('chromadb.telemetry', logging.CRITICAL),
	):
	    logging.getLogger(_logger_name).setLevel(_threshold)

	# Shared components. The first four stay None until initialize_system()
	# fills them in; the quality scorer is created right away.
	groq_client = vector_store = fraud_analyzer = data_processor = None
	quality_scorer = ResponseQualityScorer()


	def initialize_system():
	    """Initialize the fraud detection system.

	    Creates the Groq client and the data processor, tries to build the RAG
	    vector store from PDF documents and CSV insights, and finally creates the
	    fraud analyzer. All components are stored in module-level globals.

	    RAG setup is best-effort: any failure there is logged as a warning and the
	    analyzer is created with whatever ``vector_store`` holds at that point
	    (still ``None`` when no store was created).

	    The function is registered with ``demo.load`` and therefore runs again on
	    every page load. Once a previous call has completed it returns immediately
	    instead of rebuilding every component and re-adding the same documents to
	    the vector store.

	    Returns:
	        str: Status text for the "System Status" textbox.
	    """
	    global groq_client, vector_store, fraud_analyzer, data_processor

	    # fraud_analyzer is assigned last, so a non-None value means an earlier
	    # call ran to completion. A call that failed part-way is retried.
	    if fraud_analyzer is not None:
	        logger.debug("Fraud Detection System already initialized; skipping")
	        return "✅ System initialized successfully!"

	    logger.info("Initializing Fraud Detection System...")

	    # Initialize Groq client
	    groq_client = GroqClient()
	    logger.info("✓ Groq client initialized")

	    # Initialize data processor
	    data_processor = FraudDataProcessor()
	    logger.info("✓ Data processor initialized")

	    # Setup RAG system (best-effort: the app still works without it)
	    try:
	        document_loader = DocumentLoader(
	            chunk_size=settings.chunk_size,
	            chunk_overlap=settings.chunk_overlap,
	        )

	        all_documents = []

	        # Load PDF documents
	        pdf_documents = document_loader.load_pdfs_from_directory(settings.pdf_dir)
	        if pdf_documents:
	            all_documents.extend(pdf_documents)
	            logger.info(f"✓ Loaded {len(pdf_documents)} PDF documents")
	        else:
	            logger.warning("⚠ No PDF documents found")

	        # Load CSV insights; a failure here must not discard the PDFs.
	        csv_path = settings.train_data_path
	        if csv_path.exists():
	            try:
	                csv_documents = document_loader.load_csv_insights(csv_path, sample_size=1050000)
	                all_documents.extend(csv_documents)
	                logger.info(f"✓ Loaded {len(csv_documents)} CSV insight documents")
	            except Exception as e:
	                logger.warning(f"⚠ Failed to load CSV insights: {e}")
	        else:
	            logger.warning(f"⚠ CSV file not found: {csv_path}")

	        # Add all documents to vector store
	        if all_documents:
	            vector_store = VectorStore()
	            vector_store.add_documents(all_documents)
	            logger.info(f"✓ RAG system initialized with {len(all_documents)} total documents")
	        else:
	            logger.warning("⚠ No documents loaded for RAG system")

	    except Exception as e:
	        logger.warning(f"⚠ RAG setup failed: {e}")

	    # Create fraud analyzer (vector_store may be None when RAG is unavailable)
	    fraud_analyzer = FraudAnalyzer(
	        groq_client=groq_client,
	        vector_store=vector_store,
	    )
	    logger.info("✓ Fraud analyzer initialized")

	    return "✅ System initialized successfully!"


	def analyze_by_transaction_id(transaction_id: int, use_rag: bool):
	    """Run the fraud analyzer on a dataset transaction and render Markdown.

	    Args:
	        transaction_id: Identifier of the transaction; coerced with ``int()``.
	        use_rag: Passed through to ``fraud_analyzer.analyze_transaction``.

	    Returns:
	        str: Markdown with the transaction details and the analysis, or a
	        message starting with "❌" when the system is not initialized or the
	        analysis raised.
	    """
	    if fraud_analyzer is None:
	        return "❌ System not initialized. Please wait for initialization to complete."

	    try:
	        numeric_id = int(transaction_id)
	        outcome = fraud_analyzer.analyze_transaction(
	            transaction_id=numeric_id,
	            use_rag=use_rag,
	        )

	        # The analyzer result is indexed by 'transaction' and 'analysis'.
	        txn = outcome['transaction']
	        verdict = outcome['analysis']

	        return f"""### 📊 Transaction Details
	Merchant: {txn.get('merchant', 'N/A')}
	Category: {txn.get('category', 'N/A')}
	Amount: ${txn.get('amt', 0):.2f}
	City: {txn.get('city', 'N/A')}
	State: {txn.get('state', 'N/A')}

	---

	### 🔍 Fraud Analysis
	{verdict}
	"""

	    except Exception as err:
	        logger.error(f"Analysis failed: {err}")
	        return f"❌ Error: {str(err)}"


	def analyze_by_manual_data(
	    merchant: str, category: str, amount: float, city: str, state: str, use_rag: bool,
	    gender: str = None, age: int = None, job: str = None, zip_code: str = None,
	    city_pop: int = None, merch_lat: float = None, merch_long: float = None
	):
	    """Analyze fraud by manual transaction data.

	    Builds a transaction dict from the form values, sends it to
	    ``fraud_analyzer.analyze_transaction`` and renders the result as Markdown.

	    Args:
	        merchant: Merchant name. A leading ``fraud_`` prefix is removed before
	            the name is sent to the analyzer; the displayed name is unchanged.
	        category: Transaction category.
	        amount: Transaction amount; coerced with ``float()``.
	        city: City of the transaction.
	        state: State of the transaction.
	        use_rag: Passed through to the analyzer.
	        gender, age, job, zip_code, city_pop: Optional fields, included only
	            when truthy.
	        merch_lat, merch_long: Optional coordinates, included whenever they
	            are not ``None`` (so ``0.0`` is kept).

	    Returns:
	        str: Markdown with the entered details and the analysis, or a message
	        starting with "❌" when the system is not initialized or anything in
	        the analysis raised.
	    """
	    if fraud_analyzer is None:
	        return "❌ System not initialized. Please wait for initialization to complete."

	    try:
	        # Strip only a LEADING 'fraud_' prefix. A plain str.replace would also
	        # eat the token in the middle of a name ("anti_fraud_inc").
	        prefix = 'fraud_'
	        if merchant and merchant.startswith(prefix):
	            clean_merchant = merchant[len(prefix):]
	        else:
	            clean_merchant = merchant

	        amount_value = float(amount)

	        transaction_data = {
	            "merchant": clean_merchant,
	            "category": category,
	            "amt": amount_value,
	            "city": city,
	            "state": state,
	        }

	        # Add advanced fields if provided (falsy values are treated as absent)
	        optional_fields = (
	            ("gender", gender),
	            ("age", age),
	            ("job", job),
	            ("zip", zip_code),
	            ("city_pop", city_pop),
	        )
	        for key, value in optional_fields:
	            if value:
	                transaction_data[key] = value

	        # Coordinates are checked against None because 0.0 is a valid value.
	        if merch_lat is not None:
	            transaction_data["merch_lat"] = merch_lat
	        if merch_long is not None:
	            transaction_data["merch_long"] = merch_long

	        result = fraud_analyzer.analyze_transaction(
	            transaction_data=transaction_data,
	            use_rag=use_rag,
	        )

	        analysis = result['analysis']

	        # Format the validated float so a numeric string such as "12" is
	        # displayed instead of raising after the analysis already ran.
	        response = f"""### 📊 Transaction Details
	Merchant: {merchant}
	Category: {category}
	Amount: ${amount_value:.2f}
	City: {city}
	State: {state}
	"""

	        # Add advanced fields to display if provided
	        if gender or age or job:
	            response += "\nCardholder Info:\n"
	            if gender:
	                response += f"- Gender: {gender}\n"
	            if age:
	                response += f"- Age: {age}\n"
	            if job:
	                response += f"- Job: {job}\n"

	        if zip_code or city_pop:
	            response += "\nLocation Details:\n"
	            if zip_code:
	                response += f"- ZIP: {zip_code}\n"
	            if city_pop:
	                response += f"- City Population: {city_pop:,}\n"

	        if merch_lat is not None or merch_long is not None:
	            response += "\nMerchant Location:\n"
	            response += f"- Coordinates: ({merch_lat}, {merch_long})\n"

	        response += f"""
	---

	### 🔍 Fraud Analysis
	{analysis}
	"""
	        return response

	    except Exception as e:
	        logger.error(f"Analysis failed: {e}")
	        return f"❌ Error: {str(e)}"



	def get_dataset_summary():
	    """Get dataset summary statistics including RAG documents.

	    Renders the values returned by ``data_processor.get_transaction_summary()``
	    (totals, fraud rate, average amount and the first ten categories) and, when
	    a vector store exists, appends a breakdown of the documents it holds.

	    Returns:
	        str: Markdown summary, or a message starting with "❌" when the system
	        is not initialized or the summary could not be produced.
	    """
	    if data_processor is None:
	        return "❌ System not initialized."

	    try:
	        # Get transaction data summary
	        summary = data_processor.get_transaction_summary()

	        response = f"""### 📊 Transaction Dataset Summary

	Total Transactions: {summary['total_transactions']:,}
	Fraud Cases: {summary['fraud_count']:,}
	Fraud Rate: {summary['fraud_percentage']:.2f}%
	Average Amount: ${summary['average_amount']:.2f}

	---

	Top Transaction Categories:
	"""
	        top_categories = list(summary['categories'].items())[:10]
	        response += "".join(
	            f"\n- {category}: {count:,}" for category, count in top_categories
	        )

	        # Add RAG document summary if available
	        if vector_store is not None:
	            response += "\n\n---\n\n### 📚 RAG Knowledge Base\n\n"
	            response += _summarize_rag_documents()

	        return response

	    except Exception as e:
	        logger.error(f"Summary failed: {e}")
	        return f"❌ Error: {str(e)}"


	def _summarize_rag_documents():
	    """Return a Markdown breakdown of the documents stored in the vector store.

	    Documents are grouped by their ``type`` metadata. Anything that is not one
	    of the four CSV-derived types is counted as a PDF document. Failures are
	    logged and reported as a short status line instead of being raised, so the
	    transaction summary is still shown.
	    """
	    from collections import Counter

	    # CSV-derived document types and their display labels, in display order.
	    csv_labels = (
	        ('fraud_pattern', "Fraud Pattern Analysis"),
	        ('merchant_profile', "Merchant Profiles"),
	        ('location_insight', "Location Insights"),
	        ('statistical_summary', "Statistical Summaries"),
	    )
	    csv_types = {doc_type for doc_type, _ in csv_labels}

	    try:
	        # NOTE(review): reaches into a private attribute of the wrapped store.
	        # Only metadata is requested so the document bodies are not fetched
	        # just to be counted.
	        all_docs = vector_store.vector_store._collection.get(include=["metadatas"])

	        metadatas = None
	        if all_docs and 'metadatas' in all_docs:
	            metadatas = all_docs['metadatas']
	        if metadatas is None:
	            return "Status: RAG system initialized but no document metadata available."

	        type_counts = Counter()
	        pdf_count = 0
	        pdf_sources = set()

	        for meta in metadatas:
	            # A document stored without metadata comes back as None; treat it
	            # as an untyped document instead of failing the whole summary.
	            meta = meta or {}
	            doc_type = meta.get('type', 'document')

	            if doc_type in csv_types:
	                type_counts[doc_type] += 1
	            else:
	                # PDF document
	                pdf_count += 1
	                source = str(meta.get('source') or '')
	                if source.endswith('.pdf'):
	                    pdf_sources.add(source)

	        parts = [f"Total Documents in RAG: {len(metadatas):,}\n\n"]

	        if pdf_count > 0:
	            parts.append(f"📄 PDF Research Documents: {pdf_count:,}\n")
	            parts.extend(f" - {pdf}\n" for pdf in sorted(pdf_sources))
	            parts.append("\n")

	        csv_total = sum(type_counts.values())
	        if csv_total > 0:
	            parts.append(f"📊 CSV-Derived Insights: {csv_total:,}\n")
	            parts.extend(
	                f" - {label}: {type_counts[doc_type]}\n"
	                for doc_type, label in csv_labels
	                if type_counts[doc_type] > 0
	            )

	        return "".join(parts)

	    except Exception as e:
	        logger.warning(f"Could not retrieve RAG document stats: {e}")
	        return "Status: RAG system active (document count unavailable)"


	def chat_with_fraud_expert(message: str, history: list, use_rag: bool):
	    """Chat with fraud detection expert.

	    Optionally enriches the user message with (a) the details of a transaction
	    referenced as "transaction <id>" / "transaction id <id>" and (b) up to
	    three snippets from the vector store, sends it to the Groq client, and
	    appends a quality score and a source list to the answer.

	    Args:
	        message: The user's question.
	        history: Previous chat turns as ``{"role", "content"}`` dicts; ``None``
	            is treated as an empty history. The list is not modified.
	        use_rag: Whether to add vector-store context when a store exists.

	    Returns:
	        list: A new history list ending with the user turn and the assistant
	        turn. Every path (success, not-initialized, error) uses the same
	        ``{"role", "content"}`` dict format.
	    """
	    import re

	    # Copy so the caller's list is never mutated, and tolerate history=None.
	    turns = list(history or [])
	    user_turn = {"role": "user", "content": message}

	    if groq_client is None:
	        # Same dict format as the other paths; mixing in [user, bot] pairs
	        # would produce a history with two incompatible shapes.
	        return turns + [
	            user_turn,
	            {"role": "assistant", "content": "❌ System not initialized. Please wait for initialization to complete."},
	        ]

	    try:
	        # Check if message is asking about a specific transaction ID
	        transaction_query = re.search(r'transaction\s+(?:id\s+)?(\d+)', message.lower())
	        transaction_context = ""

	        if transaction_query and data_processor is not None:
	            transaction_id = int(transaction_query.group(1))
	            try:
	                # NOTE(review): called with an ID here but without arguments
	                # elsewhere in this file; presumably it returns one record
	                # when given an ID - confirm against FraudDataProcessor.
	                transaction = data_processor.get_transaction_summary(transaction_id)
	                transaction_context = _format_transaction_context(transaction_id, transaction)
	                logger.info(f"Found transaction {transaction_id} for chat query")
	            except ValueError as e:
	                # Surface the lookup problem to the model as a note.
	                transaction_context = f"\n\nNote: {str(e)}\n"
	            except Exception as e:
	                logger.warning(f"Could not fetch transaction {transaction_id}: {e}")

	        # If RAG is enabled and vector store is available, get relevant context
	        context = ""
	        source_references = []

	        if use_rag and vector_store is not None:
	            docs = vector_store.similarity_search(message, k=3)
	            if docs:
	                context = "\n\nRelevant context from fraud detection documents:\n"
	                for i, doc in enumerate(docs, 1):
	                    # Numbered snippet (first 500 characters) for inline citations
	                    context += f"\n[Source {i}] {doc.page_content[:500]}...\n"
	                    source_references.append(_describe_rag_source(i, doc.metadata))

	        # Create prompt with transaction data and context
	        full_prompt = message
	        if transaction_context:
	            full_prompt = f"{message}\n{transaction_context}"
	        if context:
	            full_prompt = f"{full_prompt}\n{context}"

	        # Enhanced system message with inline citation instructions
	        system_message = """You are an expert fraud detection analyst. Help users understand fraud patterns, detection methods, and transaction analysis.

	IMPORTANT CITATION RULES:
	- When using information from the provided context sources, you MUST add an inline citation immediately after the relevant sentence or paragraph.
	- Format citations as: [Source X] where X is the source number from the context.
	- Place citations at the end of sentences that use information from that source.
	- You can cite multiple sources in one paragraph if needed: [Source 1, Source 2]
	- Be specific and reference the data when using information from sources.

	TRANSACTION ANALYSIS:
	- If transaction details are provided, analyze them thoroughly.
	- Note: Ignore "fraud_" prefix in merchant names; it is an artifact of the synthetic dataset and NOT an indicator of fraud.
	- Compare transaction characteristics against known fraud patterns.
	- Provide a clear fraud risk assessment (Low/Medium/High).
	- Explain your reasoning with specific indicators.

	Example:
	"Online gaming merchants often experience higher fraud rates due to card-not-present transactions. [Source 1] The average fraud rate in this category is 5.2%. [Source 2]"

	Provide clear, actionable insights with proper inline citations."""

	        # Get response from LLM
	        response = groq_client.invoke(
	            prompt=full_prompt,
	            system_message=system_message,
	        )

	        # Score response quality
	        score_result = quality_scorer.score_response(
	            response=response,
	            query=message,
	            has_rag=use_rag and vector_store is not None,
	            sources=source_references,
	        )

	        # Add quality score display
	        response += quality_scorer.format_score_display(score_result)

	        # Add source reference list at the end
	        if source_references:
	            response += "\n📚 Source References:\n"
	            response += "".join(f"\n- {ref}" for ref in source_references)

	        # Log quality score
	        logger.info(f"Response quality score: {score_result['overall_score']}/100 (Grade: {score_result['grade']})")

	        return turns + [user_turn, {"role": "assistant", "content": response}]

	    except Exception as e:
	        logger.error(f"Chat failed: {e}")
	        return turns + [user_turn, {"role": "assistant", "content": f"❌ Error: {str(e)}"}]


	def _format_transaction_context(transaction_id, transaction):
	    """Render one transaction record as the text block appended to the prompt."""
	    # NOTE(review): this forwards cardholder name, date of birth, street
	    # address and the full card number to the LLM provider. The system prompt
	    # calls the dataset synthetic; revisit before using real data.
	    get = transaction.get
	    lines = [
	        f"\n\nTransaction ID {transaction_id} Details:\n",
	        f"- Transaction Number: {get('trans_num', 'N/A')}\n",
	        f"- Date/Time: {get('trans_date_trans_time', 'N/A')}\n",
	        f"- Merchant: {get('merchant', 'N/A')}\n",
	        f"- Category: {get('category', 'N/A')}\n",
	        f"- Amount: ${get('amt', 0):.2f}\n",
	        f"- Location: {get('city', 'N/A')}, {get('state', 'N/A')}\n",
	        f"- Merchant Coordinates: ({get('merch_lat', 'N/A')}, {get('merch_long', 'N/A')})\n",
	        "\nCardholder Information:\n",
	        f"- Name: {get('first', 'N/A')} {get('last', 'N/A')}\n",
	        f"- Gender: {get('gender', 'N/A')}\n",
	        f"- Date of Birth: {get('dob', 'N/A')}\n",
	        f"- Job: {get('job', 'N/A')}\n",
	        f"- Street: {get('street', 'N/A')}\n",
	        f"- City/State/ZIP: {get('city', 'N/A')}, {get('state', 'N/A')} {get('zip', 'N/A')}\n",
	        f"- Cardholder Coordinates: ({get('lat', 'N/A')}, {get('long', 'N/A')})\n",
	        f"- City Population: {get('city_pop', 'N/A')}\n",
	        "\nCard Information:\n",
	        f"- Card Number: {get('cc_num', 'N/A')}\n",
	        "\nFraud Status:\n",
	        f"- Actual Status: {'🚨 FRAUD' if get('is_fraud', 0) == 1 else '✅ LEGITIMATE'}\n",
	    ]
	    return "".join(lines)


	def _describe_rag_source(index, metadata):
	    """Build the human-readable reference line for one retrieved document."""
	    # CSV-derived document types: display label and the metadata key that
	    # holds the detail shown in parentheses.
	    csv_kinds = {
	        'fraud_pattern': ("Fraud Pattern Analysis", 'category'),
	        'statistical_summary': ("Statistical Summary", 'scope'),
	        'merchant_profile': ("Merchant Profile", 'merchant'),
	        'location_insight': ("Location Analysis", 'state'),
	    }
	    doc_type = metadata.get('type', 'document')
	    if doc_type in csv_kinds:
	        label, detail_key = csv_kinds[doc_type]
	        return f"Source {index}: CSV Data - {label} ({metadata.get(detail_key, 'N/A')})"

	    # Anything else is treated as a PDF document.
	    source_file = metadata.get('source', 'Unknown')
	    page_num = metadata.get('page', 'N/A')
	    if page_num != 'N/A':
	        return f"Source {index}: {source_file}, Page {page_num}"
	    return f"Source {index}: {source_file}"



	# Create Gradio interface
	def create_interface():
	    """Build the Gradio Blocks UI and wire up its event handlers.

	    The layout is a read-only status box followed by four tabs: expert chat,
	    analysis by transaction ID, analysis of manually entered data, and a
	    dataset summary. ``initialize_system`` is attached to the page-load event
	    and its return value fills the status box.

	    Returns:
	        The assembled ``gr.Blocks`` application (not yet launched).
	    """
	    app_theme = gr.themes.Soft(
	        primary_hue="blue",
	        secondary_hue="slate",
	        font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
	    )

	    custom_css = """
	@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');

	* {
	font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif !important;
	}

	.gradio-container {
	max-width: 1200px !important;
	}

	h1, h2, h3, h4, h5, h6 {
	font-weight: 600 !important;
	}

	.markdown-text {
	font-size: 15px !important;
	line-height: 1.6 !important;
	}

	button {
	font-weight: 500 !important;
	}
	"""

	    # Static option lists used by the widgets below.
	    example_questions = [
	        "What are common indicators of credit card fraud?",
	        "How can I detect unusual transaction patterns?",
	        "What are fraud patterns in grocery transactions?",
	        "Which merchants have high fraud rates?",
	        "What states have elevated fraud activity?",
	    ]
	    category_choices = [
	        "grocery_pos", "gas_transport", "misc_net",
	        "shopping_net", "shopping_pos", "entertainment",
	        "food_dining", "personal_care", "health_fitness",
	        "travel", "kids_pets", "home"
	    ]

	    with gr.Blocks(theme=app_theme, title="Fraud Detection Chatbot", css=custom_css) as demo:

	        gr.Markdown("""
	# 🛡️ Fraud Detection Chatbot

	AI-powered fraud detection system using LangChain, Groq, and RAG (Retrieval Augmented Generation).
	""")

	        # System status
	        with gr.Row():
	            init_status = gr.Textbox(
	                label="System Status",
	                value="Initializing...",
	                interactive=False,
	            )

	        # Tabs for different functionalities
	        with gr.Tabs():

	            # Tab 1: Chat with Expert
	            with gr.Tab("💬 Chat with Fraud Expert"):
	                gr.Markdown("""
	Ask questions about fraud detection, transaction patterns, or get expert advice.
	""")

	                with gr.Row():
	                    chat_use_rag = gr.Checkbox(
	                        label="Use RAG (Enhanced with fraud detection documents + CSV data)",
	                        value=True,
	                    )

	                chatbot = gr.Chatbot(
	                    label="Fraud Detection Expert",
	                    height=500,
	                )

	                with gr.Row():
	                    chat_input = gr.Textbox(
	                        label="Your Question",
	                        placeholder="Ask about fraud detection, transaction analysis, etc...",
	                        scale=4,
	                    )
	                    chat_submit = gr.Button("Send", variant="primary", scale=1)

	                chat_clear = gr.Button("Clear Chat")

	                # Chat examples
	                gr.Examples(
	                    examples=example_questions,
	                    inputs=chat_input,
	                )

	            # Tab 2: Analyze by Transaction ID
	            with gr.Tab("🔍 Analyze by Transaction ID"):
	                gr.Markdown("""
	Analyze a specific transaction from the dataset by its ID.
	""")

	                txn_id_input = gr.Number(
	                    label="Transaction ID",
	                    value=0,
	                    precision=0,
	                )
	                txn_id_use_rag = gr.Checkbox(
	                    label="Use RAG (Enhanced analysis)",
	                    value=True,
	                )
	                txn_id_submit = gr.Button("Analyze Transaction", variant="primary")

	                txn_id_output = gr.Markdown(label="Analysis Result")

	            # Tab 3: Analyze Manual Transaction
	            with gr.Tab("✍️ Analyze Manual Transaction"):
	                gr.Markdown("""
	Enter transaction details manually for fraud analysis.
	""")

	                # Basic Fields
	                gr.Markdown("### Basic Transaction Information")
	                manual_merchant = gr.Textbox(
	                    label="Merchant Name",
	                    placeholder="e.g., Amazon, Walmart",
	                )
	                manual_category = gr.Dropdown(
	                    label="Category",
	                    choices=category_choices,
	                    value="grocery_pos",
	                )
	                manual_amount = gr.Number(
	                    label="Amount ($)",
	                    value=100.0,
	                )
	                manual_city = gr.Textbox(
	                    label="City",
	                    placeholder="e.g., Jakarta",
	                )
	                manual_state = gr.Textbox(
	                    label="State",
	                    placeholder="e.g., DKI",
	                )

	                # Advanced Fields (Accordion)
	                with gr.Accordion("🔧 Advanced Fields (Optional)", open=False):
	                    gr.Markdown("Provide additional details for more accurate fraud analysis")

	                    with gr.Row():
	                        manual_gender = gr.Radio(
	                            label="Cardholder Gender",
	                            choices=["M", "F"],
	                            value="M",
	                        )
	                        manual_age = gr.Number(
	                            label="Cardholder Age",
	                            value=35,
	                            precision=0,
	                        )

	                    manual_job = gr.Textbox(
	                        label="Cardholder Job",
	                        placeholder="e.g., Engineer, Teacher",
	                    )

	                    with gr.Row():
	                        manual_zip = gr.Textbox(
	                            label="ZIP Code",
	                            placeholder="e.g., 12345",
	                        )
	                        manual_city_pop = gr.Number(
	                            label="City Population",
	                            value=100000,
	                            precision=0,
	                        )

	                    with gr.Row():
	                        manual_merch_lat = gr.Number(
	                            label="Merchant Latitude",
	                            value=0.0,
	                        )
	                        manual_merch_long = gr.Number(
	                            label="Merchant Longitude",
	                            value=0.0,
	                        )

	                manual_use_rag = gr.Checkbox(
	                    label="Use RAG (Enhanced analysis)",
	                    value=True,
	                )
	                manual_submit = gr.Button("Analyze Transaction", variant="primary")

	                manual_output = gr.Markdown(label="Analysis Result")

	            # Tab 4: Dataset Summary
	            with gr.Tab("📊 Dataset Summary"):
	                gr.Markdown("""
	View statistics and insights from the fraud detection dataset.
	""")

	                summary_button = gr.Button("Get Dataset Summary", variant="primary")
	                summary_output = gr.Markdown(label="Summary")

	        # Event handlers
	        def chat_fn(message, history, use_rag):
	            return chat_with_fraud_expert(message, history, use_rag)

	        # The Send button and pressing Enter in the textbox do the same thing:
	        # run the chat, then blank the input box. Registration order is the
	        # button first, then the textbox.
	        for register_listener in (chat_submit.click, chat_input.submit):
	            register_listener(
	                fn=chat_fn,
	                inputs=[chat_input, chatbot, chat_use_rag],
	                outputs=chatbot,
	            ).then(
	                lambda: "",
	                outputs=chat_input,
	            )

	        chat_clear.click(
	            lambda: [],
	            outputs=chatbot,
	        )

	        txn_id_submit.click(
	            fn=analyze_by_transaction_id,
	            inputs=[txn_id_input, txn_id_use_rag],
	            outputs=txn_id_output,
	        )

	        # Input order must follow analyze_by_manual_data's positional parameters.
	        manual_inputs = [
	            manual_merchant,
	            manual_category,
	            manual_amount,
	            manual_city,
	            manual_state,
	            manual_use_rag,
	            manual_gender,
	            manual_age,
	            manual_job,
	            manual_zip,
	            manual_city_pop,
	            manual_merch_lat,
	            manual_merch_long,
	        ]
	        manual_submit.click(
	            fn=analyze_by_manual_data,
	            inputs=manual_inputs,
	            outputs=manual_output,
	        )

	        summary_button.click(
	            fn=get_dataset_summary,
	            outputs=summary_output,
	        )

	        # Initialize system on load
	        demo.load(
	            fn=initialize_system,
	            outputs=init_status,
	        )

	    return demo


	if __name__ == "__main__":
	    # Listen on all interfaces at port 7860, without a public share link.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,
	    }
	    demo = create_interface()
	    demo.launch(**launch_options)