# insightgenai/modules/chat_engine.py
# Uploaded by mohsinbhatti — "Initial commit - InsightGenAI files" (commit e478478)
"""
Chat Engine Module - InsightGenAI
=================================
Natural language interface for data querying.
Converts natural language questions to pandas queries.
Includes fallback to LLM API for complex queries.
Author: InsightGenAI Team
Version: 1.0.0
"""
import pandas as pd
import numpy as np
import re
from typing import Dict, List, Tuple, Optional, Any, Callable
import streamlit as st
import json
import os
class ChatEngine:
"""
Natural language chat interface for data analysis.
Supports:
- Pattern-based query parsing
- Pandas code generation
- LLM API fallback for complex queries
"""
# Query patterns for common data operations
QUERY_PATTERNS = {
# Summary queries
'show_head': {
'patterns': [
r'show (?:me )?(?:the )?(?:first )?(\d+ )?rows?',
r'display (?:the )?(?:first )?(\d+ )?rows?',
r'head (?:of )?(?:the )?data',
r'show (?:me )?the (?:beginning|start)'
],
'handler': '_handle_show_head'
},
'show_tail': {
'patterns': [
r'show (?:me )?(?:the )?last (\d+ )?rows?',
r'display (?:the )?last (\d+ )?rows?',
r'tail (?:of )?(?:the )?data',
r'show (?:me )?the end'
],
'handler': '_handle_show_tail'
},
'show_shape': {
'patterns': [
r'how many rows',
r'how many columns',
r'what is the shape',
r'size of (?:the )?data',
r'dimensions? of (?:the )?data'
],
'handler': '_handle_show_shape'
},
'show_info': {
'patterns': [
r'show (?:me )?info',
r'data types?',
r'column types?',
r'what columns',
r'list columns'
],
'handler': '_handle_show_info'
},
'show_describe': {
'patterns': [
r'describe (?:the )?data',
r'summary statistics?',
r'statistical summary',
r'basic statistics?'
],
'handler': '_handle_show_describe'
},
# Column-specific queries
'column_stats': {
'patterns': [
r'stats (?:for |of )?(?:column )?([\w\s]+)',
r'statistics (?:for |of )?(?:column )?([\w\s]+)',
r'describe (?:column )?([\w\s]+)',
r'info (?:about |on )?(?:column )?([\w\s]+)'
],
'handler': '_handle_column_stats'
},
'column_mean': {
'patterns': [
r'(?:what is |calculate )?(?:the )?mean (?:of |for )?(?:column )?([\w\s]+)',
r'(?:what is |calculate )?(?:the )?average (?:of |for )?(?:column )?([\w\s]+)',
r'average (?:of |for )?([\w\s]+)'
],
'handler': '_handle_column_mean'
},
'column_sum': {
'patterns': [
r'(?:what is |calculate )?(?:the )?sum (?:of |for )?(?:column )?([\w\s]+)',
r'total (?:of |for )?([\w\s]+)',
r'sum (?:of |for )?([\w\s]+)'
],
'handler': '_handle_column_sum'
},
'column_max': {
'patterns': [
r'(?:what is |find )?(?:the )?max(?:imum)? (?:of |for )?(?:column )?([\w\s]+)',
r'highest (?:value (?:in |of )?)?([\w\s]+)',
r'max (?:of |for )?([\w\s]+)'
],
'handler': '_handle_column_max'
},
'column_min': {
'patterns': [
r'(?:what is |find )?(?:the )?min(?:imum)? (?:of |for )?(?:column )?([\w\s]+)',
r'lowest (?:value (?:in |of )?)?([\w\s]+)',
r'min (?:of |for )?([\w\s]+)'
],
'handler': '_handle_column_min'
},
'value_counts': {
'patterns': [
r'value counts? (?:for |of )?(?:column )?([\w\s]+)',
r'unique values? (?:in |of )?([\w\s]+)',
r'how many unique (?:values )?(?:in )?([\w\s]+)',
r'frequency (?:of |for )?([\w\s]+)'
],
'handler': '_handle_value_counts'
},
# Filtering queries
'filter_greater': {
'patterns': [
r'show (?:rows? )?where ([\w\s]+) (?:is )?greater than (\d+\.?\d*)',
r'show (?:rows? )?where ([\w\s]+) (?:is )?more than (\d+\.?\d*)',
r'show (?:rows? )?where ([\w\s]+) > (\d+\.?\d*)',
r'filter ([\w\s]+) > (\d+\.?\d*)'
],
'handler': '_handle_filter_greater'
},
'filter_less': {
'patterns': [
r'show (?:rows? )?where ([\w\s]+) (?:is )?less than (\d+\.?\d*)',
r'show (?:rows? )?where ([\w\s]+) (?:is )?fewer than (\d+\.?\d*)',
r'show (?:rows? )?where ([\w\s]+) < (\d+\.?\d*)',
r'filter ([\w\s]+) < (\d+\.?\d*)'
],
'handler': '_handle_filter_less'
},
'filter_equal': {
'patterns': [
r'show (?:rows? )?where ([\w\s]+) (?:is |equals? )?([\w\s]+)',
r'show (?:rows? )?where ([\w\s]+) = ([\w\s]+)',
r'filter ([\w\s]+) = ([\w\s]+)'
],
'handler': '_handle_filter_equal'
},
'top_n': {
'patterns': [
r'top (\d+) (?:by |sorted by )?([\w\s]+)',
r'show (?:me )?top (\d+)',
r'highest (\d+) (?:by )?([\w\s]+)'
],
'handler': '_handle_top_n'
},
# Grouping queries
'group_by': {
'patterns': [
r'group (?:by )?([\w\s]+) (?:and )?(?:calculate )?(?:the )?(mean|sum|count|avg|average|max|min)?',
r'aggregate (?:by )?([\w\s]+)',
r'([\w\s]+) (?:grouped |aggregation )by ([\w\s]+)'
],
'handler': '_handle_group_by'
},
# Correlation queries
'correlation': {
'patterns': [
r'correlation (?:between )?([\w\s]+) (?:and )?([\w\s]+)',
r'correlate ([\w\s]+) (?:with |and )?([\w\s]+)',
r'how (?:are |is )?([\w\s]+) (?:and )?([\w\s]+) related'
],
'handler': '_handle_correlation'
},
# Missing values
'missing_values': {
'patterns': [
r'missing values?',
r'null values?',
r'how many missing',
r'na values?'
],
'handler': '_handle_missing_values'
},
# Duplicates
'duplicates': {
'patterns': [
r'duplicate rows?',
r'how many duplicates',
r'are there duplicates'
],
'handler': '_handle_duplicates'
}
}
def __init__(self, df: pd.DataFrame, column_types: Optional[Dict[str, str]] = None):
"""
Initialize the Chat Engine.
Args:
df: Dataset to query
column_types: Dictionary of column types
"""
self.df = df.copy()
self.column_types = column_types or {}
self.chat_history: List[Dict[str, str]] = []
self.llm_api_key: Optional[str] = None
self.llm_provider: str = 'openai' # or 'huggingface'
def set_llm_config(self, api_key: str, provider: str = 'openai') -> None:
"""
Configure LLM API for fallback queries.
Args:
api_key: API key for the LLM service
provider: LLM provider ('openai' or 'huggingface')
"""
self.llm_api_key = api_key
self.llm_provider = provider
def process_query(self, query: str) -> Dict[str, Any]:
"""
Process a natural language query.
Args:
query: Natural language query string
Returns:
Dict with response data
"""
query_lower = query.lower().strip()
# Try pattern matching first
for query_type, config in self.QUERY_PATTERNS.items():
for pattern in config['patterns']:
match = re.search(pattern, query_lower)
if match:
handler = getattr(self, config['handler'])
result = handler(match)
# Add to chat history
self.chat_history.append({
'query': query,
'response_type': 'pattern',
'result': result
})
return {
'success': True,
'type': query_type,
'result': result,
'method': 'pattern'
}
# Fallback to LLM if configured
if self.llm_api_key:
return self._query_llm(query)
# No match found
return {
'success': False,
'error': "I couldn't understand that query. Try rephrasing or use simpler terms.",
'suggestions': self._get_suggestions()
}
def _get_suggestions(self) -> List[str]:
"""Get query suggestions for the user."""
return [
"Show me the first 10 rows",
"What is the average of [column_name]?",
"Show rows where [column] > 100",
"Group by [column] and calculate mean",
"What is the correlation between [col1] and [col2]?",
"Show missing values"
]
# Pattern handlers
def _handle_show_head(self, match) -> Dict:
"""Handle show head query."""
n = int(match.group(1)) if match.group(1) else 5
return {
'data': self.df.head(n),
'message': f"Showing first {min(n, len(self.df))} rows"
}
def _handle_show_tail(self, match) -> Dict:
"""Handle show tail query."""
n = int(match.group(1)) if match.group(1) else 5
return {
'data': self.df.tail(n),
'message': f"Showing last {min(n, len(self.df))} rows"
}
def _handle_show_shape(self, match) -> Dict:
"""Handle shape query."""
rows, cols = self.df.shape
return {
'message': f"The dataset has {rows:,} rows and {cols} columns",
'shape': (rows, cols)
}
def _handle_show_info(self, match) -> Dict:
"""Handle info query."""
info_df = pd.DataFrame({
'Column': self.df.columns,
'Type': self.df.dtypes.values,
'Non-Null Count': self.df.count().values,
'Null Count': self.df.isnull().sum().values
})
return {
'data': info_df,
'message': f"Dataset has {len(self.df.columns)} columns"
}
def _handle_show_describe(self, match) -> Dict:
"""Handle describe query."""
return {
'data': self.df.describe(),
'message': "Statistical summary of numeric columns"
}
def _handle_column_stats(self, match) -> Dict:
"""Handle column stats query."""
col = match.group(1).strip()
# Find closest column name
col = self._find_column(col)
if col and col in self.df.columns:
stats = self.df[col].describe()
return {
'data': stats,
'message': f"Statistics for column '{col}'"
}
return {'error': f"Column '{col}' not found"}
def _handle_column_mean(self, match) -> Dict:
"""Handle column mean query."""
col = match.group(1).strip()
col = self._find_column(col)
if col and col in self.df.columns:
mean_val = self.df[col].mean()
return {
'message': f"Mean of '{col}': {mean_val:.4f}",
'value': mean_val
}
return {'error': f"Column '{col}' not found"}
def _handle_column_sum(self, match) -> Dict:
"""Handle column sum query."""
col = match.group(1).strip()
col = self._find_column(col)
if col and col in self.df.columns:
sum_val = self.df[col].sum()
return {
'message': f"Sum of '{col}': {sum_val:,.2f}",
'value': sum_val
}
return {'error': f"Column '{col}' not found"}
def _handle_column_max(self, match) -> Dict:
"""Handle column max query."""
col = match.group(1).strip()
col = self._find_column(col)
if col and col in self.df.columns:
max_val = self.df[col].max()
return {
'message': f"Maximum of '{col}': {max_val}",
'value': max_val
}
return {'error': f"Column '{col}' not found"}
def _handle_column_min(self, match) -> Dict:
"""Handle column min query."""
col = match.group(1).strip()
col = self._find_column(col)
if col and col in self.df.columns:
min_val = self.df[col].min()
return {
'message': f"Minimum of '{col}': {min_val}",
'value': min_val
}
return {'error': f"Column '{col}' not found"}
def _handle_value_counts(self, match) -> Dict:
"""Handle value counts query."""
col = match.group(1).strip()
col = self._find_column(col)
if col and col in self.df.columns:
counts = self.df[col].value_counts().head(10)
return {
'data': counts,
'message': f"Top 10 values in '{col}'"
}
return {'error': f"Column '{col}' not found"}
def _handle_filter_greater(self, match) -> Dict:
"""Handle filter greater than query."""
col = match.group(1).strip()
value = float(match.group(2))
col = self._find_column(col)
if col and col in self.df.columns:
filtered = self.df[self.df[col] > value]
return {
'data': filtered.head(20),
'message': f"Found {len(filtered)} rows where '{col}' > {value}"
}
return {'error': f"Column '{col}' not found"}
def _handle_filter_less(self, match) -> Dict:
"""Handle filter less than query."""
col = match.group(1).strip()
value = float(match.group(2))
col = self._find_column(col)
if col and col in self.df.columns:
filtered = self.df[self.df[col] < value]
return {
'data': filtered.head(20),
'message': f"Found {len(filtered)} rows where '{col}' < {value}"
}
return {'error': f"Column '{col}' not found"}
def _handle_filter_equal(self, match) -> Dict:
"""Handle filter equal query."""
col = match.group(1).strip()
value = match.group(2).strip()
col = self._find_column(col)
if col and col in self.df.columns:
# Try to convert value to appropriate type
try:
value = float(value)
except:
pass
filtered = self.df[self.df[col] == value]
return {
'data': filtered.head(20),
'message': f"Found {len(filtered)} rows where '{col}' = '{value}'"
}
return {'error': f"Column '{col}' not found"}
def _handle_top_n(self, match) -> Dict:
"""Handle top N query."""
n = int(match.group(1))
col = match.group(2).strip() if match.group(2) else self.df.columns[0]
col = self._find_column(col)
if col and col in self.df.columns:
top_n = self.df.nlargest(n, col)
return {
'data': top_n,
'message': f"Top {n} rows by '{col}'"
}
return {'error': f"Column '{col}' not found"}
def _handle_group_by(self, match) -> Dict:
"""Handle group by query."""
col = match.group(1).strip()
agg_func = match.group(2) if match.group(2) else 'mean'
col = self._find_column(col)
if col and col in self.df.columns:
agg_map = {
'mean': 'mean', 'avg': 'mean', 'average': 'mean',
'sum': 'sum', 'count': 'count',
'max': 'max', 'min': 'min'
}
func = agg_map.get(agg_func, 'mean')
numeric_cols = self.df.select_dtypes(include=[np.number]).columns
grouped = self.df.groupby(col)[numeric_cols].agg(func)
return {
'data': grouped.head(20),
'message': f"Grouped by '{col}' with {func} aggregation"
}
return {'error': f"Column '{col}' not found"}
def _handle_correlation(self, match) -> Dict:
"""Handle correlation query."""
col1 = match.group(1).strip()
col2 = match.group(2).strip()
col1 = self._find_column(col1)
col2 = self._find_column(col2)
if col1 in self.df.columns and col2 in self.df.columns:
corr = self.df[col1].corr(self.df[col2])
return {
'message': f"Correlation between '{col1}' and '{col2}': {corr:.4f}",
'value': corr
}
return {'error': f"One or both columns not found"}
def _handle_missing_values(self, match) -> Dict:
"""Handle missing values query."""
missing = self.df.isnull().sum()
missing = missing[missing > 0]
if len(missing) > 0:
return {
'data': missing,
'message': f"Found missing values in {len(missing)} columns"
}
return {'message': "No missing values found! 🎉"}
def _handle_duplicates(self, match) -> Dict:
"""Handle duplicates query."""
n_duplicates = self.df.duplicated().sum()
return {
'message': f"Found {n_duplicates} duplicate rows",
'count': n_duplicates
}
def _find_column(self, col_name: str) -> Optional[str]:
"""
Find the closest matching column name.
Args:
col_name: Column name to find
Returns:
Actual column name or None
"""
col_name = col_name.lower().strip()
# Exact match
for col in self.df.columns:
if col.lower() == col_name:
return col
# Substring match
for col in self.df.columns:
if col_name in col.lower() or col.lower() in col_name:
return col
return None
def _query_llm(self, query: str) -> Dict[str, Any]:
"""
Query LLM API for complex questions.
Args:
query: Natural language query
Returns:
Dict with LLM response
"""
if self.llm_provider == 'openai':
return self._query_openai(query)
else:
return self._query_huggingface(query)
def _query_openai(self, query: str) -> Dict[str, Any]:
"""Query OpenAI API."""
try:
import openai
openai.api_key = self.llm_api_key
# Create context about the dataset
columns_info = "\n".join([
f"- {col} ({self.df[col].dtype})"
for col in self.df.columns[:20] # Limit to first 20 columns
])
prompt = f"""You are a data analysis assistant. Answer the following question about a dataset.
Dataset Information:
- Shape: {self.df.shape}
- Columns:
{columns_info}
User Question: {query}
Provide a clear, concise answer based on the dataset structure."""
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "You are a helpful data analysis assistant."},
{"role": "user", "content": prompt}
],
max_tokens=500
)
answer = response.choices[0].message.content
return {
'success': True,
'type': 'llm_response',
'result': {'message': answer},
'method': 'llm'
}
except Exception as e:
return {
'success': False,
'error': f"LLM query failed: {str(e)}"
}
def _query_huggingface(self, query: str) -> Dict[str, Any]:
"""Query HuggingFace Inference API."""
try:
import requests
API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-large"
headers = {"Authorization": f"Bearer {self.llm_api_key}"}
payload = {
"inputs": f"Answer this data question: {query}",
"parameters": {"max_length": 200}
}
response = requests.post(API_URL, headers=headers, json=payload)
result = response.json()
if isinstance(result, list) and len(result) > 0:
answer = result[0].get('generated_text', 'No response')
else:
answer = str(result)
return {
'success': True,
'type': 'llm_response',
'result': {'message': answer},
'method': 'llm'
}
except Exception as e:
return {
'success': False,
'error': f"HuggingFace query failed: {str(e)}"
}
def get_chat_history(self) -> List[Dict[str, str]]:
"""Get the chat history."""
return self.chat_history
def clear_history(self) -> None:
"""Clear the chat history."""
self.chat_history = []
# Streamlit display functions
def display_chat_interface(df: pd.DataFrame, column_types: Optional[Dict[str, str]] = None):
    """Display the chat interface in Streamlit.

    Renders an optional LLM-provider configuration expander, a free-text
    query box wired to :class:`ChatEngine`, and an example-query expander.

    Args:
        df: Dataset to query.
        column_types: Optional column-type mapping forwarded to ChatEngine.
    """
    st.subheader("💬 Chat With Your Data")

    # Re-create the engine whenever the dataframe changes. Caching purely on
    # key presence (the previous behavior) kept answering questions about a
    # stale dataset after the caller loaded a new one.
    df_fingerprint = (df.shape, tuple(df.columns))
    if ('chat_engine' not in st.session_state
            or st.session_state.get('chat_engine_df_fingerprint') != df_fingerprint):
        st.session_state.chat_engine = ChatEngine(df, column_types)
        st.session_state.chat_engine_df_fingerprint = df_fingerprint
    chat_engine = st.session_state.chat_engine

    # LLM configuration
    with st.expander("⚙️ LLM Configuration (Optional)"):
        col1, col2 = st.columns(2)
        with col1:
            provider = st.selectbox(
                "LLM Provider",
                options=['None', 'openai', 'huggingface'],
                help="Select LLM provider for complex queries"
            )
        with col2:
            if provider != 'None':
                api_key = st.text_input(
                    "API Key",
                    type="password",
                    help=f"Enter your {provider} API key"
                )
                if api_key:
                    chat_engine.set_llm_config(api_key, provider)

    # Chat input
    query = st.text_input(
        "Ask a question about your data",
        placeholder="e.g., 'What is the average age?' or 'Show rows where salary > 50000'"
    )
    if st.button("Ask", type="primary") and query:
        with st.spinner("Processing..."):
            response = chat_engine.process_query(query)
        if response['success']:
            result = response['result']
            # Display message
            if 'message' in result:
                st.info(result['message'])
            # Display data
            if 'data' in result:
                st.dataframe(result['data'], use_container_width=True)
            # Display single value (floats formatted, everything else raw)
            if 'value' in result:
                st.metric(
                    "Result",
                    f"{result['value']:.4f}" if isinstance(result['value'], float) else result['value']
                )
        else:
            st.error(response.get('error', 'Unknown error'))
            if 'suggestions' in response:
                st.write("Try these queries:")
                for suggestion in response['suggestions']:
                    st.code(suggestion)

    # Example queries
    with st.expander("📖 Example Queries"):
        st.markdown("""
**Basic Queries:**
- `show me the first 10 rows`
- `how many rows and columns?`
- `describe the data`

**Column Queries:**
- `what is the average of [column]?`
- `what is the maximum of [column]?`
- `show value counts for [column]`

**Filtering:**
- `show rows where [column] > 100`
- `show rows where [column] = value`
- `top 10 by [column]`

**Aggregation:**
- `group by [column] and calculate mean`
- `correlation between [col1] and [col2]`

**Data Quality:**
- `show missing values`
- `how many duplicates?`
""")