Spaces:
Sleeping
Sleeping
| """ | |
| Chat Engine Module - InsightGenAI | |
| ================================= | |
| Natural language interface for data querying. | |
| Converts natural language questions to pandas queries. | |
| Includes fallback to LLM API for complex queries. | |
| Author: InsightGenAI Team | |
| Version: 1.0.0 | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| import re | |
| from typing import Dict, List, Tuple, Optional, Any, Callable | |
| import streamlit as st | |
| import json | |
| import os | |
class ChatEngine:
    """
    Natural language chat interface for data analysis.

    Supports:
    - Pattern-based query parsing
    - Pandas code generation
    - LLM API fallback for complex queries
    """

    # Query patterns for common data operations.
    #
    # NOTE: process_query() tries these in dict order. The filtering and
    # top-N entries are listed FIRST on purpose: their queries ("show rows
    # where x > 5", "show me top 10", "highest 3 by price") also contain
    # the substrings matched by the generic show_head/column patterns and
    # would otherwise be swallowed by them.
    QUERY_PATTERNS = {
        # Filtering queries
        'filter_greater': {
            'patterns': [
                r'show (?:rows? )?where ([\w\s]+) (?:is )?greater than (\d+\.?\d*)',
                r'show (?:rows? )?where ([\w\s]+) (?:is )?more than (\d+\.?\d*)',
                r'show (?:rows? )?where ([\w\s]+) > (\d+\.?\d*)',
                r'filter ([\w\s]+) > (\d+\.?\d*)'
            ],
            'handler': '_handle_filter_greater'
        },
        'filter_less': {
            'patterns': [
                r'show (?:rows? )?where ([\w\s]+) (?:is )?less than (\d+\.?\d*)',
                r'show (?:rows? )?where ([\w\s]+) (?:is )?fewer than (\d+\.?\d*)',
                r'show (?:rows? )?where ([\w\s]+) < (\d+\.?\d*)',
                r'filter ([\w\s]+) < (\d+\.?\d*)'
            ],
            'handler': '_handle_filter_less'
        },
        'filter_equal': {
            'patterns': [
                r'show (?:rows? )?where ([\w\s]+) (?:is |equals? )?([\w\s]+)',
                r'show (?:rows? )?where ([\w\s]+) = ([\w\s]+)',
                r'filter ([\w\s]+) = ([\w\s]+)'
            ],
            'handler': '_handle_filter_equal'
        },
        'top_n': {
            'patterns': [
                r'top (\d+) (?:by |sorted by )?([\w\s]+)',
                r'show (?:me )?top (\d+)',
                r'highest (\d+) (?:by )?([\w\s]+)'
            ],
            'handler': '_handle_top_n'
        },
        # Summary queries
        'show_head': {
            'patterns': [
                r'show (?:me )?(?:the )?(?:first )?(\d+ )?rows?',
                r'display (?:the )?(?:first )?(\d+ )?rows?',
                r'head (?:of )?(?:the )?data',
                r'show (?:me )?the (?:beginning|start)'
            ],
            'handler': '_handle_show_head'
        },
        'show_tail': {
            'patterns': [
                r'show (?:me )?(?:the )?last (\d+ )?rows?',
                r'display (?:the )?last (\d+ )?rows?',
                r'tail (?:of )?(?:the )?data',
                r'show (?:me )?the end'
            ],
            'handler': '_handle_show_tail'
        },
        'show_shape': {
            'patterns': [
                r'how many rows',
                r'how many columns',
                r'what is the shape',
                r'size of (?:the )?data',
                r'dimensions? of (?:the )?data'
            ],
            'handler': '_handle_show_shape'
        },
        'show_info': {
            'patterns': [
                r'show (?:me )?info',
                r'data types?',
                r'column types?',
                r'what columns',
                r'list columns'
            ],
            'handler': '_handle_show_info'
        },
        'show_describe': {
            'patterns': [
                r'describe (?:the )?data',
                r'summary statistics?',
                r'statistical summary',
                r'basic statistics?'
            ],
            'handler': '_handle_show_describe'
        },
        # Column-specific queries
        'column_stats': {
            'patterns': [
                r'stats (?:for |of )?(?:column )?([\w\s]+)',
                r'statistics (?:for |of )?(?:column )?([\w\s]+)',
                r'describe (?:column )?([\w\s]+)',
                r'info (?:about |on )?(?:column )?([\w\s]+)'
            ],
            'handler': '_handle_column_stats'
        },
        'column_mean': {
            'patterns': [
                r'(?:what is |calculate )?(?:the )?mean (?:of |for )?(?:column )?([\w\s]+)',
                r'(?:what is |calculate )?(?:the )?average (?:of |for )?(?:column )?([\w\s]+)',
                r'average (?:of |for )?([\w\s]+)'
            ],
            'handler': '_handle_column_mean'
        },
        'column_sum': {
            'patterns': [
                r'(?:what is |calculate )?(?:the )?sum (?:of |for )?(?:column )?([\w\s]+)',
                r'total (?:of |for )?([\w\s]+)',
                r'sum (?:of |for )?([\w\s]+)'
            ],
            'handler': '_handle_column_sum'
        },
        'column_max': {
            'patterns': [
                r'(?:what is |find )?(?:the )?max(?:imum)? (?:of |for )?(?:column )?([\w\s]+)',
                r'highest (?:value (?:in |of )?)?([\w\s]+)',
                r'max (?:of |for )?([\w\s]+)'
            ],
            'handler': '_handle_column_max'
        },
        'column_min': {
            'patterns': [
                r'(?:what is |find )?(?:the )?min(?:imum)? (?:of |for )?(?:column )?([\w\s]+)',
                r'lowest (?:value (?:in |of )?)?([\w\s]+)',
                r'min (?:of |for )?([\w\s]+)'
            ],
            'handler': '_handle_column_min'
        },
        'value_counts': {
            'patterns': [
                r'value counts? (?:for |of )?(?:column )?([\w\s]+)',
                r'unique values? (?:in |of )?([\w\s]+)',
                r'how many unique (?:values )?(?:in )?([\w\s]+)',
                r'frequency (?:of |for )?([\w\s]+)'
            ],
            'handler': '_handle_value_counts'
        },
        # Grouping queries
        'group_by': {
            'patterns': [
                r'group (?:by )?([\w\s]+) (?:and )?(?:calculate )?(?:the )?(mean|sum|count|avg|average|max|min)?',
                r'aggregate (?:by )?([\w\s]+)',
                r'([\w\s]+) (?:grouped |aggregation )by ([\w\s]+)'
            ],
            'handler': '_handle_group_by'
        },
        # Correlation queries
        'correlation': {
            'patterns': [
                r'correlation (?:between )?([\w\s]+) (?:and )?([\w\s]+)',
                r'correlate ([\w\s]+) (?:with |and )?([\w\s]+)',
                r'how (?:are |is )?([\w\s]+) (?:and )?([\w\s]+) related'
            ],
            'handler': '_handle_correlation'
        },
        # Missing values
        'missing_values': {
            'patterns': [
                r'missing values?',
                r'null values?',
                r'how many missing',
                r'na values?'
            ],
            'handler': '_handle_missing_values'
        },
        # Duplicates
        'duplicates': {
            'patterns': [
                r'duplicate rows?',
                r'how many duplicates',
                r'are there duplicates'
            ],
            'handler': '_handle_duplicates'
        }
    }

    def __init__(self, df: pd.DataFrame, column_types: Optional[Dict[str, str]] = None):
        """
        Initialize the Chat Engine.

        Args:
            df: Dataset to query
            column_types: Dictionary of column types
        """
        # Defensive copy so query handlers can never mutate the caller's frame.
        self.df = df.copy()
        self.column_types = column_types or {}
        # Each entry holds the raw query, the response method, and the full
        # result dict (which may include DataFrames, so this can grow large
        # over a long session).
        self.chat_history: List[Dict[str, Any]] = []
        self.llm_api_key: Optional[str] = None
        self.llm_provider: str = 'openai'  # or 'huggingface'

    def set_llm_config(self, api_key: str, provider: str = 'openai') -> None:
        """
        Configure LLM API for fallback queries.

        Args:
            api_key: API key for the LLM service
            provider: LLM provider ('openai' or 'huggingface')
        """
        self.llm_api_key = api_key
        self.llm_provider = provider

    def process_query(self, query: str) -> Dict[str, Any]:
        """
        Process a natural language query.

        Tries the regex patterns in QUERY_PATTERNS order, then falls back to
        the configured LLM (if any). Handler exceptions are caught here so a
        malformed query never crashes the calling UI.

        Args:
            query: Natural language query string

        Returns:
            Dict with response data. On pattern match: ``success``, ``type``,
            ``result``, ``method``. On failure: ``success=False`` plus
            ``error`` (and possibly ``suggestions``).
        """
        query_lower = query.lower().strip()
        # Try pattern matching first
        for query_type, config in self.QUERY_PATTERNS.items():
            for pattern in config['patterns']:
                match = re.search(pattern, query_lower)
                if match:
                    handler = getattr(self, config['handler'])
                    try:
                        result = handler(match)
                    except Exception as e:
                        # Boundary: surface handler failures (bad dtype,
                        # unexpected values, ...) as a chat error instead of
                        # letting the exception propagate into the UI.
                        return {
                            'success': False,
                            'error': f"Query failed: {e}"
                        }
                    # Add to chat history
                    self.chat_history.append({
                        'query': query,
                        'response_type': 'pattern',
                        'result': result
                    })
                    return {
                        'success': True,
                        'type': query_type,
                        'result': result,
                        'method': 'pattern'
                    }
        # Fallback to LLM if configured
        if self.llm_api_key:
            return self._query_llm(query)
        # No match found
        return {
            'success': False,
            'error': "I couldn't understand that query. Try rephrasing or use simpler terms.",
            'suggestions': self._get_suggestions()
        }

    def _get_suggestions(self) -> List[str]:
        """Get query suggestions for the user."""
        return [
            "Show me the first 10 rows",
            "What is the average of [column_name]?",
            "Show rows where [column] > 100",
            "Group by [column] and calculate mean",
            "What is the correlation between [col1] and [col2]?",
            "Show missing values"
        ]

    # Pattern handlers

    def _handle_show_head(self, match) -> Dict:
        """Handle show head query."""
        # Group 1 is r'(\d+ )' -- int() tolerates the trailing space.
        n = int(match.group(1)) if match.group(1) else 5
        return {
            'data': self.df.head(n),
            'message': f"Showing first {min(n, len(self.df))} rows"
        }

    def _handle_show_tail(self, match) -> Dict:
        """Handle show tail query."""
        n = int(match.group(1)) if match.group(1) else 5
        return {
            'data': self.df.tail(n),
            'message': f"Showing last {min(n, len(self.df))} rows"
        }

    def _handle_show_shape(self, match) -> Dict:
        """Handle shape query."""
        rows, cols = self.df.shape
        return {
            'message': f"The dataset has {rows:,} rows and {cols} columns",
            'shape': (rows, cols)
        }

    def _handle_show_info(self, match) -> Dict:
        """Handle info query: one row per column with dtype and null counts."""
        info_df = pd.DataFrame({
            'Column': self.df.columns,
            'Type': self.df.dtypes.values,
            'Non-Null Count': self.df.count().values,
            'Null Count': self.df.isnull().sum().values
        })
        return {
            'data': info_df,
            'message': f"Dataset has {len(self.df.columns)} columns"
        }

    def _handle_show_describe(self, match) -> Dict:
        """Handle describe query."""
        return {
            'data': self.df.describe(),
            'message': "Statistical summary of numeric columns"
        }

    def _handle_column_stats(self, match) -> Dict:
        """Handle column stats query."""
        raw = match.group(1).strip()
        # Map what the user typed onto an actual column name.
        col = self._find_column(raw)
        if col is None:
            return {'error': f"Column '{raw}' not found"}
        return {
            'data': self.df[col].describe(),
            'message': f"Statistics for column '{col}'"
        }

    def _handle_column_mean(self, match) -> Dict:
        """Handle column mean query (numeric columns only)."""
        raw = match.group(1).strip()
        col = self._find_column(raw)
        if col is None:
            return {'error': f"Column '{raw}' not found"}
        if not pd.api.types.is_numeric_dtype(self.df[col]):
            return {'error': f"Column '{col}' is not numeric"}
        mean_val = self.df[col].mean()
        return {
            'message': f"Mean of '{col}': {mean_val:.4f}",
            'value': mean_val
        }

    def _handle_column_sum(self, match) -> Dict:
        """Handle column sum query (numeric columns only)."""
        raw = match.group(1).strip()
        col = self._find_column(raw)
        if col is None:
            return {'error': f"Column '{raw}' not found"}
        if not pd.api.types.is_numeric_dtype(self.df[col]):
            return {'error': f"Column '{col}' is not numeric"}
        sum_val = self.df[col].sum()
        return {
            'message': f"Sum of '{col}': {sum_val:,.2f}",
            'value': sum_val
        }

    def _handle_column_max(self, match) -> Dict:
        """Handle column max query (works for strings too)."""
        raw = match.group(1).strip()
        col = self._find_column(raw)
        if col is None:
            return {'error': f"Column '{raw}' not found"}
        max_val = self.df[col].max()
        return {
            'message': f"Maximum of '{col}': {max_val}",
            'value': max_val
        }

    def _handle_column_min(self, match) -> Dict:
        """Handle column min query (works for strings too)."""
        raw = match.group(1).strip()
        col = self._find_column(raw)
        if col is None:
            return {'error': f"Column '{raw}' not found"}
        min_val = self.df[col].min()
        return {
            'message': f"Minimum of '{col}': {min_val}",
            'value': min_val
        }

    def _handle_value_counts(self, match) -> Dict:
        """Handle value counts query (top 10 most frequent values)."""
        raw = match.group(1).strip()
        col = self._find_column(raw)
        if col is None:
            return {'error': f"Column '{raw}' not found"}
        counts = self.df[col].value_counts().head(10)
        return {
            'data': counts,
            'message': f"Top 10 values in '{col}'"
        }

    def _handle_filter_greater(self, match) -> Dict:
        """Handle 'rows where col > value' query."""
        raw = match.group(1).strip()
        value = float(match.group(2))
        col = self._find_column(raw)
        if col is None:
            return {'error': f"Column '{raw}' not found"}
        filtered = self.df[self.df[col] > value]
        return {
            'data': filtered.head(20),  # cap display size; count reports the full total
            'message': f"Found {len(filtered)} rows where '{col}' > {value}"
        }

    def _handle_filter_less(self, match) -> Dict:
        """Handle 'rows where col < value' query."""
        raw = match.group(1).strip()
        value = float(match.group(2))
        col = self._find_column(raw)
        if col is None:
            return {'error': f"Column '{raw}' not found"}
        filtered = self.df[self.df[col] < value]
        return {
            'data': filtered.head(20),
            'message': f"Found {len(filtered)} rows where '{col}' < {value}"
        }

    def _handle_filter_equal(self, match) -> Dict:
        """Handle 'rows where col == value' query."""
        raw = match.group(1).strip()
        value = match.group(2).strip()
        col = self._find_column(raw)
        if col is None:
            return {'error': f"Column '{raw}' not found"}
        # Compare numerically when the value parses as a number; otherwise
        # compare as the original string.
        try:
            value = float(value)
        except ValueError:
            pass
        filtered = self.df[self.df[col] == value]
        return {
            'data': filtered.head(20),
            'message': f"Found {len(filtered)} rows where '{col}' = '{value}'"
        }

    def _handle_top_n(self, match) -> Dict:
        """Handle top N query via DataFrame.nlargest (numeric column required)."""
        n = int(match.group(1))
        groups = match.groups()
        # Not every top-N pattern captures a column ("show me top 5" has a
        # single group), so guard before reading the second group.
        raw = groups[1].strip() if len(groups) > 1 and groups[1] else None
        if raw is None:
            # No column named: default to the first numeric column, since
            # nlargest only works on numeric data.
            numeric_cols = self.df.select_dtypes(include=[np.number]).columns
            col = numeric_cols[0] if len(numeric_cols) > 0 else None
            raw = 'a numeric column'
        else:
            col = self._find_column(raw)
        if col is None or not pd.api.types.is_numeric_dtype(self.df[col]):
            return {'error': f"Column '{raw}' not found or not numeric"}
        return {
            'data': self.df.nlargest(n, col),
            'message': f"Top {n} rows by '{col}'"
        }

    def _handle_group_by(self, match) -> Dict:
        """Handle group by query: aggregate all numeric columns by one key."""
        raw = match.group(1).strip()
        # NOTE(review): for the '([\w\s]+) (?:grouped |aggregation )by ([\w\s]+)'
        # pattern, group 2 is actually the grouping column, not an aggregation
        # function name; agg_map.get() then silently falls back to 'mean'.
        groups = match.groups()
        agg_func = groups[1] if len(groups) > 1 and groups[1] else 'mean'
        col = self._find_column(raw)
        if col is None:
            return {'error': f"Column '{raw}' not found"}
        agg_map = {
            'mean': 'mean', 'avg': 'mean', 'average': 'mean',
            'sum': 'sum', 'count': 'count',
            'max': 'max', 'min': 'min'
        }
        func = agg_map.get(agg_func, 'mean')
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        grouped = self.df.groupby(col)[numeric_cols].agg(func)
        return {
            'data': grouped.head(20),
            'message': f"Grouped by '{col}' with {func} aggregation"
        }

    def _handle_correlation(self, match) -> Dict:
        """Handle Pearson correlation between two numeric columns."""
        col1 = self._find_column(match.group(1).strip())
        col2 = self._find_column(match.group(2).strip())
        if col1 is None or col2 is None:
            return {'error': "One or both columns not found"}
        if not (pd.api.types.is_numeric_dtype(self.df[col1])
                and pd.api.types.is_numeric_dtype(self.df[col2])):
            return {'error': "Correlation requires two numeric columns"}
        corr = self.df[col1].corr(self.df[col2])
        return {
            'message': f"Correlation between '{col1}' and '{col2}': {corr:.4f}",
            'value': corr
        }

    def _handle_missing_values(self, match) -> Dict:
        """Handle missing values query: per-column null counts (nonzero only)."""
        missing = self.df.isnull().sum()
        missing = missing[missing > 0]
        if len(missing) > 0:
            return {
                'data': missing,
                'message': f"Found missing values in {len(missing)} columns"
            }
        return {'message': "No missing values found! 🎉"}

    def _handle_duplicates(self, match) -> Dict:
        """Handle duplicates query (fully identical rows)."""
        n_duplicates = self.df.duplicated().sum()
        return {
            'message': f"Found {n_duplicates} duplicate rows",
            'count': n_duplicates
        }

    def _find_column(self, col_name: str) -> Optional[str]:
        """
        Find the closest matching column name.

        Matching is case-insensitive: exact match first, then a substring
        match in either direction (so 'age' finds 'Age_Years' and vice
        versa). Column labels are stringified so non-string labels
        (e.g. integers) do not crash the lookup.

        Args:
            col_name: Column name to find

        Returns:
            Actual column name or None
        """
        col_name = col_name.lower().strip()
        # Exact match
        for col in self.df.columns:
            if str(col).lower() == col_name:
                return col
        # Substring match (first hit wins -- may be ambiguous on similar names)
        for col in self.df.columns:
            lowered = str(col).lower()
            if col_name in lowered or lowered in col_name:
                return col
        return None

    def _query_llm(self, query: str) -> Dict[str, Any]:
        """
        Query LLM API for complex questions.

        Args:
            query: Natural language query

        Returns:
            Dict with LLM response
        """
        if self.llm_provider == 'openai':
            return self._query_openai(query)
        else:
            return self._query_huggingface(query)

    def _query_openai(self, query: str) -> Dict[str, Any]:
        """Query OpenAI API with a prompt describing the dataset schema."""
        try:
            import openai
            openai.api_key = self.llm_api_key
            # Create context about the dataset
            columns_info = "\n".join([
                f"- {col} ({self.df[col].dtype})"
                for col in self.df.columns[:20]  # Limit to first 20 columns
            ])
            prompt = f"""You are a data analysis assistant. Answer the following question about a dataset.
Dataset Information:
- Shape: {self.df.shape}
- Columns:
{columns_info}
User Question: {query}
Provide a clear, concise answer based on the dataset structure."""
            # NOTE(review): openai.ChatCompletion is the legacy (<1.0) client
            # API; this will raise on openai>=1.0 -- the except below turns
            # that into an error response. Confirm the pinned openai version.
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a helpful data analysis assistant."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=500
            )
            answer = response.choices[0].message.content
            return {
                'success': True,
                'type': 'llm_response',
                'result': {'message': answer},
                'method': 'llm'
            }
        except Exception as e:
            return {
                'success': False,
                'error': f"LLM query failed: {str(e)}"
            }

    def _query_huggingface(self, query: str) -> Dict[str, Any]:
        """Query HuggingFace Inference API (flan-t5-large)."""
        try:
            import requests
            API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-large"
            headers = {"Authorization": f"Bearer {self.llm_api_key}"}
            payload = {
                "inputs": f"Answer this data question: {query}",
                "parameters": {"max_length": 200}
            }
            # Timeout so a hung inference endpoint can't block the UI forever.
            response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
            result = response.json()
            if isinstance(result, list) and len(result) > 0:
                answer = result[0].get('generated_text', 'No response')
            else:
                answer = str(result)
            return {
                'success': True,
                'type': 'llm_response',
                'result': {'message': answer},
                'method': 'llm'
            }
        except Exception as e:
            return {
                'success': False,
                'error': f"HuggingFace query failed: {str(e)}"
            }

    def get_chat_history(self) -> List[Dict[str, Any]]:
        """Get the chat history."""
        return self.chat_history

    def clear_history(self) -> None:
        """Clear the chat history."""
        self.chat_history = []
| # Streamlit display functions | |
def display_chat_interface(df: pd.DataFrame, column_types: Optional[Dict[str, str]] = None):
    """Display chat interface in Streamlit.

    Renders the LLM configuration expander, the query input, the response
    (message / dataframe / metric / error), and a list of example queries.

    Args:
        df: Dataset the chat engine answers questions about.
        column_types: Optional column-type mapping forwarded to ChatEngine.
    """
    st.subheader("💬 Chat With Your Data")
    # Initialize chat engine once per session.
    # NOTE(review): the cached engine keeps the df it was built with; if the
    # caller later passes a different df it is ignored for the rest of the
    # session -- confirm whether the engine should be rebuilt on data change.
    if 'chat_engine' not in st.session_state:
        st.session_state.chat_engine = ChatEngine(df, column_types)
    chat_engine = st.session_state.chat_engine
    # LLM configuration
    with st.expander("⚙️ LLM Configuration (Optional)"):
        col1, col2 = st.columns(2)
        with col1:
            provider = st.selectbox(
                "LLM Provider",
                options=['None', 'openai', 'huggingface'],
                help="Select LLM provider for complex queries"
            )
        with col2:
            if provider != 'None':
                api_key = st.text_input(
                    "API Key",
                    type="password",
                    help=f"Enter your {provider} API key"
                )
                if api_key:
                    chat_engine.set_llm_config(api_key, provider)
    # Chat input
    query = st.text_input(
        "Ask a question about your data",
        placeholder="e.g., 'What is the average age?' or 'Show rows where salary > 50000'"
    )
    if st.button("Ask", type="primary") and query:
        with st.spinner("Processing..."):
            response = chat_engine.process_query(query)
        if response['success']:
            result = response['result']
            # Handlers report recoverable problems (e.g. unknown column) via
            # an 'error' key while process_query still marks success=True --
            # surface them instead of silently rendering nothing.
            if 'error' in result:
                st.error(result['error'])
            # Display message
            if 'message' in result:
                st.info(result['message'])
            # Display data
            if 'data' in result:
                st.dataframe(result['data'], use_container_width=True)
            # Display single value (floats formatted to 4 decimal places)
            if 'value' in result:
                st.metric("Result", f"{result['value']:.4f}" if isinstance(result['value'], float) else result['value'])
        else:
            st.error(response.get('error', 'Unknown error'))
            if 'suggestions' in response:
                st.write("Try these queries:")
                for suggestion in response['suggestions']:
                    st.code(suggestion)
    # Example queries
    with st.expander("📖 Example Queries"):
        st.markdown("""
**Basic Queries:**
- `show me the first 10 rows`
- `how many rows and columns?`
- `describe the data`

**Column Queries:**
- `what is the average of [column]?`
- `what is the maximum of [column]?`
- `show value counts for [column]`

**Filtering:**
- `show rows where [column] > 100`
- `show rows where [column] = value`
- `top 10 by [column]`

**Aggregation:**
- `group by [column] and calculate mean`
- `correlation between [col1] and [col2]`

**Data Quality:**
- `show missing values`
- `how many duplicates?`
""")