# ===== MULTIMODAL TEXT ANALYTICS AI ASSISTANT =====
# This is a comprehensive text analytics system with multiple AI API integrations
# and smart column detection capabilities for customer feedback analysis
# ===== IMPORTS SECTION =====
# Core Python libraries for basic functionality
import os # Operating system interface for environment variables and file operations
import warnings # Python warnings control to suppress unnecessary warnings
warnings.filterwarnings('ignore') # Suppress all warnings to keep output clean
# Environment and API management
from dotenv import load_dotenv # Load environment variables from .env file for API keys
from anthropic import Anthropic # Anthropic's Claude AI API client
# Additional AI APIs - using try/except to handle missing dependencies gracefully
try:
from openai import OpenAI # OpenAI's GPT API client
except ImportError:
OpenAI = None # Set to None if not installed, will be checked later
try:
from groq import Groq # Groq's fast inference API client
except ImportError:
Groq = None # Set to None if not installed
try:
import google.generativeai as genai # Google's Gemini API client
except ImportError:
genai = None # Set to None if not installed
# Data processing and manipulation libraries
import pandas as pd # Primary data manipulation library for DataFrames
import numpy as np # Numerical computing library for array operations
from datetime import datetime, timedelta # Date and time handling utilities
import json # JSON data format handling
import gc # Garbage collection for memory management - important for large datasets
# Natural Language Processing libraries
import nltk # Natural Language Toolkit - comprehensive NLP library
from nltk.corpus import stopwords # Common words to filter out (the, and, or, etc.)
from nltk.tokenize import word_tokenize # Split text into individual words/tokens
from nltk.stem import WordNetLemmatizer # Reduce words to their root form (running -> run)
from textblob import TextBlob # Simple API for diving into common NLP tasks
import re # Regular expressions for text pattern matching and cleaning
from collections import Counter # Efficient counting of hashable objects
# Machine Learning libraries for text analysis
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer # Convert text to numerical features
from sklearn.decomposition import LatentDirichletAllocation # Topic modeling algorithm
from sklearn.cluster import KMeans # Clustering algorithm for grouping similar texts
from sklearn.preprocessing import StandardScaler # Normalize numerical features
from sklearn.metrics.pairwise import cosine_similarity # Measure similarity between text vectors
# Visualization libraries for creating charts and graphs
import plotly.express as px # High-level plotting interface
import plotly.graph_objects as go # Low-level plotting interface for custom charts
from plotly.subplots import make_subplots # Create multiple charts in one figure
import matplotlib.pyplot as plt # Traditional plotting library
import seaborn as sns # Statistical data visualization built on matplotlib
# Web interface framework
import gradio as gr # Create web interfaces for machine learning models
# Download required NLTK data packages - these contain language models and corpora
nltk.download('punkt', quiet=True) # Sentence tokenizer models
nltk.download('punkt_tab', quiet=True) # New tokenizer format for latest NLTK versions
nltk.download('stopwords', quiet=True) # Lists of common words to filter out
nltk.download('wordnet', quiet=True) # Lexical database for lemmatization
nltk.download('averaged_perceptron_tagger', quiet=True) # Part-of-speech tagger
nltk.download('omw-1.4', quiet=True) # Open Multilingual Wordnet for lemmatizer
nltk.download('brown', quiet=True) # Brown corpus required for TextBlob
# Download TextBlob corpora for sentiment analysis
try:
from textblob import download_corpora # Import corpora downloader
download_corpora.main() # Download all required corpora
except Exception:
# Alternative method if the above doesn't work - use subprocess
import subprocess # Execute shell commands from Python
import sys # System-specific parameters and functions
try:
# Run TextBlob download command as subprocess with timeout
subprocess.run([sys.executable, "-m", "textblob.download_corpora"],
capture_output=True, text=True, timeout=30)
except:
# If download fails, print warning but continue execution
print("Warning: Could not download TextBlob corpora. Sentiment analysis may not work properly.")
print("Please run: python -m textblob.download_corpora")
# Load environment variables from .env file, override existing ones
load_dotenv(override=True)
# ===== SMART COLUMN DETECTOR CLASS =====
class SmartColumnDetector:
"""
Intelligently detect and extract relevant columns from uploaded data
This class automatically identifies what type of data each column contains
"""
def __init__(self):
"""Initialize the detector with keyword lists for different column types"""
# Keywords for detecting text/feedback columns - these usually contain the main content
self.text_keywords = ['comment', 'feedback', 'review', 'description', 'text',
'response', 'opinion', 'message', 'notes', 'remarks']
# Keywords for detecting ID/identifier columns - these uniquely identify records
self.id_keywords = ['id', 'identifier', 'key', 'number', 'code', 'ref',
'reference', 'index', 'uuid']
# Keywords for detecting product/category columns - these describe what's being reviewed
self.product_keywords = ['product', 'item', 'model', 'variant', 'type',
'category', 'brand', 'name', 'sku']
# Keywords for detecting date/time columns - these show when feedback was given
self.date_keywords = ['date', 'time', 'created', 'updated', 'timestamp']
def detect_column_types(self, df):
"""
Detect column types based on column names and content analysis
Returns a dictionary categorizing each column by its likely purpose
"""
# Initialize results dictionary with empty lists for each category
detected = {
'text_columns': [], # Columns containing feedback/comments
'id_columns': [], # Columns containing unique identifiers
'product_columns': [], # Columns describing products/categories
'date_columns': [], # Columns containing dates/timestamps
'other_columns': [] # Everything else
}
# Iterate through each column in the dataframe
for col in df.columns:
col_lower = col.lower() # Convert to lowercase for case-insensitive matching
# Check if column name contains text-related keywords
if any(keyword in col_lower for keyword in self.text_keywords):
detected['text_columns'].append(col)
# Check if column name contains ID-related keywords
elif any(keyword in col_lower for keyword in self.id_keywords):
detected['id_columns'].append(col)
# Check if column name contains product-related keywords
elif any(keyword in col_lower for keyword in self.product_keywords):
detected['product_columns'].append(col)
# Check if column name contains date-related keywords
elif any(keyword in col_lower for keyword in self.date_keywords):
detected['date_columns'].append(col)
else:
# If no keywords match, analyze the actual content to determine type
sample = df[col].dropna().head(100) # Get first 100 non-null values
if len(sample) > 0: # If we have sample data
# Check if column contains text data (object dtype in pandas)
if df[col].dtype == 'object':
# Calculate average length of text in this column
avg_length = sample.astype(str).str.len().mean()
if avg_length > 50: # Long text likely indicates feedback/comments
detected['text_columns'].append(col)
elif avg_length < 20 and df[col].nunique() / len(df) > 0.5:
# Short, mostly unique values likely indicate IDs
detected['id_columns'].append(col)
else:
# Short, non-unique text likely indicates categories/products
detected['product_columns'].append(col)
else:
# Non-text columns go to 'other' category
detected['other_columns'].append(col)
return detected # Return the categorized column dictionary
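# Illustrative sketch (kept as a comment, not executed): how the name-based
# heuristics above would classify a small, hypothetical DataFrame. The column
# names below are invented purely for demonstration.
#
#   import pandas as pd
#   sample = pd.DataFrame({
#       'ticket_id': [101, 102],                                   # 'id' keyword -> id_columns
#       'product_name': ['Widget A', 'Widget B'],                  # 'product' keyword -> product_columns
#       'customer_feedback': ['Arrived late, box was damaged.',
#                             'Great value, works as described.'], # 'feedback' keyword -> text_columns
#       'created_at': ['2024-01-05', '2024-01-06'],                # 'created' keyword -> date_columns
#   })
#   detected = SmartColumnDetector().detect_column_types(sample)
#   # -> text_columns=['customer_feedback'], id_columns=['ticket_id'],
#   #    product_columns=['product_name'], date_columns=['created_at']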
def extract_relevant_data(self, df):
"""
Extract only relevant columns and create optimized dataset for analysis
This reduces memory usage and focuses on important data
"""
# First, detect what type each column is
detected = self.detect_column_types(df)
# Create new dataframe with only relevant columns
extracted_data = pd.DataFrame()
# Add unique identifier column - use existing ID or create one
if detected['id_columns'] and len(detected['id_columns']) > 0:
# Use first detected ID column
extracted_data['unique_id'] = df[detected['id_columns'][0]]
else:
# Create sequential ID numbers if no ID column exists
extracted_data['unique_id'] = range(1, len(df) + 1)
# Add product information columns (limit to first 2 to avoid too many columns)
if detected['product_columns'] and len(detected['product_columns']) > 0:
# Convert to list if needed and limit to 2 product columns
product_cols = list(detected['product_columns'])[:2]
for col in product_cols:
# Add with 'product_' prefix to make purpose clear
extracted_data[f'product_{col}'] = df[col]
# Combine all text columns into a single 'combined_text' column
if detected['text_columns'] and len(detected['text_columns']) > 0:
text_cols = list(detected['text_columns']) # Ensure it's a list
text_data = [] # Initialize list to store combined text
# For each row, combine all text columns
for idx in df.index:
combined_text = ' '.join([
str(df.loc[idx, col]) # Convert to string
for col in text_cols # For each text column
if col in df.columns and pd.notna(df.loc[idx, col]) # If column exists and value is not null
])
text_data.append(combined_text) # Add to our list
extracted_data['combined_text'] = text_data # Add as new column
else:
# If no text columns detected, create empty combined_text column
extracted_data['combined_text'] = [''] * len(df)
# Add date column if available (use first detected date column)
if detected['date_columns'] and len(detected['date_columns']) > 0:
# Convert to datetime format, handle errors gracefully
extracted_data['date'] = pd.to_datetime(df[detected['date_columns'][0]], errors='coerce')
# Return both the extracted data and the detection results
return extracted_data, detected
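# Continuing the hypothetical example above, extract_relevant_data would produce
# a slimmed-down frame (sketch only):
#
#   extracted, detected = SmartColumnDetector().extract_relevant_data(sample)
#   # extracted.columns -> ['unique_id', 'product_product_name', 'combined_text', 'date']
#   # 'combined_text' concatenates all detected text columns per row; the rest of
#   # the pipeline (sentiment, topics, search, export) reads from that single column.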
# ===== ENHANCED TEXT PROCESSOR CLASS =====
class EnhancedTextProcessor:
"""
Enhanced text preprocessing with actionable insights extraction
This class handles text cleaning and extracts meaningful patterns from customer feedback
"""
def __init__(self):
"""Initialize the text processor with NLP tools and insight dictionaries"""
self.lemmatizer = WordNetLemmatizer() # Tool to reduce words to root form
self.stop_words = set(stopwords.words('english')) # Common words to ignore
# Dictionary mapping actionable items to keywords that indicate them
# This helps identify what customers want improved
self.actionable_dictionary = {
'improve speed': ['slow', 'faster', 'quick', 'speed up', 'takes too long', 'waiting'],
'better quality': ['poor quality', 'cheap', 'breaks', 'defective', 'flimsy', 'weak'],
'enhance ui': ['confusing', 'hard to use', 'complicated', 'not intuitive', 'difficult to navigate'],
'fix bugs': ['bug', 'error', 'crash', 'freeze', 'not working', 'glitch', 'broken'],
'add features': ['missing', 'need', 'want', 'should have', 'would be nice', 'lacks'],
'improve support': ['no response', 'unhelpful', 'rude', 'poor service', 'bad support'],
'better packaging': ['damaged', 'poor packaging', 'arrived broken', 'not protected'],
'clearer docs': ['unclear', 'no instructions', 'confusing manual', 'hard to understand'],
'reduce price': ['expensive', 'overpriced', 'too costly', 'not worth', 'cheaper'],
'faster delivery': ['late', 'delayed', 'slow shipping', 'took forever', 'still waiting'],
'better communication': ['no updates', 'not informed', 'lack of communication', 'no tracking'],
'improve reliability': ['unreliable', 'stops working', 'inconsistent', 'sometimes works'],
'enhance performance': ['slow performance', 'laggy', 'sluggish', 'not responsive'],
'better design': ['ugly', 'poor design', 'looks cheap', 'not attractive', 'outdated look'],
'more options': ['limited options', 'no variety', 'need more choices', 'only one option']
}
def clean_text(self, text):
"""
Clean and normalize text for analysis
Removes special characters and standardizes format
"""
# Handle null or empty text
if pd.isna(text) or text == '':
return ""
text = str(text).lower() # Convert to lowercase string
text = re.sub(r'[^a-zA-Z0-9\s]', '', text) # Remove special characters, keep only letters, numbers, spaces
text = ' '.join(text.split()) # Remove extra whitespace
return text
def extract_actionable_insights(self, text):
"""
Extract actionable insights using dictionary matching
Returns comma-separated list of suggested improvements
"""
# Handle null or empty text
if pd.isna(text) or text == '':
return ""
text_lower = text.lower() # Convert to lowercase for matching
found_insights = [] # List to store found actionable items
# Check each actionable item against the text
for action, keywords in self.actionable_dictionary.items():
for keyword in keywords:
if keyword in text_lower: # If keyword found in text
found_insights.append(action) # Add the actionable item
break # Only add each action once per text
# Return top 3 most relevant insights to avoid overwhelming output
if found_insights:
return ', '.join(found_insights[:3])
return ""
def extract_specific_topics(self, text):
"""
Extract specific topics from text using keyword extraction and noun phrase detection
Returns list of 3 topics (may include empty strings if not enough topics found)
"""
# Handle null, empty, or very short text
if pd.isna(text) or text == '' or len(text) < 10:
return ['', '', ''] # Return 3 empty strings
text_lower = text.lower() # Convert to lowercase
# Remove stopwords for better topic extraction
words = word_tokenize(text_lower) # Split into individual words
# Filter out stopwords and very short words
filtered_words = [w for w in words if w not in self.stop_words and len(w) > 3]
# Extract noun phrases using TextBlob (these are usually good topics)
blob = TextBlob(text)
noun_phrases = blob.noun_phrases # Get noun phrases from text
topics = [] # Initialize topics list
# Add noun phrases (these are usually good topics)
for phrase in noun_phrases[:5]: # Limit to top 5 noun phrases
if len(phrase.split()) <= 3: # Only include short phrases (3 words or less)
topics.append(phrase)
# Add frequent meaningful words if we don't have enough topics
if len(topics) < 3:
word_freq = Counter(filtered_words) # Count word frequencies
for word, _ in word_freq.most_common(5): # Get top 5 most common words
if word not in str(topics): # Avoid duplicates
topics.append(word)
if len(topics) >= 3: # Stop when we have 3 topics
break
# Ensure we always return exactly 3 items
topics = topics[:3] # Take only first 3
while len(topics) < 3: # Add empty strings if needed
topics.append('')
return topics
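# Rough sketch of topic extraction (indicative only - TextBlob's noun-phrase
# detection depends on the corpora downloaded at startup):
#
#   processor = EnhancedTextProcessor()
#   processor.extract_specific_topics("The battery life is poor and the charging cable broke")
#   # -> something like ['battery life', 'charging cable', 'poor']; always exactly
#   #    3 items, padded with empty strings when fewer topics are found.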
def determine_topic(self, text):
"""
Legacy method kept for compatibility - returns first specific topic
This maintains backward compatibility with older versions
"""
topics = self.extract_specific_topics(text) # Get all topics
return topics[0] if topics[0] else 'General' # Return first topic or 'General'
# ===== SEARCH ENGINE CLASS =====
class TextSearchEngine:
"""
Advanced search functionality for text data with semantic capabilities
Uses TF-IDF vectorization and cosine similarity for intelligent text search
"""
def __init__(self):
"""Initialize the search engine with TF-IDF vectorizer and synonym dictionary"""
# TF-IDF (Term Frequency-Inverse Document Frequency) vectorizer
# Converts text to numerical vectors for similarity calculations
self.vectorizer = TfidfVectorizer(
max_features=1000, # Limit to top 1000 most important terms
ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams for better matching
stop_words='english', # Remove common English words
use_idf=True, # Use inverse document frequency weighting
smooth_idf=True, # Add smoothing to IDF
sublinear_tf=True # Apply sublinear tf scaling to dampen very frequent terms
)
self.tfidf_matrix = None # Will store the TF-IDF matrix after building index
self.data = None # Will store the original data
# Synonym dictionary for semantic search - helps find related terms
self.synonyms = {
'fast': ['quick', 'rapid', 'speedy', 'swift', 'prompt'],
'slow': ['sluggish', 'delayed', 'laggy', 'lengthy', 'prolonged'],
'good': ['excellent', 'great', 'wonderful', 'fantastic', 'amazing', 'positive'],
'bad': ['poor', 'terrible', 'awful', 'negative', 'horrible', 'disappointing'],
'problem': ['issue', 'bug', 'error', 'defect', 'fault', 'glitch'],
'help': ['support', 'assistance', 'aid', 'service'],
'price': ['cost', 'fee', 'charge', 'rate', 'payment', 'expensive', 'cheap'],
'quality': ['standard', 'grade', 'condition', 'caliber'],
'delivery': ['shipping', 'dispatch', 'arrival', 'transport'],
'easy': ['simple', 'straightforward', 'effortless', 'user-friendly'],
'hard': ['difficult', 'complex', 'complicated', 'challenging'],
'broken': ['damaged', 'defective', 'faulty', 'malfunctioning'],
'love': ['like', 'enjoy', 'appreciate', 'adore'],
'hate': ['dislike', 'despise', 'detest'],
'feature': ['function', 'capability', 'option', 'characteristic'],
'customer': ['client', 'buyer', 'purchaser', 'consumer', 'user']
}
def expand_query_with_synonyms(self, query):
"""
Expand search query with synonyms for better semantic matching
This helps find relevant results even when different words are used
"""
query_words = query.lower().split() # Split query into individual words
expanded_terms = [] # List to store original words and synonyms
for word in query_words:
expanded_terms.append(word) # Add the original word
# Add synonyms if available for this word
if word in self.synonyms:
expanded_terms.extend(self.synonyms[word])
# Check if word is a synonym of something else and add related terms
for key, syns in self.synonyms.items():
if word in syns: # If current word is a synonym
expanded_terms.append(key) # Add the main term
expanded_terms.extend([s for s in syns if s != word]) # Add other synonyms
# Remove duplicates while preserving order
seen = set()
unique_terms = []
for term in expanded_terms:
if term not in seen:
unique_terms.append(term)
seen.add(term)
return ' '.join(unique_terms) # Return expanded query as single string
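# Example of synonym expansion (deterministic given the dictionary above):
#
#   engine = TextSearchEngine()
#   engine.expand_query_with_synonyms("slow delivery")
#   # -> 'slow sluggish delayed laggy lengthy prolonged delivery shipping dispatch arrival transport'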
def build_index(self, df, text_column):
"""
Build search index from text data
Creates TF-IDF vectors for all documents to enable fast similarity search
"""
self.data = df.copy() # Store copy of the data
texts = df[text_column].fillna('').tolist() # Get all text, fill nulls with empty string
# Add other searchable columns to improve search accuracy
if 'topic_1' in df.columns:
# Combine main text with topic information for better searchability
texts = [f"{text} {df.iloc[i]['topic_1']} {df.iloc[i]['topic_2']} {df.iloc[i]['topic_3']}"
for i, text in enumerate(texts)]
if 'actionable_insights' in df.columns:
# Also include actionable insights in searchable text
texts = [f"{texts[i]} {df.iloc[i]['actionable_insights']}"
for i in range(len(texts))]
# Create TF-IDF matrix from all texts
self.tfidf_matrix = self.vectorizer.fit_transform(texts)
def search(self, query, top_k=10):
"""
Enhanced search with semantic understanding
Returns top matching documents with similarity scores
"""
# Check if index has been built
if self.tfidf_matrix is None:
return pd.DataFrame() # Return empty DataFrame if no index
# Expand query with synonyms for better semantic matching
expanded_query = self.expand_query_with_synonyms(query)
# Vectorize both original and expanded queries
query_vector = self.vectorizer.transform([query]) # Original query vector
expanded_vector = self.vectorizer.transform([expanded_query]) # Expanded query vector
# Calculate similarities for both queries against all documents
similarities_orig = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
similarities_exp = cosine_similarity(expanded_vector, self.tfidf_matrix).flatten()
# Combine scores (weighted average - original query gets more weight)
combined_similarities = (0.7 * similarities_orig + 0.3 * similarities_exp)
# Get top results
top_indices = combined_similarities.argsort()[-top_k:][::-1] # Get indices of top scores, reverse order
top_scores = combined_similarities[top_indices] # Get the actual scores
# Filter results with score > 0.05 (lower threshold for better recall)
valid_indices = [idx for idx, score in zip(top_indices, top_scores) if score > 0.05]
if valid_indices:
# Create results dataframe from valid matches
results = self.data.iloc[valid_indices].copy()
results['search_score'] = [combined_similarities[idx] for idx in valid_indices]
# Boost results that have exact matches in the text
query_lower = query.lower()
for idx in results.index:
if 'combined_text' in results.columns:
# If exact query appears in text, boost the score
if query_lower in str(results.at[idx, 'combined_text']).lower():
results.at[idx, 'search_score'] *= 1.5 # 50% boost for exact matches
return results.sort_values('search_score', ascending=False) # Return sorted by relevance
return pd.DataFrame() # Return empty DataFrame if no valid results
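# Minimal end-to-end sketch of the search engine; the two documents below are
# invented for demonstration:
#
#   import pandas as pd
#   engine = TextSearchEngine()
#   docs = pd.DataFrame({'combined_text': [
#       'Delivery was slow and the tracking page never updated',
#       'Excellent build quality, very happy with the purchase',
#   ]})
#   engine.build_index(docs, 'combined_text')
#   hits = engine.search('late shipping', top_k=5)
#   # 'hits' is a copy of the matching rows plus a 'search_score' column, sorted by
#   # relevance; synonym expansion lets 'shipping' match the 'delivery' document.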
# ===== AI MODEL MANAGER CLASS =====
class AIModelManager:
"""
Manages multiple AI model APIs and provides a unified interface
Supports OpenAI, Anthropic, Deepseek, Groq, and Google Gemini
"""
def __init__(self):
"""Initialize the model manager and set up all available AI APIs"""
self.available_models = {} # Dictionary to store available models
self.clients = {} # Dictionary to store API clients
self.current_model = None # Currently selected model
self.initialize_apis() # Set up all APIs
def initialize_apis(self):
"""Initialize all available AI APIs based on environment variables"""
# Anthropic Claude API setup
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY") # Get API key from environment
if ANTHROPIC_API_KEY: # If API key exists
try:
self.clients['anthropic'] = Anthropic(api_key=ANTHROPIC_API_KEY) # Create client
# Add Claude model to available models
self.available_models['Claude 3 Haiku'] = {
'provider': 'anthropic',
'model': 'claude-3-haiku-20240307'
}
print(f"Anthropic API Key exists and begins {ANTHROPIC_API_KEY[:4]}") # Confirm setup
except Exception as e:
print(f"Error initializing Anthropic: {e}")
else:
print("Anthropic API Key not set")
# OpenAI API setup
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if OPENAI_API_KEY and OpenAI: # Check both API key and library availability
try:
self.clients['openai'] = OpenAI(api_key=OPENAI_API_KEY)
# Add multiple OpenAI models
self.available_models['GPT-4o-mini'] = {
'provider': 'openai',
'model': 'gpt-4o-mini'
}
self.available_models['GPT-3.5 Turbo'] = {
'provider': 'openai',
'model': 'gpt-3.5-turbo'
}
print(f"OpenAI API Key exists and begins {OPENAI_API_KEY[:7]}")
except Exception as e:
print(f"Error initializing OpenAI: {e}")
else:
print("OpenAI API Key not set or library not installed")
# Deepseek API setup (uses OpenAI-compatible API)
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
if DEEPSEEK_API_KEY and OpenAI:
try:
# Deepseek uses OpenAI client with different base URL
self.clients['deepseek'] = OpenAI(
api_key=DEEPSEEK_API_KEY,
base_url="https://api.deepseek.com" # Deepseek's API endpoint
)
self.available_models['Deepseek Chat'] = {
'provider': 'deepseek',
'model': 'deepseek-chat'
}
print(f"Deepseek API Key exists and begins {DEEPSEEK_API_KEY[:7]}")
except Exception as e:
print(f"Error initializing Deepseek: {e}")
else:
print("Deepseek API Key not set or OpenAI library not installed")
# Groq API setup
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if GROQ_API_KEY and Groq:
try:
self.clients['groq'] = Groq(api_key=GROQ_API_KEY)
# Add multiple Groq models
self.available_models['Llama 3.3 70B'] = {
'provider': 'groq',
'model': 'llama-3.3-70b-versatile'
}
self.available_models['Mixtral 8x7B'] = {
'provider': 'groq',
'model': 'mixtral-8x7b-32768'
}
print(f"Groq API Key exists and begins {GROQ_API_KEY[:4]}")
except Exception as e:
print(f"Error initializing Groq: {e}")
else:
print("Groq API Key not set or library not installed")
# Google Gemini API setup
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if GOOGLE_API_KEY and genai:
try:
genai.configure(api_key=GOOGLE_API_KEY) # Configure Google AI
self.clients['google'] = genai # Store the configured module
# Add Google models
self.available_models['Gemini 1.5 Flash'] = {
'provider': 'google',
'model': 'gemini-1.5-flash'
}
self.available_models['Gemini 1.5 Pro'] = {
'provider': 'google',
'model': 'gemini-1.5-pro'
}
print(f"Google API Key exists and begins {GOOGLE_API_KEY[:2]}")
except Exception as e:
print(f"Error initializing Google Gemini: {e}")
else:
print("Google API Key not set or library not installed")
# Set default model to first available model
if self.available_models:
self.current_model = list(self.available_models.keys())[0]
def get_available_models(self):
"""Return list of available model names"""
return list(self.available_models.keys())
def set_model(self, model_name):
"""Set the current model for text generation"""
if model_name in self.available_models:
self.current_model = model_name
return True # Success
return False # Model not available
def generate_text(self, prompt, max_tokens=1000):
"""
Generate text using the current model
Handles different API formats for each provider
"""
# Check if we have a valid current model
if not self.current_model or self.current_model not in self.available_models:
return None
model_info = self.available_models[self.current_model] # Get model configuration
provider = model_info['provider'] # Which API provider to use
model = model_info['model'] # Specific model name
try:
# Handle Anthropic API format
if provider == 'anthropic':
client = self.clients['anthropic']
response = client.messages.create(
model=model,
max_tokens=max_tokens,
messages=[{"role": "user", "content": prompt}]
)
return response.content[0].text # Extract text from response
# Handle OpenAI and Deepseek API format (both use OpenAI-compatible format)
elif provider in ['openai', 'deepseek']:
client = self.clients[provider]
response = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": prompt}],
max_tokens=max_tokens
)
return response.choices[0].message.content # Extract text from response
# Handle Groq API format (similar to OpenAI)
elif provider == 'groq':
client = self.clients['groq']
response = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": prompt}],
max_tokens=max_tokens
)
return response.choices[0].message.content
# Handle Google Gemini API format
elif provider == 'google':
model_obj = genai.GenerativeModel(model) # Create model object
response = model_obj.generate_content(prompt) # Generate response
return response.text # Extract text
except Exception as e:
print(f"Error generating text with {self.current_model}: {e}")
return None
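# Illustrative usage of the manager (requires at least one API key in the
# environment; model names must match entries in get_available_models()):
#
#   manager = AIModelManager()
#   print(manager.get_available_models())       # e.g. ['Claude 3 Haiku', 'GPT-4o-mini', ...]
#   manager.set_model('Claude 3 Haiku')         # returns False if that model is unavailable
#   reply = manager.generate_text("Summarise: delivery was late but support fixed it quickly.")
#   # 'reply' is plain text on success, or None if the provider call fails.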
# Initialize the model manager globally so it can be used throughout the application
model_manager = AIModelManager()
# ===== ENHANCED TEXT ANALYZER CLASS =====
class EnhancedTextAnalyzer:
"""
Main analysis engine with all enhanced features and multi-model support
This is the core class that orchestrates all text analysis functionality
"""
def __init__(self, model_manager=None):
"""Initialize the analyzer with all component classes"""
self.model_manager = model_manager # AI model manager for generating insights
self.column_detector = SmartColumnDetector() # Smart column detection
self.text_processor = EnhancedTextProcessor() # Text processing and insights
self.search_engine = TextSearchEngine() # Text search functionality
self.original_df = None # Store original data
self.processed_df = None # Store processed data
self.results = {} # Store analysis results
self.visualizations = {} # Store generated visualizations
def load_file(self, file):
"""
Load data from various file formats (CSV, Excel, JSON)
Returns the loaded dataframe and a status message
"""
try:
# Determine file type based on extension and load accordingly
if file.name.endswith('.csv'):
df = pd.read_csv(file.name) # Load CSV file
elif file.name.endswith(('.xlsx', '.xls')):
df = pd.read_excel(file.name) # Load Excel file
elif file.name.endswith('.json'):
df = pd.read_json(file.name) # Load JSON file
else:
return None, "Unsupported file format" # Return error for unsupported formats
return df, f"File loaded: {len(df)} records" # Return success message with record count
except Exception as e:
return None, f"Error loading file: {str(e)}" # Return error message
def process_data(self, df):
"""
Process data with smart extraction and analysis
This is the main processing pipeline that analyzes the uploaded data
"""
# Step 1: Extract relevant columns using smart detection
extracted_df, detected_columns = self.column_detector.extract_relevant_data(df)
# Step 2: Store processed data for later use
self.processed_df = extracted_df
# Step 3: Clean up memory by deleting original large dataframe
del df
gc.collect() # Force garbage collection to free memory
# Step 4: Add analysis columns if we have text data to analyze
if 'combined_text' in extracted_df.columns:
# Initialize lists to store analysis results for each row
sentiments = [] # Positive/Negative/Neutral sentiment classification
polarities = [] # Numerical sentiment scores (-1 to 1)
topics_1 = [] # Primary topic for each text
topics_2 = [] # Secondary topic for each text
topics_3 = [] # Tertiary topic for each text
insights = [] # Actionable insights for each text
# Process each text entry
for text in extracted_df['combined_text']:
# Sentiment analysis using TextBlob
blob = TextBlob(text)
polarity = blob.sentiment.polarity # Get numerical sentiment score
# Convert numerical score to categorical sentiment
if polarity > 0.1: # Positive threshold
sentiment = 'Positive'
elif polarity < -0.1: # Negative threshold
sentiment = 'Negative'
else: # Neutral range
sentiment = 'Neutral'
sentiments.append(sentiment) # Add categorical sentiment
polarities.append(polarity) # Add numerical score
# Extract specific topics (3 separate topics per text)
specific_topics = self.text_processor.extract_specific_topics(text)
topics_1.append(specific_topics[0]) # Primary topic
topics_2.append(specific_topics[1]) # Secondary topic
topics_3.append(specific_topics[2]) # Tertiary topic
# Extract actionable insights using dictionary matching
insight = self.text_processor.extract_actionable_insights(text)
insights.append(insight)
# Add all analysis results as new columns to the dataframe
extracted_df['sentiment'] = sentiments # Categorical sentiment
extracted_df['sentiment_score'] = polarities # Numerical sentiment score
extracted_df['topic_1'] = topics_1 # Primary topic
extracted_df['topic_2'] = topics_2 # Secondary topic
extracted_df['topic_3'] = topics_3 # Tertiary topic
extracted_df['actionable_insights'] = insights # Actionable insights
# Build search index with enhanced search capabilities
self.search_engine.build_index(extracted_df, 'combined_text')
# Step 5: Save processed data to Excel file for download
output_file = 'processed_data.xlsx'
extracted_df.to_excel(output_file, index=False)
# Return processed data, detected column info, and output file path
return extracted_df, detected_columns, output_file
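# Sketch of the processing pipeline wired together by this method; 'feedback.csv'
# is a placeholder path used only for illustration:
#
#   analyzer = EnhancedTextAnalyzer(model_manager)
#   df, msg = analyzer.load_file(open('feedback.csv'))   # any object with a .name attribute works
#   processed, detected, path = analyzer.process_data(df)
#   # 'processed' gains sentiment, sentiment_score, topic_1..topic_3 and
#   # actionable_insights columns; 'path' points at the saved Excel copy.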
def generate_ai_insights(self, df, num_samples=5):
"""
Generate AI-powered insights using selected model
Takes sample texts and generates high-level insights using AI
"""
# Check if AI model is available
if not self.model_manager or not self.model_manager.current_model:
return "No AI model available for generating insights"
# Check if we have text data to analyze
if 'combined_text' not in df.columns or df.empty:
return "No text data available for AI analysis"
# Sample some texts for analysis (to avoid sending too much data to AI)
sample_texts = df['combined_text'].dropna().head(num_samples).tolist()
if not sample_texts:
return "No valid text samples found"
# Create prompt for AI analysis
# This prompt asks the AI to analyze the customer feedback samples
prompt = f"""Analyze the following customer feedback samples and provide key insights:
Samples:
{chr(10).join([f"{i+1}. {text[:200]}..." if len(text) > 200 else f"{i+1}. {text}" for i, text in enumerate(sample_texts)])}
Please provide:
1. Main themes and patterns
2. Key sentiment indicators
3. Actionable recommendations
4. Areas of concern
Keep the response concise and focused on actionable insights."""
# Generate insights using selected model
try:
response = self.model_manager.generate_text(prompt, max_tokens=500)
if response:
return f"**AI Insights (using {self.model_manager.current_model}):**\n\n{response}"
else:
return "Failed to generate AI insights. Please check your API configuration."
except Exception as e:
return f"Error generating AI insights: {str(e)}"
def generate_visualizations(self, df):
"""
Generate various visualizations from the analyzed data
Creates interactive charts using Plotly for better user experience
"""
visualizations = {} # Dictionary to store all visualizations
# Generate sentiment distribution pie chart
if 'sentiment' in df.columns:
sentiment_counts = df['sentiment'].value_counts() # Count each sentiment category
fig_sentiment = px.pie(
values=sentiment_counts.values, # Values for pie slices
names=sentiment_counts.index, # Labels for pie slices
title="Sentiment Distribution", # Chart title
color_discrete_map={ # Custom colors for each sentiment
'Positive': '#27AE60', # Green for positive
'Negative': '#E74C3C', # Red for negative
'Neutral': '#95A5A6' # Gray for neutral
}
)
visualizations['Sentiment Distribution'] = fig_sentiment
# Generate topic distribution bar chart
if 'topic_1' in df.columns:
# Combine all topics from all three topic columns
all_topics = []
for col in ['topic_1', 'topic_2', 'topic_3']:
if col in df.columns:
topics = df[col].dropna().tolist() # Get non-null topics
all_topics.extend([t for t in topics if t != '']) # Add non-empty topics
if all_topics:
topic_counts = Counter(all_topics) # Count topic frequencies
top_topics = dict(topic_counts.most_common(15)) # Get top 15 topics
fig_topics = px.bar(
x=list(top_topics.values()), # Frequency values
y=list(top_topics.keys()), # Topic names
orientation='h', # Horizontal bar chart
title="Top 15 Specific Topics", # Chart title
labels={'x': 'Count', 'y': 'Topic'} # Axis labels
)
visualizations['Topic Distribution'] = fig_topics
# Generate sentiment by topic heatmap
if 'sentiment' in df.columns and 'topic_1' in df.columns:
df_temp = df[df['topic_1'] != ''].copy() # Filter out empty topics
if not df_temp.empty:
# Get top 10 topics for cleaner visualization
top_topics = df_temp['topic_1'].value_counts().head(10).index
df_filtered = df_temp[df_temp['topic_1'].isin(top_topics)]
# Create cross-tabulation of topics vs sentiments
pivot_table = pd.crosstab(df_filtered['topic_1'], df_filtered['sentiment'])
fig_heatmap = px.imshow(
pivot_table, # Data for heatmap
labels=dict(x="Sentiment", y="Primary Topic", color="Count"), # Labels
title="Sentiment by Primary Topic Heatmap", # Title
color_continuous_scale="RdYlGn" # Color scale (red to green)
)
visualizations['Sentiment by Topic'] = fig_heatmap
# Generate sentiment timeline if date data is available
if 'date' in df.columns and 'sentiment' in df.columns:
df_time = df.copy()
df_time['date'] = pd.to_datetime(df_time['date']) # Ensure date format
# Group by month and sentiment to show trends over time
time_data = df_time.groupby([pd.Grouper(key='date', freq='M'), 'sentiment']).size().reset_index(name='count')
fig_timeline = px.line(
time_data,
x='date', # X-axis: time
y='count', # Y-axis: count
color='sentiment', # Different lines for each sentiment
title="Sentiment Trends Over Time", # Chart title
color_discrete_map={ # Custom colors
'Positive': '#27AE60',
'Negative': '#E74C3C',
'Neutral': '#95A5A6'
}
)
visualizations['Sentiment Timeline'] = fig_timeline
# Generate actionable insights bar chart
if 'actionable_insights' in df.columns:
all_insights = [] # List to store all individual insights
for insight in df['actionable_insights']:
if insight and insight != "":
# Split by comma as we're now using comma-separated insights
all_insights.extend([i.strip() for i in insight.split(',')])
if all_insights:
insight_counts = Counter(all_insights) # Count insight frequencies
top_insights = dict(insight_counts.most_common(10)) # Get top 10 insights
fig_insights = px.bar(
x=list(top_insights.values()), # Frequency values
y=list(top_insights.keys()), # Insight names
orientation='h', # Horizontal bar chart
title="Top 10 Actionable Insights", # Chart title
labels={'x': 'Frequency', 'y': 'Insight'} # Axis labels
)
visualizations['Top Insights'] = fig_insights
return visualizations # Return dictionary of all generated visualizations
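# Quick way to inspect any returned figure outside Gradio (assumes process_data
# has already added the 'sentiment' column):
#
#   figs = analyzer.generate_visualizations(processed)
#   figs['Sentiment Distribution'].show()   # opens the interactive Plotly chart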
# ===== GRADIO INTERFACE FUNCTIONS =====
# Global variables to maintain state across function calls
analyzer = None # Main analyzer instance
current_data = None # Currently processed data
current_visualizations = None # Currently generated visualizations
def update_model(model_name):
"""Update the selected AI model"""
global model_manager
if model_manager.set_model(model_name): # Try to set the new model
return f"βœ… Model switched to: {model_name}"
else:
return f"❌ Failed to switch to: {model_name}"
def process_file(file, model_name):
"""
Process uploaded file with selected model
This is the main function called when user uploads a file
"""
global analyzer, current_data, current_visualizations, model_manager
# Check if file was uploaded
if file is None:
return "Please upload a file", None, None, None, None, None, gr.update(choices=[])
try:
# Update model if changed
if model_name and model_manager:
model_manager.set_model(model_name)
# Create new analyzer instance
analyzer = EnhancedTextAnalyzer(model_manager)
# Load the uploaded file
df, message = analyzer.load_file(file)
if df is None: # If file loading failed
return message, None, None, None, None, None, gr.update(choices=[])
# Process the loaded data
processed_df, detected_cols, output_file = analyzer.process_data(df)
current_data = processed_df # Store for later use
# Generate visualizations from processed data
visualizations = analyzer.generate_visualizations(processed_df)
current_visualizations = visualizations # Store for later use
# Generate AI insights using the selected model
ai_insights = analyzer.generate_ai_insights(processed_df)
# Create summary of processing results
# Safely handle detected columns (convert to lists and limit length)
text_cols = list(detected_cols.get('text_columns', []))[:3] if detected_cols.get('text_columns') else []
id_cols = list(detected_cols.get('id_columns', []))[:3] if detected_cols.get('id_columns') else []
product_cols = list(detected_cols.get('product_columns', []))[:3] if detected_cols.get('product_columns') else []
summary = f"""
### βœ… File Processing Complete!
**Detected Columns:**
- Text Columns: {', '.join(text_cols) if text_cols else 'None'}
- ID Columns: {', '.join(id_cols) if id_cols else 'Auto-generated'}
- Product Columns: {', '.join(product_cols) if product_cols else 'None'}
**Analysis Results:**
- Total Records: {len(processed_df)}
- Processed File Saved: {output_file}
- AI Model Used: {model_manager.current_model if model_manager else 'None'}
"""
# Create data preview (first 10 rows for display)
preview = processed_df.head(10)
# Get first visualization for immediate display
first_viz = list(visualizations.values())[0] if visualizations else None
# Return all results for the Gradio interface
return (
summary, # Processing status
preview, # Data preview
output_file, # Downloadable processed file
ai_insights, # AI-generated insights
first_viz, # First visualization
"Ready for search", # Search status
gr.update(choices=list(visualizations.keys())) # Update visualization dropdown
)
except Exception as e:
# Return error message if anything goes wrong
return f"Error: {str(e)}", None, None, None, None, None, gr.update(choices=[])
def search_data(query):
"""
Search through the data with enhanced semantic search
Uses the built search engine to find relevant text entries
"""
global analyzer, current_data
# Check if data has been processed
if analyzer is None or current_data is None:
return "Please process a file first", None, None
# Check if search query was provided
if not query:
return "Please enter a search query", None, None
try:
# Perform the search using the search engine
results = analyzer.search_engine.search(query, top_k=10)
# Check if any results were found
if results.empty:
return "No results found", None, None
# Select relevant columns for display (updated to include new topic columns)
display_cols = ['unique_id', 'combined_text', 'sentiment', 'topic_1', 'topic_2', 'topic_3', 'actionable_insights', 'search_score']
display_cols = [col for col in display_cols if col in results.columns] # Only include existing columns
results_display = results[display_cols] # Create display dataframe
# Save search results to file for download
search_output = f"search_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
results_display.to_excel(search_output, index=False)
# Return search results and status
return f"Found {len(results)} results", results_display.head(10), search_output
except Exception as e:
return f"Search error: {str(e)}", None, None
def update_visualization(viz_type):
"""
Update displayed visualization based on user selection
Called when user selects a different visualization from dropdown
"""
global current_visualizations
# Check if visualization exists and return it
if current_visualizations and viz_type in current_visualizations:
return current_visualizations[viz_type]
return None # Return None if visualization not found
def export_results(format_type):
"""
Export processed data in different formats (Excel or CSV)
Allows users to download their analyzed data
"""
global current_data
# Check if there's data to export
if current_data is None:
return "No data to export", None
try:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') # Create timestamp for unique filename
# Export based on selected format
if format_type == "Excel":
output_file = f"analysis_results_{timestamp}.xlsx"
current_data.to_excel(output_file, index=False) # Save as Excel
else: # CSV
output_file = f"analysis_results_{timestamp}.csv"
current_data.to_csv(output_file, index=False) # Save as CSV
return f"Data exported to {output_file}", output_file
except Exception as e:
return f"Export error: {str(e)}", None
# ===== GRADIO INTERFACE CREATION =====
def create_interface():
"""
Create the Gradio interface with model selection
This function builds the entire web interface using Gradio
"""
# Create the main Gradio application with soft theme
with gr.Blocks(theme=gr.themes.Soft()) as app:
# Main title and description
gr.Markdown(
"""
# πŸ“Š Enhanced Text Analytics AI Agent
### Smart Column Detection & Comprehensive Text Analysis with Multiple AI Models
**Features:**
- πŸ€– Multiple AI Model Support (OpenAI, Anthropic, Deepseek, Groq, Google)
- πŸ” Automatic detection of text, ID, and product columns
- πŸ’Ύ Memory-efficient processing with automatic file cleanup
- 😊 Sentiment analysis with scoring
- 🎯 Topic/theme extraction
- πŸ’‘ Actionable insights generation
- πŸ”Ž Advanced text search with similarity scoring
- πŸ“ˆ Multiple visualization options
- πŸ“₯ Export results in Excel or CSV format
"""
)
# Tab 1: Upload & Process
with gr.Tab("πŸ“€ Upload & Process"):
with gr.Row():
with gr.Column(scale=1): # Left column for controls
# Model selection dropdown
model_dropdown = gr.Dropdown(
label="πŸ€– Select AI Model",
choices=model_manager.get_available_models(), # Get available models
value=model_manager.current_model if model_manager.current_model else None,
interactive=True
)
# File upload component
file_upload = gr.File(
label="Upload Data File",
file_types=[".csv", ".xlsx", ".xls", ".json"] # Supported file types
)
# Process button
process_btn = gr.Button("πŸš€ Process File", variant="primary")
with gr.Column(scale=2): # Right column for results
status_output = gr.Markdown(label="Processing Status") # Processing status display
ai_insights = gr.Markdown(label="AI-Generated Insights") # AI insights display
# Data preview section
with gr.Row():
data_preview = gr.Dataframe(
label="Data Preview (First 10 rows)",
interactive=False # Read-only display
)
# Processed file download
processed_file = gr.File(
label="πŸ“ Processed Data File",
interactive=False # Read-only, for download only
)
# Tab 2: Search
with gr.Tab("πŸ” Search"):
gr.Markdown("### Search through your text data")
with gr.Row():
# Search input box
search_input = gr.Textbox(
label="Enter search query",
placeholder="Type keywords to search..."
)
# Search button
search_btn = gr.Button("πŸ”Ž Search", variant="primary")
# Search results display
search_status = gr.Markdown(label="Search Status") # Search status
search_results = gr.Dataframe( # Search results table
label="Search Results",
interactive=False
)
search_file = gr.File( # Download search results
label="πŸ“₯ Download Search Results",
interactive=False
)
# Tab 3: Visualizations
with gr.Tab("πŸ“ˆ Visualizations"):
with gr.Row():
# Visualization selector dropdown
viz_selector = gr.Dropdown(
label="Select Visualization",
choices=[], # Will be populated after processing
interactive=True
)
# Visualization display area
viz_plot = gr.Plot(label="Visualization")
# Tab 4: Export
with gr.Tab("πŸ“₯ Export"):
gr.Markdown("### Export your analyzed data")
with gr.Row():
# Export format selection
export_format = gr.Radio(
choices=["Excel", "CSV"],
value="Excel",
label="Export Format"
)
# Export button
export_btn = gr.Button("πŸ“₯ Export Data", variant="primary")
# Export results display
export_status = gr.Markdown(label="Export Status") # Export status
export_file = gr.File( # Download exported file
label="πŸ“ Download Exported File",
interactive=False
)
# ===== EVENT HANDLERS =====
# These connect user interactions to the backend functions
# Model selection change handler
model_dropdown.change(
fn=update_model, # Function to call
inputs=[model_dropdown], # Input components
outputs=[status_output] # Output components
)
# File processing button click handler
process_btn.click(
fn=process_file, # Function to call
inputs=[file_upload, model_dropdown], # Input components
outputs=[ # Output components
status_output,
data_preview,
processed_file,
ai_insights,
viz_plot,
search_status,
viz_selector
]
)
# Search button click handler
search_btn.click(
fn=search_data, # Function to call
inputs=[search_input], # Input components
outputs=[search_status, search_results, search_file] # Output components
)
# Visualization selector change handler
viz_selector.change(
fn=update_visualization, # Function to call
inputs=[viz_selector], # Input components
outputs=[viz_plot] # Output components
)
# Export button click handler
export_btn.click(
fn=export_results, # Function to call
inputs=[export_format], # Input components
outputs=[export_status, export_file] # Output components
)
return app # Return the complete Gradio application
# ===== APPLICATION LAUNCH =====
# Launch the application when script is run directly
if __name__ == "__main__":
app = create_interface() # Create the Gradio interface
app.launch(share=True, debug=True) # Launch with public sharing and debug mode