| |
| |
| |
|
|
| |
| |
| import os |
| import warnings |
| warnings.filterwarnings('ignore') |
|
|
| |
| from dotenv import load_dotenv |
| from anthropic import Anthropic |
|
|
| |
| try: |
| from openai import OpenAI |
| except ImportError: |
| OpenAI = None |
| |
| try: |
| from groq import Groq |
| except ImportError: |
| Groq = None |
| |
| try: |
| import google.generativeai as genai |
| except ImportError: |
| genai = None |
|
|
| |
| import pandas as pd |
| import numpy as np |
| from datetime import datetime, timedelta |
| import json |
| import gc |
|
|
| |
| import nltk |
| from nltk.corpus import stopwords |
| from nltk.tokenize import word_tokenize |
| from nltk.stem import WordNetLemmatizer |
| from textblob import TextBlob |
| import re |
| from collections import Counter |
|
|
| |
| from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer |
| from sklearn.decomposition import LatentDirichletAllocation |
| from sklearn.cluster import KMeans |
| from sklearn.preprocessing import StandardScaler |
| from sklearn.metrics.pairwise import cosine_similarity |
|
|
| |
| import plotly.express as px |
| import plotly.graph_objects as go |
| from plotly.subplots import make_subplots |
| import matplotlib.pyplot as plt |
| import seaborn as sns |
|
|
| |
| import gradio as gr |
|
|
| |
# Download required NLTK corpora once at startup (quiet to avoid log noise).
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('brown', quiet=True)

# Fetch the TextBlob corpora needed for sentiment / noun-phrase extraction.
# Strategy: try the in-process downloader first, then fall back to a
# subprocess invocation, and finally warn the user with manual instructions.
# The exception clauses are narrowed from bare `except:` so that
# KeyboardInterrupt / SystemExit still propagate during startup.
try:
    from textblob import download_corpora
    download_corpora.main()
except Exception:
    import subprocess
    import sys
    try:
        subprocess.run([sys.executable, "-m", "textblob.download_corpora"],
                       capture_output=True, text=True, timeout=30)
    except Exception:
        print("Warning: Could not download TextBlob corpora. Sentiment analysis may not work properly.")
        print("Please run: python -m textblob.download_corpora")

# Load API keys and other settings from a local .env file, overriding any
# variables already present in the process environment.
load_dotenv(override=True)
|
|
| |
class SmartColumnDetector:
    """
    Intelligently detect and extract relevant columns from uploaded data.

    Column roles (text / id / product / date) are guessed first from the
    column name, then -- for unmatched object columns -- from simple content
    heuristics (average string length and uniqueness ratio).
    """

    def __init__(self):
        """Initialize the detector with keyword lists for each column role."""
        # Columns likely to hold free-form customer text
        self.text_keywords = ['comment', 'feedback', 'review', 'description', 'text',
                              'response', 'opinion', 'message', 'notes', 'remarks']
        # Columns likely to hold record identifiers
        self.id_keywords = ['id', 'identifier', 'key', 'number', 'code', 'ref',
                            'reference', 'index', 'uuid']
        # Columns likely to describe the product being reviewed
        self.product_keywords = ['product', 'item', 'model', 'variant', 'type',
                                 'category', 'brand', 'name', 'sku']
        # Columns likely to hold timestamps
        self.date_keywords = ['date', 'time', 'created', 'updated', 'timestamp']

    def detect_column_types(self, df):
        """
        Categorize every column of *df* by its likely purpose.

        Returns a dict with keys 'text_columns', 'id_columns',
        'product_columns', 'date_columns' and 'other_columns', each mapping
        to a list of column names.  Name-based keyword matching wins; for
        object columns with no keyword hit, content heuristics are used.
        Unlike the previous version, every column is guaranteed to land in
        exactly one bucket (empty/all-NaN columns go to 'other_columns').
        """
        detected = {
            'text_columns': [],
            'id_columns': [],
            'product_columns': [],
            'date_columns': [],
            'other_columns': []
        }

        for col in df.columns:
            col_lower = col.lower()

            # Name-based matching, in priority order: text > id > product > date.
            if any(keyword in col_lower for keyword in self.text_keywords):
                detected['text_columns'].append(col)
            elif any(keyword in col_lower for keyword in self.id_keywords):
                detected['id_columns'].append(col)
            elif any(keyword in col_lower for keyword in self.product_keywords):
                detected['product_columns'].append(col)
            elif any(keyword in col_lower for keyword in self.date_keywords):
                detected['date_columns'].append(col)
            else:
                # No keyword hit: inspect a sample of the content.
                sample = df[col].dropna().head(100)
                if len(sample) > 0 and df[col].dtype == 'object':
                    avg_length = sample.astype(str).str.len().mean()
                    if avg_length > 50:
                        # Long strings look like free-form text.
                        detected['text_columns'].append(col)
                    elif avg_length < 20 and df[col].nunique() / len(df) > 0.5:
                        # Short, mostly-unique strings look like identifiers.
                        detected['id_columns'].append(col)
                    else:
                        # Otherwise treat as a categorical/product-like column.
                        detected['product_columns'].append(col)
                else:
                    # Numeric or empty columns are left uncategorized.
                    detected['other_columns'].append(col)

        return detected

    def extract_relevant_data(self, df):
        """
        Build a slim analysis DataFrame from *df*.

        Returns ``(extracted_df, detected)`` where *extracted_df* contains a
        'unique_id' column, up to two 'product_*' columns, a 'combined_text'
        column (all detected text columns joined per row), and optionally a
        parsed 'date' column; *detected* is the detect_column_types() mapping.
        """
        detected = self.detect_column_types(df)
        extracted_data = pd.DataFrame()

        # Use the first detected ID column, or synthesize a 1-based index.
        if detected['id_columns']:
            extracted_data['unique_id'] = df[detected['id_columns'][0]]
        else:
            extracted_data['unique_id'] = range(1, len(df) + 1)

        # Keep at most two product columns, prefixed for clarity.
        for col in detected['product_columns'][:2]:
            extracted_data[f'product_{col}'] = df[col]

        # Concatenate all text columns row-wise, skipping missing values.
        # itertuples avoids the O(rows x cols) cost of per-cell .loc access.
        text_cols = detected['text_columns']
        if text_cols:
            extracted_data['combined_text'] = [
                ' '.join(str(value) for value in row if pd.notna(value))
                for row in df[text_cols].itertuples(index=False)
            ]
        else:
            # No text columns at all: keep the column so downstream code
            # can rely on its presence.
            extracted_data['combined_text'] = [''] * len(df)

        # Parse the first date column; unparseable values become NaT.
        if detected['date_columns']:
            extracted_data['date'] = pd.to_datetime(df[detected['date_columns'][0]], errors='coerce')

        return extracted_data, detected
|
|
| |
class EnhancedTextProcessor:
    """
    Enhanced text preprocessing with actionable-insight extraction.

    Cleans raw feedback text and mines it for improvement suggestions
    (via a keyword dictionary) and short topic phrases (via TextBlob
    noun phrases plus frequent keywords).
    """

    def __init__(self):
        """Set up NLP helpers and the improvement-keyword dictionary."""
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

        # Maps each suggested improvement to the trigger phrases that imply it.
        self.actionable_dictionary = {
            'improve speed': ['slow', 'faster', 'quick', 'speed up', 'takes too long', 'waiting'],
            'better quality': ['poor quality', 'cheap', 'breaks', 'defective', 'flimsy', 'weak'],
            'enhance ui': ['confusing', 'hard to use', 'complicated', 'not intuitive', 'difficult to navigate'],
            'fix bugs': ['bug', 'error', 'crash', 'freeze', 'not working', 'glitch', 'broken'],
            'add features': ['missing', 'need', 'want', 'should have', 'would be nice', 'lacks'],
            'improve support': ['no response', 'unhelpful', 'rude', 'poor service', 'bad support'],
            'better packaging': ['damaged', 'poor packaging', 'arrived broken', 'not protected'],
            'clearer docs': ['unclear', 'no instructions', 'confusing manual', 'hard to understand'],
            'reduce price': ['expensive', 'overpriced', 'too costly', 'not worth', 'cheaper'],
            'faster delivery': ['late', 'delayed', 'slow shipping', 'took forever', 'still waiting'],
            'better communication': ['no updates', 'not informed', 'lack of communication', 'no tracking'],
            'improve reliability': ['unreliable', 'stops working', 'inconsistent', 'sometimes works'],
            'enhance performance': ['slow performance', 'laggy', 'sluggish', 'not responsive'],
            'better design': ['ugly', 'poor design', 'looks cheap', 'not attractive', 'outdated look'],
            'more options': ['limited options', 'no variety', 'need more choices', 'only one option']
        }

    def clean_text(self, text):
        """Lowercase *text*, strip punctuation, and collapse whitespace."""
        if pd.isna(text) or text == '':
            return ""
        normalized = str(text).lower()
        normalized = re.sub(r'[^a-zA-Z0-9\s]', '', normalized)
        return ' '.join(normalized.split())

    def extract_actionable_insights(self, text):
        """
        Match *text* against the improvement dictionary.

        Returns up to three suggestions as a comma-separated string,
        or "" when nothing matches.
        """
        if pd.isna(text) or text == '':
            return ""

        lowered = text.lower()
        matched = [
            action
            for action, cues in self.actionable_dictionary.items()
            if any(cue in lowered for cue in cues)
        ]
        return ', '.join(matched[:3]) if matched else ""

    def extract_specific_topics(self, text):
        """
        Extract up to three topic strings from *text*.

        Always returns a list of exactly three entries, padded with ''
        when fewer topics are found.
        """
        if pd.isna(text) or text == '' or len(text) < 10:
            return ['', '', '']

        lowered = text.lower()
        tokens = word_tokenize(lowered)
        candidates = [t for t in tokens if t not in self.stop_words and len(t) > 3]

        # Prefer short noun phrases detected by TextBlob.
        topics = [
            phrase
            for phrase in TextBlob(text).noun_phrases[:5]
            if len(phrase.split()) <= 3
        ]

        # Top up with frequent keywords.  The substring test against the
        # list's repr mirrors the original's de-duplication behavior.
        if len(topics) < 3:
            for word, _ in Counter(candidates).most_common(5):
                if word not in str(topics):
                    topics.append(word)
                if len(topics) >= 3:
                    break

        topics = topics[:3]
        topics += [''] * (3 - len(topics))
        return topics

    def determine_topic(self, text):
        """Backward-compatible wrapper: first specific topic or 'General'."""
        topics = self.extract_specific_topics(text)
        return topics[0] if topics[0] else 'General'
|
|
| |
class TextSearchEngine:
    """
    Advanced search functionality for text data with semantic capabilities.

    Uses TF-IDF vectorization and cosine similarity for intelligent text
    search; queries are additionally expanded through a hand-curated
    synonym map so that related wording still matches.
    """

    def __init__(self):
        """Initialize the search engine with TF-IDF vectorizer and synonym dictionary."""
        # Uni- to tri-gram TF-IDF over at most 1000 features; sublinear_tf
        # dampens the weight of very frequent terms.
        self.vectorizer = TfidfVectorizer(
            max_features=1000,
            ngram_range=(1, 3),
            stop_words='english',
            use_idf=True,
            smooth_idf=True,
            sublinear_tf=True
        )
        self.tfidf_matrix = None  # document-term matrix, built by build_index()
        self.data = None          # copy of the indexed DataFrame

        # Two-way synonym map: a query word matches both its key's synonym
        # list and, reversely, the key of any list it appears in.
        self.synonyms = {
            'fast': ['quick', 'rapid', 'speedy', 'swift', 'prompt'],
            'slow': ['sluggish', 'delayed', 'laggy', 'lengthy', 'prolonged'],
            'good': ['excellent', 'great', 'wonderful', 'fantastic', 'amazing', 'positive'],
            'bad': ['poor', 'terrible', 'awful', 'negative', 'horrible', 'disappointing'],
            'problem': ['issue', 'bug', 'error', 'defect', 'fault', 'glitch'],
            'help': ['support', 'assistance', 'aid', 'service'],
            'price': ['cost', 'fee', 'charge', 'rate', 'payment', 'expensive', 'cheap'],
            'quality': ['standard', 'grade', 'condition', 'caliber'],
            'delivery': ['shipping', 'dispatch', 'arrival', 'transport'],
            'easy': ['simple', 'straightforward', 'effortless', 'user-friendly'],
            'hard': ['difficult', 'complex', 'complicated', 'challenging'],
            'broken': ['damaged', 'defective', 'faulty', 'malfunctioning'],
            'love': ['like', 'enjoy', 'appreciate', 'adore'],
            'hate': ['dislike', 'despise', 'detest'],
            'feature': ['function', 'capability', 'option', 'characteristic'],
            'customer': ['client', 'buyer', 'purchaser', 'consumer', 'user']
        }

    def expand_query_with_synonyms(self, query):
        """
        Expand a search query with synonyms for better semantic matching.

        Returns a space-joined string containing the original words plus
        all forward and reverse synonym hits, de-duplicated while keeping
        first-seen order.
        """
        query_words = query.lower().split()
        expanded_terms = []

        for word in query_words:
            expanded_terms.append(word)

            # Forward lookup: the word is a dictionary key.
            if word in self.synonyms:
                expanded_terms.extend(self.synonyms[word])

            # Reverse lookup: the word appears in some key's synonym list.
            for key, syns in self.synonyms.items():
                if word in syns:
                    expanded_terms.append(key)
                    expanded_terms.extend([s for s in syns if s != word])

        # De-duplicate while preserving order of first occurrence.
        seen = set()
        unique_terms = []
        for term in expanded_terms:
            if term not in seen:
                unique_terms.append(term)
                seen.add(term)

        return ' '.join(unique_terms)

    def build_index(self, df, text_column):
        """
        Build the TF-IDF search index from *df*'s *text_column*.

        Topic and actionable-insight columns, when present, are appended to
        each document so searches can also hit derived metadata.
        """
        self.data = df.copy()
        texts = df[text_column].fillna('').tolist()

        # Enrich documents with the three extracted topics, if available.
        # NOTE(review): assumes topic_2/topic_3 exist whenever topic_1 does
        # (true for data produced by EnhancedTextAnalyzer.process_data).
        if 'topic_1' in df.columns:
            texts = [f"{text} {df.iloc[i]['topic_1']} {df.iloc[i]['topic_2']} {df.iloc[i]['topic_3']}"
                     for i, text in enumerate(texts)]
        if 'actionable_insights' in df.columns:
            texts = [f"{texts[i]} {df.iloc[i]['actionable_insights']}"
                     for i in range(len(texts))]

        # Fit and transform in one pass; the vectorizer's vocabulary is
        # rebuilt on every call, so stale queries cannot leak across files.
        self.tfidf_matrix = self.vectorizer.fit_transform(texts)

    def search(self, query, top_k=10):
        """
        Search the index with semantic expansion.

        Returns a DataFrame of up to *top_k* matches sorted by descending
        'search_score', or an empty DataFrame when nothing scores above the
        0.05 similarity floor (or no index has been built yet).
        """
        if self.tfidf_matrix is None:
            return pd.DataFrame()

        expanded_query = self.expand_query_with_synonyms(query)

        # Vectorize both the raw and the synonym-expanded query.
        query_vector = self.vectorizer.transform([query])
        expanded_vector = self.vectorizer.transform([expanded_query])

        similarities_orig = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        similarities_exp = cosine_similarity(expanded_vector, self.tfidf_matrix).flatten()

        # Blend: raw-query similarity dominates (70/30 split).
        combined_similarities = (0.7 * similarities_orig + 0.3 * similarities_exp)

        # Top-k by score, highest first.
        top_indices = combined_similarities.argsort()[-top_k:][::-1]
        top_scores = combined_similarities[top_indices]

        # Drop near-zero matches below the similarity floor.
        valid_indices = [idx for idx, score in zip(top_indices, top_scores) if score > 0.05]

        if valid_indices:
            results = self.data.iloc[valid_indices].copy()
            results['search_score'] = [combined_similarities[idx] for idx in valid_indices]

            # Boost rows containing the literal query as an exact substring.
            query_lower = query.lower()
            for idx in results.index:
                if 'combined_text' in results.columns:
                    if query_lower in str(results.at[idx, 'combined_text']).lower():
                        results.at[idx, 'search_score'] *= 1.5

            return results.sort_values('search_score', ascending=False)

        return pd.DataFrame()
|
|
| |
class AIModelManager:
    """
    Manages multiple AI model APIs and provides a unified interface.

    Supports OpenAI, Anthropic, Deepseek, Groq, and Google Gemini; a client
    is registered only when its API key is present in the environment (and
    the corresponding library imported successfully).
    """

    def __init__(self):
        """Initialize the model manager and set up all available AI APIs."""
        self.available_models = {}  # display name -> {'provider', 'model'}
        self.clients = {}           # provider name -> SDK client
        self.current_model = None   # display name of the selected model
        self.initialize_apis()

    def initialize_apis(self):
        """Register every provider whose API key is set in the environment."""
        # --- Anthropic ---
        ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
        if ANTHROPIC_API_KEY:
            try:
                self.clients['anthropic'] = Anthropic(api_key=ANTHROPIC_API_KEY)
                self.available_models['Claude 3 Haiku'] = {
                    'provider': 'anthropic',
                    'model': 'claude-3-haiku-20240307'
                }
                print(f"Anthropic API Key exists and begins {ANTHROPIC_API_KEY[:4]}")
            except Exception as e:
                print(f"Error initializing Anthropic: {e}")
        else:
            print("Anthropic API Key not set")

        # --- OpenAI ---
        OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
        if OPENAI_API_KEY and OpenAI:
            try:
                self.clients['openai'] = OpenAI(api_key=OPENAI_API_KEY)
                self.available_models['GPT-4o-mini'] = {
                    'provider': 'openai',
                    'model': 'gpt-4o-mini'
                }
                self.available_models['GPT-3.5 Turbo'] = {
                    'provider': 'openai',
                    'model': 'gpt-3.5-turbo'
                }
                print(f"OpenAI API Key exists and begins {OPENAI_API_KEY[:7]}")
            except Exception as e:
                print(f"Error initializing OpenAI: {e}")
        else:
            print("OpenAI API Key not set or library not installed")

        # --- Deepseek (OpenAI-compatible endpoint) ---
        DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
        if DEEPSEEK_API_KEY and OpenAI:
            try:
                self.clients['deepseek'] = OpenAI(
                    api_key=DEEPSEEK_API_KEY,
                    base_url="https://api.deepseek.com"
                )
                self.available_models['Deepseek Chat'] = {
                    'provider': 'deepseek',
                    'model': 'deepseek-chat'
                }
                print(f"Deepseek API Key exists and begins {DEEPSEEK_API_KEY[:7]}")
            except Exception as e:
                print(f"Error initializing Deepseek: {e}")
        else:
            print("Deepseek API Key not set or OpenAI library not installed")

        # --- Groq ---
        GROQ_API_KEY = os.getenv("GROQ_API_KEY")
        if GROQ_API_KEY and Groq:
            try:
                self.clients['groq'] = Groq(api_key=GROQ_API_KEY)
                self.available_models['Llama 3.3 70B'] = {
                    'provider': 'groq',
                    'model': 'llama-3.3-70b-versatile'
                }
                # NOTE(review): mixtral-8x7b-32768 has been retired by Groq;
                # confirm the model id is still served before relying on it.
                self.available_models['Mixtral 8x7B'] = {
                    'provider': 'groq',
                    'model': 'mixtral-8x7b-32768'
                }
                print(f"Groq API Key exists and begins {GROQ_API_KEY[:4]}")
            except Exception as e:
                print(f"Error initializing Groq: {e}")
        else:
            print("Groq API Key not set or library not installed")

        # --- Google Gemini ---
        GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
        if GOOGLE_API_KEY and genai:
            try:
                genai.configure(api_key=GOOGLE_API_KEY)
                self.clients['google'] = genai
                self.available_models['Gemini 1.5 Flash'] = {
                    'provider': 'google',
                    'model': 'gemini-1.5-flash'
                }
                self.available_models['Gemini 1.5 Pro'] = {
                    'provider': 'google',
                    'model': 'gemini-1.5-pro'
                }
                print(f"Google API Key exists and begins {GOOGLE_API_KEY[:2]}")
            except Exception as e:
                print(f"Error initializing Google Gemini: {e}")
        else:
            print("Google API Key not set or library not installed")

        # Default to the first registered model, if any.
        if self.available_models:
            self.current_model = list(self.available_models.keys())[0]

    def get_available_models(self):
        """Return the display names of all registered models."""
        return list(self.available_models.keys())

    def set_model(self, model_name):
        """Select *model_name* as the current model; return True on success."""
        if model_name in self.available_models:
            self.current_model = model_name
            return True
        return False

    def generate_text(self, prompt, max_tokens=1000):
        """
        Generate a completion for *prompt* with the currently selected model.

        Returns the generated text, or None when no model is selected, the
        provider is unsupported, or the API call fails (the error is printed).
        """
        if not self.current_model or self.current_model not in self.available_models:
            return None

        model_info = self.available_models[self.current_model]
        provider = model_info['provider']
        model = model_info['model']

        try:
            if provider == 'anthropic':
                response = self.clients['anthropic'].messages.create(
                    model=model,
                    max_tokens=max_tokens,
                    messages=[{"role": "user", "content": prompt}]
                )
                return response.content[0].text

            # OpenAI, Deepseek and Groq all expose the same chat-completions
            # interface, so a single branch serves all three (this replaces
            # three duplicated copies of the same call).
            if provider in ('openai', 'deepseek', 'groq'):
                response = self.clients[provider].chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens
                )
                return response.choices[0].message.content

            if provider == 'google':
                response = genai.GenerativeModel(model).generate_content(prompt)
                return response.text

            # Unknown provider: make the previously implicit None explicit.
            return None

        except Exception as e:
            print(f"Error generating text with {self.current_model}: {e}")
            return None
|
|
| |
| model_manager = AIModelManager() |
|
|
| |
class EnhancedTextAnalyzer:
    """
    Main analysis engine with all enhanced features and multi-model support.

    Orchestrates column detection, text processing, search indexing,
    AI-generated insights, and Plotly visualizations for one uploaded file.
    """

    def __init__(self, model_manager=None):
        """Initialize the analyzer with all component classes."""
        self.model_manager = model_manager          # AIModelManager or None
        self.column_detector = SmartColumnDetector()
        self.text_processor = EnhancedTextProcessor()
        self.search_engine = TextSearchEngine()
        self.original_df = None     # never assigned elsewhere in this class
        self.processed_df = None    # set by process_data()
        self.results = {}
        self.visualizations = {}

    def load_file(self, file):
        """
        Load data from a CSV, Excel, or JSON file.

        *file* is a Gradio file object (only its .name path is used).
        Returns (DataFrame, status message) or (None, error message).
        """
        try:
            # Dispatch on file extension.
            # NOTE(review): .xls requires the xlrd engine to be installed.
            if file.name.endswith('.csv'):
                df = pd.read_csv(file.name)
            elif file.name.endswith(('.xlsx', '.xls')):
                df = pd.read_excel(file.name)
            elif file.name.endswith('.json'):
                df = pd.read_json(file.name)
            else:
                return None, "Unsupported file format"

            return df, f"File loaded: {len(df)} records"
        except Exception as e:
            return None, f"Error loading file: {str(e)}"

    def process_data(self, df):
        """
        Run the full analysis pipeline on *df*.

        Extracts relevant columns, scores sentiment, mines topics and
        actionable insights row by row, builds the search index, and writes
        the result to 'processed_data.xlsx'.  Returns
        (processed DataFrame, detected-column mapping, output file path).
        """
        extracted_df, detected_columns = self.column_detector.extract_relevant_data(df)

        self.processed_df = extracted_df

        # Drop this function's reference to the raw frame and nudge the GC;
        # the caller's reference (if any) is unaffected.
        del df
        gc.collect()

        if 'combined_text' in extracted_df.columns:
            sentiments = []
            polarities = []
            topics_1 = []
            topics_2 = []
            topics_3 = []
            insights = []

            # Per-row NLP pass; TextBlob polarity drives the sentiment label.
            for text in extracted_df['combined_text']:
                blob = TextBlob(text)
                polarity = blob.sentiment.polarity

                # Three-way classification with a +/-0.1 neutral band.
                if polarity > 0.1:
                    sentiment = 'Positive'
                elif polarity < -0.1:
                    sentiment = 'Negative'
                else:
                    sentiment = 'Neutral'

                sentiments.append(sentiment)
                polarities.append(polarity)

                # Always three topics (padded with '' when fewer are found).
                specific_topics = self.text_processor.extract_specific_topics(text)
                topics_1.append(specific_topics[0])
                topics_2.append(specific_topics[1])
                topics_3.append(specific_topics[2])

                insight = self.text_processor.extract_actionable_insights(text)
                insights.append(insight)

            extracted_df['sentiment'] = sentiments
            extracted_df['sentiment_score'] = polarities
            extracted_df['topic_1'] = topics_1
            extracted_df['topic_2'] = topics_2
            extracted_df['topic_3'] = topics_3
            extracted_df['actionable_insights'] = insights

            # Index the enriched frame so search can use topics/insights too.
            self.search_engine.build_index(extracted_df, 'combined_text')

        # Persist results next to the app.
        # NOTE(review): to_excel needs openpyxl installed; the fixed filename
        # is overwritten on every run.
        output_file = 'processed_data.xlsx'
        extracted_df.to_excel(output_file, index=False)

        return extracted_df, detected_columns, output_file

    def generate_ai_insights(self, df, num_samples=5):
        """
        Generate AI-powered insights from up to *num_samples* feedback rows.

        Returns a Markdown string (insights or a human-readable error).
        """
        if not self.model_manager or not self.model_manager.current_model:
            return "No AI model available for generating insights"

        if 'combined_text' not in df.columns or df.empty:
            return "No text data available for AI analysis"

        sample_texts = df['combined_text'].dropna().head(num_samples).tolist()
        if not sample_texts:
            return "No valid text samples found"

        # Each sample is truncated to 200 chars to keep the prompt small;
        # chr(10) is a newline (avoids a backslash inside the f-string).
        prompt = f"""Analyze the following customer feedback samples and provide key insights:

Samples:
{chr(10).join([f"{i+1}. {text[:200]}..." if len(text) > 200 else f"{i+1}. {text}" for i, text in enumerate(sample_texts)])}

Please provide:
1. Main themes and patterns
2. Key sentiment indicators
3. Actionable recommendations
4. Areas of concern

Keep the response concise and focused on actionable insights."""

        try:
            response = self.model_manager.generate_text(prompt, max_tokens=500)
            if response:
                return f"**AI Insights (using {self.model_manager.current_model}):**\n\n{response}"
            else:
                return "Failed to generate AI insights. Please check your API configuration."
        except Exception as e:
            return f"Error generating AI insights: {str(e)}"

    def generate_visualizations(self, df):
        """
        Build interactive Plotly figures from the analyzed data.

        Returns a dict mapping a display title to a figure; sections are
        skipped when their source columns are absent or empty.
        """
        visualizations = {}

        # 1. Sentiment distribution pie chart.
        if 'sentiment' in df.columns:
            sentiment_counts = df['sentiment'].value_counts()
            fig_sentiment = px.pie(
                values=sentiment_counts.values,
                names=sentiment_counts.index,
                title="Sentiment Distribution",
                color_discrete_map={
                    'Positive': '#27AE60',
                    'Negative': '#E74C3C',
                    'Neutral': '#95A5A6'
                }
            )
            visualizations['Sentiment Distribution'] = fig_sentiment

        # 2. Horizontal bar of the 15 most frequent topics (all three slots).
        if 'topic_1' in df.columns:
            all_topics = []
            for col in ['topic_1', 'topic_2', 'topic_3']:
                if col in df.columns:
                    topics = df[col].dropna().tolist()
                    all_topics.extend([t for t in topics if t != ''])

            if all_topics:
                topic_counts = Counter(all_topics)
                top_topics = dict(topic_counts.most_common(15))

                fig_topics = px.bar(
                    x=list(top_topics.values()),
                    y=list(top_topics.keys()),
                    orientation='h',
                    title="Top 15 Specific Topics",
                    labels={'x': 'Count', 'y': 'Topic'}
                )
                visualizations['Topic Distribution'] = fig_topics

        # 3. Heatmap of sentiment counts per primary topic (top 10 topics).
        if 'sentiment' in df.columns and 'topic_1' in df.columns:
            df_temp = df[df['topic_1'] != ''].copy()
            if not df_temp.empty:
                top_topics = df_temp['topic_1'].value_counts().head(10).index
                df_filtered = df_temp[df_temp['topic_1'].isin(top_topics)]

                pivot_table = pd.crosstab(df_filtered['topic_1'], df_filtered['sentiment'])
                fig_heatmap = px.imshow(
                    pivot_table,
                    labels=dict(x="Sentiment", y="Primary Topic", color="Count"),
                    title="Sentiment by Primary Topic Heatmap",
                    color_continuous_scale="RdYlGn"
                )
                visualizations['Sentiment by Topic'] = fig_heatmap

        # 4. Monthly sentiment trend line.
        # NOTE(review): freq='M' is a deprecated alias in recent pandas
        # (use 'ME'); confirm against the pinned pandas version.
        if 'date' in df.columns and 'sentiment' in df.columns:
            df_time = df.copy()
            df_time['date'] = pd.to_datetime(df_time['date'])

            time_data = df_time.groupby([pd.Grouper(key='date', freq='M'), 'sentiment']).size().reset_index(name='count')

            fig_timeline = px.line(
                time_data,
                x='date',
                y='count',
                color='sentiment',
                title="Sentiment Trends Over Time",
                color_discrete_map={
                    'Positive': '#27AE60',
                    'Negative': '#E74C3C',
                    'Neutral': '#95A5A6'
                }
            )
            visualizations['Sentiment Timeline'] = fig_timeline

        # 5. Horizontal bar of the 10 most frequent actionable insights
        #    (each comma-separated entry counted individually).
        if 'actionable_insights' in df.columns:
            all_insights = []
            for insight in df['actionable_insights']:
                if insight and insight != "":
                    all_insights.extend([i.strip() for i in insight.split(',')])

            if all_insights:
                insight_counts = Counter(all_insights)
                top_insights = dict(insight_counts.most_common(10))

                fig_insights = px.bar(
                    x=list(top_insights.values()),
                    y=list(top_insights.keys()),
                    orientation='h',
                    title="Top 10 Actionable Insights",
                    labels={'x': 'Frequency', 'y': 'Insight'}
                )
                visualizations['Top Insights'] = fig_insights

        return visualizations
|
|
| |
| |
# Module-level state shared across the Gradio callbacks below.
analyzer = None                 # EnhancedTextAnalyzer for the last processed file
current_data = None             # processed DataFrame from the last run
current_visualizations = None   # dict of Plotly figures keyed by display title
|
|
def update_model(model_name):
    """
    Switch the shared model manager to *model_name*.

    Returns a status string for display in the UI.
    """
    global model_manager

    # NOTE: the original success/failure markers were mojibake-corrupted
    # (a stray 'β' plus a literal line break inside a single-quoted f-string,
    # which is a SyntaxError); restored to plain emoji markers.
    if model_manager.set_model(model_name):
        return f"✅ Model switched to: {model_name}"
    return f"❌ Failed to switch to: {model_name}"
|
|
def process_file(file, model_name):
    """
    Handle a file upload end-to-end: load, analyze, visualize, summarize.

    Returns a 7-tuple matching the Gradio outputs wired in create_interface():
    (status_markdown, preview_df, output_file, ai_insights_markdown,
     first_figure, search_status, viz_selector_update).
    """
    global analyzer, current_data, current_visualizations, model_manager

    if file is None:
        return "Please upload a file", None, None, None, None, None, gr.update(choices=[])

    try:
        # Honor the model selected in the dropdown before any AI calls.
        if model_name and model_manager:
            model_manager.set_model(model_name)

        analyzer = EnhancedTextAnalyzer(model_manager)

        df, message = analyzer.load_file(file)
        if df is None:
            return message, None, None, None, None, None, gr.update(choices=[])

        processed_df, detected_cols, output_file = analyzer.process_data(df)
        current_data = processed_df

        visualizations = analyzer.generate_visualizations(processed_df)
        current_visualizations = visualizations

        ai_insights = analyzer.generate_ai_insights(processed_df)

        # Show at most three detected columns per category in the summary.
        # (Slicing an empty list is [], so the previous redundant
        # conditionals are unnecessary.)
        text_cols = list(detected_cols.get('text_columns', []))[:3]
        id_cols = list(detected_cols.get('id_columns', []))[:3]
        product_cols = list(detected_cols.get('product_columns', []))[:3]

        # The heading previously contained a mojibake 'β' where a check-mark
        # emoji belonged; it is restored here.  The body is flush-left so
        # Markdown does not treat the indented lines as a code block.
        summary = f"""
### ✅ File Processing Complete!

**Detected Columns:**
- Text Columns: {', '.join(text_cols) if text_cols else 'None'}
- ID Columns: {', '.join(id_cols) if id_cols else 'Auto-generated'}
- Product Columns: {', '.join(product_cols) if product_cols else 'None'}

**Analysis Results:**
- Total Records: {len(processed_df)}
- Processed File Saved: {output_file}
- AI Model Used: {model_manager.current_model if model_manager else 'None'}
"""

        preview = processed_df.head(10)

        # Default the plot panel to the first generated figure.
        first_viz = list(visualizations.values())[0] if visualizations else None

        return (
            summary,
            preview,
            output_file,
            ai_insights,
            first_viz,
            "Ready for search",
            gr.update(choices=list(visualizations.keys()))
        )

    except Exception as e:
        return f"Error: {str(e)}", None, None, None, None, None, gr.update(choices=[])
|
|
def search_data(query):
    """
    Run a semantic search over the processed dataset.

    Returns (status_message, results_dataframe_or_None, results_file_or_None).
    """
    global analyzer, current_data

    # Guard clauses: need a processed file and a non-empty query.
    if analyzer is None or current_data is None:
        return "Please process a file first", None, None
    if not query:
        return "Please enter a search query", None, None

    try:
        hits = analyzer.search_engine.search(query, top_k=10)
        if hits.empty:
            return "No results found", None, None

        # Restrict the display to the analysis columns that actually exist.
        wanted = ['unique_id', 'combined_text', 'sentiment', 'topic_1', 'topic_2', 'topic_3', 'actionable_insights', 'search_score']
        shown = hits[[col for col in wanted if col in hits.columns]]

        # Persist the full result set with a timestamped filename.
        export_name = f"search_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
        shown.to_excel(export_name, index=False)

        return f"Found {len(hits)} results", shown.head(10), export_name

    except Exception as e:
        return f"Search error: {str(e)}", None, None
|
|
def update_visualization(viz_type):
    """
    Look up the Plotly figure for *viz_type*.

    Returns the figure, or None when nothing has been generated yet or the
    requested chart is unknown.
    """
    global current_visualizations

    if not current_visualizations:
        return None
    return current_visualizations.get(viz_type)
|
|
def export_results(format_type):
    """
    Save the current analysis results to disk as Excel or CSV.

    Returns (status_message, file_path_or_None).
    """
    global current_data

    if current_data is None:
        return "No data to export", None

    try:
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        # Pick the target filename and the matching pandas writer;
        # anything other than "Excel" falls back to CSV.
        if format_type == "Excel":
            target = f"analysis_results_{stamp}.xlsx"
            writer = current_data.to_excel
        else:
            target = f"analysis_results_{stamp}.csv"
            writer = current_data.to_csv

        writer(target, index=False)
        return f"Data exported to {target}", target

    except Exception as e:
        return f"Export error: {str(e)}", None
|
|
| |
def create_interface():
    """
    Create the Gradio interface with model selection.

    Builds four tabs (upload/process, search, visualizations, export) and
    wires each control to the module-level callback functions.  Returns the
    gr.Blocks app, ready for .launch().

    NOTE(review): the emoji in the Markdown/labels below are
    mojibake-corrupted (e.g. 'π' sequences); they are kept byte-identical
    here because they are runtime strings.
    """
    with gr.Blocks(theme=gr.themes.Soft()) as app:
        # --- Header / feature list ---
        gr.Markdown(
            """
            # π Enhanced Text Analytics AI Agent
            ### Smart Column Detection & Comprehensive Text Analysis with Multiple AI Models

            **Features:**
            - π€ Multiple AI Model Support (OpenAI, Anthropic, Deepseek, Groq, Google)
            - π Automatic detection of text, ID, and product columns
            - πΎ Memory-efficient processing with automatic file cleanup
            - π Sentiment analysis with scoring
            - π― Topic/theme extraction
            - π‘ Actionable insights generation
            - π Advanced text search with similarity scoring
            - π Multiple visualization options
            - π₯ Export results in Excel or CSV format
            """
        )

        # --- Tab 1: upload a file and run the analysis pipeline ---
        with gr.Tab("π€ Upload & Process"):
            with gr.Row():
                with gr.Column(scale=1):
                    # Model choices come from the global model_manager,
                    # populated at import time from environment API keys.
                    model_dropdown = gr.Dropdown(
                        label="π€ Select AI Model",
                        choices=model_manager.get_available_models(),
                        value=model_manager.current_model if model_manager.current_model else None,
                        interactive=True
                    )

                    file_upload = gr.File(
                        label="Upload Data File",
                        file_types=[".csv", ".xlsx", ".xls", ".json"]
                    )

                    process_btn = gr.Button("π Process File", variant="primary")

                with gr.Column(scale=2):
                    status_output = gr.Markdown(label="Processing Status")
                    ai_insights = gr.Markdown(label="AI-Generated Insights")

            with gr.Row():
                data_preview = gr.Dataframe(
                    label="Data Preview (First 10 rows)",
                    interactive=False
                )

            processed_file = gr.File(
                label="π Processed Data File",
                interactive=False
            )

        # --- Tab 2: semantic search over the processed data ---
        with gr.Tab("π Search"):
            gr.Markdown("### Search through your text data")

            with gr.Row():
                search_input = gr.Textbox(
                    label="Enter search query",
                    placeholder="Type keywords to search..."
                )

                search_btn = gr.Button("π Search", variant="primary")

            search_status = gr.Markdown(label="Search Status")
            search_results = gr.Dataframe(
                label="Search Results",
                interactive=False
            )
            search_file = gr.File(
                label="π₯ Download Search Results",
                interactive=False
            )

        # --- Tab 3: chart viewer; choices are filled after processing ---
        with gr.Tab("π Visualizations"):
            with gr.Row():
                viz_selector = gr.Dropdown(
                    label="Select Visualization",
                    choices=[],
                    interactive=True
                )

            viz_plot = gr.Plot(label="Visualization")

        # --- Tab 4: export the processed DataFrame ---
        with gr.Tab("π₯ Export"):
            gr.Markdown("### Export your analyzed data")

            with gr.Row():
                export_format = gr.Radio(
                    choices=["Excel", "CSV"],
                    value="Excel",
                    label="Export Format"
                )

                export_btn = gr.Button("π₯ Export Data", variant="primary")

            export_status = gr.Markdown(label="Export Status")
            export_file = gr.File(
                label="π Download Exported File",
                interactive=False
            )

        # --- Event wiring ---

        # Switching models updates the status panel.
        model_dropdown.change(
            fn=update_model,
            inputs=[model_dropdown],
            outputs=[status_output]
        )

        # Processing fills every output panel plus the viz dropdown choices.
        process_btn.click(
            fn=process_file,
            inputs=[file_upload, model_dropdown],
            outputs=[
                status_output,
                data_preview,
                processed_file,
                ai_insights,
                viz_plot,
                search_status,
                viz_selector
            ]
        )

        search_btn.click(
            fn=search_data,
            inputs=[search_input],
            outputs=[search_status, search_results, search_file]
        )

        viz_selector.change(
            fn=update_visualization,
            inputs=[viz_selector],
            outputs=[viz_plot]
        )

        export_btn.click(
            fn=export_results,
            inputs=[export_format],
            outputs=[export_status, export_file]
        )

    return app
|
|
| |
| |
if __name__ == "__main__":
    # Build the Gradio UI and launch it with a public share link and
    # debug logging enabled.
    app = create_interface()
    app.launch(share=True, debug=True)