# Text Data Analysis AI Assistant with Gradio
 - Intelligent Customer Feedback Analysis System with Multiple AI APIs

In [1]:
# ===== IMPORTS SECTION =====
# Core libraries
import os
import warnings
warnings.filterwarnings('ignore')

# Environment and API
from dotenv import load_dotenv
from anthropic import Anthropic

# Additional AI APIs
try:
    from openai import OpenAI
except ImportError:
    OpenAI = None
    
try:
    from groq import Groq
except ImportError:
    Groq = None
    
try:
    import google.generativeai as genai
except ImportError:
    genai = None

# Data processing
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import json
import gc  # For garbage collection

# Natural Language Processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import re
from collections import Counter

# Machine Learning
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns

# Web interface
import gradio as gr

# Download required NLTK data
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # New tokenizer format
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('omw-1.4', quiet=True)  # For WordNet lemmatizer
nltk.download('brown', quiet=True)  # Required for TextBlob

# Download TextBlob corpora
try:
    from textblob import download_corpora
    download_corpora.main()
except:
    # Alternative method if the above doesn't work
    import subprocess
    import sys
    try:
        subprocess.run([sys.executable, "-m", "textblob.download_corpora"], 
                      capture_output=True, text=True, timeout=30)
    except:
        print("Warning: Could not download TextBlob corpora. Sentiment analysis may not work properly.")
        print("Please run: python -m textblob.download_corpora")

[nltk_data] Downloading package brown to /Users/fola-ai/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/fola-
[nltk_data]     ai/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/fola-ai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/fola-ai/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package conll2000 to /Users/fola-
[nltk_data]     ai/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /Users/fola-
[nltk_data]     ai/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


Finished.


In [2]:
load_dotenv(override=True)

True

In [3]:
# ===== SMART COLUMN DETECTOR =====
class SmartColumnDetector:
    """Intelligently detect and extract relevant columns from uploaded data"""
    
    def __init__(self):
        # Keywords for detecting different column types
        self.text_keywords = ['comment', 'feedback', 'review', 'description', 'text', 
                             'response', 'opinion', 'message', 'notes', 'remarks']
        self.id_keywords = ['id', 'identifier', 'key', 'number', 'code', 'ref', 
                           'reference', 'index', 'uuid']
        self.product_keywords = ['product', 'item', 'model', 'variant', 'type', 
                                'category', 'brand', 'name', 'sku']
        self.date_keywords = ['date', 'time', 'created', 'updated', 'timestamp']
        
    def detect_column_types(self, df):
        """Detect column types based on column names and content"""
        detected = {
            'text_columns': [],
            'id_columns': [],
            'product_columns': [],
            'date_columns': [],
            'other_columns': []
        }
        
        for col in df.columns:
            col_lower = col.lower()
            
            # Check for text columns
            if any(keyword in col_lower for keyword in self.text_keywords):
                detected['text_columns'].append(col)
            # Check for ID columns
            elif any(keyword in col_lower for keyword in self.id_keywords):
                detected['id_columns'].append(col)
            # Check for product columns
            elif any(keyword in col_lower for keyword in self.product_keywords):
                detected['product_columns'].append(col)
            # Check for date columns
            elif any(keyword in col_lower for keyword in self.date_keywords):
                detected['date_columns'].append(col)
            else:
                # Analyze content to determine type
                sample = df[col].dropna().head(100)
                if len(sample) > 0:
                    # Check if mostly text
                    if df[col].dtype == 'object':
                        avg_length = sample.astype(str).str.len().mean()
                        if avg_length > 50:  # Likely text content
                            detected['text_columns'].append(col)
                        elif avg_length < 20 and df[col].nunique() / len(df) > 0.5:
                            detected['id_columns'].append(col)
                        else:
                            detected['product_columns'].append(col)
                    else:
                        detected['other_columns'].append(col)
        
        return detected
    
    def extract_relevant_data(self, df):
        """Extract only relevant columns and create optimized dataset"""
        detected = self.detect_column_types(df)
        
        # Create new dataframe with relevant columns
        extracted_data = pd.DataFrame()
        
        # Add unique identifier
        if detected['id_columns'] and len(detected['id_columns']) > 0:
            extracted_data['unique_id'] = df[detected['id_columns'][0]]
        else:
            extracted_data['unique_id'] = range(1, len(df) + 1)
        
        # Add product information
        if detected['product_columns'] and len(detected['product_columns']) > 0:
            # Convert to list if needed and limit to 2 product columns
            product_cols = list(detected['product_columns'])[:2]
            for col in product_cols:
                extracted_data[f'product_{col}'] = df[col]
        
        # Combine text columns
        if detected['text_columns'] and len(detected['text_columns']) > 0:
            text_cols = list(detected['text_columns'])  # Ensure it's a list
            text_data = []
            for idx in df.index:
                combined_text = ' '.join([
                    str(df.loc[idx, col]) 
                    for col in text_cols 
                    if col in df.columns and pd.notna(df.loc[idx, col])
                ])
                text_data.append(combined_text)
            extracted_data['combined_text'] = text_data
        else:
            # If no text columns detected, create empty combined_text
            extracted_data['combined_text'] = [''] * len(df)
        
        # Add date columns
        if detected['date_columns'] and len(detected['date_columns']) > 0:
            extracted_data['date'] = pd.to_datetime(df[detected['date_columns'][0]], errors='coerce')
        
        return extracted_data, detected

In [4]:
# ===== ENHANCED TEXT PROCESSOR =====
class EnhancedTextProcessor:
    """Enhanced text preprocessing with actionable insights extraction"""

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        
        # Initialize actionable insights dictionary with common customer feedback phrases
        self.actionable_dictionary = {
            'improve speed': ['slow', 'faster', 'quick', 'speed up', 'takes too long', 'waiting'],
            'better quality': ['poor quality', 'cheap', 'breaks', 'defective', 'flimsy', 'weak'],
            'enhance ui': ['confusing', 'hard to use', 'complicated', 'not intuitive', 'difficult to navigate'],
            'fix bugs': ['bug', 'error', 'crash', 'freeze', 'not working', 'glitch', 'broken'],
            'add features': ['missing', 'need', 'want', 'should have', 'would be nice', 'lacks'],
            'improve support': ['no response', 'unhelpful', 'rude', 'poor service', 'bad support'],
            'better packaging': ['damaged', 'poor packaging', 'arrived broken', 'not protected'],
            'clearer docs': ['unclear', 'no instructions', 'confusing manual', 'hard to understand'],
            'reduce price': ['expensive', 'overpriced', 'too costly', 'not worth', 'cheaper'],
            'faster delivery': ['late', 'delayed', 'slow shipping', 'took forever', 'still waiting'],
            'better communication': ['no updates', 'not informed', 'lack of communication', 'no tracking'],
            'improve reliability': ['unreliable', 'stops working', 'inconsistent', 'sometimes works'],
            'enhance performance': ['slow performance', 'laggy', 'sluggish', 'not responsive'],
            'better design': ['ugly', 'poor design', 'looks cheap', 'not attractive', 'outdated look'],
            'more options': ['limited options', 'no variety', 'need more choices', 'only one option']
        }

    def clean_text(self, text):
        """Clean and normalize text"""
        if pd.isna(text) or text == '':
            return ""

        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        text = ' '.join(text.split())
        return text

    def extract_actionable_insights(self, text):
        """Extract actionable insights using dictionary matching"""
        if pd.isna(text) or text == '':
            return ""
        
        text_lower = text.lower()
        found_insights = []
        
        # Check each actionable item against the text
        for action, keywords in self.actionable_dictionary.items():
            for keyword in keywords:
                if keyword in text_lower:
                    found_insights.append(action)
                    break  # Only add each action once
        
        # Return top 3 most relevant insights
        if found_insights:
            return ', '.join(found_insights[:3])
        return ""

    def extract_specific_topics(self, text):
        """Extract specific topics from text using keyword extraction"""
        if pd.isna(text) or text == '' or len(text) < 10:
            return ['', '', '']
        
        # Clean text first
        text_lower = text.lower()
        
        # Remove stopwords for better topic extraction
        words = word_tokenize(text_lower)
        filtered_words = [w for w in words if w not in self.stop_words and len(w) > 3]
        
        # Extract noun phrases and important terms
        blob = TextBlob(text)
        noun_phrases = blob.noun_phrases
        
        # Combine noun phrases with high-frequency meaningful words
        topics = []
        
        # Add noun phrases (these are usually good topics)
        for phrase in noun_phrases[:5]:  # Limit to top 5 noun phrases
            if len(phrase.split()) <= 3:  # Only short phrases
                topics.append(phrase)
        
        # Add frequent meaningful words if we don't have enough topics
        if len(topics) < 3:
            word_freq = Counter(filtered_words)
            for word, _ in word_freq.most_common(5):
                if word not in str(topics):  # Avoid duplicates
                    topics.append(word)
                if len(topics) >= 3:
                    break
        
        # Ensure we always return 3 items (empty string if not enough topics)
        topics = topics[:3]
        while len(topics) < 3:
            topics.append('')
        
        return topics

    def determine_topic(self, text):
        """Legacy method kept for compatibility - returns first specific topic"""
        topics = self.extract_specific_topics(text)
        return topics[0] if topics[0] else 'General'

In [5]:
# ===== SEARCH ENGINE =====
class TextSearchEngine:
    """Advanced search functionality for text data with semantic capabilities"""
    
    def __init__(self):
        self.vectorizer = TfidfVectorizer(
            max_features=1000,
            ngram_range=(1, 3),  # Include unigrams, bigrams, and trigrams for better matching
            stop_words='english',
            use_idf=True,
            smooth_idf=True,
            sublinear_tf=True  # Apply sublinear tf scaling
        )
        self.tfidf_matrix = None
        self.data = None
        
        # Synonym dictionary for semantic search
        self.synonyms = {
            'fast': ['quick', 'rapid', 'speedy', 'swift', 'prompt'],
            'slow': ['sluggish', 'delayed', 'laggy', 'lengthy', 'prolonged'],
            'good': ['excellent', 'great', 'wonderful', 'fantastic', 'amazing', 'positive'],
            'bad': ['poor', 'terrible', 'awful', 'negative', 'horrible', 'disappointing'],
            'problem': ['issue', 'bug', 'error', 'defect', 'fault', 'glitch'],
            'help': ['support', 'assistance', 'aid', 'service'],
            'price': ['cost', 'fee', 'charge', 'rate', 'payment', 'expensive', 'cheap'],
            'quality': ['standard', 'grade', 'condition', 'caliber'],
            'delivery': ['shipping', 'dispatch', 'arrival', 'transport'],
            'easy': ['simple', 'straightforward', 'effortless', 'user-friendly'],
            'hard': ['difficult', 'complex', 'complicated', 'challenging'],
            'broken': ['damaged', 'defective', 'faulty', 'malfunctioning'],
            'love': ['like', 'enjoy', 'appreciate', 'adore'],
            'hate': ['dislike', 'despise', 'detest'],
            'feature': ['function', 'capability', 'option', 'characteristic'],
            'customer': ['client', 'buyer', 'purchaser', 'consumer', 'user']
        }
        
    def expand_query_with_synonyms(self, query):
        """Expand search query with synonyms for better semantic matching"""
        query_words = query.lower().split()
        expanded_terms = []
        
        for word in query_words:
            # Add the original word
            expanded_terms.append(word)
            
            # Add synonyms if available
            if word in self.synonyms:
                expanded_terms.extend(self.synonyms[word])
            
            # Check if word is a synonym of something else
            for key, syns in self.synonyms.items():
                if word in syns:
                    expanded_terms.append(key)
                    expanded_terms.extend([s for s in syns if s != word])
        
        # Remove duplicates while preserving order
        seen = set()
        unique_terms = []
        for term in expanded_terms:
            if term not in seen:
                unique_terms.append(term)
                seen.add(term)
        
        return ' '.join(unique_terms)
        
    def build_index(self, df, text_column):
        """Build search index from text data"""
        self.data = df.copy()
        texts = df[text_column].fillna('').tolist()
        
        # Add other searchable columns to improve search
        if 'topic_1' in df.columns:
            texts = [f"{text} {df.iloc[i]['topic_1']} {df.iloc[i]['topic_2']} {df.iloc[i]['topic_3']}" 
                    for i, text in enumerate(texts)]
        if 'actionable_insights' in df.columns:
            texts = [f"{texts[i]} {df.iloc[i]['actionable_insights']}" 
                    for i in range(len(texts))]
            
        self.tfidf_matrix = self.vectorizer.fit_transform(texts)
        
    def search(self, query, top_k=10):
        """Enhanced search with semantic understanding"""
        if self.tfidf_matrix is None:
            return pd.DataFrame()
        
        # Expand query with synonyms
        expanded_query = self.expand_query_with_synonyms(query)
        
        # Vectorize both original and expanded queries
        query_vector = self.vectorizer.transform([query])
        expanded_vector = self.vectorizer.transform([expanded_query])
        
        # Calculate similarities for both
        similarities_orig = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        similarities_exp = cosine_similarity(expanded_vector, self.tfidf_matrix).flatten()
        
        # Combine scores (weighted average - original query gets more weight)
        combined_similarities = (0.7 * similarities_orig + 0.3 * similarities_exp)
        
        # Get top results
        top_indices = combined_similarities.argsort()[-top_k:][::-1]
        top_scores = combined_similarities[top_indices]
        
        # Filter results with score > 0.05 (lower threshold for better recall)
        valid_indices = [idx for idx, score in zip(top_indices, top_scores) if score > 0.05]
        
        if valid_indices:
            results = self.data.iloc[valid_indices].copy()
            results['search_score'] = [combined_similarities[idx] for idx in valid_indices]
            
            # Boost results that have exact matches
            query_lower = query.lower()
            for idx in results.index:
                if 'combined_text' in results.columns:
                    if query_lower in str(results.at[idx, 'combined_text']).lower():
                        results.at[idx, 'search_score'] *= 1.5  # Boost exact matches
                        
            return results.sort_values('search_score', ascending=False)
        
        return pd.DataFrame()


In [6]:
# ===== API CONFIGURATION =====
class AIModelManager:
    """Manages multiple AI model APIs and provides unified interface"""
    
    def __init__(self):
        self.available_models = {}
        self.clients = {}
        self.current_model = None
        self.initialize_apis()
        
    def initialize_apis(self):
        """Initialize all available AI APIs"""
        
        # Anthropic
        ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
        if ANTHROPIC_API_KEY:
            try:
                self.clients['anthropic'] = Anthropic(api_key=ANTHROPIC_API_KEY)
                self.available_models['Claude 3 Haiku'] = {
                    'provider': 'anthropic',
                    'model': 'claude-3-haiku-20240307'
                }
                print(f"Anthropic API Key exists and begins {ANTHROPIC_API_KEY[:4]}")
            except Exception as e:
                print(f"Error initializing Anthropic: {e}")
        else:
            print("Anthropic API Key not set")
            
        # OpenAI
        OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
        if OPENAI_API_KEY and OpenAI:
            try:
                self.clients['openai'] = OpenAI(api_key=OPENAI_API_KEY)
                self.available_models['GPT-4o-mini'] = {
                    'provider': 'openai',
                    'model': 'gpt-4o-mini'
                }
                self.available_models['GPT-3.5 Turbo'] = {
                    'provider': 'openai',
                    'model': 'gpt-3.5-turbo'
                }
                print(f"OpenAI API Key exists and begins {OPENAI_API_KEY[:7]}")
            except Exception as e:
                print(f"Error initializing OpenAI: {e}")
        else:
            print("OpenAI API Key not set or library not installed")
            
        # Deepseek (uses OpenAI-compatible API)
        DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
        if DEEPSEEK_API_KEY and OpenAI:
            try:
                self.clients['deepseek'] = OpenAI(
                    api_key=DEEPSEEK_API_KEY,
                    base_url="https://api.deepseek.com"
                )
                self.available_models['Deepseek Chat'] = {
                    'provider': 'deepseek',
                    'model': 'deepseek-chat'
                }
                print(f"Deepseek API Key exists and begins {DEEPSEEK_API_KEY[:7]}")
            except Exception as e:
                print(f"Error initializing Deepseek: {e}")
        else:
            print("Deepseek API Key not set or OpenAI library not installed")
            
        # Groq
        GROQ_API_KEY = os.getenv("GROQ_API_KEY")
        if GROQ_API_KEY and Groq:
            try:
                self.clients['groq'] = Groq(api_key=GROQ_API_KEY)
                self.available_models['Llama 3.3 70B'] = {
                    'provider': 'groq',
                    'model': 'llama-3.3-70b-versatile'
                }
                self.available_models['Mixtral 8x7B'] = {
                    'provider': 'groq',
                    'model': 'mixtral-8x7b-32768'
                }
                print(f"Groq API Key exists and begins {GROQ_API_KEY[:4]}")
            except Exception as e:
                print(f"Error initializing Groq: {e}")
        else:
            print("Groq API Key not set or library not installed")
            
        # Google Gemini
        GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
        if GOOGLE_API_KEY and genai:
            try:
                genai.configure(api_key=GOOGLE_API_KEY)
                self.clients['google'] = genai
                self.available_models['Gemini 1.5 Flash'] = {
                    'provider': 'google',
                    'model': 'gemini-1.5-flash'
                }
                self.available_models['Gemini 1.5 Pro'] = {
                    'provider': 'google',
                    'model': 'gemini-1.5-pro'
                }
                print(f"Google API Key exists and begins {GOOGLE_API_KEY[:2]}")
            except Exception as e:
                print(f"Error initializing Google Gemini: {e}")
        else:
            print("Google API Key not set or library not installed")
            
        # Set default model
        if self.available_models:
            self.current_model = list(self.available_models.keys())[0]
            
    def get_available_models(self):
        """Return list of available model names"""
        return list(self.available_models.keys())
    
    def set_model(self, model_name):
        """Set the current model"""
        if model_name in self.available_models:
            self.current_model = model_name
            return True
        return False
    
    def generate_text(self, prompt, max_tokens=1000):
        """Generate text using the current model"""
        if not self.current_model or self.current_model not in self.available_models:
            return None
            
        model_info = self.available_models[self.current_model]
        provider = model_info['provider']
        model = model_info['model']
        
        try:
            if provider == 'anthropic':
                client = self.clients['anthropic']
                response = client.messages.create(
                    model=model,
                    max_tokens=max_tokens,
                    messages=[{"role": "user", "content": prompt}]
                )
                return response.content[0].text
                
            elif provider in ['openai', 'deepseek']:
                client = self.clients[provider]
                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens
                )
                return response.choices[0].message.content
                
            elif provider == 'groq':
                client = self.clients['groq']
                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens
                )
                return response.choices[0].message.content
                
            elif provider == 'google':
                model_obj = genai.GenerativeModel(model)
                response = model_obj.generate_content(prompt)
                return response.text
                
        except Exception as e:
            print(f"Error generating text with {self.current_model}: {e}")
            return None

In [7]:
# Initialize the model manager globally
model_manager = AIModelManager()

Anthropic API Key exists and begins sk-a
OpenAI API Key exists and begins sk-proj
Deepseek API Key exists and begins sk-1099
Groq API Key exists and begins gsk_
Google API Key exists and begins AI


In [8]:
# ===== ENHANCED ANALYZER WITH MULTI-MODEL SUPPORT =====

class EnhancedTextAnalyzer:
    """Main analysis engine with all enhanced features and multi-model support"""
    
    def __init__(self, model_manager=None):
        self.model_manager = model_manager
        self.column_detector = SmartColumnDetector()
        self.text_processor = EnhancedTextProcessor()
        self.search_engine = TextSearchEngine()
        self.original_df = None
        self.processed_df = None
        self.results = {}
        self.visualizations = {}
        
    def load_file(self, file):
        """Load data from various file formats"""
        try:
            if file.name.endswith('.csv'):
                df = pd.read_csv(file.name)
            elif file.name.endswith(('.xlsx', '.xls')):
                df = pd.read_excel(file.name)
            elif file.name.endswith('.json'):
                df = pd.read_json(file.name)
            else:
                return None, "Unsupported file format"
            
            return df, f"File loaded: {len(df)} records"
        except Exception as e:
            return None, f"Error loading file: {str(e)}"
    
    def process_data(self, df):
        """Process data with smart extraction and analysis"""
        # Extract relevant columns
        extracted_df, detected_columns = self.column_detector.extract_relevant_data(df)
        
        # Store for reference
        self.processed_df = extracted_df
        
        # Clear original from memory
        del df
        gc.collect()
        
        # Add analysis columns
        if 'combined_text' in extracted_df.columns:
            # Sentiment analysis
            sentiments = []
            polarities = []
            topics_1 = []
            topics_2 = []
            topics_3 = []
            insights = []
            
            for text in extracted_df['combined_text']:
                # Sentiment
                blob = TextBlob(text)
                polarity = blob.sentiment.polarity
                if polarity > 0.1:
                    sentiment = 'Positive'
                elif polarity < -0.1:
                    sentiment = 'Negative'
                else:
                    sentiment = 'Neutral'
                
                sentiments.append(sentiment)
                polarities.append(polarity)
                
                # Extract specific topics (3 separate topics)
                specific_topics = self.text_processor.extract_specific_topics(text)
                topics_1.append(specific_topics[0])
                topics_2.append(specific_topics[1])
                topics_3.append(specific_topics[2])
                
                # Actionable insights using dictionary matching
                insight = self.text_processor.extract_actionable_insights(text)
                insights.append(insight)
            
            extracted_df['sentiment'] = sentiments
            extracted_df['sentiment_score'] = polarities
            extracted_df['topic_1'] = topics_1
            extracted_df['topic_2'] = topics_2
            extracted_df['topic_3'] = topics_3
            extracted_df['actionable_insights'] = insights
            
            # Build search index with enhanced search capabilities
            self.search_engine.build_index(extracted_df, 'combined_text')
        
        # Save processed data
        output_file = 'processed_data.xlsx'
        extracted_df.to_excel(output_file, index=False)
        
        return extracted_df, detected_columns, output_file
    
    def generate_ai_insights(self, df, num_samples=5):
        """Generate AI-powered insights using selected model"""
        if not self.model_manager or not self.model_manager.current_model:
            return "No AI model available for generating insights"
        
        if 'combined_text' not in df.columns or df.empty:
            return "No text data available for AI analysis"
        
        # Sample some texts for analysis
        sample_texts = df['combined_text'].dropna().head(num_samples).tolist()
        if not sample_texts:
            return "No valid text samples found"
        
        # Create prompt for AI analysis
        prompt = f"""Analyze the following customer feedback samples and provide key insights:

Samples:
{chr(10).join([f"{i+1}. {text[:200]}..." if len(text) > 200 else f"{i+1}. {text}" for i, text in enumerate(sample_texts)])}

Please provide:
1. Main themes and patterns
2. Key sentiment indicators
3. Actionable recommendations
4. Areas of concern

Keep the response concise and focused on actionable insights."""

        # Generate insights using selected model
        try:
            response = self.model_manager.generate_text(prompt, max_tokens=500)
            if response:
                return f"**AI Insights (using {self.model_manager.current_model}):**\n\n{response}"
            else:
                return "Failed to generate AI insights. Please check your API configuration."
        except Exception as e:
            return f"Error generating AI insights: {str(e)}"
    
    def generate_visualizations(self, df):
        """Generate various visualizations"""
        visualizations = {}
        
        if 'sentiment' in df.columns:
            # Sentiment distribution
            sentiment_counts = df['sentiment'].value_counts()
            fig_sentiment = px.pie(
                values=sentiment_counts.values,
                names=sentiment_counts.index,
                title="Sentiment Distribution",
                color_discrete_map={
                    'Positive': '#27AE60',
                    'Negative': '#E74C3C',
                    'Neutral': '#95A5A6'
                }
            )
            visualizations['Sentiment Distribution'] = fig_sentiment
            
        if 'topic_1' in df.columns:
            # Combine all topics for overall topic distribution
            all_topics = []
            for col in ['topic_1', 'topic_2', 'topic_3']:
                if col in df.columns:
                    topics = df[col].dropna().tolist()
                    all_topics.extend([t for t in topics if t != ''])
            
            if all_topics:
                topic_counts = Counter(all_topics)
                top_topics = dict(topic_counts.most_common(15))
                
                fig_topics = px.bar(
                    x=list(top_topics.values()),
                    y=list(top_topics.keys()),
                    orientation='h',
                    title="Top 15 Specific Topics",
                    labels={'x': 'Count', 'y': 'Topic'}
                )
                visualizations['Topic Distribution'] = fig_topics
            
        if 'sentiment' in df.columns and 'topic_1' in df.columns:
            # Sentiment by primary topic (topic_1)
            df_temp = df[df['topic_1'] != ''].copy()
            if not df_temp.empty:
                # Get top 10 topics for cleaner visualization
                top_topics = df_temp['topic_1'].value_counts().head(10).index
                df_filtered = df_temp[df_temp['topic_1'].isin(top_topics)]
                
                pivot_table = pd.crosstab(df_filtered['topic_1'], df_filtered['sentiment'])
                fig_heatmap = px.imshow(
                    pivot_table,
                    labels=dict(x="Sentiment", y="Primary Topic", color="Count"),
                    title="Sentiment by Primary Topic Heatmap",
                    color_continuous_scale="RdYlGn"
                )
                visualizations['Sentiment by Topic'] = fig_heatmap
            
        if 'date' in df.columns and 'sentiment' in df.columns:
            # Sentiment over time
            df_time = df.copy()
            df_time['date'] = pd.to_datetime(df_time['date'])
            time_data = df_time.groupby([pd.Grouper(key='date', freq='M'), 'sentiment']).size().reset_index(name='count')
            
            fig_timeline = px.line(
                time_data,
                x='date',
                y='count',
                color='sentiment',
                title="Sentiment Trends Over Time",
                color_discrete_map={
                    'Positive': '#27AE60',
                    'Negative': '#E74C3C',
                    'Neutral': '#95A5A6'
                }
            )
            visualizations['Sentiment Timeline'] = fig_timeline
        
        if 'actionable_insights' in df.columns:
            # Top actionable insights
            all_insights = []
            for insight in df['actionable_insights']:
                if insight and insight != "":
                    # Split by comma as we're now using comma-separated insights
                    all_insights.extend([i.strip() for i in insight.split(',')])
            
            if all_insights:
                insight_counts = Counter(all_insights)
                top_insights = dict(insight_counts.most_common(10))
                
                fig_insights = px.bar(
                    x=list(top_insights.values()),
                    y=list(top_insights.keys()),
                    orientation='h',
                    title="Top 10 Actionable Insights",
                    labels={'x': 'Frequency', 'y': 'Insight'}
                )
                visualizations['Top Insights'] = fig_insights
        
        return visualizations

In [9]:
# ===== GRADIO INTERFACE =====
# Global variables
analyzer = None
current_data = None
current_visualizations = None

def update_model(model_name):
    """Update the selected AI model"""
    global model_manager
    
    if model_manager.set_model(model_name):
        return f"‚úÖ Model switched to: {model_name}"
    else:
        return f"‚ùå Failed to switch to: {model_name}"

def process_file(file, model_name):
    """Process uploaded file with selected model"""
    global analyzer, current_data, current_visualizations, model_manager
    
    if file is None:
        return "Please upload a file", None, None, None, None, None, gr.update(choices=[])
    
    try:
        # Update model if changed
        if model_name and model_manager:
            model_manager.set_model(model_name)
        
        analyzer = EnhancedTextAnalyzer(model_manager)
        
        # Load file
        df, message = analyzer.load_file(file)
        if df is None:
            return message, None, None, None, None, None, gr.update(choices=[])
        
        # Process data
        processed_df, detected_cols, output_file = analyzer.process_data(df)
        current_data = processed_df
        
        # Generate visualizations
        visualizations = analyzer.generate_visualizations(processed_df)
        current_visualizations = visualizations
        
        # Generate AI insights
        ai_insights = analyzer.generate_ai_insights(processed_df)
        
        # Create summary - safely handle detected columns
        text_cols = list(detected_cols.get('text_columns', []))[:3] if detected_cols.get('text_columns') else []
        id_cols = list(detected_cols.get('id_columns', []))[:3] if detected_cols.get('id_columns') else []
        product_cols = list(detected_cols.get('product_columns', []))[:3] if detected_cols.get('product_columns') else []
        
        summary = f"""
        ### ‚úÖ File Processing Complete!
        
        **Detected Columns:**
        - Text Columns: {', '.join(text_cols) if text_cols else 'None'}
        - ID Columns: {', '.join(id_cols) if id_cols else 'Auto-generated'}
        - Product Columns: {', '.join(product_cols) if product_cols else 'None'}
        
        **Analysis Results:**
        - Total Records: {len(processed_df)}
        - Processed File Saved: {output_file}
        - AI Model Used: {model_manager.current_model if model_manager else 'None'}
        """
        
        # Data preview
        preview = processed_df.head(10)
        
        # Get first visualization
        first_viz = list(visualizations.values())[0] if visualizations else None
        
        return (
            summary,
            preview,
            output_file,
            ai_insights,
            first_viz,
            "Ready for search",
            gr.update(choices=list(visualizations.keys()))
        )
        
    except Exception as e:
        return f"Error: {str(e)}", None, None, None, None, None, gr.update(choices=[])

def search_data(query):
    """Search through the data with enhanced semantic search"""
    global analyzer, current_data
    
    if analyzer is None or current_data is None:
        return "Please process a file first", None, None
    
    if not query:
        return "Please enter a search query", None, None
    
    try:
        results = analyzer.search_engine.search(query, top_k=10)
        
        if results.empty:
            return "No results found", None, None
        
        # Select relevant columns for display (updated to include new topic columns)
        display_cols = ['unique_id', 'combined_text', 'sentiment', 'topic_1', 'topic_2', 'topic_3', 'actionable_insights', 'search_score']
        display_cols = [col for col in display_cols if col in results.columns]
        
        results_display = results[display_cols]
        
        # Save search results
        search_output = f"search_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
        results_display.to_excel(search_output, index=False)
        
        return f"Found {len(results)} results", results_display.head(10), search_output
        
    except Exception as e:
        return f"Search error: {str(e)}", None, None

def update_visualization(viz_type):
    """Update displayed visualization"""
    global current_visualizations
    
    if current_visualizations and viz_type in current_visualizations:
        return current_visualizations[viz_type]
    return None

def export_results(format_type):
    """Export processed data in different formats"""
    global current_data
    
    if current_data is None:
        return "No data to export", None
    
    try:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        
        if format_type == "Excel":
            output_file = f"analysis_results_{timestamp}.xlsx"
            current_data.to_excel(output_file, index=False)
        else:  # CSV
            output_file = f"analysis_results_{timestamp}.csv"
            current_data.to_csv(output_file, index=False)
        
        return f"Data exported to {output_file}", output_file
    
    except Exception as e:
        return f"Export error: {str(e)}", None

In [10]:
# Create Gradio interface
def create_interface():
    """Create the Gradio interface with model selection"""
    
    with gr.Blocks(theme=gr.themes.Soft()) as app:
        gr.Markdown(
            """
            # üìä Enhanced Text Analytics AI Agent
            ### Smart Column Detection & Comprehensive Text Analysis with Multiple AI Models
            
            **Features:**
            - ü§ñ Multiple AI Model Support (OpenAI, Anthropic, Deepseek, Groq, Google)
            - üîç Automatic detection of text, ID, and product columns
            - üíæ Memory-efficient processing with automatic file cleanup
            - üòä Sentiment analysis with scoring
            - üéØ Topic/theme extraction
            - üí° Actionable insights generation
            - üîé Advanced text search with similarity scoring
            - üìà Multiple visualization options
            - üì• Export results in Excel or CSV format
            """
        )
        
        with gr.Tab("üì§ Upload & Process"):
            with gr.Row():
                with gr.Column(scale=1):
                    # Model selection dropdown
                    model_dropdown = gr.Dropdown(
                        label="ü§ñ Select AI Model",
                        choices=model_manager.get_available_models(),
                        value=model_manager.current_model if model_manager.current_model else None,
                        interactive=True
                    )
                    
                    file_upload = gr.File(
                        label="Upload Data File",
                        file_types=[".csv", ".xlsx", ".xls", ".json"]
                    )
                    process_btn = gr.Button("üöÄ Process File", variant="primary")
                
                with gr.Column(scale=2):
                    status_output = gr.Markdown(label="Processing Status")
                    ai_insights = gr.Markdown(label="AI-Generated Insights")
            
            with gr.Row():
                data_preview = gr.Dataframe(
                    label="Data Preview (First 10 rows)",
                    interactive=False
                )
            
            processed_file = gr.File(
                label="üìÅ Processed Data File",
                interactive=False
            )
        
        with gr.Tab("üîç Search"):
            gr.Markdown("### Search through your text data")
            
            with gr.Row():
                search_input = gr.Textbox(
                    label="Enter search query",
                    placeholder="Type keywords to search..."
                )
                search_btn = gr.Button("üîé Search", variant="primary")
            
            search_status = gr.Markdown(label="Search Status")
            search_results = gr.Dataframe(
                label="Search Results",
                interactive=False
            )
            search_file = gr.File(
                label="üì• Download Search Results",
                interactive=False
            )
        
        with gr.Tab("üìà Visualizations"):
            with gr.Row():
                viz_selector = gr.Dropdown(
                    label="Select Visualization",
                    choices=[],
                    interactive=True
                )
            
            viz_plot = gr.Plot(label="Visualization")
        
        with gr.Tab("üì• Export"):
            gr.Markdown("### Export your analyzed data")
            
            with gr.Row():
                export_format = gr.Radio(
                    choices=["Excel", "CSV"],
                    value="Excel",
                    label="Export Format"
                )
                export_btn = gr.Button("üì• Export Data", variant="primary")
            
            export_status = gr.Markdown(label="Export Status")
            export_file = gr.File(
                label="üìÅ Download Exported File",
                interactive=False
            )
        
        # Event handlers
        model_dropdown.change(
            fn=update_model,
            inputs=[model_dropdown],
            outputs=[status_output]
        )
        
        process_btn.click(
            fn=process_file,
            inputs=[file_upload, model_dropdown],
            outputs=[
                status_output,
                data_preview,
                processed_file,
                ai_insights,
                viz_plot,
                search_status,
                viz_selector
            ]
        )
        
        search_btn.click(
            fn=search_data,
            inputs=[search_input],
            outputs=[search_status, search_results, search_file]
        )
        
        viz_selector.change(
            fn=update_visualization,
            inputs=[viz_selector],
            outputs=[viz_plot]
        )
        
        export_btn.click(
            fn=export_results,
            inputs=[export_format],
            outputs=[export_status, export_file]
        )
    
    return app

In [11]:
# Launch the application
if __name__ == "__main__":
    app = create_interface()
    app.launch(share=True, debug=True)

* Running on local URL:  http://127.0.0.1:7861
* Running on public URL: https://8190830de481785995.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7861 <> https://8190830de481785995.gradio.live


In [12]:
python -m textblob.download_corpora

SyntaxError: invalid syntax (2621292756.py, line 1)