{ "cells": [ { "cell_type": "markdown", "id": "3baa95af-73a1-4d3c-a562-f90777f1f0c0", "metadata": {}, "source": [ "# Text Data Analysis AI Assistant with Gradio\n", " - Intelligent Customer Feedback Analysis System with Multiple AI APIs" ] }, { "cell_type": "code", "execution_count": 1, "id": "31a6bbea-df57-40ed-afd3-4df75cc86d0a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package brown to /Users/fola-ai/nltk_data...\n", "[nltk_data] Package brown is already up-to-date!\n", "[nltk_data] Downloading package punkt_tab to /Users/fola-\n", "[nltk_data] ai/nltk_data...\n", "[nltk_data] Package punkt_tab is already up-to-date!\n", "[nltk_data] Downloading package wordnet to /Users/fola-ai/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n", "[nltk_data] Downloading package averaged_perceptron_tagger_eng to\n", "[nltk_data] /Users/fola-ai/nltk_data...\n", "[nltk_data] Unzipping taggers/averaged_perceptron_tagger_eng.zip.\n", "[nltk_data] Downloading package conll2000 to /Users/fola-\n", "[nltk_data] ai/nltk_data...\n", "[nltk_data] Unzipping corpora/conll2000.zip.\n", "[nltk_data] Downloading package movie_reviews to /Users/fola-\n", "[nltk_data] ai/nltk_data...\n", "[nltk_data] Unzipping corpora/movie_reviews.zip.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Finished.\n" ] } ], "source": [ "# ===== IMPORTS SECTION =====\n", "# Core libraries\n", "import os\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", "# Environment and API\n", "from dotenv import load_dotenv\n", "from anthropic import Anthropic\n", "\n", "# Additional AI APIs\n", "try:\n", " from openai import OpenAI\n", "except ImportError:\n", " OpenAI = None\n", " \n", "try:\n", " from groq import Groq\n", "except ImportError:\n", " Groq = None\n", " \n", "try:\n", " import google.generativeai as genai\n", "except ImportError:\n", " genai = None\n", "\n", "# Data processing\n", "import pandas as pd\n", "import numpy as np\n", "from datetime import datetime, timedelta\n", "import json\n", "import gc # For garbage collection\n", "\n", "# Natural Language Processing\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import word_tokenize\n", "from nltk.stem import WordNetLemmatizer\n", "from textblob import TextBlob\n", "import re\n", "from collections import Counter\n", "\n", "# Machine Learning\n", "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n", "from sklearn.decomposition import LatentDirichletAllocation\n", "from sklearn.cluster import KMeans\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "\n", "# Visualization\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", "from plotly.subplots import make_subplots\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "# Web interface\n", "import gradio as gr\n", "\n", "# Download required NLTK data\n", "nltk.download('punkt', quiet=True)\n", "nltk.download('punkt_tab', quiet=True) # New tokenizer format\n", "nltk.download('stopwords', quiet=True)\n", "nltk.download('wordnet', quiet=True)\n", "nltk.download('averaged_perceptron_tagger', quiet=True)\n", "nltk.download('omw-1.4', quiet=True) # For WordNet lemmatizer\n", "nltk.download('brown', quiet=True) # Required for TextBlob\n", "\n", "# Download TextBlob corpora\n", "try:\n", " from textblob import download_corpora\n", " 
download_corpora.main()\n", "except Exception:\n", " # Alternative method if the above doesn't work\n", " import subprocess\n", " import sys\n", " try:\n", " subprocess.run([sys.executable, \"-m\", \"textblob.download_corpora\"], \n", " capture_output=True, text=True, timeout=30)\n", " except Exception:\n", " print(\"Warning: Could not download TextBlob corpora. Sentiment analysis may not work properly.\")\n", " print(\"Please run: python -m textblob.download_corpora\")" ] }, { "cell_type": "code", "execution_count": 2, "id": "db7c1e72-7960-4968-9a72-0f62ca7140d9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "load_dotenv(override=True)" ] }, { "cell_type": "code", "execution_count": 3, "id": "bded62da-82ab-4e17-bbf5-3edfe1b39398", "metadata": {}, "outputs": [], "source": [ "# ===== SMART COLUMN DETECTOR =====\n", "class SmartColumnDetector:\n", " \"\"\"Intelligently detect and extract relevant columns from uploaded data\"\"\"\n", " \n", " def __init__(self):\n", " # Keywords for detecting different column types\n", " self.text_keywords = ['comment', 'feedback', 'review', 'description', 'text', \n", " 'response', 'opinion', 'message', 'notes', 'remarks']\n", " self.id_keywords = ['id', 'identifier', 'key', 'number', 'code', 'ref', \n", " 'reference', 'index', 'uuid']\n", " self.product_keywords = ['product', 'item', 'model', 'variant', 'type', \n", " 'category', 'brand', 'name', 'sku']\n", " self.date_keywords = ['date', 'time', 'created', 'updated', 'timestamp']\n", " \n", " def detect_column_types(self, df):\n", " \"\"\"Detect column types based on column names and content\"\"\"\n", " detected = {\n", " 'text_columns': [],\n", " 'id_columns': [],\n", " 'product_columns': [],\n", " 'date_columns': [],\n", " 'other_columns': []\n", " }\n", " \n", " for col in df.columns:\n", " col_lower = col.lower()\n", " \n", " # Check for text columns\n", " if any(keyword in col_lower for keyword in self.text_keywords):\n", " detected['text_columns'].append(col)\n", " # Check for ID columns\n", " elif any(keyword in col_lower for keyword in self.id_keywords):\n", " detected['id_columns'].append(col)\n", " # Check for product columns\n", " elif any(keyword in col_lower for keyword in self.product_keywords):\n", " detected['product_columns'].append(col)\n", " # Check for date columns\n", " elif any(keyword in col_lower for keyword in self.date_keywords):\n", " detected['date_columns'].append(col)\n", " else:\n", " # Analyze content to determine type\n", " sample = df[col].dropna().head(100)\n", " if len(sample) > 0:\n", " # Check if mostly text\n", " if df[col].dtype == 'object':\n", " avg_length = sample.astype(str).str.len().mean()\n", " if avg_length > 50: # Likely text content\n", " detected['text_columns'].append(col)\n", " elif avg_length < 20 and df[col].nunique() / len(df) > 0.5:\n", " detected['id_columns'].append(col)\n", " else:\n", " detected['product_columns'].append(col)\n", " else:\n", " detected['other_columns'].append(col)\n", " \n", " return detected\n", " \n", " def extract_relevant_data(self, df):\n", " \"\"\"Extract only relevant columns and create optimized dataset\"\"\"\n", " detected = self.detect_column_types(df)\n", " \n", " # Create new dataframe with relevant columns\n", " extracted_data = pd.DataFrame()\n", " \n", " # Add unique identifier\n", " if detected['id_columns'] and len(detected['id_columns']) > 0:\n", " extracted_data['unique_id'] = df[detected['id_columns'][0]]\n", " else:\n", 
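"            # no ID-like column was detected, so fall back to a 1-based sequential index\n",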
" extracted_data['unique_id'] = range(1, len(df) + 1)\n", " \n", " # Add product information\n", " if detected['product_columns'] and len(detected['product_columns']) > 0:\n", " # Convert to list if needed and limit to 2 product columns\n", " product_cols = list(detected['product_columns'])[:2]\n", " for col in product_cols:\n", " extracted_data[f'product_{col}'] = df[col]\n", " \n", " # Combine text columns\n", " if detected['text_columns'] and len(detected['text_columns']) > 0:\n", " text_cols = list(detected['text_columns']) # Ensure it's a list\n", " text_data = []\n", " for idx in df.index:\n", " combined_text = ' '.join([\n", " str(df.loc[idx, col]) \n", " for col in text_cols \n", " if col in df.columns and pd.notna(df.loc[idx, col])\n", " ])\n", " text_data.append(combined_text)\n", " extracted_data['combined_text'] = text_data\n", " else:\n", " # If no text columns detected, create empty combined_text\n", " extracted_data['combined_text'] = [''] * len(df)\n", " \n", " # Add date columns\n", " if detected['date_columns'] and len(detected['date_columns']) > 0:\n", " extracted_data['date'] = pd.to_datetime(df[detected['date_columns'][0]], errors='coerce')\n", " \n", " return extracted_data, detected" ] }, { "cell_type": "code", "execution_count": 4, "id": "626af7bf-b4cf-4259-b409-18e5225555aa", "metadata": {}, "outputs": [], "source": [ "# ===== ENHANCED TEXT PROCESSOR =====\n", "class EnhancedTextProcessor:\n", " \"\"\"Enhanced text preprocessing with actionable insights extraction\"\"\"\n", "\n", " def __init__(self):\n", " self.lemmatizer = WordNetLemmatizer()\n", " self.stop_words = set(stopwords.words('english'))\n", " \n", " # Initialize actionable insights dictionary with common customer feedback phrases\n", " self.actionable_dictionary = {\n", " 'improve speed': ['slow', 'faster', 'quick', 'speed up', 'takes too long', 'waiting'],\n", " 'better quality': ['poor quality', 'cheap', 'breaks', 'defective', 'flimsy', 'weak'],\n", " 'enhance ui': ['confusing', 'hard to use', 'complicated', 'not intuitive', 'difficult to navigate'],\n", " 'fix bugs': ['bug', 'error', 'crash', 'freeze', 'not working', 'glitch', 'broken'],\n", " 'add features': ['missing', 'need', 'want', 'should have', 'would be nice', 'lacks'],\n", " 'improve support': ['no response', 'unhelpful', 'rude', 'poor service', 'bad support'],\n", " 'better packaging': ['damaged', 'poor packaging', 'arrived broken', 'not protected'],\n", " 'clearer docs': ['unclear', 'no instructions', 'confusing manual', 'hard to understand'],\n", " 'reduce price': ['expensive', 'overpriced', 'too costly', 'not worth', 'cheaper'],\n", " 'faster delivery': ['late', 'delayed', 'slow shipping', 'took forever', 'still waiting'],\n", " 'better communication': ['no updates', 'not informed', 'lack of communication', 'no tracking'],\n", " 'improve reliability': ['unreliable', 'stops working', 'inconsistent', 'sometimes works'],\n", " 'enhance performance': ['slow performance', 'laggy', 'sluggish', 'not responsive'],\n", " 'better design': ['ugly', 'poor design', 'looks cheap', 'not attractive', 'outdated look'],\n", " 'more options': ['limited options', 'no variety', 'need more choices', 'only one option']\n", " }\n", "\n", " def clean_text(self, text):\n", " \"\"\"Clean and normalize text\"\"\"\n", " if pd.isna(text) or text == '':\n", " return \"\"\n", "\n", " text = str(text).lower()\n", " text = re.sub(r'[^a-zA-Z0-9\\s]', '', text)\n", " text = ' '.join(text.split())\n", " return text\n", "\n", " def extract_actionable_insights(self, 
text):\n", " \"\"\"Extract actionable insights using dictionary matching\"\"\"\n", " if pd.isna(text) or text == '':\n", " return \"\"\n", " \n", " text_lower = text.lower()\n", " found_insights = []\n", " \n", " # Check each actionable item against the text\n", " for action, keywords in self.actionable_dictionary.items():\n", " for keyword in keywords:\n", " if keyword in text_lower:\n", " found_insights.append(action)\n", " break # Only add each action once\n", " \n", " # Return top 3 most relevant insights\n", " if found_insights:\n", " return ', '.join(found_insights[:3])\n", " return \"\"\n", "\n", " def extract_specific_topics(self, text):\n", " \"\"\"Extract specific topics from text using keyword extraction\"\"\"\n", " if pd.isna(text) or text == '' or len(text) < 10:\n", " return ['', '', '']\n", " \n", " # Clean text first\n", " text_lower = text.lower()\n", " \n", " # Remove stopwords for better topic extraction\n", " words = word_tokenize(text_lower)\n", " filtered_words = [w for w in words if w not in self.stop_words and len(w) > 3]\n", " \n", " # Extract noun phrases and important terms\n", " blob = TextBlob(text)\n", " noun_phrases = blob.noun_phrases\n", " \n", " # Combine noun phrases with high-frequency meaningful words\n", " topics = []\n", " \n", " # Add noun phrases (these are usually good topics)\n", " for phrase in noun_phrases[:5]: # Limit to top 5 noun phrases\n", " if len(phrase.split()) <= 3: # Only short phrases\n", " topics.append(phrase)\n", " \n", " # Add frequent meaningful words if we don't have enough topics\n", " if len(topics) < 3:\n", " word_freq = Counter(filtered_words)\n", " for word, _ in word_freq.most_common(5):\n", " if word not in str(topics): # Avoid duplicates\n", " topics.append(word)\n", " if len(topics) >= 3:\n", " break\n", " \n", " # Ensure we always return 3 items (empty string if not enough topics)\n", " topics = topics[:3]\n", " while len(topics) < 3:\n", " topics.append('')\n", " \n", " return topics\n", "\n", " def determine_topic(self, text):\n", " \"\"\"Legacy method kept for compatibility - returns first specific topic\"\"\"\n", " topics = self.extract_specific_topics(text)\n", " return topics[0] if topics[0] else 'General'" ] }, { "cell_type": "code", "execution_count": 5, "id": "b2eb5f17-7400-4591-8c0e-de7645b87c72", "metadata": {}, "outputs": [], "source": [ "# ===== SEARCH ENGINE =====\n", "class TextSearchEngine:\n", " \"\"\"Advanced search functionality for text data with semantic capabilities\"\"\"\n", " \n", " def __init__(self):\n", " self.vectorizer = TfidfVectorizer(\n", " max_features=1000,\n", " ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams for better matching\n", " stop_words='english',\n", " use_idf=True,\n", " smooth_idf=True,\n", " sublinear_tf=True # Apply sublinear tf scaling\n", " )\n", " self.tfidf_matrix = None\n", " self.data = None\n", " \n", " # Synonym dictionary for semantic search\n", " self.synonyms = {\n", " 'fast': ['quick', 'rapid', 'speedy', 'swift', 'prompt'],\n", " 'slow': ['sluggish', 'delayed', 'laggy', 'lengthy', 'prolonged'],\n", " 'good': ['excellent', 'great', 'wonderful', 'fantastic', 'amazing', 'positive'],\n", " 'bad': ['poor', 'terrible', 'awful', 'negative', 'horrible', 'disappointing'],\n", " 'problem': ['issue', 'bug', 'error', 'defect', 'fault', 'glitch'],\n", " 'help': ['support', 'assistance', 'aid', 'service'],\n", " 'price': ['cost', 'fee', 'charge', 'rate', 'payment', 'expensive', 'cheap'],\n", " 'quality': ['standard', 'grade', 'condition', 'caliber'],\n", " 
'delivery': ['shipping', 'dispatch', 'arrival', 'transport'],\n", " 'easy': ['simple', 'straightforward', 'effortless', 'user-friendly'],\n", " 'hard': ['difficult', 'complex', 'complicated', 'challenging'],\n", " 'broken': ['damaged', 'defective', 'faulty', 'malfunctioning'],\n", " 'love': ['like', 'enjoy', 'appreciate', 'adore'],\n", " 'hate': ['dislike', 'despise', 'detest'],\n", " 'feature': ['function', 'capability', 'option', 'characteristic'],\n", " 'customer': ['client', 'buyer', 'purchaser', 'consumer', 'user']\n", " }\n", " \n", " def expand_query_with_synonyms(self, query):\n", " \"\"\"Expand search query with synonyms for better semantic matching\"\"\"\n", " query_words = query.lower().split()\n", " expanded_terms = []\n", " \n", " for word in query_words:\n", " # Add the original word\n", " expanded_terms.append(word)\n", " \n", " # Add synonyms if available\n", " if word in self.synonyms:\n", " expanded_terms.extend(self.synonyms[word])\n", " \n", " # Check if word is a synonym of something else\n", " for key, syns in self.synonyms.items():\n", " if word in syns:\n", " expanded_terms.append(key)\n", " expanded_terms.extend([s for s in syns if s != word])\n", " \n", " # Remove duplicates while preserving order\n", " seen = set()\n", " unique_terms = []\n", " for term in expanded_terms:\n", " if term not in seen:\n", " unique_terms.append(term)\n", " seen.add(term)\n", " \n", " return ' '.join(unique_terms)\n", " \n", " def build_index(self, df, text_column):\n", " \"\"\"Build search index from text data\"\"\"\n", " self.data = df.copy()\n", " texts = df[text_column].fillna('').tolist()\n", " \n", " # Add other searchable columns to improve search\n", " if 'topic_1' in df.columns:\n", " texts = [f\"{text} {df.iloc[i]['topic_1']} {df.iloc[i]['topic_2']} {df.iloc[i]['topic_3']}\" \n", " for i, text in enumerate(texts)]\n", " if 'actionable_insights' in df.columns:\n", " texts = [f\"{texts[i]} {df.iloc[i]['actionable_insights']}\" \n", " for i in range(len(texts))]\n", " \n", " self.tfidf_matrix = self.vectorizer.fit_transform(texts)\n", " \n", " def search(self, query, top_k=10):\n", " \"\"\"Enhanced search with semantic understanding\"\"\"\n", " if self.tfidf_matrix is None:\n", " return pd.DataFrame()\n", " \n", " # Expand query with synonyms\n", " expanded_query = self.expand_query_with_synonyms(query)\n", " \n", " # Vectorize both original and expanded queries\n", " query_vector = self.vectorizer.transform([query])\n", " expanded_vector = self.vectorizer.transform([expanded_query])\n", " \n", " # Calculate similarities for both\n", " similarities_orig = cosine_similarity(query_vector, self.tfidf_matrix).flatten()\n", " similarities_exp = cosine_similarity(expanded_vector, self.tfidf_matrix).flatten()\n", " \n", " # Combine scores (weighted average - original query gets more weight)\n", " combined_similarities = (0.7 * similarities_orig + 0.3 * similarities_exp)\n", " \n", " # Get top results\n", " top_indices = combined_similarities.argsort()[-top_k:][::-1]\n", " top_scores = combined_similarities[top_indices]\n", " \n", " # Filter results with score > 0.05 (lower threshold for better recall)\n", " valid_indices = [idx for idx, score in zip(top_indices, top_scores) if score > 0.05]\n", " \n", " if valid_indices:\n", " results = self.data.iloc[valid_indices].copy()\n", " results['search_score'] = [combined_similarities[idx] for idx in valid_indices]\n", " \n", " # Boost results that have exact matches\n", " query_lower = query.lower()\n", " for idx in results.index:\n", " if 
'combined_text' in results.columns:\n", " if query_lower in str(results.at[idx, 'combined_text']).lower():\n", " results.at[idx, 'search_score'] *= 1.5 # Boost exact matches\n", " \n", " return results.sort_values('search_score', ascending=False)\n", " \n", " return pd.DataFrame()\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "e8b88155-971f-4dd5-b26c-104a737bc426", "metadata": {}, "outputs": [], "source": [ "# ===== API CONFIGURATION =====\n", "class AIModelManager:\n", " \"\"\"Manages multiple AI model APIs and provides unified interface\"\"\"\n", " \n", " def __init__(self):\n", " self.available_models = {}\n", " self.clients = {}\n", " self.current_model = None\n", " self.initialize_apis()\n", " \n", " def initialize_apis(self):\n", " \"\"\"Initialize all available AI APIs\"\"\"\n", " \n", " # Anthropic\n", " ANTHROPIC_API_KEY = os.getenv(\"ANTHROPIC_API_KEY\")\n", " if ANTHROPIC_API_KEY:\n", " try:\n", " self.clients['anthropic'] = Anthropic(api_key=ANTHROPIC_API_KEY)\n", " self.available_models['Claude 3 Haiku'] = {\n", " 'provider': 'anthropic',\n", " 'model': 'claude-3-haiku-20240307'\n", " }\n", " print(f\"Anthropic API Key exists and begins {ANTHROPIC_API_KEY[:4]}\")\n", " except Exception as e:\n", " print(f\"Error initializing Anthropic: {e}\")\n", " else:\n", " print(\"Anthropic API Key not set\")\n", " \n", " # OpenAI\n", " OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n", " if OPENAI_API_KEY and OpenAI:\n", " try:\n", " self.clients['openai'] = OpenAI(api_key=OPENAI_API_KEY)\n", " self.available_models['GPT-4o-mini'] = {\n", " 'provider': 'openai',\n", " 'model': 'gpt-4o-mini'\n", " }\n", " self.available_models['GPT-3.5 Turbo'] = {\n", " 'provider': 'openai',\n", " 'model': 'gpt-3.5-turbo'\n", " }\n", " print(f\"OpenAI API Key exists and begins {OPENAI_API_KEY[:7]}\")\n", " except Exception as e:\n", " print(f\"Error initializing OpenAI: {e}\")\n", " else:\n", " print(\"OpenAI API Key not set or library not installed\")\n", " \n", " # Deepseek (uses OpenAI-compatible API)\n", " DEEPSEEK_API_KEY = os.getenv(\"DEEPSEEK_API_KEY\")\n", " if DEEPSEEK_API_KEY and OpenAI:\n", " try:\n", " self.clients['deepseek'] = OpenAI(\n", " api_key=DEEPSEEK_API_KEY,\n", " base_url=\"https://api.deepseek.com\"\n", " )\n", " self.available_models['Deepseek Chat'] = {\n", " 'provider': 'deepseek',\n", " 'model': 'deepseek-chat'\n", " }\n", " print(f\"Deepseek API Key exists and begins {DEEPSEEK_API_KEY[:7]}\")\n", " except Exception as e:\n", " print(f\"Error initializing Deepseek: {e}\")\n", " else:\n", " print(\"Deepseek API Key not set or OpenAI library not installed\")\n", " \n", " # Groq\n", " GROQ_API_KEY = os.getenv(\"GROQ_API_KEY\")\n", " if GROQ_API_KEY and Groq:\n", " try:\n", " self.clients['groq'] = Groq(api_key=GROQ_API_KEY)\n", " self.available_models['Llama 3.3 70B'] = {\n", " 'provider': 'groq',\n", " 'model': 'llama-3.3-70b-versatile'\n", " }\n", " self.available_models['Mixtral 8x7B'] = {\n", " 'provider': 'groq',\n", " 'model': 'mixtral-8x7b-32768'\n", " }\n", " print(f\"Groq API Key exists and begins {GROQ_API_KEY[:4]}\")\n", " except Exception as e:\n", " print(f\"Error initializing Groq: {e}\")\n", " else:\n", " print(\"Groq API Key not set or library not installed\")\n", " \n", " # Google Gemini\n", " GOOGLE_API_KEY = os.getenv(\"GOOGLE_API_KEY\")\n", " if GOOGLE_API_KEY and genai:\n", " try:\n", " genai.configure(api_key=GOOGLE_API_KEY)\n", " self.clients['google'] = genai\n", " self.available_models['Gemini 1.5 Flash'] = {\n", " 'provider': 'google',\n", " 
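# model identifiers are provider-defined and rotate over time; verify against Google's current model list\n", "                    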
'model': 'gemini-1.5-flash'\n", " }\n", " self.available_models['Gemini 1.5 Pro'] = {\n", " 'provider': 'google',\n", " 'model': 'gemini-1.5-pro'\n", " }\n", " print(f\"Google API Key exists and begins {GOOGLE_API_KEY[:2]}\")\n", " except Exception as e:\n", " print(f\"Error initializing Google Gemini: {e}\")\n", " else:\n", " print(\"Google API Key not set or library not installed\")\n", " \n", " # Set default model\n", " if self.available_models:\n", " self.current_model = list(self.available_models.keys())[0]\n", " \n", " def get_available_models(self):\n", " \"\"\"Return list of available model names\"\"\"\n", " return list(self.available_models.keys())\n", " \n", " def set_model(self, model_name):\n", " \"\"\"Set the current model\"\"\"\n", " if model_name in self.available_models:\n", " self.current_model = model_name\n", " return True\n", " return False\n", " \n", " def generate_text(self, prompt, max_tokens=1000):\n", " \"\"\"Generate text using the current model\"\"\"\n", " if not self.current_model or self.current_model not in self.available_models:\n", " return None\n", " \n", " model_info = self.available_models[self.current_model]\n", " provider = model_info['provider']\n", " model = model_info['model']\n", " \n", " try:\n", " if provider == 'anthropic':\n", " client = self.clients['anthropic']\n", " response = client.messages.create(\n", " model=model,\n", " max_tokens=max_tokens,\n", " messages=[{\"role\": \"user\", \"content\": prompt}]\n", " )\n", " return response.content[0].text\n", " \n", " elif provider in ['openai', 'deepseek']:\n", " client = self.clients[provider]\n", " response = client.chat.completions.create(\n", " model=model,\n", " messages=[{\"role\": \"user\", \"content\": prompt}],\n", " max_tokens=max_tokens\n", " )\n", " return response.choices[0].message.content\n", " \n", " elif provider == 'groq':\n", " client = self.clients['groq']\n", " response = client.chat.completions.create(\n", " model=model,\n", " messages=[{\"role\": \"user\", \"content\": prompt}],\n", " max_tokens=max_tokens\n", " )\n", " return response.choices[0].message.content\n", " \n", " elif provider == 'google':\n", " model_obj = genai.GenerativeModel(model)\n", " response = model_obj.generate_content(prompt)\n", " return response.text\n", " \n", " except Exception as e:\n", " print(f\"Error generating text with {self.current_model}: {e}\")\n", " return None" ] }, { "cell_type": "code", "execution_count": 7, "id": "809f4c47-6ea8-4eaa-bac1-5ca83daac733", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Anthropic API Key exists and begins sk-a\n", "OpenAI API Key exists and begins sk-proj\n", "Deepseek API Key exists and begins sk-1099\n", "Groq API Key exists and begins gsk_\n", "Google API Key exists and begins AI\n" ] } ], "source": [ "# Initialize the model manager globally\n", "model_manager = AIModelManager()" ] }, { "cell_type": "code", "execution_count": 8, "id": "ad5f99f2-efd9-4759-88dc-df7f2f5359fb", "metadata": {}, "outputs": [], "source": [ "# ===== ENHANCED ANALYZER WITH MULTI-MODEL SUPPORT =====\n", "\n", "class EnhancedTextAnalyzer:\n", " \"\"\"Main analysis engine with all enhanced features and multi-model support\"\"\"\n", " \n", " def __init__(self, model_manager=None):\n", " self.model_manager = model_manager\n", " self.column_detector = SmartColumnDetector()\n", " self.text_processor = EnhancedTextProcessor()\n", " self.search_engine = TextSearchEngine()\n", " self.original_df = None\n", " self.processed_df = None\n", " 
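# per-run caches for computed analysis results and plotly figures\n", "        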
self.results = {}\n", " self.visualizations = {}\n", " \n", " def load_file(self, file):\n", " \"\"\"Load data from various file formats\"\"\"\n", " try:\n", " if file.name.endswith('.csv'):\n", " df = pd.read_csv(file.name)\n", " elif file.name.endswith(('.xlsx', '.xls')):\n", " df = pd.read_excel(file.name)\n", " elif file.name.endswith('.json'):\n", " df = pd.read_json(file.name)\n", " else:\n", " return None, \"Unsupported file format\"\n", " \n", " return df, f\"File loaded: {len(df)} records\"\n", " except Exception as e:\n", " return None, f\"Error loading file: {str(e)}\"\n", " \n", " def process_data(self, df):\n", " \"\"\"Process data with smart extraction and analysis\"\"\"\n", " # Extract relevant columns\n", " extracted_df, detected_columns = self.column_detector.extract_relevant_data(df)\n", " \n", " # Store for reference\n", " self.processed_df = extracted_df\n", " \n", " # Clear original from memory\n", " del df\n", " gc.collect()\n", " \n", " # Add analysis columns\n", " if 'combined_text' in extracted_df.columns:\n", " # Sentiment analysis\n", " sentiments = []\n", " polarities = []\n", " topics_1 = []\n", " topics_2 = []\n", " topics_3 = []\n", " insights = []\n", " \n", " for text in extracted_df['combined_text']:\n", " # Sentiment\n", " blob = TextBlob(text)\n", " polarity = blob.sentiment.polarity\n", " if polarity > 0.1:\n", " sentiment = 'Positive'\n", " elif polarity < -0.1:\n", " sentiment = 'Negative'\n", " else:\n", " sentiment = 'Neutral'\n", " \n", " sentiments.append(sentiment)\n", " polarities.append(polarity)\n", " \n", " # Extract specific topics (3 separate topics)\n", " specific_topics = self.text_processor.extract_specific_topics(text)\n", " topics_1.append(specific_topics[0])\n", " topics_2.append(specific_topics[1])\n", " topics_3.append(specific_topics[2])\n", " \n", " # Actionable insights using dictionary matching\n", " insight = self.text_processor.extract_actionable_insights(text)\n", " insights.append(insight)\n", " \n", " extracted_df['sentiment'] = sentiments\n", " extracted_df['sentiment_score'] = polarities\n", " extracted_df['topic_1'] = topics_1\n", " extracted_df['topic_2'] = topics_2\n", " extracted_df['topic_3'] = topics_3\n", " extracted_df['actionable_insights'] = insights\n", " \n", " # Build search index with enhanced search capabilities\n", " self.search_engine.build_index(extracted_df, 'combined_text')\n", " \n", " # Save processed data\n", " output_file = 'processed_data.xlsx'\n", " extracted_df.to_excel(output_file, index=False)\n", " \n", " return extracted_df, detected_columns, output_file\n", " \n", " def generate_ai_insights(self, df, num_samples=5):\n", " \"\"\"Generate AI-powered insights using selected model\"\"\"\n", " if not self.model_manager or not self.model_manager.current_model:\n", " return \"No AI model available for generating insights\"\n", " \n", " if 'combined_text' not in df.columns or df.empty:\n", " return \"No text data available for AI analysis\"\n", " \n", " # Sample some texts for analysis\n", " sample_texts = df['combined_text'].dropna().head(num_samples).tolist()\n", " if not sample_texts:\n", " return \"No valid text samples found\"\n", " \n", " # Create prompt for AI analysis\n", " prompt = f\"\"\"Analyze the following customer feedback samples and provide key insights:\n", "\n", "Samples:\n", "{chr(10).join([f\"{i+1}. {text[:200]}...\" if len(text) > 200 else f\"{i+1}. {text}\" for i, text in enumerate(sample_texts)])}\n", "\n", "Please provide:\n", "1. Main themes and patterns\n", "2. 
Key sentiment indicators\n", "3. Actionable recommendations\n", "4. Areas of concern\n", "\n", "Keep the response concise and focused on actionable insights.\"\"\"\n", "\n", " # Generate insights using selected model\n", " try:\n", " response = self.model_manager.generate_text(prompt, max_tokens=500)\n", " if response:\n", " return f\"**AI Insights (using {self.model_manager.current_model}):**\\n\\n{response}\"\n", " else:\n", " return \"Failed to generate AI insights. Please check your API configuration.\"\n", " except Exception as e:\n", " return f\"Error generating AI insights: {str(e)}\"\n", " \n", " def generate_visualizations(self, df):\n", " \"\"\"Generate various visualizations\"\"\"\n", " visualizations = {}\n", " \n", " if 'sentiment' in df.columns:\n", " # Sentiment distribution\n", " sentiment_counts = df['sentiment'].value_counts()\n", " fig_sentiment = px.pie(\n", " values=sentiment_counts.values,\n", " names=sentiment_counts.index,\n", " title=\"Sentiment Distribution\",\n", " color_discrete_map={\n", " 'Positive': '#27AE60',\n", " 'Negative': '#E74C3C',\n", " 'Neutral': '#95A5A6'\n", " }\n", " )\n", " visualizations['Sentiment Distribution'] = fig_sentiment\n", " \n", " if 'topic_1' in df.columns:\n", " # Combine all topics for overall topic distribution\n", " all_topics = []\n", " for col in ['topic_1', 'topic_2', 'topic_3']:\n", " if col in df.columns:\n", " topics = df[col].dropna().tolist()\n", " all_topics.extend([t for t in topics if t != ''])\n", " \n", " if all_topics:\n", " topic_counts = Counter(all_topics)\n", " top_topics = dict(topic_counts.most_common(15))\n", " \n", " fig_topics = px.bar(\n", " x=list(top_topics.values()),\n", " y=list(top_topics.keys()),\n", " orientation='h',\n", " title=\"Top 15 Specific Topics\",\n", " labels={'x': 'Count', 'y': 'Topic'}\n", " )\n", " visualizations['Topic Distribution'] = fig_topics\n", " \n", " if 'sentiment' in df.columns and 'topic_1' in df.columns:\n", " # Sentiment by primary topic (topic_1)\n", " df_temp = df[df['topic_1'] != ''].copy()\n", " if not df_temp.empty:\n", " # Get top 10 topics for cleaner visualization\n", " top_topics = df_temp['topic_1'].value_counts().head(10).index\n", " df_filtered = df_temp[df_temp['topic_1'].isin(top_topics)]\n", " \n", " pivot_table = pd.crosstab(df_filtered['topic_1'], df_filtered['sentiment'])\n", " fig_heatmap = px.imshow(\n", " pivot_table,\n", " labels=dict(x=\"Sentiment\", y=\"Primary Topic\", color=\"Count\"),\n", " title=\"Sentiment by Primary Topic Heatmap\",\n", " color_continuous_scale=\"RdYlGn\"\n", " )\n", " visualizations['Sentiment by Topic'] = fig_heatmap\n", " \n", " if 'date' in df.columns and 'sentiment' in df.columns:\n", " # Sentiment over time\n", " df_time = df.copy()\n", " df_time['date'] = pd.to_datetime(df_time['date'])\n", " time_data = df_time.groupby([pd.Grouper(key='date', freq='M'), 'sentiment']).size().reset_index(name='count')\n", " \n", " fig_timeline = px.line(\n", " time_data,\n", " x='date',\n", " y='count',\n", " color='sentiment',\n", " title=\"Sentiment Trends Over Time\",\n", " color_discrete_map={\n", " 'Positive': '#27AE60',\n", " 'Negative': '#E74C3C',\n", " 'Neutral': '#95A5A6'\n", " }\n", " )\n", " visualizations['Sentiment Timeline'] = fig_timeline\n", " \n", " if 'actionable_insights' in df.columns:\n", " # Top actionable insights\n", " all_insights = []\n", " for insight in df['actionable_insights']:\n", " if insight and insight != \"\":\n", " # Split by comma as we're now using comma-separated insights\n", " 
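# each cell holds up to three comma-separated actions (see extract_actionable_insights)\n", "                    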
all_insights.extend([i.strip() for i in insight.split(',')])\n", " \n", " if all_insights:\n", " insight_counts = Counter(all_insights)\n", " top_insights = dict(insight_counts.most_common(10))\n", " \n", " fig_insights = px.bar(\n", " x=list(top_insights.values()),\n", " y=list(top_insights.keys()),\n", " orientation='h',\n", " title=\"Top 10 Actionable Insights\",\n", " labels={'x': 'Frequency', 'y': 'Insight'}\n", " )\n", " visualizations['Top Insights'] = fig_insights\n", " \n", " return visualizations" ] }, { "cell_type": "code", "execution_count": 9, "id": "5ee86a52-b195-4010-a2b7-3abf57bf9949", "metadata": {}, "outputs": [], "source": [ "# ===== GRADIO INTERFACE =====\n", "# Global variables\n", "analyzer = None\n", "current_data = None\n", "current_visualizations = None\n", "\n", "def update_model(model_name):\n", " \"\"\"Update the selected AI model\"\"\"\n", " global model_manager\n", " \n", " if model_manager.set_model(model_name):\n", " return f\"✅ Model switched to: {model_name}\"\n", " else:\n", " return f\"❌ Failed to switch to: {model_name}\"\n", "\n", "def process_file(file, model_name):\n", " \"\"\"Process uploaded file with selected model\"\"\"\n", " global analyzer, current_data, current_visualizations, model_manager\n", " \n", " if file is None:\n", " return \"Please upload a file\", None, None, None, None, None, gr.update(choices=[])\n", " \n", " try:\n", " # Update model if changed\n", " if model_name and model_manager:\n", " model_manager.set_model(model_name)\n", " \n", " analyzer = EnhancedTextAnalyzer(model_manager)\n", " \n", " # Load file\n", " df, message = analyzer.load_file(file)\n", " if df is None:\n", " return message, None, None, None, None, None, gr.update(choices=[])\n", " \n", " # Process data\n", " processed_df, detected_cols, output_file = analyzer.process_data(df)\n", " current_data = processed_df\n", " \n", " # Generate visualizations\n", " visualizations = analyzer.generate_visualizations(processed_df)\n", " current_visualizations = visualizations\n", " \n", " # Generate AI insights\n", " ai_insights = analyzer.generate_ai_insights(processed_df)\n", " \n", " # Create summary - safely handle detected columns\n", " text_cols = list(detected_cols.get('text_columns', []))[:3] if detected_cols.get('text_columns') else []\n", " id_cols = list(detected_cols.get('id_columns', []))[:3] if detected_cols.get('id_columns') else []\n", " product_cols = list(detected_cols.get('product_columns', []))[:3] if detected_cols.get('product_columns') else []\n", " \n", " summary = f\"\"\"\n", " ### ✅ File Processing Complete!\n", " \n", " **Detected Columns:**\n", " - Text Columns: {', '.join(text_cols) if text_cols else 'None'}\n", " - ID Columns: {', '.join(id_cols) if id_cols else 'Auto-generated'}\n", " - Product Columns: {', '.join(product_cols) if product_cols else 'None'}\n", " \n", " **Analysis Results:**\n", " - Total Records: {len(processed_df)}\n", " - Processed File Saved: {output_file}\n", " - AI Model Used: {model_manager.current_model if model_manager else 'None'}\n", " \"\"\"\n", " \n", " # Data preview\n", " preview = processed_df.head(10)\n", " \n", " # Get first visualization\n", " first_viz = list(visualizations.values())[0] if visualizations else None\n", " \n", " return (\n", " summary,\n", " preview,\n", " output_file,\n", " ai_insights,\n", " first_viz,\n", " \"Ready for search\",\n", " gr.update(choices=list(visualizations.keys()))\n", " )\n", " \n", " except Exception as e:\n", " return f\"Error: {str(e)}\", None, None, None, None, None, 
gr.update(choices=[])\n", "\n", "def search_data(query):\n", " \"\"\"Search through the data with enhanced semantic search\"\"\"\n", " global analyzer, current_data\n", " \n", " if analyzer is None or current_data is None:\n", " return \"Please process a file first\", None, None\n", " \n", " if not query:\n", " return \"Please enter a search query\", None, None\n", " \n", " try:\n", " results = analyzer.search_engine.search(query, top_k=10)\n", " \n", " if results.empty:\n", " return \"No results found\", None, None\n", " \n", " # Select relevant columns for display (updated to include new topic columns)\n", " display_cols = ['unique_id', 'combined_text', 'sentiment', 'topic_1', 'topic_2', 'topic_3', 'actionable_insights', 'search_score']\n", " display_cols = [col for col in display_cols if col in results.columns]\n", " \n", " results_display = results[display_cols]\n", " \n", " # Save search results\n", " search_output = f\"search_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx\"\n", " results_display.to_excel(search_output, index=False)\n", " \n", " return f\"Found {len(results)} results\", results_display.head(10), search_output\n", " \n", " except Exception as e:\n", " return f\"Search error: {str(e)}\", None, None\n", "\n", "def update_visualization(viz_type):\n", " \"\"\"Update displayed visualization\"\"\"\n", " global current_visualizations\n", " \n", " if current_visualizations and viz_type in current_visualizations:\n", " return current_visualizations[viz_type]\n", " return None\n", "\n", "def export_results(format_type):\n", " \"\"\"Export processed data in different formats\"\"\"\n", " global current_data\n", " \n", " if current_data is None:\n", " return \"No data to export\", None\n", " \n", " try:\n", " timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')\n", " \n", " if format_type == \"Excel\":\n", " output_file = f\"analysis_results_{timestamp}.xlsx\"\n", " current_data.to_excel(output_file, index=False)\n", " else: # CSV\n", " output_file = f\"analysis_results_{timestamp}.csv\"\n", " current_data.to_csv(output_file, index=False)\n", " \n", " return f\"Data exported to {output_file}\", output_file\n", " \n", " except Exception as e:\n", " return f\"Export error: {str(e)}\", None" ] }, { "cell_type": "code", "execution_count": 10, "id": "38bf0375-9ef8-488c-821f-288c4f59ff5d", "metadata": {}, "outputs": [], "source": [ "# Create Gradio interface\n", "def create_interface():\n", " \"\"\"Create the Gradio interface with model selection\"\"\"\n", " \n", " with gr.Blocks(theme=gr.themes.Soft()) as app:\n", " gr.Markdown(\n", " \"\"\"\n", " # 📊 Enhanced Text Analytics AI Agent\n", " ### Smart Column Detection & Comprehensive Text Analysis with Multiple AI Models\n", " \n", " **Features:**\n", " - 🤖 Multiple AI Model Support (OpenAI, Anthropic, Deepseek, Groq, Google)\n", " - 🔍 Automatic detection of text, ID, and product columns\n", " - 💾 Memory-efficient processing with automatic file cleanup\n", " - 😊 Sentiment analysis with scoring\n", " - 🎯 Topic/theme extraction\n", " - 💡 Actionable insights generation\n", " - 🔎 Advanced text search with similarity scoring\n", " - 📈 Multiple visualization options\n", " - 📥 Export results in Excel or CSV format\n", " \"\"\"\n", " )\n", " \n", " with gr.Tab(\"📤 Upload & Process\"):\n", " with gr.Row():\n", " with gr.Column(scale=1):\n", " # Model selection dropdown\n", " model_dropdown = gr.Dropdown(\n", " label=\"🤖 Select AI Model\",\n", " choices=model_manager.get_available_models(),\n", " value=model_manager.current_model if 
model_manager.current_model else None,\n", " interactive=True\n", " )\n", " \n", " file_upload = gr.File(\n", " label=\"Upload Data File\",\n", " file_types=[\".csv\", \".xlsx\", \".xls\", \".json\"]\n", " )\n", " process_btn = gr.Button(\"🚀 Process File\", variant=\"primary\")\n", " \n", " with gr.Column(scale=2):\n", " status_output = gr.Markdown(label=\"Processing Status\")\n", " ai_insights = gr.Markdown(label=\"AI-Generated Insights\")\n", " \n", " with gr.Row():\n", " data_preview = gr.Dataframe(\n", " label=\"Data Preview (First 10 rows)\",\n", " interactive=False\n", " )\n", " \n", " processed_file = gr.File(\n", " label=\"📁 Processed Data File\",\n", " interactive=False\n", " )\n", " \n", " with gr.Tab(\"🔍 Search\"):\n", " gr.Markdown(\"### Search through your text data\")\n", " \n", " with gr.Row():\n", " search_input = gr.Textbox(\n", " label=\"Enter search query\",\n", " placeholder=\"Type keywords to search...\"\n", " )\n", " search_btn = gr.Button(\"🔎 Search\", variant=\"primary\")\n", " \n", " search_status = gr.Markdown(label=\"Search Status\")\n", " search_results = gr.Dataframe(\n", " label=\"Search Results\",\n", " interactive=False\n", " )\n", " search_file = gr.File(\n", " label=\"📥 Download Search Results\",\n", " interactive=False\n", " )\n", " \n", " with gr.Tab(\"📈 Visualizations\"):\n", " with gr.Row():\n", " viz_selector = gr.Dropdown(\n", " label=\"Select Visualization\",\n", " choices=[],\n", " interactive=True\n", " )\n", " \n", " viz_plot = gr.Plot(label=\"Visualization\")\n", " \n", " with gr.Tab(\"📥 Export\"):\n", " gr.Markdown(\"### Export your analyzed data\")\n", " \n", " with gr.Row():\n", " export_format = gr.Radio(\n", " choices=[\"Excel\", \"CSV\"],\n", " value=\"Excel\",\n", " label=\"Export Format\"\n", " )\n", " export_btn = gr.Button(\"📥 Export Data\", variant=\"primary\")\n", " \n", " export_status = gr.Markdown(label=\"Export Status\")\n", " export_file = gr.File(\n", " label=\"📁 Download Exported File\",\n", " interactive=False\n", " )\n", " \n", " # Event handlers\n", " model_dropdown.change(\n", " fn=update_model,\n", " inputs=[model_dropdown],\n", " outputs=[status_output]\n", " )\n", " \n", " process_btn.click(\n", " fn=process_file,\n", " inputs=[file_upload, model_dropdown],\n", " outputs=[\n", " status_output,\n", " data_preview,\n", " processed_file,\n", " ai_insights,\n", " viz_plot,\n", " search_status,\n", " viz_selector\n", " ]\n", " )\n", " \n", " search_btn.click(\n", " fn=search_data,\n", " inputs=[search_input],\n", " outputs=[search_status, search_results, search_file]\n", " )\n", " \n", " viz_selector.change(\n", " fn=update_visualization,\n", " inputs=[viz_selector],\n", " outputs=[viz_plot]\n", " )\n", " \n", " export_btn.click(\n", " fn=export_results,\n", " inputs=[export_format],\n", " outputs=[export_status, export_file]\n", " )\n", " \n", " return app" ] }, { "cell_type": "code", "execution_count": 11, "id": "6c5a0767-a788-43a8-911c-04e81814f4c4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "* Running on local URL: http://127.0.0.1:7861\n", "* Running on public URL: https://8190830de481785995.gradio.live\n", "\n", "This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "