{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "3baa95af-73a1-4d3c-a562-f90777f1f0c0",
   "metadata": {},
   "source": [
    "# Text Data Analysis AI Assistant with Gradio\n",
    " - Intelligent Customer Feedback Analysis System with Multiple AI APIs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "31a6bbea-df57-40ed-afd3-4df75cc86d0a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package brown to /Users/fola-ai/nltk_data...\n",
      "[nltk_data]   Package brown is already up-to-date!\n",
      "[nltk_data] Downloading package punkt_tab to /Users/fola-\n",
      "[nltk_data]     ai/nltk_data...\n",
      "[nltk_data]   Package punkt_tab is already up-to-date!\n",
      "[nltk_data] Downloading package wordnet to /Users/fola-ai/nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n",
      "[nltk_data] Downloading package averaged_perceptron_tagger_eng to\n",
      "[nltk_data]     /Users/fola-ai/nltk_data...\n",
      "[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.\n",
      "[nltk_data] Downloading package conll2000 to /Users/fola-\n",
      "[nltk_data]     ai/nltk_data...\n",
      "[nltk_data]   Unzipping corpora/conll2000.zip.\n",
      "[nltk_data] Downloading package movie_reviews to /Users/fola-\n",
      "[nltk_data]     ai/nltk_data...\n",
      "[nltk_data]   Unzipping corpora/movie_reviews.zip.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Finished.\n"
     ]
    }
   ],
   "source": [
    "# ===== IMPORTS SECTION =====\n",
    "# Core libraries\n",
    "import os\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Environment and API\n",
    "from dotenv import load_dotenv\n",
    "from anthropic import Anthropic\n",
    "\n",
    "# Additional AI APIs\n",
    "try:\n",
    "    from openai import OpenAI\n",
    "except ImportError:\n",
    "    OpenAI = None\n",
    "    \n",
    "try:\n",
    "    from groq import Groq\n",
    "except ImportError:\n",
    "    Groq = None\n",
    "    \n",
    "try:\n",
    "    import google.generativeai as genai\n",
    "except ImportError:\n",
    "    genai = None\n",
    "\n",
    "# Data processing\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from datetime import datetime, timedelta\n",
    "import json\n",
    "import gc  # For garbage collection\n",
    "\n",
    "# Natural Language Processing\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from textblob import TextBlob\n",
    "import re\n",
    "from collections import Counter\n",
    "\n",
    "# Machine Learning\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
    "from sklearn.decomposition import LatentDirichletAllocation\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "\n",
    "# Visualization\n",
    "import plotly.express as px\n",
    "import plotly.graph_objects as go\n",
    "from plotly.subplots import make_subplots\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "# Web interface\n",
    "import gradio as gr\n",
    "\n",
    "# Download required NLTK data\n",
    "nltk.download('punkt', quiet=True)\n",
    "nltk.download('punkt_tab', quiet=True)  # New tokenizer format\n",
    "nltk.download('stopwords', quiet=True)\n",
    "nltk.download('wordnet', quiet=True)\n",
    "nltk.download('averaged_perceptron_tagger', quiet=True)\n",
    "nltk.download('omw-1.4', quiet=True)  # For WordNet lemmatizer\n",
    "nltk.download('brown', quiet=True)  # Required for TextBlob\n",
    "\n",
    "# Download TextBlob corpora\n",
    "try:\n",
    "    from textblob import download_corpora\n",
    "    download_corpora.main()\n",
    "except:\n",
    "    # Alternative method if the above doesn't work\n",
    "    import subprocess\n",
    "    import sys\n",
    "    try:\n",
    "        subprocess.run([sys.executable, \"-m\", \"textblob.download_corpora\"], \n",
    "                      capture_output=True, text=True, timeout=30)\n",
    "    except:\n",
    "        print(\"Warning: Could not download TextBlob corpora. Sentiment analysis may not work properly.\")\n",
    "        print(\"Please run: python -m textblob.download_corpora\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "db7c1e72-7960-4968-9a72-0f62ca7140d9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "load_dotenv(override=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "bded62da-82ab-4e17-bbf5-3edfe1b39398",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ===== SMART COLUMN DETECTOR =====\n",
    "class SmartColumnDetector:\n",
    "    \"\"\"Intelligently detect and extract relevant columns from uploaded data\"\"\"\n",
    "    \n",
    "    def __init__(self):\n",
    "        # Keywords for detecting different column types\n",
    "        self.text_keywords = ['comment', 'feedback', 'review', 'description', 'text', \n",
    "                             'response', 'opinion', 'message', 'notes', 'remarks']\n",
    "        self.id_keywords = ['id', 'identifier', 'key', 'number', 'code', 'ref', \n",
    "                           'reference', 'index', 'uuid']\n",
    "        self.product_keywords = ['product', 'item', 'model', 'variant', 'type', \n",
    "                                'category', 'brand', 'name', 'sku']\n",
    "        self.date_keywords = ['date', 'time', 'created', 'updated', 'timestamp']\n",
    "        \n",
    "    def detect_column_types(self, df):\n",
    "        \"\"\"Detect column types based on column names and content\"\"\"\n",
    "        detected = {\n",
    "            'text_columns': [],\n",
    "            'id_columns': [],\n",
    "            'product_columns': [],\n",
    "            'date_columns': [],\n",
    "            'other_columns': []\n",
    "        }\n",
    "        \n",
    "        for col in df.columns:\n",
    "            col_lower = col.lower()\n",
    "            \n",
    "            # Check for text columns\n",
    "            if any(keyword in col_lower for keyword in self.text_keywords):\n",
    "                detected['text_columns'].append(col)\n",
    "            # Check for ID columns\n",
    "            elif any(keyword in col_lower for keyword in self.id_keywords):\n",
    "                detected['id_columns'].append(col)\n",
    "            # Check for product columns\n",
    "            elif any(keyword in col_lower for keyword in self.product_keywords):\n",
    "                detected['product_columns'].append(col)\n",
    "            # Check for date columns\n",
    "            elif any(keyword in col_lower for keyword in self.date_keywords):\n",
    "                detected['date_columns'].append(col)\n",
    "            else:\n",
    "                # Analyze content to determine type\n",
    "                sample = df[col].dropna().head(100)\n",
    "                if len(sample) > 0:\n",
    "                    # Check if mostly text\n",
    "                    if df[col].dtype == 'object':\n",
    "                        avg_length = sample.astype(str).str.len().mean()\n",
    "                        if avg_length > 50:  # Likely text content\n",
    "                            detected['text_columns'].append(col)\n",
    "                        elif avg_length < 20 and df[col].nunique() / len(df) > 0.5:\n",
    "                            detected['id_columns'].append(col)\n",
    "                        else:\n",
    "                            detected['product_columns'].append(col)\n",
    "                    else:\n",
    "                        detected['other_columns'].append(col)\n",
    "        \n",
    "        return detected\n",
    "    \n",
    "    def extract_relevant_data(self, df):\n",
    "        \"\"\"Extract only relevant columns and create optimized dataset\"\"\"\n",
    "        detected = self.detect_column_types(df)\n",
    "        \n",
    "        # Create new dataframe with relevant columns\n",
    "        extracted_data = pd.DataFrame()\n",
    "        \n",
    "        # Add unique identifier\n",
    "        if detected['id_columns'] and len(detected['id_columns']) > 0:\n",
    "            extracted_data['unique_id'] = df[detected['id_columns'][0]]\n",
    "        else:\n",
    "            extracted_data['unique_id'] = range(1, len(df) + 1)\n",
    "        \n",
    "        # Add product information\n",
    "        if detected['product_columns'] and len(detected['product_columns']) > 0:\n",
    "            # Convert to list if needed and limit to 2 product columns\n",
    "            product_cols = list(detected['product_columns'])[:2]\n",
    "            for col in product_cols:\n",
    "                extracted_data[f'product_{col}'] = df[col]\n",
    "        \n",
    "        # Combine text columns\n",
    "        if detected['text_columns'] and len(detected['text_columns']) > 0:\n",
    "            text_cols = list(detected['text_columns'])  # Ensure it's a list\n",
    "            text_data = []\n",
    "            for idx in df.index:\n",
    "                combined_text = ' '.join([\n",
    "                    str(df.loc[idx, col]) \n",
    "                    for col in text_cols \n",
    "                    if col in df.columns and pd.notna(df.loc[idx, col])\n",
    "                ])\n",
    "                text_data.append(combined_text)\n",
    "            extracted_data['combined_text'] = text_data\n",
    "        else:\n",
    "            # If no text columns detected, create empty combined_text\n",
    "            extracted_data['combined_text'] = [''] * len(df)\n",
    "        \n",
    "        # Add date columns\n",
    "        if detected['date_columns'] and len(detected['date_columns']) > 0:\n",
    "            extracted_data['date'] = pd.to_datetime(df[detected['date_columns'][0]], errors='coerce')\n",
    "        \n",
    "        return extracted_data, detected"
   ]
  },
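  {
   "cell_type": "markdown",
   "id": "demo-column-detector-md",
   "metadata": {},
   "source": [
    "A minimal sanity check for `SmartColumnDetector` (a sketch added for illustration: `demo_df` and its column names are invented, not from a real dataset):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "demo-column-detector",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sketch (invented data): exercise the detector on a tiny DataFrame.\n",
    "demo_df = pd.DataFrame({\n",
    "    'order_id': ['A-100', 'A-101', 'A-102'],\n",
    "    'product_name': ['Kettle', 'Toaster', 'Kettle'],\n",
    "    'customer_feedback': [\n",
    "        'Arrived late and the packaging was damaged, but support was helpful.',\n",
    "        'Great value for the price, heats up quickly and feels well built.',\n",
    "        'Stopped working after a week - very disappointed with the quality.'\n",
    "    ],\n",
    "    'created_date': ['2024-01-05', '2024-01-09', '2024-02-14']\n",
    "})\n",
    "\n",
    "detector = SmartColumnDetector()\n",
    "extracted, detected = detector.extract_relevant_data(demo_df)\n",
    "print(detected)   # keyword matching routes each column to text/id/product/date\n",
    "extracted.head()  # unique_id, product_*, combined_text, date"
   ]
  },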
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "626af7bf-b4cf-4259-b409-18e5225555aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ===== ENHANCED TEXT PROCESSOR =====\n",
    "class EnhancedTextProcessor:\n",
    "    \"\"\"Enhanced text preprocessing with actionable insights extraction\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        self.lemmatizer = WordNetLemmatizer()\n",
    "        self.stop_words = set(stopwords.words('english'))\n",
    "        \n",
    "        # Initialize actionable insights dictionary with common customer feedback phrases\n",
    "        self.actionable_dictionary = {\n",
    "            'improve speed': ['slow', 'faster', 'quick', 'speed up', 'takes too long', 'waiting'],\n",
    "            'better quality': ['poor quality', 'cheap', 'breaks', 'defective', 'flimsy', 'weak'],\n",
    "            'enhance ui': ['confusing', 'hard to use', 'complicated', 'not intuitive', 'difficult to navigate'],\n",
    "            'fix bugs': ['bug', 'error', 'crash', 'freeze', 'not working', 'glitch', 'broken'],\n",
    "            'add features': ['missing', 'need', 'want', 'should have', 'would be nice', 'lacks'],\n",
    "            'improve support': ['no response', 'unhelpful', 'rude', 'poor service', 'bad support'],\n",
    "            'better packaging': ['damaged', 'poor packaging', 'arrived broken', 'not protected'],\n",
    "            'clearer docs': ['unclear', 'no instructions', 'confusing manual', 'hard to understand'],\n",
    "            'reduce price': ['expensive', 'overpriced', 'too costly', 'not worth', 'cheaper'],\n",
    "            'faster delivery': ['late', 'delayed', 'slow shipping', 'took forever', 'still waiting'],\n",
    "            'better communication': ['no updates', 'not informed', 'lack of communication', 'no tracking'],\n",
    "            'improve reliability': ['unreliable', 'stops working', 'inconsistent', 'sometimes works'],\n",
    "            'enhance performance': ['slow performance', 'laggy', 'sluggish', 'not responsive'],\n",
    "            'better design': ['ugly', 'poor design', 'looks cheap', 'not attractive', 'outdated look'],\n",
    "            'more options': ['limited options', 'no variety', 'need more choices', 'only one option']\n",
    "        }\n",
    "\n",
    "    def clean_text(self, text):\n",
    "        \"\"\"Clean and normalize text\"\"\"\n",
    "        if pd.isna(text) or text == '':\n",
    "            return \"\"\n",
    "\n",
    "        text = str(text).lower()\n",
    "        text = re.sub(r'[^a-zA-Z0-9\\s]', '', text)\n",
    "        text = ' '.join(text.split())\n",
    "        return text\n",
    "\n",
    "    def extract_actionable_insights(self, text):\n",
    "        \"\"\"Extract actionable insights using dictionary matching\"\"\"\n",
    "        if pd.isna(text) or text == '':\n",
    "            return \"\"\n",
    "        \n",
    "        text_lower = text.lower()\n",
    "        found_insights = []\n",
    "        \n",
    "        # Check each actionable item against the text\n",
    "        for action, keywords in self.actionable_dictionary.items():\n",
    "            for keyword in keywords:\n",
    "                if keyword in text_lower:\n",
    "                    found_insights.append(action)\n",
    "                    break  # Only add each action once\n",
    "        \n",
    "        # Return top 3 most relevant insights\n",
    "        if found_insights:\n",
    "            return ', '.join(found_insights[:3])\n",
    "        return \"\"\n",
    "\n",
    "    def extract_specific_topics(self, text):\n",
    "        \"\"\"Extract specific topics from text using keyword extraction\"\"\"\n",
    "        if pd.isna(text) or text == '' or len(text) < 10:\n",
    "            return ['', '', '']\n",
    "        \n",
    "        # Clean text first\n",
    "        text_lower = text.lower()\n",
    "        \n",
    "        # Remove stopwords for better topic extraction\n",
    "        words = word_tokenize(text_lower)\n",
    "        filtered_words = [w for w in words if w not in self.stop_words and len(w) > 3]\n",
    "        \n",
    "        # Extract noun phrases and important terms\n",
    "        blob = TextBlob(text)\n",
    "        noun_phrases = blob.noun_phrases\n",
    "        \n",
    "        # Combine noun phrases with high-frequency meaningful words\n",
    "        topics = []\n",
    "        \n",
    "        # Add noun phrases (these are usually good topics)\n",
    "        for phrase in noun_phrases[:5]:  # Limit to top 5 noun phrases\n",
    "            if len(phrase.split()) <= 3:  # Only short phrases\n",
    "                topics.append(phrase)\n",
    "        \n",
    "        # Add frequent meaningful words if we don't have enough topics\n",
    "        if len(topics) < 3:\n",
    "            word_freq = Counter(filtered_words)\n",
    "            for word, _ in word_freq.most_common(5):\n",
    "                if word not in str(topics):  # Avoid duplicates\n",
    "                    topics.append(word)\n",
    "                if len(topics) >= 3:\n",
    "                    break\n",
    "        \n",
    "        # Ensure we always return 3 items (empty string if not enough topics)\n",
    "        topics = topics[:3]\n",
    "        while len(topics) < 3:\n",
    "            topics.append('')\n",
    "        \n",
    "        return topics\n",
    "\n",
    "    def determine_topic(self, text):\n",
    "        \"\"\"Legacy method kept for compatibility - returns first specific topic\"\"\"\n",
    "        topics = self.extract_specific_topics(text)\n",
    "        return topics[0] if topics[0] else 'General'"
   ]
  },
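  {
   "cell_type": "markdown",
   "id": "demo-text-processor-md",
   "metadata": {},
   "source": [
    "A quick illustrative check of `EnhancedTextProcessor` (the sample sentence is made up; exact topics depend on the NLTK/TextBlob corpora downloaded above):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "demo-text-processor",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sketch (invented sample text).\n",
    "tp = EnhancedTextProcessor()\n",
    "sample = 'The app is slow and keeps crashing, and the support team was unhelpful.'\n",
    "print(tp.extract_actionable_insights(sample))  # dictionary matching -> e.g. 'improve speed, fix bugs, improve support'\n",
    "print(tp.extract_specific_topics(sample))      # always a list of 3 (padded with '' if needed)"
   ]
  },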
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "b2eb5f17-7400-4591-8c0e-de7645b87c72",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ===== SEARCH ENGINE =====\n",
    "class TextSearchEngine:\n",
    "    \"\"\"Advanced search functionality for text data with semantic capabilities\"\"\"\n",
    "    \n",
    "    def __init__(self):\n",
    "        self.vectorizer = TfidfVectorizer(\n",
    "            max_features=1000,\n",
    "            ngram_range=(1, 3),  # Include unigrams, bigrams, and trigrams for better matching\n",
    "            stop_words='english',\n",
    "            use_idf=True,\n",
    "            smooth_idf=True,\n",
    "            sublinear_tf=True  # Apply sublinear tf scaling\n",
    "        )\n",
    "        self.tfidf_matrix = None\n",
    "        self.data = None\n",
    "        \n",
    "        # Synonym dictionary for semantic search\n",
    "        self.synonyms = {\n",
    "            'fast': ['quick', 'rapid', 'speedy', 'swift', 'prompt'],\n",
    "            'slow': ['sluggish', 'delayed', 'laggy', 'lengthy', 'prolonged'],\n",
    "            'good': ['excellent', 'great', 'wonderful', 'fantastic', 'amazing', 'positive'],\n",
    "            'bad': ['poor', 'terrible', 'awful', 'negative', 'horrible', 'disappointing'],\n",
    "            'problem': ['issue', 'bug', 'error', 'defect', 'fault', 'glitch'],\n",
    "            'help': ['support', 'assistance', 'aid', 'service'],\n",
    "            'price': ['cost', 'fee', 'charge', 'rate', 'payment', 'expensive', 'cheap'],\n",
    "            'quality': ['standard', 'grade', 'condition', 'caliber'],\n",
    "            'delivery': ['shipping', 'dispatch', 'arrival', 'transport'],\n",
    "            'easy': ['simple', 'straightforward', 'effortless', 'user-friendly'],\n",
    "            'hard': ['difficult', 'complex', 'complicated', 'challenging'],\n",
    "            'broken': ['damaged', 'defective', 'faulty', 'malfunctioning'],\n",
    "            'love': ['like', 'enjoy', 'appreciate', 'adore'],\n",
    "            'hate': ['dislike', 'despise', 'detest'],\n",
    "            'feature': ['function', 'capability', 'option', 'characteristic'],\n",
    "            'customer': ['client', 'buyer', 'purchaser', 'consumer', 'user']\n",
    "        }\n",
    "        \n",
    "    def expand_query_with_synonyms(self, query):\n",
    "        \"\"\"Expand search query with synonyms for better semantic matching\"\"\"\n",
    "        query_words = query.lower().split()\n",
    "        expanded_terms = []\n",
    "        \n",
    "        for word in query_words:\n",
    "            # Add the original word\n",
    "            expanded_terms.append(word)\n",
    "            \n",
    "            # Add synonyms if available\n",
    "            if word in self.synonyms:\n",
    "                expanded_terms.extend(self.synonyms[word])\n",
    "            \n",
    "            # Check if word is a synonym of something else\n",
    "            for key, syns in self.synonyms.items():\n",
    "                if word in syns:\n",
    "                    expanded_terms.append(key)\n",
    "                    expanded_terms.extend([s for s in syns if s != word])\n",
    "        \n",
    "        # Remove duplicates while preserving order\n",
    "        seen = set()\n",
    "        unique_terms = []\n",
    "        for term in expanded_terms:\n",
    "            if term not in seen:\n",
    "                unique_terms.append(term)\n",
    "                seen.add(term)\n",
    "        \n",
    "        return ' '.join(unique_terms)\n",
    "        \n",
    "    def build_index(self, df, text_column):\n",
    "        \"\"\"Build search index from text data\"\"\"\n",
    "        self.data = df.copy()\n",
    "        texts = df[text_column].fillna('').tolist()\n",
    "        \n",
    "        # Add other searchable columns to improve search\n",
    "        if 'topic_1' in df.columns:\n",
    "            texts = [f\"{text} {df.iloc[i]['topic_1']} {df.iloc[i]['topic_2']} {df.iloc[i]['topic_3']}\" \n",
    "                    for i, text in enumerate(texts)]\n",
    "        if 'actionable_insights' in df.columns:\n",
    "            texts = [f\"{texts[i]} {df.iloc[i]['actionable_insights']}\" \n",
    "                    for i in range(len(texts))]\n",
    "            \n",
    "        self.tfidf_matrix = self.vectorizer.fit_transform(texts)\n",
    "        \n",
    "    def search(self, query, top_k=10):\n",
    "        \"\"\"Enhanced search with semantic understanding\"\"\"\n",
    "        if self.tfidf_matrix is None:\n",
    "            return pd.DataFrame()\n",
    "        \n",
    "        # Expand query with synonyms\n",
    "        expanded_query = self.expand_query_with_synonyms(query)\n",
    "        \n",
    "        # Vectorize both original and expanded queries\n",
    "        query_vector = self.vectorizer.transform([query])\n",
    "        expanded_vector = self.vectorizer.transform([expanded_query])\n",
    "        \n",
    "        # Calculate similarities for both\n",
    "        similarities_orig = cosine_similarity(query_vector, self.tfidf_matrix).flatten()\n",
    "        similarities_exp = cosine_similarity(expanded_vector, self.tfidf_matrix).flatten()\n",
    "        \n",
    "        # Combine scores (weighted average - original query gets more weight)\n",
    "        combined_similarities = (0.7 * similarities_orig + 0.3 * similarities_exp)\n",
    "        \n",
    "        # Get top results\n",
    "        top_indices = combined_similarities.argsort()[-top_k:][::-1]\n",
    "        top_scores = combined_similarities[top_indices]\n",
    "        \n",
    "        # Filter results with score > 0.05 (lower threshold for better recall)\n",
    "        valid_indices = [idx for idx, score in zip(top_indices, top_scores) if score > 0.05]\n",
    "        \n",
    "        if valid_indices:\n",
    "            results = self.data.iloc[valid_indices].copy()\n",
    "            results['search_score'] = [combined_similarities[idx] for idx in valid_indices]\n",
    "            \n",
    "            # Boost results that have exact matches\n",
    "            query_lower = query.lower()\n",
    "            for idx in results.index:\n",
    "                if 'combined_text' in results.columns:\n",
    "                    if query_lower in str(results.at[idx, 'combined_text']).lower():\n",
    "                        results.at[idx, 'search_score'] *= 1.5  # Boost exact matches\n",
    "                        \n",
    "            return results.sort_values('search_score', ascending=False)\n",
    "        \n",
    "        return pd.DataFrame()\n"
   ]
  },
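  {
   "cell_type": "markdown",
   "id": "demo-search-engine-md",
   "metadata": {},
   "source": [
    "A small illustrative run of `TextSearchEngine` (the three documents are invented). Note how synonym expansion lets the query `slow delivery` match a document that only says `sluggish` and `shipping`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "demo-search-engine",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sketch (invented documents): index three texts and search them.\n",
    "engine = TextSearchEngine()\n",
    "tiny_df = pd.DataFrame({'combined_text': [\n",
    "    'Shipping was sluggish and the parcel arrived damaged.',\n",
    "    'Excellent product, support resolved my issue quickly.',\n",
    "    'The interface is complicated and hard to navigate.'\n",
    "]})\n",
    "engine.build_index(tiny_df, 'combined_text')\n",
    "\n",
    "print(engine.expand_query_with_synonyms('slow delivery'))  # adds sluggish, shipping, ...\n",
    "hits = engine.search('slow delivery', top_k=3)\n",
    "hits[['combined_text', 'search_score']] if not hits.empty else 'no hits'"
   ]
  },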
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "e8b88155-971f-4dd5-b26c-104a737bc426",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ===== API CONFIGURATION =====\n",
    "class AIModelManager:\n",
    "    \"\"\"Manages multiple AI model APIs and provides unified interface\"\"\"\n",
    "    \n",
    "    def __init__(self):\n",
    "        self.available_models = {}\n",
    "        self.clients = {}\n",
    "        self.current_model = None\n",
    "        self.initialize_apis()\n",
    "        \n",
    "    def initialize_apis(self):\n",
    "        \"\"\"Initialize all available AI APIs\"\"\"\n",
    "        \n",
    "        # Anthropic\n",
    "        ANTHROPIC_API_KEY = os.getenv(\"ANTHROPIC_API_KEY\")\n",
    "        if ANTHROPIC_API_KEY:\n",
    "            try:\n",
    "                self.clients['anthropic'] = Anthropic(api_key=ANTHROPIC_API_KEY)\n",
    "                self.available_models['Claude 3 Haiku'] = {\n",
    "                    'provider': 'anthropic',\n",
    "                    'model': 'claude-3-haiku-20240307'\n",
    "                }\n",
    "                print(f\"Anthropic API Key exists and begins {ANTHROPIC_API_KEY[:4]}\")\n",
    "            except Exception as e:\n",
    "                print(f\"Error initializing Anthropic: {e}\")\n",
    "        else:\n",
    "            print(\"Anthropic API Key not set\")\n",
    "            \n",
    "        # OpenAI\n",
    "        OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
    "        if OPENAI_API_KEY and OpenAI:\n",
    "            try:\n",
    "                self.clients['openai'] = OpenAI(api_key=OPENAI_API_KEY)\n",
    "                self.available_models['GPT-4o-mini'] = {\n",
    "                    'provider': 'openai',\n",
    "                    'model': 'gpt-4o-mini'\n",
    "                }\n",
    "                self.available_models['GPT-3.5 Turbo'] = {\n",
    "                    'provider': 'openai',\n",
    "                    'model': 'gpt-3.5-turbo'\n",
    "                }\n",
    "                print(f\"OpenAI API Key exists and begins {OPENAI_API_KEY[:7]}\")\n",
    "            except Exception as e:\n",
    "                print(f\"Error initializing OpenAI: {e}\")\n",
    "        else:\n",
    "            print(\"OpenAI API Key not set or library not installed\")\n",
    "            \n",
    "        # Deepseek (uses OpenAI-compatible API)\n",
    "        DEEPSEEK_API_KEY = os.getenv(\"DEEPSEEK_API_KEY\")\n",
    "        if DEEPSEEK_API_KEY and OpenAI:\n",
    "            try:\n",
    "                self.clients['deepseek'] = OpenAI(\n",
    "                    api_key=DEEPSEEK_API_KEY,\n",
    "                    base_url=\"https://api.deepseek.com\"\n",
    "                )\n",
    "                self.available_models['Deepseek Chat'] = {\n",
    "                    'provider': 'deepseek',\n",
    "                    'model': 'deepseek-chat'\n",
    "                }\n",
    "                print(f\"Deepseek API Key exists and begins {DEEPSEEK_API_KEY[:7]}\")\n",
    "            except Exception as e:\n",
    "                print(f\"Error initializing Deepseek: {e}\")\n",
    "        else:\n",
    "            print(\"Deepseek API Key not set or OpenAI library not installed\")\n",
    "            \n",
    "        # Groq\n",
    "        GROQ_API_KEY = os.getenv(\"GROQ_API_KEY\")\n",
    "        if GROQ_API_KEY and Groq:\n",
    "            try:\n",
    "                self.clients['groq'] = Groq(api_key=GROQ_API_KEY)\n",
    "                self.available_models['Llama 3.3 70B'] = {\n",
    "                    'provider': 'groq',\n",
    "                    'model': 'llama-3.3-70b-versatile'\n",
    "                }\n",
    "                self.available_models['Mixtral 8x7B'] = {\n",
    "                    'provider': 'groq',\n",
    "                    'model': 'mixtral-8x7b-32768'\n",
    "                }\n",
    "                print(f\"Groq API Key exists and begins {GROQ_API_KEY[:4]}\")\n",
    "            except Exception as e:\n",
    "                print(f\"Error initializing Groq: {e}\")\n",
    "        else:\n",
    "            print(\"Groq API Key not set or library not installed\")\n",
    "            \n",
    "        # Google Gemini\n",
    "        GOOGLE_API_KEY = os.getenv(\"GOOGLE_API_KEY\")\n",
    "        if GOOGLE_API_KEY and genai:\n",
    "            try:\n",
    "                genai.configure(api_key=GOOGLE_API_KEY)\n",
    "                self.clients['google'] = genai\n",
    "                self.available_models['Gemini 1.5 Flash'] = {\n",
    "                    'provider': 'google',\n",
    "                    'model': 'gemini-1.5-flash'\n",
    "                }\n",
    "                self.available_models['Gemini 1.5 Pro'] = {\n",
    "                    'provider': 'google',\n",
    "                    'model': 'gemini-1.5-pro'\n",
    "                }\n",
    "                print(f\"Google API Key exists and begins {GOOGLE_API_KEY[:2]}\")\n",
    "            except Exception as e:\n",
    "                print(f\"Error initializing Google Gemini: {e}\")\n",
    "        else:\n",
    "            print(\"Google API Key not set or library not installed\")\n",
    "            \n",
    "        # Set default model\n",
    "        if self.available_models:\n",
    "            self.current_model = list(self.available_models.keys())[0]\n",
    "            \n",
    "    def get_available_models(self):\n",
    "        \"\"\"Return list of available model names\"\"\"\n",
    "        return list(self.available_models.keys())\n",
    "    \n",
    "    def set_model(self, model_name):\n",
    "        \"\"\"Set the current model\"\"\"\n",
    "        if model_name in self.available_models:\n",
    "            self.current_model = model_name\n",
    "            return True\n",
    "        return False\n",
    "    \n",
    "    def generate_text(self, prompt, max_tokens=1000):\n",
    "        \"\"\"Generate text using the current model\"\"\"\n",
    "        if not self.current_model or self.current_model not in self.available_models:\n",
    "            return None\n",
    "            \n",
    "        model_info = self.available_models[self.current_model]\n",
    "        provider = model_info['provider']\n",
    "        model = model_info['model']\n",
    "        \n",
    "        try:\n",
    "            if provider == 'anthropic':\n",
    "                client = self.clients['anthropic']\n",
    "                response = client.messages.create(\n",
    "                    model=model,\n",
    "                    max_tokens=max_tokens,\n",
    "                    messages=[{\"role\": \"user\", \"content\": prompt}]\n",
    "                )\n",
    "                return response.content[0].text\n",
    "                \n",
    "            elif provider in ['openai', 'deepseek']:\n",
    "                client = self.clients[provider]\n",
    "                response = client.chat.completions.create(\n",
    "                    model=model,\n",
    "                    messages=[{\"role\": \"user\", \"content\": prompt}],\n",
    "                    max_tokens=max_tokens\n",
    "                )\n",
    "                return response.choices[0].message.content\n",
    "                \n",
    "            elif provider == 'groq':\n",
    "                client = self.clients['groq']\n",
    "                response = client.chat.completions.create(\n",
    "                    model=model,\n",
    "                    messages=[{\"role\": \"user\", \"content\": prompt}],\n",
    "                    max_tokens=max_tokens\n",
    "                )\n",
    "                return response.choices[0].message.content\n",
    "                \n",
    "            elif provider == 'google':\n",
    "                model_obj = genai.GenerativeModel(model)\n",
    "                response = model_obj.generate_content(prompt)\n",
    "                return response.text\n",
    "                \n",
    "        except Exception as e:\n",
    "            print(f\"Error generating text with {self.current_model}: {e}\")\n",
    "            return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "809f4c47-6ea8-4eaa-bac1-5ca83daac733",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Anthropic API Key exists and begins sk-a\n",
      "OpenAI API Key exists and begins sk-proj\n",
      "Deepseek API Key exists and begins sk-1099\n",
      "Groq API Key exists and begins gsk_\n",
      "Google API Key exists and begins AI\n"
     ]
    }
   ],
   "source": [
    "# Initialize the model manager globally\n",
    "model_manager = AIModelManager()"
   ]
  },
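  {
   "cell_type": "markdown",
   "id": "demo-model-manager-md",
   "metadata": {},
   "source": [
    "An optional smoke test for `AIModelManager` (the prompt is arbitrary; this makes one real API call, so it only does anything if at least one key was configured above):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "demo-model-manager",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative smoke test: list configured models and try one short generation.\n",
    "print(model_manager.get_available_models())\n",
    "if model_manager.current_model:\n",
    "    print(model_manager.generate_text('Reply with the single word: ready', max_tokens=10))"
   ]
  },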
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ad5f99f2-efd9-4759-88dc-df7f2f5359fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ===== ENHANCED ANALYZER WITH MULTI-MODEL SUPPORT =====\n",
    "\n",
    "class EnhancedTextAnalyzer:\n",
    "    \"\"\"Main analysis engine with all enhanced features and multi-model support\"\"\"\n",
    "    \n",
    "    def __init__(self, model_manager=None):\n",
    "        self.model_manager = model_manager\n",
    "        self.column_detector = SmartColumnDetector()\n",
    "        self.text_processor = EnhancedTextProcessor()\n",
    "        self.search_engine = TextSearchEngine()\n",
    "        self.original_df = None\n",
    "        self.processed_df = None\n",
    "        self.results = {}\n",
    "        self.visualizations = {}\n",
    "        \n",
    "    def load_file(self, file):\n",
    "        \"\"\"Load data from various file formats\"\"\"\n",
    "        try:\n",
    "            if file.name.endswith('.csv'):\n",
    "                df = pd.read_csv(file.name)\n",
    "            elif file.name.endswith(('.xlsx', '.xls')):\n",
    "                df = pd.read_excel(file.name)\n",
    "            elif file.name.endswith('.json'):\n",
    "                df = pd.read_json(file.name)\n",
    "            else:\n",
    "                return None, \"Unsupported file format\"\n",
    "            \n",
    "            return df, f\"File loaded: {len(df)} records\"\n",
    "        except Exception as e:\n",
    "            return None, f\"Error loading file: {str(e)}\"\n",
    "    \n",
    "    def process_data(self, df):\n",
    "        \"\"\"Process data with smart extraction and analysis\"\"\"\n",
    "        # Extract relevant columns\n",
    "        extracted_df, detected_columns = self.column_detector.extract_relevant_data(df)\n",
    "        \n",
    "        # Store for reference\n",
    "        self.processed_df = extracted_df\n",
    "        \n",
    "        # Clear original from memory\n",
    "        del df\n",
    "        gc.collect()\n",
    "        \n",
    "        # Add analysis columns\n",
    "        if 'combined_text' in extracted_df.columns:\n",
    "            # Sentiment analysis\n",
    "            sentiments = []\n",
    "            polarities = []\n",
    "            topics_1 = []\n",
    "            topics_2 = []\n",
    "            topics_3 = []\n",
    "            insights = []\n",
    "            \n",
    "            for text in extracted_df['combined_text']:\n",
    "                # Sentiment\n",
    "                blob = TextBlob(text)\n",
    "                polarity = blob.sentiment.polarity\n",
    "                if polarity > 0.1:\n",
    "                    sentiment = 'Positive'\n",
    "                elif polarity < -0.1:\n",
    "                    sentiment = 'Negative'\n",
    "                else:\n",
    "                    sentiment = 'Neutral'\n",
    "                \n",
    "                sentiments.append(sentiment)\n",
    "                polarities.append(polarity)\n",
    "                \n",
    "                # Extract specific topics (3 separate topics)\n",
    "                specific_topics = self.text_processor.extract_specific_topics(text)\n",
    "                topics_1.append(specific_topics[0])\n",
    "                topics_2.append(specific_topics[1])\n",
    "                topics_3.append(specific_topics[2])\n",
    "                \n",
    "                # Actionable insights using dictionary matching\n",
    "                insight = self.text_processor.extract_actionable_insights(text)\n",
    "                insights.append(insight)\n",
    "            \n",
    "            extracted_df['sentiment'] = sentiments\n",
    "            extracted_df['sentiment_score'] = polarities\n",
    "            extracted_df['topic_1'] = topics_1\n",
    "            extracted_df['topic_2'] = topics_2\n",
    "            extracted_df['topic_3'] = topics_3\n",
    "            extracted_df['actionable_insights'] = insights\n",
    "            \n",
    "            # Build search index with enhanced search capabilities\n",
    "            self.search_engine.build_index(extracted_df, 'combined_text')\n",
    "        \n",
    "        # Save processed data\n",
    "        output_file = 'processed_data.xlsx'\n",
    "        extracted_df.to_excel(output_file, index=False)\n",
    "        \n",
    "        return extracted_df, detected_columns, output_file\n",
    "    \n",
    "    def generate_ai_insights(self, df, num_samples=5):\n",
    "        \"\"\"Generate AI-powered insights using selected model\"\"\"\n",
    "        if not self.model_manager or not self.model_manager.current_model:\n",
    "            return \"No AI model available for generating insights\"\n",
    "        \n",
    "        if 'combined_text' not in df.columns or df.empty:\n",
    "            return \"No text data available for AI analysis\"\n",
    "        \n",
    "        # Sample some texts for analysis\n",
    "        sample_texts = df['combined_text'].dropna().head(num_samples).tolist()\n",
    "        if not sample_texts:\n",
    "            return \"No valid text samples found\"\n",
    "        \n",
    "        # Create prompt for AI analysis\n",
    "        prompt = f\"\"\"Analyze the following customer feedback samples and provide key insights:\n",
    "\n",
    "Samples:\n",
    "{chr(10).join([f\"{i+1}. {text[:200]}...\" if len(text) > 200 else f\"{i+1}. {text}\" for i, text in enumerate(sample_texts)])}\n",
    "\n",
    "Please provide:\n",
    "1. Main themes and patterns\n",
    "2. Key sentiment indicators\n",
    "3. Actionable recommendations\n",
    "4. Areas of concern\n",
    "\n",
    "Keep the response concise and focused on actionable insights.\"\"\"\n",
    "\n",
    "        # Generate insights using selected model\n",
    "        try:\n",
    "            response = self.model_manager.generate_text(prompt, max_tokens=500)\n",
    "            if response:\n",
    "                return f\"**AI Insights (using {self.model_manager.current_model}):**\\n\\n{response}\"\n",
    "            else:\n",
    "                return \"Failed to generate AI insights. Please check your API configuration.\"\n",
    "        except Exception as e:\n",
    "            return f\"Error generating AI insights: {str(e)}\"\n",
    "    \n",
    "    def generate_visualizations(self, df):\n",
    "        \"\"\"Generate various visualizations\"\"\"\n",
    "        visualizations = {}\n",
    "        \n",
    "        if 'sentiment' in df.columns:\n",
    "            # Sentiment distribution\n",
    "            sentiment_counts = df['sentiment'].value_counts()\n",
    "            fig_sentiment = px.pie(\n",
    "                values=sentiment_counts.values,\n",
    "                names=sentiment_counts.index,\n",
    "                title=\"Sentiment Distribution\",\n",
    "                color_discrete_map={\n",
    "                    'Positive': '#27AE60',\n",
    "                    'Negative': '#E74C3C',\n",
    "                    'Neutral': '#95A5A6'\n",
    "                }\n",
    "            )\n",
    "            visualizations['Sentiment Distribution'] = fig_sentiment\n",
    "            \n",
    "        if 'topic_1' in df.columns:\n",
    "            # Combine all topics for overall topic distribution\n",
    "            all_topics = []\n",
    "            for col in ['topic_1', 'topic_2', 'topic_3']:\n",
    "                if col in df.columns:\n",
    "                    topics = df[col].dropna().tolist()\n",
    "                    all_topics.extend([t for t in topics if t != ''])\n",
    "            \n",
    "            if all_topics:\n",
    "                topic_counts = Counter(all_topics)\n",
    "                top_topics = dict(topic_counts.most_common(15))\n",
    "                \n",
    "                fig_topics = px.bar(\n",
    "                    x=list(top_topics.values()),\n",
    "                    y=list(top_topics.keys()),\n",
    "                    orientation='h',\n",
    "                    title=\"Top 15 Specific Topics\",\n",
    "                    labels={'x': 'Count', 'y': 'Topic'}\n",
    "                )\n",
    "                visualizations['Topic Distribution'] = fig_topics\n",
    "            \n",
    "        if 'sentiment' in df.columns and 'topic_1' in df.columns:\n",
    "            # Sentiment by primary topic (topic_1)\n",
    "            df_temp = df[df['topic_1'] != ''].copy()\n",
    "            if not df_temp.empty:\n",
    "                # Get top 10 topics for cleaner visualization\n",
    "                top_topics = df_temp['topic_1'].value_counts().head(10).index\n",
    "                df_filtered = df_temp[df_temp['topic_1'].isin(top_topics)]\n",
    "                \n",
    "                pivot_table = pd.crosstab(df_filtered['topic_1'], df_filtered['sentiment'])\n",
    "                fig_heatmap = px.imshow(\n",
    "                    pivot_table,\n",
    "                    labels=dict(x=\"Sentiment\", y=\"Primary Topic\", color=\"Count\"),\n",
    "                    title=\"Sentiment by Primary Topic Heatmap\",\n",
    "                    color_continuous_scale=\"RdYlGn\"\n",
    "                )\n",
    "                visualizations['Sentiment by Topic'] = fig_heatmap\n",
    "            \n",
    "        if 'date' in df.columns and 'sentiment' in df.columns:\n",
    "            # Sentiment over time\n",
    "            df_time = df.copy()\n",
    "            df_time['date'] = pd.to_datetime(df_time['date'])\n",
    "            time_data = df_time.groupby([pd.Grouper(key='date', freq='M'), 'sentiment']).size().reset_index(name='count')\n",
    "            \n",
    "            fig_timeline = px.line(\n",
    "                time_data,\n",
    "                x='date',\n",
    "                y='count',\n",
    "                color='sentiment',\n",
    "                title=\"Sentiment Trends Over Time\",\n",
    "                color_discrete_map={\n",
    "                    'Positive': '#27AE60',\n",
    "                    'Negative': '#E74C3C',\n",
    "                    'Neutral': '#95A5A6'\n",
    "                }\n",
    "            )\n",
    "            visualizations['Sentiment Timeline'] = fig_timeline\n",
    "        \n",
    "        if 'actionable_insights' in df.columns:\n",
    "            # Top actionable insights\n",
    "            all_insights = []\n",
    "            for insight in df['actionable_insights']:\n",
    "                if insight and insight != \"\":\n",
    "                    # Split by comma as we're now using comma-separated insights\n",
    "                    all_insights.extend([i.strip() for i in insight.split(',')])\n",
    "            \n",
    "            if all_insights:\n",
    "                insight_counts = Counter(all_insights)\n",
    "                top_insights = dict(insight_counts.most_common(10))\n",
    "                \n",
    "                fig_insights = px.bar(\n",
    "                    x=list(top_insights.values()),\n",
    "                    y=list(top_insights.keys()),\n",
    "                    orientation='h',\n",
    "                    title=\"Top 10 Actionable Insights\",\n",
    "                    labels={'x': 'Frequency', 'y': 'Insight'}\n",
    "                )\n",
    "                visualizations['Top Insights'] = fig_insights\n",
    "        \n",
    "        return visualizations"
   ]
  },
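  {
   "cell_type": "markdown",
   "id": "demo-analyzer-md",
   "metadata": {},
   "source": [
    "An end-to-end sketch of `EnhancedTextAnalyzer.process_data` on a tiny invented dataset (writes `processed_data.xlsx`, so `openpyxl` must be installed; it skips the AI-insight step, which needs an API key):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "demo-analyzer",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sketch (invented data): run sentiment, topic, and insight extraction end to end.\n",
    "smoke = EnhancedTextAnalyzer(model_manager)\n",
    "smoke_df = pd.DataFrame({'customer_feedback': [\n",
    "    'Delivery was late and the box arrived damaged.',\n",
    "    'Love it - easy to use and great quality for the price.'\n",
    "]})\n",
    "processed, detected, path = smoke.process_data(smoke_df)\n",
    "print(detected['text_columns'], '->', path)\n",
    "processed[['unique_id', 'sentiment', 'topic_1', 'actionable_insights']]"
   ]
  },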
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "5ee86a52-b195-4010-a2b7-3abf57bf9949",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ===== GRADIO INTERFACE =====\n",
    "# Global variables\n",
    "analyzer = None\n",
    "current_data = None\n",
    "current_visualizations = None\n",
    "\n",
    "def update_model(model_name):\n",
    "    \"\"\"Update the selected AI model\"\"\"\n",
    "    global model_manager\n",
    "    \n",
    "    if model_manager.set_model(model_name):\n",
    "        return f\"βœ… Model switched to: {model_name}\"\n",
    "    else:\n",
    "        return f\"❌ Failed to switch to: {model_name}\"\n",
    "\n",
    "def process_file(file, model_name):\n",
    "    \"\"\"Process uploaded file with selected model\"\"\"\n",
    "    global analyzer, current_data, current_visualizations, model_manager\n",
    "    \n",
    "    if file is None:\n",
    "        return \"Please upload a file\", None, None, None, None, None, gr.update(choices=[])\n",
    "    \n",
    "    try:\n",
    "        # Update model if changed\n",
    "        if model_name and model_manager:\n",
    "            model_manager.set_model(model_name)\n",
    "        \n",
    "        analyzer = EnhancedTextAnalyzer(model_manager)\n",
    "        \n",
    "        # Load file\n",
    "        df, message = analyzer.load_file(file)\n",
    "        if df is None:\n",
    "            return message, None, None, None, None, None, gr.update(choices=[])\n",
    "        \n",
    "        # Process data\n",
    "        processed_df, detected_cols, output_file = analyzer.process_data(df)\n",
    "        current_data = processed_df\n",
    "        \n",
    "        # Generate visualizations\n",
    "        visualizations = analyzer.generate_visualizations(processed_df)\n",
    "        current_visualizations = visualizations\n",
    "        \n",
    "        # Generate AI insights\n",
    "        ai_insights = analyzer.generate_ai_insights(processed_df)\n",
    "        \n",
    "        # Create summary - safely handle detected columns\n",
    "        text_cols = list(detected_cols.get('text_columns', []))[:3] if detected_cols.get('text_columns') else []\n",
    "        id_cols = list(detected_cols.get('id_columns', []))[:3] if detected_cols.get('id_columns') else []\n",
    "        product_cols = list(detected_cols.get('product_columns', []))[:3] if detected_cols.get('product_columns') else []\n",
    "        \n",
    "        summary = f\"\"\"\n",
    "        ### βœ… File Processing Complete!\n",
    "        \n",
    "        **Detected Columns:**\n",
    "        - Text Columns: {', '.join(text_cols) if text_cols else 'None'}\n",
    "        - ID Columns: {', '.join(id_cols) if id_cols else 'Auto-generated'}\n",
    "        - Product Columns: {', '.join(product_cols) if product_cols else 'None'}\n",
    "        \n",
    "        **Analysis Results:**\n",
    "        - Total Records: {len(processed_df)}\n",
    "        - Processed File Saved: {output_file}\n",
    "        - AI Model Used: {model_manager.current_model if model_manager else 'None'}\n",
    "        \"\"\"\n",
    "        \n",
    "        # Data preview\n",
    "        preview = processed_df.head(10)\n",
    "        \n",
    "        # Get first visualization\n",
    "        first_viz = list(visualizations.values())[0] if visualizations else None\n",
    "        \n",
    "        return (\n",
    "            summary,\n",
    "            preview,\n",
    "            output_file,\n",
    "            ai_insights,\n",
    "            first_viz,\n",
    "            \"Ready for search\",\n",
    "            gr.update(choices=list(visualizations.keys()))\n",
    "        )\n",
    "        \n",
    "    except Exception as e:\n",
    "        return f\"Error: {str(e)}\", None, None, None, None, None, gr.update(choices=[])\n",
    "\n",
    "def search_data(query):\n",
    "    \"\"\"Search through the data with enhanced semantic search\"\"\"\n",
    "    global analyzer, current_data\n",
    "    \n",
    "    if analyzer is None or current_data is None:\n",
    "        return \"Please process a file first\", None, None\n",
    "    \n",
    "    if not query:\n",
    "        return \"Please enter a search query\", None, None\n",
    "    \n",
    "    try:\n",
    "        results = analyzer.search_engine.search(query, top_k=10)\n",
    "        \n",
    "        if results.empty:\n",
    "            return \"No results found\", None, None\n",
    "        \n",
    "        # Select relevant columns for display (updated to include new topic columns)\n",
    "        display_cols = ['unique_id', 'combined_text', 'sentiment', 'topic_1', 'topic_2', 'topic_3', 'actionable_insights', 'search_score']\n",
    "        display_cols = [col for col in display_cols if col in results.columns]\n",
    "        \n",
    "        results_display = results[display_cols]\n",
    "        \n",
    "        # Save search results\n",
    "        search_output = f\"search_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx\"\n",
    "        results_display.to_excel(search_output, index=False)\n",
    "        \n",
    "        return f\"Found {len(results)} results\", results_display.head(10), search_output\n",
    "        \n",
    "    except Exception as e:\n",
    "        return f\"Search error: {str(e)}\", None, None\n",
    "\n",
    "def update_visualization(viz_type):\n",
    "    \"\"\"Update displayed visualization\"\"\"\n",
    "    global current_visualizations\n",
    "    \n",
    "    if current_visualizations and viz_type in current_visualizations:\n",
    "        return current_visualizations[viz_type]\n",
    "    return None\n",
    "\n",
    "def export_results(format_type):\n",
    "    \"\"\"Export processed data in different formats\"\"\"\n",
    "    global current_data\n",
    "    \n",
    "    if current_data is None:\n",
    "        return \"No data to export\", None\n",
    "    \n",
    "    try:\n",
    "        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')\n",
    "        \n",
    "        if format_type == \"Excel\":\n",
    "            output_file = f\"analysis_results_{timestamp}.xlsx\"\n",
    "            current_data.to_excel(output_file, index=False)\n",
    "        else:  # CSV\n",
    "            output_file = f\"analysis_results_{timestamp}.csv\"\n",
    "            current_data.to_csv(output_file, index=False)\n",
    "        \n",
    "        return f\"Data exported to {output_file}\", output_file\n",
    "    \n",
    "    except Exception as e:\n",
    "        return f\"Export error: {str(e)}\", None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "38bf0375-9ef8-488c-821f-288c4f59ff5d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create Gradio interface\n",
    "def create_interface():\n",
    "    \"\"\"Create the Gradio interface with model selection\"\"\"\n",
    "    \n",
    "    with gr.Blocks(theme=gr.themes.Soft()) as app:\n",
    "        gr.Markdown(\n",
    "            \"\"\"\n",
    "            # πŸ“Š Enhanced Text Analytics AI Agent\n",
    "            ### Smart Column Detection & Comprehensive Text Analysis with Multiple AI Models\n",
    "            \n",
    "            **Features:**\n",
    "            - πŸ€– Multiple AI Model Support (OpenAI, Anthropic, Deepseek, Groq, Google)\n",
    "            - πŸ” Automatic detection of text, ID, and product columns\n",
    "            - πŸ’Ύ Memory-efficient processing with automatic file cleanup\n",
    "            - 😊 Sentiment analysis with scoring\n",
    "            - 🎯 Topic/theme extraction\n",
    "            - πŸ’‘ Actionable insights generation\n",
    "            - πŸ”Ž Advanced text search with similarity scoring\n",
    "            - πŸ“ˆ Multiple visualization options\n",
    "            - πŸ“₯ Export results in Excel or CSV format\n",
    "            \"\"\"\n",
    "        )\n",
    "        \n",
    "        with gr.Tab(\"πŸ“€ Upload & Process\"):\n",
    "            with gr.Row():\n",
    "                with gr.Column(scale=1):\n",
    "                    # Model selection dropdown\n",
    "                    model_dropdown = gr.Dropdown(\n",
    "                        label=\"πŸ€– Select AI Model\",\n",
    "                        choices=model_manager.get_available_models(),\n",
    "                        value=model_manager.current_model if model_manager.current_model else None,\n",
    "                        interactive=True\n",
    "                    )\n",
    "                    \n",
    "                    file_upload = gr.File(\n",
    "                        label=\"Upload Data File\",\n",
    "                        file_types=[\".csv\", \".xlsx\", \".xls\", \".json\"]\n",
    "                    )\n",
    "                    process_btn = gr.Button(\"πŸš€ Process File\", variant=\"primary\")\n",
    "                \n",
    "                with gr.Column(scale=2):\n",
    "                    status_output = gr.Markdown(label=\"Processing Status\")\n",
    "                    ai_insights = gr.Markdown(label=\"AI-Generated Insights\")\n",
    "            \n",
    "            with gr.Row():\n",
    "                data_preview = gr.Dataframe(\n",
    "                    label=\"Data Preview (First 10 rows)\",\n",
    "                    interactive=False\n",
    "                )\n",
    "            \n",
    "            processed_file = gr.File(\n",
    "                label=\"πŸ“ Processed Data File\",\n",
    "                interactive=False\n",
    "            )\n",
    "        \n",
    "        with gr.Tab(\"πŸ” Search\"):\n",
    "            gr.Markdown(\"### Search through your text data\")\n",
    "            \n",
    "            with gr.Row():\n",
    "                search_input = gr.Textbox(\n",
    "                    label=\"Enter search query\",\n",
    "                    placeholder=\"Type keywords to search...\"\n",
    "                )\n",
    "                search_btn = gr.Button(\"πŸ”Ž Search\", variant=\"primary\")\n",
    "            \n",
    "            search_status = gr.Markdown(label=\"Search Status\")\n",
    "            search_results = gr.Dataframe(\n",
    "                label=\"Search Results\",\n",
    "                interactive=False\n",
    "            )\n",
    "            search_file = gr.File(\n",
    "                label=\"πŸ“₯ Download Search Results\",\n",
    "                interactive=False\n",
    "            )\n",
    "        \n",
    "        with gr.Tab(\"πŸ“ˆ Visualizations\"):\n",
    "            with gr.Row():\n",
    "                viz_selector = gr.Dropdown(\n",
    "                    label=\"Select Visualization\",\n",
    "                    choices=[],\n",
    "                    interactive=True\n",
    "                )\n",
    "            \n",
    "            viz_plot = gr.Plot(label=\"Visualization\")\n",
    "        \n",
    "        with gr.Tab(\"πŸ“₯ Export\"):\n",
    "            gr.Markdown(\"### Export your analyzed data\")\n",
    "            \n",
    "            with gr.Row():\n",
    "                export_format = gr.Radio(\n",
    "                    choices=[\"Excel\", \"CSV\"],\n",
    "                    value=\"Excel\",\n",
    "                    label=\"Export Format\"\n",
    "                )\n",
    "                export_btn = gr.Button(\"πŸ“₯ Export Data\", variant=\"primary\")\n",
    "            \n",
    "            export_status = gr.Markdown(label=\"Export Status\")\n",
    "            export_file = gr.File(\n",
    "                label=\"πŸ“ Download Exported File\",\n",
    "                interactive=False\n",
    "            )\n",
    "        \n",
    "        # Event handlers\n",
    "        model_dropdown.change(\n",
    "            fn=update_model,\n",
    "            inputs=[model_dropdown],\n",
    "            outputs=[status_output]\n",
    "        )\n",
    "        \n",
    "        process_btn.click(\n",
    "            fn=process_file,\n",
    "            inputs=[file_upload, model_dropdown],\n",
    "            outputs=[\n",
    "                status_output,\n",
    "                data_preview,\n",
    "                processed_file,\n",
    "                ai_insights,\n",
    "                viz_plot,\n",
    "                search_status,\n",
    "                viz_selector\n",
    "            ]\n",
    "        )\n",
    "        \n",
    "        search_btn.click(\n",
    "            fn=search_data,\n",
    "            inputs=[search_input],\n",
    "            outputs=[search_status, search_results, search_file]\n",
    "        )\n",
    "        \n",
    "        viz_selector.change(\n",
    "            fn=update_visualization,\n",
    "            inputs=[viz_selector],\n",
    "            outputs=[viz_plot]\n",
    "        )\n",
    "        \n",
    "        export_btn.click(\n",
    "            fn=export_results,\n",
    "            inputs=[export_format],\n",
    "            outputs=[export_status, export_file]\n",
    "        )\n",
    "    \n",
    "    return app"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "6c5a0767-a788-43a8-911c-04e81814f4c4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "* Running on local URL:  http://127.0.0.1:7861\n",
      "* Running on public URL: https://8190830de481785995.gradio.live\n",
      "\n",
      "This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div><iframe src=\"https://8190830de481785995.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Keyboard interruption in main thread... closing server.\n",
      "Killing tunnel 127.0.0.1:7861 <> https://8190830de481785995.gradio.live\n"
     ]
    }
   ],
   "source": [
    "# Launch the application\n",
    "if __name__ == \"__main__\":\n",
    "    app = create_interface()\n",
    "    app.launch(share=True, debug=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "4f382d04-cee3-40ea-9687-5f2dff2282f7",
   "metadata": {},
   "outputs": [
    {
     "ename": "SyntaxError",
     "evalue": "invalid syntax (2621292756.py, line 1)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;36m  Cell \u001b[0;32mIn[12], line 1\u001b[0;36m\u001b[0m\n\u001b[0;31m    python -m textblob.download_corpora\u001b[0m\n\u001b[0m              ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
     ]
    }
   ],
   "source": [
    "python -m textblob.download_corpora"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}