-- Enhanced database schema for comprehensive bot data collection -- This extends the existing schema with enterprise-ready features -- Drop existing tables if they exist (for development) -- In production, use proper migration scripts -- DROP TABLE IF EXISTS bot_errors CASCADE; -- DROP TABLE IF EXISTS channel_analytics CASCADE; -- DROP TABLE IF EXISTS user_analytics CASCADE; -- DROP TABLE IF EXISTS message_analytics CASCADE; -- Enhanced channels table with more metadata CREATE TABLE IF NOT EXISTS channels_enhanced ( id VARCHAR(255) PRIMARY KEY, name VARCHAR(255) NOT NULL, platform VARCHAR(50) NOT NULL CHECK (platform IN ('slack', 'discord')), type VARCHAR(100), -- channel, dm, group, thread, etc. guild_id VARCHAR(255), -- Discord guild or Slack team parent_channel_id VARCHAR(255), -- For threads topic TEXT, description TEXT, is_private BOOLEAN DEFAULT false, is_archived BOOLEAN DEFAULT false, member_count INTEGER, created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, metadata JSONB, FOREIGN KEY (parent_channel_id) REFERENCES channels_enhanced(id) ); -- Enhanced messages table with comprehensive metadata CREATE TABLE IF NOT EXISTS messages_enhanced ( id VARCHAR(255) PRIMARY KEY, -- Platform message ID channel_id VARCHAR(255) NOT NULL, user_id_hash VARCHAR(255), -- Hashed for privacy content TEXT, timestamp TIMESTAMP WITH TIME ZONE NOT NULL, edited_at TIMESTAMP WITH TIME ZONE, message_type VARCHAR(100) DEFAULT 'message', thread_ts VARCHAR(255), -- For threaded conversations is_thread_reply BOOLEAN DEFAULT false, reply_to_message_id VARCHAR(255), -- Content analysis word_count INTEGER, character_count INTEGER, sentiment_score DECIMAL(3,2), -- -1 to 1 toxicity_score DECIMAL(3,2), -- 0 to 1 language VARCHAR(10), -- Engagement metrics reaction_count INTEGER DEFAULT 0, reply_count INTEGER DEFAULT 0, mention_count INTEGER DEFAULT 0, -- File attachments has_attachments BOOLEAN DEFAULT false, attachment_count INTEGER DEFAULT 0, -- Links and media has_links BOOLEAN DEFAULT false, link_count INTEGER DEFAULT 0, has_media BOOLEAN DEFAULT false, -- Platform specific platform VARCHAR(50) NOT NULL CHECK (platform IN ('slack', 'discord')), metadata JSONB, created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, FOREIGN KEY (channel_id) REFERENCES channels_enhanced(id), FOREIGN KEY (reply_to_message_id) REFERENCES messages_enhanced(id) ); -- User analytics (privacy-conscious with hashed IDs) CREATE TABLE IF NOT EXISTS user_analytics ( user_id_hash VARCHAR(255) PRIMARY KEY, platform VARCHAR(50) NOT NULL, -- Activity metrics total_messages INTEGER DEFAULT 0, total_words INTEGER DEFAULT 0, total_characters INTEGER DEFAULT 0, -- Engagement metrics reactions_given INTEGER DEFAULT 0, reactions_received INTEGER DEFAULT 0, mentions_given INTEGER DEFAULT 0, mentions_received INTEGER DEFAULT 0, -- Behavioral patterns avg_message_length DECIMAL(8,2), most_active_hour INTEGER, -- 0-23 most_active_day INTEGER, -- 0-6 (Sunday-Saturday) avg_sentiment DECIMAL(3,2), -- Time tracking first_message_at TIMESTAMP WITH TIME ZONE, last_message_at TIMESTAMP WITH TIME ZONE, created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP ); -- Channel analytics CREATE TABLE IF NOT EXISTS channel_analytics ( channel_id VARCHAR(255) PRIMARY KEY, platform VARCHAR(50) NOT NULL, -- Volume metrics total_messages INTEGER DEFAULT 0, total_words INTEGER DEFAULT 0, unique_users INTEGER DEFAULT 0, -- Time-based metrics messages_last_24h INTEGER DEFAULT 0, messages_last_7d INTEGER DEFAULT 0, messages_last_30d INTEGER DEFAULT 0, -- Engagement metrics avg_messages_per_user DECIMAL(8,2), avg_message_length DECIMAL(8,2), total_reactions INTEGER DEFAULT 0, -- Content analysis avg_sentiment DECIMAL(3,2), top_keywords TEXT[], -- Array of popular keywords -- Activity patterns peak_hour INTEGER, -- Most active hour peak_day INTEGER, -- Most active day -- Health metrics spam_score DECIMAL(3,2), -- 0-1, higher = more spam-like toxicity_score DECIMAL(3,2), -- 0-1, higher = more toxic created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, FOREIGN KEY (channel_id) REFERENCES channels_enhanced(id) ); -- Message reactions CREATE TABLE IF NOT EXISTS message_reactions ( id SERIAL PRIMARY KEY, message_id VARCHAR(255) NOT NULL, user_id_hash VARCHAR(255), emoji VARCHAR(255) NOT NULL, timestamp TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, FOREIGN KEY (message_id) REFERENCES messages_enhanced(id), UNIQUE(message_id, user_id_hash, emoji) ); -- Message mentions (users, channels, roles) CREATE TABLE IF NOT EXISTS message_mentions ( id SERIAL PRIMARY KEY, message_id VARCHAR(255) NOT NULL, mention_type VARCHAR(50) NOT NULL CHECK (mention_type IN ('user', 'channel', 'role', 'everyone', 'here')), mentioned_id_hash VARCHAR(255), -- Hashed ID of mentioned entity FOREIGN KEY (message_id) REFERENCES messages_enhanced(id) ); -- Message attachments CREATE TABLE IF NOT EXISTS message_attachments ( id SERIAL PRIMARY KEY, message_id VARCHAR(255) NOT NULL, filename VARCHAR(255), file_type VARCHAR(100), file_size INTEGER, -- In bytes url TEXT, content_type VARCHAR(255), is_image BOOLEAN DEFAULT false, is_video BOOLEAN DEFAULT false, is_audio BOOLEAN DEFAULT false, FOREIGN KEY (message_id) REFERENCES messages_enhanced(id) ); -- Bot errors and monitoring CREATE TABLE IF NOT EXISTS bot_errors ( id SERIAL PRIMARY KEY, platform VARCHAR(50) NOT NULL, error_type VARCHAR(255) NOT NULL, message TEXT NOT NULL, severity VARCHAR(50) NOT NULL CHECK (severity IN ('low', 'medium', 'high', 'critical')), context JSONB, traceback TEXT, timestamp TIMESTAMP WITH TIME ZONE NOT NULL, resolved BOOLEAN DEFAULT false, resolved_at TIMESTAMP WITH TIME ZONE ); -- Clustering results CREATE TABLE IF NOT EXISTS clustering_results ( id SERIAL PRIMARY KEY, cluster_id VARCHAR(255) NOT NULL, algorithm VARCHAR(100) NOT NULL, parameters JSONB, message_count INTEGER NOT NULL, channel_ids TEXT[], -- Array of channel IDs included date_range_start TIMESTAMP WITH TIME ZONE, date_range_end TIMESTAMP WITH TIME ZONE, quality_score DECIMAL(3,2), -- Clustering quality metric -- Cluster characteristics dominant_topics TEXT[], -- Top topics in cluster avg_sentiment DECIMAL(3,2), avg_message_length DECIMAL(8,2), created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, metadata JSONB ); -- Message cluster assignments CREATE TABLE IF NOT EXISTS message_clusters ( id SERIAL PRIMARY KEY, message_id VARCHAR(255) NOT NULL, cluster_id VARCHAR(255) NOT NULL, confidence_score DECIMAL(3,2), -- How confident the assignment is distance_to_centroid DECIMAL(10,6), -- Distance to cluster center FOREIGN KEY (message_id) REFERENCES messages_enhanced(id) ); -- API usage tracking CREATE TABLE IF NOT EXISTS api_usage_log ( id SERIAL PRIMARY KEY, platform VARCHAR(50) NOT NULL, endpoint VARCHAR(255) NOT NULL, method VARCHAR(20) NOT NULL, status_code INTEGER, response_time_ms INTEGER, rate_limited BOOLEAN DEFAULT false, timestamp TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, error_message TEXT ); -- Create indexes for performance CREATE INDEX IF NOT EXISTS idx_messages_enhanced_channel_timestamp ON messages_enhanced(channel_id, timestamp DESC); CREATE INDEX IF NOT EXISTS idx_messages_enhanced_user_timestamp ON messages_enhanced(user_id_hash, timestamp DESC); CREATE INDEX IF NOT EXISTS idx_messages_enhanced_platform ON messages_enhanced(platform); CREATE INDEX IF NOT EXISTS idx_messages_enhanced_content_gin ON messages_enhanced USING gin(to_tsvector('english', content)); CREATE INDEX IF NOT EXISTS idx_messages_enhanced_metadata_gin ON messages_enhanced USING gin(metadata); CREATE INDEX IF NOT EXISTS idx_bot_errors_platform_timestamp ON bot_errors(platform, timestamp DESC); CREATE INDEX IF NOT EXISTS idx_bot_errors_severity ON bot_errors(severity); CREATE INDEX IF NOT EXISTS idx_bot_errors_resolved ON bot_errors(resolved); CREATE INDEX IF NOT EXISTS idx_user_analytics_platform ON user_analytics(platform); CREATE INDEX IF NOT EXISTS idx_channel_analytics_platform ON channel_analytics(platform); CREATE INDEX IF NOT EXISTS idx_message_reactions_message_id ON message_reactions(message_id); CREATE INDEX IF NOT EXISTS idx_message_mentions_message_id ON message_mentions(message_id); CREATE INDEX IF NOT EXISTS idx_message_attachments_message_id ON message_attachments(message_id); -- Create views for common analytics queries CREATE OR REPLACE VIEW message_analytics_summary AS SELECT platform, COUNT(*) as total_messages, COUNT(DISTINCT channel_id) as active_channels, COUNT(DISTINCT user_id_hash) as active_users, AVG(word_count) as avg_word_count, AVG(sentiment_score) as avg_sentiment, DATE_TRUNC('day', timestamp) as date FROM messages_enhanced WHERE timestamp >= NOW() - INTERVAL '30 days' GROUP BY platform, DATE_TRUNC('day', timestamp) ORDER BY date DESC; CREATE OR REPLACE VIEW channel_activity_summary AS SELECT c.id, c.name, c.platform, ca.total_messages, ca.unique_users, ca.messages_last_24h, ca.messages_last_7d, ca.avg_sentiment, ca.updated_at FROM channels_enhanced c LEFT JOIN channel_analytics ca ON c.id = ca.channel_id ORDER BY ca.messages_last_7d DESC NULLS LAST; -- Functions for automatic analytics updates CREATE OR REPLACE FUNCTION update_user_analytics() RETURNS TRIGGER AS $$ BEGIN INSERT INTO user_analytics ( user_id_hash, platform, total_messages, total_words, total_characters, first_message_at, last_message_at, updated_at ) VALUES ( NEW.user_id_hash, NEW.platform, 1, NEW.word_count, NEW.character_count, NEW.timestamp, NEW.timestamp, CURRENT_TIMESTAMP ) ON CONFLICT (user_id_hash) DO UPDATE SET total_messages = user_analytics.total_messages + 1, total_words = user_analytics.total_words + COALESCE(NEW.word_count, 0), total_characters = user_analytics.total_characters + COALESCE(NEW.character_count, 0), last_message_at = NEW.timestamp, avg_message_length = (user_analytics.total_characters + COALESCE(NEW.character_count, 0))::decimal / (user_analytics.total_messages + 1), updated_at = CURRENT_TIMESTAMP; RETURN NEW; END; $$ LANGUAGE plpgsql; -- Trigger to automatically update user analytics DROP TRIGGER IF EXISTS trigger_update_user_analytics ON messages_enhanced; CREATE TRIGGER trigger_update_user_analytics AFTER INSERT ON messages_enhanced FOR EACH ROW EXECUTE FUNCTION update_user_analytics(); -- Function to calculate message statistics CREATE OR REPLACE FUNCTION calculate_message_stats() RETURNS TRIGGER AS $$ BEGIN -- Calculate word count NEW.word_count = array_length(string_to_array(trim(NEW.content), ' '), 1); -- Calculate character count NEW.character_count = length(NEW.content); -- Set content flags NEW.has_links = NEW.content ~ 'https?://[^\s]+'; NEW.link_count = (length(NEW.content) - length(regexp_replace(NEW.content, 'https?://[^\s]+', '', 'g'))) / 7; -- Rough estimate RETURN NEW; END; $$ LANGUAGE plpgsql; -- Trigger to calculate message statistics DROP TRIGGER IF EXISTS trigger_calculate_message_stats ON messages_enhanced; CREATE TRIGGER trigger_calculate_message_stats BEFORE INSERT OR UPDATE ON messages_enhanced FOR EACH ROW EXECUTE FUNCTION calculate_message_stats();