import gradio as gr
import pandas as pd
import warnings
import random
import re
import time
import os
import sys

warnings.filterwarnings('ignore')

# Pre-download NLTK data at startup
import nltk
print("Downloading NLTK data...")
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
print("NLTK data downloaded.")

# Reliable model names and descriptions - replaced KeyBERT with working alternatives
KEYWORD_MODELS = {
    'yake_yake': 'YAKE - Yet Another Keyword Extractor (statistical)',
    'tfidf_cosine': 'TF-IDF with Cosine Similarity - Document similarity approach',
    'rake_nltk': 'RAKE-NLTK - Rapid Automatic Keyword Extraction',
    'textrank': 'TextRank - Graph-based ranking algorithm'
}

# Color palette for keywords based on scores
SCORE_COLORS = {
    'high': '#00B894',    # Green - High relevance
    'medium': '#F9CA24',  # Yellow - Medium relevance
    'low': '#FF6B6B'      # Red - Low relevance
}

# Additional colors for variety
KEYWORD_COLORS = [
    '#4ECDC4', '#45B7D1', '#6C5CE7', '#A0E7E5', '#FD79A8',
    '#8E8E93', '#55A3FF', '#E17055', '#DDA0DD', '#FF9F43',
    '#10AC84', '#EE5A24', '#0FBC89', '#5F27CD', '#FF3838'
]


class KeywordExtractionManager:
    def __init__(self):
        self.rake_extractor = None
        self.models_initialized = False
        self.initialize_models()

    def initialize_models(self):
        """Pre-initialize models to check availability"""
        print("Initializing models...")

        # Test YAKE
        try:
            import yake
            print("✓ YAKE available")
        except ImportError as e:
            print(f"✗ YAKE not available: {e}")

        # Test RAKE
        try:
            from rake_nltk import Rake
            print("✓ RAKE-NLTK available")
        except ImportError as e:
            print(f"✗ RAKE-NLTK not available: {e}")

        # Test sklearn for TF-IDF
        try:
            from sklearn.feature_extraction.text import TfidfVectorizer
            print("✓ Scikit-learn available for TF-IDF")
        except ImportError as e:
            print(f"✗ Scikit-learn not available: {e}")

        # Test networkx for TextRank
        try:
            import networkx
            print("✓ NetworkX available for TextRank")
        except ImportError as e:
            print(f"✗ NetworkX not available: {e}")

        self.models_initialized = True

    def load_rake_extractor(self):
        """Load RAKE extractor with better error handling"""
        if self.rake_extractor is None:
            try:
                from rake_nltk import Rake
                # Create RAKE instance
                self.rake_extractor = Rake()
                print("✓ RAKE extractor loaded successfully")
            except Exception as e:
                print(f"Error loading RAKE extractor: {type(e).__name__}: {str(e)}")
                return None
        return self.rake_extractor

    def extract_keywords(self, text, model_name, num_keywords=10, ngram_range=(1, 3), progress=None):
        """Extract keywords using the specified model"""
        try:
            if progress:
                progress(0.3, desc="Loading model...")

            print(f"Attempting to extract keywords with {model_name}")

            # Dispatch on the model-name prefix
            if model_name.startswith('yake_'):
                return self.extract_yake_keywords(text, num_keywords, ngram_range, progress)
            elif model_name.startswith('tfidf_'):
                return self.extract_tfidf_cosine_keywords(text, num_keywords, ngram_range, progress)
            elif model_name.startswith('rake_'):
                return self.extract_rake_keywords(text, num_keywords, progress)
            elif model_name.startswith('textrank'):
                return self.extract_textrank_keywords(text, num_keywords, ngram_range, progress)
            else:
                raise ValueError(f"Unknown model: {model_name}")

        except Exception as e:
            print(f"Error with {model_name}: {type(e).__name__}: {str(e)}")
            return self.fallback_keyword_extraction(text, num_keywords)

    def extract_yake_keywords(self, text, num_keywords, ngram_range, progress):
        """Extract keywords using YAKE"""
        try:
            import yake

            if progress:
                progress(0.5, desc="Processing with YAKE...")

            # Configure YAKE
            kw_extractor = yake.KeywordExtractor(
                lan="en",
                n=ngram_range[1],
                dedupLim=0.7,
                top=num_keywords
            )

            if progress:
                progress(0.7, desc="Extracting keywords...")

            keywords = kw_extractor.extract_keywords(text)

            # Format results (YAKE returns lower scores for better keywords)
            results = []
            for keyword, score in keywords:
                # Invert score for consistency (higher = better)
                inverted_score = 1.0 / (1.0 + score)
                results.append({
                    'keyword': keyword,
                    'score': inverted_score,
                    'model': 'YAKE'
                })

            print(f"YAKE extracted {len(results)} keywords")
            return results

        except Exception as e:
            print(f"YAKE extraction failed: {type(e).__name__}: {str(e)}")
            return self.fallback_keyword_extraction(text, num_keywords)

    def extract_tfidf_cosine_keywords(self, text, num_keywords, ngram_range, progress):
        """Extract keywords using TF-IDF with cosine similarity"""
        try:
            from sklearn.feature_extraction.text import TfidfVectorizer
            from sklearn.metrics.pairwise import cosine_similarity
            import numpy as np

            if progress:
                progress(0.5, desc="Processing with TF-IDF...")

            # Create TF-IDF vectorizer
            vectorizer = TfidfVectorizer(
                ngram_range=ngram_range,
                stop_words='english',
                max_features=5000,
                min_df=1,
                max_df=0.95
            )

            # Extract candidate keywords/phrases
            words = re.findall(r'\b[a-z]+\b', text.lower())
            candidates = []

            # Generate n-grams
            for n in range(ngram_range[0], ngram_range[1] + 1):
                for i in range(len(words) - n + 1):
                    candidate = ' '.join(words[i:i+n])
                    if len(candidate) > 2 and candidate not in candidates:
                        candidates.append(candidate)

            if not candidates:
                return self.fallback_keyword_extraction(text, num_keywords)

            # Limit candidates to prevent memory issues
            candidates = candidates[:300]

            if progress:
                progress(0.7, desc="Computing similarities...")

            try:
                # Create document embedding
                doc_embedding = vectorizer.fit_transform([text])

                # Create embeddings for candidates
                candidate_embeddings = vectorizer.transform(candidates)

                # Calculate similarities
                similarities = cosine_similarity(doc_embedding, candidate_embeddings)[0]

                # Get top keywords
                top_indices = similarities.argsort()[-num_keywords:][::-1]

                results = []
                for idx in top_indices:
                    if similarities[idx] > 0:
                        results.append({
                            'keyword': candidates[idx],
                            'score': float(similarities[idx]),
                            'model': 'TF-IDF-Cosine'
                        })

                if progress:
                    progress(0.8, desc="Formatting results...")

                print(f"TF-IDF extracted {len(results)} keywords")
                return results

            except Exception as e:
                print(f"TF-IDF approach failed: {e}")
                # Fall back to simple TF-IDF
                return self.simple_tfidf_extraction(text, num_keywords, ngram_range)

        except ImportError:
            print("scikit-learn not available for TF-IDF")
            return self.fallback_keyword_extraction(text, num_keywords)
        except Exception as e:
            print(f"TF-IDF extraction failed: {e}")
            return self.fallback_keyword_extraction(text, num_keywords)

    def extract_textrank_keywords(self, text, num_keywords, ngram_range, progress):
        """Extract keywords using TextRank algorithm"""
        try:
            import numpy as np
            from sklearn.feature_extraction.text import TfidfVectorizer
            from sklearn.metrics.pairwise import cosine_similarity
            import networkx as nx

            if progress:
                progress(0.5, desc="Processing with TextRank...")

            # Split text into sentences
            sentences = re.split(r'[.!?]+', text)
            sentences = [s.strip() for s in sentences if s.strip()]

            if len(sentences) < 2:
                # If text is too short, use simple extraction
                return self.simple_tfidf_extraction(text, num_keywords, ngram_range)

            # Create TF-IDF matrix
            vectorizer = TfidfVectorizer(
                ngram_range=(1, 1),  # Use unigrams for sentence similarity
                stop_words='english'
            )
            tfidf_matrix = vectorizer.fit_transform(sentences)

            # Calculate similarity matrix
            similarity_matrix = cosine_similarity(tfidf_matrix)

            if progress:
                progress(0.6, desc="Building graph...")

            # Build graph
            nx_graph = nx.from_numpy_array(similarity_matrix)

            # Calculate PageRank scores
            scores = nx.pagerank(nx_graph)

            # Extract keywords from top-ranked sentences
            top_sentence_indices = sorted(scores.keys(), key=lambda x: scores[x], reverse=True)[:3]

            # Extract keywords from top sentences
            keyword_vectorizer = TfidfVectorizer(
                ngram_range=ngram_range,
                stop_words='english',
                max_features=num_keywords * 2
            )

            top_sentences = [sentences[i] for i in top_sentence_indices]
            top_text = ' '.join(top_sentences)

            if progress:
                progress(0.7, desc="Extracting keywords...")

            tfidf_matrix = keyword_vectorizer.fit_transform([top_text])
            feature_names = keyword_vectorizer.get_feature_names_out()
            tfidf_scores = tfidf_matrix.toarray()[0]

            # Get top keywords
            top_indices = tfidf_scores.argsort()[-num_keywords:][::-1]

            results = []
            for idx in top_indices:
                if tfidf_scores[idx] > 0:
                    results.append({
                        'keyword': feature_names[idx],
                        'score': float(tfidf_scores[idx]),
                        'model': 'TextRank'
                    })

            print(f"TextRank extracted {len(results)} keywords")
            return results

        except ImportError as e:
            print(f"Required library not available for TextRank: {e}")
            return self.fallback_keyword_extraction(text, num_keywords)
        except Exception as e:
            print(f"TextRank extraction failed: {e}")
            return self.fallback_keyword_extraction(text, num_keywords)

    def simple_tfidf_extraction(self, text, num_keywords, ngram_range):
        """Simple TF-IDF extraction without cosine similarity"""
        try:
            from sklearn.feature_extraction.text import TfidfVectorizer

            vectorizer = TfidfVectorizer(
                ngram_range=ngram_range,
                stop_words='english',
                max_features=num_keywords * 2
            )

            # Fit and transform
            tfidf_matrix = vectorizer.fit_transform([text])

            # Get feature names and scores
            feature_names = vectorizer.get_feature_names_out()
            scores = tfidf_matrix.toarray()[0]

            # Get top keywords
            top_indices = scores.argsort()[-num_keywords:][::-1]

            results = []
            for idx in top_indices:
                if scores[idx] > 0:
                    results.append({
                        'keyword': feature_names[idx],
                        'score': float(scores[idx]),
                        'model': 'TF-IDF-Simple'
                    })

            return results

        except Exception as e:
            print(f"Simple TF-IDF failed: {e}")
            return self.fallback_keyword_extraction(text, num_keywords)

    def extract_rake_keywords(self, text, num_keywords, progress):
        """Extract keywords using RAKE"""
        try:
            if progress:
                progress(0.5, desc="Processing with RAKE...")

            rake_extractor = self.load_rake_extractor()
            if rake_extractor is None:
                print("RAKE extractor could not be loaded")
                return self.fallback_keyword_extraction(text, num_keywords)

            if progress:
                progress(0.7, desc="Extracting keywords...")

            # Extract keywords
            rake_extractor.extract_keywords_from_text(text)
            keywords_with_scores = rake_extractor.get_ranked_phrases_with_scores()

            # Normalize scores
            if keywords_with_scores:
                max_score = max(score for score, _ in keywords_with_scores)

                # Format results
                results = []
                for score, keyword in keywords_with_scores[:num_keywords]:
                    normalized_score = score / max_score if max_score > 0 else 0
                    results.append({
                        'keyword': keyword,
                        'score': normalized_score,
                        'model': 'RAKE-NLTK'
                    })

                print(f"RAKE extracted {len(results)} keywords")
                return results
            else:
                print("RAKE returned no keywords")
                return self.fallback_keyword_extraction(text, num_keywords)

        except Exception as e:
            print(f"RAKE extraction failed: {type(e).__name__}: {str(e)}")
            return self.fallback_keyword_extraction(text, num_keywords)
    def fallback_keyword_extraction(self, text, num_keywords=10):
        """Simple fallback keyword extraction using basic statistics"""
        print("Using fallback keyword extraction")
        from collections import Counter

        # Simple tokenization and filtering
        words = re.findall(r'\b[a-z]+\b', text.lower())

        # Remove common stop words
        stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
            'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
            'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
            'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that',
            'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they'
        }
        filtered_words = [w for w in words if w not in stop_words and len(w) > 3]

        # Count frequencies
        word_freq = Counter(filtered_words)

        # Get top keywords
        results = []
        for word, freq in word_freq.most_common(num_keywords):
            score = freq / len(filtered_words)  # Normalize by total words
            results.append({
                'keyword': word,
                'score': score,
                'model': 'Fallback-TFIDF'  # display label; this fallback is frequency-based
            })

        return results


def get_score_color(score, max_score):
    """Get color based on score relative to max score"""
    if max_score == 0:
        return SCORE_COLORS['medium']

    relative_score = score / max_score
    if relative_score >= 0.7:
        return SCORE_COLORS['high']
    elif relative_score >= 0.4:
        return SCORE_COLORS['medium']
    else:
        return SCORE_COLORS['low']


def get_relevance_level(score, max_score):
    """Get relevance level name based on score"""
    if max_score == 0:
        return 'medium'

    relative_score = score / max_score
    if relative_score >= 0.7:
        return 'high'
    elif relative_score >= 0.4:
        return 'medium'
    else:
        return 'low'


def create_highlighted_html(text, keywords):
    """Create HTML with highlighted keywords in the text"""
    if not keywords:
        return f"""
        <div style="padding: 10px; line-height: 1.8;">
            <p>{text}</p>
            <p><em>No keywords found.</em></p>
        </div>
        """

    # Sort by score
    sorted_keywords = sorted(keywords, key=lambda x: x['score'], reverse=True)
    max_score = sorted_keywords[0]['score'] if sorted_keywords else 1

    # Wrap each keyword occurrence in a span coloured by its relative score
    highlighted_text = text
    for kw_data in sorted_keywords:
        color = get_score_color(kw_data['score'], max_score)
        pattern = re.compile(re.escape(kw_data['keyword']), re.IGNORECASE)
        highlighted_text = pattern.sub(
            lambda m: f"<span style='background-color: {color}; border-radius: 3px;'>{m.group(0)}</span>",
            highlighted_text
        )

    # Build the results table: Rank | Keyword | Score | Relevance | Model
    table_html = """
    <table>
        <tr><th>Rank</th><th>Keyword</th><th>Score</th><th>Relevance</th><th>Model</th></tr>
    """
    for i, kw_data in enumerate(sorted_keywords):
        score = kw_data['score']
        color = get_score_color(score, max_score)
        relevance = get_relevance_level(score, max_score)
        # Coloured bar whose width tracks the keyword's relative score
        width = int(100 * score / max_score) if max_score > 0 else 0
        relevance_bar = f"<div style='background-color: {color}; width: {width}%;'>{relevance}</div>"
        table_html += (
            f"<tr><td>#{i+1}</td><td>{kw_data['keyword']}</td>"
            f"<td>{score:.4f}</td><td>{relevance_bar}</td>"
            f"<td>{kw_data['model']}</td></tr>\n"
        )
    table_html += "</table>"

    return f"<div style='padding: 10px; line-height: 1.8;'>{highlighted_text}</div>" + table_html
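
# Example of composing the pieces above (hypothetical sample text; which
# extractor actually answers depends on the libraries installed, and the
# manager degrades to the frequency fallback if none are available):
#
#   manager = KeywordExtractionManager()
#   sample = "Crowdsourced collections contain rich but messy free-text descriptions."
#   keywords = manager.extract_keywords(sample, 'rake_nltk', num_keywords=5)
#   html = create_highlighted_html(sample, keywords)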

# Markdown shown in the app's "About" section
ABOUT_MD = """
Learn more about the algorithms used in this tool:

- **YAKE** - Yet Another Keyword Extractor (statistical)
- **TF-IDF with Cosine Similarity** - Document similarity approach
- **RAKE-NLTK** - Rapid Automatic Keyword Extraction
- **TextRank** - Graph-based ranking algorithm

This Keyword Extraction Explorer Tool was created as part of the Digital Scholarship at Oxford (DiSc)
funded research project: *Extracting Keywords from Crowdsourced Collections*.

The code for this tool was built with the aid of Claude Opus 4.
"""
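
# ---------------------------------------------------------------------------
# Gradio wiring: a minimal sketch only. The component names, layout, slider
# ranges, and the 'yake_yake' default below are assumptions, not the original
# design; only KEYWORD_MODELS, KeywordExtractionManager,
# create_highlighted_html, and ABOUT_MD come from the code above.
# ---------------------------------------------------------------------------
manager = KeywordExtractionManager()


def run_extraction(text, model_label, num_keywords, progress=gr.Progress()):
    """Map the dropdown label back to its model key and return highlighted HTML."""
    model_name = next(
        (key for key, desc in KEYWORD_MODELS.items() if desc == model_label),
        'yake_yake'  # assumed default model
    )
    keywords = manager.extract_keywords(text, model_name, int(num_keywords), progress=progress)
    return create_highlighted_html(text, keywords)


with gr.Blocks(title="Keyword Extraction Explorer") as demo:
    gr.Markdown("# Keyword Extraction Explorer")
    text_input = gr.Textbox(lines=8, label="Text to analyse")
    model_choice = gr.Dropdown(
        choices=list(KEYWORD_MODELS.values()),
        value=KEYWORD_MODELS['yake_yake'],
        label="Extraction model"
    )
    num_kw = gr.Slider(minimum=5, maximum=30, step=1, value=10, label="Number of keywords")
    extract_btn = gr.Button("Extract keywords")
    output_html = gr.HTML()
    extract_btn.click(run_extraction, inputs=[text_input, model_choice, num_kw], outputs=output_html)
    gr.Markdown(ABOUT_MD)

if __name__ == "__main__":
    demo.launch()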