from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def calculate_cpi(author_data, user_query):
    # Weights based on your requested methodology
    W1, W2, W3 = 0.3, 0.5, 0.2

    # Dimension A: Authority (h-index normalized to [0, 1], capped at 100)
    auth_score = min(author_data.get('h_index', 0) / 100, 1.0)

    # Dimension B: Topical Density (semantic similarity)
    # Compare the query against the author's interests and publication titles
    corpus = " ".join(author_data.get('interests', [])) + " " + " ".join(author_data.get('pub_titles', []))
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform([user_query.lower(), corpus.lower()])
    depth_score = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]

    # Dimension C: Centrality (placeholder until network data is available)
    centrality_score = 0.5

    # Composite Performance Index: weighted sum of the three dimensions, scaled to 0-100
    final_index = (W1 * auth_score) + (W2 * depth_score) + (W3 * centrality_score)
    return round(final_index * 100, 1)
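
A minimal usage sketch follows, assuming a hypothetical author_data dict shaped to match the keys the function reads (h_index, interests, pub_titles); the author record and query below are illustrative values, not real data:

if __name__ == "__main__":
    # Hypothetical author record for demonstration only
    sample_author = {
        'h_index': 42,
        'interests': ['natural language processing', 'information retrieval'],
        'pub_titles': [
            'Dense retrieval for open-domain question answering',
            'Topic modeling of scholarly corpora',
        ],
    }

    query = "semantic search over scientific literature"
    # Returns a score on a 0-100 scale
    print(calculate_cpi(sample_author, query))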