| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
def calculate_cpi(author_data, user_query):
    """Compute a Composite Performance Index (0-100) for an author.

    Weighted sum of three dimensions:
      A. Authority  - h-index, capped at 100 and normalized to [0, 1].
      B. Topical density - TF-IDF cosine similarity between the query
         and the author's interests + publication titles.
      C. Centrality - placeholder constant until graph metrics exist.

    Args:
        author_data: dict with optional keys 'h_index' (int),
            'interests' (list[str]) and 'pub_titles' (list[str]).
            All keys are now optional (original raised KeyError on a
            missing 'interests').
        user_query: free-text query string.

    Returns:
        float: composite index in [0, 100], rounded to one decimal.
    """
    # Weights based on the requested methodology (sum to 1.0).
    W_AUTHORITY, W_DEPTH, W_CENTRALITY = 0.3, 0.5, 0.2

    # Dimension A: Authority. An h-index of 100+ saturates at 1.0.
    auth_score = min(author_data.get('h_index', 0) / 100, 1.0)

    # Dimension B: Topical density. Use .get() consistently so a missing
    # 'interests' key no longer raises KeyError (bug fix vs. original).
    interests = author_data.get('interests', [])
    pub_titles = author_data.get('pub_titles', [])
    corpus = " ".join(interests) + " " + " ".join(pub_titles)

    if corpus.strip() and user_query.strip():
        # TfidfVectorizer lowercases by default; explicit .lower() kept
        # for parity with the original call.
        vectorizer = TfidfVectorizer()
        try:
            tfidf = vectorizer.fit_transform([user_query.lower(), corpus.lower()])
            depth_score = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
        except ValueError:
            # Empty vocabulary (e.g. inputs are punctuation/stop-words
            # only) — previously an unhandled crash; treat as no overlap.
            depth_score = 0.0
    else:
        # No usable text on one side -> no measurable topical overlap.
        depth_score = 0.0

    # Dimension C: Centrality (placeholder until network data is wired in).
    centrality_score = 0.5

    # Composite Performance Index: weighted sum scaled to 0-100.
    final_index = (W_AUTHORITY * auth_score
                   + W_DEPTH * depth_score
                   + W_CENTRALITY * centrality_score)
    return round(final_index * 100, 1)