| |
| """ |
| UMAP Generator for arXiv papers |
| Creates 2D and 3D projections with density-weighted centroids |
| """ |
|
|
| import json |
| import numpy as np |
| import pandas as pd |
| from sklearn.feature_extraction.text import TfidfVectorizer |
| from sklearn.decomposition import TruncatedSVD |
| import umap |
| import os |
| import shutil |
| from datetime import datetime |
| from collections import Counter |
|
|
def load_papers(filename="arxiv_monthly_papers.json"):
    """Read the collection of papers stored in a JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or ``None`` when the file does not exist
        (a hint about how to produce it is printed in that case).
    """
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as handle:
            loaded = json.load(handle)
        print(f"📚 {len(loaded)} papers loaded from {filename}")
        return loaded

    # Missing input: tell the user how to create it instead of raising.
    print(f"❌ File {filename} not found!")
    print("💡 Run fetch_arxiv_api.py first")
    return None
|
|
def preprocess_papers(papers, sample_rate=5):
    """Discard incomplete papers, then optionally keep one out of every N.

    Args:
        papers: Iterable of paper dicts.
        sample_rate: When greater than 1, only every ``sample_rate``-th valid
            paper (starting with the first) is kept.

    Returns:
        List of papers that have a truthy ``title``, ``summary`` and
        ``primary_category``, sub-sampled when ``sample_rate > 1``.
    """
    print("🔄 Preprocessing papers...")

    required_fields = ('title', 'summary', 'primary_category')
    complete = [
        entry for entry in papers
        if all(entry.get(field) for field in required_fields)
    ]
    print(f"✅ {len(complete)} valid papers after filtering")

    # No sub-sampling requested: hand back everything that passed the filter.
    if not sample_rate > 1:
        return complete

    kept = complete[::sample_rate]
    print(f"📊 Sampling 1/{sample_rate}: {len(kept)} papers retained")
    return kept
|
|
def create_embeddings(papers, max_features=5000, n_components=50):
    """Build dense paper embeddings from TF-IDF features reduced with SVD.

    Args:
        papers: Sequence of paper dicts; ``title`` and ``summary`` are read.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.
        n_components: Number of dimensions kept by ``TruncatedSVD``.

    Returns:
        The array returned by ``TruncatedSVD.fit_transform`` (one row per
        paper).
    """
    print(f"🔢 Creating embeddings (max_features={max_features}, n_components={n_components})")

    # One document per paper: stripped title followed by stripped abstract.
    documents = [
        f"{entry.get('title', '').strip()} {entry.get('summary', '').strip()}"
        for entry in papers
    ]

    print(" 📝 TF-IDF vectorization...")
    vectorizer_options = dict(
        max_features=max_features,
        stop_words='english',
        ngram_range=(1, 2),  # unigrams and bigrams
        min_df=2,
        max_df=0.95,
    )
    term_matrix = TfidfVectorizer(**vectorizer_options).fit_transform(documents)
    print(f" ✅ TF-IDF: {term_matrix.shape}")

    print(f" 🔄 SVD reduction to {n_components} dimensions...")
    # Fixed seed so repeated runs give the same reduction.
    reducer = TruncatedSVD(n_components=n_components, random_state=42)
    reduced = reducer.fit_transform(term_matrix)

    print(f" ✅ Final embeddings: {reduced.shape}")
    explained = reducer.explained_variance_ratio_.sum()
    print(f" 📊 Explained variance: {explained:.3f}")

    return reduced
|
|
def map_to_families(papers):
    """Map each paper's primary arXiv category to a broad scientific family.

    The archive part of the category (the text before the first ``.``) is
    looked up in a fixed table; anything not listed, or a missing category,
    becomes ``'Other'``. A per-family count summary is printed.

    Args:
        papers: Iterable of paper dicts; ``primary_category`` is read.

    Returns:
        List of family names, one per paper, in input order.
    """
    domain_to_family = {
        'cs': 'Computer Science',
        'math': 'Mathematics',
        'physics': 'Physics',
        'stat': 'Statistics',
        'q-bio': 'Biology',
        'eess': 'Engineering',
        'astro-ph': 'Astrophysics',
        'cond-mat': 'Condensed Matter',
        'nucl': 'Nuclear Physics',
        # arXiv's nuclear archives are "nucl-ex" and "nucl-th". They contain
        # no ".", so the archive part is the full id and the bare "nucl" key
        # above never matches them.
        'nucl-ex': 'Nuclear Physics',
        'nucl-th': 'Nuclear Physics',
    }

    families = []
    for paper in papers:
        # A missing or empty category yields domain '' and falls to 'Other'.
        primary_cat = paper.get('primary_category') or ''
        domain = primary_cat.split('.')[0]
        families.append(domain_to_family.get(domain, 'Other'))

    print("📊 Distribution by family:")
    for family, count in Counter(families).most_common():
        print(f" {family}: {count} papers")

    return families
|
|
def generate_umap_projection(embeddings, families, n_neighbors=50, min_dist=0.1, spread=0.5, n_components=2):
    """Project embeddings to a low-dimensional space with UMAP.

    Args:
        embeddings: Input matrix handed to ``UMAP.fit_transform``.
        families: Accepted for call-site symmetry; not used by this function.
        n_neighbors: UMAP ``n_neighbors`` setting.
        min_dist: UMAP ``min_dist`` setting.
        spread: UMAP ``spread`` setting.
        n_components: Number of output dimensions.

    Returns:
        The projected coordinates returned by ``UMAP.fit_transform``.
    """
    print(f"🎯 UMAP projection (n_neighbors={n_neighbors}, min_dist={min_dist}, spread={spread}, n_components={n_components})")

    umap_settings = {
        'n_neighbors': n_neighbors,
        'min_dist': min_dist,
        'spread': spread,
        'n_components': n_components,
        'random_state': 42,  # fixed seed
        'metric': 'cosine',
    }
    coordinates = umap.UMAP(**umap_settings).fit_transform(embeddings)
    print(f"✅ Projection UMAP: {coordinates.shape}")

    return coordinates
|
|
def _density_weights(points):
    """Return normalised per-point weights proportional to local density."""
    counts = np.empty(len(points), dtype=float)
    for idx, point in enumerate(points):
        distances = np.linalg.norm(points - point, axis=1)
        # Density = number of points strictly closer than this point's own
        # 20th-percentile distance to the rest of the family.
        counts[idx] = np.sum(distances < np.percentile(distances, 20))

    total = counts.sum()
    if total == 0:
        # Every count is zero when the 20th-percentile distance is 0 for all
        # points (e.g. all points coincide); dividing would give NaN weights,
        # so fall back to a plain unweighted mean.
        return np.full(len(points), 1.0 / len(points))
    return counts / total


def calculate_density_weighted_centroids(projection, families, families_list):
    """Compute one density-weighted centroid per family.

    Each point of a family is weighted by how many same-family points lie
    within its 20th-percentile distance, so the centroid is pulled toward
    the densest region of the family rather than its plain mean.

    Args:
        projection: 2-D array of projected coordinates, one row per paper.
        families: Family label of each row of ``projection``.
        families_list: Families for which a centroid should be computed.

    Returns:
        Dict mapping family name to ``{'x', 'y', ['z',] 'count'}``. Families
        with fewer than 30 points are skipped. ``'z'`` is present only when
        the projection has at least three columns.
    """
    print("🎯 Calculating density-weighted centroids...")

    # Build the label array once instead of once per family.
    family_labels = np.asarray(families)
    # One output key per available coordinate column (at most x, y, z).
    axis_names = ('x', 'y', 'z')[:projection.shape[1]]

    centroids = {}
    for family in families_list:
        family_points = projection[family_labels == family]

        # Too few points for a meaningful density estimate.
        if len(family_points) < 30:
            continue

        weights = _density_weights(family_points)
        entry = {
            axis: float(np.sum(family_points[:, col] * weights))
            for col, axis in enumerate(axis_names)
        }
        entry['count'] = len(family_points)
        centroids[family] = entry

    print(f"✅ {len(centroids)} centroids calculated")
    return centroids
|
|
def save_visualization_data(papers, projection, families, centroids, output_prefix):
    """Write points, centroids and metadata to ``<output_prefix>.json``.

    Args:
        papers: Sequence of paper dicts (``id``, ``title``, ``summary``,
            ``authors`` and ``primary_category`` are read).
        projection: 2-D array of coordinates, one row per paper.
        families: Family label for each paper, aligned with ``papers``.
        centroids: Per-family centroid dict, stored as-is.
        output_prefix: Output path without the ``.json`` extension.

    Returns:
        Path of the JSON file that was written.
    """
    summary_preview_chars = 200
    # One coordinate key per projection column (at most x, y, z); this
    # replaces the separate, otherwise identical 2D and 3D branches.
    axis_names = ('x', 'y', 'z')[:projection.shape[1]]

    viz_data = []
    for i, paper in enumerate(papers):
        summary = paper.get('summary') or ''
        # Only mark the preview with an ellipsis when text was actually cut.
        if len(summary) > summary_preview_chars:
            summary = summary[:summary_preview_chars] + '...'

        point = {
            'id': paper.get('id', f'paper_{i}'),
            'title': paper.get('title', ''),
            'summary': summary,
            'authors': ', '.join(paper.get('authors', [])[:3]),
            'category': paper.get('primary_category', ''),
            'family': families[i],
        }
        for col, axis in enumerate(axis_names):
            point[axis] = float(projection[i, col])
        viz_data.append(point)

    viz_data_with_centroids = {
        'points': viz_data,
        'centroids': centroids,
        'metadata': {
            'total_papers': len(papers),
            'dimensions': projection.shape[1],
            # Sorted so the file content does not depend on set ordering.
            'families': sorted(set(families)),
            'generated': datetime.now().isoformat(),
        },
    }

    output_file = f"{output_prefix}.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(viz_data_with_centroids, f, indent=2, ensure_ascii=False)

    size_mb = os.path.getsize(output_file) / 1024 / 1024
    print(f"💾 Data saved: {output_file} ({size_mb:.1f} MB)")

    return output_file
|
|
def main():
    """Run the full pipeline: load, filter, embed, project, and save.

    Produces one 2D and one 3D visualization JSON file in the current
    directory (sharing a timestamp in their names), then makes a best-effort
    copy of the 2D file to ``../../assets/data/data.json``.
    """
    print("🚀 ArXiv UMAP Generator")
    print("=" * 40)

    papers = load_papers()
    if not papers:
        return

    papers = preprocess_papers(papers, sample_rate=5)
    if not papers:
        # Nothing survived filtering; vectorizing an empty corpus would fail.
        print("❌ No valid papers left after preprocessing")
        return

    families = map_to_families(papers)
    families_list = list(set(families))

    embeddings = create_embeddings(papers, max_features=3000, n_components=50)

    # One timestamp for both outputs so the 2D/3D files pair up by name.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    outputs = {}
    for dims in (2, 3):
        print(f"\n🎯 Generating {dims}D UMAP...")
        projection = generate_umap_projection(
            embeddings, families,
            n_neighbors=50, min_dist=0.8, spread=1.0, n_components=dims,
        )
        centroids = calculate_density_weighted_centroids(projection, families, families_list)
        outputs[dims] = save_visualization_data(
            papers, projection, families, centroids,
            f"arxiv_umap_viz_{dims}d_{timestamp}",
        )

    # Best-effort copy of the 2D output to the assets directory; a failure
    # here is reported but does not abort the run.
    source_file = outputs[2]
    target_dir = "../../assets/data"
    target_file = os.path.join(target_dir, "data.json")

    try:
        os.makedirs(target_dir, exist_ok=True)
        shutil.copy2(source_file, target_file)
    except OSError as e:
        print(f"\n⚠️ Automatic copy failed: {e}")
    else:
        print("\n✅ AUTOMATIC COPY SUCCESSFUL!")
        print(f"📁 {source_file} → {target_file}")

    print("\n🎉 Generation completed!")
    print("📁 Files created:")
    for path in (outputs[2], outputs[3]):
        if os.path.exists(path):
            size = os.path.getsize(path) / 1024 / 1024
            print(f" - {path} ({size:.1f} MB)")
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|