Spaces:

OliverPerrin
/

LexiMind

Runtime error

App Files Files Community

OliverPerrin commited on Jan 14

Commit

baf3026

1 Parent(s): 4667b2a

Fixed compiling issue, added legnth penalty, and atttempting freezing encoder layers 0-5 to lower parameters and preserve T5's langauge understanding.

Browse files

Files changed (15) hide show

README.md +2 -2
artifacts/labels.json +6 -3
configs/data/datasets.yaml +2 -2
configs/training/dev.yaml +26 -16
configs/training/full.yaml +25 -16
configs/training/medium.yaml +23 -14
docs/architecture.md +2 -2
outputs/evaluation_report.json +0 -81
outputs/training_history.json +70 -70
scripts/demo_gradio.py +14 -14
scripts/download_data.py +444 -211
scripts/train.py +57 -12
src/inference/pipeline.py +2 -0
src/models/decoder.py +12 -0
src/training/trainer.py +4 -2

README.md CHANGED Viewed

@@ -18,9 +18,9 @@ This project is built with industry-standard MLOps practices, including configur
 ## Core Features
-* **Abstractive Summarization:** Generates concise, coherent summaries of long-form text using encoder-decoder attention. Trained on CNN/DailyMail (news) and BookSum (literary).
 * **Emotion Classification:** Identifies 28 emotions from Google's GoEmotions dataset (admiration, amusement, anger, joy, love, etc.).
-* **Topic Classification:** Classifies documents into 4 categories (World, Sports, Business, Sci/Tech) using AG News.
 ## Model Architecture

 ## Core Features
+* **Abstractive Summarization:** Generates concise, coherent summaries of long-form text using encoder-decoder attention. Trained on BookSum (literary) and arXiv (academic papers).
 * **Emotion Classification:** Identifies 28 emotions from Google's GoEmotions dataset (admiration, amusement, anger, joy, love, etc.).
+* **Topic Classification:** Classifies documents into 8 categories (Fiction, Science, Technology, Philosophy, History, Psychology, Business, Arts).
 ## Model Architecture

artifacts/labels.json CHANGED Viewed

@@ -30,9 +30,12 @@
     "surprise"
   ],
   "topic": [
     "Business",
-    "Sci/Tech",
-    "Sports",
-    "World"
   ]
 }

     "surprise"
   ],
   "topic": [
+    "Arts",
     "Business",
+    "Fiction",
+    "History",
+    "Philosophy",
+    "Science",
+    "Technology"
   ]
 }

configs/data/datasets.yaml CHANGED Viewed

@@ -2,9 +2,9 @@
 # Data is downloaded via: python scripts/download_data.py
 processed:
-  summarization: data/processed/summarization  # CNN/DailyMail + BookSum
   emotion: data/processed/emotion              # GoEmotions (28 labels)
-  topic: data/processed/topic                  # AG News (4 labels)
   books: data/processed/books                  # Gutenberg prose chunks
 tokenizer:

 # Data is downloaded via: python scripts/download_data.py
 processed:
+  summarization: data/processed/summarization  # BookSum + arXiv
   emotion: data/processed/emotion              # GoEmotions (28 labels)
+  topic: data/processed/topic                  # Books + Papers (8 labels)
   books: data/processed/books                  # Gutenberg prose chunks
 tokenizer:

configs/training/dev.yaml CHANGED Viewed

@@ -1,11 +1,11 @@
 # Development/Testing Configuration for FLAN-T5-base
-# Fast iteration for debugging and testing changes
-# VRAM Usage: ~8-9GB peak (12GB available)
-# Training time: ~10-15 minutes on RTX 4070 12GB
 # Use: python scripts/train.py training=dev
 dataloader:
-  batch_size: 5  # Conservative for 12GB VRAM
   shuffle: true
   num_workers: 4
   pin_memory: true
@@ -14,32 +14,42 @@ dataloader:
 optimizer:
   name: adamw
-  lr: 5.0e-5  # Higher LR for faster convergence in dev
   weight_decay: 0.01
   eps: 1.0e-8
   betas: [0.9, 0.999]
 scheduler:
   name: cosine
-  warmup_steps: 100  # ~2% of training steps for smoother start
 trainer:
   max_epochs: 3
   gradient_clip_norm: 1.0
-  gradient_accumulation_steps: 12  # Effective batch: 60 (5*12)
   validation_max_length: 128
-  label_smoothing: 0.1
   task_weights:
     summarization: 1.0
-    emotion: 0.5
-    topic: 0.5
-  max_train_samples: 3000  # 3k samples for better validation
   max_val_samples: 300
-  early_stopping_patience: 5  # Stop if no improvement
   log_grad_norm_frequency: 100
-# Disable compile for faster startup in dev
-compile_encoder: false
-compile_decoder: false
-tokenizer_max_length: 512

 # Development/Testing Configuration for FLAN-T5-base
+# FAST iteration for debugging - optimized for speed
+# VRAM Usage: ~9-10GB peak (12GB available)
+# Training time: ~5 minutes on RTX 4070 12GB
 # Use: python scripts/train.py training=dev
 dataloader:
+  batch_size: 10  # Optimal with FlashAttention
   shuffle: true
   num_workers: 4
   pin_memory: true
 optimizer:
   name: adamw
+  lr: 5.0e-5
   weight_decay: 0.01
   eps: 1.0e-8
   betas: [0.9, 0.999]
 scheduler:
   name: cosine
+  warmup_steps: 50  # Less warmup for short runs
 trainer:
   max_epochs: 3
   gradient_clip_norm: 1.0
+  gradient_accumulation_steps: 6  # Effective batch: 60 (10*6)
   validation_max_length: 128
+  label_smoothing: 0.0  # Simpler backward graph for dev
   task_weights:
     summarization: 1.0
+    emotion: 1.5
+    topic: 0.5  # Reduced - topic already saturated at 86%
+  max_train_samples: 3000
   max_val_samples: 300
+  early_stopping_patience: 5
   log_grad_norm_frequency: 100
+# Enable compile for speed (worth the startup cost)
+compile_encoder: true
+compile_decoder: true
+# Speed optimizations
+tokenizer_max_length: 256
+gradient_checkpointing: true
+# FLAN-T5 has NO learned positional embeddings - only relative position bias
+# Disabling this causes repetition loops (model can't track sequence position)
+use_relative_position_bias: true
+# Freeze lower encoder layers (0-5) to preserve pretrained knowledge
+# Upper layers (6-11) adapt to summarization style
+freeze_encoder_layers: 6

configs/training/full.yaml CHANGED Viewed

@@ -1,11 +1,11 @@
 # Full Training Configuration for FLAN-T5-base
-# Complete training run with capped samples for reasonable time
-# VRAM Usage: ~11GB peak (12GB available)
-# Training time: ~2 hours on RTX 4070 12GB with torch.compile
 # Use: python scripts/train.py training=full
 dataloader:
-  batch_size: 6  # Keep at 6 to stay within 12GB VRAM
   shuffle: true
   num_workers: 4
   pin_memory: true
@@ -14,32 +14,41 @@ dataloader:
 optimizer:
   name: adamw
-  lr: 5.0e-5  # Slightly higher LR for faster convergence
   weight_decay: 0.01
   eps: 1.0e-6
   betas: [0.9, 0.999]
 scheduler:
   name: cosine
-  warmup_steps: 500  # Less warmup needed
 trainer:
-  max_epochs: 5  # Converges by epoch 4-5
   gradient_clip_norm: 1.0
-  gradient_accumulation_steps: 10  # Effective batch: 60 (6*10)
   validation_max_length: 128
   label_smoothing: 0.1
   task_weights:
-    summarization: 1.0  # Reduced from 1.2 to give emotion room
-    emotion: 1.5        # Increased to prevent degradation
-    topic: 0.8          # Reduced since topic already near SOTA
-  max_train_samples: 50000  # Cap training for speed
-  max_val_samples: 3000     # Faster validation
-  early_stopping_patience: 3
   log_grad_norm_frequency: 100
-# Enable torch.compile for maximum speed
 compile_encoder: true
 compile_decoder: true
-tokenizer_max_length: 512

 # Full Training Configuration for FLAN-T5-base
+# BEST QUALITY - use for final model training
+# VRAM Usage: ~9-10GB (12GB available)
+# Training time: ~1 hour on RTX 4070 12GB
 # Use: python scripts/train.py training=full
 dataloader:
+  batch_size: 10  # Optimal for RTX 4070 12GB
   shuffle: true
   num_workers: 4
   pin_memory: true
 optimizer:
   name: adamw
+  lr: 2.0e-5  # Lower LR for best convergence
   weight_decay: 0.01
   eps: 1.0e-6
   betas: [0.9, 0.999]
 scheduler:
   name: cosine
+  warmup_steps: 500
 trainer:
+  max_epochs: 8  # More epochs for best results
   gradient_clip_norm: 1.0
+  gradient_accumulation_steps: 6  # Effective batch: 60 (10*6)
   validation_max_length: 128
   label_smoothing: 0.1
   task_weights:
+    summarization: 1.0
+    emotion: 1.5  # Boost emotion (tends to underfit)
+    topic: 0.5  # Reduced - topic already saturated at 86%
+  max_train_samples: 50000
+  max_val_samples: 3000
+  early_stopping_patience: 4
   log_grad_norm_frequency: 100
 compile_encoder: true
 compile_decoder: true
+# FULL QUALITY SETTINGS
+tokenizer_max_length: 512  # Full context for summarization
+gradient_checkpointing: true
+# FLAN-T5 has NO learned positional embeddings - only relative position bias
+# Disabling this causes repetition loops (model can't track sequence position)
+use_relative_position_bias: true
+# Freeze lower encoder layers (0-5) to preserve pretrained knowledge
+# Upper layers (6-11) adapt to summarization style
+freeze_encoder_layers: 6

configs/training/medium.yaml CHANGED Viewed

@@ -1,11 +1,11 @@
 # Medium Configuration for FLAN-T5-base
-# Balanced approach - good results in reasonable time
-# VRAM Usage: ~9-10GB peak (12GB available)
-# Training time: ~45-60 minutes on RTX 4070 12GB with torch.compile
 # Use: python scripts/train.py training=medium
 dataloader:
-  batch_size: 6  # Conservative for 12GB VRAM with torch.compile
   shuffle: true
   num_workers: 4
   pin_memory: true
@@ -14,32 +14,41 @@ dataloader:
 optimizer:
   name: adamw
-  lr: 3.0e-5  # Balanced LR for quality
   weight_decay: 0.01
   eps: 1.0e-6
   betas: [0.9, 0.999]
 scheduler:
   name: cosine
-  warmup_steps: 500  # ~2% warmup for 25k steps
 trainer:
-  max_epochs: 5  # More epochs for better convergence
   gradient_clip_norm: 1.0
-  gradient_accumulation_steps: 12  # Effective batch: 72 (6*12)
   validation_max_length: 128
   label_smoothing: 0.1
   task_weights:
-    summarization: 1.2  # Slightly prioritize summarization
-    emotion: 0.8
-    topic: 0.8
-  max_train_samples: 25000  # 25k samples - good balance
   max_val_samples: 2500
   early_stopping_patience: 3
   log_grad_norm_frequency: 100
-# Enable torch.compile for 1.5-2x speedup
 compile_encoder: true
 compile_decoder: true
-tokenizer_max_length: 512

 # Medium Configuration for FLAN-T5-base
+# Balanced: good quality with reasonable speed
+# VRAM Usage: ~8-9GB (12GB available)
+# Training time: ~25-35 minutes on RTX 4070 12GB
 # Use: python scripts/train.py training=medium
 dataloader:
+  batch_size: 10  # Optimal for RTX 4070 12GB
   shuffle: true
   num_workers: 4
   pin_memory: true
 optimizer:
   name: adamw
+  lr: 3.0e-5  # Slightly lower LR for stability
   weight_decay: 0.01
   eps: 1.0e-6
   betas: [0.9, 0.999]
 scheduler:
   name: cosine
+  warmup_steps: 300
 trainer:
+  max_epochs: 5
   gradient_clip_norm: 1.0
+  gradient_accumulation_steps: 6  # Effective batch: 60 (10*6)
   validation_max_length: 128
   label_smoothing: 0.1
   task_weights:
+    summarization: 1.0
+    emotion: 1.5
+    topic: 0.5  # Reduced - topic already saturated at 86%
+  max_train_samples: 25000
   max_val_samples: 2500
   early_stopping_patience: 3
   log_grad_norm_frequency: 100
 compile_encoder: true
 compile_decoder: true
+# Balance: shorter sequences but keep T5's relative position bias for quality
+tokenizer_max_length: 384
+gradient_checkpointing: true
+# FLAN-T5 has NO learned positional embeddings - only relative position bias
+# Disabling this causes repetition loops (model can't track sequence position)
+use_relative_position_bias: true
+# Freeze lower encoder layers (0-5) to preserve pretrained knowledge
+# Upper layers (6-11) adapt to summarization style
+freeze_encoder_layers: 6

docs/architecture.md CHANGED Viewed

@@ -51,9 +51,9 @@ The `factory.py` module loads weights from FLAN-T5-base, which uses a compatible
 | Task | Dataset | Size | Labels |
 | ---- | ------- | ---- | ------ |
-| Summarization | CNN/DailyMail + BookSum | ~110K | Text→Summary |
 | Emotion | GoEmotions | ~43K | 28 emotions (multi-label) |
-| Topic | AG News | ~120K | 4 categories |
 | Books | Gutenberg (prose chunks) | ~30K | Literary text |
 ### T5 Tokenizer Differences

 | Task | Dataset | Size | Labels |
 | ---- | ------- | ---- | ------ |
+| Summarization | BookSum + arXiv | ~90K | Text→Summary |
 | Emotion | GoEmotions | ~43K | 28 emotions (multi-label) |
+| Topic | Books + Papers | ~50K | 8 categories (Fiction, Science, Technology, etc.) |
 | Books | Gutenberg (prose chunks) | ~30K | Literary text |
 ### T5 Tokenizer Differences

outputs/evaluation_report.json DELETED Viewed

@@ -1,81 +0,0 @@
-{
-  "split": "val",
-  "summarization": {
-    "rouge_like": 0.2817535277055523,
-    "bleu": 0.06501593900536834
-  },
-  "emotion": {
-    "f1_macro": 0.4053446650505066
-  },
-  "topic": {
-    "accuracy": 0.7548042704626334,
-    "classification_report": {
-      "Business & Finance": {
-        "precision": 0.6826859776168532,
-        "recall": 0.5221550855991943,
-        "f1-score": 0.5917261055634807,
-        "support": 1986
-      },
-      "Computers & Internet": {
-        "precision": 0.8468166586883676,
-        "recall": 0.894790085988872,
-        "f1-score": 0.8701426463354648,
-        "support": 1977
-      },
-      "Education & Reference": {
-        "precision": 0.6067106710671067,
-        "recall": 0.5627551020408164,
-        "f1-score": 0.5839068290100582,
-        "support": 1960
-      },
-      "Entertainment & Music": {
-        "precision": 0.732976653696498,
-        "recall": 0.7708439897698209,
-        "f1-score": 0.7514335577162802,
-        "support": 1955
-      },
-      "Family & Relationships": {
-        "precision": 0.7356746765249538,
-        "recall": 0.8101781170483461,
-        "f1-score": 0.7711310244611286,
-        "support": 1965
-      },
-      "Health": {
-        "precision": 0.7917267917267917,
-        "recall": 0.8372329603255341,
-        "f1-score": 0.8138442521631644,
-        "support": 1966
-      },
-      "Politics & Government": {
-        "precision": 0.7916459472899056,
-        "recall": 0.8097660223804679,
-        "f1-score": 0.8006034699522253,
-        "support": 1966
-      },
-      "Science & Mathematics": {
-        "precision": 0.749162278602202,
-        "recall": 0.7972491085073866,
-        "f1-score": 0.7724580454096742,
-        "support": 1963
-      },
-      "Society & Culture": {
-        "precision": 0.6588683351468988,
-        "recall": 0.6181725370086779,
-        "f1-score": 0.637872004213853,
-        "support": 1959
-      },
-      "Sports": {
-        "precision": 0.909317389138017,
-        "recall": 0.9249873289406995,
-        "f1-score": 0.9170854271356784,
-        "support": 1973
-      },
-      "macro avg": {
-        "precision": 0.7505585379497595,
-        "recall": 0.7548130337609815,
-        "f1-score": 0.7510203361961008,
-        "support": 19670
-      }
-    }
-  }
-}

outputs/training_history.json CHANGED Viewed

@@ -1,92 +1,92 @@
 {
   "train_epoch_1": {
-    "summarization_loss": 3.7986922026081054,
-    "summarization_rouge_like": 0.38785950375542677,
-    "emotion_loss": 0.6569146523665603,
-    "emotion_f1": 0.0803471759769852,
-    "topic_loss": 1.3537324049331485,
-    "topic_accuracy": 0.4645228381729452,
-    "total_loss": 6.166948288969483
   },
   "val_epoch_1": {
-    "summarization_loss": 3.1010914066140884,
-    "summarization_rouge_like": 0.4547831050626749,
-    "emotion_loss": 0.47831222164831,
-    "emotion_f1": 0.07989733061380237,
-    "topic_loss": 1.1463579110962023,
-    "topic_accuracy": 0.8397282174260592,
-    "total_loss": 5.021045794132517
   },
   "train_epoch_2": {
-    "summarization_loss": 3.519661677836342,
-    "summarization_rouge_like": 0.40693338191007866,
-    "emotion_loss": 0.2990482480142052,
-    "emotion_f1": 0.25253565061903593,
-    "topic_loss": 0.5421501434865632,
-    "topic_accuracy": 0.8869290456763608,
-    "total_loss": 4.896552726604225
   },
   "val_epoch_2": {
-    "summarization_loss": 3.022662199944329,
-    "summarization_rouge_like": 0.45815133655381807,
-    "emotion_loss": 0.19708226060124037,
-    "emotion_f1": 0.302215425453955,
-    "topic_loss": 0.28093130860647425,
-    "topic_accuracy": 0.9172661870503583,
-    "total_loss": 4.009605495299369
   },
   "train_epoch_3": {
-    "summarization_loss": 3.456413923878735,
-    "summarization_rouge_like": 0.4113752870178118,
-    "emotion_loss": 0.18330693083835614,
-    "emotion_f1": 0.30698023489509907,
-    "topic_loss": 0.2889783758940973,
-    "topic_accuracy": 0.9169066474682156,
-    "total_loss": 4.525524954040441
   },
   "val_epoch_3": {
-    "summarization_loss": 3.0019707325265275,
-    "summarization_rouge_like": 0.4592321986281997,
-    "emotion_loss": 0.16639868924014575,
-    "emotion_f1": 0.3015063897543531,
-    "topic_loss": 0.23863075083072524,
-    "topic_accuracy": 0.9280575539568332,
-    "total_loss": 3.9263884310885304
   },
   "train_epoch_4": {
-    "summarization_loss": 3.4258855361860663,
-    "summarization_rouge_like": 0.4135803384924355,
-    "emotion_loss": 0.16595664669032975,
-    "emotion_f1": 0.31446844452103895,
-    "topic_loss": 0.24658246585826152,
-    "topic_accuracy": 0.9276857851372029,
-    "total_loss": 4.441093933462159
   },
   "val_epoch_4": {
-    "summarization_loss": 2.992023795628719,
-    "summarization_rouge_like": 0.4595829821013028,
-    "emotion_loss": 0.16106250848201253,
-    "emotion_f1": 0.299241534820635,
-    "topic_loss": 0.2258928704747765,
-    "topic_accuracy": 0.9280575539568333,
-    "total_loss": 3.8999928579198935
   },
   "train_epoch_5": {
-    "summarization_loss": 3.4150345063421232,
-    "summarization_rouge_like": 0.41468036090685273,
-    "emotion_loss": 0.1624394242665394,
-    "emotion_f1": 0.31033963250845154,
-    "topic_loss": 0.2336994289211126,
-    "topic_accuracy": 0.9319654427645914,
-    "total_loss": 4.4149524901606805
   },
   "val_epoch_5": {
-    "summarization_loss": 2.9899252604523436,
-    "summarization_rouge_like": 0.45984993646884514,
-    "emotion_loss": 0.15985918722207026,
-    "emotion_f1": 0.2971099066666419,
-    "topic_loss": 0.22285484572162303,
-    "topic_accuracy": 0.9284572342126283,
-    "total_loss": 3.894081538897767
   }
 }

 {
   "train_epoch_1": {
+    "summarization_loss": 4.343819652843475,
+    "summarization_rouge_like": 0.18423229737482247,
+    "emotion_loss": 0.4579651211887598,
+    "emotion_f1": 0.11036156222745776,
+    "topic_loss": 1.6671979689359664,
+    "topic_accuracy": 0.4339600000000011,
+    "total_loss": 6.364525709775378
   },
   "val_epoch_1": {
+    "summarization_loss": 4.259150079727172,
+    "summarization_rouge_like": 0.17393867024365672,
+    "emotion_loss": 0.15817135846614838,
+    "emotion_f1": 0.07330303180590272,
+    "topic_loss": 0.8542358543872833,
+    "topic_accuracy": 0.7782222222222225,
+    "total_loss": 5.179795800936217
   },
   "train_epoch_2": {
+    "summarization_loss": 4.103479218292236,
+    "summarization_rouge_like": 0.1940134706014566,
+    "emotion_loss": 0.15515640188455582,
+    "emotion_f1": 0.2337232402900234,
+    "topic_loss": 0.4888198138475418,
+    "topic_accuracy": 0.9067600000000139,
+    "total_loss": 4.7272696721971075
   },
   "val_epoch_2": {
+    "summarization_loss": 4.18841742515564,
+    "summarization_rouge_like": 0.17885957731314292,
+    "emotion_loss": 0.1549839802980423,
+    "emotion_f1": 0.3034666753411293,
+    "topic_loss": 0.4580393745005131,
+    "topic_accuracy": 0.852800000000002,
+    "total_loss": 4.787324895203119
   },
   "train_epoch_3": {
+    "summarization_loss": 4.041395119285584,
+    "summarization_rouge_like": 0.1970914256375089,
+    "emotion_loss": 0.15249912014603614,
+    "emotion_f1": 0.24187604461871087,
+    "topic_loss": 0.21472855980992317,
+    "topic_accuracy": 0.9627200000000102,
+    "total_loss": 4.441926647352566
   },
   "val_epoch_3": {
+    "summarization_loss": 4.16257409954071,
+    "summarization_rouge_like": 0.18115953723449993,
+    "emotion_loss": 0.15324361461400987,
+    "emotion_f1": 0.30253334194421766,
+    "topic_loss": 0.4939193711131811,
+    "topic_accuracy": 0.8632000000000015,
+    "total_loss": 4.7875750183522765
   },
   "train_epoch_4": {
+    "summarization_loss": 4.012135830116272,
+    "summarization_rouge_like": 0.19873380769300908,
+    "emotion_loss": 0.15166676665246487,
+    "emotion_f1": 0.24661330536156892,
+    "topic_loss": 0.14288409658223392,
+    "topic_accuracy": 0.9780000000000073,
+    "total_loss": 4.353943257360758
   },
   "val_epoch_4": {
+    "summarization_loss": 4.1532666339874265,
+    "summarization_rouge_like": 0.18147128191578765,
+    "emotion_loss": 0.15282477751374246,
+    "emotion_f1": 0.2984000087380409,
+    "topic_loss": 0.5214869263619184,
+    "topic_accuracy": 0.8580000000000017,
+    "total_loss": 4.799693341347577
   },
   "train_epoch_5": {
+    "summarization_loss": 4.002264401054382,
+    "summarization_rouge_like": 0.1992749810224614,
+    "emotion_loss": 0.15127245344221593,
+    "emotion_f1": 0.24676951464861632,
+    "topic_loss": 0.12673698243945836,
+    "topic_accuracy": 0.9796800000000072,
+    "total_loss": 4.330562667169272
   },
   "val_epoch_5": {
+    "summarization_loss": 4.149239055633545,
+    "summarization_rouge_like": 0.18202557571683906,
+    "emotion_loss": 0.15270190620422364,
+    "emotion_f1": 0.3021333419680595,
+    "topic_loss": 0.5217973904460669,
+    "topic_accuracy": 0.8580000000000011,
+    "total_loss": 4.795729827296732
   }
 }

scripts/demo_gradio.py CHANGED Viewed

@@ -107,20 +107,20 @@ def analyze_text(text: str) -> tuple[str, str, str]:
 # --------------- Sample Texts ---------------
 SAMPLES = {
-    "business": """Global markets tumbled today as investors reacted to rising inflation concerns.
-The Federal Reserve hinted at potential interest rate hikes, sending shockwaves through technology
-and banking sectors. Analysts predict continued volatility as economic uncertainty persists.
-Major indices fell by over 2%, with tech stocks leading the decline.""",
-    "science": """Scientists at MIT have developed a breakthrough quantum computing chip that
 operates at room temperature. This advancement could revolutionize drug discovery, cryptography,
 and artificial intelligence. The research team published their findings in Nature, demonstrating
 stable qubit operations for over 100 microseconds.""",
-    "sports": """The championship game ended in dramatic fashion as the underdog team scored in
-the final seconds to secure victory. Fans rushed the field in celebration, marking the team's
-first title in 25 years. The winning goal came from a rookie player who had only joined the
-team this season.""",
 }
@@ -146,7 +146,7 @@ with gr.Blocks(title="LexiMind") as demo:
                     text_input = gr.Textbox(
                         label="Input Text",
                         lines=6,
-                        placeholder="Paste a news article or any text to analyze...",
                     )
                     with gr.Row():
                         analyze_btn = gr.Button("Analyze", variant="primary")
@@ -154,9 +154,9 @@ with gr.Blocks(title="LexiMind") as demo:
                     gr.Markdown("**Quick samples:**")
                     with gr.Row():
-                        btn_business = gr.Button("Business", size="sm")
                         btn_science = gr.Button("Science", size="sm")
-                        btn_sports = gr.Button("Sports", size="sm")
                 with gr.Column(scale=2):
                     summary_output = gr.Textbox(label="Generated Summary", lines=4, interactive=False)
@@ -167,9 +167,9 @@ with gr.Blocks(title="LexiMind") as demo:
             # Event handlers
             analyze_btn.click(analyze_text, inputs=[text_input], outputs=[summary_output, emotions_output, topic_output])
             clear_btn.click(lambda: ("", "", "", ""), outputs=[text_input, summary_output, emotions_output, topic_output])
-            btn_business.click(lambda: SAMPLES["business"], outputs=[text_input])
             btn_science.click(lambda: SAMPLES["science"], outputs=[text_input])
-            btn_sports.click(lambda: SAMPLES["sports"], outputs=[text_input])
         # ===================== TAB 2: METRICS =====================
         with gr.Tab("Metrics"):

 # --------------- Sample Texts ---------------
 SAMPLES = {
+    "fiction": """The old lighthouse keeper had watched countless storms batter the rocky coast,
+but nothing prepared him for what emerged from the fog that evening. A ship unlike any he'd
+seen before - its hull seemingly made of living shadow - drifted silently toward the rocks.
+He rang the warning bell, knowing somehow it wouldn't matter.""",
+    "science": """Researchers at MIT have developed a breakthrough quantum computing chip that
 operates at room temperature. This advancement could revolutionize drug discovery, cryptography,
 and artificial intelligence. The research team published their findings in Nature, demonstrating
 stable qubit operations for over 100 microseconds.""",
+    "technology": """The new large language model demonstrates unprecedented reasoning capabilities,
+solving complex mathematical proofs and generating functional code across multiple programming
+languages. Benchmarks show it outperforms previous systems by significant margins on tasks
+requiring multi-step logical inference and long-context understanding.""",
 }
                     text_input = gr.Textbox(
                         label="Input Text",
                         lines=6,
+                        placeholder="Paste a book excerpt, research abstract, or any text to analyze...",
                     )
                     with gr.Row():
                         analyze_btn = gr.Button("Analyze", variant="primary")
                     gr.Markdown("**Quick samples:**")
                     with gr.Row():
+                        btn_fiction = gr.Button("Fiction", size="sm")
                         btn_science = gr.Button("Science", size="sm")
+                        btn_tech = gr.Button("Technology", size="sm")
                 with gr.Column(scale=2):
                     summary_output = gr.Textbox(label="Generated Summary", lines=4, interactive=False)
             # Event handlers
             analyze_btn.click(analyze_text, inputs=[text_input], outputs=[summary_output, emotions_output, topic_output])
             clear_btn.click(lambda: ("", "", "", ""), outputs=[text_input, summary_output, emotions_output, topic_output])
+            btn_fiction.click(lambda: SAMPLES["fiction"], outputs=[text_input])
             btn_science.click(lambda: SAMPLES["science"], outputs=[text_input])
+            btn_tech.click(lambda: SAMPLES["technology"], outputs=[text_input])
         # ===================== TAB 2: METRICS =====================
         with gr.Tab("Metrics"):

scripts/download_data.py CHANGED Viewed

@@ -5,19 +5,24 @@
 """
 Dataset download script for LexiMind.
-Downloads and prepares training datasets:
-- CNN/DailyMail + BookSum for summarization (news + literary)
-- Project Gutenberg books for additional literary training
 - GoEmotions for emotion classification (28 labels)
-- AG News for topic classification (4 labels: World, Sports, Business, Sci/Tech)
 Usage:
     python scripts/download_data.py              # Download all
-    python scripts/download_data.py --task topic # Download specific task
-    python scripts/download_data.py --max-books 30000 --max-gutenberg 20000
 Author: Oliver Perrin
-Date: December 2025
 """
 from __future__ import annotations
@@ -35,7 +40,9 @@ from tqdm import tqdm
 # Output directory
 OUTPUT_DIR = Path(__file__).parent.parent / "data" / "processed"
-# Label definitions
 EMOTION_LABELS = [
     "admiration", "amusement", "anger", "annoyance", "approval", "caring",
     "confusion", "curiosity", "desire", "disappointment", "disapproval",
@@ -44,7 +51,57 @@ EMOTION_LABELS = [
     "relief", "remorse", "sadness", "surprise", "neutral",
 ]
-TOPIC_LABELS = ["World", "Sports", "Business", "Sci/Tech"]
 def write_jsonl(records: list[dict[str, Any]], path: Path, desc: str = "Writing") -> None:
@@ -56,225 +113,359 @@ def write_jsonl(records: list[dict[str, Any]], path: Path, desc: str = "Writing"
     print(f"  ✓ {len(records):,} samples → {path}")
-def download_summarization(max_news: int = 80000, max_books: int = 30000) -> None:
-    """Download CNN/DailyMail + BookSum for summarization."""
-    print("\n📰 Downloading Summarization...")
-    out_dir = OUTPUT_DIR / "summarization"
-    all_train: list[dict[str, Any]] = []
-    all_val: list[dict[str, Any]] = []
-    all_test: list[dict[str, Any]] = []
-    # CNN/DailyMail - great for news summarization
-    print("  Loading CNN/DailyMail...")
-    cnn = load_dataset("cnn_dailymail", "3.0.0")
-    for split_name in cnn.keys():
-        split = str(split_name)
-        data = cnn[split_name]
-        limit = max_news if "train" in split else max_news // 10
-        indices = random.sample(range(len(data)), min(len(data), limit))
-        records: list[dict[str, Any]] = []
-        for i in indices:
-            item = data[i]
-            article = item["article"]
-            highlights = item["highlights"]
-            if article and highlights:
-                records.append({"source": article, "summary": highlights})
-        if "train" in split:
-            all_train.extend(records)
-        elif "val" in split:
-            all_val.extend(records)
-        else:
-            all_test.extend(records)
-        print(f"    {split}: {len(records):,}")
-    # BookSum - literary text summarization (chapters → summaries)
-    print("  Loading BookSum...")
     booksum = load_dataset("kmfoda/booksum")
     for split_name in booksum.keys():
         split = str(split_name)
         data = booksum[split_name]
-        limit = max_books if "train" in split else max_books // 10
         indices = random.sample(range(len(data)), min(len(data), limit))
         records = []
-        for i in indices:
             item = data[i]
             chapter = item.get("chapter", "")
             summary = item.get("summary_text") or item.get("summary", "")
             if chapter and summary and len(chapter) > 300:
-                # Truncate very long chapters to fit model context
-                records.append({"source": chapter[:4000], "summary": summary})
-        if "train" in split:
-            all_train.extend(records)
-        elif "val" in split:
-            all_val.extend(records)
-        else:
-            all_test.extend(records)
         print(f"    {split}: {len(records):,}")
-    random.shuffle(all_train)
-    write_jsonl(all_train, out_dir / "train.jsonl", "train")
-    write_jsonl(all_val, out_dir / "validation.jsonl", "validation")
-    write_jsonl(all_test, out_dir / "test.jsonl", "test")
-# Patterns to filter out Gutenberg boilerplate
-GUTENBERG_JUNK_PATTERNS = [
-    r"Project Gutenberg",
-    r"www\.gutenberg\.org",
-    r"This ebook is for the use of",
-    r"You may copy it, give it away",
-    r"Gutenberg License",
-    r"^\*\*\* START OF",
-    r"^\*\*\* END OF",
-    r"Produced by",
-    r"Transcriber's Note",
-    r"Editor's Note",
-    r"TABLE OF CONTENTS",
-    r"CONTENTS\s*$",
-    r"^\s*CHAPTER\s+[IVXLC\d]+",
-    r"^\s*Chapter\s+[IVXLC\d]+",
-    r"^\s*BOOK\s+[IVXLC\d]+",
-    r"^\s*PART\s+[IVXLC\d]+",
-    r"^\s*PREFACE\s*$",
-    r"^\s*INTRODUCTION\s*$",
-    r"^\s*EPILOGUE\s*$",
-    r"^\s*PROLOGUE\s*$",
-    r"^\s*APPENDIX",
-    r"^\s*INDEX\s*$",
-    r"^\s*FOOTNOTES?\s*$",
-    r"^\s*\[Illustration",
-    r"^\s*\[Transcriber",
-    r"E-text prepared by",
-    r"Internet Archive",
-    r"This file was produced",
-    r"Distributed Proofreaders",
-    r"^\s*_+\s*$",  # Lines of underscores
-    r"^\s*\*+\s*$",  # Lines of asterisks
-]
-GUTENBERG_JUNK_REGEX = re.compile("|".join(GUTENBERG_JUNK_PATTERNS), re.IGNORECASE)
-def is_clean_prose(text: str) -> bool:
-    """Check if text is clean literary prose (not boilerplate/metadata)."""
-    # Must be substantial
-    if len(text) < 300 or len(text) > 3000:
-        return False
-    # Skip if contains Gutenberg boilerplate
-    if GUTENBERG_JUNK_REGEX.search(text):
-        return False
-    # Must have actual sentences (prose check)
-    # Good prose has periods, commas, and lowercase letters
-    if text.count('.') < 2:
-        return False
-    # Skip if mostly uppercase (headers, titles)
-    uppercase_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)
-    if uppercase_ratio > 0.3:
-        return False
-    # Skip if too many numbers (tables, dates, page numbers)
-    digit_ratio = sum(1 for c in text if c.isdigit()) / max(len(text), 1)
-    if digit_ratio > 0.1:
-        return False
-    return True
-def download_gutenberg(max_samples: int = 20000) -> None:
     """
-    Download Project Gutenberg books for literary language modeling.
-    Uses the standardized_gutenberg dataset which has clean, parsed books.
-    Creates paragraph-level chunks for training diversity.
-    Filters out boilerplate (headers, licenses, TOC, etc).
     """
-    print("\n📚 Downloading Gutenberg Books...")
-    out_dir = OUTPUT_DIR / "books"
-    out_dir.mkdir(parents=True, exist_ok=True)
-    # Load Gutenberg dataset - has ~60K books
-    print("  Loading standardized_gutenberg dataset...")
     try:
         gutenberg = load_dataset("sedthh/gutenberg_english", split="train")
     except Exception:
-        # Fallback to alternative dataset
-        print("  Trying alternative: pg19...")
         gutenberg = load_dataset("pg19", split="train")
     records: list[dict[str, Any]] = []
-    books_processed = 0
-    chunks_filtered = 0
-    # Sample books randomly
     indices = list(range(len(gutenberg)))
     random.shuffle(indices)
-    print("  Processing books into clean prose chunks...")
-    for i in tqdm(indices, desc="Books", leave=False):
         if len(records) >= max_samples:
             break
         item = gutenberg[i]
-        # Handle both uppercase (sedthh/gutenberg_english) and lowercase (pg19) keys
-        text = item.get("TEXT", "") or item.get("text", "") or item.get("content", "")
         metadata = item.get("METADATA", {}) or {}
-        title = metadata.get("title", "") if isinstance(metadata, dict) else ""
-        if not title:
-            title = item.get("title", f"Book_{i}")
         if not text or len(text) < 1000:
             continue
-        # Split into paragraphs for diverse training samples
-        paragraphs = re.split(r'\n\s*\n', text)
-        for para in paragraphs:
-            para = para.strip()
-            # Use strict filtering for clean prose only
-            if is_clean_prose(para):
-                records.append({
-                    "text": para,
-                    "title": title,
-                    "type": "gutenberg"
-                })
-                if len(records) >= max_samples:
-                    break
-            else:
-                chunks_filtered += 1
-        books_processed += 1
-    # Split into train/val/test (90/5/5)
-    random.shuffle(records)
-    n = len(records)
-    train_end = int(n * 0.9)
-    val_end = int(n * 0.95)
-    train_records = records[:train_end]
-    val_records = records[train_end:val_end]
-    test_records = records[val_end:]
-    write_jsonl(train_records, out_dir / "train.jsonl", "train")
-    write_jsonl(val_records, out_dir / "validation.jsonl", "validation")
-    write_jsonl(test_records, out_dir / "test.jsonl", "test")
-    print(f"  ✓ {books_processed:,} books → {len(records):,} clean prose chunks")
-    print(f"  ✓ Filtered out {chunks_filtered:,} boilerplate/metadata chunks")
 def download_emotions() -> None:
     """Download GoEmotions for emotion classification."""
-    print("\n😊 Downloading Emotions...")
     out_dir = OUTPUT_DIR / "emotion"
     ds = load_dataset("google-research-datasets/go_emotions", "simplified")
@@ -297,53 +488,94 @@ def download_emotions() -> None:
     print(f"  ✓ {len(EMOTION_LABELS)} emotion labels saved")
-def download_topics(max_samples: int = 100000) -> None:
-    """Download AG News for topic classification (4 clean categories)."""
-    print("\n📂 Downloading Topics...")
-    out_dir = OUTPUT_DIR / "topic"
-    ds = load_dataset("fancyzhx/ag_news")
-    train_data = ds["train"]
-    test_data = ds["test"]
-    # Split train into train/val
-    all_idx = list(range(len(train_data)))
-    random.shuffle(all_idx)
-    train_idx = all_idx[:max_samples]
-    val_idx = all_idx[max_samples:max_samples + max_samples // 10]
-    splits_config = [
-        ("train", train_idx, train_data),
-        ("validation", val_idx, train_data),
-        ("test", list(range(len(test_data))), test_data),
-    ]
-    for split_name, indices, data in splits_config:
-        records: list[dict[str, Any]] = []
-        for i in tqdm(indices, desc=split_name, leave=False):
-            item = data[i]
-            text = item.get("text", "")
-            label = item.get("label", 0)
-            if text and len(text) > 50:
-                records.append({"text": text, "topic": TOPIC_LABELS[label]})
-        write_jsonl(records, out_dir / f"{split_name}.jsonl", split_name)
-    (out_dir / "labels.json").write_text(json.dumps(TOPIC_LABELS, indent=2))
-    print(f"  ✓ {len(TOPIC_LABELS)} topic labels saved")
 def main() -> None:
     parser = argparse.ArgumentParser(description="Download LexiMind datasets")
     parser.add_argument(
-        "--task",
         choices=["all", "summarization", "emotion", "topic", "gutenberg"],
-        default="all",
         help="Dataset to download"
     )
-    parser.add_argument("--max-news", type=int, default=80000, help="Max news articles")
-    parser.add_argument("--max-books", type=int, default=30000, help="Max BookSum chapters")
-    parser.add_argument("--max-gutenberg", type=int, default=20000, help="Max Gutenberg chunks")
-    parser.add_argument("--max-topics", type=int, default=100000, help="Max topic samples")
     parser.add_argument("--seed", type=int, default=42, help="Random seed")
     args = parser.parse_args()
@@ -351,16 +583,17 @@ def main() -> None:
     print("=" * 60)
     print("LexiMind Dataset Download")
     print("=" * 60)
     if args.task in ["all", "summarization"]:
-        download_summarization(args.max_news, args.max_books)
-    if args.task in ["all", "gutenberg"]:
-        download_gutenberg(args.max_gutenberg)
     if args.task in ["all", "emotion"]:
         download_emotions()
     if args.task in ["all", "topic"]:
         download_topics(args.max_topics)
     print("\n" + "=" * 60)
     print("✅ Download complete!")

 """
 Dataset download script for LexiMind.
+Focus: Books, Academic Papers, Technical Writing
+- NO news articles (overdone, dated)
+- YES literary text, research, technical writing
+Datasets:
+- BookSum for literary summarization
+- arXiv for academic paper summarization
+- Project Gutenberg for literary language
 - GoEmotions for emotion classification (28 labels)
+- Custom topic classification: Fiction, Science, Technology, etc.
 Usage:
     python scripts/download_data.py              # Download all
+    python scripts/download_data.py --task arxiv # Download specific task
+    python scripts/download_data.py --max-arxiv 50000
 Author: Oliver Perrin
+Date: January 2026
 """
 from __future__ import annotations
 # Output directory
 OUTPUT_DIR = Path(__file__).parent.parent / "data" / "processed"
+# ============== LABEL DEFINITIONS ==============
+# 28 emotions from GoEmotions - works for all text types
 EMOTION_LABELS = [
     "admiration", "amusement", "anger", "annoyance", "approval", "caring",
     "confusion", "curiosity", "desire", "disappointment", "disapproval",
     "relief", "remorse", "sadness", "surprise", "neutral",
 ]
+# New topic labels for books + papers + blogs
+TOPIC_LABELS = [
+    "Fiction",           # Novels, short stories, literary fiction
+    "Science",           # Physics, chemistry, biology, nature
+    "Technology",        # CS, engineering, programming, AI/ML
+    "Philosophy",        # Ethics, logic, metaphysics, epistemology
+    "History",           # Historical texts, biographies, memoirs
+    "Psychology",        # Mind, behavior, self-help, mental health
+    "Business",          # Economics, finance, entrepreneurship
+    "Arts",              # Music, visual arts, film, architecture
+]
+# arXiv category → our topic mapping
+ARXIV_CATEGORY_MAP = {
+    # Computer Science
+    "cs.AI": "Technology", "cs.CL": "Technology", "cs.CV": "Technology",
+    "cs.LG": "Technology", "cs.NE": "Technology", "cs.RO": "Technology",
+    "cs.SE": "Technology", "cs.PL": "Technology", "cs.DB": "Technology",
+    "cs.DS": "Technology", "cs.CR": "Technology", "cs.DC": "Technology",
+    "cs.HC": "Technology", "cs.IR": "Technology", "cs.IT": "Technology",
+    "cs.MA": "Technology", "cs.MM": "Technology", "cs.NI": "Technology",
+    "cs.OS": "Technology", "cs.PF": "Technology", "cs.SY": "Technology",
+    # Physics
+    "physics": "Science", "astro-ph": "Science", "cond-mat": "Science",
+    "gr-qc": "Science", "hep-ex": "Science", "hep-lat": "Science",
+    "hep-ph": "Science", "hep-th": "Science", "math-ph": "Science",
+    "nlin": "Science", "nucl-ex": "Science", "nucl-th": "Science",
+    "quant-ph": "Science",
+    # Math
+    "math": "Science",
+    # Biology/Medicine
+    "q-bio": "Science", "stat": "Science",
+    # Economics/Finance
+    "econ": "Business", "q-fin": "Business",
+    # Electrical Engineering
+    "eess": "Technology",
+}
+# Gutenberg subject → our topic mapping
+GUTENBERG_SUBJECT_MAP = {
+    "fiction": "Fiction", "novel": "Fiction", "stories": "Fiction",
+    "poetry": "Arts", "drama": "Arts", "plays": "Arts",
+    "science": "Science", "physics": "Science", "chemistry": "Science",
+    "biology": "Science", "nature": "Science", "astronomy": "Science",
+    "philosophy": "Philosophy", "ethics": "Philosophy", "logic": "Philosophy",
+    "history": "History", "biography": "History", "memoir": "History",
+    "psychology": "Psychology", "mind": "Psychology",
+    "economics": "Business", "business": "Business", "finance": "Business",
+    "art": "Arts", "music": "Arts", "architecture": "Arts",
+    "technology": "Technology", "engineering": "Technology",
+}
 def write_jsonl(records: list[dict[str, Any]], path: Path, desc: str = "Writing") -> None:
     print(f"  ✓ {len(records):,} samples → {path}")
+# ============== SUMMARIZATION: BOOKS + ARXIV ==============
+def download_booksum(max_samples: int = 40000) -> list[dict[str, Any]]:
+    """Download BookSum - literary chapter summarization."""
+    print("\n📖 Loading BookSum (literary summarization)...")
+    all_records: list[dict[str, Any]] = []
     booksum = load_dataset("kmfoda/booksum")
     for split_name in booksum.keys():
         split = str(split_name)
         data = booksum[split_name]
+        limit = max_samples if "train" in split else max_samples // 10
         indices = random.sample(range(len(data)), min(len(data), limit))
         records = []
+        for i in tqdm(indices, desc=f"BookSum {split}", leave=False):
             item = data[i]
             chapter = item.get("chapter", "")
             summary = item.get("summary_text") or item.get("summary", "")
             if chapter and summary and len(chapter) > 300:
+                records.append({
+                    "source": chapter[:4000],
+                    "summary": summary,
+                    "type": "literary",
+                    "split": split,
+                })
+        all_records.extend(records)
         print(f"    {split}: {len(records):,}")
+    return all_records
+def clean_arxiv_text(text: str) -> str:
+    """Clean arXiv LaTeX-style text to make it more readable."""
+    import re
+    # Remove LaTeX math placeholders
+    text = re.sub(r'@xmath\d+', '', text)
+    text = re.sub(r'@xcite', '', text)
+    # Remove excessive whitespace
+    text = re.sub(r'\s+', ' ', text)
+    # Remove LaTeX commands
+    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', text)
+    text = re.sub(r'\\[a-zA-Z]+', '', text)
+    return text.strip()
+def download_arxiv_summarization(max_samples: int = 50000) -> list[dict[str, Any]]:
+    """
+    Download arXiv papers for academic summarization only.
+    Note: This dataset doesn't have categories, so can't be used for topic classification.
+    Returns: summarization_records
+    """
+    print("\n🎓 Loading arXiv (academic papers for summarization)...")
+    print("  Loading dataset (this may take a minute)...")
+    arxiv = load_dataset("ccdv/arxiv-summarization", split="train")
+    summ_records: list[dict[str, Any]] = []
+    indices = list(range(len(arxiv)))
+    random.shuffle(indices)
+    print("  Processing papers...")
+    for i in tqdm(indices[:max_samples * 2], desc="arXiv", leave=False):
+        if len(summ_records) >= max_samples:
+            break
+        item = arxiv[i]
+        # Get abstract and article
+        abstract = item.get("abstract", "")
+        article = item.get("article", "")
+        if not abstract or len(abstract) < 100:
+            continue
+        # Clean LaTeX artifacts
+        abstract = clean_arxiv_text(abstract)
+        article = clean_arxiv_text(article)
+        # Skip if still has too many weird characters after cleaning
+        if '@' in abstract or '@' in article[:500]:
+            continue
+        # Summarization: article → abstract
+        if article and len(article) > 500:
+            summ_records.append({
+                "source": article[:4000],
+                "summary": abstract,
+                "type": "academic",
+            })
+    print(f"    Summarization: {len(summ_records):,}")
+    return summ_records
+def download_topics_from_datasets(max_samples: int = 50000) -> list[dict[str, Any]]:
     """
+    Download topic classification data from multiple sources with real categories.
+    Sources:
+    - 20 Newsgroups (classic topic classification)
+    - Wikipedia (article categories)
     """
+    print("\n📂 Loading topic classification datasets...")
+    records: list[dict[str, Any]] = []
+    # 20 Newsgroups - classic topic dataset
+    print("  Loading 20 Newsgroups...")
+    try:
+        newsgroups = load_dataset("SetFit/20_newsgroups", split="train")
+        # Map 20 newsgroups categories to our 8 topics
+        newsgroup_map = {
+            # Science
+            "sci.crypt": "Science", "sci.electronics": "Science",
+            "sci.med": "Science", "sci.space": "Science",
+            # Technology
+            "comp.graphics": "Technology", "comp.os.ms-windows.misc": "Technology",
+            "comp.sys.ibm.pc.hardware": "Technology", "comp.sys.mac.hardware": "Technology",
+            "comp.windows.x": "Technology",
+            # Philosophy/Religion
+            "alt.atheism": "Philosophy", "soc.religion.christian": "Philosophy",
+            "talk.religion.misc": "Philosophy",
+            # History/Politics
+            "talk.politics.guns": "History", "talk.politics.mideast": "History",
+            "talk.politics.misc": "History",
+            # Business
+            "misc.forsale": "Business",
+            # Sports/Recreation
+            "rec.autos": "Arts", "rec.motorcycles": "Arts",
+            "rec.sport.baseball": "Arts", "rec.sport.hockey": "Arts",
+        }
+        for item in tqdm(newsgroups, desc="20 Newsgroups", leave=False):
+            if len(records) >= max_samples:
+                break
+            label_name = item.get("label_text", "")
+            text = item.get("text", "")
+            if label_name in newsgroup_map and text and len(text) > 100:
+                records.append({
+                    "text": text[:1500],
+                    "topic": newsgroup_map[label_name],
+                    "source": "newsgroups",
+                })
+        print(f"    20 Newsgroups: {len(records):,}")
+    except Exception as e:
+        print(f"    20 Newsgroups failed: {e}")
+    # Add from Gutenberg for Fiction
+    gutenberg_topics = download_gutenberg_topics(max_samples // 4)
+    records.extend(gutenberg_topics)
+    # Add from scientific papers abstract dataset for more Science/Tech
+    print("  Loading scientific papers...")
+    try:
+        sci_papers = load_dataset("scientific_papers", "arxiv", split="train", streaming=True)
+        sci_count = 0
+        for item in tqdm(sci_papers, desc="Scientific papers", leave=False, total=max_samples//4):
+            if sci_count >= max_samples // 4:
+                break
+            abstract = item.get("abstract", "")
+            if abstract and len(abstract) > 100:
+                # Alternate between Science and Technology
+                topic = "Science" if sci_count % 2 == 0 else "Technology"
+                records.append({
+                    "text": abstract[:1500],
+                    "topic": topic,
+                    "source": "scientific_papers",
+                })
+                sci_count += 1
+        print(f"    Scientific papers: {sci_count:,}")
+    except Exception as e:
+        print(f"    Scientific papers failed: {e}")
+    return records
+def download_summarization(max_books: int = 40000, max_arxiv: int = 50000) -> None:
+    """Download all summarization data (books + arxiv, NO news)."""
+    print("\n📝 Downloading Summarization Data...")
+    out_dir = OUTPUT_DIR / "summarization"
+    all_records: list[dict[str, Any]] = []
+    # BookSum - literary
+    book_records = download_booksum(max_books)
+    all_records.extend(book_records)
+    # arXiv - academic (summarization only, no categories in this dataset)
+    arxiv_summ = download_arxiv_summarization(max_arxiv)
+    all_records.extend(arxiv_summ)
+    # Shuffle and split
+    random.shuffle(all_records)
+    # Split by original split if available, else 90/5/5
+    train_records = [r for r in all_records if r.get("split", "train") == "train" or "split" not in r]
+    val_records = [r for r in all_records if r.get("split") == "validation"]
+    test_records = [r for r in all_records if r.get("split") == "test"]
+    # If no split info, do 90/5/5
+    if len(val_records) < 100:
+        n = len(train_records)
+        random.shuffle(train_records)
+        val_records = train_records[int(n*0.9):int(n*0.95)]
+        test_records = train_records[int(n*0.95):]
+        train_records = train_records[:int(n*0.9)]
+    # Remove split key before saving
+    for r in train_records + val_records + test_records:
+        r.pop("split", None)
+    write_jsonl(train_records, out_dir / "train.jsonl", "train")
+    write_jsonl(val_records, out_dir / "validation.jsonl", "val")
+    write_jsonl(test_records, out_dir / "test.jsonl", "test")
+    print(f"\n  ✓ Total summarization: {len(train_records) + len(val_records) + len(test_records):,}")
+# ============== TOPIC CLASSIFICATION ==============
+def download_topics(max_samples: int = 50000) -> None:
+    """
+    Download topic classification data from multiple sources.
+    Sources:
+    - 20 Newsgroups (classic topic dataset)
+    - Gutenberg books (Fiction)
+    - Scientific papers (Science, Technology)
+    """
+    print("\n📂 Downloading Topic Classification...")
+    out_dir = OUTPUT_DIR / "topic"
+    # Get topic records from various sources
+    all_records = download_topics_from_datasets(max_samples)
+    # Balance topics
+    topic_counts: dict[str, list] = {t: [] for t in TOPIC_LABELS}
+    for r in all_records:
+        topic = r.get("topic")
+        if topic in topic_counts:
+            topic_counts[topic].append(r)
+    # Print distribution before balancing
+    print("\n  Topic distribution (before balancing):")
+    for topic, records in topic_counts.items():
+        print(f"    {topic}: {len(records):,}")
+    # Balance to min count (with some tolerance) - only from topics that have data
+    counts_with_data = [len(v) for v in topic_counts.values() if v]
+    if not counts_with_data:
+        print("  ⚠️ No topic data found!")
+        return
+    min_count = min(counts_with_data)
+    target_count = min(min_count, max_samples // len(TOPIC_LABELS))
+    balanced: list[dict[str, Any]] = []
+    for topic, records in topic_counts.items():
+        if records:
+            random.shuffle(records)
+            balanced.extend(records[:target_count])
+    random.shuffle(balanced)
+    # Split 90/5/5
+    n = len(balanced)
+    train_records = balanced[:int(n*0.9)]
+    val_records = balanced[int(n*0.9):int(n*0.95)]
+    test_records = balanced[int(n*0.95):]
+    write_jsonl(train_records, out_dir / "train.jsonl", "train")
+    write_jsonl(val_records, out_dir / "validation.jsonl", "val")
+    write_jsonl(test_records, out_dir / "test.jsonl", "test")
+    # Save labels - only labels that have data
+    used_labels = [t for t in TOPIC_LABELS if topic_counts.get(t)]
+    (out_dir / "labels.json").write_text(json.dumps(used_labels, indent=2))
+    print(f"\n  ✓ {len(used_labels)} topic labels with data: {used_labels}")
+def download_gutenberg_topics(max_samples: int = 30000) -> list[dict[str, Any]]:
+    """Extract topic-labeled samples from Gutenberg books."""
+    print("\n📚 Loading Gutenberg for topic classification...")
     try:
         gutenberg = load_dataset("sedthh/gutenberg_english", split="train")
     except Exception:
+        print("  Trying pg19...")
         gutenberg = load_dataset("pg19", split="train")
     records: list[dict[str, Any]] = []
     indices = list(range(len(gutenberg)))
     random.shuffle(indices)
+    for i in tqdm(indices, desc="Gutenberg topics", leave=False):
         if len(records) >= max_samples:
             break
         item = gutenberg[i]
+        text = item.get("TEXT", "") or item.get("text", "")
         metadata = item.get("METADATA", {}) or {}
         if not text or len(text) < 1000:
             continue
+        # Try to determine topic from metadata
+        subjects = ""
+        if isinstance(metadata, dict):
+            subjects = str(metadata.get("subjects", "")).lower()
+            subjects += " " + str(metadata.get("subject", "")).lower()
+            subjects += " " + str(metadata.get("category", "")).lower()
+        topic = None
+        for keyword, mapped_topic in GUTENBERG_SUBJECT_MAP.items():
+            if keyword in subjects:
+                topic = mapped_topic
+                break
+        # Default fiction for novels without clear subject
+        if not topic and ("novel" in subjects or not subjects.strip()):
+            topic = "Fiction"
+        if topic:
+            # Get a clean paragraph as sample
+            paragraphs = re.split(r'\n\s*\n', text)
+            for para in paragraphs[5:]:  # Skip front matter
+                para = para.strip()
+                if 200 < len(para) < 1500 and para.count('.') >= 2:
+                    records.append({
+                        "text": para,
+                        "topic": topic,
+                        "source": "gutenberg",
+                    })
+                    break
+    print(f"    Gutenberg topics: {len(records):,}")
+    return records
+# ============== EMOTIONS (unchanged) ==============
 def download_emotions() -> None:
     """Download GoEmotions for emotion classification."""
+    print("\n😊 Downloading Emotions (GoEmotions)...")
     out_dir = OUTPUT_DIR / "emotion"
     ds = load_dataset("google-research-datasets/go_emotions", "simplified")
     print(f"  ✓ {len(EMOTION_LABELS)} emotion labels saved")
+# ============== GUTENBERG BOOKS (for language modeling) ==============
+GUTENBERG_JUNK_PATTERNS = [
+    r"Project Gutenberg", r"www\.gutenberg\.org", r"This ebook is for",
+    r"Gutenberg License", r"^\*\*\* START OF", r"^\*\*\* END OF",
+    r"Produced by", r"Transcriber's Note", r"TABLE OF CONTENTS",
+    r"^\s*CHAPTER\s+[IVXLC\d]+", r"^\s*Chapter\s+[IVXLC\d]+",
+    r"^\s*BOOK\s+[IVXLC\d]+", r"^\s*PREFACE\s*$", r"^\s*INTRODUCTION\s*$",
+    r"E-text prepared by", r"Internet Archive", r"Distributed Proofreaders",
+]
+GUTENBERG_JUNK_REGEX = re.compile("|".join(GUTENBERG_JUNK_PATTERNS), re.IGNORECASE)
+def is_clean_prose(text: str) -> bool:
+    """Check if text is clean literary prose."""
+    if len(text) < 300 or len(text) > 3000:
+        return False
+    if GUTENBERG_JUNK_REGEX.search(text):
+        return False
+    if text.count('.') < 2:
+        return False
+    uppercase_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)
+    if uppercase_ratio > 0.3:
+        return False
+    digit_ratio = sum(1 for c in text if c.isdigit()) / max(len(text), 1)
+    if digit_ratio > 0.1:
+        return False
+    return True
+def download_gutenberg(max_samples: int = 30000) -> None:
+    """Download Gutenberg books for language modeling."""
+    print("\n📚 Downloading Gutenberg Books...")
+    out_dir = OUTPUT_DIR / "books"
+    out_dir.mkdir(parents=True, exist_ok=True)
+    try:
+        gutenberg = load_dataset("sedthh/gutenberg_english", split="train")
+    except Exception:
+        gutenberg = load_dataset("pg19", split="train")
+    records: list[dict[str, Any]] = []
+    indices = list(range(len(gutenberg)))
+    random.shuffle(indices)
+    for i in tqdm(indices, desc="Books", leave=False):
+        if len(records) >= max_samples:
+            break
+        item = gutenberg[i]
+        text = item.get("TEXT", "") or item.get("text", "")
+        metadata = item.get("METADATA", {}) or {}
+        title = metadata.get("title", "") if isinstance(metadata, dict) else ""
+        if not title:
+            title = item.get("title", f"Book_{i}")
+        if not text or len(text) < 1000:
+            continue
+        paragraphs = re.split(r'\n\s*\n', text)
+        for para in paragraphs:
+            para = para.strip()
+            if is_clean_prose(para):
+                records.append({"text": para, "title": title, "type": "gutenberg"})
+                if len(records) >= max_samples:
+                    break
+    random.shuffle(records)
+    n = len(records)
+    write_jsonl(records[:int(n*0.9)], out_dir / "train.jsonl", "train")
+    write_jsonl(records[int(n*0.9):int(n*0.95)], out_dir / "validation.jsonl", "val")
+    write_jsonl(records[int(n*0.95):], out_dir / "test.jsonl", "test")
+# ============== MAIN ==============
 def main() -> None:
     parser = argparse.ArgumentParser(description="Download LexiMind datasets")
     parser.add_argument(
+        "--task",
         choices=["all", "summarization", "emotion", "topic", "gutenberg"],
+        default="all",
         help="Dataset to download"
     )
+    parser.add_argument("--max-books", type=int, default=40000, help="Max BookSum samples")
+    parser.add_argument("--max-arxiv", type=int, default=50000, help="Max arXiv samples")
+    parser.add_argument("--max-gutenberg", type=int, default=30000, help="Max Gutenberg chunks")
+    parser.add_argument("--max-topics", type=int, default=50000, help="Max topic samples")
     parser.add_argument("--seed", type=int, default=42, help="Random seed")
     args = parser.parse_args()
     print("=" * 60)
     print("LexiMind Dataset Download")
+    print("Books + Academic Papers + Topic Classification")
     print("=" * 60)
     if args.task in ["all", "summarization"]:
+        download_summarization(args.max_books, args.max_arxiv)
     if args.task in ["all", "emotion"]:
         download_emotions()
     if args.task in ["all", "topic"]:
         download_topics(args.max_topics)
+    if args.task in ["all", "gutenberg"]:
+        download_gutenberg(args.max_gutenberg)
     print("\n" + "=" * 60)
     print("✅ Download complete!")

scripts/train.py CHANGED Viewed

@@ -3,9 +3,9 @@
 Training script for LexiMind.
 Simple, clean training with multi-task learning across:
-- Summarization (CNN/DailyMail + BookSum)
 - Emotion classification (GoEmotions, 28 labels)
-- Topic classification (AG News, 4 labels)
 Usage:
     python scripts/train.py training=medium
@@ -89,11 +89,17 @@ def main(cfg: DictConfig) -> None:
     device = torch.device(cfg.device)
     # GPU optimizations for Ampere+
-    if device.type == "cuda" and torch.cuda.get_device_capability()[0] >= 8:
-        torch.set_float32_matmul_precision("high")
-        torch.backends.cuda.matmul.allow_tf32 = True
-        torch.backends.cudnn.allow_tf32 = True
-        print("✓ TF32 enabled for Ampere GPU")
     # --------------- Load Data ---------------
@@ -187,6 +193,11 @@ def main(cfg: DictConfig) -> None:
     # --------------- Model ---------------
     print("\nBuilding model...")
     model_cfg = ModelConfig(
         d_model=cfg.model.d_model,
         vocab_size=getattr(cfg.model, "vocab_size", None),
@@ -198,9 +209,15 @@ def main(cfg: DictConfig) -> None:
         use_pretrained=cfg.model.use_pretrained,
         pretrained_model_name=cfg.model.pretrained_model_name,
         activation=getattr(cfg.model, "activation", "gelu"),
-        use_relative_position_bias=getattr(cfg.model, "use_relative_position_bias", False),
     )
     model = build_multitask_model(
         tokenizer,
         num_emotions=len(emot_train.emotion_classes),
@@ -211,6 +228,26 @@ def main(cfg: DictConfig) -> None:
     param_count = sum(p.numel() for p in model.parameters())
     print(f"  Parameters: {param_count:,} ({param_count/1e6:.1f}M)")
     # Resume from checkpoint?
     start_epoch = 1
     resume_path = cfg.get("resume_from")
@@ -223,12 +260,15 @@ def main(cfg: DictConfig) -> None:
             start_epoch = int(digits[-1]) + 1
     # Compile model for speed
     if cfg.training.get("compile_encoder", True):
-        model.encoder = torch.compile(model.encoder, backend="inductor")  # type: ignore[assignment]
-        print("  ✓ Encoder compiled")
     if cfg.training.get("compile_decoder", True):
-        model.decoder = torch.compile(model.decoder, backend="inductor")  # type: ignore[assignment]
-        print("  ✓ Decoder compiled")
     # --------------- Train ---------------
@@ -236,11 +276,16 @@ def main(cfg: DictConfig) -> None:
     opt_cfg = cfg.training.get("optimizer", {})
     sched_cfg = cfg.training.get("scheduler", {})
     optimizer = torch.optim.AdamW(
         model.parameters(),
         lr=float(opt_cfg.get("lr", 3e-5)),
         weight_decay=float(opt_cfg.get("weight_decay", 0.01)),
     )
     trainer = Trainer(
         model=model,

 Training script for LexiMind.
 Simple, clean training with multi-task learning across:
+- Summarization (BookSum + arXiv papers)
 - Emotion classification (GoEmotions, 28 labels)
+- Topic classification (Books + Papers, 8 labels: Fiction, Science, Technology, etc.)
 Usage:
     python scripts/train.py training=medium
     device = torch.device(cfg.device)
     # GPU optimizations for Ampere+
+    if device.type == "cuda":
+        # Enable cudnn benchmark for fixed-size inputs (10-20% speedup)
+        torch.backends.cudnn.benchmark = True
+        if torch.cuda.get_device_capability()[0] >= 8:
+            torch.set_float32_matmul_precision("high")
+            torch.backends.cuda.matmul.allow_tf32 = True
+            torch.backends.cudnn.allow_tf32 = True
+            print("✓ TF32 + cudnn.benchmark enabled for Ampere GPU")
+        else:
+            print("✓ cudnn.benchmark enabled")
     # --------------- Load Data ---------------
     # --------------- Model ---------------
     print("\nBuilding model...")
+    # Check for overrides in training config
+    grad_ckpt = cfg.training.get("gradient_checkpointing", cfg.model.get("gradient_checkpointing", False))
+    use_rel_pos = cfg.training.get("use_relative_position_bias", cfg.model.get("use_relative_position_bias", False))
     model_cfg = ModelConfig(
         d_model=cfg.model.d_model,
         vocab_size=getattr(cfg.model, "vocab_size", None),
         use_pretrained=cfg.model.use_pretrained,
         pretrained_model_name=cfg.model.pretrained_model_name,
         activation=getattr(cfg.model, "activation", "gelu"),
+        use_relative_position_bias=use_rel_pos,
+        gradient_checkpointing=grad_ckpt,
     )
+    if grad_ckpt:
+        print("  ✓ Gradient checkpointing enabled")
+    if not use_rel_pos:
+        print("  ✓ FlashAttention enabled (no relative position bias)")
     model = build_multitask_model(
         tokenizer,
         num_emotions=len(emot_train.emotion_classes),
     param_count = sum(p.numel() for p in model.parameters())
     print(f"  Parameters: {param_count:,} ({param_count/1e6:.1f}M)")
+    # Freeze lower encoder layers (keeps pretrained language understanding, adapts upper layers)
+    freeze_layers = cfg.training.get("freeze_encoder_layers", 0)
+    if freeze_layers > 0:
+        frozen_params = 0
+        # Freeze embedding layer
+        if hasattr(model.encoder, 'embed_tokens'):
+            for p in model.encoder.embed_tokens.parameters():
+                p.requires_grad = False
+                frozen_params += p.numel()
+        # Freeze specified number of encoder layers
+        if hasattr(model.encoder, 'layers'):
+            for i, layer in enumerate(model.encoder.layers):
+                if i < freeze_layers:
+                    for p in layer.parameters():
+                        p.requires_grad = False
+                        frozen_params += p.numel()
+        trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
+        print(f"  ✓ Frozen encoder layers 0-{freeze_layers-1} ({frozen_params/1e6:.1f}M params)")
+        print(f"  Trainable: {trainable:,} ({trainable/1e6:.1f}M)")
     # Resume from checkpoint?
     start_epoch = 1
     resume_path = cfg.get("resume_from")
             start_epoch = int(digits[-1]) + 1
     # Compile model for speed
+    # Note: "reduce-overhead" mode uses CUDA graphs which conflicts with gradient checkpointing
+    # Use "default" mode when checkpointing is enabled
+    compile_mode = "default" if grad_ckpt else "reduce-overhead"
     if cfg.training.get("compile_encoder", True):
+        model.encoder = torch.compile(model.encoder, mode=compile_mode)  # type: ignore[assignment]
+        print(f"  ✓ Encoder compiled ({compile_mode})")
     if cfg.training.get("compile_decoder", True):
+        model.decoder = torch.compile(model.decoder, mode=compile_mode)  # type: ignore[assignment]
+        print(f"  ✓ Decoder compiled ({compile_mode})")
     # --------------- Train ---------------
     opt_cfg = cfg.training.get("optimizer", {})
     sched_cfg = cfg.training.get("scheduler", {})
+    # Use fused AdamW on CUDA for ~5-10% speedup
+    use_fused = device.type == "cuda" and "fused" in torch.optim.AdamW.__init__.__code__.co_varnames
     optimizer = torch.optim.AdamW(
         model.parameters(),
         lr=float(opt_cfg.get("lr", 3e-5)),
         weight_decay=float(opt_cfg.get("weight_decay", 0.01)),
+        fused=use_fused,
     )
+    if use_fused:
+        print("  ✓ Fused AdamW optimizer")
     trainer = Trainer(
         model=model,

src/inference/pipeline.py CHANGED Viewed

@@ -68,6 +68,7 @@ class InferenceConfig:
     summary_max_length: int = 128
     summary_repetition_penalty: float = 1.2  # Penalize repeated tokens
     summary_formatting: bool = True  # Apply text cleanup/formatting to generated summaries
     emotion_threshold: float = 0.5
     device: str | None = None
@@ -157,6 +158,7 @@ class InferencePipeline:
                 ban_token_ids=[i for i in ban_ids if i is not None],
                 no_repeat_ngram_size=3,
                 repetition_penalty=self.config.summary_repetition_penalty,
                 memory_mask=src_mask,
             )

     summary_max_length: int = 128
     summary_repetition_penalty: float = 1.2  # Penalize repeated tokens
+    summary_length_penalty: float = 1.5  # Encourage EOS token as length increases (>1 = shorter)
     summary_formatting: bool = True  # Apply text cleanup/formatting to generated summaries
     emotion_threshold: float = 0.5
     device: str | None = None
                 ban_token_ids=[i for i in ban_ids if i is not None],
                 no_repeat_ngram_size=3,
                 repetition_penalty=self.config.summary_repetition_penalty,
+                length_penalty=self.config.summary_length_penalty,
                 memory_mask=src_mask,
             )

src/models/decoder.py CHANGED Viewed

@@ -445,10 +445,15 @@ class TransformerDecoder(nn.Module):
         ban_token_ids: Optional[List[int]] = None,
         no_repeat_ngram_size: int = 0,
         repetition_penalty: float = 1.0,
         memory_mask: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """
         Greedy decoding with KV caching for O(N) complexity.
         """
         if device is None:
             device = memory.device
@@ -519,6 +524,13 @@ class TransformerDecoder(nn.Module):
                     if banned_for_this_batch:
                         next_step_logits[b, list(banned_for_this_batch)] = float("-inf")
             # Greedy selection
             next_token = next_step_logits.argmax(dim=-1, keepdim=True)  # (B, 1)

         ban_token_ids: Optional[List[int]] = None,
         no_repeat_ngram_size: int = 0,
         repetition_penalty: float = 1.0,
+        length_penalty: float = 1.0,
         memory_mask: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """
         Greedy decoding with KV caching for O(N) complexity.
+        Args:
+            length_penalty: Values > 1.0 encourage shorter sequences by boosting EOS probability
+                           as sequence length increases. Default 1.0 (no penalty).
         """
         if device is None:
             device = memory.device
                     if banned_for_this_batch:
                         next_step_logits[b, list(banned_for_this_batch)] = float("-inf")
+            # Length penalty to boost EOS probability as sequence grows (encourages shorter outputs)
+            if length_penalty != 1.0 and end_token_id is not None and generated.size(1) >= min_len:
+                # Scale EOS logit based on current length relative to max
+                length_ratio = generated.size(1) / max_len
+                eos_boost = length_penalty * length_ratio  # Grows as we approach max_len
+                next_step_logits[:, end_token_id] = next_step_logits[:, end_token_id] + eos_boost
             # Greedy selection
             next_token = next_step_logits.argmax(dim=-1, keepdim=True)  # (B, 1)

src/training/trainer.py CHANGED Viewed

@@ -369,17 +369,19 @@ class Trainer:
                 if src_mask is not None:
                     src_mask = src_mask[:1]
-                # Generate
                 model: Any = self.model
                 enc_mask = src_mask.unsqueeze(1) & src_mask.unsqueeze(2) if src_mask is not None else None
                 memory = model.encoder(src_ids, mask=enc_mask)
-                generated = model.decoder.greedy_decode_naive(
                     memory=memory,
                     max_len=self.config.validation_max_length,
                     start_token_id=self.tokenizer.bos_token_id,
                     end_token_id=self.tokenizer.eos_token_id,
                     device=self.device,
                     memory_mask=src_mask,
                 )
                 src = self.tokenizer.decode(src_ids[0].tolist())

                 if src_mask is not None:
                     src_mask = src_mask[:1]
+                # Generate with anti-repetition
                 model: Any = self.model
                 enc_mask = src_mask.unsqueeze(1) & src_mask.unsqueeze(2) if src_mask is not None else None
                 memory = model.encoder(src_ids, mask=enc_mask)
+                generated = model.decoder.greedy_decode(
                     memory=memory,
                     max_len=self.config.validation_max_length,
                     start_token_id=self.tokenizer.bos_token_id,
                     end_token_id=self.tokenizer.eos_token_id,
                     device=self.device,
                     memory_mask=src_mask,
+                    no_repeat_ngram_size=3,
+                    repetition_penalty=1.2,
                 )
                 src = self.tokenizer.decode(src_ids[0].tolist())