#!/usr/bin/env python3
"""Build a discovery dataset for the HuggingFace Space demo.
This script samples from the already-filtered training data (processed by
download_data.py), runs inference to generate descriptions/topics/emotions,
and uploads the result to HuggingFace Datasets.
The training data has already been filtered for:
- English content only
- Quality text (no metadata, errata, technical manuals)
- No Shakespeare/plays (excluded titles)
- Proper book descriptions (from Goodreads, not plot summaries)
"""
import json
import random
import sys
from collections import defaultdict
from pathlib import Path
from typing import Any
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))
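# (so that the `from src...` imports below resolve when the script is run directly)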
import torch
from datasets import Dataset
from tqdm import tqdm
from src.inference.factory import create_inference_pipeline


# --------------- Data Loading ---------------

def load_academic_papers(data_dir: Path, max_samples: int = 300) -> list[dict]:
"""Load academic paper samples from the training data."""
summ_file = data_dir / "summarization" / "train.jsonl"
if not summ_file.exists():
print(f" Warning: {summ_file} not found")
return []
academic = []
with open(summ_file) as f:
for line in f:
item = json.loads(line)
if item.get("type") != "academic":
continue
text = item.get("source", "")
if len(text) < 500:
continue
# Use title from data
title = item.get("title", "Research Paper")
academic.append({
"text": text[:2000],
"title": title,
"reference_summary": item.get("summary", "")[:500]
})
random.seed(42)
samples = random.sample(academic, min(max_samples, len(academic)))
results = []
for i, item in enumerate(samples):
results.append({
"id": f"paper_{i}",
"title": item["title"],
"text": item["text"],
"source_type": "academic",
"dataset": "arxiv",
"reference_summary": item["reference_summary"]
})
print(f" Loaded {len(results)} academic papers")
return results


def load_literary(data_dir: Path, max_samples: int = 300) -> list[dict]:
"""Load literary samples from the training data.
Training data now contains Goodreads descriptions (back-cover style)
instead of plot summaries.
"""
summ_file = data_dir / "summarization" / "train.jsonl"
if not summ_file.exists():
print(f" Warning: {summ_file} not found")
return []
literary = []
seen_titles = set()
with open(summ_file) as f:
for line in f:
item = json.loads(line)
if item.get("type") != "literary":
continue
title = item.get("title", "")
if not title or title in seen_titles:
continue
text = item.get("source", "")
summary = item.get("summary", "")
if len(text) < 500 or len(summary) < 50:
continue
seen_titles.add(title)
literary.append({
"text": text[:2000],
"title": title,
"reference_summary": summary[:600]
})
random.seed(42)
samples = random.sample(literary, min(max_samples, len(literary)))
results = []
for i, item in enumerate(samples):
results.append({
"id": f"literary_{i}",
"title": item["title"],
"text": item["text"],
"source_type": "literary",
"dataset": "goodreads",
"reference_summary": item["reference_summary"],
})
print(f" Loaded {len(results)} literary works (unique titles)")
return results


# --------------- Inference ---------------

def run_inference(pipeline: Any, samples: list[dict]) -> list[dict]:
"""Run model inference on all samples."""
results = []
for sample in tqdm(samples, desc="Running inference"):
text = sample["text"]
        # Get model predictions via the pipeline's batch methods (each takes a list of texts)
summaries = pipeline.summarize([text])
topics = pipeline.predict_topics([text])
emotions = pipeline.predict_emotions([text])
# Extract first result from each list
summary = summaries[0] if summaries else ""
topic = topics[0] if topics else None
emotion = emotions[0] if emotions else None
# Get primary emotion (highest confidence if any detected)
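        # (assumes the pipeline returns labels/scores sorted by descending confidence)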
primary_emotion = "neutral"
emotion_confidence = 0.0
if emotion and emotion.labels:
primary_emotion = emotion.labels[0]
emotion_confidence = emotion.scores[0]
result = {
"id": sample["id"],
"title": sample["title"],
"text": text,
"source_type": sample["source_type"],
"dataset": sample["dataset"],
"topic": topic.label if topic else "Unknown",
"topic_confidence": topic.confidence if topic else 0.0,
"emotion": primary_emotion,
"emotion_confidence": emotion_confidence,
"generated_summary": summary,
"reference_summary": sample.get("reference_summary", ""),
}
results.append(result)
# Print distribution stats
topic_dist = defaultdict(int)
emotion_dist = defaultdict(int)
for r in results:
topic_dist[r["topic"]] += 1
emotion_dist[r["emotion"]] += 1
print(f"\nTopic distribution: {dict(topic_dist)}")
print(f"Emotion distribution: {dict(emotion_dist)}")
return results


def main():
import argparse
parser = argparse.ArgumentParser(description="Build discovery dataset for HuggingFace Space")
parser.add_argument("--data-dir", type=Path, default=Path("data/processed"))
parser.add_argument("--checkpoint", type=Path, default=Path("checkpoints/best.pt"))
parser.add_argument("--num-papers", type=int, default=500, help="Number of academic papers")
parser.add_argument("--num-literary", type=int, default=500, help="Number of literary works")
parser.add_argument("--output", type=Path, default=Path("data/discovery_dataset.jsonl"))
parser.add_argument("--push-to-hub", action="store_true", help="Push to HuggingFace Hub")
parser.add_argument("--hub-repo", type=str, default="OliverPerrin/LexiMind-Discovery")
args = parser.parse_args()
print("Loading data samples from training data...")
print("(Data has already been filtered by download_data.py)")
# Load samples from training data
papers = load_academic_papers(args.data_dir, args.num_papers)
literary = load_literary(args.data_dir, args.num_literary)
all_samples = papers + literary
print(f"\nTotal samples: {len(all_samples)} ({len(papers)} papers, {len(literary)} literary)")
if not all_samples:
print("ERROR: No samples loaded! Check if data/processed exists and has data.")
print("Run: python scripts/download_data.py --task summarization")
        sys.exit(1)
# Load model and run inference
print(f"\nLoading model from {args.checkpoint}...")
labels_path = Path("artifacts/labels.json")
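    # Note: the label mapping path is fixed here rather than exposed as a CLI flag.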
pipeline, labels = create_inference_pipeline(
args.checkpoint,
labels_path,
device="cuda" if torch.cuda.is_available() else "cpu"
)
print("Running inference on all samples...")
results = run_inference(pipeline, all_samples)
# Save locally
print(f"\nSaving to {args.output}...")
args.output.parent.mkdir(parents=True, exist_ok=True)
with open(args.output, "w") as f:
for item in results:
f.write(json.dumps(item) + "\n")
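    # The JSONL written above can be reloaded locally if needed, e.g. with
    # datasets.load_dataset("json", data_files=str(args.output)).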
# Push to HuggingFace Hub
if args.push_to_hub:
print(f"\nPushing to HuggingFace Hub: {args.hub_repo}")
dataset = Dataset.from_list(results)
dataset.push_to_hub(
args.hub_repo,
private=False,
commit_message="Rebuild with Goodreads descriptions (back-cover style)"
)
print(f"Dataset available at: https://huggingface.co/datasets/{args.hub_repo}")
print("\nDone!")


if __name__ == "__main__":
main()