#!/usr/bin/env python3
"""Export a small subset of bright_corpus as default demo corpus for HF Spaces."""

import os
import shutil
from pathlib import Path

import pandas as pd

# Maximum number of characters kept from a document before the truncation
# marker is appended.
MAX_CONTENT_CHARS = 1000


def pick_topics(all_topics: list, count: int) -> list:
    """Return up to ``count`` topics spread evenly across ``all_topics``.

    Every ``len(all_topics) // count``-th topic is taken (at least every
    first), starting with the first one. When fewer than ``count`` topics
    are available, all of them are returned instead of indexing past the
    end of the list.

    Args:
        all_topics: Candidate topics, in the order they should be sampled.
        count: Maximum number of topics to return.

    Returns:
        A new list with at most ``count`` topics; empty when ``count`` is
        not positive or ``all_topics`` is empty.
    """
    if count <= 0 or not all_topics:
        return []
    step = max(1, len(all_topics) // count)
    # Slicing is bounds-safe, unlike indexing ``all_topics[i * step]``.
    return all_topics[::step][:count]


def _topic_of(doc_id) -> str:
    """Return the part of ``doc_id`` before the first "/", or "unknown"."""
    text = str(doc_id)
    return text.split("/", 1)[0] if "/" in text else "unknown"


def export_domain(
    domain: str,
    out_dir: Path,
    topics_per_domain: int = 5,
    files_per_topic: int = 3,
) -> None:
    """Export a subset of one domain's parquet to text files.

    Reads ``corpus/bright_corpus/<domain>/data.parquet``, derives a topic
    from each row's ``id`` (the text before the first "/"), picks up to
    ``topics_per_domain`` topics spread across the distinct topics, and
    writes the first ``files_per_topic`` rows of each picked topic to
    ``<out_dir>/<domain>/<id with slashes replaced by "_">.txt``. Content
    longer than ``MAX_CONTENT_CHARS`` characters is cut and suffixed with a
    truncation marker. A one-line summary is printed at the end.

    Args:
        domain: Name of the domain sub-directory to export.
        out_dir: Root directory that receives one sub-directory per domain.
        topics_per_domain: Maximum number of topics to export.
        files_per_topic: Maximum number of documents to export per topic.

    Returns:
        None. If the parquet file does not exist, a warning is printed and
        nothing is written.
    """
    parquet_path = Path("corpus") / "bright_corpus" / domain / "data.parquet"
    if not parquet_path.exists():
        print(f"WARNING: {parquet_path} not found, skipping {domain}")
        return

    df = pd.read_parquet(parquet_path)
    df["topic"] = df["id"].apply(_topic_of)

    # Pick representative topics (spread across the list).
    selected_topics = pick_topics(df["topic"].unique().tolist(), topics_per_domain)

    domain_out = out_dir / domain
    domain_out.mkdir(parents=True, exist_ok=True)

    total_chars = 0
    total_files = 0

    for topic in selected_topics:
        # Pick the first N files for this topic.
        subset = df[df["topic"] == topic].head(files_per_topic)
        for file_id, raw_content in zip(subset["id"], subset["content"]):
            content = str(raw_content) if pd.notna(raw_content) else ""

            # Truncate to ~1KB if needed.
            if len(content) > MAX_CONTENT_CHARS:
                content = content[:MAX_CONTENT_CHARS] + "\n...[truncated]\n"

            # Flatten the id into a single file name: domain/topic_filename.txt
            # (str() guards against ids that are not stored as strings).
            safe_name = str(file_id).replace("/", "_").replace("\\", "_")
            out_file = domain_out / f"{safe_name}.txt"
            out_file.write_text(content, encoding="utf-8")

            total_chars += len(content)
            total_files += 1

    print(f" {domain}: {total_files} files, ~{total_chars} chars (~{total_chars/1024:.1f} KB)")


def main() -> None:
    """Rebuild ``web-app/default_corpus`` from scratch and print a summary.

    Any existing output directory is deleted first, then each configured
    domain is exported and the total number and size of the written
    ``.txt`` files is reported.
    """
    out_dir = Path("web-app/default_corpus")
    if out_dir.exists():
        # Start from a clean directory so stale samples never linger.
        shutil.rmtree(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    domains = ["biology", "earth_science", "economics", "robotics"]

    print("Exporting default corpus samples...")
    for domain in domains:
        export_domain(domain, out_dir, topics_per_domain=5, files_per_topic=3)

    # Summary
    all_files = list(out_dir.rglob("*.txt"))
    total_size = sum(f.stat().st_size for f in all_files)
    print(f"\nTotal: {len(all_files)} files, {total_size/1024:.1f} KB")
    print(f"Output directory: {out_dir.resolve()}")


if __name__ == "__main__":
    main()