Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """Export a small subset of bright_corpus as default demo corpus for HF Spaces.""" | |
| import os | |
| import shutil | |
| from pathlib import Path | |
| import pandas as pd | |
def _topic_from_id(doc_id) -> str:
    """Return the part of *doc_id* before the first "/", or "unknown" if it has none."""
    topic, sep, _ = str(doc_id).partition("/")
    return topic if sep else "unknown"


def export_domain(
    domain: str,
    out_dir: Path,
    topics_per_domain: int = 5,
    files_per_topic: int = 3,
    max_chars: int = 1000,
) -> None:
    """Export a subset of one domain's parquet to text files.

    Reads ``corpus/bright_corpus/<domain>/data.parquet`` (relative to the
    current working directory), groups rows by the prefix of their ``id``
    before the first "/", and writes a few rows per selected topic to
    ``<out_dir>/<domain>/<id with slashes replaced by "_">.txt``.

    Args:
        domain: Sub-directory name of the domain under ``corpus/bright_corpus``.
        out_dir: Root directory that receives one sub-directory per domain.
        topics_per_domain: Maximum number of topics to export. Topics are
            sampled at a regular stride over the order in which they first
            appear; if the domain has fewer topics, all of them are exported.
        files_per_topic: Number of leading rows exported for each topic.
        max_chars: Contents longer than this are cut to this many characters
            and suffixed with a "[truncated]" marker.

    The parquet file must provide ``id`` and ``content`` columns. A missing
    parquet file is reported with a warning and nothing is written.
    """
    parquet_path = Path("corpus") / "bright_corpus" / domain / "data.parquet"
    if not parquet_path.exists():
        print(f"WARNING: {parquet_path} not found, skipping {domain}")
        return

    df = pd.read_parquet(parquet_path)
    df["topic"] = df["id"].apply(_topic_from_id)

    # Pick representative topics spread across the list. Slicing (rather than
    # indexing i * step) cannot run past the end when the domain has fewer
    # topics than requested, and a non-positive request selects nothing
    # instead of dividing by zero.
    all_topics = df["topic"].unique().tolist()
    wanted = max(0, topics_per_domain)
    step = max(1, len(all_topics) // max(1, wanted))
    selected_topics = all_topics[::step][:wanted]

    domain_out = out_dir / domain
    domain_out.mkdir(parents=True, exist_ok=True)

    total_chars = 0
    total_files = 0
    for topic in selected_topics:
        # First N rows of this topic, in file order.
        subset = df[df["topic"] == topic].head(files_per_topic)
        for file_id, raw in zip(subset["id"], subset["content"]):
            content = str(raw) if pd.notna(raw) else ""
            if len(content) > max_chars:
                content = content[:max_chars] + "\n...[truncated]\n"

            # Flatten the id into a single file name: path separators would
            # otherwise create (or escape into) nested directories.
            safe_name = str(file_id).replace("/", "_").replace("\\", "_")
            out_file = domain_out / f"{safe_name}.txt"
            out_file.write_text(content, encoding="utf-8")

            total_chars += len(content)
            total_files += 1

    print(f" {domain}: {total_files} files, ~{total_chars} chars (~{total_chars/1024:.1f} KB)")
def main():
    """Rebuild ``web-app/default_corpus`` from scratch and print a size summary."""
    target = Path("web-app/default_corpus")

    # Wipe any previous export so stale sample files never survive a re-run.
    if target.exists():
        shutil.rmtree(target)
    target.mkdir(parents=True, exist_ok=True)

    print("Exporting default corpus samples...")
    for name in ("biology", "earth_science", "economics", "robotics"):
        export_domain(name, target, topics_per_domain=5, files_per_topic=3)

    # Summary: measure what actually landed on disk (bytes, not characters).
    exported = list(target.rglob("*.txt"))
    byte_count = 0
    for path in exported:
        byte_count += path.stat().st_size
    print(f"\nTotal: {len(exported)} files, {byte_count/1024:.1f} KB")
    print(f"Output directory: {target.resolve()}")
# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()