File size: 2,623 Bytes
fe62a13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python3
"""Export a small subset of bright_corpus as default demo corpus for HF Spaces."""

import os
import shutil
from pathlib import Path

import pandas as pd


def export_domain(
    domain: str,
    out_dir: Path,
    topics_per_domain: int = 5,
    files_per_topic: int = 3,
    max_chars: int = 1000,
) -> None:
    """Export a subset of one domain's parquet to text files.

    Reads ``corpus/bright_corpus/<domain>/data.parquet`` (relative to the
    current working directory). Each row's topic is the part of its ``id``
    before the first ``/``, or ``"unknown"`` when the id contains no ``/``.
    Up to ``topics_per_domain`` topics are picked at a regular stride over the
    order in which topics first appear, and the first ``files_per_topic`` rows
    of each picked topic are written to
    ``out_dir/<domain>/<id with slashes replaced by "_">.txt``.

    A missing parquet file is reported on stdout and skipped. A one-line
    summary of the exported files is printed at the end.

    Args:
        domain: Sub-directory name under ``corpus/bright_corpus``.
        out_dir: Directory under which the ``<domain>`` folder is created.
        topics_per_domain: Maximum number of topics to export. When the domain
            has fewer topics, all of them are exported; values <= 0 export
            nothing.
        files_per_topic: Number of leading rows exported for each topic.
        max_chars: Contents longer than this many characters are cut to this
            length and suffixed with a ``...[truncated]`` marker.
    """
    parquet_path = Path("corpus") / "bright_corpus" / domain / "data.parquet"
    if not parquet_path.exists():
        print(f"WARNING: {parquet_path} not found, skipping {domain}")
        return

    df = pd.read_parquet(parquet_path)

    # Go through str() so a non-string id cannot break the split.
    topics = []
    for doc_id in df["id"]:
        head, sep, _ = str(doc_id).partition("/")
        topics.append(head if sep else "unknown")
    df["topic"] = topics

    # Pick representative topics (spread across the list). Slicing with a
    # stride never indexes past the end, so a domain with fewer topics than
    # requested simply yields every topic instead of raising IndexError.
    all_topics = df["topic"].unique().tolist()
    if topics_per_domain > 0:
        step = max(1, len(all_topics) // topics_per_domain)
        selected_topics = all_topics[::step][:topics_per_domain]
    else:
        selected_topics = []

    domain_out = out_dir / domain
    domain_out.mkdir(parents=True, exist_ok=True)

    total_chars = 0
    total_files = 0

    for topic in selected_topics:
        # Pick first N files for this topic
        subset = df[df["topic"] == topic].head(files_per_topic)

        for file_id, raw in zip(subset["id"], subset["content"]):
            content = str(raw) if pd.notna(raw) else ""

            # Truncate overly long documents (character count, not bytes)
            if len(content) > max_chars:
                content = content[:max_chars] + "\n...[truncated]\n"

            # Flatten the id into a single file name: domain/topic_filename.txt
            safe_name = str(file_id).replace("/", "_").replace("\\", "_")
            out_file = domain_out / f"{safe_name}.txt"

            out_file.write_text(content, encoding="utf-8")
            total_chars += len(content)
            total_files += 1

    print(f"  {domain}: {total_files} files, ~{total_chars} chars (~{total_chars/1024:.1f} KB)")


def main():
    """Regenerate ``web-app/default_corpus`` and print a summary of its contents."""
    target = Path("web-app/default_corpus")

    # Remove any previous export so the directory only holds this run's files.
    if target.exists():
        shutil.rmtree(target)
    target.mkdir(parents=True, exist_ok=True)

    print("Exporting default corpus samples...")
    for name in ("biology", "earth_science", "economics", "robotics"):
        export_domain(name, target, topics_per_domain=5, files_per_topic=3)

    # Summary: count every exported text file and add up the on-disk sizes.
    exported = []
    byte_count = 0
    for txt_path in target.rglob("*.txt"):
        exported.append(txt_path)
        byte_count += txt_path.stat().st_size

    print(f"\nTotal: {len(exported)} files, {byte_count/1024:.1f} KB")
    print(f"Output directory: {target.resolve()}")


# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()