File size: 2,032 Bytes
14d6cbc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import gradio as gr
import os
import json
import pyarrow.parquet as pq
from datasets import load_dataset

OUTPUT_FILE = "wiki40b_es_train.jsonl"

def convert_parquet_to_jsonl(progress=gr.Progress()):
    """Export the Spanish ``google/wiki40b`` train split to a JSONL file.

    The dataset is loaded with ``load_dataset`` (non-streaming), written to a
    temporary Parquet file, and then read back in record batches so that each
    row is emitted as one JSON object per line in ``OUTPUT_FILE``.

    Args:
        progress: Gradio progress tracker; called with a fraction in
            ``[0, 1]`` and a ``desc`` label at each stage.

    Returns:
        The path of the JSONL file that was written (``OUTPUT_FILE``).

    Raises:
        Whatever ``load_dataset``, the Parquet export/read, or the file
        write raises. The temporary Parquet file is removed in that case too.
    """
    batch_size = 1000
    parquet_path = "temp.parquet"

    progress(0, desc="Cargando dataset google/wiki40b (es)...")

    # NOTE: this loads the complete "train" split of the "es" config,
    # not an individual shard.
    dataset = load_dataset(
        "google/wiki40b",
        "es",
        split="train",
        streaming=False
    )

    progress(0.2, desc="Exportando dataset a Parquet temporal...")

    try:
        dataset.to_parquet(parquet_path)

        progress(0.4, desc="Convirtiendo Parquet a JSONL...")

        parquet_file = pq.ParquetFile(parquet_path)

        # Progress is measured in rows, not row groups: iter_batches() yields
        # batches of up to `batch_size` rows, which is unrelated to the number
        # of row groups stored in the file.
        total_rows = parquet_file.metadata.num_rows
        # Ceiling division; used only for the human-readable label.
        total_batches = max(1, -(-total_rows // batch_size))
        rows_written = 0

        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            batches = parquet_file.iter_batches(batch_size=batch_size)
            for batch_number, batch in enumerate(batches, start=1):
                batch_dict = batch.to_pydict()
                keys = list(batch_dict.keys())

                # One write call per batch instead of one per row.
                f.writelines(
                    json.dumps(dict(zip(keys, row)), ensure_ascii=False) + "\n"
                    for row in zip(*batch_dict.values())
                )

                rows_written += batch.num_rows
                fraction = rows_written / total_rows if total_rows else 1.0
                # Batches may come out smaller than `batch_size`, so the
                # estimated total is never shown below the current batch.
                shown_total = max(total_batches, batch_number)
                progress(0.4 + 0.6 * min(fraction, 1.0),
                         desc=f"Procesando lote {batch_number}/{shown_total}")
    finally:
        # Always clean up the intermediate file, even when a step fails.
        if os.path.exists(parquet_path):
            os.remove(parquet_path)

    progress(1.0, desc="Conversión completada ✅")

    return OUTPUT_FILE


# Introductory text rendered at the top of the page.
# NOTE(review): this text names a single shard file, while
# convert_parquet_to_jsonl loads the whole "train" split — confirm which
# one is intended.
_INTRO_MARKDOWN = """
        # 🧠 Wiki40B (ES) → JSONL

        Convierte el archivo  
        `es/train-00000-of-00006.parquet`  
        del dataset **google/wiki40b** a **JSONL**.

        👉 Cuando termine, podrás **descargar el archivo a tu PC**.
        """

# Build the UI: a description, one trigger button, and a file component
# that receives the path returned by the conversion callback.
with gr.Blocks(title="Wiki40B ES → JSONL Converter") as app:
    gr.Markdown(value=_INTRO_MARKDOWN)

    convert_btn = gr.Button(value="🚀 Convertir a JSONL")
    output_file = gr.File(label="📥 Descargar JSONL")

    # The callback takes no UI inputs; its return value feeds the file widget.
    convert_btn.click(fn=convert_parquet_to_jsonl, inputs=None, outputs=output_file)

# Start the Gradio server.
app.launch()