import gradio as gr
import os
import json
import pyarrow.parquet as pq
from datasets import load_dataset

OUTPUT_FILE = "wiki40b_es_train.jsonl"


def convert_parquet_to_jsonl(progress=gr.Progress()):
    """Download the Spanish split of google/wiki40b and convert it to JSONL.

    Downloads the full ``es``/``train`` split, exports it to a temporary
    Parquet file, then streams it batch-by-batch into ``OUTPUT_FILE`` as
    one JSON object per line.

    Returns:
        str: Path of the generated JSONL file (consumed by the gr.File output).
    """
    progress(0, desc="Cargando dataset google/wiki40b (es)...")

    # Load only the split we need (non-streaming so it can be re-exported).
    dataset = load_dataset(
        "google/wiki40b",
        "es",
        split="train",
        streaming=False,
    )

    # Hugging Face stores the data internally as parquet, but we force a
    # clean export so pyarrow can read it back batch by batch below.
    progress(0.2, desc="Exportando dataset a Parquet temporal...")
    parquet_path = "temp.parquet"
    dataset.to_parquet(parquet_path)

    progress(0.4, desc="Convirtiendo Parquet a JSONL...")
    try:
        parquet_file = pq.ParquetFile(parquet_path)
        # BUG FIX: the original divided the batch index by num_row_groups,
        # but iter_batches(batch_size=1000) does NOT yield one batch per row
        # group, so the progress fraction was wrong (it could stall near 0.4
        # or overshoot 1.0). Tracking processed rows against the file's total
        # row count is exact regardless of batching. `or 1` guards against
        # division by zero on an empty dataset.
        total_rows = parquet_file.metadata.num_rows or 1
        rows_done = 0

        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            for i, batch in enumerate(parquet_file.iter_batches(batch_size=1000)):
                batch_dict = batch.to_pydict()
                keys = list(batch_dict.keys())
                # Transpose column-oriented data into per-row dicts.
                for row in zip(*batch_dict.values()):
                    record = dict(zip(keys, row))
                    f.write(json.dumps(record, ensure_ascii=False) + "\n")
                rows_done += batch.num_rows
                progress(
                    0.4 + 0.6 * (rows_done / total_rows),
                    desc=f"Procesando lote {i+1}",
                )
    finally:
        # Always remove the temporary parquet file, even if conversion fails,
        # so a failed run does not leave a multi-GB file behind.
        if os.path.exists(parquet_path):
            os.remove(parquet_path)

    progress(1.0, desc="Conversión completada ✅")
    return OUTPUT_FILE


with gr.Blocks(title="Wiki40B ES → JSONL Converter") as app:
    gr.Markdown(
        """
        # 🧠 Wiki40B (ES) → JSONL
        Convierte el archivo `es/train-00000-of-00006.parquet` del dataset
        **google/wiki40b** a **JSONL**.
        👉 Cuando termine, podrás **descargar el archivo a tu PC**.
        """
    )

    convert_btn = gr.Button("🚀 Convertir a JSONL")
    output_file = gr.File(label="📥 Descargar JSONL")

    convert_btn.click(
        fn=convert_parquet_to_jsonl,
        outputs=output_file,
    )

app.launch()