teszenofficial committed on
Commit
14d6cbc
·
verified ·
1 Parent(s): 056b302

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -0
app.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import json
4
+ import pyarrow.parquet as pq
5
+ from datasets import load_dataset
6
+
7
+ OUTPUT_FILE = "wiki40b_es_train.jsonl"
8
+
9
def convert_parquet_to_jsonl(progress=gr.Progress()):
    """Export the Spanish ``google/wiki40b`` train split to a JSONL file.

    The dataset is loaded with ``load_dataset``, written to a temporary
    Parquet file, and then read back in batches so that every row is
    written to ``OUTPUT_FILE`` as one JSON object per line.

    Args:
        progress: Gradio progress tracker. It is called with a completion
            fraction and a ``desc`` string to update the UI.

    Returns:
        The path of the generated JSONL file (``OUTPUT_FILE``).
    """
    batch_size = 1000
    parquet_path = "temp.parquet"

    progress(0, desc="Cargando dataset google/wiki40b (es)...")

    # NOTE: this loads the complete "es" train split, not a single shard.
    dataset = load_dataset(
        "google/wiki40b",
        "es",
        split="train",
        streaming=False,
    )

    progress(0.2, desc="Exportando dataset a Parquet temporal...")

    try:
        dataset.to_parquet(parquet_path)

        progress(0.4, desc="Convirtiendo Parquet a JSONL...")

        # The context manager closes the Parquet handle before the temporary
        # file is deleted in the ``finally`` clause below.
        with pq.ParquetFile(parquet_path) as parquet_file, open(
            OUTPUT_FILE, "w", encoding="utf-8"
        ) as f:
            # Progress is measured in rows: the number of batches yielded by
            # ``iter_batches`` is unrelated to the number of row groups, so
            # using ``num_row_groups`` as the total pushed the fraction far
            # beyond 1.0.
            total_rows = parquet_file.metadata.num_rows
            rows_done = 0

            for i, batch in enumerate(
                parquet_file.iter_batches(batch_size=batch_size)
            ):
                batch_dict = batch.to_pydict()
                keys = list(batch_dict)

                # Transpose the column-oriented batch into row records.
                f.writelines(
                    json.dumps(dict(zip(keys, row)), ensure_ascii=False) + "\n"
                    for row in zip(*batch_dict.values())
                )

                rows_done += batch.num_rows
                fraction = rows_done / total_rows if total_rows else 1.0
                progress(
                    0.4 + 0.6 * min(fraction, 1.0),
                    desc=f"Procesando lote {i+1} ({rows_done}/{total_rows} filas)",
                )
    finally:
        # Always remove the temporary file, even when the export or the
        # conversion fails part-way through.
        if os.path.exists(parquet_path):
            os.remove(parquet_path)

    progress(1.0, desc="Conversión completada ✅")

    return OUTPUT_FILE
50
+
51
+
52
# Build the single-page UI: an intro text, a trigger button, and a file
# component that receives the path returned by the conversion callback.
with gr.Blocks(title="Wiki40B ES → JSONL Converter") as app:
    # NOTE(review): the text below names a single shard
    # (train-00000-of-00006), while the callback loads split="train" as a
    # whole - confirm which behaviour is intended.
    gr.Markdown(
        value="""
        # 🧠 Wiki40B (ES) → JSONL

        Convierte el archivo
        `es/train-00000-of-00006.parquet`
        del dataset **google/wiki40b** a **JSONL**.

        👉 Cuando termine, podrás **descargar el archivo a tu PC**.
        """
    )

    convert_btn = gr.Button(value="🚀 Convertir a JSONL")
    output_file = gr.File(label="📥 Descargar JSONL")

    # The callback takes no UI inputs; its return value (a file path) is
    # routed to the download component.
    convert_btn.click(
        convert_parquet_to_jsonl,
        inputs=None,
        outputs=output_file,
    )

# Start the Gradio server as soon as the module is executed.
app.launch()