Spaces:
Build error
Build error
| import json | |
| import os | |
| import time | |
| from deep_translator import GoogleTranslator | |
# Shared translator instance, configured for English -> Indonesian.
translator = GoogleTranslator(source='en', target='id')
def translate_text(text):
    """Translate ``text`` using the module-level ``translator``.

    Falsy values and strings shorter than two characters after stripping
    whitespace are returned unchanged without calling the translator. Any
    exception raised while translating is reported on stdout and the
    original text is returned, so one failure does not abort a long run.

    Args:
        text: The string to translate.

    Returns:
        The translated string, or ``text`` itself when it was skipped, when
        translation failed, or when the translator produced no result.
    """
    try:
        # Nothing worth sending to the translation service.
        if not text or len(text.strip()) < 2:
            return text
        translated = translator.translate(text)
        # Pause after each request to avoid being rate limited.
        time.sleep(0.5)
    except Exception as e:
        print(f"Error saat menerjemahkan: {e}")
        return text
    # The translator may return None instead of a string for some inputs;
    # keep the original text so the output never swaps a string for null.
    if translated is None:
        return text
    return translated
def translate_recursive(data):
    """Return a copy of ``data`` with every string value translated.

    Dicts and lists are rebuilt recursively (dict keys are kept as they
    are), strings go through ``translate_text``, and every other value is
    returned unchanged.
    """
    if isinstance(data, str):
        return translate_text(data)
    if isinstance(data, list):
        return list(map(translate_recursive, data))
    if isinstance(data, dict):
        rebuilt = {}
        for name, child in data.items():
            rebuilt[name] = translate_recursive(child)
        return rebuilt
    # Numbers, booleans, None, ... pass through untouched.
    return data
def _iter_records(filepath):
    """Yield ``(label, record)`` for each record in one .jsonl/.json file.

    ``label`` is the progress prefix for the record ("Baris N" for a JSONL
    line, "Item N" for an element of a JSON list). Unsupported or malformed
    input is reported on stdout and skipped instead of raising.
    """
    # Compare case-insensitively so ".JSON" / ".JSONL" are accepted too.
    ext = os.path.splitext(filepath)[1].lower()
    if ext not in ('.jsonl', '.json'):
        print(f"[!] Ekstensi {ext!r} pada {filepath} tidak didukung, melewati.")
        return

    with open(filepath, 'r', encoding='utf-8') as infile:
        if ext == '.jsonl':
            # Stream the file line by line; line_num is the physical line
            # number, so blank lines still count.
            for line_num, line in enumerate(infile, 1):
                if not line.strip():
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError as err:
                    # One bad line must not abort the whole merge.
                    print(f" [!] Baris {line_num} bukan JSON yang valid, dilewati: {err}")
                    continue
                yield f"Baris {line_num}", record
            return

        try:
            data_list = json.load(infile)
        except json.JSONDecodeError as err:
            print(f"[!] {filepath} bukan JSON yang valid, melewati: {err}")
            return

    if not isinstance(data_list, list):
        print(f"Format .json pada {filepath} tidak didukung. Harus berupa list.")
        return
    for i, data in enumerate(data_list, 1):
        yield f"Item {i}", data


def process_and_merge(filepaths, output_filepath):
    """Translate several dataset files and merge them into one JSONL file.

    Each input is read as JSONL (one JSON value per line) or as a JSON file
    holding a list. Every record is passed through ``translate_recursive``
    and written to ``output_filepath`` as one JSON line. Missing files,
    unsupported extensions, non-list JSON files and malformed JSON are
    reported on stdout and skipped.

    Args:
        filepaths: Iterable of input file paths (``.jsonl`` or ``.json``).
        output_filepath: Path of the merged JSONL file; it is opened in
            write mode, so existing content is replaced.
    """
    print(f"Memulai proses... Hasil akhir akan disimpan di: {output_filepath}")
    with open(output_filepath, 'w', encoding='utf-8') as outfile:
        for filepath in filepaths:
            if not os.path.exists(filepath):
                print(f"\n[!] File tidak ditemukan, melewati: {filepath}")
                continue
            print(f"\n[+] Memproses file: {filepath}")
            for label, record in _iter_records(filepath):
                translated_data = translate_recursive(record)
                # Write each record immediately so finished work is kept on disk.
                outfile.write(json.dumps(translated_data, ensure_ascii=False) + '\n')
                print(f" -> {label} selesai diterjemahkan & digabung")
    print(f"\n✅ PROSES SELESAI! Semua dataset berhasil diterjemahkan dan digabung menjadi {output_filepath}")
# 1. Source files to translate, located in the 'dataset' folder.
files_to_translate = [
    "dataset/broad-general-dataset.jsonl",
    "dataset/casual-conversation-poo.json",
    "dataset/dataset.jsonl"
]
# 2. Name of the merged, translated output file.
output_file = "dataset/master_dataset_id.jsonl"
# 3. Run the program.
# NOTE(review): this call is at module level with no `if __name__ == "__main__":`
# guard, so it also runs whenever this file is imported.
process_and_merge(files_to_translate, output_file)