"""Translate the fulltext and summary of each source record with an OpenAI model.

The script reads a prompt template and a JSON list of records, sends one
request per field (``fulltext`` and ``summary``) and stores the results as a
JSON list.  Progress is checkpointed periodically, and a later run resumes
after the number of records already present in the output file.
"""
import json
import os
from pathlib import Path

import tqdm
from openai import OpenAI

# --- Configuration ---
source_language = "English"
target_language = "Bangla"
save_dir = "/home/mshahidul/readctrl/data/translated_data"
save_path = os.path.join(
    save_dir,
    f"translation_{source_language.lower()}2{target_language.lower()}_v1.json",
)

MODEL = "gpt-5"
MAX_ITEMS = 200  # only records with index < MAX_ITEMS are translated
SAVE_EVERY = 2   # checkpoint whenever the result count is a multiple of this

# NOTE(review): the placeholder tokens are empty strings in this file.
# ``str.replace("", value)`` inserts ``value`` between every character of the
# template, so an empty token produces a garbage prompt.  Set these to the
# exact tokens used inside translation_prompt.txt; the script refuses to start
# until they are filled in and found in the template.
TEXT_PLACEHOLDER = ""
SOURCE_LANGUAGE_PLACEHOLDER = ""
TARGET_LANGUAGE_PLACEHOLDER = ""

# Ensure the directory exists
Path(save_dir).mkdir(parents=True, exist_ok=True)

print(f"Translating from {source_language} to {target_language}")

# Load Prompt Template
with open(
    "/home/mshahidul/readctrl/prompts/translation_prompt.txt",
    "r",
    encoding="utf-8",
) as f:
    prompt_template = f.read()

# Fail before any API call is made if the template cannot be filled correctly.
for _name, _token in (
    ("TEXT_PLACEHOLDER", TEXT_PLACEHOLDER),
    ("SOURCE_LANGUAGE_PLACEHOLDER", SOURCE_LANGUAGE_PLACEHOLDER),
    ("TARGET_LANGUAGE_PLACEHOLDER", TARGET_LANGUAGE_PLACEHOLDER),
):
    if not _token:
        raise ValueError(
            f"{_name} is empty; set it to the token used in the prompt "
            "template (replacing an empty token inserts the value between "
            "every character of the template)."
        )
    if _token not in prompt_template:
        raise ValueError(
            f"{_name} ({_token!r}) does not occur in the prompt template."
        )

# API Setup
api_file = "/home/mshahidul/api_new.json"
with open(api_file, "r", encoding="utf-8") as f:
    api_keys = json.load(f)
openai_api_key = api_keys["openai"]

client = OpenAI(api_key=openai_api_key)


def openai_return(prompt, model="gpt-5"):
    """Send a prompt to GPT and parse the reply as JSON.

    Args:
        prompt: User message sent to the chat-completions endpoint.
        model: Model name passed to the API.

    Returns:
        The parsed JSON value on success.  If the reply was received but could
        not be parsed, the raw reply text is returned instead.  If no reply
        text was obtained at all (the request failed or the message had no
        content), ``None`` is returned.
    """
    # Bound before the try so the handler can return it even when the request
    # itself fails; previously that path raised UnboundLocalError.
    content = None
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that outputs only valid JSON."},
                {"role": "user", "content": prompt},
            ],
            response_format={"type": "json_object"},  # Ensuring JSON mode if supported
        )
        content = response.choices[0].message.content.strip()
        # Clean up possible markdown artifacts
        cleaned = content.replace("```json", "").replace("```", "").strip()
        return json.loads(cleaned)
    except Exception as e:
        # Deliberately broad: one failed item must not abort the whole batch.
        print(f"⚠️ Error during API call or parsing: {e}")
        return content


def get_translation(text):
    """Fill the prompt template for ``text`` and return the model's answer."""
    # Language tokens are substituted first and the document text last, so a
    # document that happens to contain a language token is left untouched.
    formatted_prompt = (
        prompt_template
        .replace(SOURCE_LANGUAGE_PLACEHOLDER, source_language)
        .replace(TARGET_LANGUAGE_PLACEHOLDER, target_language)
        .replace(TEXT_PLACEHOLDER, text)
    )
    return openai_return(formatted_prompt, model=MODEL)


def _save_results(results, path):
    """Write ``results`` to ``path`` as UTF-8 JSON without leaving a partial file."""
    # Write to a sibling file and rename it over the target so an interrupted
    # run cannot leave truncated JSON that would break the next resume.
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, path)


# Load existing results if they exist to resume progress
res = []
if os.path.exists(save_path):
    # The file is written as UTF-8 with non-ASCII text, so read it the same way.
    with open(save_path, "r", encoding="utf-8") as f:
        res = json.load(f)

# Load Source Data
with open(
    "/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_en.json",
    "r",
    encoding="utf-8",
) as f:
    data = json.load(f)

# --- Translation Loop ---
# Start from the number of already processed items.  Failed items are stored
# too (as raw text or None), which keeps this index aligned with ``data``.
start_index = len(res)

for item in tqdm.tqdm(data[start_index:MAX_ITEMS]):
    # Translate Fulltext
    translated_full = get_translation(item["fulltext"])

    # Translate Summary
    translated_sum = get_translation(item["summary"])

    # Create the translated object
    translated_item = {
        "id": item["id"],
        "fulltext_translated": translated_full,
        "summary_translated": translated_sum,
        "original_id": item["id"],
    }

    res.append(translated_item)

    # Incremental save every SAVE_EVERY items
    if len(res) % SAVE_EVERY == 0:
        _save_results(res, save_path)
        print(f" Saved {len(res)} samples so far.")

# Final Save
_save_results(res, save_path)

print(f"✅ Processing complete. Data saved to {save_path}")