# File: readctrl/code/translation/translation_using_gpt5_v2.py
# Uploader: shahidul034
# Commit message: Add files using upload-large-folder tool
# Commit: c7a6fe6 (verified)
import json
import os
import tqdm
from pathlib import Path
from openai import OpenAI
# --- Configuration ---
# Language pair; both values are substituted into the prompt and the output
# file name.
source_language = "English"
target_language = "Bangla"
save_dir = "/home/mshahidul/readctrl/data/translated_data"
# Output file name encodes the language pair in lower case.
save_path = os.path.join(
    save_dir,
    f"translation_{source_language.lower()}2{target_language.lower()}_v1.json",
)

# Ensure the directory exists
Path(save_dir).mkdir(parents=True, exist_ok=True)
print(f"Translating from {source_language} to {target_language}")

# Load Prompt Template
# Read as UTF-8 explicitly so the template decodes the same way regardless of
# the machine's locale (the default encoding of open() is platform dependent).
with open(
    "/home/mshahidul/readctrl/prompts/translation_prompt.txt",
    "r",
    encoding="utf-8",
) as f:
    prompt_template = f.read()

# API Setup
# The key file is a JSON object; only its "openai" entry is used here.
api_file = "/home/mshahidul/api_new.json"
with open(api_file, "r", encoding="utf-8") as f:
    api_keys = json.load(f)
openai_api_key = api_keys["openai"]
client = OpenAI(api_key=openai_api_key)
def openai_return(prompt, model="gpt-5"):
    """Send a prompt to GPT and parse the reply as JSON.

    Args:
        prompt: Full user prompt to send to the chat completions endpoint.
        model: Name of the model to query.

    Returns:
        The parsed JSON value when the reply is valid JSON. If the reply
        cannot be parsed, the raw (stripped) reply text is returned instead.
        If the request fails before any reply text is available, ``None`` is
        returned.
    """
    # Bind the fallback value up front: the except branch returns it, and the
    # request can fail before any reply text has been read.
    content = None
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that outputs only valid JSON."},
                {"role": "user", "content": prompt},
            ],
            response_format={"type": "json_object"},  # Ensuring JSON mode if supported
        )
        # Guard against a reply without text so .strip() is never called on None.
        content = (response.choices[0].message.content or "").strip()
        # Clean up possible markdown artifacts
        cleaned = content.replace("```json", "").replace("```", "").strip()
        return json.loads(cleaned)
    except Exception as e:
        # Best-effort boundary: report the problem and hand back whatever text
        # was received (None if the call itself failed).
        print(f"⚠️ Error during API call or parsing: {e}")
        return content
# Load existing results if they exist to resume progress
# The save file is written as UTF-8 with ensure_ascii=False, so it must be
# read back as UTF-8 too; relying on the locale default can fail or mis-decode
# the translated text on machines whose default encoding is not UTF-8.
res = []
if os.path.exists(save_path):
    with open(save_path, "r", encoding="utf-8") as f:
        res = json.load(f)

# Load Source Data
with open(
    "/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_en.json",
    "r",
    encoding="utf-8",
) as f:
    data = json.load(f)
# --- Translation Loop ---
def get_translation(text):
    """Fill the prompt template for *text* and return the model's reply.

    The result is whatever ``openai_return`` yields for the formatted prompt.
    """
    # Substitute the language placeholders first and the text last, so that
    # placeholder-looking strings inside the medical text are left untouched.
    formatted_prompt = (
        prompt_template
        .replace("<SOURCE_LANGUAGE>", source_language)
        .replace("<TARGET_LANGUAGE>", target_language)
        .replace("<MEDICAL_TEXT>", text)
    )
    return openai_return(formatted_prompt, model="gpt-5")


def _save_results():
    """Write every result collected so far to ``save_path`` as UTF-8 JSON."""
    with open(save_path, "w", encoding='utf-8') as f:
        json.dump(res, f, indent=2, ensure_ascii=False)


# Start from the number of already processed items
start_index = len(res)
try:
    # Only the first 200 source items are processed.
    for item in tqdm.tqdm(data[start_index:200]):
        # Translate Fulltext
        translated_full = get_translation(item["fulltext"])
        # Translate Summary
        translated_sum = get_translation(item["summary"])
        # Create the translated object
        translated_item = {
            "id": item["id"],
            "fulltext_translated": translated_full,
            "summary_translated": translated_sum,
            "original_id": item["id"],
        }
        res.append(translated_item)
        # Incremental save every 2 items
        if len(res) % 2 == 0:
            _save_results()
            print(f" Saved {len(res)} samples so far.")
finally:
    # Final Save
    # Runs on normal completion and also when the loop is interrupted or
    # raises, so items translated since the last incremental save are kept
    # for the next resumed run.
    _save_results()
print(f"✅ Processing complete. Data saved to {save_path}")