File size: 3,291 Bytes

c7a6fe6

import json
import os
import tqdm
from pathlib import Path
from openai import OpenAI

# --- Configuration ---
source_language = "English"
target_language = "Bangla"
save_dir = "/home/mshahidul/readctrl/data/translated_data"
# The output file name encodes the lower-cased language pair.
save_path = os.path.join(
    save_dir,
    f"translation_{source_language.lower()}2{target_language.lower()}_v1.json",
)

# Create the output directory (and any missing parents) before anything is written
os.makedirs(save_dir, exist_ok=True)

print(f"Translating from {source_language} to {target_language}")

# Prompt template: the whole file is kept as a single string
prompt_template = Path("/home/mshahidul/readctrl/prompts/translation_prompt.txt").read_text()

# API setup: the key file is JSON and is indexed by "openai" below
api_file = "/home/mshahidul/api_new.json"
api_keys = json.loads(Path(api_file).read_text())
openai_api_key = api_keys["openai"]

client = OpenAI(api_key=openai_api_key)

def openai_return(prompt, model="gpt-5"):
    """Send a prompt to the chat completions API and parse the reply as JSON.

    Args:
        prompt: Text sent as the user message.
        model: Model name passed to the API call.

    Returns:
        The parsed JSON value on success. If a reply was received but could
        not be parsed as JSON, the raw (stripped) reply text. ``None`` if the
        failure happened before any reply text was available (for example
        the request itself raised, or the reply had no text content).
    """
    # Bound before the try so the except branch can always return it; without
    # this, a failure ahead of the assignment below raised UnboundLocalError
    # from inside the handler.
    content = None
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that outputs only valid JSON."},
                {"role": "user", "content": prompt}
            ],
            response_format={"type": "json_object"} # Ensuring JSON mode if supported
        )
        content = response.choices[0].message.content.strip()
        # Clean up possible markdown artifacts
        cleaned = content.replace("```json", "").replace("```", "").strip()
        return json.loads(cleaned)
    except Exception as e:
        # Best-effort: report the problem and hand back whatever text we have.
        print(f"⚠️ Error during API call or parsing: {e}")
        return content

# Load existing results if they exist to resume progress.
# The checkpoint is written as UTF-8 with non-ASCII characters kept as-is,
# so it is read back explicitly as UTF-8 rather than with the locale default.
res = []
if os.path.exists(save_path):
    with open(save_path, "r", encoding="utf-8") as f:
        res = json.load(f)

# Load Source Data (JSON text is read as UTF-8 for the same reason)
with open("/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_en.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# --- Translation Loop ---
# Exclusive upper bound on the source items covered by this script.
max_items = 200
# A checkpoint is written whenever the result count is a multiple of this.
save_every = 2


def get_translation(text):
    """Fill the prompt template for `text` and return the model's reply.

    The language placeholders are substituted before the text is inserted,
    so placeholder-like tokens inside `text` itself are left untouched.

    Args:
        text: Source-language text to translate.

    Returns:
        Whatever `openai_return` returns for the formatted prompt.
    """
    formatted_prompt = (prompt_template
                        .replace("<SOURCE_LANGUAGE>", source_language)
                        .replace("<TARGET_LANGUAGE>", target_language)
                        .replace("<MEDICAL_TEXT>", text))
    return openai_return(formatted_prompt, model="gpt-5")


def save_results(results, path):
    """Write `results` to `path` as UTF-8 JSON without exposing a partial file.

    The data is written to a sibling temporary file and then moved over
    `path`, so an interruption mid-write cannot leave a truncated checkpoint
    that would break resuming on the next run.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, path)


# Start from the number of already processed items
# NOTE(review): this assumes the saved results correspond, in order, to the
# first len(res) entries of `data` — confirm if the source file ever changes.
start_index = len(res)
for item in tqdm.tqdm(data[start_index:max_items]):

    # Translate Fulltext
    translated_full = get_translation(item["fulltext"])

    # Translate Summary
    translated_sum = get_translation(item["summary"])

    # Create the translated object
    translated_item = {
        "id": item["id"],
        "fulltext_translated": translated_full,
        "summary_translated": translated_sum,
        "original_id": item["id"]
    }

    res.append(translated_item)

    # Incremental save every `save_every` items
    if len(res) % save_every == 0:
        save_results(res, save_path)
        print(f" Saved {len(res)} samples so far.")

# Final Save
save_results(res, save_path)

print(f"✅ Processing complete. Data saved to {save_path}")