readCtrl_lambda / code /translation /translation_using_gpt5_v2.py

mshahidul

Initial commit of readCtrl code without large models

030876e 7 days ago

3.29 kB

	import json
	import os
	import tqdm
	from pathlib import Path
	from openai import OpenAI

	# --- Configuration ---
	# Language pair for this run; the lower-cased names also form the output file name.
	source_language = "English"
	target_language = "Bangla"
	save_dir = "/home/mshahidul/readctrl/data/translated_data"
	save_path = os.path.join(
	    save_dir,
	    f"translation_{source_language.lower()}2{target_language.lower()}_v1.json",
	)

	# Create the output directory (and any missing parents) before anything is written.
	os.makedirs(save_dir, exist_ok=True)

	print(f"Translating from {source_language} to {target_language}")

	# Prompt template: read the whole file into a single string.
	prompt_template = Path("/home/mshahidul/readctrl/prompts/translation_prompt.txt").read_text()

	# API setup: the key file is a JSON object holding the key under "openai".
	api_file = "/home/mshahidul/api_new.json"
	api_keys = json.loads(Path(api_file).read_text())
	openai_api_key = api_keys["openai"]

	client = OpenAI(api_key=openai_api_key)

	def openai_return(prompt, model="gpt-5"):
	    """Send a prompt to the chat-completions API and parse the reply as JSON.

	    Args:
	        prompt: Text sent as the user message.
	        model: Model name passed to the API.

	    Returns:
	        The decoded JSON value on success. On any failure the error is
	        printed and the raw reply text is returned if one was received,
	        otherwise ``None``.
	    """
	    # Bound before the try so the handler can always return it, even when
	    # the request itself fails and no reply text was ever produced.
	    content = None
	    try:
	        response = client.chat.completions.create(
	            model=model,
	            messages=[
	                {"role": "system", "content": "You are a helpful assistant that outputs only valid JSON."},
	                {"role": "user", "content": prompt},
	            ],
	            response_format={"type": "json_object"},  # Ensuring JSON mode if supported
	        )
	        content = response.choices[0].message.content.strip()
	        # Clean up possible markdown artifacts
	        cleaned = content.replace("```json", "").replace("```", "").strip()
	        return json.loads(cleaned)
	    except Exception as e:
	        # Deliberate best-effort boundary: report the problem and hand back
	        # whatever text was received instead of aborting the caller.
	        print(f"⚠️ Error during API call or parsing: {e}")
	        return content

	# Load existing results if they exist to resume progress.
	# The results file is written as UTF-8 with ensure_ascii=False, so it has to
	# be read back as UTF-8 rather than with the platform default encoding.
	res = []
	if os.path.exists(save_path):
	    with open(save_path, "r", encoding="utf-8") as f:
	        res = json.load(f)

	# Load Source Data (JSON text is read as UTF-8 for the same reason)
	with open("/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_en.json", "r", encoding="utf-8") as f:
	    data = json.load(f)

	# --- Translation Loop ---
	def get_translation(text):
	    """Fill the prompt template for ``text`` and return the model's reply.

	    Args:
	        text: Source-language text to translate.

	    Returns:
	        Whatever ``openai_return`` returns for the formatted prompt.
	    """
	    # The language placeholders are filled first and the document text last,
	    # so placeholder-like strings inside the text are never substituted.
	    formatted_prompt = (
	        prompt_template
	        .replace("<SOURCE_LANGUAGE>", source_language)
	        .replace("<TARGET_LANGUAGE>", target_language)
	        .replace("<MEDICAL_TEXT>", text)
	    )
	    return openai_return(formatted_prompt, model="gpt-5")


	# Start from the number of already processed items; only source items
	# below index 200 are translated.
	start_index = len(res)
	for item in tqdm.tqdm(data[start_index:200]):
	    # Translate Fulltext
	    translated_full = get_translation(item["fulltext"])

	    # Translate Summary
	    translated_sum = get_translation(item["summary"])

	    # Create the translated object
	    translated_item = {
	        "id": item["id"],
	        "fulltext_translated": translated_full,
	        "summary_translated": translated_sum,
	        "original_id": item["id"],
	    }

	    res.append(translated_item)

	    # Incremental save every 2 items
	    if len(res) % 2 == 0:
	        with open(save_path, "w", encoding='utf-8') as f:
	            json.dump(res, f, indent=2, ensure_ascii=False)
	        print(f" Saved {len(res)} samples so far.")

	# Final Save
	# Write the full result list once more after the loop, keeping non-ASCII
	# characters unescaped in the UTF-8 output.
	with open(save_path, "w", encoding="utf-8") as out_file:
	    out_file.write(json.dumps(res, indent=2, ensure_ascii=False))

	print(f"✅ Processing complete. Data saved to {save_path}")