# Source: readCtrl_lambda/code/translation/misc/translate_multiclinsum_en2bn_v3.py
# Author: mshahidul — initial commit of readCtrl code without large models (commit 030876e)
import os
import json
import time
from tqdm import tqdm
from openai import OpenAI
from transformers import AutoProcessor
# Load the TranslateGemma processor locally so its chat template is available.
# NOTE(review): the model itself is NOT loaded here — inference goes through
# the OpenAI-compatible server configured below; only the processor is local.
model_id = "google/translategemma-27b-it"
processor = AutoProcessor.from_pretrained(model_id)
# ---- Configuration ----
# Input: English gold-standard training split of MultiClinSum.
DATA_PATH = "/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_en.json"
# Output: first 200 records translated to Bengali (range encoded in the filename).
OUT_PATH = "/home/mshahidul/readctrl/data/translated_data/multiclinsum_gs_train_en2bn_gemma(0_200).json"
# Translation API (local TranslateGemma server, OpenAI-compatible endpoint)
TRANSLATE_BASE_URL = "http://localhost:8081/v1"
# Judge API (vLLM server hosting the judge model; overridable via environment)
VLLM_BASE_URL = os.environ.get("VLLM_BASE_URL", "http://localhost:8004/v1")
JUDGE_MODEL = os.environ.get("JUDGE_MODEL", "Qwen/Qwen3-30B-A3B-Instruct-2507")
# Initialize Clients — local servers accept any API key, so a placeholder is used.
translate_client = OpenAI(base_url=TRANSLATE_BASE_URL, api_key="no-key-required")
judge_client = OpenAI(base_url=VLLM_BASE_URL, api_key="no-key-required")
def translate_text(text, source_lang="en", target_lang="bn"):
    """Translate a single string via the local TranslateGemma chat endpoint.

    Args:
        text: Source text to translate.
        source_lang: Language code of the input (default "en").
        target_lang: Language code of the output (default "bn").

    Returns:
        The translated string, or None if the request failed or the server
        returned no content.
    """
    try:
        # TranslateGemma expects structured content parts carrying the
        # language codes; the serving side applies the chat template.
        messages = [{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "source_lang_code": source_lang,
                    "target_lang_code": target_lang,
                    "text": text,
                }
            ],
        }]
        # BUG FIX: the chat completions API takes the structured message
        # list. The previous code passed the *string* produced by
        # processor.apply_chat_template() as `messages=`, which is not a
        # valid chat payload (messages must be a list of role/content dicts).
        completion = translate_client.chat.completions.create(
            model="translate_gemma",
            messages=messages,
            temperature=0.1
        )
        content = completion.choices[0].message.content
        # Guard against an empty/None completion instead of crashing on .strip().
        return content.strip() if content else None
    except Exception as e:
        # Best-effort: report the failure and signal it with None so the
        # caller can decide how to handle the record.
        print(f"Translation error: {e}")
        return None
def judge_translation(original, translated):
    """Ask the judge model whether a Bengali translation is acceptable.

    Args:
        original: The English source text.
        translated: The candidate Bengali translation.

    Returns:
        True when the judge replies PASS — or when the judge call itself
        fails, so a flaky judge never blocks the pipeline. False otherwise.
    """
    judge_prompt = f"""
You are a linguistic judge. Evaluate the following Bengali translation of an English medical text.
Check for:
1. Presence of any language other than Bengali or English medical terms.
2. Hallucinated keywords not present in the original.
Original English: {original}
Translated Bengali: {translated}
Does this translation pass? Respond with ONLY 'PASS' or 'FAIL'.
"""
    try:
        reply = judge_client.chat.completions.create(
            model=JUDGE_MODEL,
            messages=[{"role": "user", "content": judge_prompt}],
            max_tokens=5
        )
        verdict = reply.choices[0].message.content.strip().upper()
    except Exception as e:
        print(f"Judge error: {e}")
        return True  # Fail open: default to True to avoid getting stuck
    return "PASS" in verdict
def process_batch(data_slice):
    """Translate and judge every record in a slice of the dataset.

    Mutates each record in place, adding 'translated_fulltext',
    'translated_summary' and 'judge_pass' keys, and returns the records
    as a list.

    BUG FIX: previously a failed translation (None) was passed straight to
    the judge — embedding the literal string "None" into the judge prompt —
    and the record could still end up marked judge_pass=True. A record with
    any missing translation is now marked judge_pass=False without invoking
    the judge at all.
    """
    results = []
    for record in data_slice:
        # Translate Fulltext
        bn_fulltext = translate_text(record['fulltext'])
        # Translate Summary
        bn_summary = translate_text(record['summary'])

        # Verify with Judge — only when both translations actually exist.
        if bn_fulltext is not None and bn_summary is not None:
            is_valid_full = judge_translation(record['fulltext'], bn_fulltext)
            is_valid_sum = judge_translation(record['summary'], bn_summary)
            judge_pass = is_valid_full and is_valid_sum
        else:
            judge_pass = False

        record['translated_fulltext'] = bn_fulltext
        record['translated_summary'] = bn_summary
        record['judge_pass'] = judge_pass
        results.append(record)
    return results
# ---- Main Execution ----
def main():
    """Run the translate-and-judge pipeline over records 0..199 of DATA_PATH."""
    # Load data
    with open(DATA_PATH, 'r', encoding='utf-8') as src:
        records = json.load(src)

    # Slice matches the (0_200) range encoded in OUT_PATH.
    subset = records[:200]
    batch_size = 10  # Adjust based on your VRAM/Server capacity
    translated_data = []

    print(f"Starting translation for {len(subset)} records...")
    for start in tqdm(range(0, len(subset), batch_size)):
        chunk = subset[start:start + batch_size]
        translated_data.extend(process_batch(chunk))

        # Checkpoint: rewrite the full output after every batch so a crash
        # loses at most one batch of work.
        os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
        with open(OUT_PATH, 'w', encoding='utf-8') as dst:
            json.dump(translated_data, dst, ensure_ascii=False, indent=4)

    print(f"Processing complete. Saved to {OUT_PATH}")


if __name__ == "__main__":
    main()