# readCtrl_lambda/code/translation/misc/translate_multiclinsum_en2bn_v3.py
# Author: mshahidul — "Initial commit of readCtrl code without large models" (commit 030876e)
import os
import json
import time
from tqdm import tqdm
from openai import OpenAI
from transformers import AutoProcessor
# The TranslateGemma processor is loaded locally ONLY to render the chat
# template into a raw prompt string; inference itself runs on the local
# server at TRANSLATE_BASE_URL.
model_id = "google/translategemma-27b-it"
processor = AutoProcessor.from_pretrained(model_id)

# ---- Configuration ----
# Input: English gold-standard MultiClinSum training records (JSON list).
DATA_PATH = "/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_en.json"
# Output: Bengali translations for records 0-200 (range encoded in the name).
OUT_PATH = "/home/mshahidul/readctrl/data/translated_data/multiclinsum_gs_train_en2bn_gemma(0_200).json"

# Translation API — local OpenAI-compatible server hosting TranslateGemma.
TRANSLATE_BASE_URL = "http://localhost:8081/v1"

# Judge API — Qwen model that verifies translation quality; both endpoint
# and model are overridable via environment variables.
VLLM_BASE_URL = os.environ.get("VLLM_BASE_URL", "http://localhost:8004/v1")
JUDGE_MODEL = os.environ.get("JUDGE_MODEL", "Qwen/Qwen3-30B-A3B-Instruct-2507")

# Initialize Clients — local servers ignore the API key, but the OpenAI
# client requires a non-empty string.
translate_client = OpenAI(base_url=TRANSLATE_BASE_URL, api_key="no-key-required")
judge_client = OpenAI(base_url=VLLM_BASE_URL, api_key="no-key-required")
def translate_text(text, source_lang="en", target_lang="bn"):
    """
    Translate a single string via the local TranslateGemma endpoint.

    Builds a TranslateGemma-style message, renders it to a raw prompt with
    the locally loaded chat template, and sends that prompt to the plain
    completions endpoint of the translation server.

    Args:
        text: Source text to translate.
        source_lang: Language code of the input (default "en").
        target_lang: Language code of the output (default "bn").

    Returns:
        The translated string, or None if the request failed.
    """
    try:
        messages = [{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "source_lang_code": source_lang,
                    "target_lang_code": target_lang,
                    "text": text,
                }
            ],
        }]
        prompt = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # BUGFIX: the original passed the rendered template STRING as
        # `messages=` to chat.completions.create, which expects a list of
        # message dicts — every call raised, was swallowed below, and
        # returned None. Since the chat template is applied locally, the
        # raw prompt belongs on the completions endpoint instead.
        completion = translate_client.completions.create(
            model="translate_gemma",
            prompt=prompt,
            temperature=0.1,
            # The completions endpoint defaults to max_tokens=16, which
            # would truncate every translation; clinical fulltexts are long.
            max_tokens=8192,
        )
        return completion.choices[0].text.strip()
    except Exception as e:
        # Best-effort: the caller treats None as "translation failed".
        print(f"Translation error: {e}")
        return None
def judge_translation(original, translated):
    """
    Ask the Qwen judge model whether a Bengali translation is faithful.

    The judge checks for non-Bengali content (English medical terms are
    allowed) and for hallucinated keywords absent from the original.

    Returns:
        True when the judge answers PASS — and also on any API error, a
        deliberate choice so a flaky judge never stalls the pipeline.
    """
    judge_prompt = f"""
You are a linguistic judge. Evaluate the following Bengali translation of an English medical text.
Check for:
1. Presence of any language other than Bengali or English medical terms.
2. Hallucinated keywords not present in the original.
Original English: {original}
Translated Bengali: {translated}
Does this translation pass? Respond with ONLY 'PASS' or 'FAIL'.
"""
    try:
        reply = judge_client.chat.completions.create(
            model=JUDGE_MODEL,
            messages=[{"role": "user", "content": judge_prompt}],
            max_tokens=5,
        )
    except Exception as e:
        print(f"Judge error: {e}")
        return True  # Default to True to avoid getting stuck

    verdict = reply.choices[0].message.content.strip().upper()
    return "PASS" in verdict
def process_batch(data_slice):
    """
    Translate and judge a slice of records.

    For each record, translates 'fulltext' and 'summary' into Bengali and
    verifies both with the judge model. Mutates each input dict in place,
    adding 'translated_fulltext', 'translated_summary', and 'judge_pass',
    and returns the records as a list.
    """
    results = []
    for record in data_slice:
        bn_fulltext = translate_text(record['fulltext'])
        bn_summary = translate_text(record['summary'])

        # BUGFIX: a failed translation returns None; the original still
        # sent the literal string 'None' to the judge. A missing
        # translation can never pass, so skip the judge call entirely.
        is_valid_full = (
            bn_fulltext is not None
            and judge_translation(record['fulltext'], bn_fulltext)
        )
        is_valid_sum = (
            bn_summary is not None
            and judge_translation(record['summary'], bn_summary)
        )

        record['translated_fulltext'] = bn_fulltext
        record['translated_summary'] = bn_summary
        record['judge_pass'] = is_valid_full and is_valid_sum
        results.append(record)
    return results
# ---- Main Execution ----
def main():
    """Translate records 0-200 of the dataset, checkpointing each batch."""
    with open(DATA_PATH, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    # Slice matching the (0_200) range encoded in OUT_PATH.
    records = dataset[:200]
    translated = []
    chunk_size = 10  # Adjust based on your VRAM/Server capacity

    print(f"Starting translation for {len(records)} records...")

    # makedirs is idempotent, so it is safe to run once up front.
    os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)

    for start in tqdm(range(0, len(records), chunk_size)):
        translated.extend(process_batch(records[start:start + chunk_size]))
        # Checkpoint after every batch so a crash loses at most one batch.
        with open(OUT_PATH, 'w', encoding='utf-8') as f:
            json.dump(translated, f, ensure_ascii=False, indent=4)

    print(f"Processing complete. Saved to {OUT_PATH}")


if __name__ == "__main__":
    main()