# Source: readCtrl_lambda/code/translation/misc/translate_multiclinsum_en2bn_v3.py
# Author: mshahidul — initial commit of readCtrl code without large models (commit 030876e)
import os
import json
import time
from tqdm import tqdm
from openai import OpenAI
from transformers import AutoProcessor
# Load the TranslateGemma processor locally so its chat template is available.
# NOTE(review): the model itself is NOT loaded here — inference goes through
# the OpenAI-compatible server configured below; only the processor is local.
model_id = "google/translategemma-27b-it"
processor = AutoProcessor.from_pretrained(model_id)
# ---- Configuration ----
# Input: English gold-standard training split of MultiClinSum.
DATA_PATH = "/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_en.json"
# Output: first 200 records translated to Bengali (range encoded in the filename).
OUT_PATH = "/home/mshahidul/readctrl/data/translated_data/multiclinsum_gs_train_en2bn_gemma(0_200).json"
# Translation API (local TranslateGemma server, OpenAI-compatible endpoint)
TRANSLATE_BASE_URL = "http://localhost:8081/v1"
# Judge API (vLLM server hosting the judge model; overridable via environment)
VLLM_BASE_URL = os.environ.get("VLLM_BASE_URL", "http://localhost:8004/v1")
JUDGE_MODEL = os.environ.get("JUDGE_MODEL", "Qwen/Qwen3-30B-A3B-Instruct-2507")
# Initialize Clients — local servers accept any API key, so a placeholder is used.
translate_client = OpenAI(base_url=TRANSLATE_BASE_URL, api_key="no-key-required")
judge_client = OpenAI(base_url=VLLM_BASE_URL, api_key="no-key-required")
def translate_text(text, source_lang="en", target_lang="bn"):
    """Translate a single string via the local TranslateGemma chat endpoint.

    Args:
        text: Source text to translate.
        source_lang: Language code of the input (default "en").
        target_lang: Language code of the output (default "bn").

    Returns:
        The translated string, or None if the request failed or the server
        returned no content.
    """
    try:
        # TranslateGemma expects structured content parts carrying the
        # language codes; the serving side applies the chat template.
        messages = [{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "source_lang_code": source_lang,
                    "target_lang_code": target_lang,
                    "text": text,
                }
            ],
        }]
        # BUG FIX: the chat completions API takes the structured message
        # list. The previous code passed the *string* produced by
        # processor.apply_chat_template() as `messages=`, which is not a
        # valid chat payload (messages must be a list of role/content dicts).
        completion = translate_client.chat.completions.create(
            model="translate_gemma",
            messages=messages,
            temperature=0.1
        )
        content = completion.choices[0].message.content
        # Guard against an empty/None completion instead of crashing on .strip().
        return content.strip() if content else None
    except Exception as e:
        # Best-effort: report the failure and signal it with None so the
        # caller can decide how to handle the record.
        print(f"Translation error: {e}")
        return None
def judge_translation(original, translated):
    """Ask the judge model whether a Bengali translation is acceptable.

    Args:
        original: The English source text.
        translated: The candidate Bengali translation.

    Returns:
        True when the judge replies PASS — or when the judge call itself
        fails, so a flaky judge never blocks the pipeline. False otherwise.
    """
    judge_prompt = f"""
You are a linguistic judge. Evaluate the following Bengali translation of an English medical text.
Check for:
1. Presence of any language other than Bengali or English medical terms.
2. Hallucinated keywords not present in the original.
Original English: {original}
Translated Bengali: {translated}
Does this translation pass? Respond with ONLY 'PASS' or 'FAIL'.
"""
    try:
        reply = judge_client.chat.completions.create(
            model=JUDGE_MODEL,
            messages=[{"role": "user", "content": judge_prompt}],
            max_tokens=5
        )
        verdict = reply.choices[0].message.content.strip().upper()
    except Exception as e:
        print(f"Judge error: {e}")
        return True  # Fail open: default to True to avoid getting stuck
    return "PASS" in verdict
def process_batch(data_slice):
    """Translate and judge every record in a slice of the dataset.

    Mutates each record in place, adding 'translated_fulltext',
    'translated_summary' and 'judge_pass' keys, and returns the records
    as a list.

    BUG FIX: previously a failed translation (None) was passed straight to
    the judge — embedding the literal string "None" into the judge prompt —
    and the record could still end up marked judge_pass=True. A record with
    any missing translation is now marked judge_pass=False without invoking
    the judge at all.
    """
    results = []
    for record in data_slice:
        # Translate Fulltext
        bn_fulltext = translate_text(record['fulltext'])
        # Translate Summary
        bn_summary = translate_text(record['summary'])

        # Verify with Judge — only when both translations actually exist.
        if bn_fulltext is not None and bn_summary is not None:
            is_valid_full = judge_translation(record['fulltext'], bn_fulltext)
            is_valid_sum = judge_translation(record['summary'], bn_summary)
            judge_pass = is_valid_full and is_valid_sum
        else:
            judge_pass = False

        record['translated_fulltext'] = bn_fulltext
        record['translated_summary'] = bn_summary
        record['judge_pass'] = judge_pass
        results.append(record)
    return results
# ---- Main Execution ----
def main():
    """Run the translate-and-judge pipeline over records 0..199 of DATA_PATH."""
    # Load data
    with open(DATA_PATH, 'r', encoding='utf-8') as src:
        records = json.load(src)

    # Slice matches the (0_200) range encoded in OUT_PATH.
    subset = records[:200]
    batch_size = 10  # Adjust based on your VRAM/Server capacity
    translated_data = []

    print(f"Starting translation for {len(subset)} records...")
    for start in tqdm(range(0, len(subset), batch_size)):
        chunk = subset[start:start + batch_size]
        translated_data.extend(process_batch(chunk))

        # Checkpoint: rewrite the full output after every batch so a crash
        # loses at most one batch of work.
        os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
        with open(OUT_PATH, 'w', encoding='utf-8') as dst:
            json.dump(translated_data, dst, ensure_ascii=False, indent=4)

    print(f"Processing complete. Saved to {OUT_PATH}")


if __name__ == "__main__":
    main()