| import os |
| import json |
| import time |
| from tqdm import tqdm |
| from openai import OpenAI |
| from transformers import AutoProcessor |
# Hugging Face checkpoint whose processor supplies the chat template used to
# render translation prompts.
model_id = "google/translategemma-27b-it"
# Loaded once at import time. In this file it is only used to render prompts
# (apply_chat_template with tokenize=False), not to run the model.
processor = AutoProcessor.from_pretrained(model_id)

# Input dataset (read by main) and destination of the translated records.
DATA_PATH = "/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_en.json"
OUT_PATH = "/home/mshahidul/readctrl/data/translated_data/multiclinsum_gs_train_en2bn_gemma(0_200).json"

# OpenAI-compatible endpoint serving the translation model. Overridable from
# the environment, consistent with the judge endpoint below; the default is
# the previously hard-coded address.
TRANSLATE_BASE_URL = os.environ.get("TRANSLATE_BASE_URL", "http://localhost:8081/v1")

# OpenAI-compatible endpoint and model name used to judge the translations.
VLLM_BASE_URL = os.environ.get("VLLM_BASE_URL", "http://localhost:8004/v1")
JUDGE_MODEL = os.environ.get("JUDGE_MODEL", "Qwen/Qwen3-30B-A3B-Instruct-2507")

# One client per endpoint. The api_key is a placeholder value.
translate_client = OpenAI(base_url=TRANSLATE_BASE_URL, api_key="no-key-required")
judge_client = OpenAI(base_url=VLLM_BASE_URL, api_key="no-key-required")
|
|
def translate_text(text, source_lang="en", target_lang="bn", max_tokens=4096):
    """
    Translate a single string through the Gemma translation endpoint.

    The prompt is rendered locally with the processor's chat template and
    sent to the text-completions endpoint. A rendered prompt is a plain
    string, so it cannot be passed as ``messages`` to the chat endpoint,
    which expects a list of message objects.

    Args:
        text: The string to translate.
        source_lang: Language code of ``text``.
        target_lang: Language code to translate into.
        max_tokens: Upper bound on generated tokens. Passed explicitly
            because completions endpoints commonly default to a very small
            budget, which would truncate the translation. Raise it for very
            long inputs.

    Returns:
        The stripped translation, or None when ``text`` is empty/not a
        string or when any step of the request fails.
    """
    # Nothing to translate: avoid a pointless request.
    if not isinstance(text, str) or not text.strip():
        return None

    try:
        messages = [{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "source_lang_code": source_lang,
                    "target_lang_code": target_lang,
                    "text": text,
                }
            ],
        }]

        # Render to a string; the server receives it as a raw prompt.
        # NOTE(review): the rendered template may already include the BOS
        # token -- confirm the server does not prepend a second one.
        prompt = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        completion = translate_client.completions.create(
            model="translate_gemma",
            prompt=prompt,
            temperature=0.1,
            max_tokens=max_tokens,
        )
        return completion.choices[0].text.strip()
    except Exception as e:
        # Best-effort: report the failure and let the caller see None.
        print(f"Translation error: {e}")
        return None
|
|
def judge_translation(original, translated):
    """
    Ask the judge model whether a Bengali translation is acceptable.

    The judge is prompted to look for text in languages other than Bengali
    (English medical terms are allowed) and for hallucinated keywords, and
    to answer with PASS or FAIL.

    Args:
        original: The English source text.
        translated: The Bengali translation, or None if translation failed.

    Returns:
        False when ``translated`` is missing or blank (a failed translation
        can never pass, and the judge is not called). Otherwise True if the
        judge's reply contains "PASS", False if it does not. If the judge
        request itself raises, the error is printed and True is returned
        (fail-open), so an unavailable judge does not reject translations.
    """
    # Without this guard the literal text "None" would be sent to the judge,
    # and a judge error would mark a missing translation as passed.
    if not isinstance(translated, str) or not translated.strip():
        return False

    prompt = f"""
    You are a linguistic judge. Evaluate the following Bengali translation of an English medical text.
    Check for:
    1. Presence of any language other than Bengali or English medical terms.
    2. Hallucinated keywords not present in the original.

    Original English: {original}
    Translated Bengali: {translated}

    Does this translation pass? Respond with ONLY 'PASS' or 'FAIL'.
    """
    try:
        response = judge_client.chat.completions.create(
            model=JUDGE_MODEL,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=5
        )
        result = response.choices[0].message.content.strip().upper()
        return "PASS" in result
    except Exception as e:
        print(f"Judge error: {e}")
        return True
|
|
def process_batch(data_slice):
    """
    Translate and judge every record in a batch.

    Each record is a dict expected to carry 'fulltext' and 'summary'. The
    record is updated in place with 'translated_fulltext',
    'translated_summary' and 'judge_pass', and the same dict is appended to
    the returned list.

    Args:
        data_slice: Iterable of record dicts.

    Returns:
        A list of the (mutated) records, in input order. 'judge_pass' is
        True only when both translations exist and both pass the judge.
    """
    results = []
    for record in data_slice:
        # .get keeps one malformed record from aborting the whole run.
        fulltext = record.get('fulltext')
        summary = record.get('summary')

        # Skip the request entirely when the source field is absent or empty.
        bn_fulltext = translate_text(fulltext) if fulltext else None
        bn_summary = translate_text(summary) if summary else None

        # A failed translation (None) can never pass, so don't spend a judge
        # call on it -- and don't let a judge error mark it as passed.
        is_valid_full = bn_fulltext is not None and judge_translation(fulltext, bn_fulltext)
        is_valid_sum = bn_summary is not None and judge_translation(summary, bn_summary)

        record['translated_fulltext'] = bn_fulltext
        record['translated_summary'] = bn_summary
        record['judge_pass'] = is_valid_full and is_valid_sum

        results.append(record)
    return results
|
|
| |
def _write_json_atomic(path, payload):
    """Write payload as JSON to path via a temp file and an atomic rename."""
    tmp_path = path + ".tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)
    # os.replace swaps the file in one step, so an interrupted run never
    # leaves a half-written output file behind.
    os.replace(tmp_path, path)


def main():
    """
    Translate the first 200 records of DATA_PATH and save them to OUT_PATH.

    Records are processed in batches of 10. The accumulated results are
    written to OUT_PATH after every batch, so a crash part-way through
    loses at most the batch in progress.
    """
    with open(DATA_PATH, 'r', encoding='utf-8') as f:
        data = json.load(f)

    subset = data[0:200]

    translated_data = []
    batch_size = 10

    # Create the output directory up front so a bad path fails before any
    # translation work is done. dirname is empty for a bare filename.
    out_dir = os.path.dirname(OUT_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    print(f"Starting translation for {len(subset)} records...")

    for i in tqdm(range(0, len(subset), batch_size)):
        batch = subset[i:i+batch_size]
        translated_data.extend(process_batch(batch))
        # Checkpoint everything translated so far.
        _write_json_atomic(OUT_PATH, translated_data)

    # With no input records the loop never runs; still produce the file.
    if not translated_data:
        _write_json_atomic(OUT_PATH, translated_data)

    print(f"Processing complete. Saved to {OUT_PATH}")
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()