"""Translate an English clinical dataset to Bengali and screen the output.

Each record's ``fulltext`` and ``summary`` are translated through a local
OpenAI-compatible endpoint, then a second (judge) endpoint is asked whether
each translation passes. Results are checkpointed to ``OUT_PATH`` after every
batch.
"""
import os
import json
import time

from tqdm import tqdm
from openai import OpenAI
from transformers import AutoProcessor

model_id = "google/translategemma-27b-it"
# Used only to render the model's chat template into a prompt string.
processor = AutoProcessor.from_pretrained(model_id)

# ---- Configuration ----
DATA_PATH = "/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_en.json"
OUT_PATH = "/home/mshahidul/readctrl/data/translated_data/multiclinsum_gs_train_en2bn_gemma(0_200).json"

# Translation API
TRANSLATE_BASE_URL = "http://localhost:8081/v1"
# Generation budget for one translation. Text-completion endpoints commonly
# default to a very small max_tokens, which would truncate long documents.
# NOTE(review): prompt length + this value must fit the server's context
# window -- tune via the environment variable if requests are rejected.
TRANSLATE_MAX_TOKENS = int(os.environ.get("TRANSLATE_MAX_TOKENS", "4096"))

# Judge API
VLLM_BASE_URL = os.environ.get("VLLM_BASE_URL", "http://localhost:8004/v1")
JUDGE_MODEL = os.environ.get("JUDGE_MODEL", "Qwen/Qwen3-30B-A3B-Instruct-2507")

# Initialize Clients
translate_client = OpenAI(base_url=TRANSLATE_BASE_URL, api_key="no-key-required")
judge_client = OpenAI(base_url=VLLM_BASE_URL, api_key="no-key-required")


def translate_text(text, source_lang="en", target_lang="bn",
                   max_tokens=TRANSLATE_MAX_TOKENS):
    """Translate a single string through the translation endpoint.

    The chat template is rendered locally with ``processor`` and the resulting
    prompt string is sent to the text-completions endpoint. (The chat
    endpoint's ``messages`` argument must be a list of message dicts, so a
    pre-rendered prompt string cannot be passed there.)

    Args:
        text: Source text to translate.
        source_lang: Language code placed in the template's
            ``source_lang_code`` field.
        target_lang: Language code placed in the template's
            ``target_lang_code`` field.
        max_tokens: Upper bound on generated tokens for this request.

    Returns:
        The stripped translation, or ``None`` if templating or the request
        raised (the error is printed, not re-raised).
    """
    try:
        messages = [{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "source_lang_code": source_lang,
                    "target_lang_code": target_lang,
                    "text": text,
                }
            ],
        }]
        prompt = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # NOTE(review): the rendered prompt may already contain the template's
        # special tokens; confirm the server does not prepend them a second
        # time when tokenizing raw completion prompts.
        completion = translate_client.completions.create(
            model="translate_gemma",
            prompt=prompt,
            temperature=0.1,
            max_tokens=max_tokens,
        )
        return completion.choices[0].text.strip()
    except Exception as e:
        # Best effort: one failed record must not abort the whole run.
        print(f"Translation error: {e}")
        return None


def judge_translation(original, translated):
    """Ask the judge model whether a Bengali translation is acceptable.

    Args:
        original: The English source text.
        translated: The Bengali translation, or ``None``/empty if translation
            failed.

    Returns:
        ``False`` when there is no translation to judge. Otherwise ``True``
        if the judge's reply contains ``PASS``, ``False`` if it does not.
        If the judge request itself raises, returns ``True`` (the error is
        printed) so that a judge outage does not block the run.
    """
    # A failed translation can never pass; do not send the literal text
    # "None" to the judge.
    if not translated:
        return False

    prompt = f"""
You are a linguistic judge. Evaluate the following Bengali translation of an English medical text.

Check for:
1. Presence of any language other than Bengali or English medical terms.
2. Hallucinated keywords not present in the original.

Original English: {original}
Translated Bengali: {translated}

Does this translation pass? Respond with ONLY 'PASS' or 'FAIL'.
"""
    try:
        response = judge_client.chat.completions.create(
            model=JUDGE_MODEL,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=5,
        )
        result = response.choices[0].message.content.strip().upper()
        return "PASS" in result
    except Exception as e:
        print(f"Judge error: {e}")
        return True  # Default to True to avoid getting stuck


def process_batch(data_slice):
    """Translate and judge every record in ``data_slice``.

    Each record is updated in place with ``translated_fulltext``,
    ``translated_summary`` and ``judge_pass`` (true only when both
    translations pass the judge).

    Args:
        data_slice: Iterable of dict records that each have ``fulltext`` and
            ``summary`` keys.

    Returns:
        A list of the same (mutated) record objects, in input order.
    """
    results = []
    for record in data_slice:
        bn_fulltext = translate_text(record['fulltext'])
        bn_summary = translate_text(record['summary'])

        # Verify with Judge
        is_valid_full = judge_translation(record['fulltext'], bn_fulltext)
        is_valid_sum = judge_translation(record['summary'], bn_summary)

        record['translated_fulltext'] = bn_fulltext
        record['translated_summary'] = bn_summary
        record['judge_pass'] = is_valid_full and is_valid_sum
        results.append(record)
    return results


def _save_results(records):
    """Write ``records`` to ``OUT_PATH`` as JSON via temp file + rename."""
    # Writing to a sibling temp file and renaming means an interruption
    # mid-write cannot leave a truncated checkpoint at OUT_PATH.
    tmp_path = OUT_PATH + ".tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=4)
    os.replace(tmp_path, OUT_PATH)


# ---- Main Execution ----
def main():
    """Translate the first 200 records of ``DATA_PATH`` into ``OUT_PATH``."""
    # Load data
    with open(DATA_PATH, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Slice data for (0_200) as per the output filename
    subset = data[0:200]
    translated_data = []
    batch_size = 10  # Records processed between checkpoints

    # Create the output directory once, before any checkpoint is written.
    out_dir = os.path.dirname(OUT_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    print(f"Starting translation for {len(subset)} records...")

    for i in tqdm(range(0, len(subset), batch_size)):
        batch = subset[i:i + batch_size]
        translated_data.extend(process_batch(batch))

        # Intermediate save to avoid data loss
        _save_results(translated_data)

    if not subset:
        # No batch ran, so nothing was written yet; emit an empty result so
        # the completion message below is accurate.
        _save_results(translated_data)

    print(f"Processing complete. Saved to {OUT_PATH}")


if __name__ == "__main__":
    main()