import os
import json
import time
from tqdm import tqdm
from openai import OpenAI
from transformers import AutoProcessor
# Hugging Face processor is used only to render the TranslateGemma chat
# template into a prompt string (no local model weights are loaded here).
model_id = "google/translategemma-27b-it"
processor = AutoProcessor.from_pretrained(model_id)
# ---- Configuration ----
# Input: gold-standard English clinical summaries; output: Bengali translations
# for the first 200 records (the "(0_200)" slice encoded in the filename).
DATA_PATH = "/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_en.json"
OUT_PATH = "/home/mshahidul/readctrl/data/translated_data/multiclinsum_gs_train_en2bn_gemma(0_200).json"
# Translation API (local OpenAI-compatible server hosting TranslateGemma)
TRANSLATE_BASE_URL = "http://localhost:8081/v1"
# Judge API (local vLLM server; overridable via environment)
VLLM_BASE_URL = os.environ.get("VLLM_BASE_URL", "http://localhost:8004/v1")
JUDGE_MODEL = os.environ.get("JUDGE_MODEL", "Qwen/Qwen3-30B-A3B-Instruct-2507")
# Initialize Clients — local servers do not authenticate, so the key is a dummy.
translate_client = OpenAI(base_url=TRANSLATE_BASE_URL, api_key="no-key-required")
judge_client = OpenAI(base_url=VLLM_BASE_URL, api_key="no-key-required")
def translate_text(text, source_lang="en", target_lang="bn"):
    """Translate a single string via the local Gemma translation endpoint.

    Args:
        text: Source text to translate. Empty/None input is returned as-is
            without an API call.
        source_lang: Source language code (default "en").
        target_lang: Target language code (default "bn").

    Returns:
        The translated string, or None if the API call failed.
    """
    # Nothing to translate — avoid a pointless (and possibly confusing) API call.
    if not text:
        return text
    try:
        # TranslateGemma's chat template expects a content entry carrying the
        # language codes; apply_chat_template renders it into a prompt string.
        messages = [{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "source_lang_code": source_lang,
                    "target_lang_code": target_lang,
                    "text": text,
                }
            ],
        }]
        prompt = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # BUG FIX: the chat.completions API requires a list of message dicts;
        # passing the raw template string as `messages` raises on every call.
        # Wrap the pre-templated prompt in a single user message.
        completion = translate_client.chat.completions.create(
            model="translate_gemma",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1
        )
        return completion.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort: log and signal failure with None so the caller can
        # decide how to handle a missing translation.
        print(f"Translation error: {e}")
        return None
def judge_translation(original, translated):
    """Use the judge model to check a translation for hallucinations or
    mixed-language issues.

    Args:
        original: The original English text.
        translated: The candidate Bengali translation (may be None if the
            translation step failed).

    Returns:
        True if the translation passes (or the judge call itself errored,
        to avoid stalling the pipeline); False if it fails or is missing.
    """
    # BUG FIX: a failed translation arrives as None; previously it was
    # formatted into the prompt as the literal "None" and could be marked
    # PASS. Fail fast instead.
    if translated is None:
        return False
    prompt = f"""
You are a linguistic judge. Evaluate the following Bengali translation of an English medical text.
Check for:
1. Presence of any language other than Bengali or English medical terms.
2. Hallucinated keywords not present in the original.
Original English: {original}
Translated Bengali: {translated}
Does this translation pass? Respond with ONLY 'PASS' or 'FAIL'.
"""
    try:
        response = judge_client.chat.completions.create(
            model=JUDGE_MODEL,
            messages=[{"role": "user", "content": prompt}],
            # The judge must answer only 'PASS' or 'FAIL'; a tiny budget
            # keeps it from rambling.
            max_tokens=5
        )
        result = response.choices[0].message.content.strip().upper()
        return "PASS" in result
    except Exception as e:
        print(f"Judge error: {e}")
        return True  # Default to True to avoid getting stuck
def process_batch(data_slice):
    """Translate and judge each record in *data_slice*.

    Each record gains 'translated_fulltext', 'translated_summary', and
    'judge_pass' keys (records are mutated in place and also returned).
    """
    processed = []
    for rec in data_slice:
        # Translate both fields of the record.
        fulltext_bn = translate_text(rec['fulltext'])
        summary_bn = translate_text(rec['summary'])
        # Judge both translations unconditionally (no short-circuiting),
        # so each record always gets two judge verdicts.
        fulltext_ok = judge_translation(rec['fulltext'], fulltext_bn)
        summary_ok = judge_translation(rec['summary'], summary_bn)
        rec['translated_fulltext'] = fulltext_bn
        rec['translated_summary'] = summary_bn
        rec['judge_pass'] = fulltext_ok and summary_ok
        processed.append(rec)
    return processed
# ---- Main Execution ----
def main():
    """Translate the (0_200) slice of the dataset, checkpointing each batch."""
    # Load the full gold-standard dataset.
    with open(DATA_PATH, 'r', encoding='utf-8') as fh:
        records = json.load(fh)
    # Restrict to the first 200 records, matching the output filename.
    subset = records[:200]
    chunk_size = 10  # Adjust based on your VRAM/Server capacity
    completed = []
    print(f"Starting translation for {len(subset)} records...")
    # Output directory creation is loop-invariant; do it once up front.
    os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
    for start in tqdm(range(0, len(subset), chunk_size)):
        completed.extend(process_batch(subset[start:start + chunk_size]))
        # Checkpoint after every batch so a crash loses at most one batch.
        with open(OUT_PATH, 'w', encoding='utf-8') as fh:
            json.dump(completed, fh, ensure_ascii=False, indent=4)
    print(f"Processing complete. Saved to {OUT_PATH}")
if __name__ == "__main__":
    main()