File size: 4,661 Bytes
030876e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os
import json
import time
from tqdm import tqdm
from openai import OpenAI
from transformers import AutoProcessor

# Translation model whose chat template is applied client-side before the
# rendered prompt is sent to the local serving endpoint.
model_id = "google/translategemma-27b-it"
# NOTE(review): runs at import time and may download the processor from the
# Hugging Face hub on first use — confirm this side effect is intended.
processor = AutoProcessor.from_pretrained(model_id)

# ---- Configuration ----
# Input: English clinical summarization records (list of dicts with
# 'fulltext' and 'summary' keys, per their usage in process_batch).
DATA_PATH = "/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_en.json"
# Output: same records augmented with Bengali translations and judge verdicts.
OUT_PATH = "/home/mshahidul/readctrl/data/translated_data/multiclinsum_gs_train_en2bn_gemma(0_200).json"

# Translation API (local Gemma server, OpenAI-compatible).
TRANSLATE_BASE_URL = "http://localhost:8081/v1"
# Judge API (Qwen model used to verify translation quality).
VLLM_BASE_URL = os.environ.get("VLLM_BASE_URL", "http://localhost:8004/v1")
JUDGE_MODEL = os.environ.get("JUDGE_MODEL", "Qwen/Qwen3-30B-A3B-Instruct-2507")

# Initialize Clients (local servers accept any API key).
translate_client = OpenAI(base_url=TRANSLATE_BASE_URL, api_key="no-key-required")
judge_client = OpenAI(base_url=VLLM_BASE_URL, api_key="no-key-required")

def translate_text(text, source_lang="en", target_lang="bn"):
    """
    Translate a single string via the local Gemma translation endpoint.

    The TranslateGemma chat template (with its ``source_lang_code`` /
    ``target_lang_code`` content fields) is rendered locally with the
    HF processor, so the request must go through the raw ``completions``
    endpoint with the pre-rendered prompt. Passing the rendered string as
    ``messages=`` to ``chat.completions.create`` (the previous behavior)
    is invalid: that parameter requires a list of role/content dicts.

    Args:
        text: Source text to translate.
        source_lang: ISO language code of the input (default "en").
        target_lang: ISO language code of the output (default "bn").

    Returns:
        The translated string with surrounding whitespace stripped, or
        None if the request fails for any reason.
    """
    try:
        messages = [{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "source_lang_code": source_lang,
                    "target_lang_code": target_lang,
                    "text": text,
                }
            ],
        }]

        # Render the model-specific chat template locally; the server then
        # receives plain completion text and does not need to know the format.
        prompt = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Raw completions call: the prompt already contains all chat
        # formatting, so chat.completions must NOT be used here.
        completion = translate_client.completions.create(
            model="translate_gemma",
            prompt=prompt,
            temperature=0.1,
        )
        output = completion.choices[0].text
        # Some servers may return a null text field on failure; treat it
        # the same as a request error.
        return output.strip() if output is not None else None
    except Exception as e:
        print(f"Translation error: {e}")
        return None

def judge_translation(original, translated):
    """
    Ask the Qwen judge endpoint whether a Bengali translation is acceptable.

    Builds a PASS/FAIL prompt from the English source and its Bengali
    translation, sends one chat request to the judge model, and returns
    True when the (uppercased) reply contains 'PASS'. Any exception from
    the request is logged and treated as a pass so the pipeline never
    stalls on judge-side outages.
    """
    prompt = f"""
    You are a linguistic judge. Evaluate the following Bengali translation of an English medical text.
    Check for:
    1. Presence of any language other than Bengali or English medical terms.
    2. Hallucinated keywords not present in the original.
    
    Original English: {original}
    Translated Bengali: {translated}
    
    Does this translation pass? Respond with ONLY 'PASS' or 'FAIL'.
    """
    try:
        reply = judge_client.chat.completions.create(
            model=JUDGE_MODEL,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=5,
        )
        verdict = reply.choices[0].message.content.strip().upper()
        return "PASS" in verdict
    except Exception as e:
        print(f"Judge error: {e}")
        # Fail open: a judge outage should not block the translation run.
        return True

def process_batch(data_slice):
    """
    Translate and judge a slice of records.

    For each record, translates its 'fulltext' and 'summary' fields to
    Bengali, verifies both with the judge model, and annotates the record
    in place with 'translated_fulltext', 'translated_summary', and a
    combined boolean 'judge_pass'.

    Fix over the previous version: translate_text returns None on failure,
    and that None used to be passed straight into judge_translation —
    embedding the literal string "None" in the judge prompt and potentially
    marking a failed translation as passed. A None translation now short-
    circuits to a failed judge verdict without calling the judge at all.

    Args:
        data_slice: Iterable of dicts, each with 'fulltext' and 'summary'.

    Returns:
        List of the same (mutated) record dicts.
    """
    results = []
    for record in data_slice:
        # Translate both fields; either may be None on API failure.
        bn_fulltext = translate_text(record['fulltext'])
        bn_summary = translate_text(record['summary'])

        # Only invoke the judge when a translation actually exists;
        # a missing translation can never pass.
        is_valid_full = (
            bn_fulltext is not None
            and judge_translation(record['fulltext'], bn_fulltext)
        )
        is_valid_sum = (
            bn_summary is not None
            and judge_translation(record['summary'], bn_summary)
        )

        record['translated_fulltext'] = bn_fulltext
        record['translated_summary'] = bn_summary
        record['judge_pass'] = is_valid_full and is_valid_sum

        results.append(record)
    return results

# ---- Main Execution ----
def main():
    """
    Run the end-to-end translate-and-judge pipeline.

    Loads the English dataset, processes the first 200 records in batches,
    and rewrites the full output JSON after every batch so progress
    survives a crash. The final file is a JSON array of annotated records.
    """
    # Load data.
    with open(DATA_PATH, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Slice data for (0_200) as per the output filename.
    subset = data[0:200]

    translated_data = []
    batch_size = 10  # Adjust based on your VRAM/Server capacity

    # Loop-invariant: create the output directory once, not per batch.
    os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)

    print(f"Starting translation for {len(subset)} records...")

    for i in tqdm(range(0, len(subset), batch_size)):
        batch = subset[i:i + batch_size]
        translated_data.extend(process_batch(batch))

        # Intermediate save to avoid data loss: rewrite everything done
        # so far after each batch.
        with open(OUT_PATH, 'w', encoding='utf-8') as f:
            json.dump(translated_data, f, ensure_ascii=False, indent=4)

    print(f"Processing complete. Saved to {OUT_PATH}")

if __name__ == "__main__":
    main()