File size: 4,661 Bytes
030876e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os
import json
import time
from tqdm import tqdm
from openai import OpenAI
from transformers import AutoProcessor

# Translation model whose chat template is applied client-side before the
# rendered prompt is sent to the local serving endpoint.
model_id = "google/translategemma-27b-it"
# NOTE(review): runs at import time and may download the processor from the
# Hugging Face hub on first use — confirm this side effect is intended.
processor = AutoProcessor.from_pretrained(model_id)

# ---- Configuration ----
# Input: English clinical summarization records (list of dicts with
# 'fulltext' and 'summary' keys, per their usage in process_batch).
DATA_PATH = "/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_en.json"
# Output: same records augmented with Bengali translations and judge verdicts.
OUT_PATH = "/home/mshahidul/readctrl/data/translated_data/multiclinsum_gs_train_en2bn_gemma(0_200).json"

# Translation API (local Gemma server, OpenAI-compatible).
TRANSLATE_BASE_URL = "http://localhost:8081/v1"
# Judge API (Qwen model used to verify translation quality).
VLLM_BASE_URL = os.environ.get("VLLM_BASE_URL", "http://localhost:8004/v1")
JUDGE_MODEL = os.environ.get("JUDGE_MODEL", "Qwen/Qwen3-30B-A3B-Instruct-2507")

# Initialize Clients (local servers accept any API key).
translate_client = OpenAI(base_url=TRANSLATE_BASE_URL, api_key="no-key-required")
judge_client = OpenAI(base_url=VLLM_BASE_URL, api_key="no-key-required")

def translate_text(text, source_lang="en", target_lang="bn"):
    """
    Translate a single string via the local Gemma translation endpoint.

    The TranslateGemma chat template (with its ``source_lang_code`` /
    ``target_lang_code`` content fields) is rendered locally with the
    HF processor, so the request must go through the raw ``completions``
    endpoint with the pre-rendered prompt. Passing the rendered string as
    ``messages=`` to ``chat.completions.create`` (the previous behavior)
    is invalid: that parameter requires a list of role/content dicts.

    Args:
        text: Source text to translate.
        source_lang: ISO language code of the input (default "en").
        target_lang: ISO language code of the output (default "bn").

    Returns:
        The translated string with surrounding whitespace stripped, or
        None if the request fails for any reason.
    """
    try:
        messages = [{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "source_lang_code": source_lang,
                    "target_lang_code": target_lang,
                    "text": text,
                }
            ],
        }]

        # Render the model-specific chat template locally; the server then
        # receives plain completion text and does not need to know the format.
        prompt = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Raw completions call: the prompt already contains all chat
        # formatting, so chat.completions must NOT be used here.
        completion = translate_client.completions.create(
            model="translate_gemma",
            prompt=prompt,
            temperature=0.1,
        )
        output = completion.choices[0].text
        # Some servers may return a null text field on failure; treat it
        # the same as a request error.
        return output.strip() if output is not None else None
    except Exception as e:
        print(f"Translation error: {e}")
        return None

def judge_translation(original, translated):
    """
    Ask the Qwen judge endpoint whether a Bengali translation is acceptable.

    Builds a PASS/FAIL prompt from the English source and its Bengali
    translation, sends one chat request to the judge model, and returns
    True when the (uppercased) reply contains 'PASS'. Any exception from
    the request is logged and treated as a pass so the pipeline never
    stalls on judge-side outages.
    """
    prompt = f"""
    You are a linguistic judge. Evaluate the following Bengali translation of an English medical text.
    Check for:
    1. Presence of any language other than Bengali or English medical terms.
    2. Hallucinated keywords not present in the original.
    
    Original English: {original}
    Translated Bengali: {translated}
    
    Does this translation pass? Respond with ONLY 'PASS' or 'FAIL'.
    """
    try:
        reply = judge_client.chat.completions.create(
            model=JUDGE_MODEL,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=5,
        )
        verdict = reply.choices[0].message.content.strip().upper()
        return "PASS" in verdict
    except Exception as e:
        print(f"Judge error: {e}")
        # Fail open: a judge outage should not block the translation run.
        return True

def process_batch(data_slice):
    """
    Translate and judge a slice of records.

    For each record, translates its 'fulltext' and 'summary' fields to
    Bengali, verifies both with the judge model, and annotates the record
    in place with 'translated_fulltext', 'translated_summary', and a
    combined boolean 'judge_pass'.

    Fix over the previous version: translate_text returns None on failure,
    and that None used to be passed straight into judge_translation —
    embedding the literal string "None" in the judge prompt and potentially
    marking a failed translation as passed. A None translation now short-
    circuits to a failed judge verdict without calling the judge at all.

    Args:
        data_slice: Iterable of dicts, each with 'fulltext' and 'summary'.

    Returns:
        List of the same (mutated) record dicts.
    """
    results = []
    for record in data_slice:
        # Translate both fields; either may be None on API failure.
        bn_fulltext = translate_text(record['fulltext'])
        bn_summary = translate_text(record['summary'])

        # Only invoke the judge when a translation actually exists;
        # a missing translation can never pass.
        is_valid_full = (
            bn_fulltext is not None
            and judge_translation(record['fulltext'], bn_fulltext)
        )
        is_valid_sum = (
            bn_summary is not None
            and judge_translation(record['summary'], bn_summary)
        )

        record['translated_fulltext'] = bn_fulltext
        record['translated_summary'] = bn_summary
        record['judge_pass'] = is_valid_full and is_valid_sum

        results.append(record)
    return results

# ---- Main Execution ----
def main():
    """
    Run the end-to-end translate-and-judge pipeline.

    Loads the English dataset, processes the first 200 records in batches,
    and rewrites the full output JSON after every batch so progress
    survives a crash. The final file is a JSON array of annotated records.
    """
    # Load data.
    with open(DATA_PATH, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Slice data for (0_200) as per the output filename.
    subset = data[0:200]

    translated_data = []
    batch_size = 10  # Adjust based on your VRAM/Server capacity

    # Loop-invariant: create the output directory once, not per batch.
    os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)

    print(f"Starting translation for {len(subset)} records...")

    for i in tqdm(range(0, len(subset), batch_size)):
        batch = subset[i:i + batch_size]
        translated_data.extend(process_batch(batch))

        # Intermediate save to avoid data loss: rewrite everything done
        # so far after each batch.
        with open(OUT_PATH, 'w', encoding='utf-8') as f:
            json.dump(translated_data, f, ensure_ascii=False, indent=4)

    print(f"Processing complete. Saved to {OUT_PATH}")

if __name__ == "__main__":
    main()