# mangatranslator/src/translator.py
# Author: bartwisch
# Initial release v1.0.0 - Manga Translator with MIT License (commit 376598e)
from deep_translator import GoogleTranslator
import deepl
from openai import OpenAI
from typing import List, Union, Optional
import base64
import io
import json
from PIL import Image
class TranslatorService:
    """Unified translation backend for the manga translator.

    Wraps four interchangeable services behind one interface:
    - 'google' : deep-translator's GoogleTranslator (no API key needed)
    - 'deepl'  : official DeepL client (API key required)
    - 'openai' : OpenAI chat completions with gpt-4o-mini (API key required)
    - 'xai'    : xAI Grok via the OpenAI-compatible endpoint (API key required)

    Token usage for the LLM services is accumulated in ``self.usage`` and can
    be converted to a rough USD estimate via :meth:`get_cost_estimate`.
    """

    def __init__(self, source: str = 'en', target: str = 'de', service_type: str = 'google', api_key: Optional[str] = None):
        """
        Initializes the Translator Service.

        Args:
            source: Source language code (default: 'en').
            target: Target language code (default: 'de').
            service_type: 'google', 'deepl', 'openai', or 'xai'.
            api_key: API Key for DeepL, OpenAI or xAI.

        Raises:
            ValueError: If a key-based service is selected without an API key.
        """
        self.service_type = service_type
        self.api_key = api_key
        self.target = target
        self.source = source
        # Accumulated token counts across all OpenAI/xAI calls made by this instance.
        self.usage = {'input_tokens': 0, 'output_tokens': 0}

        if self.service_type == 'deepl':
            print("Using DeepL Translator")
            if not self.api_key:
                raise ValueError("DeepL API Key is required for DeepL service.")
            self.translator = deepl.Translator(self.api_key)
        elif self.service_type == 'openai':
            print("Using OpenAI (GPT-4o-mini) Translator")
            if not self.api_key:
                raise ValueError("OpenAI API Key is required for OpenAI service.")
            self.client = OpenAI(api_key=self.api_key)
        elif self.service_type == 'xai':
            print("Using xAI Grok Translator")
            if not self.api_key:
                raise ValueError("xAI API Key is required for Grok service.")
            # xAI API is OpenAI-compatible, so the OpenAI SDK works with a custom base_url.
            self.client = OpenAI(api_key=self.api_key, base_url="https://api.x.ai/v1")
        else:
            print("Using Google Translator (deep-translator)")
            self.translator = GoogleTranslator(source=source, target=target)

    def _chat_model(self) -> str:
        """Return the chat model id for the configured LLM service.

        Centralizes model selection; previously validate_api_key used
        'grok-4-mini' while every translation call used 'grok-4-latest',
        so validation exercised a different model than actual translation.
        """
        return "gpt-4o-mini" if self.service_type == 'openai' else "grok-4-latest"

    def _track_usage(self, response) -> None:
        """Accumulate prompt/completion token counts from a chat completion response."""
        if response.usage:
            self.usage['input_tokens'] += response.usage.prompt_tokens
            self.usage['output_tokens'] += response.usage.completion_tokens

    def get_usage_stats(self) -> dict:
        """Returns accumulated token usage as {'input_tokens': int, 'output_tokens': int}."""
        return self.usage

    def get_cost_estimate(self) -> float:
        """
        Returns estimated cost in USD based on GPT-4o-mini pricing.

        Pricing assumptions (per 1M tokens):
            Input:  $0.15
            Output: $0.60

        Note: this is only accurate for the 'openai' service; xAI pricing differs.
        """
        input_cost = (self.usage['input_tokens'] / 1_000_000) * 0.15
        output_cost = (self.usage['output_tokens'] / 1_000_000) * 0.60
        return input_cost + output_cost

    def validate_api_key(self) -> None:
        """Performs a lightweight test call to validate the configured API key.

        Raises:
            Exception: If the key is invalid or the provider returns an auth error.
        """
        # Google (deep-translator) does not use an API key, nothing to validate.
        if self.service_type not in ['deepl', 'openai', 'xai']:
            return

        if self.service_type == 'deepl':
            # Minimal ping using the official client; raises on invalid auth.
            try:
                _ = self.translator.get_usage()
            except Exception as e:
                raise Exception(f"DeepL API key seems invalid or not authorized: {e}")
            return

        # OpenAI / xAI: issue the cheapest possible completion against the
        # SAME model used for real translations, so validation is meaningful.
        try:
            response = self.client.chat.completions.create(
                model=self._chat_model(),
                messages=[
                    {"role": "user", "content": "test"}
                ],
                max_tokens=1,
                temperature=0.0,
            )
            # If we get here without exception, we assume the key works.
            self._track_usage(response)
        except Exception as e:
            raise Exception(f"{self.service_type.capitalize()} API key seems invalid or the service is not reachable: {e}")

    def translate_image_with_vision(self, image: Image.Image) -> List[dict]:
        """
        Uses VLM (Vision Language Model) to detect and translate text directly from image.

        Args:
            image: Source manga page as a PIL image.

        Returns:
            List of dicts: {'bbox': [[x,y] * 4 corners], 'original': str, 'translated': str}.
            Returns an empty list on any API or parsing failure (best-effort).

        Raises:
            ValueError: If the configured service has no vision support.
        """
        if self.service_type not in ['openai', 'xai']:
            raise ValueError("Vision features only supported for OpenAI and xAI services.")

        # 1. Letterbox the image to a square (helps with coordinate accuracy).
        #    Pasting at top-left keeps coordinate math trivial: model coords map
        #    directly onto the original image, with padding only right/bottom.
        old_width, old_height = image.size
        new_size = max(old_width, old_height)
        square_img = Image.new("RGB", (new_size, new_size), (255, 255, 255))
        square_img.paste(image, (0, 0))

        # 2. Encode as a base64 data URL for the chat API.
        buffered = io.BytesIO()
        square_img.save(buffered, format="JPEG")
        img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
        img_url = f"data:image/jpeg;base64,{img_str}"

        prompt = f"""
You are a Manga Translator Agent.
Look at this manga page. Identify all speech bubbles and text boxes.
For each text region:
1. Extract the English text.
2. Translate it to German.
3. Estimate the bounding box as [ymin, xmin, ymax, xmax] using a 0-1000 normalized scale based on this square image.
   - (0,0) is top-left corner.
   - (1000,1000) is bottom-right corner.
   - Be extremely precise with the coordinates.
   - The image might have white padding on the right or bottom, ignore that area.

Return ONLY a valid JSON array with this structure:
[
  {{
    "original": "English text",
    "translated": "German translation",
    "bbox": [ymin, xmin, ymax, xmax]
  }}
]
Do not use markdown code blocks. Return raw JSON only.
"""
        try:
            response = self.client.chat.completions.create(
                model=self._chat_model(),
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {"url": img_url}
                            }
                        ],
                    }
                ],
                max_tokens=2000,
                temperature=0.1
            )
            self._track_usage(response)

            content = response.choices[0].message.content.strip()
            # Strip markdown fences if the model ignored the "raw JSON" instruction.
            if content.startswith("```json"):
                content = content[7:]
            if content.endswith("```"):
                content = content[:-3]
            data = json.loads(content.strip())

            results = []
            for item in data:
                ymin, xmin, ymax, xmax = item['bbox']
                # Clamp to the promised 0-1000 normalized range.
                ymin = max(0, min(1000, ymin))
                xmin = max(0, min(1000, xmin))
                ymax = max(0, min(1000, ymax))
                xmax = max(0, min(1000, xmax))
                # Convert from 0-1000 scale relative to the SQUARE image.
                abs_x_min = int((xmin / 1000) * new_size)
                abs_y_min = int((ymin / 1000) * new_size)
                abs_x_max = int((xmax / 1000) * new_size)
                abs_y_max = int((ymax / 1000) * new_size)
                # Clip to original image dimensions (drops boxes in the padding area).
                abs_x_min = min(abs_x_min, old_width)
                abs_y_min = min(abs_y_min, old_height)
                abs_x_max = min(abs_x_max, old_width)
                abs_y_max = min(abs_y_max, old_height)
                # Only keep boxes with positive area after clipping.
                if abs_x_max > abs_x_min and abs_y_max > abs_y_min:
                    bbox_points = [
                        [abs_x_min, abs_y_min],  # Top-Left
                        [abs_x_max, abs_y_min],  # Top-Right
                        [abs_x_max, abs_y_max],  # Bottom-Right
                        [abs_x_min, abs_y_max]   # Bottom-Left
                    ]
                    results.append({
                        'bbox': bbox_points,
                        'original': item.get('original', ''),
                        'translated': item.get('translated', '')
                    })
            return results
        except Exception as e:
            # Best-effort: a failed vision pass degrades to "no detections".
            print(f"Vision translation error: {e}")
            return []

    def translate_text(self, text: str) -> str:
        """
        Translates a single string.

        Returns the translation, or the original text unchanged if the
        backing service fails (best-effort, never raises).
        """
        if not text.strip():
            return ""
        try:
            if self.service_type == 'deepl':
                # DeepL auto-detects the source language when source_lang is None.
                result = self.translator.translate_text(text, source_lang=None, target_lang=self.target)
                return result.text
            elif self.service_type in ['openai', 'xai']:
                response = self.client.chat.completions.create(
                    model=self._chat_model(),
                    messages=[
                        {"role": "system", "content": f"You are a professional manga translator. Translate the following text from {self.source} to {self.target}. Keep the translation natural and fitting for a comic/manga context. Ensure correct handling of German special characters like ä, ö, ü, ß. Only return the translated text, nothing else."},
                        {"role": "user", "content": text}
                    ],
                    temperature=0.3
                )
                self._track_usage(response)
                return response.choices[0].message.content.strip()
            else:
                return self.translator.translate(text)
        except Exception as e:
            # Fall back to the untranslated text rather than aborting the page.
            print(f"Translation error: {e}")
            return text

    def translate_batch(self, texts: List[str]) -> List[str]:
        """
        Translates a list of strings.

        For LLM services the texts are sent as one numbered list to save
        tokens; if the reply cannot be matched 1:1 to the inputs, each text
        is retried individually. On total failure the input list is returned
        unchanged (best-effort, never raises).
        """
        if not texts:
            return []
        try:
            if self.service_type == 'deepl':
                results = self.translator.translate_text(texts, source_lang=None, target_lang=self.target)
                return [r.text for r in results]
            elif self.service_type in ['openai', 'xai']:
                # Pack all lines into one numbered prompt (cheaper than N calls).
                formatted_text = "\n".join([f"{i+1}. {t}" for i, t in enumerate(texts)])
                prompt = f"Translate the following numbered lines from {self.source} to {self.target}. Return them as a numbered list with the same indices.\n\n{formatted_text}"

                response = self.client.chat.completions.create(
                    model=self._chat_model(),
                    messages=[
                        {"role": "system", "content": f"You are a professional manga translator. Translate the text from {self.source} to {self.target}. Return ONLY the numbered list of translations."},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.3
                )
                self._track_usage(response)

                content = response.choices[0].message.content.strip()
                # Parse the numbered reply back into a plain list. Blank lines
                # between items would previously be counted as translations and
                # break the count check below, so they are skipped.
                translated_lines = []
                for line in content.split('\n'):
                    line = line.strip()
                    if not line:
                        continue
                    if '. ' in line:
                        parts = line.split('. ', 1)
                        translated_lines.append(parts[1] if len(parts) > 1 else line)
                    else:
                        translated_lines.append(line)

                # Fallback: if the model merged/split lines, retry one-by-one.
                if len(translated_lines) != len(texts):
                    return [self.translate_text(t) for t in texts]
                return translated_lines
            else:
                return self.translator.translate_batch(texts)
        except Exception as e:
            # Best-effort: return the originals so the pipeline can continue.
            print(f"Batch translation error: {e}")
            return texts