Spaces:
Sleeping
Sleeping
import base64
import io
import json
import re
from typing import List, Optional, Union

import deepl
from deep_translator import GoogleTranslator
from openai import OpenAI
from PIL import Image
class TranslatorService:
    def __init__(self, source: str = 'en', target: str = 'de', service_type: str = 'google', api_key: Optional[str] = None):
        """
        Set up a translation backend.

        Args:
            source: Source language code (default: 'en').
            target: Target language code (default: 'de').
            service_type: One of 'google', 'deepl', 'openai' or 'xai'.
            api_key: API key; required for the DeepL, OpenAI and xAI backends.

        Raises:
            ValueError: If a key-based backend is selected without an api_key.
        """
        self.service_type = service_type
        self.api_key = api_key
        self.target = target
        self.source = source
        # Accumulated LLM token usage across all calls on this instance.
        self.usage = {'input_tokens': 0, 'output_tokens': 0}

        if service_type == 'deepl':
            print("Using DeepL Translator")
            if not api_key:
                raise ValueError("DeepL API Key is required for DeepL service.")
            self.translator = deepl.Translator(api_key)
        elif service_type == 'openai':
            print("Using OpenAI (GPT-4o-mini) Translator")
            if not api_key:
                raise ValueError("OpenAI API Key is required for OpenAI service.")
            self.client = OpenAI(api_key=api_key)
        elif service_type == 'xai':
            print("Using xAI Grok Translator")
            if not api_key:
                raise ValueError("xAI API Key is required for Grok service.")
            # xAI exposes an OpenAI-compatible endpoint, so the OpenAI client is reused.
            self.client = OpenAI(api_key=api_key, base_url="https://api.x.ai/v1")
        else:
            print("Using Google Translator (deep-translator)")
            self.translator = GoogleTranslator(source=source, target=target)
| def get_usage_stats(self): | |
| """Returns accumulated token usage.""" | |
| return self.usage | |
| def get_cost_estimate(self): | |
| """ | |
| Returns estimated cost in USD based on GPT-4o-mini pricing. | |
| Input: $0.15 / 1M tokens | |
| Output: $0.60 / 1M tokens | |
| """ | |
| input_cost = (self.usage['input_tokens'] / 1_000_000) * 0.15 | |
| output_cost = (self.usage['output_tokens'] / 1_000_000) * 0.60 | |
| return input_cost + output_cost | |
| def validate_api_key(self) -> None: | |
| """Performs a lightweight test call to validate the configured API key. | |
| Raises: | |
| Exception: If the key is invalid or the provider returns an auth error. | |
| """ | |
| # Google (deep-translator) does not use an API key | |
| if self.service_type not in ['deepl', 'openai', 'xai']: | |
| return | |
| if self.service_type == 'deepl': | |
| # Minimal ping using the official client | |
| try: | |
| # This will raise an exception on invalid auth | |
| _ = self.translator.get_usage() | |
| except Exception as e: | |
| raise Exception(f"DeepL API key seems invalid or not authorized: {e}") | |
| return | |
| # OpenAI / xAI | |
| try: | |
| model = "gpt-4o-mini" if self.service_type == 'openai' else "grok-4-mini" | |
| # Very small test prompt to minimize cost | |
| response = self.client.chat.completions.create( | |
| model=model, | |
| messages=[ | |
| {"role": "user", "content": "test"} | |
| ], | |
| max_tokens=1, | |
| temperature=0.0, | |
| ) | |
| # If we get here without exception, we assume the key works. | |
| if response.usage: | |
| self.usage['input_tokens'] += response.usage.prompt_tokens | |
| self.usage['output_tokens'] += response.usage.completion_tokens | |
| except Exception as e: | |
| raise Exception(f"{self.service_type.capitalize()} API key seems invalid or the service is not reachable: {e}") | |
| def translate_image_with_vision(self, image: Image.Image) -> List[dict]: | |
| """ | |
| Uses VLM (Vision Language Model) to detect and translate text directly from image. | |
| Returns list of dicts: {'bbox': [x1, y1, x2, y2], 'original': str, 'translated': str} | |
| """ | |
| if self.service_type not in ['openai', 'xai']: | |
| raise ValueError("Vision features only supported for OpenAI and xAI services.") | |
| # 1. Letterbox the image to be square (helps with coordinate accuracy) | |
| old_width, old_height = image.size | |
| new_size = max(old_width, old_height) | |
| square_img = Image.new("RGB", (new_size, new_size), (255, 255, 255)) | |
| # Paste original image centered or top-left? Top-left is easier for coord math. | |
| square_img.paste(image, (0, 0)) | |
| # Convert to base64 | |
| buffered = io.BytesIO() | |
| square_img.save(buffered, format="JPEG") | |
| img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") | |
| img_url = f"data:image/jpeg;base64,{img_str}" | |
| model = "gpt-4o-mini" if self.service_type == 'openai' else "grok-4-latest" | |
| prompt = f""" | |
| You are a Manga Translator Agent. | |
| Look at this manga page. Identify all speech bubbles and text boxes. | |
| For each text region: | |
| 1. Extract the English text. | |
| 2. Translate it to German. | |
| 3. Estimate the bounding box as [ymin, xmin, ymax, xmax] using a 0-1000 normalized scale based on this square image. | |
| - (0,0) is top-left corner. | |
| - (1000,1000) is bottom-right corner. | |
| - Be extremely precise with the coordinates. | |
| - The image might have white padding on the right or bottom, ignore that area. | |
| Return ONLY a valid JSON array with this structure: | |
| [ | |
| {{ | |
| "original": "English text", | |
| "translated": "German translation", | |
| "bbox": [ymin, xmin, ymax, xmax] | |
| }} | |
| ] | |
| Do not use markdown code blocks. Return raw JSON only. | |
| """ | |
| try: | |
| response = self.client.chat.completions.create( | |
| model=model, | |
| messages=[ | |
| { | |
| "role": "user", | |
| "content": [ | |
| {"type": "text", "text": prompt}, | |
| { | |
| "type": "image_url", | |
| "image_url": {"url": img_url} | |
| } | |
| ], | |
| } | |
| ], | |
| max_tokens=2000, | |
| temperature=0.1 | |
| ) | |
| # Track usage | |
| if response.usage: | |
| self.usage['input_tokens'] += response.usage.prompt_tokens | |
| self.usage['output_tokens'] += response.usage.completion_tokens | |
| content = response.choices[0].message.content.strip() | |
| # Cleanup markdown if present | |
| if content.startswith("```json"): | |
| content = content[7:] | |
| if content.endswith("```"): | |
| content = content[:-3] | |
| data = json.loads(content.strip()) | |
| results = [] | |
| for item in data: | |
| ymin, xmin, ymax, xmax = item['bbox'] | |
| # Clamp values 0-1000 | |
| ymin = max(0, min(1000, ymin)) | |
| xmin = max(0, min(1000, xmin)) | |
| ymax = max(0, min(1000, ymax)) | |
| xmax = max(0, min(1000, xmax)) | |
| # Convert from 0-1000 scale relative to the SQUARE image | |
| abs_x_min = int((xmin / 1000) * new_size) | |
| abs_y_min = int((ymin / 1000) * new_size) | |
| abs_x_max = int((xmax / 1000) * new_size) | |
| abs_y_max = int((ymax / 1000) * new_size) | |
| # Clip to original image dimensions (remove padding area results) | |
| abs_x_min = min(abs_x_min, old_width) | |
| abs_y_min = min(abs_y_min, old_height) | |
| abs_x_max = min(abs_x_max, old_width) | |
| abs_y_max = min(abs_y_max, old_height) | |
| # Ensure valid box | |
| if abs_x_max > abs_x_min and abs_y_max > abs_y_min: | |
| bbox_points = [ | |
| [abs_x_min, abs_y_min], # Top-Left | |
| [abs_x_max, abs_y_min], # Top-Right | |
| [abs_x_max, abs_y_max], # Bottom-Right | |
| [abs_x_min, abs_y_max] # Bottom-Left | |
| ] | |
| results.append({ | |
| 'bbox': bbox_points, | |
| 'original': item.get('original', ''), | |
| 'translated': item.get('translated', '') | |
| }) | |
| return results | |
| except Exception as e: | |
| print(f"Vision translation error: {e}") | |
| return [] | |
| def translate_text(self, text: str) -> str: | |
| """ | |
| Translates a single string. | |
| """ | |
| if not text.strip(): | |
| return "" | |
| try: | |
| if self.service_type == 'deepl': | |
| # DeepL uses slightly different language codes (e.g. 'DE' instead of 'de' usually, but 'de' works) | |
| result = self.translator.translate_text(text, source_lang=None, target_lang=self.target) | |
| return result.text | |
| elif self.service_type in ['openai', 'xai']: | |
| # Select model based on service | |
| model = "gpt-4o-mini" if self.service_type == 'openai' else "grok-4-latest" | |
| response = self.client.chat.completions.create( | |
| model=model, | |
| messages=[ | |
| {"role": "system", "content": f"You are a professional manga translator. Translate the following text from {self.source} to {self.target}. Keep the translation natural and fitting for a comic/manga context. Ensure correct handling of German special characters like ä, ö, ü, ß. Only return the translated text, nothing else."}, | |
| {"role": "user", "content": text} | |
| ], | |
| temperature=0.3 | |
| ) | |
| # Track usage | |
| if response.usage: | |
| self.usage['input_tokens'] += response.usage.prompt_tokens | |
| self.usage['output_tokens'] += response.usage.completion_tokens | |
| return response.choices[0].message.content.strip() | |
| else: | |
| return self.translator.translate(text) | |
| except Exception as e: | |
| print(f"Translation error: {e}") | |
| return text | |
| def translate_batch(self, texts: List[str]) -> List[str]: | |
| """ | |
| Translates a list of strings. | |
| """ | |
| if not texts: | |
| return [] | |
| try: | |
| if self.service_type == 'deepl': | |
| results = self.translator.translate_text(texts, source_lang=None, target_lang=self.target) | |
| return [r.text for r in results] | |
| elif self.service_type in ['openai', 'xai']: | |
| # Select model based on service | |
| model = "gpt-4o-mini" if self.service_type == 'openai' else "grok-4-latest" | |
| # OpenAI/xAI batch approach | |
| formatted_text = "\n".join([f"{i+1}. {t}" for i, t in enumerate(texts)]) | |
| prompt = f"Translate the following numbered lines from {self.source} to {self.target}. Return them as a numbered list with the same indices.\n\n{formatted_text}" | |
| response = self.client.chat.completions.create( | |
| model=model, | |
| messages=[ | |
| {"role": "system", "content": f"You are a professional manga translator. Translate the text from {self.source} to {self.target}. Return ONLY the numbered list of translations."}, | |
| {"role": "user", "content": prompt} | |
| ], | |
| temperature=0.3 | |
| ) | |
| # Track usage | |
| if response.usage: | |
| self.usage['input_tokens'] += response.usage.prompt_tokens | |
| self.usage['output_tokens'] += response.usage.completion_tokens | |
| content = response.choices[0].message.content.strip() | |
| # Parse results back to list | |
| translated_lines = [] | |
| # Simple parsing (robustness could be improved) | |
| for line in content.split('\n'): | |
| if '. ' in line: | |
| parts = line.split('. ', 1) | |
| if len(parts) > 1: | |
| translated_lines.append(parts[1]) | |
| else: | |
| translated_lines.append(line) | |
| else: | |
| translated_lines.append(line) | |
| # Fallback if counts don't match (rare but possible) | |
| if len(translated_lines) != len(texts): | |
| return [self.translate_text(t) for t in texts] | |
| return translated_lines | |
| else: | |
| return self.translator.translate_batch(texts) | |
| except Exception as e: | |
| print(f"Batch translation error: {e}") | |
| return texts | |