Spaces:
Sleeping
Sleeping
import base64
import io
import json
import re
from typing import List, Optional, Union

import deepl
from deep_translator import GoogleTranslator
from openai import OpenAI
from PIL import Image
class TranslatorService:
    def __init__(self, source: str = 'en', target: str = 'de', service_type: str = 'google', api_key: Optional[str] = None):
        """
        Set up a translation backend.

        Args:
            source: Source language code (default: 'en').
            target: Target language code (default: 'de').
            service_type: One of 'google', 'deepl', 'openai' or 'xai'.
            api_key: API key; required for the DeepL, OpenAI and xAI backends.

        Raises:
            ValueError: If a key-based backend is selected without an api_key.
        """
        self.service_type = service_type
        self.api_key = api_key
        self.target = target
        self.source = source
        # Accumulated LLM token usage across all calls on this instance.
        self.usage = {'input_tokens': 0, 'output_tokens': 0}

        if service_type == 'deepl':
            print("Using DeepL Translator")
            if not api_key:
                raise ValueError("DeepL API Key is required for DeepL service.")
            self.translator = deepl.Translator(api_key)
        elif service_type == 'openai':
            print("Using OpenAI (GPT-4o-mini) Translator")
            if not api_key:
                raise ValueError("OpenAI API Key is required for OpenAI service.")
            self.client = OpenAI(api_key=api_key)
        elif service_type == 'xai':
            print("Using xAI Grok Translator")
            if not api_key:
                raise ValueError("xAI API Key is required for Grok service.")
            # xAI exposes an OpenAI-compatible endpoint, so the OpenAI client is reused.
            self.client = OpenAI(api_key=api_key, base_url="https://api.x.ai/v1")
        else:
            print("Using Google Translator (deep-translator)")
            self.translator = GoogleTranslator(source=source, target=target)
| def get_usage_stats(self): | |
| """Returns accumulated token usage.""" | |
| return self.usage | |
| def get_cost_estimate(self): | |
| """ | |
| Returns estimated cost in USD based on GPT-4o-mini pricing. | |
| Input: $0.15 / 1M tokens | |
| Output: $0.60 / 1M tokens | |
| """ | |
| input_cost = (self.usage['input_tokens'] / 1_000_000) * 0.15 | |
| output_cost = (self.usage['output_tokens'] / 1_000_000) * 0.60 | |
| return input_cost + output_cost | |
| def validate_api_key(self) -> None: | |
| """Performs a lightweight test call to validate the configured API key. | |
| Raises: | |
| Exception: If the key is invalid or the provider returns an auth error. | |
| """ | |
| # Google (deep-translator) does not use an API key | |
| if self.service_type not in ['deepl', 'openai', 'xai']: | |
| return | |
| if self.service_type == 'deepl': | |
| # Minimal ping using the official client | |
| try: | |
| # This will raise an exception on invalid auth | |
| _ = self.translator.get_usage() | |
| except Exception as e: | |
| raise Exception(f"DeepL API key seems invalid or not authorized: {e}") | |
| return | |
| # OpenAI / xAI | |
| try: | |
| model = "gpt-4o-mini" if self.service_type == 'openai' else "grok-4-mini" | |
| # Very small test prompt to minimize cost | |
| response = self.client.chat.completions.create( | |
| model=model, | |
| messages=[ | |
| {"role": "user", "content": "test"} | |
| ], | |
| max_tokens=1, | |
| temperature=0.0, | |
| ) | |
| # If we get here without exception, we assume the key works. | |
| if response.usage: | |
| self.usage['input_tokens'] += response.usage.prompt_tokens | |
| self.usage['output_tokens'] += response.usage.completion_tokens | |
| except Exception as e: | |
| raise Exception(f"{self.service_type.capitalize()} API key seems invalid or the service is not reachable: {e}") | |
| def translate_image_with_vision(self, image: Image.Image) -> List[dict]: | |
| """ | |
| Uses VLM (Vision Language Model) to detect and translate text directly from image. | |
| Returns list of dicts: {'bbox': [x1, y1, x2, y2], 'original': str, 'translated': str} | |
| """ | |
| if self.service_type not in ['openai', 'xai']: | |
| raise ValueError("Vision features only supported for OpenAI and xAI services.") | |
| # 1. Letterbox the image to be square (helps with coordinate accuracy) | |
| old_width, old_height = image.size | |
| new_size = max(old_width, old_height) | |
| square_img = Image.new("RGB", (new_size, new_size), (255, 255, 255)) | |
| # Paste original image centered or top-left? Top-left is easier for coord math. | |
| square_img.paste(image, (0, 0)) | |
| # Convert to base64 | |
| buffered = io.BytesIO() | |
| square_img.save(buffered, format="JPEG") | |
| img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") | |
| img_url = f"data:image/jpeg;base64,{img_str}" | |
| model = "gpt-4o-mini" if self.service_type == 'openai' else "grok-4-latest" | |
| prompt = f""" | |
| You are a Manga Translator Agent. | |
| Look at this manga page. Identify all speech bubbles and text boxes. | |
| For each text region: | |
| 1. Extract the English text. | |
| 2. Translate it to German. | |
| 3. Estimate the bounding box as [ymin, xmin, ymax, xmax] using a 0-1000 normalized scale based on this square image. | |
| - (0,0) is top-left corner. | |
| - (1000,1000) is bottom-right corner. | |
| - Be extremely precise with the coordinates. | |
| - The image might have white padding on the right or bottom, ignore that area. | |
| Return ONLY a valid JSON array with this structure: | |
| [ | |
| {{ | |
| "original": "English text", | |
| "translated": "German translation", | |
| "bbox": [ymin, xmin, ymax, xmax] | |
| }} | |
| ] | |
| Do not use markdown code blocks. Return raw JSON only. | |
| """ | |
| try: | |
| response = self.client.chat.completions.create( | |
| model=model, | |
| messages=[ | |
| { | |
| "role": "user", | |
| "content": [ | |
| {"type": "text", "text": prompt}, | |
| { | |
| "type": "image_url", | |
| "image_url": {"url": img_url} | |
| } | |
| ], | |
| } | |
| ], | |
| max_tokens=2000, | |
| temperature=0.1 | |
| ) | |
| # Track usage | |
| if response.usage: | |
| self.usage['input_tokens'] += response.usage.prompt_tokens | |
| self.usage['output_tokens'] += response.usage.completion_tokens | |
| content = response.choices[0].message.content.strip() | |
| # Cleanup markdown if present | |
| if content.startswith("```json"): | |
| content = content[7:] | |
| if content.endswith("```"): | |
| content = content[:-3] | |
| data = json.loads(content.strip()) | |
| results = [] | |
| for item in data: | |
| ymin, xmin, ymax, xmax = item['bbox'] | |
| # Clamp values 0-1000 | |
| ymin = max(0, min(1000, ymin)) | |
| xmin = max(0, min(1000, xmin)) | |
| ymax = max(0, min(1000, ymax)) | |
| xmax = max(0, min(1000, xmax)) | |
| # Convert from 0-1000 scale relative to the SQUARE image | |
| abs_x_min = int((xmin / 1000) * new_size) | |
| abs_y_min = int((ymin / 1000) * new_size) | |
| abs_x_max = int((xmax / 1000) * new_size) | |
| abs_y_max = int((ymax / 1000) * new_size) | |
| # Clip to original image dimensions (remove padding area results) | |
| abs_x_min = min(abs_x_min, old_width) | |
| abs_y_min = min(abs_y_min, old_height) | |
| abs_x_max = min(abs_x_max, old_width) | |
| abs_y_max = min(abs_y_max, old_height) | |
| # Ensure valid box | |
| if abs_x_max > abs_x_min and abs_y_max > abs_y_min: | |
| bbox_points = [ | |
| [abs_x_min, abs_y_min], # Top-Left | |
| [abs_x_max, abs_y_min], # Top-Right | |
| [abs_x_max, abs_y_max], # Bottom-Right | |
| [abs_x_min, abs_y_max] # Bottom-Left | |
| ] | |
| results.append({ | |
| 'bbox': bbox_points, | |
| 'original': item.get('original', ''), | |
| 'translated': item.get('translated', '') | |
| }) | |
| return results | |
| except Exception as e: | |
| print(f"Vision translation error: {e}") | |
| return [] | |
| def translate_text(self, text: str) -> str: | |
| """ | |
| Translates a single string. | |
| """ | |
| if not text.strip(): | |
| return "" | |
| try: | |
| if self.service_type == 'deepl': | |
| # DeepL uses slightly different language codes (e.g. 'DE' instead of 'de' usually, but 'de' works) | |
| result = self.translator.translate_text(text, source_lang=None, target_lang=self.target) | |
| return result.text | |
| elif self.service_type in ['openai', 'xai']: | |
| # Select model based on service | |
| model = "gpt-4o-mini" if self.service_type == 'openai' else "grok-4-latest" | |
| response = self.client.chat.completions.create( | |
| model=model, | |
| messages=[ | |
| {"role": "system", "content": f"You are a professional manga translator. Translate the following text from {self.source} to {self.target}. Keep the translation natural and fitting for a comic/manga context. Ensure correct handling of German special characters like ä, ö, ü, ß. Only return the translated text, nothing else."}, | |
| {"role": "user", "content": text} | |
| ], | |
| temperature=0.3 | |
| ) | |
| # Track usage | |
| if response.usage: | |
| self.usage['input_tokens'] += response.usage.prompt_tokens | |
| self.usage['output_tokens'] += response.usage.completion_tokens | |
| return response.choices[0].message.content.strip() | |
| else: | |
| return self.translator.translate(text) | |
| except Exception as e: | |
| print(f"Translation error: {e}") | |
| return text | |
| def translate_batch(self, texts: List[str]) -> List[str]: | |
| """ | |
| Translates a list of strings. | |
| """ | |
| if not texts: | |
| return [] | |
| try: | |
| if self.service_type == 'deepl': | |
| results = self.translator.translate_text(texts, source_lang=None, target_lang=self.target) | |
| return [r.text for r in results] | |
| elif self.service_type in ['openai', 'xai']: | |
| # Select model based on service | |
| model = "gpt-4o-mini" if self.service_type == 'openai' else "grok-4-latest" | |
| # OpenAI/xAI batch approach | |
| formatted_text = "\n".join([f"{i+1}. {t}" for i, t in enumerate(texts)]) | |
| prompt = f"Translate the following numbered lines from {self.source} to {self.target}. Return them as a numbered list with the same indices.\n\n{formatted_text}" | |
| response = self.client.chat.completions.create( | |
| model=model, | |
| messages=[ | |
| {"role": "system", "content": f"You are a professional manga translator. Translate the text from {self.source} to {self.target}. Return ONLY the numbered list of translations."}, | |
| {"role": "user", "content": prompt} | |
| ], | |
| temperature=0.3 | |
| ) | |
| # Track usage | |
| if response.usage: | |
| self.usage['input_tokens'] += response.usage.prompt_tokens | |
| self.usage['output_tokens'] += response.usage.completion_tokens | |
| content = response.choices[0].message.content.strip() | |
| # Parse results back to list | |
| translated_lines = [] | |
| # Simple parsing (robustness could be improved) | |
| for line in content.split('\n'): | |
| if '. ' in line: | |
| parts = line.split('. ', 1) | |
| if len(parts) > 1: | |
| translated_lines.append(parts[1]) | |
| else: | |
| translated_lines.append(line) | |
| else: | |
| translated_lines.append(line) | |
| # Fallback if counts don't match (rare but possible) | |
| if len(translated_lines) != len(texts): | |
| return [self.translate_text(t) for t in texts] | |
| return translated_lines | |
| else: | |
| return self.translator.translate_batch(texts) | |
| except Exception as e: | |
| print(f"Batch translation error: {e}") | |
| return texts | |