Spaces:
Running on Zero
Running on Zero
bartwisch commited on
Commit ·
376598e
0
Parent(s):
Initial release v1.0.0 – Manga Translator with MIT License
Browse files- .gitignore +34 -0
- LICENSE +21 -0
- README.md +139 -0
- app.py +467 -0
- packages.txt +3 -0
- pages/config.py +327 -0
- requirements-optional.txt +12 -0
- requirements.txt +22 -0
- run.sh +10 -0
- setup.bat +68 -0
- setup.sh +40 -0
- src/__init__.py +0 -0
- src/image_processor.py +171 -0
- src/ocr_handler.py +434 -0
- src/pdf_handler.py +80 -0
- src/translator.py +323 -0
- src/ui_state.py +21 -0
- tests/test_ui_state.py +19 -0
.gitignore
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.pyc
|
| 6 |
+
|
| 7 |
+
# Virtual Environment
|
| 8 |
+
venv/
|
| 9 |
+
env/
|
| 10 |
+
.env
|
| 11 |
+
|
| 12 |
+
# Streamlit
|
| 13 |
+
.streamlit/
|
| 14 |
+
|
| 15 |
+
# IDE / Editor
|
| 16 |
+
.gemini/
|
| 17 |
+
.vscode/
|
| 18 |
+
.idea/
|
| 19 |
+
*.swp
|
| 20 |
+
*.swo
|
| 21 |
+
|
| 22 |
+
# Internal notes
|
| 23 |
+
TASK.md
|
| 24 |
+
|
| 25 |
+
# Output Files
|
| 26 |
+
translated_manga.pdf
|
| 27 |
+
*.pdf
|
| 28 |
+
*.png
|
| 29 |
+
*.jpg
|
| 30 |
+
*.jpeg
|
| 31 |
+
|
| 32 |
+
# OS Files
|
| 33 |
+
.DS_Store
|
| 34 |
+
Thumbs.db
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 Christoph Bartwisch
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 📚 Manga Translator
|
| 2 |
+
|
| 3 |
+
[![Open in Streamlit](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://share.streamlit.io/)
|
| 4 |
+
|
| 5 |
+
An AI-powered application to translate Manga/Comic PDFs from English to German. It preserves the original layout by detecting speech bubbles, removing the original text, and overlaying the translated text.
|
| 6 |
+
|
| 7 |
+
**Repository:** [github.com/bartwisch/mangatranslator](https://github.com/bartwisch/mangatranslator/)
|
| 8 |
+
|
| 9 |
+
## ✨ Features
|
| 10 |
+
|
| 11 |
+
* **Multiple OCR Engines** (Lazy Loading):
|
| 12 |
+
* **Magi** ⭐ (The Manga Whisperer) - Default, best for manga, detects speech bubbles automatically
|
| 13 |
+
* **manga-ocr** - Specialized for manga fonts (optional)
|
| 14 |
+
* **PaddleOCR** - Good general purpose (optional)
|
| 15 |
+
* **EasyOCR** - Multi-language support (optional)
|
| 16 |
+
* **Speech Bubble Grouping**: Automatically groups text lines within speech bubbles for context-aware translation
|
| 17 |
+
* **Multiple Translation Engines**:
|
| 18 |
+
* **Google Translate** (Free)
|
| 19 |
+
* **DeepL** (High Quality, requires API Key)
|
| 20 |
+
* **OpenAI GPT-4o-mini** (Context-aware, requires API Key)
|
| 21 |
+
* **xAI Grok** (Context-aware, requires API Key)
|
| 22 |
+
* **xAI Grok Vision** (No OCR needed, uses vision model)
|
| 23 |
+
* **Smart Layout**: Automatically cleans speech bubbles and fits translated text (dynamically resizing fonts).
|
| 24 |
+
* **Interactive Preview**: Select specific pages to translate visually.
|
| 25 |
+
* **OCR Config Page**: Live preview to tune OCR parameters and bubble grouping.
|
| 26 |
+
* **Cost Estimation**: Shows token usage and estimated costs for AI models.
|
| 27 |
+
|
| 28 |
+
## 🚀 How to Deploy on Streamlit Cloud
|
| 29 |
+
|
| 30 |
+
This is the easiest way to run the app for free.
|
| 31 |
+
|
| 32 |
+
1. Go to **[share.streamlit.io](https://share.streamlit.io/)**.
|
| 33 |
+
2. Click **"New App"**.
|
| 34 |
+
3. Select "Use existing repo" and enter: `bartwisch/mangatranslator`.
|
| 35 |
+
4. Set **Main file path** to `app.py`.
|
| 36 |
+
5. Click **Deploy!** 🎈
|
| 37 |
+
|
| 38 |
+
*Note: The first deployment might take 3-5 minutes because it needs to install PyTorch and OCR models.*
|
| 39 |
+
|
| 40 |
+
## 🛠️ Local Installation
|
| 41 |
+
|
| 42 |
+
### Quick Setup (Recommended)
|
| 43 |
+
|
| 44 |
+
**macOS / Linux:**
|
| 45 |
+
```bash
|
| 46 |
+
git clone https://github.com/bartwisch/mangatranslator.git
|
| 47 |
+
cd mangatranslator
|
| 48 |
+
./setup.sh
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
**Windows:**
|
| 52 |
+
```cmd
|
| 53 |
+
git clone https://github.com/bartwisch/mangatranslator.git
|
| 54 |
+
cd mangatranslator
|
| 55 |
+
setup.bat
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
### Manual Installation
|
| 59 |
+
|
| 60 |
+
1. **Clone the repository**:
|
| 61 |
+
```bash
|
| 62 |
+
git clone https://github.com/bartwisch/mangatranslator.git
|
| 63 |
+
cd mangatranslator
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
2. **Set up Python environment**:
|
| 67 |
+
```bash
|
| 68 |
+
python3 -m venv venv
|
| 69 |
+
source venv/bin/activate # Windows: venv\Scripts\activate.bat
|
| 70 |
+
|
| 71 |
+
# Install base requirements (includes Magi OCR)
|
| 72 |
+
pip install -r requirements.txt
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
3. **Optional: Install additional OCR engines**:
|
| 76 |
+
```bash
|
| 77 |
+
# Install all optional engines
|
| 78 |
+
pip install -r requirements-optional.txt
|
| 79 |
+
|
| 80 |
+
# Or install individually:
|
| 81 |
+
pip install manga-ocr paddlepaddle paddleocr # Manga-OCR (plus Paddle dependencies)
|
| 82 |
+
pip install paddlepaddle paddleocr # PaddleOCR only
|
| 83 |
+
pip install easyocr # EasyOCR only
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
4. **Run the app**:
|
| 87 |
+
```bash
|
| 88 |
+
streamlit run app.py
|
| 89 |
+
# Or use: ./run.sh
|
| 90 |
+
```
|
| 91 |
+
|
| 92 |
+
5. **Open in browser**: http://localhost:8501
|
| 93 |
+
|
| 94 |
+
### OCR Configuration Page
|
| 95 |
+
|
| 96 |
+
Navigate to the **Configuration** page in the app to:
|
| 97 |
+
- Select your preferred OCR engine (Magi is default)
|
| 98 |
+
- Choose OCR preprocessing mode
|
| 99 |
+
- Upload a PDF and preview OCR detection
|
| 100 |
+
- Adjust bubble grouping threshold
|
| 101 |
+
- Compare different OCR engines
|
| 102 |
+
|
| 103 |
+
## 🔑 API Keys
|
| 104 |
+
|
| 105 |
+
The app requires API Keys for **DeepL**, **OpenAI**, or **xAI** if you choose to use those services.
|
| 106 |
+
* Keys are entered securely in the Configuration page.
|
| 107 |
+
* Keys are **NOT** stored in the repository.
|
| 108 |
+
* Google Translate is available as a free fallback.
|
| 109 |
+
|
| 110 |
+
## 📋 Requirements
|
| 111 |
+
|
| 112 |
+
* Python 3.10+
|
| 113 |
+
* See `requirements.txt` for base Python packages (includes Magi OCR).
|
| 114 |
+
* See `requirements-optional.txt` for optional OCR engines.
|
| 115 |
+
* See `packages.txt` for system dependencies (required for Linux/Cloud deployment).
|
| 116 |
+
|
| 117 |
+
## 🎯 OCR Engine Comparison
|
| 118 |
+
|
| 119 |
+
| Engine | Best For | Speed | Quality | Installation |
|
| 120 |
+
|--------|----------|-------|---------|--------------|
|
| 121 |
+
| **Magi** ⭐ | Manga (auto bubble detection) | Medium | Excellent | Default ✅ |
|
| 122 |
+
| Manga-OCR | Manga/Comic fonts | Fast | Very Good | Optional |
|
| 123 |
+
| PaddleOCR | General purpose | Fast | Good | Optional |
|
| 124 |
+
| EasyOCR | Multi-language | Slow | Good | Optional |
|
| 125 |
+
|
| 126 |
+
## 📜 License
|
| 127 |
+
|
| 128 |
+
This project is licensed under the **MIT License**. See the `LICENSE` file for details.
|
| 129 |
+
|
| 130 |
+
### Third-Party OCR Libraries
|
| 131 |
+
|
| 132 |
+
This application uses or optionally supports several third-party OCR engines and libraries, including but not limited to:
|
| 133 |
+
|
| 134 |
+
- `magi-ocr` (custom model stack based on PyTorch and Transformers)
|
| 135 |
+
- `manga-ocr` (MIT License)
|
| 136 |
+
- `PaddleOCR` (Apache-2.0 License)
|
| 137 |
+
- `EasyOCR` (Apache-2.0 License)
|
| 138 |
+
|
| 139 |
+
These components are subject to their respective licenses as provided by their authors.
|
app.py
ADDED
|
@@ -0,0 +1,467 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import os
|
| 3 |
+
import tempfile
|
| 4 |
+
import certifi
|
| 5 |
+
from src.pdf_handler import PDFHandler
|
| 6 |
+
from src.ocr_handler import OCRHandler
|
| 7 |
+
from src.translator import TranslatorService
|
| 8 |
+
from src.image_processor import ImageProcessor
|
| 9 |
+
from src.ui_state import should_display_thumbnails
|
| 10 |
+
|
| 11 |
+
# Point the SSL stack at certifi's CA bundle for this process so HTTPS
# requests (model downloads, translation APIs) verify certificates correctly.
# NOTE(review): this mutates a process-wide env var at import time — assumed
# intentional for the hosted-Space environment.
os.environ['SSL_CERT_FILE'] = certifi.where()

# Must be the first Streamlit call on the page; configures tab title/icon.
st.set_page_config(page_title="Manga Translator", page_icon="logo.png")

st.title("📚 Manga Translator (English -> German)")
| 17 |
+
|
| 18 |
+
@st.cache_resource
def load_ocr(ocr_engine: str = 'magi'):
    """Create the OCR handler for the requested engine, cached across reruns.

    Engine notes:
      * 'magi'      - best for manga; detects speech bubbles and performs OCR.
      * 'manga-ocr' - specialized for manga/comic fonts.
      * 'paddleocr' - good general-purpose engine.

    ``st.cache_resource`` ensures the (heavy) model stack is built only once
    per engine for the lifetime of the server process.
    """
    handler_config = dict(lang_list=['en'], gpu=False, ocr_engine=ocr_engine)
    return OCRHandler(**handler_config)
| 24 |
+
|
| 25 |
+
def load_translator(service_type: str, api_key: str = None):
    """Build a fresh English->German TranslatorService for the chosen backend.

    Deliberately NOT cached: the API key can change between reruns, and a
    cached instance would keep using a stale credential.
    """
    return TranslatorService(
        source='en',
        target='de',
        service_type=service_type,
        api_key=api_key,
    )
| 28 |
+
|
| 29 |
+
def parse_page_range(range_str: str) -> list[int]:
|
| 30 |
+
"""Parse a page range string (e.g., "1-3, 5, 7-9") into a list of 0-indexed integers."""
|
| 31 |
+
if not range_str.strip():
|
| 32 |
+
return None
|
| 33 |
+
|
| 34 |
+
pages = set()
|
| 35 |
+
parts = range_str.split(',')
|
| 36 |
+
|
| 37 |
+
for part in parts:
|
| 38 |
+
part = part.strip()
|
| 39 |
+
if '-' in part:
|
| 40 |
+
try:
|
| 41 |
+
start, end = map(int, part.split('-'))
|
| 42 |
+
# Convert to 0-indexed, inclusive
|
| 43 |
+
pages.update(range(start - 1, end))
|
| 44 |
+
except ValueError:
|
| 45 |
+
continue
|
| 46 |
+
else:
|
| 47 |
+
try:
|
| 48 |
+
# Convert to 0-indexed
|
| 49 |
+
pages.add(int(part) - 1)
|
| 50 |
+
except ValueError:
|
| 51 |
+
continue
|
| 52 |
+
|
| 53 |
+
return sorted(list(pages))
|
| 54 |
+
|
| 55 |
+
def main():
    """Render the Manga Translator page and drive the translation workflow.

    Flow per Streamlit rerun: initialize session state -> quick-settings
    panel (OpenAI key) -> resolve translation service -> PDF upload ->
    page-selection grid -> translation loop (OCR or vision) -> PDF download
    and preview. Long-running work is gated behind the
    ``translation_in_progress`` session flag so the UI can hide thumbnails
    and offer a Stop button while processing.
    """
    # Session State Initialization
    if 'preview_images' not in st.session_state:
        st.session_state.preview_images = []
    if 'temp_pdf_path' not in st.session_state:
        st.session_state.temp_pdf_path = None
    if 'last_uploaded_file_id' not in st.session_state:
        st.session_state.last_uploaded_file_id = None

    # Persistent API Keys & Settings (defaults if not set in the Config page)
    if 'stored_deepl_key' not in st.session_state:
        st.session_state.stored_deepl_key = ""
    if 'stored_openai_key' not in st.session_state:
        st.session_state.stored_openai_key = ""
    if 'stored_xai_key' not in st.session_state:
        st.session_state.stored_xai_key = ""
    if 'translation_service_selection' not in st.session_state:
        st.session_state.translation_service_selection = "OpenAI GPT-4o-mini (API Key - Recommended)"
    if 'debug_mode_checkbox' not in st.session_state:
        st.session_state.debug_mode_checkbox = True
    if 'show_boxes_checkbox' not in st.session_state:
        st.session_state.show_boxes_checkbox = False
    if 'bubble_threshold_setting' not in st.session_state:
        st.session_state.bubble_threshold_setting = 160
    if 'ocr_engine_selection' not in st.session_state:
        st.session_state.ocr_engine_selection = "magi"
    if 'ocr_preprocess_mode' not in st.session_state:
        st.session_state.ocr_preprocess_mode = "gentle"
    if 'stop_translation' not in st.session_state:
        st.session_state.stop_translation = False

    # Read settings from session state into locals for this rerun.
    service_choice = st.session_state.translation_service_selection
    debug_mode = st.session_state.debug_mode_checkbox
    show_boxes = st.session_state.show_boxes_checkbox
    bubble_threshold = st.session_state.bubble_threshold_setting
    ocr_engine = st.session_state.ocr_engine_selection
    ocr_preprocess = st.session_state.ocr_preprocess_mode

    pdf_handler = PDFHandler()
    image_processor = ImageProcessor()

    # --- Quick Settings Panel ---
    # Expanded by default until an OpenAI key has been entered.
    with st.expander("⚙️ Quick Settings", expanded=not st.session_state.stored_openai_key):
        st.markdown("**OpenAI API Key** (Recommended for best translation quality)")
        col_key_input, col_key_test = st.columns([3, 1])

        with col_key_input:
            # Bound directly to session state via key= (no return value needed).
            st.text_input(
                "Enter your OpenAI API Key",
                type="password",
                key="stored_openai_key",
                help="Your API key is stored securely in this session only.",
                label_visibility="collapsed"
            )

        with col_key_test:
            if st.button("Test Key", key="test_openai_key"):
                if not st.session_state.stored_openai_key:
                    st.error("Please enter an OpenAI API Key before testing.")
                else:
                    try:
                        tester = TranslatorService(
                            source='en',
                            target='de',
                            service_type='openai',
                            api_key=st.session_state.stored_openai_key,
                        )
                        # validate_api_key may not exist on all service
                        # implementations; only call it when present.
                        if hasattr(tester, "validate_api_key"):
                            tester.validate_api_key()
                    except Exception as e:
                        st.error(f"❌ API Key Error: {e}")
                    else:
                        # try/else: success message only when nothing raised.
                        st.success("✓ OpenAI API Key is valid.")

        if st.session_state.stored_openai_key:
            st.success("✓ API Key configured")
        else:
            st.info("💡 Get your API key from [platform.openai.com/api-keys](https://platform.openai.com/api-keys)")

        st.divider()
        st.markdown("🔧 **[Visit Configuration Page](/config)** for advanced settings (OCR engine, translation service, debug options, etc.)")

    # --- Service Configuration Logic ---
    # Map the human-readable selection onto (service_type, api_key, use_vision).
    api_key = None
    service_type = 'google'
    use_vision = False

    if "DeepL" in service_choice:
        service_type = 'deepl'
        api_key = st.session_state.stored_deepl_key

    elif "OpenAI" in service_choice:
        service_type = 'openai'
        api_key = st.session_state.stored_openai_key

    elif "xAI" in service_choice:  # Covers both Grok and Grok Vision
        service_type = 'xai'
        if "Vision" in service_choice:
            use_vision = True
        api_key = st.session_state.stored_xai_key

    # Show API key warning at the top if needed.
    if (service_type in ['deepl', 'openai', 'xai']) and not api_key:
        st.error(f"⚠️ **Missing API Key:** Please enter your {service_type.capitalize()} API Key in the Quick Settings above or visit the **[Configuration](/config)** page.")

    uploaded_file = st.file_uploader("Upload a Manga PDF (English)", type=["pdf"])

    # Only load OCR if NOT using vision mode (vision does its own detection).
    if not use_vision:
        ocr_handler = load_ocr(ocr_engine=ocr_engine)
    else:
        ocr_handler = None

    # Initialize translator (None when a required key is missing or init fails).
    if (service_type in ['deepl', 'openai', 'xai']) and not api_key:
        translator = None
    else:
        try:
            translator = load_translator(service_type=service_type, api_key=api_key)
        except Exception as e:
            st.error(f"❌ **Translation Error:** Failed to initialize translator: {e}")
            translator = None

    if uploaded_file is not None:
        # Detect a NEW upload by name+size fingerprint; only then reset state.
        current_file_id = f"{uploaded_file.name}_{uploaded_file.size}"

        if st.session_state.last_uploaded_file_id != current_file_id:
            # New file detected! Reset state.
            st.session_state.last_uploaded_file_id = current_file_id

            # Cleanup old temp file (best-effort; ignore races/missing file).
            if st.session_state.temp_pdf_path and os.path.exists(st.session_state.temp_pdf_path):
                try:
                    os.remove(st.session_state.temp_pdf_path)
                except:
                    pass

            # Save new temp file. delete=False: the path must outlive this
            # rerun so later reruns can re-read the PDF.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
                tmp_file.write(uploaded_file.read())
                st.session_state.temp_pdf_path = tmp_file.name

            # Generate previews (low resolution: zoom=1).
            with st.spinner("Generating page previews..."):
                st.session_state.preview_images = pdf_handler.extract_images_from_pdf(st.session_state.temp_pdf_path, zoom=1)

            # Initialize all pages as selected.
            for i in range(len(st.session_state.preview_images)):
                st.session_state[f"page_select_{i}"] = True

        st.success("File uploaded successfully!")

        # --- Page Selection UI ---
        st.subheader("Select Pages to Translate")

        should_show_thumbnails = should_display_thumbnails(
            st.session_state.get('translation_in_progress', False)
        )
        start_translation = False

        if should_show_thumbnails:
            # Bulk selection buttons.
            col_sel1, col_sel2, col_sel3, col_sel4 = st.columns([1, 1, 2, 2])
            if col_sel1.button("Select All"):
                for i in range(len(st.session_state.preview_images)):
                    st.session_state[f"page_select_{i}"] = True
                st.rerun()

            if col_sel2.button("Deselect All"):
                for i in range(len(st.session_state.preview_images)):
                    st.session_state[f"page_select_{i}"] = False
                st.rerun()

            # Top Start Translation button.
            if col_sel4.button("🚀 Start Translation", type="primary", key="start_translate_top"):
                # Use session state to trigger translation on the next pass.
                st.session_state.trigger_translation = True

            # Thumbnail grid.
            num_cols = 4
            cols = st.columns(num_cols)

            # CSS tweak for compact checkboxes.
            st.markdown("""
                <style>
                /* Prevent checkbox label wrapping */
                div[data-testid="stCheckbox"] label span {
                    white-space: nowrap;
                }
                </style>
            """, unsafe_allow_html=True)

            for i, img in enumerate(st.session_state.preview_images):
                with cols[i % num_cols]:
                    # Bordered container for the "card" look.
                    with st.container(border=True):
                        key = f"page_select_{i}"
                        # False default prevents accidental selection; the key
                        # should always exist after upload (initialized above).
                        is_selected = st.session_state.get(key, False)

                        # Toggle button acting as the card header.
                        btn_label = f"✅ Page {i+1}" if is_selected else f"⬜ Page {i+1}"
                        btn_type = "primary" if is_selected else "secondary"

                        if st.button(btn_label, key=f"btn_{i}", type=btn_type, width="stretch"):
                            # Flip the selection and redraw immediately.
                            st.session_state[key] = not is_selected
                            st.rerun()

                        # Negative margin pulls the image up closer to the button.
                        st.markdown('<div style="margin-top: -10px;"></div>', unsafe_allow_html=True)
                        st.image(img, width="stretch")

                        # Second toggle button below the image (for clicking
                        # near the image area); minimal icon-only label.
                        toggle_icon = "✓" if is_selected else "○"
                        if st.button(toggle_icon, key=f"img_btn_{i}", width="stretch", help="Click to toggle selection"):
                            st.session_state[key] = not is_selected
                            st.rerun()
        else:
            st.info("Translation has started. Thumbnails are hidden while processing is underway.")

        # NOTE(review): default True here vs. False in the grid above — the
        # grid only reads keys that already exist, so this affects pages whose
        # keys were never initialized; confirm the asymmetry is intended.
        selected_indices = [
            i for i in range(len(st.session_state.preview_images))
            if st.session_state.get(f"page_select_{i}", True)
        ]
        st.write(f"Selected {len(selected_indices)} pages.")

        # --- Translation Trigger (bottom button or top button) ---
        if should_show_thumbnails:
            # Disable the start button while a translation is already running.
            start_translation = st.button("Start Translation", type="primary", key="start_translate_bottom", disabled=st.session_state.get('translation_in_progress', False)) or st.session_state.get('trigger_translation', False)
        else:
            start_translation = st.session_state.get('trigger_translation', False)

        # Reset the one-shot trigger so it fires only once.
        if st.session_state.get('trigger_translation', False):
            st.session_state.trigger_translation = False

        if start_translation:
            if not selected_indices:
                st.error("Please select at least one page.")
            elif translator is None:
                st.error("Translator not initialized. Please check your API Key in Configuration.")
            else:
                # Validate the API key before starting so errors surface early.
                try:
                    if hasattr(translator, "validate_api_key"):
                        translator.validate_api_key()
                except Exception as e:
                    st.error(f"❌ API Key Error: {e}")
                else:
                    # Flag the run and rerun: the next pass hides thumbnails
                    # and enters the processing branch below.
                    st.session_state.translation_in_progress = True
                    st.rerun()

        # Processing branch: runs on the rerun after the flag was set.
        if st.session_state.get('translation_in_progress', False):
            tmp_path = st.session_state.temp_pdf_path

            # Header with a Stop control.
            st.markdown("---")
            col1, col2 = st.columns([3, 1])
            with col1:
                st.subheader("🔄 Translation in Progress")
            with col2:
                if st.button("⏹️ Stop", type="secondary", help="Stop translation and save completed pages"):
                    st.session_state.stop_translation = True
                    st.rerun()
            st.info(f"Translating {len(selected_indices)} selected pages...")

            try:
                # 1. Extract only the selected pages in high resolution (zoom=2).
                status_placeholder = st.empty()
                status_placeholder.info(f"📄 Extracting {len(selected_indices)} pages in high resolution...")

                images = pdf_handler.extract_images_from_pdf(tmp_path, pages=selected_indices, zoom=2)

                processed_images = []
                all_text_data = []  # Collected for debug mode display

                # Progress tracking widgets.
                progress_bar = st.progress(0)
                progress_text = st.empty()

                for i, img in enumerate(images):
                    # Honor a user-requested stop between pages.
                    if st.session_state.get('stop_translation', False):
                        st.warning("⏹️ Translation stopped by user. Saving completed pages...")
                        break

                    original_page_num = selected_indices[i] + 1
                    progress_text.text(f"🔍 Processing page {original_page_num} ({i+1}/{len(images)})...")

                    # Each entry: (bbox, original_text, translated_text).
                    text_regions = []

                    if use_vision:
                        # VISION MODE: the model locates and translates text itself.
                        st.info(f"Analyzing page {original_page_num} with Grok Vision...")
                        vision_results = translator.translate_image_with_vision(img)

                        for item in vision_results:
                            bbox = item['bbox']
                            original = item['original']
                            translated = item['translated']
                            text_regions.append((bbox, original, translated))

                            if debug_mode:
                                all_text_data.append({"Page": original_page_num, "Original": original, "Translated": translated, "Type": "Vision"})

                    else:
                        # CLASSIC OCR MODE
                        # 2. OCR with speech-bubble grouping: merges nearby
                        # text blocks for context-aware translation.
                        ocr_results = ocr_handler.detect_and_group_text(
                            img,
                            distance_threshold=bubble_threshold,
                            preprocess_mode=ocr_preprocess
                        )

                        for bbox, text in ocr_results:
                            # Skip empty or very short detections (noise).
                            if len(text.strip()) < 2:
                                continue

                            # Translate the whole grouped bubble text at once.
                            translated_text = translator.translate_text(text)
                            text_regions.append((bbox, text, translated_text))

                            if debug_mode:
                                all_text_data.append({"Page": original_page_num, "Original": text, "Translated": translated_text, "Type": "OCR"})

                    # 4. Image processing (common to both modes).
                    if show_boxes:
                        # Debug view: draw detection boxes without replacing text.
                        processed_img = image_processor.draw_boxes_only(img.copy(), text_regions)
                    else:
                        processed_img = image_processor.overlay_text(img.copy(), text_regions)
                    processed_images.append(processed_img)

                    progress_bar.progress((i + 1) / len(images))

                # 5. Save result.
                if not processed_images:
                    st.error("No pages were processed. No PDF generated.")
                    st.session_state.translation_in_progress = False
                    st.session_state.stop_translation = False
                    return

                output_pdf_path = "translated_manga.pdf"
                pdf_handler.save_images_as_pdf(processed_images, output_pdf_path)

                # Report partial vs. full completion.
                if st.session_state.get('stop_translation', False):
                    st.warning(f"⏹️ Translation stopped. Saved {len(processed_images)} out of {len(selected_indices)} pages.")
                else:
                    st.success("Translation Complete!")

                # Display cost stats if the translator tracks token usage.
                if hasattr(translator, 'get_usage_stats'):
                    stats = translator.get_usage_stats()
                    if stats['input_tokens'] > 0:
                        st.divider()
                        st.subheader("📊 Cost & Usage Estimate")
                        col_cost1, col_cost2, col_cost3 = st.columns(3)
                        col_cost1.metric("Input Tokens", f"{stats['input_tokens']:,}")
                        col_cost2.metric("Output Tokens", f"{stats['output_tokens']:,}")

                        cost = translator.get_cost_estimate()
                        col_cost3.metric("Estimated Cost", f"${cost:.4f}")
                        st.caption("Note: Cost estimate based on GPT-4o-mini pricing ($0.15/$0.60 per 1M tokens).")

                if debug_mode and all_text_data:
                    st.divider()
                    st.subheader("🐛 Debug: OCR & Translation Data")
                    st.dataframe(all_text_data, width="stretch")

                # Download button for the generated PDF.
                with open(output_pdf_path, "rb") as f:
                    pdf_data = f.read()
                st.download_button(
                    label="Download Translated PDF",
                    data=pdf_data,
                    file_name="translated_manga.pdf",
                    mime="application/pdf"
                )

                # Show preview images (more reliable than a PDF iframe).
                st.divider()
                st.markdown("### 👀 Preview (Processed Pages)")
                for i, p_img in enumerate(processed_images):
                    st.image(p_img, caption=f"Translated Page {selected_indices[i] + 1}", width="stretch")

                # Reset translation flags for the next run.
                st.session_state.translation_in_progress = False
                st.session_state.stop_translation = False

            except Exception as e:
                st.error(f"An error occurred: {e}")
                st.session_state.translation_in_progress = False
                st.session_state.stop_translation = False
            finally:
                # Cleanup - intentionally keep the temp file for retries.
                pass
|
| 465 |
+
|
| 466 |
+
# Script entry point: launch the Streamlit page.
if __name__ == "__main__":
    main()
|
packages.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
libgl1
|
| 2 |
+
libgl1-mesa-glx
|
| 3 |
+
libglib2.0-0
|
pages/config.py
ADDED
|
@@ -0,0 +1,327 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
import os
import tempfile
from PIL import Image, ImageDraw, ImageFont
import numpy as np
from src.pdf_handler import PDFHandler

# Streamlit page settings for the configuration screen.
st.set_page_config(page_title="Configuration", page_icon="⚙️", layout="wide")

st.title("⚙️ Configuration")

# Initialize PDF handler globally
pdf_handler = PDFHandler()

# Session state initialization
# config_pdf_path: temp-file path of the PDF uploaded on this page
if 'config_pdf_path' not in st.session_state:
    st.session_state.config_pdf_path = None
# config_previews: low-resolution page renders shown in the selection grid
if 'config_previews' not in st.session_state:
    st.session_state.config_previews = []
# selected_page: index of the page currently opened in the OCR test view
if 'selected_page' not in st.session_state:
    st.session_state.selected_page = None
# ocr_cache: raw OCR results keyed by page/engine/preprocess to avoid reruns
if 'ocr_cache' not in st.session_state:
    st.session_state.ocr_cache = {}
# high_res_images: zoomed page renders, cached per page index
if 'high_res_images' not in st.session_state:
    st.session_state.high_res_images = {}

# --- Global Settings (Session State) ---
# API keys for the optional translation backends (entered in the General tab).
if 'stored_deepl_key' not in st.session_state:
    st.session_state.stored_deepl_key = ""
if 'stored_openai_key' not in st.session_state:
    st.session_state.stored_openai_key = ""
if 'stored_xai_key' not in st.session_state:
    st.session_state.stored_xai_key = ""
# Default translation backend shown in the selectbox below.
if 'translation_service_selection' not in st.session_state:
    st.session_state.translation_service_selection = "OpenAI GPT-4o-mini (API Key - Recommended)"
if 'debug_mode_checkbox' not in st.session_state:
    st.session_state.debug_mode_checkbox = True
if 'show_boxes_checkbox' not in st.session_state:
    st.session_state.show_boxes_checkbox = False
# Default pixel distance used when merging OCR lines into speech bubbles.
if 'bubble_threshold_setting' not in st.session_state:
    st.session_state.bubble_threshold_setting = 160
if 'ocr_engine_selection' not in st.session_state:
    st.session_state.ocr_engine_selection = "magi"
if 'ocr_preprocess_mode' not in st.session_state:
    st.session_state.ocr_preprocess_mode = "gentle"

# Create tabs for different configuration sections
tab_general, tab_ocr_tool = st.tabs(["🌍 General Settings", "🔧 OCR Tool"])
|
| 49 |
+
|
| 50 |
+
with tab_general:
    st.header("Global Application Settings")

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Translation Service")
        # Human-readable labels; the selected backend is matched by
        # substring ("DeepL" / "OpenAI" / "xAI") below.
        service_options = [
            "OpenAI GPT-4o-mini (API Key - Recommended)",
            "Google Translate (Free - Experimental)",
            "DeepL (API Key - Experimental)",
            "xAI Grok (API Key - Experimental)",
            "xAI Grok Vision (No OCR - Experimental)"
        ]

        # The widget writes directly into session state via its key.
        st.selectbox(
            "Select Translation Service",
            options=service_options,
            key='translation_service_selection'
        )

        service_choice = st.session_state.translation_service_selection

        # Only show the API-key input that matches the chosen service.
        if "DeepL" in service_choice:
            st.text_input("DeepL API Key", type="password", key="stored_deepl_key", help="Paste your DeepL API Key here.")
        elif "OpenAI" in service_choice:
            st.text_input("OpenAI API Key", type="password", key="stored_openai_key", help="Paste your OpenAI API Key here.")
        elif "xAI" in service_choice:
            st.text_input("xAI API Key", type="password", key="stored_xai_key", help="Paste your xAI API Key here.")

    with col2:
        st.subheader("Debug & Display Options")
        st.checkbox("Debug Mode", help="Show OCR text vs. Translation table.", key="debug_mode_checkbox")
        st.checkbox("Show OCR Boxes", help="Zeigt nur die erkannten Textbereiche als Rahmen.", key="show_boxes_checkbox")

        # Global default for bubble grouping; the OCR tool has its own
        # non-persistent test slider on top of this value.
        st.slider(
            "Bubble Grouping Distance (Global)",
            min_value=30,
            max_value=300,
            step=10,
            key="bubble_threshold_setting",
            help="Maximaler Abstand (Pixel) um Textzeilen zu einer Sprechblase zusammenzufassen. Höher = mehr Gruppierung."
        )
|
| 93 |
+
|
| 94 |
+
with tab_ocr_tool:
    st.header("OCR Configuration & Testing Tool")

    # OCR Settings at the top — these keys feed the global session state
    # read by the main translation page.
    st.subheader("🔧 Global OCR Settings")
    col_ocr1, col_ocr2 = st.columns(2)

    with col_ocr1:
        st.selectbox(
            "OCR Engine",
            options=['magi', 'manga-ocr', 'paddleocr', 'easyocr'],
            key='ocr_engine_selection',
            help="'magi' = best for manga (detects speech bubbles) [DEFAULT], 'manga-ocr' = specialized for manga fonts, 'paddleocr' = fast and general purpose, 'easyocr' = multi-language support"
        )

    with col_ocr2:
        st.selectbox(
            "OCR Preprocessing",
            options=['gentle', 'none', 'aggressive'],
            key='ocr_preprocess_mode',
            help="'gentle' = recommended for manga, 'none' = original image, 'aggressive' = strong binarization"
        )

    st.divider()
    st.subheader("📄 Test OCR on PDF Pages")
    st.markdown("Lade ein PDF hoch, klicke auf eine Seite, und passe den Threshold an um die Sprechblasen-Erkennung zu optimieren.")
|
| 120 |
+
|
| 121 |
+
def draw_boxes(image: Image.Image, text_results):
    """Return a copy of *image* with a colored, numbered frame around every
    OCR result.

    Args:
        image: Source page as a PIL image (left unmodified; a copy is drawn on).
        text_results: Iterable of OCR items; each item is a sequence whose
            first element is a 4-point bbox [[x1, y1], ..., [x4, y4]].

    Returns:
        A new PIL image with boxes and 1-based index labels drawn on it.
    """
    img_copy = image.copy()
    draw = ImageDraw.Draw(img_copy)

    # Load a label font once, trying known locations and falling back to
    # PIL's builtin bitmap font. ImageFont.truetype raises OSError when the
    # font file cannot be opened — catch exactly that, not a bare except.
    font = None
    for font_path in ("Arial.ttf", "/System/Library/Fonts/Helvetica.ttc"):
        try:
            font = ImageFont.truetype(font_path, 14)
            break
        except OSError:
            continue
    if font is None:
        font = ImageFont.load_default()

    colors = ["#FF0000", "#0066FF", "#00CC00", "#FF9900", "#9900FF", "#00CCCC", "#FF00FF", "#FFCC00"]

    for i, item in enumerate(text_results):
        bbox = item[0]

        # Axis-aligned bounding rectangle of the (possibly rotated) quad.
        pts = np.array(bbox)
        x_min = int(np.min(pts[:, 0]))
        y_min = int(np.min(pts[:, 1]))
        x_max = int(np.max(pts[:, 0]))
        y_max = int(np.max(pts[:, 1]))

        box_color = colors[i % len(colors)]

        # Draw a 4px-thick frame by stacking 1px rectangles.
        for offset in range(4):
            draw.rectangle(
                [x_min - offset, y_min - offset, x_max + offset, y_max + offset],
                outline=box_color
            )

        # Filled background plus the 1-based index label above the box.
        label = f"[{i+1}]"
        draw.rectangle([x_min, y_min - 20, x_min + 30, y_min], fill=box_color)
        draw.text((x_min + 2, y_min - 18), label, fill="white", font=font)

    return img_copy
|
| 162 |
+
|
| 163 |
+
def select_page(page_num):
    # Button callback: remember which PDF page the user picked so the
    # OCR detail view is rendered on the next Streamlit rerun.
    st.session_state.selected_page = page_num
|
| 165 |
+
|
| 166 |
+
# PDF Upload
uploaded_pdf = st.file_uploader("📄 PDF hochladen (für OCR Config)", type=["pdf"])

if uploaded_pdf:
    # Check if new file.
    # NOTE(review): file_id is computed but never used below — presumably
    # intended to invalidate the caches when a different PDF is uploaded;
    # confirm and either use it or remove it.
    file_id = f"{uploaded_pdf.name}_{uploaded_pdf.size}"

    if st.session_state.config_pdf_path is None or not os.path.exists(st.session_state.config_pdf_path):
        # Save temp PDF so PyMuPDF can open it by path
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(uploaded_pdf.read())
            st.session_state.config_pdf_path = tmp.name

        # Generate low-resolution previews for the page grid
        with st.spinner("Lade Seiten-Vorschau..."):
            st.session_state.config_previews = pdf_handler.extract_images_from_pdf(
                st.session_state.config_pdf_path, zoom=0.8
            )
        # New document: reset selection and all per-page caches
        st.session_state.selected_page = None
        st.session_state.ocr_cache = {}
        st.session_state.high_res_images = {}

    if st.session_state.config_previews:
        if st.session_state.selected_page is None:
            # Show page grid (selection happens via the button callback)
            st.subheader("📖 Seite auswählen")

            num_cols = 5
            cols = st.columns(num_cols)

            for i, preview in enumerate(st.session_state.config_previews):
                with cols[i % num_cols]:
                    st.markdown(f"Seite {i+1}")

                    st.image(preview, width="stretch")

                    st.button(
                        f"Auswählen",
                        key=f"select_page_{i}",
                        on_click=select_page,
                        args=(i,),
                        type="secondary"
                    )

        # If page selected, show OCR config
        if st.session_state.selected_page is not None:
            if st.button("← Zurück zur Übersicht"):
                st.session_state.selected_page = None
                st.rerun()

            st.divider()

            page_idx = st.session_state.selected_page

            # Sidebar controls for OCR Tool
            st.sidebar.header("🔧 OCR Tool Einstellungen")

            # Local threshold for this tool, defaulting to the global
            # setting; changing it does NOT persist globally.
            tool_bubble_threshold = st.sidebar.slider(
                "Bubble Grouping Distance (Test)",
                min_value=30,
                max_value=400,
                value=st.session_state.bubble_threshold_setting,
                step=10,
                help="Test-Wert für diesen Viewer. Ändert nicht die globale Einstellung."
            )

            ocr_engine = st.sidebar.selectbox(
                "OCR Engine",
                options=['magi', 'manga-ocr', 'paddleocr', 'easyocr'],
                index=0,
                help="'magi' = beste für Manga (erkennt Sprechblasen), 'manga-ocr' = gut, 'paddleocr' = schnell"
            )

            preprocess_mode = st.sidebar.selectbox(
                "OCR Preprocessing",
                options=['gentle', 'none', 'aggressive'],
                index=0,
                help="'gentle' = empfohlen für Manga, 'none' = Originalbild, 'aggressive' = starke Binarisierung"
            )

            show_raw = st.sidebar.checkbox("Zeige Roh-OCR zum Vergleich", value=False)

            st.sidebar.divider()
            st.sidebar.info("💡 **Tipps:**\n- Magi ist am besten für Manga\n- Gentle preprocessing empfohlen")

            # Load high-res image for the selected page (zoom=2), cached
            cache_key = f"page_{page_idx}"

            if cache_key not in st.session_state.high_res_images:
                with st.spinner(f"Lade Seite {page_idx + 1} in hoher Auflösung..."):
                    high_res = pdf_handler.extract_images_from_pdf(
                        st.session_state.config_pdf_path,
                        pages=[page_idx],
                        zoom=2
                    )
                    if high_res:
                        st.session_state.high_res_images[cache_key] = high_res[0]

            if cache_key in st.session_state.high_res_images:
                image = st.session_state.high_res_images[cache_key]

                # Run OCR (cached per page, engine, preprocess mode)
                ocr_key = f"ocr_{page_idx}_{ocr_engine}_{preprocess_mode}"
                if ocr_key not in st.session_state.ocr_cache:
                    with st.spinner(f"🔍 Analysiere Text mit {ocr_engine.upper()}..."):
                        # Lazy load OCR handler here to avoid circular imports if any
                        from src.ocr_handler import OCRHandler
                        ocr_handler_tool = OCRHandler(lang_list=['en'], gpu=False, ocr_engine=ocr_engine)

                        raw_results = ocr_handler_tool.detect_text(
                            image,
                            paragraph=False,
                            preprocess_mode=preprocess_mode
                        )
                        st.session_state.ocr_cache[ocr_key] = raw_results

                raw_results = st.session_state.ocr_cache[ocr_key]

                # Group with current threshold. Grouping is cheap, so it is
                # recomputed on every slider change while raw OCR stays cached.
                from src.ocr_handler import OCRHandler
                ocr_handler_tool = OCRHandler(lang_list=['en'], gpu=False, ocr_engine=ocr_engine)
                grouped_results = ocr_handler_tool.group_text_into_bubbles(raw_results, distance_threshold=tool_bubble_threshold)

                # Display
                st.subheader(f"📄 Seite {page_idx + 1} - OCR Ergebnis")

                if show_raw:
                    # Side-by-side: ungrouped vs. grouped boxes
                    col1, col2 = st.columns(2)

                    with col1:
                        st.markdown(f"**🔴 Roh-OCR: {len(raw_results)} Boxen**")
                        raw_image = draw_boxes(image, raw_results)
                        st.image(raw_image, width="stretch")

                    with col2:
                        st.markdown(f"**🟢 Gruppiert: {len(grouped_results)} Boxen** (Threshold: {tool_bubble_threshold}px)")
                        grouped_image = draw_boxes(image, grouped_results)
                        st.image(grouped_image, width="stretch")
                else:
                    st.markdown(f"**🟢 Gruppiert: {len(grouped_results)} Boxen** (Threshold: {tool_bubble_threshold}px)")
                    grouped_image = draw_boxes(image, grouped_results)
                    st.image(grouped_image, width="stretch")

                # Show detected texts with the same color order as the boxes
                with st.expander(f"📝 Erkannte Texte ({len(grouped_results)} Gruppen)", expanded=True):
                    for i, item in enumerate(grouped_results):
                        text = item[1] if len(item) > 1 else ""
                        colors = ["🔴", "🔵", "🟢", "🟠", "🟣", "🩵", "🩷", "🟡"]
                        color = colors[i % len(colors)]
                        st.markdown(f"{color} **[{i+1}]** {text}")

                # Stats: how strongly grouping reduced the box count
                st.divider()
                col_stat1, col_stat2, col_stat3 = st.columns(3)
                col_stat1.metric("Roh-Boxen", len(raw_results))
                col_stat2.metric("Gruppierte Boxen", len(grouped_results))
                reduction = 100 - (len(grouped_results) / max(len(raw_results), 1) * 100)
                col_stat3.metric("Reduktion", f"{reduction:.0f}%")

else:
    st.info("👆 Lade ein PDF hoch um die OCR-Boxen zu konfigurieren.")
|
requirements-optional.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Optional OCR Engines
|
| 2 |
+
# Install only the ones you want to use:
|
| 3 |
+
|
| 4 |
+
# Manga-OCR (specialized for manga/comic fonts)
|
| 5 |
+
manga-ocr
|
| 6 |
+
|
| 7 |
+
# PaddleOCR (fast and general purpose)
|
| 8 |
+
paddlepaddle
|
| 9 |
+
paddleocr
|
| 10 |
+
|
| 11 |
+
# EasyOCR (multi-language support)
|
| 12 |
+
easyocr
|
requirements.txt
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
PyMuPDF
|
| 3 |
+
deep-translator
|
| 4 |
+
opencv-python-headless
|
| 5 |
+
Pillow
|
| 6 |
+
numpy
|
| 7 |
+
scipy
|
| 8 |
+
certifi
|
| 9 |
+
openai
|
| 10 |
+
deepl
|
| 11 |
+
|
| 12 |
+
# Default OCR Engine (Magi - best for manga)
|
| 13 |
+
torch
|
| 14 |
+
torchvision
|
| 15 |
+
transformers
|
| 16 |
+
einops
|
| 17 |
+
timm
|
| 18 |
+
matplotlib
|
| 19 |
+
watchdog
|
| 20 |
+
|
| 21 |
+
# Geometry / spatial dependencies used by Magi
|
| 22 |
+
shapely
|
run.sh
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash

# Run the Manga Translator Streamlit app.
#
# The project directory is resolved from this script's own location instead
# of a hard-coded absolute path (/Users/christoph/...), so the script works
# on any machine and any checkout location.
cd "$(dirname "$0")" || exit 1

# Activate the virtual environment (created by setup.sh)
source venv/bin/activate

# Run the Streamlit app
streamlit run app.py
|
setup.bat
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@echo off
REM Manga Translator Setup Script for Windows
REM ==========================================
REM Creates a local virtual environment and installs all dependencies.

echo.
echo 📚 Manga Translator Setup (Windows)
echo ====================================
echo.

REM Check Python
python --version >nul 2>&1
if errorlevel 1 (
    echo ❌ Python not found! Install Python 3.10+ from https://python.org
    pause
    exit /b 1
)

REM NOTE: Tesseract is NOT required by the bundled OCR engines (Magi,
REM manga-ocr, PaddleOCR, EasyOCR) - see requirements.txt and setup.sh,
REM which states that no extra system packages are required. We only
REM report whether it is installed instead of aborting the setup.
where tesseract >nul 2>&1
if errorlevel 1 (
    echo ℹ️  Tesseract OCR not found - not required, continuing setup.
) else (
    echo ✅ Tesseract found
)

REM Create virtual environment
echo.
echo 🐍 Creating Python virtual environment...
python -m venv venv

REM Activate virtual environment
echo 🔌 Activating virtual environment...
call venv\Scripts\activate.bat

REM Upgrade pip
echo.
echo 📦 Upgrading pip...
python -m pip install --upgrade pip

REM Install PyTorch first (CPU version for compatibility)
echo.
echo 📦 Installing PyTorch (CPU)...
pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu

REM Install other dependencies
echo.
echo 📦 Installing remaining packages (this may take a few minutes)...
pip install -r requirements.txt

echo.
echo ✅ Setup complete!
echo.
echo To start the app:
echo   venv\Scripts\activate.bat
echo   streamlit run app.py
echo.
echo Then open: http://localhost:8501
echo.
pause
|
setup.sh
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash

# Manga Translator Setup Script
# ==============================
# Creates a local virtual environment and installs all Python dependencies.

# Abort on the first failing command so a broken step (e.g. venv creation)
# is not silently ignored and followed by a misleading success message.
set -e

echo "📚 Manga Translator Setup"
echo "========================="

# Check that Python 3 is available (mirrors the check done by setup.bat).
if ! command -v python3 >/dev/null 2>&1; then
    echo "❌ python3 not found! Install Python 3.10+ first."
    exit 1
fi

# Check OS (currently no extra system packages are required)
if [[ "$OSTYPE" == "darwin"* ]]; then
    echo "🍎 macOS detected"
elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
    echo "🐧 Linux detected"
fi

# Create virtual environment
echo ""
echo "🐍 Creating Python virtual environment..."
python3 -m venv venv

# Activate virtual environment
echo "🔌 Activating virtual environment..."
source venv/bin/activate

# Install Python dependencies
echo ""
echo "📦 Installing Python packages (this may take a few minutes)..."
pip install --upgrade pip
pip install -r requirements.txt

echo ""
echo "✅ Setup complete!"
echo ""
echo "To start the app:"
echo "  source venv/bin/activate"
echo "  streamlit run app.py"
echo ""
echo "Then open: http://localhost:8501"
echo ""
echo "OCR Config Page: http://localhost:8501/config"
|
src/__init__.py
ADDED
|
File without changes
|
src/image_processor.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from PIL import Image, ImageDraw, ImageFont
|
| 2 |
+
import numpy as np
|
| 3 |
+
from typing import List, Tuple
|
| 4 |
+
|
| 5 |
+
class ImageProcessor:
    """Renders translated text (or debug frames) onto manga page images.

    All public methods draw on the passed PIL image *in place* and also
    return it for convenience.
    """

    def __init__(self):
        pass

    def draw_boxes_only(self, image: Image.Image, text_regions: List[Tuple[List[List[int]], str, str]]) -> Image.Image:
        """
        Draws only red frames around the detected text regions (without replacing the text).

        Args:
            image: The original PIL Image (modified in place).
            text_regions: List of tuples (bbox, original_text, translated_text).

        Returns:
            Image with red boxes drawn around text regions.
        """
        draw = ImageDraw.Draw(image)

        # Load the label font once instead of once per region. _load_font
        # never raises (it falls back internally), so the previous bare
        # try/except around it was dead code and has been removed.
        font = self._load_font(12)

        for bbox, original, translated in text_regions:
            # Axis-aligned bounding rectangle of the 4-point polygon
            pts = np.array(bbox)
            x_min = int(np.min(pts[:, 0]))
            y_min = int(np.min(pts[:, 1]))
            x_max = int(np.max(pts[:, 0]))
            y_max = int(np.max(pts[:, 1]))

            # Draw red rectangle outline (3px thick)
            for offset in range(3):
                draw.rectangle(
                    [x_min - offset, y_min - offset, x_max + offset, y_max + offset],
                    outline="red"
                )

            # Truncate the OCR text so the label above the box stays short
            label = original[:50] + "..." if len(original) > 50 else original
            draw.text((x_min, y_min - 15), label, fill="red", font=font)

        return image

    def overlay_text(self, image: Image.Image, text_regions: List[Tuple[List[List[int]], str, str]]) -> Image.Image:
        """
        Overlays translated text onto the image.

        Args:
            image: The original PIL Image (modified in place).
            text_regions: List of tuples (bbox, original_text, translated_text).
                          bbox is [[x1,y1], [x2,y2], [x3,y3], [x4,y4]].

        Returns:
            Processed PIL Image.
        """
        draw = ImageDraw.Draw(image)

        for bbox, original, translated in text_regions:
            # Axis-aligned bounding rectangle of the 4-point polygon
            pts = np.array(bbox)
            x_min = int(np.min(pts[:, 0]))
            y_min = int(np.min(pts[:, 1]))
            x_max = int(np.max(pts[:, 0]))
            y_max = int(np.max(pts[:, 1]))

            # Draw white rectangle (simple inpainting of the original text)
            draw.rectangle([x_min, y_min, x_max, y_max], fill="white", outline="white")

            # Calculate box dimensions
            box_width = x_max - x_min
            box_height = y_max - y_min

            # Fit and draw the translated text into the cleared area
            self._draw_text_in_box(draw, translated, x_min, y_min, box_width, box_height)

        return image

    def _draw_text_in_box(self, draw: ImageDraw.ImageDraw, text: str, x: int, y: int, w: int, h: int):
        """
        Fits text inside a box by iteratively reducing font size and wrapping.

        Args:
            draw: ImageDraw object bound to the target image.
            text: Translated text to render.
            x, y: Top-left corner of the box.
            w, h: Box width and height in pixels.
        """
        import textwrap

        # Minimum legible font size
        min_fontsize = 8
        start_fontsize = 18  # Start ambitious

        padding = 4
        available_w = max(1, w - 2 * padding)
        available_h = max(1, h - 2 * padding)

        best_font = None
        best_wrapped_text = text

        # Iteratively try to fit the text, shrinking the font by 2 each step
        for fontsize in range(start_fontsize, min_fontsize - 1, -2):
            try:
                font = self._load_font(fontsize)

                # Estimate char width by measuring a wide glyph ('M');
                # fall back to the ~0.6 * fontsize heuristic.
                bbox = font.getbbox("M")
                char_w = bbox[2] - bbox[0] if bbox else fontsize * 0.6

                # Maximum characters per line for this font size
                chars_per_line = max(1, int(available_w / char_w))

                # break_long_words=False keeps words intact; if a single word
                # is too wide, the width check below fails and a smaller font
                # is tried instead of hyphenating mid-word.
                wrapped_text = textwrap.fill(text, width=chars_per_line, break_long_words=False)

                # Measure the wrapped block. multiline_textbbox exists since
                # Pillow 8.0; keep the deprecated textsize fallback for older
                # installations.
                if hasattr(draw, 'multiline_textbbox'):
                    text_bbox = draw.multiline_textbbox((0, 0), wrapped_text, font=font)
                    text_h = text_bbox[3] - text_bbox[1]
                    text_w = text_bbox[2] - text_bbox[0]
                else:
                    text_w, text_h = draw.textsize(wrapped_text, font=font)

                # Accept if it fits vertically; allow 10% horizontal overflow
                # because the char-count wrap estimate is approximate.
                if text_h <= available_h and text_w <= available_w * 1.1:
                    best_font = font
                    best_wrapped_text = wrapped_text
                    break  # Found a fit!

            except Exception as e:
                # Best-effort fitting: a failure at one size just means we
                # try the next smaller size.
                print(f"Font fitting error: {e}")
                continue

        # Nothing fit: fall back to the minimum size and accept overflow.
        if best_font is None:
            best_font = self._load_font(min_fontsize)
            bbox = best_font.getbbox("M")
            char_w = bbox[2] - bbox[0] if bbox else min_fontsize * 0.6
            chars_per_line = max(1, int(available_w / char_w))
            best_wrapped_text = textwrap.fill(text, width=chars_per_line)

        # Center text vertically within the box
        if hasattr(draw, 'multiline_textbbox'):
            final_bbox = draw.multiline_textbbox((0, 0), best_wrapped_text, font=best_font)
            final_h = final_bbox[3] - final_bbox[1]
        else:
            _, final_h = draw.textsize(best_wrapped_text, font=best_font)

        center_y = y + (h - final_h) // 2
        center_y = max(y, center_y)  # Don't go above box

        # Draw text (black)
        draw.multiline_text((x + padding, center_y), best_wrapped_text, fill="black", font=best_font, align="center")

    def _load_font(self, fontsize: int):
        """Load a TrueType font at *fontsize*, falling back to PIL's builtin.

        ImageFont.truetype raises OSError when a font file cannot be opened,
        so that is the only exception swallowed here (previously a bare
        ``except:`` which also hid KeyboardInterrupt/SystemExit).
        """
        font_names = ["Arial.ttf", "/System/Library/Fonts/Helvetica.ttc", "/System/Library/Fonts/Supplemental/Arial.ttf", "DejaVuSans.ttf"]
        for name in font_names:
            try:
                return ImageFont.truetype(name, fontsize)
            except OSError:
                continue
        return ImageFont.load_default()
|
| 171 |
+
|
src/ocr_handler.py
ADDED
|
@@ -0,0 +1,434 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import cv2
|
| 3 |
+
from PIL import Image
|
| 4 |
+
from typing import List, Tuple, Any, Union, Optional
|
| 5 |
+
from scipy.spatial.distance import cdist
|
| 6 |
+
from scipy.cluster.hierarchy import fcluster, linkage
|
| 7 |
+
|
| 8 |
+
class OCRHandler:
    """Multi-engine OCR front end for manga pages.

    Supports Magi (default), Manga-OCR, PaddleOCR and EasyOCR. Backends are
    loaded lazily on first use so constructing a handler stays cheap.
    """

    def __init__(self, lang_list: Optional[List[str]] = None, gpu: bool = False, ocr_engine: str = 'magi'):
        """
        Initializes the OCR handler with lazy loading.

        Args:
            lang_list: List of languages to detect (default: ['en']).
            gpu: Boolean to enable GPU usage (default: False).
            ocr_engine: 'magi' (default), 'manga-ocr', 'paddleocr', or 'easyocr'.
        """
        self.ocr_engine = ocr_engine
        # BUGFIX: the previous signature used a mutable default (['en']),
        # which is shared across all instances; a fresh list is created per
        # instance instead so one handler cannot mutate another's languages.
        self.lang_list = list(lang_list) if lang_list is not None else ['en']
        self.gpu = gpu

        # Lazy loading - backend objects are created on first use.
        self._magi_model = None
        self._manga_ocr = None
        self._detector = None
        self._paddle_reader = None
        self._easy_reader = None

        print(f"OCR Handler initialized with engine: {ocr_engine} (lazy loading enabled)")
def _load_magi(self):
    """Load and cache the Magi model; return the cached instance on later calls."""
    if self._magi_model is None:
        print("Loading Magi (The Manga Whisperer)...")
        try:
            from transformers import AutoModel
            import torch
            model = AutoModel.from_pretrained("ragavsachdeva/magi", trust_remote_code=True)
            # Move to GPU only when both CUDA is present and the caller asked for it.
            if torch.cuda.is_available() and self.gpu:
                model = model.cuda()
            model.eval()
            self._magi_model = model
            print("✓ Magi loaded successfully")
        except ImportError as e:
            raise ImportError(
                "Magi dependencies not installed. "
                "This should not happen as Magi is the default engine. "
                f"Error: {e}"
            )
    return self._magi_model
def _load_manga_ocr(self):
    """Load and cache Manga-OCR plus a PaddleOCR-based text detector."""
    if self._manga_ocr is not None:
        return self._manga_ocr, self._detector

    print("Loading Manga-OCR...")
    try:
        from manga_ocr import MangaOcr
        from paddleocr import PaddleOCR
        # Manga-OCR only recognizes text; PaddleOCR supplies the detection step.
        self._manga_ocr = MangaOcr()
        self._detector = PaddleOCR(lang='en', show_log=False)
        print("✓ Manga-OCR loaded successfully")
    except ImportError:
        raise ImportError(
            "Manga-OCR not installed. Install with:\n"
            "pip install -r requirements-optional.txt\n"
            "or: pip install manga-ocr paddlepaddle paddleocr"
        )
    return self._manga_ocr, self._detector
def _load_paddleocr(self):
    """Load and cache the PaddleOCR reader on first call."""
    if self._paddle_reader is not None:
        return self._paddle_reader

    print("Loading PaddleOCR...")
    try:
        from paddleocr import PaddleOCR
        self._paddle_reader = PaddleOCR(lang='en', show_log=False)
        print("✓ PaddleOCR loaded successfully")
    except ImportError:
        raise ImportError(
            "PaddleOCR not installed. Install with:\n"
            "pip install paddlepaddle paddleocr"
        )
    return self._paddle_reader
def _load_easyocr(self):
    """Load and cache the EasyOCR reader on first call."""
    if self._easy_reader is not None:
        return self._easy_reader

    print("Loading EasyOCR (this may take a while on first run)...")
    try:
        import easyocr
        # Language list and GPU preference come from the handler configuration.
        self._easy_reader = easyocr.Reader(self.lang_list, gpu=self.gpu)
        print("✓ EasyOCR loaded successfully")
    except ImportError:
        raise ImportError(
            "EasyOCR not installed. Install with:\n"
            "pip install easyocr"
        )
    return self._easy_reader
def preprocess_image(self, image: np.ndarray, mode: str = 'gentle') -> np.ndarray:
    """
    Applies preprocessing to improve OCR quality.

    Args:
        image: Input image as numpy array (RGB).
        mode: Preprocessing mode:
            - 'none': no filtering, only a 3x upscale of the original image
            - 'gentle': light preprocessing (recommended for manga)
            - 'aggressive': heavy preprocessing with binarization

    Returns:
        Preprocessed image (RGB for 'none', grayscale otherwise).
    """
    if mode == 'none':
        # A 3x upscale alone already helps with thin glyphs such as "I".
        return cv2.resize(image, None, fx=3, fy=3, interpolation=cv2.INTER_CUBIC)

    # Shared base for the remaining modes: grayscale, then 2x upscale
    # (small text benefits from the extra resolution).
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    scaled = cv2.resize(gray, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

    if mode == 'gentle':
        # Boost contrast with CLAHE rather than hard binarization so thin
        # strokes ("I", "l", ...) survive.
        contrast = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(scaled)
        # Very light denoising to preserve fine detail.
        return cv2.fastNlMeansDenoising(contrast, h=5, templateWindowSize=7, searchWindowSize=21)

    # 'aggressive': stronger denoising followed by Otsu thresholding.
    # NOTE: binarization can destroy thin characters.
    cleaned = cv2.fastNlMeansDenoising(scaled, h=10, templateWindowSize=7, searchWindowSize=21)
    _, binary = cv2.threshold(cleaned, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary
def detect_text(self, image: Union[Image.Image, np.ndarray], paragraph: bool = True, preprocess_mode: str = 'gentle', tesseract_psm: int = 6, tesseract_confidence: int = 60) -> List[Tuple[List[Tuple[int, int]], str]]:
    """
    Detects text in an image.

    Args:
        image: PIL Image or numpy array.
        paragraph: If True, combines text lines into paragraphs (only the
            EasyOCR backend honours this flag).
        preprocess_mode: Preprocessing mode ('gentle', 'none', 'aggressive').
        tesseract_psm / tesseract_confidence: Kept for interface
            compatibility; unused by the current backends.

    Returns:
        List of tuples: (bounding_box, text) or (bounding_box, text, confidence).

    Raises:
        ValueError: If ``self.ocr_engine`` names an unknown backend.
    """
    array = np.array(image) if isinstance(image, Image.Image) else image

    # Preprocess once; every backend receives the same processed frame.
    processed = self.preprocess_image(array, mode=preprocess_mode)

    # 'none' upscales 3x, the other modes 2x — detections are scaled back
    # down by the same factor inside each backend.
    factor = 3 if preprocess_mode == 'none' else 2

    dispatch = {
        'magi': lambda: self._detect_with_magi(processed, factor),
        'manga-ocr': lambda: self._detect_with_manga_ocr(processed, factor),
        'paddleocr': lambda: self._detect_with_paddleocr(processed, factor),
        'easyocr': lambda: self._detect_with_easyocr(processed, paragraph, factor),
    }
    if self.ocr_engine not in dispatch:
        raise ValueError(f"Unknown OCR engine: {self.ocr_engine}")
    return dispatch[self.ocr_engine]()
def _detect_with_magi(self, processed_image: np.ndarray, scale_factor: int) -> List[Tuple]:
    """Detect text using Magi - The Manga Whisperer (best for manga)."""
    import torch

    model = self._load_magi()

    # Magi expects an RGB numpy array; expand grayscale to three channels.
    if processed_image.ndim == 2:
        processed_image = np.stack([processed_image] * 3, axis=-1)

    with torch.no_grad():
        # First pass: locate text boxes on the page.
        results = model.predict_detections_and_associations([processed_image])
        text_bboxes = [results[0]["texts"]]
        # Second pass: run OCR on the located boxes.
        ocr_results = model.predict_ocr([processed_image], text_bboxes)

    detections = []
    if results:
        boxes = results[0].get("texts", [])
        texts = ocr_results[0] if ocr_results else []

        for index, box in enumerate(boxes):
            # Magi reports [x1, y1, x2, y2]; rebuild a 4-point polygon in
            # original-image coordinates (undo the preprocessing upscale).
            x1, y1, x2, y2 = box
            sx1, sy1 = int(x1 / scale_factor), int(y1 / scale_factor)
            sx2, sy2 = int(x2 / scale_factor), int(y2 / scale_factor)
            quad = [[sx1, sy1], [sx2, sy1], [sx2, sy2], [sx1, sy2]]

            recognized = texts[index] if index < len(texts) else ""
            if recognized.strip():
                # Magi does not expose a confidence; report a fixed 0.95.
                detections.append((quad, recognized.strip(), 0.95))

    return detections
def _detect_with_manga_ocr(self, processed_image: np.ndarray, scale_factor: int) -> List[Tuple]:
    """Detect text using Manga-OCR - specialized for manga/comic fonts."""
    manga_ocr, detector = self._load_manga_ocr()

    # PaddleOCR localizes text regions; manga-ocr then reads each crop.
    detection_result = detector.ocr(processed_image, cls=False)

    detections = []
    if not detection_result or not detection_result[0]:
        return detections

    height, width = processed_image.shape[0], processed_image.shape[1]
    for item in detection_result[0]:
        # PaddleOCR may yield [[bbox], (text, conf)]; keep only the bbox.
        if isinstance(item[0], list) and isinstance(item[0][0], (list, tuple)):
            bbox_raw = item[0]  # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
        else:
            bbox_raw = item  # already a bare bbox

        pts = np.array(bbox_raw).astype(int)
        x_min, y_min = pts.min(axis=0)
        x_max, y_max = pts.max(axis=0)

        # Clamp the crop window to the image bounds.
        x_min, y_min = max(0, x_min), max(0, y_min)
        x_max, y_max = min(width, x_max), min(height, y_max)
        if x_max <= x_min or y_max <= y_min:
            continue

        cropped = processed_image[y_min:y_max, x_min:x_max]
        if cropped.size == 0:
            continue

        try:
            # manga-ocr consumes PIL images.
            text = manga_ocr(Image.fromarray(cropped))
        except Exception as e:
            print(f"Manga-OCR error: {e}")
            continue
        if not text.strip():
            continue

        # Map the bbox back to original-image coordinates.
        quad = [[int(p[0] / scale_factor), int(p[1] / scale_factor)] for p in bbox_raw]
        # No per-box confidence available; report a fixed 0.95.
        detections.append((quad, text.strip(), 0.95))

    return detections
def _detect_with_paddleocr(self, processed_image: np.ndarray, scale_factor: int) -> List[Tuple]:
    """Detect text using PaddleOCR - fast and general purpose."""
    reader = self._load_paddleocr()

    # PaddleOCR accepts BGR or RGB arrays; cls=True enables angle classification.
    result = reader.ocr(processed_image, cls=True)

    detections = []
    # Result shape per page: [[[box], (text, confidence)], ...]
    if result and result[0]:
        for entry in result[0]:
            bbox_raw = entry[0]  # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            text, confidence = entry[1][0], entry[1][1]

            # Discard blanks and low-confidence hits (< 0.5).
            if not text.strip() or confidence < 0.5:
                continue

            # Map coordinates back to the original (pre-upscale) image.
            quad = [[int(p[0] / scale_factor), int(p[1] / scale_factor)] for p in bbox_raw]
            detections.append((quad, text.strip(), confidence))

    return detections
def _detect_with_easyocr(self, processed_image: np.ndarray, paragraph: bool, scale_factor: int) -> List[Tuple]:
    """Detect text using EasyOCR."""
    reader = self._load_easyocr()

    # Thresholds tuned for comic lettering: low contrast/text thresholds
    # catch faint strokes, min_size filters specks, rotation is disabled.
    raw = reader.readtext(
        processed_image,
        paragraph=paragraph,
        contrast_ths=0.05,
        text_threshold=0.5,
        low_text=0.2,
        width_ths=0.5,
        height_ths=0.5,
        min_size=5,
        rotation_info=[0],
    )

    def rescale(box):
        # Map detector coordinates back to the original image.
        return [[int(p[0] / scale_factor), int(p[1] / scale_factor)] for p in box]

    detections = []
    for entry in raw:
        if len(entry) == 2:
            # Paragraph mode yields (bbox, text) without a probability.
            box, text = entry
            detections.append((rescale(box), text))
        elif len(entry) == 3:
            # Default mode yields (bbox, text, probability).
            box, text, prob = entry
            detections.append((rescale(box), text, prob))

    return detections
def get_text_regions(self, image: Union[Image.Image, np.ndarray]) -> List[Any]:
    """Convenience wrapper: run detect_text with its default settings."""
    return self.detect_text(image)
def group_text_into_bubbles(self, text_results: List[Tuple], distance_threshold: float = 50) -> List[Tuple[List[List[int]], str]]:
    """
    Groups nearby text blocks into speech bubbles.

    Args:
        text_results: List of (bbox, text[, confidence]) tuples from detect_text.
        distance_threshold: Maximum center-to-center distance (pixels) for
            two blocks to be merged into the same bubble.

    Returns:
        List of (merged_bbox, combined_text) tuples; confidences are dropped.
    """
    if not text_results:
        return []

    if len(text_results) == 1:
        # A single block needs no clustering; return it directly.
        bbox, text = text_results[0][:2]
        return [(bbox, text)]

    # Each block is represented by the center of its bounding box.
    centers = np.array([
        [np.mean(np.array(item[0])[:, 0]), np.mean(np.array(item[0])[:, 1])]
        for item in text_results
    ])

    # Average-linkage hierarchical clustering, cut at the distance threshold.
    if len(centers) > 1:
        clusters = fcluster(linkage(centers, method='average'), t=distance_threshold, criterion='distance')
    else:
        clusters = [1]

    # Bucket block indices by cluster id (insertion order preserved).
    buckets = {}
    for idx, cluster_id in enumerate(clusters):
        buckets.setdefault(cluster_id, []).append(idx)

    merged = []
    for indices in buckets.values():
        # Read blocks top-to-bottom within a bubble.
        ordered = sorted(indices, key=lambda i: np.mean(np.array(text_results[i][0])[:, 1]))

        points = []
        texts = []
        for idx in ordered:
            points.extend(text_results[idx][0])
            texts.append(text_results[idx][1])

        # One axis-aligned rectangle covering every point of the bubble.
        pts = np.array(points)
        x_min, y_min = int(pts[:, 0].min()), int(pts[:, 1].min())
        x_max, y_max = int(pts[:, 0].max()), int(pts[:, 1].max())
        bubble_bbox = [[x_min, y_min], [x_max, y_min], [x_max, y_max], [x_min, y_max]]

        # Join texts with spaces for a natural reading flow.
        merged.append((bubble_bbox, ' '.join(texts)))

    return merged
def detect_and_group_text(self, image: Union[Image.Image, np.ndarray], distance_threshold: float = 50, preprocess_mode: str = 'gentle') -> List[Tuple[List[List[int]], str]]:
    """
    Detects text and automatically groups it into speech bubbles.

    Args:
        image: PIL Image or numpy array.
        distance_threshold: Maximum grouping distance (in pixels).
        preprocess_mode: Preprocessing mode ('gentle', 'none', 'aggressive').

    Returns:
        List of (bbox, combined_text) tuples, one per speech bubble.
    """
    # Detect individual blocks first (paragraph=False for finer control),
    # then merge them by spatial proximity.
    raw = self.detect_text(image, paragraph=False, preprocess_mode=preprocess_mode)
    return self.group_text_into_bubbles(raw, distance_threshold)
src/pdf_handler.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz # PyMuPDF
|
| 2 |
+
from PIL import Image
|
| 3 |
+
import io
|
| 4 |
+
import os
|
| 5 |
+
from typing import List, Union
|
| 6 |
+
|
| 7 |
+
class PDFHandler:
    """Converts PDF pages to PIL images and writes images back to PDF."""

    def __init__(self):
        # Stateless utility; nothing to initialize.
        pass
def extract_images_from_pdf(self, pdf_path: str, zoom: int = 2, pages: List[int] = None) -> List[Image.Image]:
    """
    Converts each page of the PDF into a PIL Image.

    Args:
        pdf_path: Path to the source PDF file.
        zoom: Zoom factor for higher resolution (default 2 for better OCR).
        pages: Optional list of 0-indexed page numbers to extract. If None, extracts all.

    Returns:
        List of PIL Image objects.

    Raises:
        FileNotFoundError: If pdf_path does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    doc = fitz.open(pdf_path)
    try:
        # Zoom matrix renders pages at higher resolution for better OCR.
        mat = fitz.Matrix(zoom, zoom)

        # Determine which pages to process.
        if pages is None:
            page_indices = range(len(doc))
        else:
            # Silently drop out-of-range page numbers.
            page_indices = [p for p in pages if 0 <= p < len(doc)]

        images = []
        for page_num in page_indices:
            pix = doc.load_page(page_num).get_pixmap(matrix=mat)
            # Round-trip through PNG bytes to obtain a PIL Image.
            images.append(Image.open(io.BytesIO(pix.tobytes("png"))))
        return images
    finally:
        # BUGFIX: the document handle was previously leaked whenever page
        # rendering raised; close it unconditionally.
        doc.close()
def save_images_as_pdf(self, images: List[Image.Image], output_path: str):
    """
    Saves a list of PIL Images as a single PDF file.

    Args:
        images: List of PIL Image objects.
        output_path: Path where the new PDF should be saved.
    """
    if not images:
        print("No images to save.")
        return

    # Pillow's PDF writer cannot handle alpha channels. BUGFIX: previously
    # only 'RGBA' was converted, so 'LA'/'PA' images (and palette images
    # carrying transparency) crashed the save; convert all of them to RGB.
    pdf_images = []
    for img in images:
        if img.mode in ('RGBA', 'LA', 'PA') or (img.mode == 'P' and 'transparency' in img.info):
            img = img.convert('RGB')
        pdf_images.append(img)

    if pdf_images:
        pdf_images[0].save(
            output_path,
            save_all=True,
            append_images=pdf_images[1:],
            resolution=100.0,
            quality=95,
            optimize=True
        )
        print(f"PDF saved successfully at {output_path}")
src/translator.py
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from deep_translator import GoogleTranslator
|
| 2 |
+
import deepl
|
| 3 |
+
from openai import OpenAI
|
| 4 |
+
from typing import List, Union, Optional
|
| 5 |
+
import base64
|
| 6 |
+
import io
|
| 7 |
+
import json
|
| 8 |
+
from PIL import Image
|
| 9 |
+
|
| 10 |
+
class TranslatorService:
|
| 11 |
+
def __init__(self, source: str = 'en', target: str = 'de', service_type: str = 'google', api_key: Optional[str] = None):
|
| 12 |
+
"""
|
| 13 |
+
Initializes the Translator Service.
|
| 14 |
+
|
| 15 |
+
Args:
|
| 16 |
+
source: Source language code (default: 'en').
|
| 17 |
+
target: Target language code (default: 'de').
|
| 18 |
+
service_type: 'google', 'deepl', 'openai', or 'xai'.
|
| 19 |
+
api_key: API Key for DeepL, OpenAI or xAI.
|
| 20 |
+
"""
|
| 21 |
+
self.service_type = service_type
|
| 22 |
+
self.api_key = api_key
|
| 23 |
+
self.target = target
|
| 24 |
+
self.source = source
|
| 25 |
+
self.usage = {'input_tokens': 0, 'output_tokens': 0}
|
| 26 |
+
|
| 27 |
+
if self.service_type == 'deepl':
|
| 28 |
+
print("Using DeepL Translator")
|
| 29 |
+
if not self.api_key:
|
| 30 |
+
raise ValueError("DeepL API Key is required for DeepL service.")
|
| 31 |
+
self.translator = deepl.Translator(self.api_key)
|
| 32 |
+
|
| 33 |
+
elif self.service_type == 'openai':
|
| 34 |
+
print("Using OpenAI (GPT-4o-mini) Translator")
|
| 35 |
+
if not self.api_key:
|
| 36 |
+
raise ValueError("OpenAI API Key is required for OpenAI service.")
|
| 37 |
+
self.client = OpenAI(api_key=self.api_key)
|
| 38 |
+
|
| 39 |
+
elif self.service_type == 'xai':
|
| 40 |
+
print("Using xAI Grok Translator")
|
| 41 |
+
if not self.api_key:
|
| 42 |
+
raise ValueError("xAI API Key is required for Grok service.")
|
| 43 |
+
# xAI API is OpenAI-compatible
|
| 44 |
+
self.client = OpenAI(api_key=self.api_key, base_url="https://api.x.ai/v1")
|
| 45 |
+
|
| 46 |
+
else:
|
| 47 |
+
print("Using Google Translator (deep-translator)")
|
| 48 |
+
self.translator = GoogleTranslator(source=source, target=target)
|
| 49 |
+
|
| 50 |
+
def get_usage_stats(self):
|
| 51 |
+
"""Returns accumulated token usage."""
|
| 52 |
+
return self.usage
|
| 53 |
+
|
| 54 |
+
def get_cost_estimate(self):
|
| 55 |
+
"""
|
| 56 |
+
Returns estimated cost in USD based on GPT-4o-mini pricing.
|
| 57 |
+
Input: $0.15 / 1M tokens
|
| 58 |
+
Output: $0.60 / 1M tokens
|
| 59 |
+
"""
|
| 60 |
+
input_cost = (self.usage['input_tokens'] / 1_000_000) * 0.15
|
| 61 |
+
output_cost = (self.usage['output_tokens'] / 1_000_000) * 0.60
|
| 62 |
+
return input_cost + output_cost
|
| 63 |
+
|
| 64 |
+
def validate_api_key(self) -> None:
|
| 65 |
+
"""Performs a lightweight test call to validate the configured API key.
|
| 66 |
+
|
| 67 |
+
Raises:
|
| 68 |
+
Exception: If the key is invalid or the provider returns an auth error.
|
| 69 |
+
"""
|
| 70 |
+
# Google (deep-translator) does not use an API key
|
| 71 |
+
if self.service_type not in ['deepl', 'openai', 'xai']:
|
| 72 |
+
return
|
| 73 |
+
|
| 74 |
+
if self.service_type == 'deepl':
|
| 75 |
+
# Minimal ping using the official client
|
| 76 |
+
try:
|
| 77 |
+
# This will raise an exception on invalid auth
|
| 78 |
+
_ = self.translator.get_usage()
|
| 79 |
+
except Exception as e:
|
| 80 |
+
raise Exception(f"DeepL API key seems invalid or not authorized: {e}")
|
| 81 |
+
return
|
| 82 |
+
|
| 83 |
+
# OpenAI / xAI
|
| 84 |
+
try:
|
| 85 |
+
model = "gpt-4o-mini" if self.service_type == 'openai' else "grok-4-mini"
|
| 86 |
+
# Very small test prompt to minimize cost
|
| 87 |
+
response = self.client.chat.completions.create(
|
| 88 |
+
model=model,
|
| 89 |
+
messages=[
|
| 90 |
+
{"role": "user", "content": "test"}
|
| 91 |
+
],
|
| 92 |
+
max_tokens=1,
|
| 93 |
+
temperature=0.0,
|
| 94 |
+
)
|
| 95 |
+
# If we get here without exception, we assume the key works.
|
| 96 |
+
if response.usage:
|
| 97 |
+
self.usage['input_tokens'] += response.usage.prompt_tokens
|
| 98 |
+
self.usage['output_tokens'] += response.usage.completion_tokens
|
| 99 |
+
except Exception as e:
|
| 100 |
+
raise Exception(f"{self.service_type.capitalize()} API key seems invalid or the service is not reachable: {e}")
|
| 101 |
+
|
| 102 |
+
def translate_image_with_vision(self, image: Image.Image) -> List[dict]:
|
| 103 |
+
"""
|
| 104 |
+
Uses VLM (Vision Language Model) to detect and translate text directly from image.
|
| 105 |
+
Returns list of dicts: {'bbox': [x1, y1, x2, y2], 'original': str, 'translated': str}
|
| 106 |
+
"""
|
| 107 |
+
if self.service_type not in ['openai', 'xai']:
|
| 108 |
+
raise ValueError("Vision features only supported for OpenAI and xAI services.")
|
| 109 |
+
|
| 110 |
+
# 1. Letterbox the image to be square (helps with coordinate accuracy)
|
| 111 |
+
old_width, old_height = image.size
|
| 112 |
+
new_size = max(old_width, old_height)
|
| 113 |
+
square_img = Image.new("RGB", (new_size, new_size), (255, 255, 255))
|
| 114 |
+
|
| 115 |
+
# Paste original image centered or top-left? Top-left is easier for coord math.
|
| 116 |
+
square_img.paste(image, (0, 0))
|
| 117 |
+
|
| 118 |
+
# Convert to base64
|
| 119 |
+
buffered = io.BytesIO()
|
| 120 |
+
square_img.save(buffered, format="JPEG")
|
| 121 |
+
img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
| 122 |
+
img_url = f"data:image/jpeg;base64,{img_str}"
|
| 123 |
+
|
| 124 |
+
model = "gpt-4o-mini" if self.service_type == 'openai' else "grok-4-latest"
|
| 125 |
+
|
| 126 |
+
prompt = f"""
|
| 127 |
+
You are a Manga Translator Agent.
|
| 128 |
+
Look at this manga page. Identify all speech bubbles and text boxes.
|
| 129 |
+
For each text region:
|
| 130 |
+
1. Extract the English text.
|
| 131 |
+
2. Translate it to German.
|
| 132 |
+
3. Estimate the bounding box as [ymin, xmin, ymax, xmax] using a 0-1000 normalized scale based on this square image.
|
| 133 |
+
- (0,0) is top-left corner.
|
| 134 |
+
- (1000,1000) is bottom-right corner.
|
| 135 |
+
- Be extremely precise with the coordinates.
|
| 136 |
+
- The image might have white padding on the right or bottom, ignore that area.
|
| 137 |
+
|
| 138 |
+
Return ONLY a valid JSON array with this structure:
|
| 139 |
+
[
|
| 140 |
+
{{
|
| 141 |
+
"original": "English text",
|
| 142 |
+
"translated": "German translation",
|
| 143 |
+
"bbox": [ymin, xmin, ymax, xmax]
|
| 144 |
+
}}
|
| 145 |
+
]
|
| 146 |
+
Do not use markdown code blocks. Return raw JSON only.
|
| 147 |
+
"""
|
| 148 |
+
|
| 149 |
+
try:
|
| 150 |
+
response = self.client.chat.completions.create(
|
| 151 |
+
model=model,
|
| 152 |
+
messages=[
|
| 153 |
+
{
|
| 154 |
+
"role": "user",
|
| 155 |
+
"content": [
|
| 156 |
+
{"type": "text", "text": prompt},
|
| 157 |
+
{
|
| 158 |
+
"type": "image_url",
|
| 159 |
+
"image_url": {"url": img_url}
|
| 160 |
+
}
|
| 161 |
+
],
|
| 162 |
+
}
|
| 163 |
+
],
|
| 164 |
+
max_tokens=2000,
|
| 165 |
+
temperature=0.1
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
# Track usage
|
| 169 |
+
if response.usage:
|
| 170 |
+
self.usage['input_tokens'] += response.usage.prompt_tokens
|
| 171 |
+
self.usage['output_tokens'] += response.usage.completion_tokens
|
| 172 |
+
|
| 173 |
+
content = response.choices[0].message.content.strip()
|
| 174 |
+
# Cleanup markdown if present
|
| 175 |
+
if content.startswith("```json"):
|
| 176 |
+
content = content[7:]
|
| 177 |
+
if content.endswith("```"):
|
| 178 |
+
content = content[:-3]
|
| 179 |
+
|
| 180 |
+
data = json.loads(content.strip())
|
| 181 |
+
|
| 182 |
+
results = []
|
| 183 |
+
for item in data:
|
| 184 |
+
ymin, xmin, ymax, xmax = item['bbox']
|
| 185 |
+
|
| 186 |
+
# Clamp values 0-1000
|
| 187 |
+
ymin = max(0, min(1000, ymin))
|
| 188 |
+
xmin = max(0, min(1000, xmin))
|
| 189 |
+
ymax = max(0, min(1000, ymax))
|
| 190 |
+
xmax = max(0, min(1000, xmax))
|
| 191 |
+
|
| 192 |
+
# Convert from 0-1000 scale relative to the SQUARE image
|
| 193 |
+
abs_x_min = int((xmin / 1000) * new_size)
|
| 194 |
+
abs_y_min = int((ymin / 1000) * new_size)
|
| 195 |
+
abs_x_max = int((xmax / 1000) * new_size)
|
| 196 |
+
abs_y_max = int((ymax / 1000) * new_size)
|
| 197 |
+
|
| 198 |
+
# Clip to original image dimensions (remove padding area results)
|
| 199 |
+
abs_x_min = min(abs_x_min, old_width)
|
| 200 |
+
abs_y_min = min(abs_y_min, old_height)
|
| 201 |
+
abs_x_max = min(abs_x_max, old_width)
|
| 202 |
+
abs_y_max = min(abs_y_max, old_height)
|
| 203 |
+
|
| 204 |
+
# Ensure valid box
|
| 205 |
+
if abs_x_max > abs_x_min and abs_y_max > abs_y_min:
|
| 206 |
+
bbox_points = [
|
| 207 |
+
[abs_x_min, abs_y_min], # Top-Left
|
| 208 |
+
[abs_x_max, abs_y_min], # Top-Right
|
| 209 |
+
[abs_x_max, abs_y_max], # Bottom-Right
|
| 210 |
+
[abs_x_min, abs_y_max] # Bottom-Left
|
| 211 |
+
]
|
| 212 |
+
|
| 213 |
+
results.append({
|
| 214 |
+
'bbox': bbox_points,
|
| 215 |
+
'original': item.get('original', ''),
|
| 216 |
+
'translated': item.get('translated', '')
|
| 217 |
+
})
|
| 218 |
+
|
| 219 |
+
return results
|
| 220 |
+
|
| 221 |
+
except Exception as e:
|
| 222 |
+
print(f"Vision translation error: {e}")
|
| 223 |
+
return []
|
| 224 |
+
|
| 225 |
+
def translate_text(self, text: str) -> str:
    """Translate a single string into the configured target language.

    Args:
        text: Source text to translate.

    Returns:
        The translated string, "" for blank input, or the original text
        unchanged if the translation backend raises an error.
    """
    if not text.strip():
        return ""

    try:
        if self.service_type == 'deepl':
            # DeepL auto-detects the source language when source_lang is None.
            translated = self.translator.translate_text(
                text, source_lang=None, target_lang=self.target
            )
            return translated.text

        if self.service_type in ('openai', 'xai'):
            # Pick the chat model matching the configured provider.
            chosen_model = "gpt-4o-mini" if self.service_type == 'openai' else "grok-4-latest"
            system_prompt = (
                f"You are a professional manga translator. Translate the following text "
                f"from {self.source} to {self.target}. Keep the translation natural and "
                f"fitting for a comic/manga context. Ensure correct handling of German "
                f"special characters like ä, ö, ü, ß. Only return the translated text, "
                f"nothing else."
            )

            response = self.client.chat.completions.create(
                model=chosen_model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": text},
                ],
                temperature=0.3,
            )

            # Accumulate token usage for cost reporting.
            usage = response.usage
            if usage:
                self.usage['input_tokens'] += usage.prompt_tokens
                self.usage['output_tokens'] += usage.completion_tokens

            return response.choices[0].message.content.strip()

        # Any other backend exposes a plain translate() method.
        return self.translator.translate(text)
    except Exception as e:
        # Best-effort: never crash the pipeline; fall back to the source text.
        print(f"Translation error: {e}")
        return text
|
| 263 |
+
|
| 264 |
+
def translate_batch(self, texts: List[str]) -> List[str]:
    """Translate a list of strings, batching in one request where possible.

    Args:
        texts: Source strings to translate.

    Returns:
        Translations in the same order as ``texts``.  On any service error
        the original list is returned unchanged (best-effort).
    """
    import re  # local import: only needed for parsing the LLM response

    if not texts:
        return []

    try:
        if self.service_type == 'deepl':
            # DeepL accepts a list directly; source language is auto-detected.
            results = self.translator.translate_text(texts, source_lang=None, target_lang=self.target)
            return [r.text for r in results]

        elif self.service_type in ['openai', 'xai']:
            # Select model based on service
            model = "gpt-4o-mini" if self.service_type == 'openai' else "grok-4-latest"

            # OpenAI/xAI batch approach: send one numbered list, parse it back.
            formatted_text = "\n".join([f"{i+1}. {t}" for i, t in enumerate(texts)])
            prompt = f"Translate the following numbered lines from {self.source} to {self.target}. Return them as a numbered list with the same indices.\n\n{formatted_text}"

            response = self.client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": f"You are a professional manga translator. Translate the text from {self.source} to {self.target}. Return ONLY the numbered list of translations."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3
            )

            # Track usage
            if response.usage:
                self.usage['input_tokens'] += response.usage.prompt_tokens
                self.usage['output_tokens'] += response.usage.completion_tokens

            content = response.choices[0].message.content.strip()

            # Parse the numbered list back into plain strings.  Only a
            # leading "<digits>. " / "<digits>) " marker is stripped, so a
            # translation that merely *contains* ". " stays intact (the
            # previous ``'. ' in line`` split silently truncated such lines
            # without tripping the count check below).  Blank separator
            # lines are skipped instead of producing empty entries that
            # would needlessly trigger the per-item fallback.
            translated_lines = []
            for line in content.split('\n'):
                if not line.strip():
                    continue
                translated_lines.append(re.sub(r'^\s*\d+[.)]\s*', '', line))

            # Fallback if counts don't match (model dropped or merged
            # lines): translate each string individually instead.
            if len(translated_lines) != len(texts):
                return [self.translate_text(t) for t in texts]

            return translated_lines

        else:
            return self.translator.translate_batch(texts)
    except Exception as e:
        # Best-effort: never crash the pipeline; return the source texts.
        print(f"Batch translation error: {e}")
        return texts
|
src/ui_state.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Utilities for managing UI states across the Streamlit app."""
|
| 2 |
+
|
| 3 |
+
from typing import Any
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def should_display_thumbnails(translation_in_progress: bool) -> bool:
    """Decide whether the page thumbnail strip should be rendered.

    Thumbnails are hidden while a translation run is active and shown
    otherwise.

    Args:
        translation_in_progress: True while the translation workflow runs.

    Returns:
        True when thumbnails should be shown, otherwise False.

    Raises:
        ValueError: If ``translation_in_progress`` is not a bool.
    """
    if isinstance(translation_in_progress, bool):
        return not translation_in_progress
    raise ValueError("translation_in_progress must be a boolean")
|
tests/test_ui_state.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
+
from src.ui_state import should_display_thumbnails
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def test_should_display_thumbnails_returns_true_when_not_in_progress():
    """Idle state: the thumbnail strip must be visible."""
    result = should_display_thumbnails(False)
    assert result is True
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def test_should_display_thumbnails_returns_false_during_progress():
    """Active translation: the thumbnail strip must be hidden."""
    result = should_display_thumbnails(True)
    assert result is False
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def test_should_display_thumbnails_rejects_non_boolean_input():
    """Non-bool state values indicate a bug; the helper must fail loudly."""
    pytest.raises(ValueError, should_display_thumbnails, "yes")
|