Ruhivig65 committed on
Commit
6cc0372
·
verified ·
1 Parent(s): bd06613

Upload 6 files

Files changed (6)
  1. Dockerfile +26 -0
  2. README.md +7 -5
  3. app.py +486 -0
  4. requirements.txt +7 -0
  5. task_manager.py +439 -0
  6. translator_engine.py +740 -0
Dockerfile ADDED
@@ -0,0 +1,26 @@
+ FROM python:3.10.14-slim
+
+ # Create non-root user (HuggingFace requirement)
+ RUN useradd -m -u 1000 appuser
+
+ WORKDIR /app
+
+ # Install dependencies first (Docker cache optimization)
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy all application code
+ COPY . .
+
+ # Create necessary directories with proper permissions
+ RUN mkdir -p uploads outputs && \
+     chown -R appuser:appuser /app
+
+ # Switch to non-root user
+ USER appuser
+
+ # HuggingFace Spaces expects port 7860
+ EXPOSE 7860
+
+ # Launch command
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,12 @@
  ---
- title: Novelt
- emoji: 🏆
- colorFrom: gray
- colorTo: red
+ title: Massive Text Translator EN-HI
+ emoji: 📚
+ colorFrom: blue
+ colorTo: green
  sdk: docker
+ app_port: 7860
  pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Massive English to Hindi Text Translator
+ Upload massive .txt files (1000+ chapters, 2-3 million words) and get translated .txt back.
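For reference, the upload, poll, and download flow this Space implements can be driven from a script as well as from the browser UI. A minimal client sketch, assuming the Space is reachable at http://localhost:7860 and using the routes defined in app.py below; the requests package and the my_novel.txt path are illustrative assumptions, not part of this commit:

import time
import requests  # assumed extra dependency, not listed in requirements.txt

BASE = "http://localhost:7860"  # assumed local deployment

# 1. Upload a .txt file; the server responds 202 with a task_id
with open("my_novel.txt", "rb") as fh:
    resp = requests.post(f"{BASE}/upload", files={"file": ("my_novel.txt", fh, "text/plain")})
resp.raise_for_status()
task_id = resp.json()["task_id"]

# 2. Poll progress every 2 seconds until the task reaches a terminal state
while True:
    status = requests.get(f"{BASE}/progress/{task_id}").json()
    print(status["status"], status["progress"]["percent_complete"], "%")
    if status["status"] in ("completed", "failed", "cancelled"):
        break
    time.sleep(2)

# 3. Download the translated file (streamed by the server)
if status["status"] == "completed":
    with requests.get(f"{BASE}/download/{task_id}", stream=True) as dl:
        dl.raise_for_status()
        with open("my_novel_hindi.txt", "wb") as out:
            for chunk in dl.iter_content(chunk_size=1 << 20):
                out.write(chunk)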
app.py ADDED
@@ -0,0 +1,486 @@
+ """
+ app.py
+ ======
+ FastAPI Web Application for Massive Text Translation (EN → HI)
+
+ HUGGINGFACE SPACES DEPLOYMENT:
+ -------------------------------
+ - This is the main entry point
+ - HuggingFace Spaces Docker SDK runs: uvicorn app:app --host 0.0.0.0 --port 7860
+ - The app serves both the API and the frontend HTML
+
+ ROUTES:
+ -------
+ GET  /               → Frontend UI (HTML page)
+ POST /upload         → Upload a .txt file and start translation
+ GET  /progress       → Get real-time progress of the current/latest task
+ GET  /progress/{id}  → Get progress of a specific task by ID
+ GET  /download/{id}  → Download the translated file
+ POST /cancel/{id}    → Cancel a running translation
+ GET  /health         → Health check endpoint
+
+ FLOW:
+ -----
+ 1. User opens / → sees the upload UI
+ 2. User uploads a .txt file → POST /upload
+ 3. Server saves the file, creates a task, starts a background thread
+ 4. Frontend polls GET /progress every 2 seconds
+ 5. When status="completed", the download button appears
+ 6. User clicks download → GET /download/{task_id}
+ 7. Server streams the translated .txt file to the browser
+ """
+
+ import os
+ import logging
+ import shutil
+ import uuid
+ from typing import Optional
+
+ from fastapi import FastAPI, File, UploadFile, HTTPException, Request
+ from fastapi.responses import (
+     HTMLResponse,
+     JSONResponse,
+     FileResponse,
+     StreamingResponse,
+ )
+ from fastapi.staticfiles import StaticFiles
+ from fastapi.templating import Jinja2Templates
+
+ from task_manager import (
+     get_task_manager,
+     TaskStatus,
+     UPLOAD_DIR,
+     OUTPUT_DIR,
+ )
+
+ # ============================================================================
+ # LOGGING
+ # ============================================================================
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
+ )
+ logger = logging.getLogger("app")
+
+ # ============================================================================
+ # FASTAPI APP INITIALIZATION
+ # ============================================================================
+ app = FastAPI(
+     title="Massive Text Translator (EN → HI)",
+     description="Translate massive .txt files from English to Hindi using free Google Translate",
+     version="1.0.0",
+ )
+
+ # ============================================================================
+ # TEMPLATES SETUP
+ # ============================================================================
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+ TEMPLATES_DIR = os.path.join(BASE_DIR, "templates")
+ os.makedirs(TEMPLATES_DIR, exist_ok=True)
+
+ templates = Jinja2Templates(directory=TEMPLATES_DIR)
+
+
+ # ============================================================================
+ # STARTUP EVENT
+ # ============================================================================
+ @app.on_event("startup")
+ async def startup_event():
+     """
+     Called when the FastAPI server starts.
+     - Ensures directories exist
+     - Initializes the TaskManager
+     - Cleans up any leftover files from previous runs
+     """
+     os.makedirs(UPLOAD_DIR, exist_ok=True)
+     os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+     # Initialize the global task manager
+     tm = get_task_manager()
+     logger.info("=" * 60)
+     logger.info("Massive Text Translator started successfully!")
+     logger.info(f"Upload dir: {UPLOAD_DIR}")
+     logger.info(f"Output dir: {OUTPUT_DIR}")
+     logger.info("=" * 60)
+
+
+ # ============================================================================
+ # ROUTE: HOME PAGE (Frontend UI)
+ # ============================================================================
+ @app.get("/", response_class=HTMLResponse)
+ async def home(request: Request):
+     """
+     Serve the main frontend page.
+     The HTML template handles:
+     - File upload form
+     - Real-time progress display
+     - Download button when complete
+     """
+     return templates.TemplateResponse("index.html", {"request": request})
+
+
+ # ============================================================================
+ # ROUTE: FILE UPLOAD
+ # ============================================================================
+ @app.post("/upload")
+ async def upload_file(file: UploadFile = File(...)):
+     """
+     Handle .txt file upload and start translation.
+
+     FLOW:
+     1. Validate the file (must be .txt)
+     2. Save it to the uploads/ directory
+     3. Create a translation task
+     4. Start translation in a background thread
+     5. Return task_id for progress polling
+
+     ERROR SCENARIOS:
+     - File is not .txt → 400 error
+     - Another translation running → 409 Conflict
+     - Disk full → 500 error (caught by the generic handler)
+     - File too large → handled by streaming save (won't OOM)
+     """
+     tm = get_task_manager()
+
+     # --- Validation ---
+     if not file.filename:
+         raise HTTPException(status_code=400, detail="No file provided.")
+
+     if not file.filename.lower().endswith(".txt"):
+         raise HTTPException(
+             status_code=400,
+             detail="Only .txt files are supported. Please upload a plain text file.",
+         )
+
+     # --- Check if busy ---
+     if tm.is_busy():
+         raise HTTPException(
+             status_code=409,
+             detail="A translation is already in progress. Please wait for it to "
+                    "complete or cancel it before uploading a new file.",
+         )
+
+     # --- Cleanup old tasks before accepting a new one ---
+     tm.cleanup_old_tasks()
+
+     # --- Save uploaded file to disk (streaming - no full file in RAM) ---
+     file_id = str(uuid.uuid4())[:8]
+     safe_filename = f"{file_id}_{file.filename}"
+     input_path = os.path.join(UPLOAD_DIR, safe_filename)
+
+     try:
+         # Stream the file to disk in chunks (handles files larger than RAM)
+         # IMPORTANT: We read in 1MB chunks, not the whole file at once
+         with open(input_path, "wb") as f:
+             while True:
+                 chunk = await file.read(1024 * 1024)  # 1MB chunks
+                 if not chunk:
+                     break
+                 f.write(chunk)
+
+         file_size = os.path.getsize(input_path)
+         logger.info(
+             f"File uploaded: {file.filename} → {input_path} "
+             f"({file_size / 1024 / 1024:.1f} MB)"
+         )
+
+     except Exception as e:
+         # Clean up the partial upload
+         if os.path.exists(input_path):
+             os.remove(input_path)
+         logger.exception(f"File upload failed: {e}")
+         raise HTTPException(
+             status_code=500,
+             detail=f"Failed to save uploaded file: {str(e)}",
+         )
+
+     # --- Create and start the translation task ---
+     try:
+         task = tm.create_task(
+             original_filename=file.filename,
+             input_path=input_path,
+         )
+         tm.start_task(task.task_id)
+
+         return JSONResponse(
+             status_code=202,  # 202 Accepted (processing started)
+             content={
+                 "message": "Translation started!",
+                 "task_id": task.task_id,
+                 "filename": file.filename,
+                 "file_size_mb": round(file_size / 1024 / 1024, 2),
+             },
+         )
+
+     except RuntimeError as e:
+         # Another task started between our check and create (race condition)
+         if os.path.exists(input_path):
+             os.remove(input_path)
+         raise HTTPException(status_code=409, detail=str(e))
+
+     except Exception as e:
+         if os.path.exists(input_path):
+             os.remove(input_path)
+         logger.exception(f"Task creation failed: {e}")
+         raise HTTPException(
+             status_code=500,
+             detail=f"Failed to start translation: {str(e)}",
+         )
+
+
+ # ============================================================================
+ # ROUTE: PROGRESS POLLING (Latest Task)
+ # ============================================================================
+ @app.get("/progress")
+ async def get_progress():
+     """
+     Get the progress of the latest/current translation task.
+
+     The frontend polls this endpoint every 2 seconds.
+
+     Returns:
+         {
+             "task_id": "a1b2c3d4",
+             "original_filename": "my_novel.txt",
+             "status": "translating",
+             "progress": {
+                 "total_paragraphs": 50000,
+                 "translated_paragraphs": 15000,
+                 "failed_paragraphs": 3,
+                 "percent_complete": 30.0,
+                 "elapsed_seconds": 1800.5,
+                 "speed_per_second": 8.33,
+                 "eta_seconds": 4200.0,
+                 ...
+             },
+             ...
+         }
+     """
+     tm = get_task_manager()
+     progress = tm.get_latest_task_progress()
+
+     if progress is None:
+         return JSONResponse(
+             content={
+                 "status": "idle",
+                 "message": "No translation tasks. Upload a .txt file to start.",
+             }
+         )
+
+     return JSONResponse(content=progress)
+
+
+ # ============================================================================
+ # ROUTE: PROGRESS POLLING (Specific Task)
+ # ============================================================================
+ @app.get("/progress/{task_id}")
+ async def get_task_progress(task_id: str):
+     """
+     Get progress of a specific task by ID.
+     Useful if you have the task_id from the upload response.
+     """
+     tm = get_task_manager()
+     progress = tm.get_task_progress(task_id)
+
+     if progress is None:
+         raise HTTPException(
+             status_code=404,
+             detail=f"Task not found: {task_id}",
+         )
+
+     return JSONResponse(content=progress)
+
+
+ # ============================================================================
+ # ROUTE: DOWNLOAD TRANSLATED FILE
+ # ============================================================================
+ @app.get("/download/{task_id}")
+ async def download_file(task_id: str):
+     """
+     Download the translated .txt file.
+
+     Only works for COMPLETED tasks.
+     Streams the file to the browser (doesn't load the entire file in RAM).
+
+     SECURITY:
+     - Only serves files from the outputs/ directory
+     - Validates that the task_id exists in our registry
+     - Prevents path traversal attacks
+     """
+     tm = get_task_manager()
+     task = tm.get_task(task_id)
+
+     if task is None:
+         raise HTTPException(status_code=404, detail=f"Task not found: {task_id}")
+
+     if task.status != TaskStatus.COMPLETED:
+         raise HTTPException(
+             status_code=400,
+             detail=f"Task is not completed yet. Current status: {task.status.value}",
+         )
+
+     output_path = tm.get_output_path(task_id)
+     if output_path is None or not os.path.exists(output_path):
+         raise HTTPException(
+             status_code=404,
+             detail="Translated file not found on disk. It may have been cleaned up.",
+         )
+
+     # Generate a user-friendly download filename
+     original_name = os.path.splitext(task.original_filename)[0]
+     download_filename = f"{original_name}_Hindi_Translated.txt"
+
+     logger.info(f"Serving download: {output_path} as '{download_filename}'")
+
+     # Use FileResponse - it streams the file, doesn't load it into RAM
+     return FileResponse(
+         path=output_path,
+         filename=download_filename,
+         media_type="text/plain; charset=utf-8",
+     )
+
+
+ # ============================================================================
+ # ROUTE: CANCEL TRANSLATION
+ # ============================================================================
+ @app.post("/cancel/{task_id}")
+ async def cancel_translation(task_id: str):
+     """
+     Cancel a running translation.
+
+     The translator will stop after completing its current paragraph.
+     Any already-translated content is preserved in the output file.
+     """
+     tm = get_task_manager()
+
+     success = tm.cancel_task(task_id)
+     if not success:
+         task = tm.get_task(task_id)
+         if task is None:
+             raise HTTPException(status_code=404, detail=f"Task not found: {task_id}")
+         raise HTTPException(
+             status_code=400,
+             detail=f"Cannot cancel task in '{task.status.value}' state.",
+         )
+
+     return JSONResponse(
+         content={
+             "message": "Translation cancelled. Partial output preserved on disk.",
+             "task_id": task_id,
+         }
+     )
+
+
+ # ============================================================================
+ # ROUTE: CANCEL LATEST TASK (convenience for simple UI)
+ # ============================================================================
+ @app.post("/cancel")
+ async def cancel_latest():
+     """Cancel the most recent/active translation task."""
+     tm = get_task_manager()
+     task = tm.get_latest_task()
+
+     if task is None:
+         raise HTTPException(status_code=404, detail="No active tasks to cancel.")
+
+     success = tm.cancel_task(task.task_id)
+     if not success:
+         raise HTTPException(
+             status_code=400,
+             detail=f"Cannot cancel task in '{task.status.value}' state.",
+         )
+
+     return JSONResponse(
+         content={
+             "message": "Translation cancelled.",
+             "task_id": task.task_id,
+         }
+     )
+
+
+ # ============================================================================
+ # ROUTE: HEALTH CHECK
+ # ============================================================================
+ @app.get("/health")
+ async def health_check():
+     """
+     Health check endpoint.
+     HuggingFace Spaces uses this to verify the app is running.
+     Also useful for monitoring.
+     """
+     tm = get_task_manager()
+
+     return JSONResponse(
+         content={
+             "status": "healthy",
+             "server": "Massive Text Translator EN→HI",
+             "is_busy": tm.is_busy(),
+             "upload_dir": UPLOAD_DIR,
+             "output_dir": OUTPUT_DIR,
+             "upload_dir_exists": os.path.exists(UPLOAD_DIR),
+             "output_dir_exists": os.path.exists(OUTPUT_DIR),
+         }
+     )
+
+
+ # ============================================================================
+ # ROUTE: GET ALL TASKS (debug/admin)
+ # ============================================================================
+ @app.get("/tasks")
+ async def list_tasks():
+     """
+     List all tasks (for debugging).
+     Shows all active, completed, and failed tasks.
+     """
+     tm = get_task_manager()
+     tasks = []
+
+     with tm._lock:
+         for task_id, task in tm._tasks.items():
+             tasks.append(task.to_dict())
+
+     # Sort by created_at descending (newest first)
+     tasks.sort(key=lambda t: t.get("created_at", 0), reverse=True)
+
+     return JSONResponse(content={"tasks": tasks, "total": len(tasks)})
+
+
+ # ============================================================================
+ # GLOBAL EXCEPTION HANDLER
+ # ============================================================================
+ @app.exception_handler(Exception)
+ async def global_exception_handler(request: Request, exc: Exception):
+     """
+     Catch-all exception handler.
+     Logs the error and returns a clean JSON response.
+     Prevents ugly stack traces from leaking to the user.
+     """
+     logger.exception(f"Unhandled exception on {request.url}: {exc}")
+     return JSONResponse(
+         status_code=500,
+         content={
+             "detail": "An internal server error occurred. Please try again.",
+             "error": str(exc),
+         },
+     )
+
+
+ # ============================================================================
+ # MAIN - Direct execution (for local testing)
+ # ============================================================================
+ if __name__ == "__main__":
+     import uvicorn
+
+     print("=" * 60)
+     print("Starting Massive Text Translator...")
+     print("Open http://localhost:7860 in your browser")
+     print("=" * 60)
+
+     uvicorn.run(
+         "app:app",
+         host="0.0.0.0",
+         port=7860,
+         reload=False,   # No reload in production
+         workers=1,      # Single worker - we manage threads ourselves
+         log_level="info",
+     )
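Because app exposes plain FastAPI routes, the contract above can be smoke-tested in-process without starting uvicorn. A minimal sketch, assuming it runs from the repository root with the pinned dependencies installed plus httpx (which Starlette's TestClient needs); neither assumption is part of this commit:

from fastapi.testclient import TestClient

from app import app  # the FastAPI instance defined above

client = TestClient(app)

# /health should answer 200 with the fields shown in health_check()
resp = client.get("/health")
assert resp.status_code == 200
assert resp.json()["status"] == "healthy"

# Non-.txt uploads are rejected with 400 before any task is created
resp = client.post("/upload", files={"file": ("book.pdf", b"%PDF-1.4", "application/pdf")})
assert resp.status_code == 400

# With no tasks created yet, /progress reports the idle state
assert client.get("/progress").json()["status"] == "idle"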
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ fastapi==0.110.0
+ uvicorn==0.29.0
+ deep-translator==1.11.4
+ tenacity==8.2.3
+ python-multipart==0.0.9
+ jinja2==3.1.3
+ aiofiles==23.2.1
task_manager.py ADDED
@@ -0,0 +1,439 @@
+ """
+ task_manager.py
+ ===============
+ Background Task Manager for Massive File Translation
+
+ RESPONSIBILITIES:
+ -----------------
+ 1. Manages the lifecycle of translation tasks (create, run, track, cleanup)
+ 2. Runs translation in a background thread (non-blocking to FastAPI)
+ 3. Maintains a registry of all tasks with their progress
+ 4. Handles file path management (uploads, outputs)
+ 5. Supports single-task mode (one translation at a time on the HuggingFace free tier)
+
+ WHY SINGLE TASK MODE:
+ ---------------------
+ On HuggingFace Spaces (free tier):
+ - Shared IP = easier to get rate-limited by Google
+ - Limited CPU/RAM compared to a dedicated server
+ - Running 2+ massive translations simultaneously would guarantee an IP ban
+ - So we queue: one active translation, others wait
+
+ ARCHITECTURE:
+ -------------
+ FastAPI Request → TaskManager.create_task()
+         ↓
+ Background Thread starts
+         ↓
+ MassiveFileTranslator.translate_file()
+         ↓
+ Progress updated in real-time
+         ↓
+ Task marked "completed"
+         ↓
+ FastAPI serves download link
+ """
+
+ import os
+ import uuid
+ import time
+ import shutil
+ import threading
+ import logging
+ from typing import Optional, Dict
+ from dataclasses import dataclass, field
+ from enum import Enum
+
+ from translator_engine import (
+     MassiveFileTranslator,
+     TranslatorConfig,
+     TranslationProgress,
+ )
+
+ logger = logging.getLogger("task_manager")
+
+
+ # ============================================================================
+ # DIRECTORY SETUP
+ # ============================================================================
+ # Base directories - created at module load time
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+ UPLOAD_DIR = os.path.join(BASE_DIR, "uploads")
+ OUTPUT_DIR = os.path.join(BASE_DIR, "outputs")
+
+ # Ensure directories exist
+ os.makedirs(UPLOAD_DIR, exist_ok=True)
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+
+ # ============================================================================
+ # TASK STATUS ENUM
+ # ============================================================================
+ class TaskStatus(str, Enum):
+     QUEUED = "queued"
+     PREPARING = "preparing"
+     TRANSLATING = "translating"
+     COMPLETED = "completed"
+     FAILED = "failed"
+     CANCELLED = "cancelled"
+
+
+ # ============================================================================
+ # SINGLE TASK REPRESENTATION
+ # ============================================================================
+ @dataclass
+ class TranslationTask:
+     """
+     Represents one translation job.
+
+     Lifecycle:
+         QUEUED → PREPARING → TRANSLATING → COMPLETED
+                                          → FAILED
+                                          → CANCELLED
+     """
+     task_id: str
+     original_filename: str
+     input_path: str
+     output_path: str
+     status: TaskStatus = TaskStatus.QUEUED
+     created_at: float = field(default_factory=time.time)
+     completed_at: float = 0.0
+     error_message: str = ""
+     progress: TranslationProgress = field(default_factory=TranslationProgress)
+     translator: Optional[MassiveFileTranslator] = field(default=None, repr=False)
+     thread: Optional[threading.Thread] = field(default=None, repr=False)
+
+     def to_dict(self) -> dict:
+         """Serialize task info for API responses."""
+         return {
+             "task_id": self.task_id,
+             "original_filename": self.original_filename,
+             "status": self.status.value,
+             "created_at": self.created_at,
+             "completed_at": self.completed_at,
+             "error_message": self.error_message,
+             "progress": self.progress.to_dict(),
+             "output_filename": os.path.basename(self.output_path),
+         }
+
+
+ # ============================================================================
+ # TASK MANAGER - Singleton that manages all translation tasks
+ # ============================================================================
+ class TaskManager:
+     """
+     Central manager for all translation tasks.
+
+     THREAD SAFETY:
+     - Uses a lock for task registry modifications
+     - Each task runs in its own background thread
+     - Progress objects are internally thread-safe (they have their own locks)
+
+     SINGLE TASK ENFORCEMENT:
+     - Only one translation can be ACTIVE at a time
+     - If a new upload comes in while one is running, it returns an error
+     - This prevents Google IP bans from concurrent heavy usage
+
+     FILE CLEANUP:
+     - Old completed tasks' files are cleaned up after a configurable time
+     - Prevents disk space exhaustion on HuggingFace (limited storage)
+     """
+
+     def __init__(self, config: Optional[TranslatorConfig] = None):
+         self.config = config or TranslatorConfig()
+         self._tasks: Dict[str, TranslationTask] = {}
+         # RLock, not Lock: create_task() calls is_busy() while already
+         # holding this lock, which would deadlock with a plain Lock
+         self._lock = threading.RLock()
+         self._active_task_id: Optional[str] = None
+
+         # Auto-cleanup interval (seconds) - remove completed tasks after 1 hour
+         self.cleanup_after_seconds = 3600
+
+         logger.info("TaskManager initialized.")
+         logger.info(f"Upload directory: {UPLOAD_DIR}")
+         logger.info(f"Output directory: {OUTPUT_DIR}")
+
+     # -----------------------------------------------------------------------
+     # PUBLIC API
+     # -----------------------------------------------------------------------
+
+     def is_busy(self) -> bool:
+         """Check if a translation is currently running."""
+         with self._lock:
+             if self._active_task_id is None:
+                 return False
+             task = self._tasks.get(self._active_task_id)
+             if task is None:
+                 self._active_task_id = None
+                 return False
+             # If the active task is done/failed/cancelled, we're not busy
+             if task.status in (
+                 TaskStatus.COMPLETED,
+                 TaskStatus.FAILED,
+                 TaskStatus.CANCELLED,
+             ):
+                 self._active_task_id = None
+                 return False
+             return True
+
+     def create_task(self, original_filename: str, input_path: str) -> TranslationTask:
+         """
+         Create a new translation task.
+
+         Args:
+             original_filename: The user's original file name (for display)
+             input_path: Path to the uploaded file on disk
+
+         Returns:
+             TranslationTask object
+
+         Raises:
+             RuntimeError if another translation is already running
+         """
+         with self._lock:
+             # --- Single task enforcement ---
+             if self.is_busy():
+                 raise RuntimeError(
+                     "Another translation is currently in progress. "
+                     "Please wait for it to complete before uploading a new file."
+                 )
+
+             # Generate a unique task ID
+             task_id = str(uuid.uuid4())[:12]
+
+             # Create the output file path
+             # Original: "my_novel.txt" → Output: "my_novel_hindi_a1b2c3d4.txt"
+             name_without_ext = os.path.splitext(original_filename)[0]
+             output_filename = f"{name_without_ext}_hindi_{task_id}.txt"
+             output_path = os.path.join(OUTPUT_DIR, output_filename)
+
+             # Create the progress tracker
+             progress = TranslationProgress()
+             progress.file_name = original_filename
+
+             # Create the translator instance
+             translator = MassiveFileTranslator(
+                 config=self.config,
+                 progress=progress,
+             )
+
+             # Create the task
+             task = TranslationTask(
+                 task_id=task_id,
+                 original_filename=original_filename,
+                 input_path=input_path,
+                 output_path=output_path,
+                 progress=progress,
+                 translator=translator,
+             )
+
+             # Register the task
+             self._tasks[task_id] = task
+             self._active_task_id = task_id
+
+             logger.info(
+                 f"Task created: {task_id} | File: {original_filename} | "
+                 f"Input: {input_path} | Output: {output_path}"
+             )
+
+             return task
+
+     def start_task(self, task_id: str):
+         """
+         Start the translation task in a background thread.
+
+         The thread runs the translator and updates progress in real-time.
+         FastAPI can poll progress via get_task_progress().
+         """
+         with self._lock:
+             task = self._tasks.get(task_id)
+             if task is None:
+                 raise ValueError(f"Task not found: {task_id}")
+             if task.status != TaskStatus.QUEUED:
+                 raise RuntimeError(f"Task {task_id} is not in QUEUED state")
+
+             # Create and start the background thread
+             thread = threading.Thread(
+                 target=self._run_translation,
+                 args=(task,),
+                 name=f"translator-{task_id}",
+                 daemon=True,  # Thread dies if the main process exits
+             )
+             task.thread = thread
+             thread.start()
+
+             logger.info(f"Task {task_id} started in background thread.")
+
+     def get_task(self, task_id: str) -> Optional[TranslationTask]:
+         """Get a task by ID."""
+         with self._lock:
+             return self._tasks.get(task_id)
+
+     def get_task_progress(self, task_id: str) -> Optional[dict]:
+         """Get task progress as a dictionary (for API responses)."""
+         task = self.get_task(task_id)
+         if task is None:
+             return None
+         return task.to_dict()
+
+     def get_latest_task(self) -> Optional[TranslationTask]:
+         """Get the most recent task (for the simple single-task UI)."""
+         with self._lock:
+             if not self._tasks:
+                 return None
+             # Return the task with the latest created_at timestamp
+             return max(self._tasks.values(), key=lambda t: t.created_at)
+
+     def get_latest_task_progress(self) -> Optional[dict]:
+         """Get progress of the most recent task."""
+         task = self.get_latest_task()
+         if task is None:
+             return None
+         return task.to_dict()
+
+     def cancel_task(self, task_id: str) -> bool:
+         """
+         Cancel a running translation task.
+
+         Sets a cancel flag that the translator checks periodically.
+         The translator will stop after completing its current paragraph.
+         Already-translated content is preserved on disk.
+         """
+         task = self.get_task(task_id)
+         if task is None:
+             return False
+
+         if task.status not in (TaskStatus.QUEUED, TaskStatus.PREPARING, TaskStatus.TRANSLATING):
+             return False  # Can't cancel a completed/failed task
+
+         task.translator.cancel()
+         task.status = TaskStatus.CANCELLED
+         task.completed_at = time.time()
+
+         with self._lock:
+             if self._active_task_id == task_id:
+                 self._active_task_id = None
+
+         logger.info(f"Task {task_id} cancelled.")
+         return True
+
+     def cleanup_old_tasks(self):
+         """
+         Remove completed/failed tasks older than cleanup_after_seconds.
+         Also deletes their files from disk to free space.
+
+         Called periodically (e.g., before each new upload).
+         """
+         now = time.time()
+         to_remove = []
+
+         with self._lock:
+             for task_id, task in self._tasks.items():
+                 if task.status in (
+                     TaskStatus.COMPLETED,
+                     TaskStatus.FAILED,
+                     TaskStatus.CANCELLED,
+                 ):
+                     age = now - task.completed_at if task.completed_at > 0 else now - task.created_at
+                     if age > self.cleanup_after_seconds:
+                         to_remove.append(task_id)
+
+         for task_id in to_remove:
+             self._remove_task_files(task_id)
+             with self._lock:
+                 del self._tasks[task_id]
+             logger.info(f"Cleaned up old task: {task_id}")
+
+     def get_output_path(self, task_id: str) -> Optional[str]:
+         """Get the output file path for a completed task."""
+         task = self.get_task(task_id)
+         if task is None:
+             return None
+         if task.status != TaskStatus.COMPLETED:
+             return None
+         if not os.path.exists(task.output_path):
+             return None
+         return task.output_path
+
+     # -----------------------------------------------------------------------
+     # PRIVATE METHODS
+     # -----------------------------------------------------------------------
+
+     def _run_translation(self, task: TranslationTask):
+         """
+         The actual translation runner - executes in a background thread.
+
+         ERROR HANDLING:
+         - Any exception is caught and stored in the task
+         - Task status is set to FAILED
+         - The active_task_id is cleared so new tasks can be submitted
+         - Already-written content is preserved on disk
+         """
+         try:
+             task.status = TaskStatus.PREPARING
+             task.progress.set_status("preparing", "Starting translation...")
+
+             logger.info(
+                 f"Task {task.task_id}: Starting translation of "
+                 f"'{task.original_filename}'"
+             )
+
+             # Run the translation (this blocks until complete)
+             task.translator.translate_file(task.input_path, task.output_path)
+
+             # Check if it was cancelled during translation
+             if task.translator._cancel_flag.is_set():
+                 task.status = TaskStatus.CANCELLED
+                 task.completed_at = time.time()
+                 logger.info(f"Task {task.task_id}: Cancelled during translation.")
+             else:
+                 task.status = TaskStatus.COMPLETED
+                 task.completed_at = time.time()
+                 logger.info(
+                     f"Task {task.task_id}: Translation COMPLETED. "
+                     f"Output: {task.output_path}"
+                 )
+
+         except Exception as e:
+             task.status = TaskStatus.FAILED
+             task.error_message = str(e)
+             task.completed_at = time.time()
+             task.progress.set_status("failed", f"Error: {str(e)}")
+             logger.exception(f"Task {task.task_id}: FAILED with error: {e}")
+
+         finally:
+             # Clear the active task so new uploads are accepted
+             with self._lock:
+                 if self._active_task_id == task.task_id:
+                     self._active_task_id = None
+
+     def _remove_task_files(self, task_id: str):
+         """Safely delete task files from disk."""
+         task = self._tasks.get(task_id)
+         if task is None:
+             return
+
+         for path in [task.input_path, task.output_path]:
+             try:
+                 if path and os.path.exists(path):
+                     os.remove(path)
+                     logger.debug(f"Deleted file: {path}")
+             except OSError as e:
+                 logger.warning(f"Could not delete {path}: {e}")
+
+
+ # ============================================================================
+ # GLOBAL SINGLETON - Used by the FastAPI app
+ # ============================================================================
+ # Create a single TaskManager instance shared across the entire application.
+ # This is safe because TaskManager is thread-safe internally.
+
+ _global_task_manager: Optional[TaskManager] = None
+
+
+ def get_task_manager() -> TaskManager:
+     """Get or create the global TaskManager singleton."""
+     global _global_task_manager
+     if _global_task_manager is None:
+         config = TranslatorConfig()
+         _global_task_manager = TaskManager(config=config)
+     return _global_task_manager
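The same lifecycle can be exercised without the web layer, which helps when debugging the manager in isolation. A minimal driver sketch, assuming a small sample.txt exists in the working directory (an illustrative path, not part of this commit); note that starting a task sends real requests to Google Translate through translator_engine:

import time

from task_manager import get_task_manager, TaskStatus

tm = get_task_manager()  # global singleton, shared with the FastAPI app

# Lifecycle: QUEUED → PREPARING → TRANSLATING → COMPLETED / FAILED / CANCELLED
task = tm.create_task(original_filename="sample.txt", input_path="sample.txt")
tm.start_task(task.task_id)

while task.status not in (TaskStatus.COMPLETED, TaskStatus.FAILED, TaskStatus.CANCELLED):
    info = tm.get_task_progress(task.task_id)
    print(info["status"], info["progress"]["percent_complete"], "%")
    time.sleep(2)

print("Final state:", task.status.value, "| output:", tm.get_output_path(task.task_id))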
translator_engine.py ADDED
@@ -0,0 +1,740 @@
1
+ """
2
+ translator_engine.py
3
+ ====================
4
+ Core Translation Engine for Massive Text Files (English -> Hindi)
5
+
6
+ ARCHITECTURE DECISIONS:
7
+ -----------------------
8
+ 1. STREAMING READ/WRITE: We never hold the full translated file in RAM.
9
+ - Read source file paragraph-by-paragraph
10
+ - Translate each paragraph
11
+ - Immediately append translated text to output file on disk
12
+ - If process crashes at paragraph 25,000 β€” the first 24,999 are SAFE on disk
13
+
14
+ 2. CHUNKING STRATEGY:
15
+ - Split file by double-newline (\n\n) to get paragraphs
16
+ - Each paragraph is one translation unit
17
+ - Single newlines (\n) WITHIN a paragraph are preserved by translating
18
+ line-by-line inside each paragraph
19
+ - This guarantees ZERO text loss and exact structural preservation
20
+
21
+ 3. RATE LIMITING & IP BAN PREVENTION:
22
+ - Each translation call has a mandatory sleep (configurable, default 0.3s)
23
+ - ThreadPoolExecutor with LIMITED workers (default 2) β€” NOT aggressive
24
+ - On HTTP 429 (Too Many Requests): exponential backoff via tenacity
25
+ - Base wait: 10 seconds, multiplier: 2x, max wait: 320 seconds
26
+ - Max 10 retries per chunk before marking as FAILED
27
+ - On connection drop: same retry logic catches ConnectionError, Timeout
28
+
29
+ 4. CONCURRENCY MODEL:
30
+ - We use ThreadPoolExecutor but with STRICT controls:
31
+ a) Max 2 workers (Google free tier can't handle more)
32
+ b) Each worker has mandatory inter-request delay
33
+ c) A global rate limiter (threading.Semaphore) prevents burst
34
+ - This is NOT about max speed β€” it's about RELIABLE completion
35
+ of 50,000+ paragraphs without getting IP-banned
36
+
37
+ 5. ERROR HANDLING SCENARIOS (explicitly documented):
38
+ - HTTP 429 Too Many Requests β†’ exponential backoff, retry up to 10x
39
+ - ConnectionError / Timeout β†’ same retry logic, waits and retries
40
+ - InvalidURL / encoding error β†’ log error, write original text as fallback
41
+ - Process crash β†’ all previously written paragraphs are safe on disk
42
+ - Google temporary block β†’ backoff will wait up to ~320s per retry
43
+ """
44
+
45
+ import os
46
+ import re
47
+ import time
48
+ import threading
49
+ import logging
50
+ from typing import Optional, Callable
51
+ from concurrent.futures import ThreadPoolExecutor, as_completed
52
+ from dataclasses import dataclass, field
53
+
54
+ from deep_translator import GoogleTranslator
55
+ from tenacity import (
56
+ retry,
57
+ stop_after_attempt,
58
+ wait_exponential,
59
+ retry_if_exception_type,
60
+ before_sleep_log,
61
+ RetryError,
62
+ )
63
+
64
+ # ============================================================================
65
+ # LOGGING SETUP
66
+ # ============================================================================
67
+ logging.basicConfig(
68
+ level=logging.INFO,
69
+ format="%(asctime)s [%(threadName)s] %(levelname)s: %(message)s",
70
+ )
71
+ logger = logging.getLogger("translator_engine")
72
+
73
+
74
+ # ============================================================================
75
+ # CONFIGURATION β€” All tunables in one place
76
+ # ============================================================================
77
+ @dataclass
78
+ class TranslatorConfig:
79
+ """
80
+ All configuration for the translation engine.
81
+ Tuned for Google Free Translator on a 16GB RAM server.
82
+
83
+ IMPORTANT FOR HUGGINGFACE SPACES:
84
+ - HF Spaces have shared IPs, so we must be EXTRA conservative
85
+ - Lower workers, higher delays to avoid bans
86
+ """
87
+
88
+ # --- Translation Settings ---
89
+ source_lang: str = "en"
90
+ target_lang: str = "hi"
91
+
92
+ # --- Chunking ---
93
+ # Google Translator free tier has a ~5000 char limit per request.
94
+ # We set our max chunk size to 4500 to leave safety margin.
95
+ max_chunk_chars: int = 4500
96
+
97
+ # --- Rate Limiting ---
98
+ # Minimum seconds to wait BETWEEN translation requests (per thread)
99
+ min_request_delay: float = 0.5
100
+
101
+ # How many concurrent translation threads
102
+ # KEEP THIS LOW β€” Google will ban aggressive concurrent requests
103
+ max_workers: int = 2
104
+
105
+ # Global semaphore β€” max simultaneous in-flight requests across all threads
106
+ max_concurrent_requests: int = 2
107
+
108
+ # --- Retry / Backoff (for HTTP 429, Connection drops) ---
109
+ max_retries: int = 10 # Max retries per single chunk
110
+ backoff_base_wait: int = 10 # First retry waits 10 seconds
111
+ backoff_multiplier: int = 2 # Each subsequent retry doubles wait
112
+ backoff_max_wait: int = 320 # Never wait more than 320s per retry
113
+
114
+ # --- File I/O ---
115
+ # How many paragraphs to batch into one write operation
116
+ # (each paragraph is still translated individually, but we flush to disk
117
+ # every N paragraphs for I/O efficiency)
118
+ disk_flush_interval: int = 5
119
+
120
+
121
+ # ============================================================================
122
+ # PROGRESS TRACKER β€” Thread-safe progress monitoring
123
+ # ============================================================================
124
+ @dataclass
125
+ class TranslationProgress:
126
+ """
127
+ Thread-safe progress tracker.
128
+ The frontend polls this to show real-time progress.
129
+ """
130
+ total_paragraphs: int = 0
131
+ translated_paragraphs: int = 0
132
+ failed_paragraphs: int = 0
133
+ status: str = "idle" # idle, preparing, translating, completed, failed
134
+ error_message: str = ""
135
+ current_phase: str = ""
136
+ start_time: float = 0.0
137
+ file_name: str = ""
138
+ output_file: str = ""
139
+
140
+ _lock: threading.Lock = field(default_factory=threading.Lock, repr=False)
141
+
142
+ def increment_translated(self):
143
+ with self._lock:
144
+ self.translated_paragraphs += 1
145
+
146
+ def increment_failed(self):
147
+ with self._lock:
148
+ self.failed_paragraphs += 1
149
+
150
+ def set_status(self, status: str, phase: str = ""):
151
+ with self._lock:
152
+ self.status = status
153
+ if phase:
154
+ self.current_phase = phase
155
+
156
+ def to_dict(self) -> dict:
157
+ with self._lock:
158
+ elapsed = 0
159
+ speed = 0
160
+ eta_seconds = 0
161
+
162
+ if self.start_time > 0 and self.translated_paragraphs > 0:
163
+ elapsed = time.time() - self.start_time
164
+ speed = self.translated_paragraphs / elapsed # paragraphs per second
165
+ remaining = self.total_paragraphs - self.translated_paragraphs
166
+ eta_seconds = remaining / speed if speed > 0 else 0
167
+
168
+ return {
169
+ "total_paragraphs": self.total_paragraphs,
170
+ "translated_paragraphs": self.translated_paragraphs,
171
+ "failed_paragraphs": self.failed_paragraphs,
172
+ "status": self.status,
173
+ "error_message": self.error_message,
174
+ "current_phase": self.current_phase,
175
+ "file_name": self.file_name,
176
+ "output_file": self.output_file,
177
+ "elapsed_seconds": round(elapsed, 1),
178
+ "speed_per_second": round(speed, 2),
179
+ "eta_seconds": round(eta_seconds, 1),
180
+ "percent_complete": round(
181
+ (self.translated_paragraphs / self.total_paragraphs * 100)
182
+ if self.total_paragraphs > 0
183
+ else 0,
184
+ 1,
185
+ ),
186
+ }
187
+
188
+
189
+ # ============================================================================
190
+ # SINGLE CHUNK TRANSLATOR β€” With full retry & backoff logic
191
+ # ============================================================================
192
+ class ChunkTranslator:
193
+ """
194
+ Translates a single text chunk (paragraph or sub-paragraph).
195
+
196
+ ERROR HANDLING SCENARIOS:
197
+ -------------------------
198
+ SCENARIO 1 β€” HTTP 429 Too Many Requests (IP Rate Limit):
199
+ Google returns 429 when we're sending too many requests.
200
+ β†’ tenacity catches this, waits 10s (then 20s, 40s, 80s, 160s, 320s...)
201
+ β†’ Retries the EXACT same chunk up to 10 times
202
+ β†’ After 10 failures, returns original text as fallback (no data loss)
203
+
204
+ SCENARIO 2 β€” Connection Drop (ConnectionError, Timeout):
205
+ Network issues, Google server down, DNS failure, etc.
206
+ β†’ Same retry logic as above catches these exceptions
207
+ β†’ Exponential backoff gives the network time to recover
208
+ β†’ If persistent failure, returns original text
209
+
210
+ SCENARIO 3 β€” Encoding/Invalid Input:
211
+ Malformed characters, binary data mixed in text
212
+ β†’ Caught by generic Exception handler
213
+ β†’ Returns original text (preserving content even if untranslated)
214
+
215
+ SCENARIO 4 β€” Google Temporary IP Block (longer than rate limit):
216
+ Sometimes Google blocks an IP for minutes, not just seconds
217
+ β†’ Our max backoff is 320 seconds (~5 minutes)
218
+ β†’ With 10 retries and exponential backoff, total possible wait
219
+ is ~10 + 20 + 40 + 80 + 160 + 320 + 320 + 320 + 320 + 320 = ~31 minutes
220
+ β†’ This is usually enough for Google to unblock
221
+ """
222
+
223
+ def __init__(self, config: TranslatorConfig):
224
+ self.config = config
225
+ # Global semaphore: limits total in-flight requests across all threads
226
+ self.request_semaphore = threading.Semaphore(config.max_concurrent_requests)
227
+ # Per-thread rate limiter
228
+ self._last_request_time = threading.local()
229
+
230
+ def _enforce_rate_limit(self):
231
+ """
232
+ Ensures minimum delay between requests on the SAME thread.
233
+ This prevents any single thread from hammering Google.
234
+ """
235
+ now = time.time()
236
+ last = getattr(self._last_request_time, "value", 0)
237
+ elapsed = now - last
238
+ if elapsed < self.config.min_request_delay:
239
+ sleep_time = self.config.min_request_delay - elapsed
240
+ time.sleep(sleep_time)
241
+ self._last_request_time.value = time.time()
242
+
243
+ @retry(
244
+ # Retry on these specific exceptions (network + rate limit errors)
245
+ retry=retry_if_exception_type((
246
+ Exception, # deep-translator wraps errors in generic exceptions
247
+ )),
248
+ # Exponential backoff: 10s β†’ 20s β†’ 40s β†’ 80s β†’ 160s β†’ 320s (capped)
249
+ wait=wait_exponential(
250
+ multiplier=2,
251
+ min=10,
252
+ max=320,
253
+ ),
254
+ # Maximum 10 retry attempts per chunk
255
+ stop=stop_after_attempt(10),
256
+ # Log before each retry sleep (helps debugging)
257
+ before_sleep=before_sleep_log(logger, logging.WARNING),
258
+ # Re-raise the final exception if all retries exhausted
259
+ reraise=True,
260
+ )
261
+ def _translate_with_retry(self, text: str) -> str:
262
+ """
263
+ The actual translation call, wrapped with tenacity retry decorator.
264
+
265
+ This is the INNERMOST function β€” it talks to Google Translate.
266
+ If Google returns 429 or connection drops, tenacity handles retry.
267
+ """
268
+ # Acquire semaphore β€” blocks if too many concurrent requests
269
+ with self.request_semaphore:
270
+ # Enforce per-thread rate limit
271
+ self._enforce_rate_limit()
272
+
273
+ # Create a fresh translator instance per call
274
+ # (deep-translator is not guaranteed thread-safe with shared instances)
275
+ translator = GoogleTranslator(
276
+ source=self.config.source_lang,
277
+ target=self.config.target_lang,
278
+ )
279
+
280
+ result = translator.translate(text=text)
281
+
282
+ # Google sometimes returns None for empty strings
283
+ if result is None:
284
+ return text
285
+
286
+ return result
287
+
288
+ def translate_chunk(self, text: str) -> str:
289
+ """
290
+ Public API: Translate a text chunk with full error handling.
291
+
292
+ Returns translated text on success.
293
+ Returns ORIGINAL text on permanent failure (zero text loss guarantee).
294
+ """
295
+ # Don't waste API calls on empty/whitespace-only text
296
+ if not text or not text.strip():
297
+ return text
298
+
299
+ try:
300
+ return self._translate_with_retry(text)
301
+ except RetryError as e:
302
+ # All 10 retries exhausted β€” log and return original
303
+ logger.error(
304
+ f"PERMANENT FAILURE after {self.config.max_retries} retries. "
305
+ f"Chunk (first 100 chars): '{text[:100]}...'. "
306
+ f"Error: {e.last_attempt.exception()}"
307
+ )
308
+ # ZERO TEXT LOSS: Return original text rather than losing it
309
+ return text
310
+ except Exception as e:
311
+ # Unexpected error β€” still preserve the original text
312
+ logger.error(f"Unexpected translation error: {e}. Preserving original text.")
313
+ return text
314
+
315
+
316
+ # ============================================================================
317
+ # PARAGRAPH SPLITTER β€” Preserves exact structure
318
+ # ============================================================================
319
+ class TextSplitter:
320
+ """
321
+ Splits massive text files into translation-ready chunks while
322
+ preserving EXACT line break structure.
323
+
324
+ STRATEGY:
325
+ 1. Split file by double-newline (\n\n) β†’ paragraphs
326
+ 2. For each paragraph, if it's within char limit β†’ translate as-is
327
+ 3. If paragraph exceeds char limit β†’ split by single newline (\n) β†’ lines
328
+ 4. If a single line exceeds char limit β†’ split by sentences
329
+ 5. After translation, rejoin with EXACT same separators
330
+
331
+ This guarantees:
332
+ - Every \n is preserved
333
+ - Every \n\n is preserved
334
+ - No lines are skipped or merged
335
+ """
336
+
337
+ def __init__(self, config: TranslatorConfig):
338
+ self.config = config
339
+
340
+ def split_into_paragraphs(self, file_path: str):
341
+ """
342
+ Generator: Yields paragraphs from file WITHOUT loading entire file into RAM.
343
+
344
+ Uses streaming read β€” reads file line by line, accumulates paragraphs,
345
+ yields when a paragraph boundary (double newline) is found.
346
+
347
+ Memory usage: Only ONE paragraph in RAM at a time.
348
+ For a 3-million-word file, this is crucial.
349
+ """
350
+ current_paragraph_lines = []
351
+ consecutive_empty_lines = 0
352
+
353
+ with open(file_path, "r", encoding="utf-8", errors="replace") as f:
354
+ for line in f:
355
+ # Remove only the trailing newline for analysis
356
+ stripped = line.rstrip("\n")
357
+
358
+ if stripped == "":
359
+ consecutive_empty_lines += 1
360
+ if consecutive_empty_lines >= 1 and current_paragraph_lines:
361
+ # End of a paragraph β€” yield it
362
+ paragraph_text = "\n".join(current_paragraph_lines)
363
+ yield paragraph_text
364
+ current_paragraph_lines = []
365
+ # Yield empty line as separator (preserves exact blank lines)
366
+ yield ""
367
+ else:
368
+ consecutive_empty_lines = 0
369
+ current_paragraph_lines.append(stripped)
370
+
371
+ # Don't forget the last paragraph (file might not end with \n\n)
372
+ if current_paragraph_lines:
373
+ paragraph_text = "\n".join(current_paragraph_lines)
374
+ yield paragraph_text
375
+
376
+ def split_paragraph_into_chunks(self, paragraph: str) -> list[str]:
377
+ """
378
+ If a paragraph is too long for one API call, split it into
379
+ smaller chunks while preserving line boundaries.
380
+
381
+ Returns a list of chunk strings. When rejoined with \n, they
382
+ reconstruct the original paragraph exactly.
383
+ """
384
+ if len(paragraph) <= self.config.max_chunk_chars:
385
+ return [paragraph]
386
+
387
+ # Split by lines first
388
+ lines = paragraph.split("\n")
389
+ chunks = []
390
+ current_chunk_lines = []
391
+ current_chunk_len = 0
392
+
393
+ for line in lines:
394
+ line_len = len(line) + 1 # +1 for the \n separator
395
+
396
+ if current_chunk_len + line_len > self.config.max_chunk_chars:
397
+ if current_chunk_lines:
398
+ chunks.append("\n".join(current_chunk_lines))
399
+ current_chunk_lines = []
400
+ current_chunk_len = 0
401
+
402
+ # If a single line is STILL too long, split by sentences
403
+ if line_len > self.config.max_chunk_chars:
404
+ sentence_chunks = self._split_long_line(line)
405
+ chunks.extend(sentence_chunks)
406
+ continue
407
+
408
+ current_chunk_lines.append(line)
409
+ current_chunk_len += line_len
410
+
411
+ if current_chunk_lines:
412
+ chunks.append("\n".join(current_chunk_lines))
413
+
414
+ return chunks
415
+
416
+ def _split_long_line(self, line: str) -> list[str]:
417
+ """
418
+ Last resort: Split a very long single line by sentence boundaries.
419
+ Preserves all text β€” just breaks it into translatable pieces.
420
+ """
421
+ # Split on sentence endings while keeping the delimiter
422
+ sentences = re.split(r"(?<=[.!?])\s+", line)
423
+ chunks = []
424
+ current = ""
425
+
426
+ for sentence in sentences:
427
+ if len(current) + len(sentence) + 1 > self.config.max_chunk_chars:
428
+ if current:
429
+ chunks.append(current)
430
+ current = sentence
431
+ else:
432
+ current = f"{current} {sentence}" if current else sentence
433
+
434
+ if current:
435
+ chunks.append(current)
436
+
437
+ # If we still have chunks that are too long (no sentence boundaries),
438
+ # do a hard split at max_chunk_chars (last resort, very rare)
439
+ final_chunks = []
440
+ for chunk in chunks:
441
+ if len(chunk) > self.config.max_chunk_chars:
442
+ for i in range(0, len(chunk), self.config.max_chunk_chars):
443
+ final_chunks.append(chunk[i : i + self.config.max_chunk_chars])
444
+ else:
445
+ final_chunks.append(chunk)
446
+
447
+ return final_chunks
448
+
449
+ def count_paragraphs(self, file_path: str) -> int:
450
+ """
451
+ Quick scan to count total paragraphs for progress tracking.
452
+ Streams through file β€” doesn't load into RAM.
453
+ """
454
+ count = 0
455
+ for _ in self.split_into_paragraphs(file_path):
456
+ count += 1
457
+ return count
458
+
459
+
460
+ # ============================================================================
461
+ # MAIN TRANSLATION ORCHESTRATOR β€” Ties everything together
462
+ # ============================================================================
463
+ class MassiveFileTranslator:
464
+ """
465
+ The main orchestrator that:
466
+ 1. Reads the input file (streaming)
467
+ 2. Splits into paragraphs
468
+ 3. Translates each paragraph (with concurrency + rate limiting)
469
+ 4. Writes translated text to output file (streaming/append)
470
+ 5. Tracks progress in real-time
471
+
472
+ STREAMING WRITE GUARANTEE:
473
+ --------------------------
474
+ Every translated paragraph is IMMEDIATELY flushed to the output file.
475
+ If the process crashes at paragraph 25,000 out of 50,000:
476
+ - The output file contains paragraphs 1-24,999 fully translated
477
+ - No data is held only in RAM
478
+ - You can resume or at least salvage the partial translation
479
+ """
480
+
481
+ def __init__(
482
+ self,
483
+ config: Optional[TranslatorConfig] = None,
484
+ progress: Optional[TranslationProgress] = None,
485
+ ):
486
+ self.config = config or TranslatorConfig()
487
+ self.progress = progress or TranslationProgress()
488
+ self.chunk_translator = ChunkTranslator(self.config)
489
+ self.text_splitter = TextSplitter(self.config)
490
+ # Lock for sequential file writing (multiple threads, one output file)
491
+ self._write_lock = threading.Lock()
492
+ # Flag to support graceful cancellation
493
+ self._cancel_flag = threading.Event()
494
+
495
+ def cancel(self):
496
+ """Signal the translation to stop gracefully."""
497
+ self._cancel_flag.set()
498
+ self.progress.set_status("failed", "Cancelled by user")
499
+
+     def translate_file(self, input_path: str, output_path: str) -> str:
+         """
+         Main entry point: translate an entire file.
+
+         This method is designed to be called in a background thread.
+         It streams through the input, translates, and streams to the output.
+
+         Returns the output file path on completion.
+         """
+         try:
+             self.progress.set_status("preparing", "Counting paragraphs...")
+             self.progress.file_name = os.path.basename(input_path)
+             self.progress.output_file = os.path.basename(output_path)
+
+             # Phase 1: Count total paragraphs (quick streaming scan)
+             logger.info(f"Counting paragraphs in: {input_path}")
+             total = self.text_splitter.count_paragraphs(input_path)
+             self.progress.total_paragraphs = total
+             logger.info(f"Total paragraphs to translate: {total}")
+
+             if total == 0:
+                 self.progress.set_status("completed", "File is empty")
+                 # Create an empty output file
+                 open(output_path, "w", encoding="utf-8").close()
+                 return output_path
+
+             # Phase 2: Clear/create the output file ("w" mode truncates)
+             with open(output_path, "w", encoding="utf-8") as f:
+                 f.write("")  # no-op; opening in "w" already emptied the file
+
+             # Phase 3: Stream-translate with ordered sequential writing
+             self.progress.set_status("translating", "Translation in progress...")
+             self.progress.start_time = time.time()
+
+             self._translate_sequential_with_threads(input_path, output_path)
+
+             # Phase 4: Done
+             if not self._cancel_flag.is_set():
+                 self.progress.set_status("completed", "Translation finished!")
+                 logger.info(
+                     f"Translation complete. Output: {output_path}. "
+                     f"Translated: {self.progress.translated_paragraphs}, "
+                     f"Failed: {self.progress.failed_paragraphs}"
+                 )
+
+             return output_path
+
+         except Exception as e:
+             logger.exception(f"Fatal error during translation: {e}")
+             self.progress.set_status("failed", f"Fatal error: {str(e)}")
+             self.progress.error_message = str(e)
+             raise
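+
+     # Progress-polling sketch (editor's note; assumes TranslationProgress
+     # exposes the status/count fields used above):
+     #     while translator.progress.status == "translating":
+     #         done = translator.progress.translated_paragraphs
+     #         total = max(translator.progress.total_paragraphs, 1)
+     #         print(f"{100 * done / total:.1f}% complete")
+     #         time.sleep(2)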
+
+     def _translate_sequential_with_threads(
+         self, input_path: str, output_path: str
+     ):
+         """
+         ORDERED translation with a thread pool.
+
+         WHY ORDERING MATTERS:
+         We must write paragraphs to the output file in the EXACT same order
+         as the input file. Random ordering would scramble the book.
+
+         STRATEGY:
+         - We use a thread pool for concurrent translation
+         - But we process in BATCHES to maintain order
+         - Each batch = N paragraphs (N = max_workers * 3 for pipeline efficiency)
+         - Within a batch, paragraphs are translated concurrently
+         - After the entire batch is done, we write the results IN ORDER to disk
+         - Then we move on to the next batch
+
+         This gives us:
+         ✅ Concurrency (multiple paragraphs translated simultaneously)
+         ✅ Strict ordering (output matches input structure exactly)
+         ✅ Streaming writes (each batch is flushed to disk immediately)
+         """
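+         # Editor's note: with a hypothetical max_workers of 5, batch_size
+         # below is 15, i.e. up to 15 paragraphs are in flight before one
+         # ordered, flushed write to disk.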
+         batch_size = self.config.max_workers * 3  # Pipeline efficiency
+         paragraph_generator = self.text_splitter.split_into_paragraphs(input_path)
+
+         with ThreadPoolExecutor(
+             max_workers=self.config.max_workers,
+             thread_name_prefix="translator",
+         ) as executor:
+
+             batch = []
+             batch_indices = []
+             paragraph_index = 0
+
+             for paragraph in paragraph_generator:
+                 if self._cancel_flag.is_set():
+                     logger.warning("Translation cancelled.")
+                     return
+
+                 batch.append(paragraph)
+                 batch_indices.append(paragraph_index)
+                 paragraph_index += 1
+
+                 # Process the batch when it is full
+                 if len(batch) >= batch_size:
+                     self._process_batch(
+                         executor, batch, batch_indices, output_path
+                     )
+                     batch = []
+                     batch_indices = []
+
+             # Process remaining paragraphs in the last (partial) batch
+             if batch:
+                 self._process_batch(
+                     executor, batch, batch_indices, output_path
+                 )
+
+     def _process_batch(
+         self,
+         executor: ThreadPoolExecutor,
+         batch: list[str],
+         indices: list[int],
+         output_path: str,
+     ):
+         """
+         Process a batch of paragraphs:
+         1. Submit all of them to the thread pool for concurrent translation
+         2. Wait for ALL of them to complete
+         3. Write the results to disk IN ORDER
+         """
+         # Submit all paragraphs in this batch to the thread pool.
+         # Note the dict deliberately mixes key types: int -> paragraph for
+         # blanks that skip translation, Future -> int for submitted work.
+         future_to_index = {}
+         for i, paragraph in enumerate(batch):
+             if self._cancel_flag.is_set():
+                 return
+
+             # Empty paragraphs (blank lines) don't need translation
+             if not paragraph.strip():
+                 future_to_index[i] = paragraph  # Store directly
+             else:
+                 future = executor.submit(self._translate_single_paragraph, paragraph)
+                 future_to_index[future] = i
+
+         # Collect results in order
+         results = [""] * len(batch)
+
+         # First, fill in the empty paragraphs (no futures for these)
+         for key, value in list(future_to_index.items()):
+             if isinstance(key, int):
+                 results[key] = value
+                 del future_to_index[key]
+
+         # Wait for the translation futures
+         for future in as_completed(future_to_index):
+             idx = future_to_index[future]
+             try:
+                 translated_text = future.result()
+                 results[idx] = translated_text
+             except Exception as e:
+                 # This shouldn't happen (translate_chunk handles all errors),
+                 # but just in case, preserve the original text
+                 logger.error(f"Unexpected future error: {e}")
+                 results[idx] = batch[idx]  # Original text as fallback
+                 self.progress.increment_failed()
+
+         # Write the entire batch to disk IN ORDER
+         with self._write_lock:
+             with open(output_path, "a", encoding="utf-8") as f:
+                 for i, translated in enumerate(results):
+                     # getsize() sees only previously flushed batches, so the
+                     # very first paragraph of the file gets no separator
+                     if i > 0 or os.path.getsize(output_path) > 0:
+                         f.write("\n\n")  # Paragraph separator
+                     f.write(translated)
+                 # Flush to disk immediately - crash protection
+                 f.flush()
+                 os.fsync(f.fileno())
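+
+     # Ordering sketch (editor's note): as_completed() yields futures in
+     # completion order, but writing each result into results[idx] restores
+     # input order before anything touches the file. Minimal pattern:
+     #     futures = {executor.submit(work, x): i for i, x in enumerate(items)}
+     #     ordered = [None] * len(items)
+     #     for fut in as_completed(futures):
+     #         ordered[futures[fut]] = fut.result()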
+
+     def _translate_single_paragraph(self, paragraph: str) -> str:
+         """
+         Translate a single paragraph, handling the case where it might
+         be too long for a single API call.
+
+         Preserves the paragraph's internal line breaks exactly.
+         """
+         # Split the paragraph into API-friendly chunks if needed
+         chunks = self.text_splitter.split_paragraph_into_chunks(paragraph)
+
+         if len(chunks) == 1:
+             # The paragraph fits in one API call, but we still need to
+             # preserve internal line breaks.
+             # Strategy: translate line by line within the paragraph.
+             lines = paragraph.split("\n")
+             translated_lines = []
+             for line in lines:
+                 if line.strip():
+                     translated_line = self.chunk_translator.translate_chunk(line)
+                     translated_lines.append(translated_line)
+                 else:
+                     # Preserve empty lines within the paragraph
+                     translated_lines.append(line)
+
+             self.progress.increment_translated()
+             return "\n".join(translated_lines)
+         else:
+             # Large paragraph: translate each chunk line by line, then rejoin
+             translated_chunks = []
+             for chunk in chunks:
+                 lines = chunk.split("\n")
+                 translated_lines = []
+                 for line in lines:
+                     if line.strip():
+                         translated_line = self.chunk_translator.translate_chunk(line)
+                         translated_lines.append(translated_line)
+                     else:
+                         translated_lines.append(line)
+                 translated_chunks.append("\n".join(translated_lines))
+
+             self.progress.increment_translated()
+             return "\n".join(translated_chunks)
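+
+     # Line-preservation sketch (editor's note): a two-line paragraph such as
+     #     "He opened the door.\nShe was already gone."
+     # is translated one line at a time and rejoined with "\n", so the output
+     # keeps its single internal line break in the same position.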
+
+
+ # ============================================================================
+ # QUICK TEST - Run this file directly to test translation
+ # ============================================================================
+ if __name__ == "__main__":
+     print("=" * 60)
+     print("TRANSLATOR ENGINE - Quick Self-Test")
+     print("=" * 60)
+
+     config = TranslatorConfig()
+     progress = TranslationProgress()
+
+     # Test single-chunk translation
+     chunk_translator = ChunkTranslator(config)
+     test_text = "Hello, how are you? This is a test of the translation engine."
+
+     print(f"\nOriginal: {test_text}")
+     translated = chunk_translator.translate_chunk(test_text)
+     print(f"Translated: {translated}")
+
+     # Test the text splitter
+     splitter = TextSplitter(config)
+     long_text = "A" * 5000
+     chunks = splitter.split_paragraph_into_chunks(long_text)
+     print(f"\nLong text ({len(long_text)} chars) split into {len(chunks)} chunks")
+     print(f"Chunk sizes: {[len(c) for c in chunks]}")
+
+     print("\n✅ Self-test complete. Engine is functional.")
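+
+     # Optional end-to-end test (editor's sketch, left commented out because
+     # it would issue many real API calls; "sample_en.txt" is a hypothetical
+     # input file):
+     #     translator = MassiveFileTranslator(config, progress)
+     #     translator.translate_file("sample_en.txt", "sample_hi.txt")
+     #     print(progress.translated_paragraphs, "paragraphs translated")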