from typing import Generator, Any, Dict
import os
import sys
import platform
import subprocess
import tempfile
import zipfile
import urllib.request
import shutil
import re
from pathlib import Path
from .base import LlmClient


TRANSLATION_SYSTEM_INSTRUCTIONS = """Translate the text to English. Output only the translation, nothing else."""


class QwenTranslatorClient(LlmClient):
    """
    Translation client using Qwen3-0.6B-GGUF pre-quantized models via pre-built llama.cpp binary.
    Translates non-English text to English so it can be processed by the English-only classifier.
    
    Uses GGUF format models from unsloth/Qwen3-0.6B-GGUF - already quantized, no bitsandbytes needed.
    Prefers a locally compiled llama.cpp binary when one is present; otherwise the
    pre-built release binary (llama-b6995-bin-ubuntu-x64.zip) is downloaded from
    GitHub releases and extracted on first use, so no compilation is needed.
    Optimized for Hugging Face Spaces with lazy loading and efficient CPU inference.
    """

    # Class-level cache for the binary path
    _binary_path = None
    _binary_dir = None

    def __init__(self, config_dict: Dict[str, Any], system_prompt: str):
        super().__init__(config_dict, system_prompt)
        self.repo_id = self.config.get("repo_id", "unsloth/Qwen3-0.6B-GGUF")
        self.model_file = self.config.get("model_file", "Qwen3-0.6B-IQ4_XS.gguf")
        self.temperature = float(self.config.get("temperature", 0.3))
        self.top_p = float(self.config.get("top_p", 0.9))
        self.top_k = int(self.config.get("top_k", 40))
        self.max_tokens = int(self.config.get("max_tokens", 256))
        self.context_size = int(self.config.get("context_size", 512))
        self.n_threads = int(self.config.get("n_threads", 0))  # 0 = auto-detect CPU threads
        self.n_gpu_layers = int(self.config.get("n_gpu_layers", 0))  # 0 = CPU only, >0 for GPU
        self.n_batch = int(self.config.get("n_batch", 256))
        
        # Model path will be set on first use
        self.model_path = None
        self._model_downloaded = False
        
        print(f"βœ… Qwen GGUF translator client initialized (repo: {self.repo_id}, model: {self.model_file}, will load on first use)")

    @classmethod
    def _download_binary(cls) -> str:
        """Get the llama.cpp binary - prefer compiled version, fallback to downloaded."""
        # Check OS - the binary only works on Linux
        if sys.platform == "win32":
            raise RuntimeError(
                "Translation with llama.cpp binary is not supported on Windows. "
                "The binary is for Linux only. "
                "Please use this feature on Linux or Hugging Face Spaces."
            )
        
        # First, check if we have a cached binary path
        if cls._binary_path and os.path.exists(cls._binary_path):
            # Verify it's still executable
            if os.access(cls._binary_path, os.X_OK):
                return cls._binary_path
            else:
                # Try to fix permissions
                try:
                    os.chmod(cls._binary_path, 0o755)
                    if os.access(cls._binary_path, os.X_OK):
                        return cls._binary_path
                except Exception:
                    pass
                # If we can't fix it, reset
                cls._binary_path = None
        
        # Check for compiled binary (from Dockerfile) first
        compiled_binary_paths = [
            "/usr/local/bin/llama-main",  # Standard location from Dockerfile
            "/app/llama.cpp/build/bin/main",  # Alternative location
            "/usr/bin/llama-main",  # Another possible location
        ]
        
        for compiled_path in compiled_binary_paths:
            if os.path.exists(compiled_path) and os.access(compiled_path, os.X_OK):
                cls._binary_path = compiled_path
                print(f"βœ… Using compiled llama.cpp binary at: {compiled_path}", flush=True)
                return cls._binary_path
        
        # If no compiled binary found, download pre-built binary as fallback
        print("πŸ“₯ No compiled binary found, downloading pre-built llama.cpp binary...", flush=True)
        
        # Create a temporary directory for the binary
        if cls._binary_dir is None:
            cls._binary_dir = tempfile.mkdtemp(prefix="llama_cpp_binary_")
        
        binary_dir = Path(cls._binary_dir)
        
        # Try common binary names (newer releases ship llama-cli; older ones used main)
        possible_binary_names = ["llama-cli", "main", "llama"]
        
        # Check if any binary already exists
        for name in possible_binary_names:
            path = binary_dir / name
            if path.exists() and os.access(path, os.X_OK):
                cls._binary_path = str(path)
                print(f"βœ… Using existing binary at: {cls._binary_path}")
                return cls._binary_path
        
        # If not found yet, extract the archive and search; whatever turns up is
        # moved to this canonical path
        binary_path = binary_dir / "main"
        
        # Download the zip file
        zip_url = "https://github.com/ggml-org/llama.cpp/releases/download/b6995/llama-b6995-bin-ubuntu-x64.zip"
        zip_path = binary_dir / "llama-binary.zip"
        
        try:
            print(f"   Downloading from: {zip_url}")
            # Use a more robust download method
            try:
                urllib.request.urlretrieve(zip_url, str(zip_path))
            except Exception as download_error:
                raise RuntimeError(f"Failed to download binary from {zip_url}: {download_error}") from download_error
            
            if not zip_path.exists():
                raise RuntimeError(f"Downloaded file not found at {zip_path}")
            
            print(f"   βœ… Downloaded to: {zip_path} ({zip_path.stat().st_size / 1024 / 1024:.1f} MB)")
            
            # Extract the zip file
            print(f"   πŸ“¦ Extracting zip file...")
            try:
                with zipfile.ZipFile(str(zip_path), 'r') as zip_ref:
                    zip_ref.extractall(str(binary_dir))
            except Exception as extract_error:
                raise RuntimeError(f"Failed to extract zip file {zip_path}: {extract_error}") from extract_error
            
            # Find the binary in the extracted files
            # The binary might be called 'main', 'llama-cli', or 'llama'
            # It might be in the root or in a subdirectory
            found_binary = None
            
            # First, try common locations and names
            for name in possible_binary_names:
                possible_paths = [
                    binary_dir / name,
                    binary_dir / "bin" / name,
                    binary_dir / "build" / "bin" / name,  # layout used by recent release zips
                    binary_dir / "llama-b6995-bin-ubuntu-x64" / name,
                ]
                for path in possible_paths:
                    if path.exists():
                        found_binary = path
                        break
                if found_binary:
                    break
            
            # Also search recursively for any executable file matching our names
            if found_binary is None:
                for root, dirs, files in os.walk(str(binary_dir)):
                    for file in files:
                        # Skip the downloaded archive and shared libraries,
                        # which also start with "llama"
                        if file.endswith((".zip", ".so")):
                            continue
                        if file in possible_binary_names or file.startswith("llama"):
                            candidate = Path(root) / file
                            # Check that it's a regular file before accepting it
                            if candidate.is_file():
                                found_binary = candidate
                                break
                    if found_binary:
                        break
            
            if found_binary is None:
                # List what we found for debugging
                found_files = []
                for root, dirs, files in os.walk(str(binary_dir)):
                    for file in files:
                        found_files.append(str(Path(root) / file))
                raise RuntimeError(
                    f"Could not find llama.cpp binary in extracted zip. "
                    f"Searched for: {possible_binary_names}. "
                    f"Found files: {found_files[:10]}"
                )
            
            # Make it executable (Linux/Unix only)
            try:
                os.chmod(found_binary, 0o755)
            except Exception as chmod_error:
                print(f"   ⚠️  Warning: Could not set executable permissions: {chmod_error}", flush=True)
            
            # Move to expected location if needed (use 'main' as standard name)
            if found_binary != binary_path:
                if binary_path.exists():
                    binary_path.unlink()  # Remove old binary if exists
                shutil.move(str(found_binary), str(binary_path))
            
            cls._binary_path = str(binary_path)
            print(f"   βœ… Binary extracted and ready at: {cls._binary_path}", flush=True)
            
            # Verify binary is executable and test it
            if not os.access(cls._binary_path, os.X_OK):
                print(f"   ⚠️  Warning: Binary may not be executable. Attempting to fix...", flush=True)
                try:
                    os.chmod(cls._binary_path, 0o755)
                except Exception as e:
                    print(f"   ⚠️  Could not set permissions: {e}", flush=True)
            
            # Test whether the downloaded binary can actually run on this
            # architecture (a locally compiled binary is returned earlier and
            # never reaches this point)
            print(f"   πŸ” Testing binary compatibility...", flush=True)
            machine = platform.machine()
            print(f"   System architecture: {machine}", flush=True)
            
            try:
                # Try to run the binary with --help to verify it works
                test_result = subprocess.run(
                    [cls._binary_path, "--help"],
                    capture_output=True,
                    text=True,
                    timeout=5
                )
                if test_result.returncode == 0 or "usage" in test_result.stdout.lower() or "options" in test_result.stdout.lower():
                    print(f"   βœ… Binary is compatible and executable", flush=True)
                else:
                    print(f"   ⚠️  Binary test returned code {test_result.returncode}", flush=True)
                    if test_result.stderr:
                        print(f"   Stderr: {test_result.stderr[:200]}", flush=True)
            except subprocess.TimeoutExpired:
                print(f"   ⚠️  Binary test timed out", flush=True)
            except OSError as os_error:
                error_msg = str(os_error)
                errno = getattr(os_error, 'errno', None)
                if errno == 8 or "Exec format error" in error_msg or "cannot execute" in error_msg.lower():
                    # Check what the binary actually is using the 'file' command, if available
                    file_info = "unknown"
                    try:
                        file_result = subprocess.run(
                            ["file", cls._binary_path],
                            capture_output=True,
                            text=True,
                            timeout=2
                        )
                        if file_result.returncode == 0:
                            file_info = file_result.stdout.strip()
                    except Exception:
                        # 'file' may be missing or slow; the info stays "unknown"
                        pass
                    
                    raise RuntimeError(
                        f"Binary architecture mismatch. The downloaded binary is not compatible with this system.\n"
                        f"System architecture: {machine}\n"
                        f"Binary info: {file_info}\n"
                        f"Error: {error_msg}\n"
                        f"The Ubuntu x64 binary may not be compatible with this system. "
                        f"Translation feature requires a compatible llama.cpp binary for this architecture."
                    ) from os_error
                else:
                    raise
            except Exception as test_error:
                print(f"   ⚠️  Binary test warning: {test_error}", flush=True)
            
            # Clean up zip file
            try:
                zip_path.unlink()
            except Exception:
                pass  # Ignore cleanup errors
            
            return cls._binary_path
            
        except Exception as e:
            error_msg = (
                f"Failed to download/extract llama.cpp binary from {zip_url}. "
                f"Error: {e}"
            )
            print(f"   ❌ {error_msg}")
            raise RuntimeError(error_msg) from e

    def _download_model_if_needed(self) -> str:
        """Download GGUF model file from HuggingFace if not already cached."""
        from huggingface_hub import hf_hub_download
        
        if self._model_downloaded and self.model_path and os.path.exists(self.model_path):
            return self.model_path
        
        # Set up cache directory
        cache_dir = os.environ.get('HF_HOME', os.path.expanduser("~/.cache/huggingface"))
        os.makedirs(cache_dir, exist_ok=True)
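
        # hf_hub_download lays files out in the standard hub cache structure,
        # roughly (sketch, shown with the default repo and model file):
        #   {cache_dir}/models--unsloth--Qwen3-0.6B-GGUF/snapshots/<revision>/Qwen3-0.6B-IQ4_XS.gguf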
        
        try:
            print(f"   πŸ“₯ Downloading GGUF model: {self.model_file} from {self.repo_id}...")
            model_path = hf_hub_download(
                repo_id=self.repo_id,
                filename=self.model_file,
                cache_dir=cache_dir,
            )
            print(f"   βœ… Model downloaded/cached at: {model_path}")
            self.model_path = model_path
            self._model_downloaded = True
            return model_path
        except Exception as e:
            error_msg = (
                f"Failed to download GGUF model '{self.model_file}' from '{self.repo_id}'. "
                f"Error: {e}\n"
                f"Please verify:\n"
                f"1. The repository exists: https://huggingface.co/{self.repo_id}\n"
                f"2. The model file name is correct\n"
                f"3. You have internet connectivity"
            )
            raise RuntimeError(error_msg) from e

    def _build_translation_prompt(self, user_text: str) -> str:
        """Build a prompt for translation to English using Qwen's chat format."""
        # Use a very simple, direct prompt to minimize reasoning
        prompt = f"""<|im_start|>system
{TRANSLATION_SYSTEM_INSTRUCTIONS}<|im_end|>
<|im_start|>user
{user_text}<|im_end|>
<|im_start|>assistant
"""
        return prompt

    def generate_content(self, prompt: str) -> str:
        """
        Translate the input text to English using the pre-built llama.cpp binary.
        Returns the English translation as a plain string.
        """
        # Download binary and model if needed
        binary_path = self._download_binary()
        model_path = self._download_model_if_needed()
        
        # Build translation prompt
        translation_prompt = self._build_translation_prompt(prompt)
        
        # Prepare command-line arguments for the llama.cpp binary, e.g.:
        #   ./main -m model.gguf -p "prompt" --temp 0.3 --top-p 0.9 --top-k 40 -n 256 -c 512
        # --no-display-prompt keeps the prompt out of stdout (when supported)
        cmd = [
            binary_path,
            "-m", model_path,
            "-p", translation_prompt,
            "--temp", str(self.temperature),
            "--top-p", str(self.top_p),
            "--top-k", str(self.top_k),
            "-n", str(self.max_tokens),  # Number of tokens to generate
            "-c", str(self.context_size),  # Context size
            "--no-display-prompt",  # Don't echo the prompt in output (if supported)
        ]
        
        # Add thread count if specified (0 means auto-detect, which is default)
        if self.n_threads > 0:
            cmd.extend(["-t", str(self.n_threads)])
        
        # Add GPU layers if specified
        if self.n_gpu_layers > 0:
            cmd.extend(["-ngl", str(self.n_gpu_layers)])
        
        # Note: Stop sequences are handled in post-processing since --stop may not be available in all llama.cpp versions
        
        try:
            # Run the binary and capture output
            print(f"   πŸ”„ Running translation with llama.cpp binary...", flush=True)
            print(f"   Command: {' '.join(cmd[:3])}... (model: {os.path.basename(model_path)})", flush=True)
            print(f"   Input prompt length: {len(translation_prompt)} chars", flush=True)
            
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=30,  # 30 second timeout (should be enough for short translations)
                check=False  # Don't raise on non-zero exit, we'll check manually
            )
            
            print(f"   Binary exit code: {result.returncode}", flush=True)
            print(f"   Stdout length: {len(result.stdout)} chars", flush=True)
            print(f"   Stderr length: {len(result.stderr)} chars", flush=True)
            
            # Check if command succeeded
            if result.returncode != 0:
                error_msg = f"llama.cpp binary exited with code {result.returncode}"
                if result.stderr:
                    error_msg += f"\nStderr: {result.stderr[:500]}"
                    print(f"   Stderr: {result.stderr[:500]}", flush=True)
                if result.stdout:
                    error_msg += f"\nStdout: {result.stdout[:500]}"
                    print(f"   Stdout: {result.stdout[:500]}", flush=True)
                print(f"   ❌ {error_msg}", flush=True)
                raise RuntimeError(error_msg)
            
            # Parse the output
            output = result.stdout.strip()
            
            if not output:
                print(f"   ❌ Empty output from binary", flush=True)
                raise RuntimeError("llama.cpp binary returned empty output")
            
            print(f"   Raw output (first 200 chars): {output[:200]}", flush=True)
            
            # Remove reasoning/thinking tags and their content (Qwen models sometimes output these)
            # Remove <think>...</think> tags and content
            output = re.sub(r'<think>.*?</think>', '', output, flags=re.DOTALL)
            # Remove <thinking>...</thinking> tags if present
            output = re.sub(r'<thinking>.*?</thinking>', '', output, flags=re.DOTALL)
            # Remove any other reasoning tags
            output = re.sub(r'<reasoning>.*?</reasoning>', '', output, flags=re.DOTALL)
            
            # The output might include the prompt, so we need to extract just the generated part
            # Look for the assistant response after the prompt
            if "<|im_start|>assistant" in output:
                # Extract everything after the assistant tag
                output = output.split("<|im_start|>assistant")[-1].strip()
                print(f"   Extracted after assistant tag: {output[:200]}", flush=True)
            
            # Remove any remaining chat format tokens
            translated_text = output.replace("<|im_start|>", "").replace("<|im_end|>", "").strip()
            
            # If the text still contains reasoning-like patterns, try to extract just the translation
            # Look for patterns like "The translation is:" or "English translation:" or just clean text
            if "translation" in translated_text.lower() and len(translated_text) > 100:
                # Try to find the actual translation after common prefixes
                translation_patterns = [
                    r'(?:translation|translated|english):\s*(.+?)(?:\n|$)',
                    r'(?:the translation is|here is the translation|english translation):\s*(.+?)(?:\n|$)',
                ]
                for pattern in translation_patterns:
                    match = re.search(pattern, translated_text, re.IGNORECASE | re.DOTALL)
                    if match:
                        translated_text = match.group(1).strip()
                        print(f"   Extracted translation from pattern: {translated_text[:200]}", flush=True)
                        break
            
            # Clean up any remaining artifacts
            translated_text = re.sub(r'\n+', ' ', translated_text)  # Collapse newlines (single or repeated) into spaces
            translated_text = translated_text.strip()
            
            if not translated_text:
                print(f"   ❌ Translation output is empty after parsing", flush=True)
                print(f"   Original output was: {result.stdout[:500]}", flush=True)
                raise RuntimeError("Translation output is empty after parsing")
            
            print(f"   βœ… Translation completed ({len(translated_text)} chars): '{translated_text[:200]}...'", flush=True)
            
        except subprocess.TimeoutExpired:
            error_msg = "Translation timed out after 30 seconds"
            print(f"   ❌ {error_msg}", flush=True)
            print(f"   This may indicate the model is too slow. Consider using a smaller model or enabling GPU.", flush=True)
            raise RuntimeError(error_msg)
        except Exception as e:
            error_msg = f"Translation generation failed: {e}"
            print(f"   ❌ {error_msg}")
            raise RuntimeError(error_msg) from e
        
        # Clean up the response
        translated_text = translated_text.strip()
        
        # Remove common prefixes that might be added by the model
        prefixes_to_remove = [
            "English translation:",
            "Translation:",
            "English:",
            "Here is the translation:",
            "The translation is:",
            "Assistant:"
        ]
        for prefix in prefixes_to_remove:
            if translated_text.lower().startswith(prefix.lower()):
                translated_text = translated_text[len(prefix):].strip()
        
        # Remove leading/trailing quotes if present
        translated_text = translated_text.strip('"').strip("'").strip()
        
        # If translation is empty or suspiciously short, return original
        if not translated_text or len(translated_text) < len(prompt) * 0.1:
            print(f"⚠️  Translation may have failed (too short or empty), returning original text")
            return prompt
        
        return translated_text

    def generate_content_stream(self, prompt: str) -> Generator[str, None, None]:
        """
        Stream translation using llama.cpp binary.
        For simplicity, we'll collect the full response and yield it.
        True streaming can be added later if needed.
        """
        # For now, just yield the full translation (streaming can be optimized later)
        translation = self.generate_content(prompt)
        yield translation
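
        # One shape true streaming could take (an untested sketch; assumes the
        # same cmd list that generate_content builds and line-buffered output
        # from the binary):
        #
        #   proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
        #                           stderr=subprocess.DEVNULL, text=True)
        #   for line in proc.stdout:
        #       yield line
        #   proc.wait()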

    def _generate_content_impl(self, prompt: str) -> str:
        return self.generate_content(prompt)

    def _generate_content_stream_impl(self, prompt: str) -> Generator[Any, None, None]:
        return self.generate_content_stream(prompt)