Marek4321 committed on
Commit
1df1e0b
·
verified ·
1 Parent(s): 962b900

Upload 14 files

Browse files
README.md ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: BabelSlide v2.0
3
+ emoji: 🌍
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: streamlit
7
+ sdk_version: 1.46.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # 🌍 BabelSlide v2.0
14
+
15
+ Professional document translation application powered by AI. Supports PDF, DOCX, and PPTX formats with intelligent translation using ChatGPT and DeepSeek.
16
+
17
+ ## ✨ Features
18
+
19
+ - **Multi-format Support**: PDF, Microsoft Word (.docx), PowerPoint (.pptx)
20
+ - **AI-Powered Translation**: ChatGPT (GPT-4) and DeepSeek integration
21
+ - **Intelligent Processing**: Preserves document structure and formatting
22
+ - **Clean Output**: Advanced post-processing removes unwanted LLM commentary
23
+ - **Professional UI**: Modern, responsive Streamlit interface
24
+ - **Comprehensive Logging**: Detailed process tracking and error handling
25
+ - **Translation Reviews**: Automatic quality assessment generation
26
+
27
+ ## 🚀 How to Use
28
+
29
+ 1. **Configure API**: Select your AI provider (ChatGPT/DeepSeek) in the sidebar
30
+ 2. **Enter API Key**: Provide your API key (required for translation)
31
+ 3. **Set Languages**: Choose source and target languages
32
+ 4. **Upload Document**: Drag & drop or select your document (PDF/DOCX/PPTX)
33
+ 5. **Translate**: Click "🚀 Translate Document" and wait for processing
34
+ 6. **Download**: Get your translated document and quality review
35
+
36
+ ## 🔑 API Keys
37
+
38
+ ### ChatGPT (OpenAI)
39
+ - Get your API key from [OpenAI Platform](https://platform.openai.com/api-keys)
40
+ - Format: `sk-...`
41
+
42
+ ### DeepSeek
43
+ - Get your API key from [DeepSeek Platform](https://platform.deepseek.com/)
44
+ - More cost-effective alternative to ChatGPT
45
+
46
+ ## 📋 Supported Languages
47
+
48
+ Arabic, Chinese (Simplified/Traditional), Dutch, English, French, German, Greek, Hindi, Indonesian, Italian, Japanese, Korean, Polish, Portuguese, Russian, Spanish, Swedish, Thai, Turkish, Vietnamese
49
+
50
+ ## 🛡️ Privacy & Security
51
+
52
+ - **API keys are never stored permanently** - they are kept only for your current session and are sent to the app solely to call the AI provider you selected
53
+ - **Documents are processed temporarily** - no permanent storage
54
+ - **Secure processing** - all data is handled with privacy in mind
55
+
56
+ ## 🚨 Limitations
57
+
58
+ - **File Size**: Maximum 50MB per document
59
+ - **API Dependencies**: Requires active internet connection and valid API keys
60
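+ - **PDF Formatting**: Translated PDFs are rebuilt as a simplified, text-only document; the original layout, fonts, and images are not preserved and may require manual adjustment after translation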
61
+
62
+ ## 🏗️ Technical Architecture
63
+
64
+ - **SOLID Principles**: Clean, modular, maintainable code
65
+ - **Abstract Base Classes**: Extensible translator and processor interfaces
66
+ - **Comprehensive Error Handling**: Graceful failure management
67
+ - **Advanced AI Prompting**: Minimizes hallucinations and unwanted commentary
68
+ - **Format Preservation**: Maintains document structure and styling
69
+
70
+ ## 📄 License
71
+
72
+ This project is licensed under the MIT License.
73
+
74
+ ---
75
+
76
+ **BabelSlide v2.0** - Breaking language barriers, one document at a time 🌍
app.py ADDED
@@ -0,0 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ BabelSlide v2.0 - Professional Document Translator
4
+ Streamlit application for translating PDF, DOCX, and PPTX documents using AI
5
+ """
6
+
7
+ import streamlit as st
8
+ import tempfile
9
+ from pathlib import Path
10
+ import sys
11
+ import os
12
+
13
+ from translators.chatgpt_translator import ChatGPTTranslator
14
+ from translators.deepseek_translator import DeepSeekTranslator
15
+ from processors.pdf_processor import PDFProcessor
16
+ from processors.docx_processor import DOCXProcessor
17
+ from processors.pptx_processor import PPTXProcessor
18
+ from utils.constants import LANGUAGES, API_PROVIDERS
19
+ from utils.validator import FileValidator
20
+ from utils.logger import setup_logger, ProcessLogger
21
+ from core.exceptions import (
22
+ BabelSlideException,
23
+ ValidationError,
24
+ UnsupportedFileError,
25
+ FileSizeError,
26
+ APIKeyError,
27
+ TranslationError,
28
+ ProcessorError
29
+ )
30
+
31
class BabelSlideStreamlitApp:
    """Streamlit interface for BabelSlide application.

    The class wires together the sidebar configuration, the file uploader,
    the translator/processor pair chosen for the upload, and the results
    section. State that must survive Streamlit reruns is kept in
    ``st.session_state`` under the keys ``processing``,
    ``translation_result``, ``review_result``, ``summary_text`` and
    ``download_name``.
    """

    def __init__(self):
        self.logger = setup_logger("BabelSlideUI")
        self.process_logger = ProcessLogger(self.logger)

        # Seed every session key the UI reads, so later code never has to
        # probe for a missing attribute.
        session_defaults = {
            'processing': False,
            'translation_result': None,
            'review_result': None,
            'summary_text': None,
            'download_name': None,
        }
        for key, default in session_defaults.items():
            if key not in st.session_state:
                st.session_state[key] = default

    def setup_page_config(self):
        """Configure the Streamlit page and inject the custom CSS."""
        st.set_page_config(
            page_title="BabelSlide - Document Translator",
            page_icon="🌍",
            layout="wide",
            initial_sidebar_state="expanded"
        )

        # Custom CSS
        st.markdown("""
        <style>
        .main-header {
            text-align: center;
            padding: 2rem 0;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            border-radius: 12px;
            margin-bottom: 2rem;
        }

        .success-box {
            background: #d1fae5;
            border: 1px solid #10b981;
            border-radius: 8px;
            padding: 1rem;
            margin: 1rem 0;
        }

        .error-box {
            background: #fef2f2;
            border: 1px solid #ef4444;
            border-radius: 8px;
            padding: 1rem;
            margin: 1rem 0;
        }

        .info-box {
            background: #eff6ff;
            border: 1px solid #3b82f6;
            border-radius: 8px;
            padding: 1rem;
            margin: 1rem 0;
        }

        .stButton > button {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            border: none;
            border-radius: 8px;
            padding: 0.5rem 2rem;
            font-weight: 600;
        }
        </style>
        """, unsafe_allow_html=True)

    def render_header(self):
        """Render application header"""
        st.markdown("""
        <div class="main-header">
            <h1>🌍 BabelSlide v2.0</h1>
            <p>Professional Document Translation using AI • PDF • DOCX • PPTX</p>
        </div>
        """, unsafe_allow_html=True)

    def render_sidebar(self):
        """Render the configuration sidebar.

        Returns:
            Tuple of (api_provider, api_key, source_lang, target_lang) as
            currently selected/entered by the user.
        """
        st.sidebar.markdown("## ⚙️ Configuration")

        # API Provider
        api_provider = st.sidebar.selectbox(
            "AI Provider",
            options=list(API_PROVIDERS.keys()),
            index=0,
            help="Choose your preferred translation AI"
        )

        # API Key
        api_key = st.sidebar.text_input(
            "API Key",
            type="password",
            placeholder="Enter your API key (sk-... for OpenAI)",
            help="Your API key is never stored permanently"
        )

        st.sidebar.markdown("---")

        # Languages: both pickers share one option list.
        language_names = list(LANGUAGES.keys())
        col1, col2 = st.sidebar.columns(2)

        with col1:
            source_lang = st.selectbox(
                "Source Language",
                options=language_names,
                index=language_names.index("English"),
                help="Language of the original document"
            )

        with col2:
            target_lang = st.selectbox(
                "Target Language",
                options=language_names,
                index=language_names.index("Polish"),
                help="Language to translate to"
            )

        st.sidebar.markdown("---")
        st.sidebar.markdown("### 📝 Supported Formats")
        st.sidebar.info("• PDF documents\n• DOCX (Word) files\n• PPTX (PowerPoint) presentations")
        st.sidebar.warning("Maximum file size: 50 MB")

        return api_provider, api_key, source_lang, target_lang

    def render_file_upload(self):
        """Render the upload widget and, once a file is chosen, its name/size/type.

        Returns:
            The uploaded file object, or a falsy value when nothing is uploaded.
        """
        st.markdown("## 📄 Document Upload")

        uploaded_file = st.file_uploader(
            "Choose a document to translate",
            type=['pdf', 'docx', 'pptx'],
            help="Upload PDF, DOCX, or PPTX files (max 50 MB)",
            accept_multiple_files=False
        )

        if uploaded_file:
            col1, col2, col3 = st.columns([2, 1, 1])
            with col1:
                st.info(f"📁 **File:** {uploaded_file.name}")
            with col2:
                file_size = len(uploaded_file.getvalue()) / (1024 * 1024)
                st.info(f"📏 **Size:** {file_size:.1f} MB")
            with col3:
                file_type = uploaded_file.name.split('.')[-1].upper()
                st.info(f"📋 **Type:** {file_type}")

        return uploaded_file

    @staticmethod
    def _write_temp_copy(file):
        """Copy an uploaded file to a named temporary file and return its Path.

        The temporary file keeps the upload's real extension (empty when the
        name has none). The caller owns the file and must delete it. If the
        write itself fails the partial file is removed before re-raising.
        """
        handle = tempfile.NamedTemporaryFile(delete=False, suffix=Path(file.name).suffix)
        tmp_path = Path(handle.name)
        try:
            with handle:
                handle.write(file.getvalue())
        except Exception:
            tmp_path.unlink(missing_ok=True)
            raise
        return tmp_path

    def validate_inputs(self, file, api_provider, api_key, source_lang, target_lang):
        """Validate all inputs before processing.

        Returns:
            List of human-readable error messages; empty when everything is valid.
        """
        errors = []

        if not file:
            errors.append("Please upload a document")

        has_api_key = bool(api_key and api_key.strip())
        if not has_api_key:
            errors.append("Please provide an API key")

        if source_lang == target_lang:
            errors.append("Source and target languages must be different")

        # Validate file if provided
        if file:
            tmp_file_path = None
            try:
                # FileValidator works on a path, so validate a temporary copy.
                tmp_file_path = self._write_temp_copy(file)
                FileValidator.validate_file(tmp_file_path)
            except (ValidationError, UnsupportedFileError, FileSizeError) as e:
                errors.append(f"File validation error: {str(e)}")
            finally:
                # Remove the copy even when validation raised.
                if tmp_file_path is not None:
                    tmp_file_path.unlink(missing_ok=True)

        # Validate API key format (a blank key was already reported above)
        try:
            if has_api_key:
                FileValidator.validate_api_key(api_key.strip(), api_provider)
        except ValidationError as e:
            errors.append(f"API key error: {str(e)}")

        return errors

    def process_document(self, file, api_provider, api_key, source_lang, target_lang):
        """Translate the uploaded document and generate its review.

        Returns:
            Tuple of (output_path, review_text, summary_text).

        Raises:
            ValueError: for an unknown provider or file extension.
            Exception: any error from the processor/translator is logged and
                re-raised unchanged.
        """
        tmp_file_path = None
        try:
            # Create temporary file
            tmp_file_path = self._write_temp_copy(file)

            # Create translator
            translator_classes = {
                "ChatGPT": ChatGPTTranslator,
                "DeepSeek": DeepSeekTranslator,
            }
            if api_provider not in translator_classes:
                raise ValueError(f"Unsupported provider: {api_provider}")
            translator = translator_classes[api_provider](api_key.strip())

            # Create processor based on file extension
            processor_classes = {
                '.pdf': PDFProcessor,
                '.docx': DOCXProcessor,
                '.pptx': PPTXProcessor,
            }
            extension = tmp_file_path.suffix.lower()
            if extension not in processor_classes:
                raise ValueError(f"Unsupported file format: {extension}")
            processor = processor_classes[extension](translator)

            # Progress tracking
            progress_bar = st.progress(0)
            status_text = st.empty()

            def progress_callback(progress_val, desc):
                progress_bar.progress(progress_val)
                status_text.text(desc)

            # Process document
            status_text.text("Starting translation...")
            output_path, summary_text = processor.process_document(
                tmp_file_path,
                source_lang,
                target_lang,
                progress_callback
            )

            # Generate review
            status_text.text("Generating review...")
            review_text = self.generate_review(summary_text, source_lang, translator)

            progress_bar.progress(1.0)
            status_text.text("✅ Translation completed!")

            return output_path, review_text, summary_text

        except Exception as e:
            self.logger.error(f"Translation error: {str(e)}")
            raise

        finally:
            # Always remove the uploaded copy, on success and on failure.
            if tmp_file_path is not None:
                tmp_file_path.unlink(missing_ok=True)

    def generate_review(self, translated_text: str, source_lang: str, translator) -> str:
        """Generate a translation review via the translator's API.

        Only the first 2000 characters of ``translated_text`` are sent. Any
        failure is reported in the returned string instead of being raised.

        NOTE(review): the request goes through the translator's
        ``_make_translation_request("...", "English", source_lang)``, i.e. the
        ordinary translation call, which takes no system prompt. No dedicated
        review rubric is therefore sent to the model; confirm whether the
        translator classes need a separate review entry point.
        """
        try:
            # Use translator's API to generate review
            review = translator._make_translation_request(
                f"Review this translated text:\n\n{translated_text[:2000]}...",
                "English",
                source_lang
            )

            return translator._clean_translation_output(review)

        except Exception as e:
            return f"Review generation failed: {str(e)}"

    def render_results(self):
        """Render download buttons and the summary for the last translation."""
        result_path = st.session_state.translation_result
        if not result_path:
            return

        st.markdown("## 📥 Results")

        col1, col2 = st.columns(2)

        with col1:
            st.markdown("### 📄 Translated Document")
            result_file = Path(result_path)
            if result_file.is_file():
                st.download_button(
                    label="⬇️ Download Translated Document",
                    data=result_file.read_bytes(),
                    # Prefer the name derived from the upload over the temp file name.
                    file_name=st.session_state.get('download_name') or result_file.name,
                    mime="application/octet-stream"
                )
            else:
                # The output lives in a temp directory and may have been removed.
                st.warning("The translated file is no longer available. Please run the translation again.")

        with col2:
            st.markdown("### 📋 Translation Review")
            if st.session_state.review_result:
                st.download_button(
                    label="⬇️ Download Review",
                    data=st.session_state.review_result,
                    file_name="translation_review.txt",
                    mime="text/plain"
                )

        # Summary
        summary = st.session_state.get('summary_text')
        if summary:
            st.markdown("### 📝 Translation Summary")
            with st.expander("View Summary", expanded=False):
                st.text_area(
                    "Summary",
                    value=summary[:1000] + "..." if len(summary) > 1000 else summary,
                    height=200,
                    disabled=True,
                    label_visibility="collapsed"
                )

    def run(self):
        """Render the page and, when requested, run one translation."""
        self.setup_page_config()
        self.render_header()

        # Sidebar configuration
        api_provider, api_key, source_lang, target_lang = self.render_sidebar()

        # Main content
        uploaded_file = self.render_file_upload()

        # Translation button
        st.markdown("---")
        col1, col2, col3 = st.columns([1, 2, 1])
        with col2:
            translate_button = st.button(
                "🚀 Translate Document",
                disabled=st.session_state.processing,
                use_container_width=True
            )

        # Process translation
        if translate_button:
            # Validate inputs
            errors = self.validate_inputs(uploaded_file, api_provider, api_key, source_lang, target_lang)

            if errors:
                st.error("❌ **Please fix the following errors:**")
                for error in errors:
                    st.error(f"• {error}")
            else:
                st.session_state.processing = True

                try:
                    with st.spinner("Translating document..."):
                        output_path, review_text, summary_text = self.process_document(
                            uploaded_file, api_provider, api_key, source_lang, target_lang
                        )

                    # Store results
                    uploaded_name = Path(uploaded_file.name)
                    st.session_state.translation_result = output_path
                    st.session_state.review_result = review_text
                    st.session_state.summary_text = summary_text
                    st.session_state.download_name = f"{uploaded_name.stem}_translated{uploaded_name.suffix}"

                    st.success(f"✅ **Translation completed successfully!**\n\n"
                               f"📄 **File:** {uploaded_file.name}\n"
                               f"🔄 **Translation:** {source_lang} → {target_lang}\n"
                               f"🤖 **Provider:** {api_provider}")

                    # Auto-refresh to show results
                    st.rerun()

                except Exception as e:
                    st.error(f"❌ **Translation failed:** {str(e)}")

                finally:
                    st.session_state.processing = False

        # Show results if available
        self.render_results()

        # Footer
        st.markdown("---")
        st.markdown(
            "<div style='text-align: center; color: #666;'>"
            "<strong>BabelSlide v2.0</strong> • Professional document translation • Built for global communication"
            "</div>",
            unsafe_allow_html=True
        )
413
# Script entry point: build the UI object and render one pass of the page.
if __name__ == "__main__":
    app = BabelSlideStreamlitApp()
    app.run()
core/base_processor.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from typing import List, Tuple, Optional, Generator
3
+ from pathlib import Path
4
+ from core.base_translator import BaseTranslator
5
+ from core.exceptions import ProcessorError
6
+ import os
7
+
8
class DocumentProcessor(ABC):
    """Abstract base class for document processors.

    Subclasses supply format-specific extraction and write-back; this class
    drives the extract -> translate -> apply pipeline around them.
    """

    def __init__(self, translator: "BaseTranslator"):
        self.translator = translator

    @abstractmethod
    def extract_text_elements(self, file_path: Path) -> Generator[Tuple[str, dict], None, None]:
        """
        Extract text elements from document

        Args:
            file_path: Path to the document

        Yields:
            Tuple of (text_content, metadata) for each translatable element
        """
        pass

    @abstractmethod
    def apply_translations(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """
        Apply translations back to the document

        Args:
            file_path: Path to original document
            translations: List of (translated_text, metadata) tuples

        Returns:
            Path to the translated document
        """
        pass

    def process_document(
        self,
        file_path: Path,
        source_lang: str,
        target_lang: str,
        progress_callback: Optional[callable] = None
    ) -> Tuple[Path, str]:
        """
        Process entire document translation

        Args:
            file_path: Path to document
            source_lang: Source language
            target_lang: Target language
            progress_callback: Optional callable invoked after every element
                as ``progress_callback(fraction_done, description)``

        Returns:
            Tuple of (output_file_path, summary_text), where summary_text is
            every translated element followed by a newline

        Raises:
            ProcessorError: when the document has no translatable text or any
                step of the pipeline fails. A ProcessorError raised by a step
                is passed through unchanged; other exceptions are wrapped and
                kept as the cause.
        """
        try:
            # Extract text elements (materialized so progress can be reported)
            text_elements = list(self.extract_text_elements(file_path))
            total_elements = len(text_elements)

            if total_elements == 0:
                raise ProcessorError("No translatable text found in document")

            translations = []
            translated_chunks = []

            for position, (text, metadata) in enumerate(text_elements, start=1):
                if text.strip():  # Only translate non-empty text
                    translated = self.translator.translate_text(text, source_lang, target_lang)
                    translated_chunks.append(translated)
                else:
                    translated = text  # Keep empty text as-is
                translations.append((translated, metadata))

                # Update progress
                if progress_callback:
                    progress_callback(
                        position / total_elements,
                        f"Translating element {position}/{total_elements}"
                    )

            # Apply translations to document
            output_path = self.apply_translations(file_path, translations)

            # Join once instead of growing a string inside the loop.
            return output_path, "".join(f"{chunk}\n" for chunk in translated_chunks)

        except ProcessorError:
            # Already carries a processor-level message; do not wrap it again.
            raise
        except Exception as e:
            raise ProcessorError(f"Document processing failed: {str(e)}") from e

    def generate_output_path(self, original_path: Path, suffix: str = "translated") -> Path:
        """
        Generate output file path

        Args:
            original_path: Original file path
            suffix: Suffix to add to filename

        Returns:
            Path in the same directory named ``<stem>_<suffix><extension>``
        """
        return original_path.with_name(f"{original_path.stem}_{suffix}{original_path.suffix}")

    @property
    @abstractmethod
    def supported_extensions(self) -> List[str]:
        """Return list of supported file extensions"""
        pass
core/base_translator.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from typing import Dict, Any
3
+ import re
4
+ from utils.constants import STRICT_TRANSLATION_PROMPT, UNWANTED_PATTERNS
5
+ from core.exceptions import TranslationError
6
+
7
class BaseTranslator(ABC):
    """Abstract base class for all translators.

    Subclasses implement key validation, the provider request and the
    provider name; this class adds empty-input handling, output cleaning and
    uniform error reporting around them.
    """

    def __init__(self, api_key: str):
        self.api_key = api_key
        # Subclass hook; runs at construction time.
        self._validate_api_key()

    @abstractmethod
    def _validate_api_key(self) -> None:
        """Validate API key format and accessibility"""
        pass

    @abstractmethod
    def _make_translation_request(self, text: str, source_lang: str, target_lang: str) -> str:
        """Make the actual API request for translation"""
        pass

    def translate_text(self, text: str, source_lang: str, target_lang: str) -> str:
        """
        Translate text with strict post-processing to remove LLM commentary

        Args:
            text: Text to translate
            source_lang: Source language name
            target_lang: Target language name

        Returns:
            Clean translated text without LLM commentary. Blank or
            whitespace-only input is returned unchanged without an API call.

        Raises:
            TranslationError: when the request or the cleanup fails. A
                TranslationError raised by the request is passed through
                unchanged; other exceptions are wrapped and kept as the cause.
        """
        if not text.strip():
            return text

        try:
            # Get translation from API
            translated = self._make_translation_request(text, source_lang, target_lang)

            # Clean the response from unwanted LLM additions
            return self._clean_translation_output(translated)

        except TranslationError:
            # Avoid "Translation failed: Translation failed: ..." messages.
            raise
        except Exception as e:
            raise TranslationError(f"Translation failed: {str(e)}") from e

    def _clean_translation_output(self, output: str) -> str:
        """
        Remove common LLM commentary and formatting artifacts

        Args:
            output: Raw output from LLM

        Returns:
            Cleaned translation text
        """
        cleaned = output.strip()

        # Apply regex patterns to remove unwanted additions
        for pattern in UNWANTED_PATTERNS:
            cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE | re.MULTILINE)

        # Remove excessive whitespace while preserving intentional formatting
        cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)  # Max 2 consecutive newlines
        cleaned = re.sub(r'[ \t]+', ' ', cleaned)  # Collapse runs of spaces/tabs

        return cleaned.strip()

    def get_system_prompt(self, source_lang: str, target_lang: str) -> str:
        """
        Get the strict system prompt for translation

        Args:
            source_lang: Source language name
            target_lang: Target language name

        Returns:
            STRICT_TRANSLATION_PROMPT formatted with both language names
        """
        return STRICT_TRANSLATION_PROMPT.format(
            source_lang=source_lang,
            target_lang=target_lang
        )

    @property
    @abstractmethod
    def provider_name(self) -> str:
        """Return the name of the translation provider"""
        pass
core/exceptions.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class BabelSlideException(Exception):
    """Root of the BabelSlide exception hierarchy.

    Catch this type to handle any error raised by the application itself.
    """
4
+
5
class TranslationError(BabelSlideException):
    """Signals that translating a piece of text failed."""
8
+
9
class ProcessorError(BabelSlideException):
    """Signals that extracting from or writing back to a document failed."""
12
+
13
class ValidationError(BabelSlideException):
    """Signals that user-supplied input did not pass validation."""
16
+
17
class APIKeyError(BabelSlideException):
    """Signals an invalid or missing API key."""
20
+
21
class UnsupportedFileError(BabelSlideException):
    """Signals a document whose file format is not supported."""
24
+
25
class FileSizeError(BabelSlideException):
    """Signals a document that exceeds the allowed file size."""
processors/docx_processor.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple, Generator
2
+ from pathlib import Path
3
+ from docx import Document
4
+ from docx.shared import Inches
5
+ from core.base_processor import DocumentProcessor
6
+ from core.exceptions import ProcessorError
7
+
8
class DOCXProcessor(DocumentProcessor):
    """Microsoft Word document processor.

    Text is collected from body paragraphs, top-level table cells, and the
    headers/footers each section defines itself. Translations are written back
    by position, so the same file must be passed to both
    extract_text_elements and apply_translations.

    Known limitations: assigning ``paragraph.text`` / ``cell.text`` replaces
    the element's runs, so formatting that varies inside a paragraph is not
    kept; tables nested inside cells are only handled through the outer
    cell's combined text.
    """

    @staticmethod
    def _iter_header_footer_paragraphs(doc):
        """Yield (element_type, section_index, paragraph_index, paragraph).

        Headers come before footers within each section. A header/footer that
        is linked to the previous section has no content of its own (it shows
        the earlier section's definition), so it is skipped; otherwise the
        same text would be translated once per section.
        """
        for section_idx, section in enumerate(doc.sections):
            for element_type, part in (('header', section.header), ('footer', section.footer)):
                if not part or part.is_linked_to_previous:
                    continue
                for para_idx, paragraph in enumerate(part.paragraphs):
                    yield element_type, section_idx, para_idx, paragraph

    def extract_text_elements(self, file_path: Path) -> Generator[Tuple[str, dict], None, None]:
        """Extract text from Word document.

        Yields:
            (text, metadata) for every non-blank element. ``metadata`` always
            has 'element_type' and 'original_text' plus the indices needed to
            find the element again.

        Raises:
            ProcessorError: if the document cannot be opened or read.
        """
        try:
            doc = Document(file_path)

            # Extract text from paragraphs
            for para_idx, paragraph in enumerate(doc.paragraphs):
                if paragraph.text.strip():
                    yield paragraph.text, {
                        'element_type': 'paragraph',
                        'index': para_idx,
                        'style': paragraph.style.name if paragraph.style else 'Normal',
                        'original_text': paragraph.text
                    }

            # Extract text from tables. A merged cell is reported at every grid
            # position it spans; remember the underlying XML cells already
            # seen so each one is translated only once (holding the element
            # itself keeps its identity stable).
            seen_cells = set()
            for table_idx, table in enumerate(doc.tables):
                for row_idx, row in enumerate(table.rows):
                    for cell_idx, cell in enumerate(row.cells):
                        if cell._tc in seen_cells:
                            continue
                        seen_cells.add(cell._tc)
                        if cell.text.strip():
                            yield cell.text, {
                                'element_type': 'table_cell',
                                'table_index': table_idx,
                                'row_index': row_idx,
                                'cell_index': cell_idx,
                                'original_text': cell.text
                            }

            # Extract text from headers and footers
            for element_type, section_idx, para_idx, paragraph in self._iter_header_footer_paragraphs(doc):
                if paragraph.text.strip():
                    yield paragraph.text, {
                        'element_type': element_type,
                        'section_index': section_idx,
                        'paragraph_index': para_idx,
                        'original_text': paragraph.text
                    }

        except Exception as e:
            raise ProcessorError(f"Failed to extract text from Word document: {str(e)}") from e

    def apply_translations(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """Apply translations to Word document.

        Args:
            file_path: Path to the original document.
            translations: (translated_text, metadata) pairs using the metadata
                produced by extract_text_elements.

        Returns:
            Path of the saved copy (see generate_output_path).

        Raises:
            ProcessorError: if the document cannot be loaded, updated or saved.
        """
        try:
            # Load the original document
            doc = Document(file_path)

            # Group translations by type, keyed by element position
            paragraph_translations = {}
            table_translations = {}
            header_footer_translations = {}

            for translated_text, metadata in translations:
                element_type = metadata['element_type']

                if element_type == 'paragraph':
                    paragraph_translations[metadata['index']] = translated_text
                elif element_type == 'table_cell':
                    table_key = (metadata['table_index'], metadata['row_index'], metadata['cell_index'])
                    table_translations[table_key] = translated_text
                elif element_type in ('header', 'footer'):
                    part_key = (element_type, metadata['section_index'], metadata['paragraph_index'])
                    header_footer_translations[part_key] = translated_text

            # Apply paragraph translations
            for para_idx, paragraph in enumerate(doc.paragraphs):
                if para_idx in paragraph_translations:
                    paragraph.text = paragraph_translations[para_idx]

            # Apply table translations
            for table_idx, table in enumerate(doc.tables):
                for row_idx, row in enumerate(table.rows):
                    for cell_idx, cell in enumerate(row.cells):
                        table_key = (table_idx, row_idx, cell_idx)
                        if table_key in table_translations:
                            cell.text = table_translations[table_key]

            # Apply header and footer translations
            for element_type, section_idx, para_idx, paragraph in self._iter_header_footer_paragraphs(doc):
                part_key = (element_type, section_idx, para_idx)
                if part_key in header_footer_translations:
                    paragraph.text = header_footer_translations[part_key]

            # Save translated document
            output_path = self.generate_output_path(file_path, "translated")
            doc.save(output_path)

            return output_path

        except Exception as e:
            raise ProcessorError(f"Failed to apply translations to Word document: {str(e)}") from e

    @property
    def supported_extensions(self) -> List[str]:
        """File extensions this processor handles."""
        return ['.docx']
processors/pdf_processor.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple, Generator
2
+ from pathlib import Path
3
+ import fitz # PyMuPDF
4
+ from reportlab.pdfgen import canvas
5
+ from reportlab.lib.pagesizes import letter
6
+ from reportlab.pdfbase import pdfutils
7
+ from reportlab.pdfbase.ttfonts import TTFont
8
+ from reportlab.pdfbase import pdfmetrics
9
+ from core.base_processor import DocumentProcessor
10
+ from core.exceptions import ProcessorError
11
+
12
+ class PDFProcessor(DocumentProcessor):
13
+ """PDF document processor"""
14
+
15
def __init__(self, translator):
    """Store the translator and choose the font used for the output PDF.

    DejaVuSans is registered from 'DejaVuSans.ttf' when that file can be
    loaded; otherwise the processor falls back to Helvetica.
    """
    super().__init__(translator)
    try:
        # Try to register a font that supports multiple languages
        pdfmetrics.registerFont(TTFont('DejaVuSans', 'DejaVuSans.ttf'))
        self.font_name = 'DejaVuSans'
    except Exception:
        # Font file missing or unreadable. Catch Exception rather than using
        # a bare except so KeyboardInterrupt/SystemExit still propagate.
        self.font_name = 'Helvetica'
25
+
26
def extract_text_elements(self, file_path: Path) -> Generator[Tuple[str, dict], None, None]:
    """Extract text from PDF, one element per text block.

    Yields:
        (text, metadata) for every non-blank text block. ``metadata`` holds
        'page_number' (0-based), 'block_index', 'bbox' and 'original_text'.

    Raises:
        ProcessorError: if the PDF cannot be opened or read.
    """
    try:
        pdf_document = fitz.open(file_path)
        try:
            for page_num in range(len(pdf_document)):
                page = pdf_document[page_num]
                text_blocks = page.get_text("dict")

                for block_idx, block in enumerate(text_blocks["blocks"]):
                    if "lines" not in block:  # Not a text block
                        continue

                    # One output line per PDF line: its spans joined together.
                    block_text = "".join(
                        "".join(span["text"] for span in line["spans"]) + "\n"
                        for line in block["lines"]
                    ).strip()

                    if block_text:
                        yield block_text, {
                            'page_number': page_num,
                            'block_index': block_idx,
                            'bbox': block["bbox"],  # Bounding box for positioning
                            'original_text': block_text
                        }
        finally:
            # Runs on errors and when the consumer abandons the generator
            # early, so the file handle is always released.
            pdf_document.close()

    except Exception as e:
        raise ProcessorError(f"Failed to extract text from PDF: {str(e)}") from e
56
+
57
+ def apply_translations(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
58
+ """
59
+ Apply translations to PDF by creating a new document
60
+ Note: PDF translation is complex due to formatting preservation.
61
+ This creates a simplified translated version.
62
+ """
63
+ try:
64
+ # Create output path
65
+ output_path = self.generate_output_path(file_path, "translated")
66
+
67
+ # Group translations by page
68
+ page_translations = {}
69
+ for translated_text, metadata in translations:
70
+ page_num = metadata['page_number']
71
+ if page_num not in page_translations:
72
+ page_translations[page_num] = []
73
+ page_translations[page_num].append({
74
+ 'text': translated_text,
75
+ 'bbox': metadata['bbox']
76
+ })
77
+
78
+ # Create new PDF with translations
79
+ pdf_canvas = canvas.Canvas(str(output_path), pagesize=letter)
80
+
81
+ # Get original PDF dimensions
82
+ original_pdf = fitz.open(file_path)
83
+
84
+ for page_num in range(len(original_pdf)):
85
+ if page_num > 0:
86
+ pdf_canvas.showPage()
87
+
88
+ # Get page dimensions
89
+ page = original_pdf[page_num]
90
+ page_rect = page.rect
91
+
92
+ if page_num in page_translations:
93
+ y_position = page_rect.height - 50 # Start from top
94
+
95
+ for translation_block in page_translations[page_num]:
96
+ text = translation_block['text']
97
+
98
+ # Set font and size
99
+ pdf_canvas.setFont(self.font_name, 12)
100
+
101
+ # Handle multi-line text
102
+ lines = text.split('\n')
103
+ for line in lines:
104
+ if line.strip():
105
+ pdf_canvas.drawString(50, y_position, line.strip())
106
+ y_position -= 15 # Line spacing
107
+
108
+ y_position -= 10 # Block spacing
109
+
110
+ pdf_canvas.save()
111
+ original_pdf.close()
112
+
113
+ return output_path
114
+
115
+ except Exception as e:
116
+ raise ProcessorError(f"Failed to apply translations to PDF: {str(e)}")
117
+
118
+ def create_text_only_pdf(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
119
+ """
120
+ Create a simplified text-only PDF with translations
121
+ This is a fallback method for complex PDFs
122
+ """
123
+ try:
124
+ output_path = self.generate_output_path(file_path, "translated_text_only")
125
+
126
+ # Group by pages
127
+ page_translations = {}
128
+ for translated_text, metadata in translations:
129
+ page_num = metadata['page_number']
130
+ if page_num not in page_translations:
131
+ page_translations[page_num] = []
132
+ page_translations[page_num].append(translated_text)
133
+
134
+ pdf_canvas = canvas.Canvas(str(output_path), pagesize=letter)
135
+
136
+ for page_num in sorted(page_translations.keys()):
137
+ if page_num > 0:
138
+ pdf_canvas.showPage()
139
+
140
+ # Add page title
141
+ pdf_canvas.setFont(self.font_name, 14)
142
+ pdf_canvas.drawString(50, 750, f"Page {page_num + 1}")
143
+
144
+ y_position = 720
145
+ pdf_canvas.setFont(self.font_name, 11)
146
+
147
+ for text_block in page_translations[page_num]:
148
+ lines = text_block.split('\n')
149
+ for line in lines:
150
+ if line.strip() and y_position > 50:
151
+ # Handle long lines by wrapping
152
+ if len(line) > 80:
153
+ words = line.split()
154
+ current_line = ""
155
+ for word in words:
156
+ if len(current_line + word) < 80:
157
+ current_line += word + " "
158
+ else:
159
+ if current_line.strip():
160
+ pdf_canvas.drawString(50, y_position, current_line.strip())
161
+ y_position -= 12
162
+ current_line = word + " "
163
+ if current_line.strip():
164
+ pdf_canvas.drawString(50, y_position, current_line.strip())
165
+ y_position -= 12
166
+ else:
167
+ pdf_canvas.drawString(50, y_position, line.strip())
168
+ y_position -= 12
169
+ y_position -= 8 # Block spacing
170
+
171
+ pdf_canvas.save()
172
+ return output_path
173
+
174
+ except Exception as e:
175
+ raise ProcessorError(f"Failed to create text-only PDF: {str(e)}")
176
+
177
+ @property
178
+ def supported_extensions(self) -> List[str]:
179
+ return ['.pdf']
processors/pptx_processor.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple, Generator
2
+ from pathlib import Path
3
+ from pptx import Presentation
4
+ from core.base_processor import DocumentProcessor
5
+ from core.exceptions import ProcessorError
6
+
7
class PPTXProcessor(DocumentProcessor):
    """Processor for PowerPoint (.pptx) presentations."""

    def extract_text_elements(self, file_path: Path) -> Generator[Tuple[str, dict], None, None]:
        """Yield the text of every shape that has non-blank text.

        Each item is ``(text, metadata)`` where ``metadata`` records the slide
        index, the shape index, the shape's type name and the original text.

        Raises:
            ProcessorError: If the presentation cannot be read.
        """
        try:
            presentation = Presentation(file_path)

            for slide_no, slide in enumerate(presentation.slides):
                for shape_no, shape in enumerate(slide.shapes):
                    # Only shapes exposing a ``text`` attribute are considered.
                    if not hasattr(shape, "text"):
                        continue
                    if not shape.text.strip():
                        continue

                    details = dict(
                        slide_index=slide_no,
                        shape_index=shape_no,
                        shape_type=str(type(shape)),
                        original_text=shape.text,
                    )
                    yield shape.text, details

        except Exception as e:
            raise ProcessorError(f"Failed to extract text from PowerPoint: {str(e)}")

    def apply_translations(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """Write the translations into a copy of the presentation.

        Translations are matched to shapes by the ``slide_index`` and
        ``shape_index`` stored in their metadata.

        Returns:
            Path of the saved, translated presentation.

        Raises:
            ProcessorError: If the presentation cannot be read or saved.
        """
        try:
            presentation = Presentation(file_path)

            # slide index -> {shape index -> translated text}
            by_slide = {}
            for new_text, details in translations:
                slide_no = details['slide_index']
                shape_no = details['shape_index']
                by_slide.setdefault(slide_no, {})[shape_no] = new_text

            for slide_no, slide in enumerate(presentation.slides):
                pending = by_slide.get(slide_no)
                if pending is None:
                    continue
                for shape_no, shape in enumerate(slide.shapes):
                    if shape_no in pending and hasattr(shape, "text"):
                        shape.text = pending[shape_no]

            destination = self.generate_output_path(file_path, "translated")
            presentation.save(destination)

            return destination

        except Exception as e:
            raise ProcessorError(f"Failed to apply translations to PowerPoint: {str(e)}")

    @property
    def supported_extensions(self) -> List[str]:
        """File extensions handled by this processor."""
        return ['.pptx']
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ streamlit==1.46.1
3
+ openai>=1.0.0
4
+ python-pptx>=0.6.21
5
+ python-docx>=1.1.0
6
+ PyMuPDF>=1.23.0 # fitz for PDF processing
7
+ reportlab>=4.0.0 # PDF generation
8
+
9
+ # Optional dependencies for better PDF support
10
+ Pillow>=10.0.0 # Image processing
11
+ fonttools>=4.0.0 # Font handling
12
+
13
+ # Development dependencies (optional)
14
+ # pytest>=7.0.0
15
+ # black>=23.0.0
16
+ # flake8>=6.0.0
translators/chatgpt_translator.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+ from core.base_translator import BaseTranslator
3
+ from core.exceptions import APIKeyError, TranslationError
4
+ from utils.constants import API_PROVIDERS, PROVIDER_MODELS
5
+
6
class ChatGPTTranslator(BaseTranslator):
    """ChatGPT/OpenAI translator implementation."""

    def __init__(self, api_key: str):
        """Create the OpenAI client for the given API key."""
        super().__init__(api_key)
        self.client = OpenAI(api_key=api_key)

    def _validate_api_key(self) -> None:
        """Validate the OpenAI API key format.

        Raises:
            APIKeyError: If the key is empty or does not start with ``sk-``.
        """
        if not self.api_key or not self.api_key.startswith('sk-'):
            raise APIKeyError("Invalid OpenAI API key format. Must start with 'sk-'")

    def _make_translation_request(self, text: str, source_lang: str, target_lang: str) -> str:
        """Send one translation request to the OpenAI chat completions API.

        Args:
            text: Text to translate (sent as the user message).
            source_lang: Source language, used to build the system prompt.
            target_lang: Target language, used to build the system prompt.

        Returns:
            The content of the first completion choice.

        Raises:
            APIKeyError: If the API reports an invalid key.
            TranslationError: On rate limit, quota or any other API failure,
                or when the response carries no text.
        """
        # Only the network call is guarded, so the "empty response" error
        # below is not re-wrapped as a generic API error.
        try:
            response = self.client.chat.completions.create(
                model=PROVIDER_MODELS["ChatGPT"],
                messages=[
                    {
                        "role": "system",
                        "content": self.get_system_prompt(source_lang, target_lang)
                    },
                    {
                        "role": "user",
                        "content": text
                    }
                ],
                temperature=0.1,  # Low temperature for consistent translations
                max_tokens=4000,
                top_p=0.9
            )
        except Exception as e:
            # Classify by message text; the original exception is chained.
            error_text = str(e).lower()
            if "invalid_api_key" in error_text:
                raise APIKeyError("Invalid ChatGPT API key") from e
            if "rate_limit" in error_text:
                raise TranslationError("Rate limit exceeded. Please try again later.") from e
            if "quota" in error_text:
                raise TranslationError("API quota exceeded. Please check your billing.") from e
            raise TranslationError(f"ChatGPT API error: {str(e)}") from e

        if not response.choices or not response.choices[0].message:
            raise TranslationError("Empty response from ChatGPT API")

        content = response.choices[0].message.content
        if content is None:
            # ``content`` may be None; never hand None to callers expecting str.
            raise TranslationError("Empty response from ChatGPT API")

        return content

    @property
    def provider_name(self) -> str:
        """Display name of this provider."""
        return "ChatGPT"
translators/deepseek_translator.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+ from core.base_translator import BaseTranslator
3
+ from core.exceptions import APIKeyError, TranslationError
4
+ from utils.constants import API_PROVIDERS, PROVIDER_MODELS
5
+
6
class DeepSeekTranslator(BaseTranslator):
    """DeepSeek translator implementation (OpenAI-compatible client)."""

    def __init__(self, api_key: str):
        """Create an OpenAI client pointed at the DeepSeek base URL."""
        super().__init__(api_key)
        self.client = OpenAI(
            api_key=api_key,
            base_url=API_PROVIDERS["DeepSeek"]
        )

    def _validate_api_key(self) -> None:
        """Validate the DeepSeek API key format.

        Raises:
            APIKeyError: If the key is empty or shorter than 10 characters.
        """
        if not self.api_key or len(self.api_key) < 10:
            raise APIKeyError("Invalid DeepSeek API key format")

    def _make_translation_request(self, text: str, source_lang: str, target_lang: str) -> str:
        """Send one translation request to the DeepSeek chat completions API.

        Args:
            text: Text to translate (sent as the user message).
            source_lang: Source language, used to build the system prompt.
            target_lang: Target language, used to build the system prompt.

        Returns:
            The content of the first completion choice.

        Raises:
            APIKeyError: If the API reports an invalid or unauthorized key.
            TranslationError: On rate limit, quota or any other API failure,
                or when the response carries no text.
        """
        # Only the network call is guarded, so the "empty response" error
        # below is not re-wrapped as a generic API error.
        try:
            response = self.client.chat.completions.create(
                model=PROVIDER_MODELS["DeepSeek"],
                messages=[
                    {
                        "role": "system",
                        "content": self.get_system_prompt(source_lang, target_lang)
                    },
                    {
                        "role": "user",
                        "content": text
                    }
                ],
                temperature=0.1,  # Low temperature for consistent translations
                max_tokens=4000,
                top_p=0.9
            )
        except Exception as e:
            # Classify by message text; the original exception is chained.
            error_text = str(e).lower()
            if "invalid_api_key" in error_text or "unauthorized" in error_text:
                raise APIKeyError("Invalid DeepSeek API key") from e
            if "rate_limit" in error_text:
                raise TranslationError("Rate limit exceeded. Please try again later.") from e
            if "quota" in error_text:
                raise TranslationError("API quota exceeded. Please check your billing.") from e
            raise TranslationError(f"DeepSeek API error: {str(e)}") from e

        if not response.choices or not response.choices[0].message:
            raise TranslationError("Empty response from DeepSeek API")

        content = response.choices[0].message.content
        if content is None:
            # ``content`` may be None; never hand None to callers expecting str.
            raise TranslationError("Empty response from DeepSeek API")

        return content

    @property
    def provider_name(self) -> str:
        """Display name of this provider."""
        return "DeepSeek"
utils/constants.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict
2
+
3
# Display name -> language code for every language offered by the app.
# Insertion order (alphabetical) is preserved.
LANGUAGES: Dict[str, str] = {
    "Arabic": "ar",
    "Chinese (Simplified)": "zh",
    "Chinese (Traditional)": "zh-TW",
    "Dutch": "nl",
    "English": "en",
    "French": "fr",
    "German": "de",
    "Greek": "el",
    "Hindi": "hi",
    "Indonesian": "id",
    "Italian": "it",
    "Japanese": "ja",
    "Korean": "ko",
    "Polish": "pl",
    "Portuguese": "pt",
    "Russian": "ru",
    "Spanish": "es",
    "Swedish": "sv",
    "Thai": "th",
    "Turkish": "tr",
    "Vietnamese": "vi",
}

# Provider name -> API base URL.
API_PROVIDERS: Dict[str, str] = {
    "ChatGPT": "https://api.openai.com/v1",
    "DeepSeek": "https://api.deepseek.com",
}

# Provider name -> model identifier sent with each request.
PROVIDER_MODELS: Dict[str, str] = {
    "ChatGPT": "gpt-4o",
    "DeepSeek": "deepseek-chat",
}

# System prompt template; fill in with
# ``.format(source_lang=..., target_lang=...)``. Worded strictly so the model
# returns only the translation, without commentary.
STRICT_TRANSLATION_PROMPT = """You are a professional translator. Your task is to translate text accurately while maintaining the original formatting and structure.

CRITICAL RULES:
1. Return ONLY the translated text
2. Do NOT add explanations, comments, or notes
3. Do NOT add quotation marks around the translation
4. Do NOT add phrases like "Here is the translation:" or "The translation is:"
5. Preserve original formatting, line breaks, and punctuation
6. If you cannot translate something, return it unchanged
7. Do NOT be conversational - just translate

Translate from {source_lang} to {target_lang}. Return only the translated text."""

# Regular expressions describing unwanted LLM additions for post-processing.
UNWANTED_PATTERNS = [
    # Leading "Here is the translation:"-style preamble
    r'^(Here is the translation|The translation is|Translation|Translated text):?\s*',
    # Text wrapped entirely in double quotes
    r'^"([^"]*)"$',
    # Leading bullet marker
    r'^\s*[-•]\s*',
    # Trailing note / comment / explanation
    r'\n\n(Note|Comment|Explanation):.*$',
]

# Upload size limit, in megabytes.
MAX_FILE_SIZE_MB = 50

# File extensions accepted for translation.
SUPPORTED_EXTENSIONS = [".pptx", ".pdf", ".docx"]
utils/logger.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import sys
3
+ from datetime import datetime
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
class ColoredFormatter(logging.Formatter):
    """Formatter that wraps the level name in ANSI colors for console output."""

    COLORS = {
        'DEBUG': '\033[36m',     # Cyan
        'INFO': '\033[32m',      # Green
        'WARNING': '\033[33m',   # Yellow
        'ERROR': '\033[31m',     # Red
        'CRITICAL': '\033[35m',  # Magenta
        'ENDC': '\033[0m'        # End color
    }

    def format(self, record):
        """Return the formatted record with a colored level name.

        The record is shared by every handler of the logger, so the original
        ``levelname`` is restored afterwards. Otherwise the escape codes
        would leak into other handlers' output and nest on repeated
        formatting.
        """
        plain_levelname = record.levelname
        log_color = self.COLORS.get(plain_levelname, self.COLORS['ENDC'])
        record.levelname = f"{log_color}{plain_levelname}{self.COLORS['ENDC']}"
        try:
            return super().format(record)
        finally:
            record.levelname = plain_levelname
23
+
24
def setup_logger(name: str = "BabelSlide", level: int = logging.INFO, log_file: Optional[Path] = None) -> logging.Logger:
    """
    Set up a logger with colored console output and optional file output.

    Handlers already attached to the logger are removed and closed first, so
    calling this repeatedly neither duplicates output nor leaks open log
    files.

    Args:
        name: Logger name
        level: Logging level applied to the logger and its handlers
        log_file: Optional file path for logging; parent directories are
            created when missing

    Returns:
        Configured logger instance
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)

    # Remove AND close old handlers; merely clearing the list would leave
    # previously opened log files open.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()

    # Console handler with colors
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(level)
    console_handler.setFormatter(ColoredFormatter(
        '%(asctime)s | %(levelname)s | %(name)s | %(message)s',
        datefmt='%H:%M:%S'
    ))
    logger.addHandler(console_handler)

    # File handler if specified
    if log_file:
        log_file.parent.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8: log messages may contain text in any language.
        file_handler = logging.FileHandler(log_file, encoding='utf-8')
        file_handler.setLevel(level)
        file_handler.setFormatter(logging.Formatter(
            '%(asctime)s | %(levelname)s | %(name)s | %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        ))
        logger.addHandler(file_handler)

    return logger
65
+
66
class ProcessLogger:
    """Logger for tracking document processing progress."""

    def __init__(self, logger: logging.Logger):
        """Wrap ``logger``; no process is active until ``start_process``."""
        self.logger = logger
        self.start_time = None
        self.current_step = None
        self.total_steps = None

    def start_process(self, total_steps: int, process_name: str = "Processing"):
        """Start a new process with progress tracking.

        Args:
            total_steps: Number of steps expected for the process.
            process_name: Label used in the start message.
        """
        self.start_time = datetime.now()
        self.total_steps = total_steps
        self.current_step = 0
        self.logger.info(f"Started {process_name} - Total steps: {total_steps}")

    def log_step(self, step_name: str, step_number: Optional[int] = None):
        """Log completion of a processing step.

        Args:
            step_name: Description of the completed step.
            step_number: Explicit step number; when omitted the internal
                counter is advanced by one.
        """
        if step_number is not None:
            self.current_step = step_number
        else:
            # The counter is None before start_process() and after
            # finish_process(); count from zero instead of raising TypeError.
            self.current_step = (self.current_step or 0) + 1

        if self.total_steps:
            progress = (self.current_step / self.total_steps) * 100
            self.logger.info(f"Step {self.current_step}/{self.total_steps} ({progress:.1f}%): {step_name}")
        else:
            self.logger.info(f"Step {self.current_step}: {step_name}")

    def finish_process(self, success: bool = True):
        """Log the outcome and duration of the process, then reset state.

        Nothing is logged when no process was started.
        """
        if self.start_time:
            duration = datetime.now() - self.start_time
            status = "completed successfully" if success else "failed"
            self.logger.info(f"Process {status} in {duration.total_seconds():.2f} seconds")

        # Reset state
        self.start_time = None
        self.current_step = None
        self.total_steps = None
utils/validator.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ from typing import List, Optional
4
+ from core.exceptions import ValidationError, UnsupportedFileError, FileSizeError
5
+ from utils.constants import MAX_FILE_SIZE_MB, SUPPORTED_EXTENSIONS, LANGUAGES
6
+
7
class FileValidator:
    """Validator for file uploads and processing parameters."""

    # Characters replaced by "_" in file names: reserved path/shell
    # characters plus ASCII control characters (NUL makes open() fail).
    _UNSAFE_FILENAME_CHARS = str.maketrans(
        {char: '_' for char in '<>:"/\\|?*' + ''.join(map(chr, range(32)))}
    )

    @staticmethod
    def validate_file(file_path: Path) -> None:
        """
        Validate an uploaded file.

        Args:
            file_path: Path to the file to validate

        Raises:
            ValidationError: If the file is missing, not a regular file or unreadable
            UnsupportedFileError: If the file format is not supported
            FileSizeError: If the file is too large
        """
        if not file_path.exists():
            raise ValidationError(f"File does not exist: {file_path}")

        if not file_path.is_file():
            raise ValidationError(f"Path is not a file: {file_path}")

        # Check file extension (case-insensitive)
        extension = file_path.suffix.lower()
        if extension not in SUPPORTED_EXTENSIONS:
            raise UnsupportedFileError(
                f"Unsupported file format: {extension}. "
                f"Supported formats: {', '.join(SUPPORTED_EXTENSIONS)}"
            )

        # Check file size
        file_size_mb = file_path.stat().st_size / (1024 * 1024)
        if file_size_mb > MAX_FILE_SIZE_MB:
            raise FileSizeError(
                f"File too large: {file_size_mb:.1f}MB. "
                f"Maximum allowed size: {MAX_FILE_SIZE_MB}MB"
            )

        # Check that the file is readable by reading its first KB
        try:
            with open(file_path, 'rb') as f:
                f.read(1024)
        except OSError as e:
            raise ValidationError(f"Cannot read file: {str(e)}") from e

    @staticmethod
    def validate_language(language: str) -> str:
        """
        Validate and normalize language input.

        Accepts a language name or code from ``LANGUAGES``. An exact name is
        returned as is; otherwise names and codes are compared ignoring case
        and surrounding whitespace.

        Args:
            language: Language name or code

        Returns:
            Normalized language name (a key of ``LANGUAGES``)

        Raises:
            ValidationError: If the language is empty or not supported
        """
        if not language or not language.strip():
            raise ValidationError("Language cannot be empty")

        # Fast path: exact language name
        if language in LANGUAGES:
            return language

        wanted = language.strip().casefold()
        for name, code in LANGUAGES.items():
            if wanted in (name.casefold(), code.casefold()):
                return name

        raise ValidationError(
            f"Unsupported language: {language}. "
            f"Supported languages: {', '.join(LANGUAGES.keys())}"
        )

    @staticmethod
    def validate_api_key(api_key: str, provider: str) -> None:
        """
        Validate API key format.

        The key is checked after stripping surrounding whitespace; the
        caller's value is not modified.

        Args:
            api_key: API key to validate
            provider: API provider name ("ChatGPT" or "DeepSeek")

        Raises:
            ValidationError: If the API key is invalid or the provider is unknown
        """
        if not api_key or not api_key.strip():
            raise ValidationError("API key cannot be empty")

        api_key = api_key.strip()

        if provider == "ChatGPT":
            if not api_key.startswith('sk-'):
                raise ValidationError("OpenAI API key must start with 'sk-'")
            if len(api_key) < 20:
                raise ValidationError("OpenAI API key appears too short")

        elif provider == "DeepSeek":
            if len(api_key) < 10:
                raise ValidationError("DeepSeek API key appears too short")

        else:
            raise ValidationError(f"Unknown provider: {provider}")

    @staticmethod
    def validate_translation_params(
        source_lang: str,
        target_lang: str,
        api_provider: str,
        api_key: str
    ) -> tuple[str, str]:
        """
        Validate all translation parameters.

        Args:
            source_lang: Source language
            target_lang: Target language
            api_provider: API provider name
            api_key: API key

        Returns:
            Tuple of normalized (source_lang, target_lang)

        Raises:
            ValidationError: If any parameter is invalid
        """
        # Validate languages
        norm_source = FileValidator.validate_language(source_lang)
        norm_target = FileValidator.validate_language(target_lang)

        # Compare normalized names so "en" and "English" count as equal
        if norm_source == norm_target:
            raise ValidationError("Source and target languages cannot be the same")

        # Validate API provider
        if api_provider not in ["ChatGPT", "DeepSeek"]:
            raise ValidationError(f"Unsupported API provider: {api_provider}")

        # Validate API key
        FileValidator.validate_api_key(api_key, api_provider)

        return norm_source, norm_target

    @staticmethod
    def sanitize_filename(filename: str) -> str:
        """
        Sanitize a filename for safe file operations.

        Reserved characters (``<>:"/\\|?*``) and ASCII control characters are
        replaced by ``_``; leading/trailing spaces and dots are removed.

        Args:
            filename: Original filename

        Returns:
            Sanitized filename, or "translated_document" if nothing is left
        """
        # One pass over the string instead of one replace() per character
        filename = filename.translate(FileValidator._UNSAFE_FILENAME_CHARS)

        # Remove leading/trailing spaces and dots
        filename = filename.strip(' .')

        # Ensure filename is not empty
        if not filename:
            filename = "translated_document"

        return filename