Vlad Bastina committed on
Commit
ad6a882
·
0 Parent(s):

first commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ *.PNG filter=lfs diff=lfs merge=lfs -text
2
+ *.ttf filter=lfs diff=lfs merge=lfs -text
3
+ *.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ *.pdf
2
+ !default_pharma.pdf
3
+ *.py
4
+ !app.py
5
+ .streamlit/secrets.toml
.streamlit/config.toml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [theme]
2
+ base="light"
DejaVuSans.ttf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08ca98e69d9d8fa1065584b4f9ab7d49b6205abea6572b90e171b254845bb990
3
+ size 741536
app.py ADDED
@@ -0,0 +1,545 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import sys
4
+ import time
5
+ import io # Needed for handling file streams in memory
6
+ from pathlib import Path
7
+
8
+ # --- Import necessary libraries ---
9
+ try:
10
+ import google.generativeai as genai
11
+ from google.api_core import exceptions as google_exceptions
12
+ except ImportError:
13
+ print(sys.path)
14
+ print(sys.executable)
15
+ st.error("Error: google-generativeai library not found. Please install it: `pip install google-generativeai`")
16
+ st.stop()
17
+
18
+ try:
19
+ import pypdf
20
+ except ImportError:
21
+ st.error("Error: pypdf library not found. Please install it: `pip install pypdf`")
22
+ st.stop()
23
+
24
# --- PDF generation dependencies (reportlab) and output font selection ---
try:
    from reportlab.lib.pagesizes import letter
    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
    from reportlab.lib.styles import getSampleStyleSheet
    from reportlab.lib.enums import TA_JUSTIFY
    from reportlab.pdfbase import pdfmetrics
    from reportlab.pdfbase.ttfonts import TTFont
except ImportError:
    st.error("Error: reportlab library not found. Please install it: `pip install reportlab`")
    st.stop()
else:
    # Prefer DejaVuSans (wider character coverage) and fall back to
    # Helvetica when the font file is missing or cannot be registered.
    try:
        # The font file is expected to sit next to this script.
        font_path = Path(__file__).parent / 'DejaVuSans.ttf'
        if not font_path.exists():
            DEFAULT_FONT = 'Helvetica'
            print("Warning: DejaVuSans.ttf not found. Using Helvetica.")
            # Surface the same warning inside the Streamlit app.
            st.warning("⚠️ Warning: DejaVuSans font not found. Non-Latin characters might not render correctly in the output PDF. Consider placing `DejaVuSans.ttf` in the app directory.")
        else:
            pdfmetrics.registerFont(TTFont('DejaVuSans', str(font_path)))
            DEFAULT_FONT = 'DejaVuSans'
            print("Using DejaVuSans font.")  # Console log only
    except Exception as font_e:
        st.warning(f"⚠️ Warning: Error registering font. Using Helvetica. Details: {font_e}")
        DEFAULT_FONT = 'Helvetica'
51
+
52
# --- Configuration ---
# The API key is not defined here; configure_gemini() obtains it at startup.
MODEL_NAME = "gemini-1.5-pro"  # Or "gemini-1.5-flash-latest" etc.

# Every harm category uses the same blocking threshold.
SAFETY_SETTINGS = [
    {"category": category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
    for category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

BATCH_SIZE = 50  # Number of pages to process per batch if PDF is large
API_CALL_DELAY = 0.5  # Optional delay in seconds between API calls
DEFAULT_PDF_PATH = Path(__file__).parent / "default_pharma.pdf"  # Bundled sample PDF
LANGUAGES = ["russian", "romanian", "english", "german", "french", "spanish"]

# --- Core Functions ---

# Module-level handle to the configured model; None until configure_gemini()
# succeeds.
gemini_model = None
70
+
71
def configure_gemini(api_key=None):
    """Configure the Gemini client and store the model in ``gemini_model``.

    Args:
        api_key: API key to use. When falsy (the app passes ``None``), the key
            is read from ``st.secrets["GOOGLE_API_KEY"]`` instead.

    Returns:
        True when the model was created; False otherwise, in which case the
        error is shown via ``st.error`` and ``gemini_model`` is reset to None.
    """
    global gemini_model
    try:
        # Honour an explicitly supplied key; previously the argument was
        # ignored and the secrets entry was always used.
        resolved_key = api_key or st.secrets["GOOGLE_API_KEY"]
        genai.configure(api_key=resolved_key)
        gemini_model = genai.GenerativeModel(MODEL_NAME, safety_settings=SAFETY_SETTINGS)
        return True
    except Exception as e:
        st.error(f"Error configuring Gemini: {e}")
        gemini_model = None  # Ensure model is None if config fails
        return False
82
+
83
def extract_text_from_pdf(pdf_file_obj):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file_obj: PDF source handed to ``pypdf.PdfReader``.

    Returns:
        A list with one stripped string per page ("" for a page that has no
        text or whose extraction failed), or None when the PDF as a whole
        could not be processed (the error is reported via ``st.error``).
    """
    try:
        reader = pypdf.PdfReader(pdf_file_obj)
        num_pages = len(reader.pages)
        st.info(f"Found {num_pages} page(s) in the PDF.")
        progress_bar = st.progress(0)
        status_text = st.empty()

        page_texts = []
        for page_num, page in enumerate(reader.pages, start=1):
            try:
                text = page.extract_text()
            except Exception as e:
                st.warning(f"Warning: Could not extract text from page {page_num}: {e}")
                text = ""

            # Always append exactly one entry so list indices match page
            # numbers, and always advance the progress display, including for
            # pages that failed.
            page_texts.append(text.strip() if text else "")
            status_text.text(f"Extracting text from page {page_num}/{num_pages}")
            progress_bar.progress(page_num / num_pages)

        status_text.text("Text extraction complete.")
        return page_texts
    except pypdf.errors.PdfReadError as e:
        st.error(f"Error reading PDF file: {e}. The file might be corrupted, password-protected, or not a valid PDF.")
        return None
    except Exception as e:
        st.error(f"An unexpected error occurred during PDF processing: {e}")
        return None
115
+
116
def extract_text_from_txt(txt_file_obj):
    """Read and decode the content of a TXT file object.

    Args:
        txt_file_obj: Binary file-like object; ``read()`` must return bytes.

    Returns:
        The decoded text, or None if reading failed (reported via
        ``st.error``).
    """
    try:
        content_bytes = txt_file_obj.read()
        try:
            # 'utf-8-sig' decodes plain UTF-8 and additionally drops a leading
            # byte-order mark, which would otherwise end up in the text sent
            # for translation.
            text = content_bytes.decode('utf-8-sig')
        except UnicodeDecodeError:
            # latin-1 maps every byte value to a character, so this fallback
            # cannot fail; it can only mis-read other encodings.
            text = content_bytes.decode('latin-1')
            st.warning("Decoded TXT file using 'latin-1'. Some characters might be misinterpreted if the encoding is different.")
        st.info("Successfully read text file.")
        return text
    except Exception as e:
        st.error(f"An error occurred reading the TXT file: {e}")
        return None
137
+
138
def translate_text_gemini(text, source_lang, target_lang, page_num_for_log=""):
    """Translate one block of text with the configured Gemini model.

    Args:
        text: Text to translate.
        source_lang: Name of the source language, inserted into the prompt.
        target_lang: Name of the target language, inserted into the prompt.
        page_num_for_log: Optional page label used in messages and in the
            bracketed failure placeholders.

    Returns:
        - the translated text on success;
        - "" when ``text`` is empty or the model finished normally without
          producing content;
        - a bracketed placeholder string ("[Translation blocked on ...]" or
          "[Translation failed on ...]") when the request was blocked or the
          API call failed;
        - None when no model is configured.
    """
    if gemini_model is None:
        st.error("Gemini model not configured. Cannot translate.")
        return None  # Indicate failure

    if not text:
        return ""  # Nothing to translate

    log_prefix = f"Page {page_num_for_log}: " if page_num_for_log else "Text block: "
    # Label for the placeholders, e.g. "Page 3". The prefix ends with ": ",
    # so both characters must be stripped; stripping only ':' left the colon
    # in place and produced "Page 3: : ...".
    location = log_prefix.rstrip(": ")

    prompt = (
        f"Translate the following text from {source_lang} to {target_lang}.\n"
        'Preserve paragraph breaks where appropriate. Output *only* the translated text, without any introductory phrases like "Here is the translation:", or any explanations or markdown formatting. If the input text is empty or nonsensical for translation, output nothing.\n'
        "\n"
        "Text to translate:\n"
        "---\n"
        f"{text}\n"
        "---\n"
        "\n"
        "Translation:"
    )

    try:
        # Optional: Add delay between calls
        if API_CALL_DELAY > 0:
            time.sleep(API_CALL_DELAY)

        response = gemini_model.generate_content(prompt)

        # Read the parts from the first candidate directly.
        # NOTE(review): the `response.parts` / `response.text` shortcuts
        # appear to raise when no candidate was returned (e.g. a blocked
        # prompt), which would bypass the block-reason handling below -
        # confirm against the installed SDK version.
        candidates = response.candidates
        translated_text = ""
        if candidates:
            translated_text = "".join(
                part.text for part in candidates[0].content.parts
            ).strip()

        if translated_text:
            return translated_text

        feedback = response.prompt_feedback
        if feedback and feedback.block_reason:
            st.warning(f"{log_prefix}Translation blocked. Reason: {feedback.block_reason}")
            return f"[Translation blocked on {location}: {feedback.block_reason}]"

        finish_reason = candidates[0].finish_reason if candidates else 'UNKNOWN'
        # NOTE(review): finish_reason is presumably an enum in the SDK, so it
        # is compared by member name; a plain string still works through the
        # str() fallback.
        finish_name = getattr(finish_reason, "name", str(finish_reason))
        if finish_name == 'STOP':
            # Don't warn if input was likely just whitespace/empty
            if text.strip():
                st.warning(f"{log_prefix}Received no translated content (finish reason STOP). Original text might have been empty or untranslatable.")
            return ""  # No content and no blocking

        st.warning(f"{log_prefix}Received empty response from API. Finish Reason: {finish_reason}, Feedback: {response.prompt_feedback}")
        return f"[Translation failed on {location}: Empty API response]"

    except google_exceptions.ResourceExhausted as e:
        st.error(f"{log_prefix}Error: Gemini API quota exceeded: {e}. Consider increasing API_CALL_DELAY or checking your quota.")
        return f"[Translation failed on {location}: Quota Exceeded - {e}]"
    except google_exceptions.InvalidArgument as e:
        st.error(f"{log_prefix}Error: Invalid argument passed to Gemini API: {e}")
        return f"[Translation failed on {location}: Invalid Argument - {e}]"
    except Exception as e:
        st.error(f"{log_prefix}Error during Gemini API call: {e}")
        return f"[Translation failed on {location}: {e}]"
202
+
203
+
204
def translate_pages_in_batches(original_pages_text, source_lang, target_lang):
    """Translate a list of page texts one page at a time.

    Pages are sent individually; ``BATCH_SIZE`` only affects how progress is
    reported (documents longer than one batch show a batch counter).

    Args:
        original_pages_text: List of page strings, in page order.
        source_lang: Name of the source language.
        target_lang: Name of the target language.

    Returns:
        A list with one translated string per input page ("" for blank
        pages), an empty list when there are no pages, or None when no model
        is configured or a page translation returned None.
    """
    if gemini_model is None:
        st.error("Gemini model not configured. Cannot translate.")
        return None

    total_pages = len(original_pages_text)
    if total_pages == 0:
        st.warning("No text pages found to translate.")
        return []

    st.info(f"Starting translation of {total_pages} page(s)...")
    progress_bar = st.progress(0)
    status_text = st.empty()

    # Ceiling division: number of BATCH_SIZE-sized groups covering all pages.
    num_batches = (total_pages + BATCH_SIZE - 1) // BATCH_SIZE
    batched = total_pages > BATCH_SIZE
    if batched:
        st.info(f"Translating {total_pages} pages in {num_batches} batches of up to {BATCH_SIZE}...")

    translated_pages = []
    for index, text in enumerate(original_pages_text):
        page_num = index + 1
        if batched:
            batch_num = index // BATCH_SIZE + 1
            status_text.text(f"Translating page {page_num}/{total_pages} (Batch {batch_num}/{num_batches})...")
        else:
            status_text.text(f"Translating page {page_num}/{total_pages}...")

        if text.strip():
            translated = translate_text_gemini(text, source_lang, target_lang, page_num_for_log=page_num)
            if translated is None:
                return None  # Propagate failure
            translated_pages.append(translated)
        else:
            # Blank page: keep the slot so page numbering stays aligned.
            translated_pages.append("")

        progress_bar.progress(page_num / total_pages)

    status_text.text("Translation step complete.")
    return translated_pages
266
+
267
+
268
def create_pdf_from_text(translated_pages):
    """Build an in-memory PDF from translated page texts.

    Each input entry becomes one output section: a small header with the
    page number followed by the text, with a page break between sections.

    Args:
        translated_pages: List of translated page strings; falsy entries are
            rendered as a "no translatable text" note.

    Returns:
        An ``io.BytesIO`` positioned at the start of the PDF data, or None
        if the document could not be built (reported via ``st.error``).
    """
    # Local import keeps this block self-contained.
    from xml.sax.saxutils import escape

    pdf_buffer = io.BytesIO()
    try:
        doc = SimpleDocTemplate(pdf_buffer, pagesize=letter)
        styles = getSampleStyleSheet()

        style = styles["Normal"]
        style.fontName = DEFAULT_FONT
        style.fontSize = 10
        style.alignment = TA_JUSTIFY

        style_bold = styles["Heading2"]  # Heading style for the page markers
        style_bold.fontName = DEFAULT_FONT
        style_bold.fontSize = 8  # Make header smaller
        style_bold.alignment = TA_JUSTIFY

        total_pages = len(translated_pages)
        story = []
        st.info(f"Reconstructing PDF with {total_pages} page(s)...")
        progress_bar = st.progress(0)
        status_text = st.empty()

        for i, page_text in enumerate(translated_pages):
            page_num = i + 1
            status_text.text(f"Adding translated page {page_num}/{total_pages} to PDF...")

            # Header indicating the original page number
            story.append(Paragraph(f"--- Translated Page {page_num} ---", style_bold))
            story.append(Spacer(1, 6))  # Small gap after the header

            if page_text:
                # Paragraph interprets its text as XML-like markup, so literal
                # '&', '<' and '>' from the translation must be escaped before
                # the intentional <br/> line-break tags are inserted.
                formatted_text = escape(page_text).replace('\n', '<br/>\n')
                try:
                    story.append(Paragraph(formatted_text, style))
                except Exception as e:
                    st.warning(f"Warning: Could not add text from page {page_num} to PDF (potential encoding/font issue): {e}")
                    try:
                        story.append(Paragraph(f"[Could not render text for page {page_num} due to error. See logs/warnings.]", style))
                    except Exception:
                        # Best effort only: the warning above already
                        # reported the page, so skip the placeholder.
                        pass
            else:
                story.append(Paragraph(f"[No translatable text found or translation failed for page {page_num}]", style))

            # Page break after every page except the last one
            if i < total_pages - 1:
                story.append(PageBreak())

            progress_bar.progress((i + 1) / total_pages)

        doc.build(story)
        status_text.text("PDF reconstruction complete.")
        pdf_buffer.seek(0)  # Rewind so the caller reads from the start
        return pdf_buffer

    except Exception as e:
        st.error(f"Error creating output PDF: {e}")
        return None
326
+
327
def create_txt_from_text(translated_text):
    """Wrap translated text as UTF-8 bytes in an in-memory buffer.

    Args:
        translated_text: The translated text as a string.

    Returns:
        An ``io.BytesIO`` holding the UTF-8 encoded text (the download
        button is given a bytes buffer), or None on failure (reported via
        ``st.error``).
    """
    try:
        # Encode directly; an intermediate StringIO copy adds nothing.
        txt_bytes_buffer = io.BytesIO(translated_text.encode('utf-8'))
        st.info("TXT file content prepared.")
        return txt_bytes_buffer
    except Exception as e:
        st.error(f"Error creating output TXT: {e}")
        return None
340
+
341
+
342
# --- Streamlit App UI ---
st.title("📄 Document Translator")

# Passing None makes configure_gemini() take the key from Streamlit secrets.
configure_gemini(None)

# Resolve the logo next to this script (like the font and default PDF)
# instead of relying on the current working directory, and skip it when the
# file is absent rather than failing the whole page.
logo_path = Path(__file__).parent / 'zega_logo.PNG'
if logo_path.exists():
    st.sidebar.image(str(logo_path), use_container_width=True)

st.sidebar.markdown("---")  # Separator
350
+
351
# --- File Input Options ---
st.sidebar.subheader("📁 Input File")
use_default = st.sidebar.checkbox("Use default Russian pharma PDF", value=False)

uploaded_file = None
source_lang_selected = None
input_file_type = None  # 'pdf' or 'txt' once an input is chosen

if use_default:
    if not DEFAULT_PDF_PATH.exists():
        st.sidebar.error(f"Default PDF '{DEFAULT_PDF_PATH.name}' not found in the app directory!")
        st.stop()
    else:
        st.sidebar.info(f"Using default file: `{DEFAULT_PDF_PATH.name}`")
        source_lang_selected = "russian"  # Default file is Russian
        input_file_type = "pdf"
else:
    uploaded_file = st.sidebar.file_uploader(
        "Upload your PDF or TXT file",
        type=["pdf", "txt"],
        accept_multiple_files=False,
    )
    if uploaded_file:
        # Prefer the file extension: the uploader is restricted to pdf/txt,
        # while the MIME type is client-reported and may not be exactly
        # application/pdf or text/plain.
        extension = Path(uploaded_file.name).suffix.lower().lstrip('.')
        if extension in ("pdf", "txt"):
            input_file_type = extension
        else:
            # Fall back to the MIME subtype ("pdf", or "plain" for text).
            input_file_type = uploaded_file.type.split('/')[-1].lower()
            if input_file_type == 'plain':
                input_file_type = 'txt'

        # The source language is only asked for uploaded files.
        st.sidebar.markdown("👇 Select the **source** language of your uploaded file:")
        source_lang_selected = st.sidebar.selectbox(
            "Source Language",
            options=[""] + LANGUAGES,  # Leading empty entry acts as "nothing selected"
            index=0,
            key="source_lang_uploader",
        )
        if not source_lang_selected:
            st.sidebar.warning("Please select the source language of your document.")
388
+
389
+
390
st.sidebar.markdown("---")  # Separator

# --- Target Language Selection ---
st.sidebar.subheader("🎯 Target Language")
target_lang_selected = None
if not source_lang_selected:
    # A target can only be offered once the source language is known.
    st.sidebar.info("Select or upload a file and its source language first.")
else:
    # Empty first entry means "nothing selected"; the source language itself
    # is left out of the choices.
    target_choices = [""]
    target_choices.extend(lang for lang in LANGUAGES if lang != source_lang_selected)
    target_lang_selected = st.sidebar.selectbox(
        "Translate To",
        options=target_choices,
        index=0,
        key="target_lang",
        help="Select the language you want to translate the document into.",
    )
    if not target_lang_selected:
        st.sidebar.warning("Please select the target language.")
408
+
409
+
410
st.sidebar.markdown("---")  # Separator

# --- Translate Button ---
# The button stays disabled until a model is configured and a target language
# has been chosen.
button_disabled = gemini_model is None or not target_lang_selected
translate_button = st.sidebar.button("Translate Document", disabled=button_disabled)

# Tell the user which selection is still missing, if any.
missing_input_hint = None
if not source_lang_selected:
    missing_input_hint = "_(Select/Upload file and source language to enable translation)_"
elif not target_lang_selected:
    missing_input_hint = "_(Select target language to enable translation)_"
if missing_input_hint is not None:
    st.sidebar.markdown(missing_input_hint)
419
+
420
# --- Main Area for Processing and Results ---
if translate_button:
    st.subheader("🚀 Translation Progress")
    output_buffer = None
    output_filename = "translation_failed"  # Placeholder until an output file exists

    with st.spinner("Processing... Please wait."):
        # 1. Load the input into an in-memory byte buffer.
        input_data = None
        if use_default:
            try:
                input_data = io.BytesIO(DEFAULT_PDF_PATH.read_bytes())
                st.write(f"Processing default file: {DEFAULT_PDF_PATH.name} (PDF)")
            except Exception as e:
                st.error(f"Error reading default PDF: {e}")
                st.stop()
        elif uploaded_file:
            input_data = io.BytesIO(uploaded_file.getvalue())  # BytesIO for consistency with the default path
            st.write(f"Processing uploaded file: {uploaded_file.name} ({input_file_type.upper()})")
        else:
            st.error("No input file selected!")
            st.stop()

        # The sidebar already enforces these; re-check before doing any work.
        if not input_data or not source_lang_selected or not target_lang_selected:
            st.error("Missing required input (file, source language, or target language).")
            st.stop()
        if source_lang_selected == target_lang_selected:
            st.error("Source and Target languages cannot be the same.")
            st.stop()

        # Base name for the output file: the upload's name, or the default
        # PDF's name when no file was uploaded.
        source_name = uploaded_file.name if uploaded_file else DEFAULT_PDF_PATH.name
        output_stem = f"{Path(source_name).stem}_translated_{target_lang_selected}"

        # 2. Process according to the input type.
        if input_file_type == "pdf":
            st.markdown("---")
            st.write("**Step 1: Extracting Text from PDF...**")
            original_pages = extract_text_from_pdf(input_data)

            if original_pages is None:
                st.error("Text extraction failed. Cannot proceed.")
            else:
                st.markdown("---")
                st.write(f"**Step 2: Translating {len(original_pages)} pages from {source_lang_selected} to {target_lang_selected}...**")
                translated_pages = translate_pages_in_batches(original_pages, source_lang_selected, target_lang_selected)

                if translated_pages is None:
                    st.error("Translation failed. Cannot create PDF.")
                else:
                    st.markdown("---")
                    st.write("**Step 3: Creating Translated PDF...**")
                    output_buffer = create_pdf_from_text(translated_pages)
                    if output_buffer:
                        output_filename = f"{output_stem}.pdf"
                        st.success("✅ Translation and PDF creation successful!")

        elif input_file_type == "txt":
            st.markdown("---")
            st.write("**Step 1: Reading Text from TXT...**")
            original_text = extract_text_from_txt(input_data)

            if original_text is None:
                st.error("Reading TXT file failed. Cannot proceed.")
            else:
                st.markdown("---")
                st.write(f"**Step 2: Translating text from {source_lang_selected} to {target_lang_selected}...**")
                # The whole TXT content is translated as a single block.
                status_text_txt = st.empty()
                status_text_txt.text("Sending text to translation API...")
                translated_text = translate_text_gemini(original_text, source_lang_selected, target_lang_selected, page_num_for_log="TXT content")

                if translated_text is None:
                    st.error("Translation failed. Cannot create TXT file.")
                else:
                    # Only report receipt when the call actually returned text.
                    status_text_txt.text("Translation received.")
                    st.markdown("---")
                    st.write("**Step 3: Creating Translated TXT file...**")
                    output_buffer = create_txt_from_text(translated_text)
                    if output_buffer:
                        output_filename = f"{output_stem}.txt"
                        st.success("✅ Translation and TXT creation successful!")

        else:
            st.error(f"Unsupported file type: {input_file_type}")

    # --- Offer Download ---
    if output_buffer is not None:
        st.markdown("---")
        st.subheader("📥 Download Result")
        file_mime = "application/pdf" if output_filename.endswith(".pdf") else "text/plain"
        st.download_button(
            label=f"Download {output_filename}",
            data=output_buffer,
            file_name=output_filename,
            mime=file_mime,
        )
526
+
527
# --- Initial Instructions ---
# Shown only while no translation has been requested in this run.
usage_instructions = """
## How to Use:

1.  **Choose Input:**
    * Check the box to use the **default Russian pharma PDF**.
    * Or, **upload** your own PDF or TXT file using the uploader.
2.  **Select Languages:**
    * If uploading, select the **source language** of your file.
    * Select the **target language** you want to translate to.
3.  **Translate:** Click the "Translate Document" button in the sidebar.
4.  **Download:** Once processed, a download button for the translated file will appear.

**Note:**
* PDF translation attempts to preserve page structure but loses original formatting (images, fonts, layout).
"""
if not translate_button:
    st.markdown(usage_instructions)
default_pharma.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d9938187bec7b1f5757f089589057661a8a63ae36d731e1d4c28ee20f7e8076
3
+ size 151108
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit
2
+ google-generativeai
3
+ pypdf
4
+ reportlab
zega_logo.PNG ADDED

Git LFS Details

  • SHA256: ab929904c4eadf8cc1aadc9a797a469f20d31a5636770f1db2789f2096033558
  • Pointer size: 131 Bytes
  • Size of remote file: 116 kB