Spaces:
Sleeping
Sleeping
Vlad Bastina commited on
Commit ·
ad6a882
0
Parent(s):
first commit
Browse files- .gitattributes +3 -0
- .gitignore +5 -0
- .streamlit/config.toml +2 -0
- DejaVuSans.ttf +3 -0
- app.py +545 -0
- default_pharma.pdf +3 -0
- requirements.txt +4 -0
- zega_logo.PNG +3 -0
.gitattributes
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.PNG filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.ttf filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.pdf
|
| 2 |
+
!default_pharma.pdf
|
| 3 |
+
*.py
|
| 4 |
+
!app.py
|
| 5 |
+
.streamlit/secrets.toml
|
.streamlit/config.toml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[theme]
|
| 2 |
+
base="light"
|
DejaVuSans.ttf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:08ca98e69d9d8fa1065584b4f9ab7d49b6205abea6572b90e171b254845bb990
|
| 3 |
+
size 741536
|
app.py
ADDED
|
@@ -0,0 +1,545 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
import time
|
| 5 |
+
import io # Needed for handling file streams in memory
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
# --- Import necessary libraries ---
|
| 9 |
+
try:
|
| 10 |
+
import google.generativeai as genai
|
| 11 |
+
from google.api_core import exceptions as google_exceptions
|
| 12 |
+
except ImportError:
|
| 13 |
+
print(sys.path)
|
| 14 |
+
print(sys.executable)
|
| 15 |
+
st.error("Error: google-generativeai library not found. Please install it: `pip install google-generativeai`")
|
| 16 |
+
st.stop()
|
| 17 |
+
|
| 18 |
+
try:
|
| 19 |
+
import pypdf
|
| 20 |
+
except ImportError:
|
| 21 |
+
st.error("Error: pypdf library not found. Please install it: `pip install pypdf`")
|
| 22 |
+
st.stop()
|
| 23 |
+
|
| 24 |
+
try:
|
| 25 |
+
from reportlab.lib.pagesizes import letter
|
| 26 |
+
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
|
| 27 |
+
from reportlab.lib.styles import getSampleStyleSheet
|
| 28 |
+
from reportlab.lib.enums import TA_JUSTIFY
|
| 29 |
+
from reportlab.pdfbase import pdfmetrics
|
| 30 |
+
from reportlab.pdfbase.ttfonts import TTFont
|
| 31 |
+
# Attempt to register a font that supports a wider range of characters
|
| 32 |
+
try:
|
| 33 |
+
# Assume DejaVuSans.ttf is in the same directory as the script
|
| 34 |
+
font_path = Path(__file__).parent / 'DejaVuSans.ttf'
|
| 35 |
+
if font_path.exists():
|
| 36 |
+
pdfmetrics.registerFont(TTFont('DejaVuSans', str(font_path)))
|
| 37 |
+
DEFAULT_FONT = 'DejaVuSans'
|
| 38 |
+
print("Using DejaVuSans font.") # Log to console
|
| 39 |
+
else:
|
| 40 |
+
DEFAULT_FONT = 'Helvetica'
|
| 41 |
+
print("Warning: DejaVuSans.ttf not found. Using Helvetica.")
|
| 42 |
+
# Display warning in Streamlit app as well
|
| 43 |
+
st.warning("β οΈ Warning: DejaVuSans font not found. Non-Latin characters might not render correctly in the output PDF. Consider placing `DejaVuSans.ttf` in the app directory.")
|
| 44 |
+
except Exception as font_e:
|
| 45 |
+
st.warning(f"β οΈ Warning: Error registering font. Using Helvetica. Details: {font_e}")
|
| 46 |
+
DEFAULT_FONT = 'Helvetica'
|
| 47 |
+
|
| 48 |
+
except ImportError:
|
| 49 |
+
st.error("Error: reportlab library not found. Please install it: `pip install reportlab`")
|
| 50 |
+
st.stop()
|
| 51 |
+
|
| 52 |
+
# --- Configuration (Moved API Key handling) ---
|
| 53 |
+
# GEMINI_API_KEY = os.environ.get("GOOGLE_API_KEY") # Handled via Streamlit input/secrets later
|
| 54 |
+
MODEL_NAME = "gemini-1.5-pro" # Or "gemini-1.5-flash-latest" etc.
|
| 55 |
+
SAFETY_SETTINGS = [
|
| 56 |
+
{"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
|
| 57 |
+
{"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
|
| 58 |
+
{"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
|
| 59 |
+
{"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
|
| 60 |
+
]
|
| 61 |
+
BATCH_SIZE = 50 # Number of pages to process per batch if PDF is large
|
| 62 |
+
API_CALL_DELAY = 0.5 # Optional delay in seconds between API calls
|
| 63 |
+
DEFAULT_PDF_PATH = Path(__file__).parent / "default_pharma.pdf" # Path to your default PDF
|
| 64 |
+
LANGUAGES = ["russian", "romanian", "english", "german", "french", "spanish"]
|
| 65 |
+
|
| 66 |
+
# --- Core Functions (Adapted from your script) ---
|
| 67 |
+
|
| 68 |
+
# Global variable to hold the configured model
|
| 69 |
+
gemini_model = None
|
| 70 |
+
|
| 71 |
+
def configure_gemini(api_key):
    """Configure the Gemini client and build the shared model instance.

    Args:
        api_key: API key to use. When falsy (the app passes ``None``), the key
            is read from ``st.secrets["GOOGLE_API_KEY"]`` instead.

    Returns:
        True when the module-level ``gemini_model`` was created; False when
        configuration failed, in which case the error is shown in the
        Streamlit UI and ``gemini_model`` is reset to None.
    """
    global gemini_model
    try:
        # Honour an explicitly supplied key. Previously the argument was
        # ignored and the secret was always used, so callers could not
        # override it.
        resolved_key = api_key or st.secrets["GOOGLE_API_KEY"]
        genai.configure(api_key=resolved_key)
        gemini_model = genai.GenerativeModel(MODEL_NAME, safety_settings=SAFETY_SETTINGS)
        return True
    except Exception as e:
        # Covers a missing secret as well as client construction errors.
        st.error(f"Error configuring Gemini: {e}")
        gemini_model = None  # Ensure model is None if config fails
        return False
|
| 82 |
+
|
| 83 |
+
def extract_text_from_pdf(pdf_file_obj):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file_obj: Object accepted by ``pypdf.PdfReader`` (the app passes an
            in-memory ``io.BytesIO``).

    Returns:
        A list with one stripped string per page, in page order. Pages with no
        extractable text, or whose extraction raised, contribute ``""`` so the
        list length always equals the page count. Returns None when the PDF
        itself cannot be read; the error is reported in the Streamlit UI.
    """
    try:
        reader = pypdf.PdfReader(pdf_file_obj)
        num_pages = len(reader.pages)
        st.info(f"Found {num_pages} page(s) in the PDF.")
        progress_bar = st.progress(0)
        status_text = st.empty()

        page_texts = []
        for page_num, page in enumerate(reader.pages, start=1):
            try:
                # extract_text() may yield an empty/None result; normalise to "".
                text = (page.extract_text() or "").strip()
            except Exception as e:
                st.warning(f"Warning: Could not extract text from page {page_num}: {e}")
                text = ""  # Keep page count consistent on error
            page_texts.append(text)

            # Update the UI outside the try so a failing page still advances
            # the progress bar (previously it was skipped on error).
            status_text.text(f"Extracting text from page {page_num}/{num_pages}")
            progress_bar.progress(page_num / num_pages)

        status_text.text("Text extraction complete.")
        return page_texts
    except pypdf.errors.PdfReadError as e:
        st.error(f"Error reading PDF file: {e}. The file might be corrupted, password-protected, or not a valid PDF.")
        return None
    except Exception as e:
        st.error(f"An unexpected error occurred during PDF processing: {e}")
        return None
|
| 115 |
+
|
| 116 |
+
def extract_text_from_txt(txt_file_obj):
    """Read and decode the content of a TXT file object.

    Args:
        txt_file_obj: Binary file-like object whose ``read()`` returns bytes.

    Returns:
        The decoded text, or None when reading fails (the error is shown in
        the Streamlit UI).
    """
    try:
        content_bytes = txt_file_obj.read()
        try:
            # 'utf-8-sig' decodes plain UTF-8 and additionally drops a leading
            # byte-order mark, which would otherwise be sent to the translator
            # as a stray character.
            text = content_bytes.decode('utf-8-sig')
        except UnicodeDecodeError:
            # latin-1 maps every possible byte to a character, so this
            # fallback cannot raise a decode error; it may misread the text.
            text = content_bytes.decode('latin-1')
            st.warning("Decoded TXT file using 'latin-1'. Some characters might be misinterpreted if the encoding is different.")
        st.info("Successfully read text file.")
        return text
    except Exception as e:
        st.error(f"An error occurred reading the TXT file: {e}")
        return None
|
| 137 |
+
|
| 138 |
+
def translate_text_gemini(text, source_lang, target_lang, page_num_for_log=""):
    """Translate one block of text with the configured Gemini model.

    Args:
        text: Text to translate.
        source_lang: Name of the source language, inserted into the prompt.
        target_lang: Name of the target language, inserted into the prompt.
        page_num_for_log: Optional page number/label used in UI messages and
            in failure placeholders.

    Returns:
        - None when the model has not been configured.
        - ``""`` when ``text`` is empty or the model returned no content with
          finish reason STOP.
        - The translated text on success.
        - A bracketed placeholder string (``"[Translation failed on ...]"`` or
          ``"[Translation blocked on ...]"``) when the request was blocked or
          the API call failed, so the page keeps its slot in the output.
    """
    if gemini_model is None:
        st.error("Gemini model not configured. Cannot translate.")
        return None  # Indicate failure

    if not text:
        return ""  # Nothing to translate

    # Keep the bare label separate from the "label: " prefix. The original
    # derived it with log_prefix.strip(':'), which is a no-op because the
    # prefix ends with a space, producing "Page 3: : reason" in placeholders.
    label = f"Page {page_num_for_log}" if page_num_for_log else "Text block"
    log_prefix = f"{label}: "

    prompt = f"""Translate the following text from {source_lang} to {target_lang}.
Preserve paragraph breaks where appropriate. Output *only* the translated text, without any introductory phrases like "Here is the translation:", or any explanations or markdown formatting. If the input text is empty or nonsensical for translation, output nothing.

Text to translate:
---
{text}
---

Translation:"""

    try:
        # Optional: Add delay between calls
        if API_CALL_DELAY > 0:
            time.sleep(API_CALL_DELAY)

        response = gemini_model.generate_content(prompt)

        translated_text = ""
        try:
            if response.parts:
                translated_text = "".join(part.text for part in response.parts).strip()
            elif hasattr(response, 'text'):  # Fallback for simpler response structures
                translated_text = response.text.strip()
        except ValueError:
            # NOTE(review): the SDK's quick accessors appear to raise
            # ValueError when the response carries no usable candidate (e.g. a
            # blocked prompt). Treat that as "no content" so the block-reason
            # handling below runs instead of the generic error path.
            translated_text = ""

        if translated_text:
            return translated_text

        # No content: distinguish blocked prompts from other empty responses.
        if response.prompt_feedback and response.prompt_feedback.block_reason:
            st.warning(f"{log_prefix}Translation blocked. Reason: {response.prompt_feedback.block_reason}")
            return f"[Translation blocked on {label}: {response.prompt_feedback.block_reason}]"

        finish_reason = response.candidates[0].finish_reason if response.candidates else 'UNKNOWN'
        # finish_reason is an enum member in the SDK, so comparing it directly
        # with the string 'STOP' is always False; compare by member name and
        # fall back to str() for plain-string values such as 'UNKNOWN'.
        finish_name = getattr(finish_reason, "name", str(finish_reason))
        if finish_name == 'STOP':
            # Don't warn if input was likely just whitespace/empty
            if text.strip():
                st.warning(f"{log_prefix}Received no translated content (finish reason STOP). Original text might have been empty or untranslatable.")
            return ""  # Return empty if no content and no blocking

        st.warning(f"{log_prefix}Received empty response from API. Finish Reason: {finish_reason}, Feedback: {response.prompt_feedback}")
        return f"[Translation failed on {label}: Empty API response]"

    except google_exceptions.ResourceExhausted as e:
        st.error(f"{log_prefix}Error: Gemini API quota exceeded: {e}. Consider increasing API_CALL_DELAY or checking your quota.")
        return f"[Translation failed on {label}: Quota Exceeded - {e}]"
    except google_exceptions.InvalidArgument as e:
        st.error(f"{log_prefix}Error: Invalid argument passed to Gemini API: {e}")
        return f"[Translation failed on {label}: Invalid Argument - {e}]"
    except Exception as e:
        st.error(f"{log_prefix}Error during Gemini API call: {e}")
        return f"[Translation failed on {label}: {e}]"
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def translate_pages_in_batches(original_pages_text, source_lang, target_lang):
    """Translate a list of page texts one page at a time.

    Each page is sent to ``translate_text_gemini`` individually. When there
    are more than ``BATCH_SIZE`` pages, the pages are grouped into batches
    only for progress reporting (the status line shows the batch number).

    Args:
        original_pages_text: List of page strings, in page order.
        source_lang: Source language name passed to the translator.
        target_lang: Target language name passed to the translator.

    Returns:
        A list of translated strings aligned with the input (blank pages stay
        ``""``), ``[]`` for empty input, or None when the model is not
        configured or a page translation returned None.
    """
    if gemini_model is None:
        st.error("Gemini model not configured. Cannot translate.")
        return None

    total_pages = len(original_pages_text)
    if total_pages == 0:
        st.warning("No text pages found to translate.")
        return []

    st.info(f"Starting translation of {total_pages} page(s)...")
    progress_bar = st.progress(0)
    status_text = st.empty()

    # The two original branches ran the same per-page loop and differed only
    # in the status message, so a single loop serves both cases.
    batched = total_pages > BATCH_SIZE
    num_batches = (total_pages + BATCH_SIZE - 1) // BATCH_SIZE
    if batched:
        st.info(f"Translating {total_pages} pages in {num_batches} batches of up to {BATCH_SIZE}...")

    translated_pages = []
    for index, text in enumerate(original_pages_text):
        page_num = index + 1
        if batched:
            batch_num = index // BATCH_SIZE + 1
            status_text.text(f"Translating page {page_num}/{total_pages} (Batch {batch_num}/{num_batches})...")
        else:
            status_text.text(f"Translating page {page_num}/{total_pages}...")

        if not text.strip():
            # Blank page: keep its slot without calling the API.
            translated_pages.append("")
        else:
            translated = translate_text_gemini(text, source_lang, target_lang, page_num_for_log=page_num)
            if translated is None:
                return None  # Propagate failure
            translated_pages.append(translated)

        progress_bar.progress(page_num / total_pages)

    status_text.text("Translation step complete.")
    return translated_pages
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def create_pdf_from_text(translated_pages):
    """Build a PDF in memory from the translated page texts.

    Every input page starts on a new PDF page, preceded by a small
    "--- Translated Page N ---" header. Empty pages get a placeholder line.

    Args:
        translated_pages: List of translated page strings, in page order.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 containing the PDF, or None
        when the document could not be built (the error is shown in the
        Streamlit UI).
    """
    # Local import keeps this block self-contained (standard library only).
    from xml.sax.saxutils import escape

    pdf_buffer = io.BytesIO()
    try:
        doc = SimpleDocTemplate(pdf_buffer, pagesize=letter)
        styles = getSampleStyleSheet()

        style = styles["Normal"]
        style.fontName = DEFAULT_FONT
        style.fontSize = 10
        style.alignment = TA_JUSTIFY

        style_bold = styles["Heading2"]  # Use a heading style for page markers
        style_bold.fontName = DEFAULT_FONT
        style_bold.fontSize = 8  # Make header smaller
        style_bold.alignment = TA_JUSTIFY

        total_pages = len(translated_pages)
        story = []
        st.info(f"Reconstructing PDF with {total_pages} page(s)...")
        progress_bar = st.progress(0)
        status_text = st.empty()

        for page_num, page_text in enumerate(translated_pages, start=1):
            status_text.text(f"Adding translated page {page_num}/{total_pages} to PDF...")

            # Add a header indicating the original page number
            story.append(Paragraph(f"--- Translated Page {page_num} ---", style_bold))
            story.append(Spacer(1, 6))  # Add smaller space after header

            if page_text:
                # Paragraph parses its text as XML-like markup, so literal
                # '<', '>' and '&' in the translation must be escaped first;
                # only then are newlines turned into <br/> tags.
                formatted_text = escape(page_text).replace('\n', '<br/>\n')
                try:
                    story.append(Paragraph(formatted_text, style))
                except Exception as e:
                    st.warning(f"Warning: Could not add text from page {page_num} to PDF (potential encoding/font issue): {e}")
                    try:
                        story.append(Paragraph(f"[Could not render text for page {page_num} due to error. See logs/warnings.]", style))
                    except Exception:
                        # Best effort: skip the placeholder if even that fails.
                        pass
            else:
                story.append(Paragraph(f"[No translatable text found or translation failed for page {page_num}]", style))

            # Add a page break after each page's content, except the last one
            if page_num < total_pages:
                story.append(PageBreak())

            progress_bar.progress(page_num / total_pages)

        doc.build(story)
        status_text.text("PDF reconstruction complete.")
        pdf_buffer.seek(0)  # Rewind the buffer to the beginning
        return pdf_buffer

    except Exception as e:
        st.error(f"Error creating output PDF: {e}")
        return None
|
| 326 |
+
|
| 327 |
+
def create_txt_from_text(translated_text):
    """Wrap the translated text in an in-memory UTF-8 byte stream.

    Args:
        translated_text: The translated text as a ``str``.

    Returns:
        An ``io.BytesIO`` (positioned at offset 0) holding the UTF-8 encoded
        text, suitable as download data, or None when encoding fails (the
        error is shown in the Streamlit UI).
    """
    try:
        # Encode directly; the intermediate StringIO round trip added nothing.
        txt_bytes_buffer = io.BytesIO(translated_text.encode('utf-8'))
        st.info("TXT file content prepared.")
        return txt_bytes_buffer
    except Exception as e:
        st.error(f"Error creating output TXT: {e}")
        return None
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
# --- Streamlit App UI ---
|
| 343 |
+
st.title("π Document Translator")
|
| 344 |
+
|
| 345 |
+
configure_gemini(None)
|
| 346 |
+
|
| 347 |
+
# Resolve the logo next to this script (as done for the font and the default
# PDF) so it loads regardless of the process working directory.
st.sidebar.image(str(Path(__file__).parent / 'zega_logo.PNG'), use_container_width=True)
|
| 348 |
+
|
| 349 |
+
st.sidebar.markdown("---") # Separator
|
| 350 |
+
|
| 351 |
+
# --- File Input Options ---
|
| 352 |
+
st.sidebar.subheader("π Input File")
|
| 353 |
+
use_default = st.sidebar.checkbox("Use default Russian pharma PDF", value=False)
|
| 354 |
+
|
| 355 |
+
uploaded_file = None
|
| 356 |
+
source_lang_selected = None
|
| 357 |
+
input_file_type = None # To track 'pdf' or 'txt'
|
| 358 |
+
|
| 359 |
+
if use_default:
|
| 360 |
+
if not DEFAULT_PDF_PATH.exists():
|
| 361 |
+
st.sidebar.error(f"Default PDF '{DEFAULT_PDF_PATH.name}' not found in the app directory!")
|
| 362 |
+
st.stop()
|
| 363 |
+
else:
|
| 364 |
+
st.sidebar.info(f"Using default file: `{DEFAULT_PDF_PATH.name}`")
|
| 365 |
+
source_lang_selected = "russian" # Default file is Russian
|
| 366 |
+
input_file_type = "pdf"
|
| 367 |
+
else:
|
| 368 |
+
uploaded_file = st.sidebar.file_uploader(
|
| 369 |
+
"Upload your PDF or TXT file",
|
| 370 |
+
type=["pdf", "txt"],
|
| 371 |
+
accept_multiple_files=False
|
| 372 |
+
)
|
| 373 |
+
if uploaded_file:
|
| 374 |
+
input_file_type = uploaded_file.type.split('/')[-1].lower() # pdf or plain (->txt)
|
| 375 |
+
if input_file_type == 'plain':
|
| 376 |
+
input_file_type = 'txt'
|
| 377 |
+
|
| 378 |
+
# Dropdown for source language ONLY if uploading
|
| 379 |
+
st.sidebar.markdown("π Select the **source** language of your uploaded file:")
|
| 380 |
+
source_lang_selected = st.sidebar.selectbox(
|
| 381 |
+
"Source Language",
|
| 382 |
+
options=[""] + LANGUAGES, # Add empty option for prompt
|
| 383 |
+
index=0, # Default to empty
|
| 384 |
+
key="source_lang_uploader"
|
| 385 |
+
)
|
| 386 |
+
if not source_lang_selected:
|
| 387 |
+
st.sidebar.warning("Please select the source language of your document.")
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
st.sidebar.markdown("---") # Separator
|
| 391 |
+
|
| 392 |
+
# --- Target Language Selection ---
|
| 393 |
+
st.sidebar.subheader("π― Target Language")
|
| 394 |
+
target_lang_selected = None
|
| 395 |
+
# Ensure a source is defined before showing target selection
|
| 396 |
+
if source_lang_selected:
|
| 397 |
+
target_lang_selected = st.sidebar.selectbox(
|
| 398 |
+
"Translate To",
|
| 399 |
+
options=[""] + [lang for lang in LANGUAGES if lang != source_lang_selected], # Exclude source lang
|
| 400 |
+
index=0, # Default to empty
|
| 401 |
+
key="target_lang",
|
| 402 |
+
help="Select the language you want to translate the document into."
|
| 403 |
+
)
|
| 404 |
+
if not target_lang_selected:
|
| 405 |
+
st.sidebar.warning("Please select the target language.")
|
| 406 |
+
else:
|
| 407 |
+
st.sidebar.info("Select or upload a file and its source language first.")
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
st.sidebar.markdown("---") # Separator
|
| 411 |
+
|
| 412 |
+
# --- Translate Button ---
|
| 413 |
+
translate_button = st.sidebar.button("Translate Document", disabled=(gemini_model is None or not target_lang_selected))
|
| 414 |
+
|
| 415 |
+
if not source_lang_selected:
|
| 416 |
+
st.sidebar.markdown("_(Select/Upload file and source language to enable translation)_")
|
| 417 |
+
elif not target_lang_selected:
|
| 418 |
+
st.sidebar.markdown("_(Select target language to enable translation)_")
|
| 419 |
+
|
| 420 |
+
# --- Main Area for Processing and Results ---
|
| 421 |
+
if translate_button:
|
| 422 |
+
st.subheader("π Translation Progress")
|
| 423 |
+
output_buffer = None
|
| 424 |
+
output_filename = "translation_failed" # Default filename
|
| 425 |
+
|
| 426 |
+
with st.spinner("Processing... Please wait."):
|
| 427 |
+
# 1. Get Input Data
|
| 428 |
+
input_data = None
|
| 429 |
+
if use_default:
|
| 430 |
+
try:
|
| 431 |
+
with open(DEFAULT_PDF_PATH, "rb") as f:
|
| 432 |
+
input_data = io.BytesIO(f.read())
|
| 433 |
+
st.write(f"Processing default file: {DEFAULT_PDF_PATH.name} (PDF)")
|
| 434 |
+
except Exception as e:
|
| 435 |
+
st.error(f"Error reading default PDF: {e}")
|
| 436 |
+
st.stop()
|
| 437 |
+
elif uploaded_file:
|
| 438 |
+
input_data = io.BytesIO(uploaded_file.getvalue()) # Use BytesIO for consistency
|
| 439 |
+
st.write(f"Processing uploaded file: {uploaded_file.name} ({input_file_type.upper()})")
|
| 440 |
+
else:
|
| 441 |
+
st.error("No input file selected!")
|
| 442 |
+
st.stop()
|
| 443 |
+
|
| 444 |
+
# Basic validation passed in UI, but double-check
|
| 445 |
+
if not input_data or not source_lang_selected or not target_lang_selected:
|
| 446 |
+
st.error("Missing required input (file, source language, or target language).")
|
| 447 |
+
st.stop()
|
| 448 |
+
if source_lang_selected == target_lang_selected:
|
| 449 |
+
st.error("Source and Target languages cannot be the same.")
|
| 450 |
+
st.stop()
|
| 451 |
+
|
| 452 |
+
# --- Start Processing based on file type ---
|
| 453 |
+
if input_file_type == "pdf":
|
| 454 |
+
st.markdown("---")
|
| 455 |
+
st.write("**Step 1: Extracting Text from PDF...**")
|
| 456 |
+
original_pages = extract_text_from_pdf(input_data)
|
| 457 |
+
|
| 458 |
+
if original_pages is not None:
|
| 459 |
+
st.markdown("---")
|
| 460 |
+
st.write(f"**Step 2: Translating {len(original_pages)} pages from {source_lang_selected} to {target_lang_selected}...**")
|
| 461 |
+
translated_pages = translate_pages_in_batches(original_pages, source_lang_selected, target_lang_selected)
|
| 462 |
+
|
| 463 |
+
if translated_pages is not None:
|
| 464 |
+
st.markdown("---")
|
| 465 |
+
st.write("**Step 3: Creating Translated PDF...**")
|
| 466 |
+
output_buffer = create_pdf_from_text(translated_pages)
|
| 467 |
+
if output_buffer:
|
| 468 |
+
output_filename = f"{Path(uploaded_file.name if uploaded_file else DEFAULT_PDF_PATH.name).stem}_translated_{target_lang_selected}.pdf"
|
| 469 |
+
st.success("β
Translation and PDF creation successful!")
|
| 470 |
+
else:
|
| 471 |
+
st.error("Translation failed. Cannot create PDF.")
|
| 472 |
+
else:
|
| 473 |
+
st.error("Text extraction failed. Cannot proceed.")
|
| 474 |
+
|
| 475 |
+
elif input_file_type == "txt":
|
| 476 |
+
st.markdown("---")
|
| 477 |
+
st.write("**Step 1: Reading Text from TXT...**")
|
| 478 |
+
original_text = extract_text_from_txt(input_data)
|
| 479 |
+
|
| 480 |
+
if original_text is not None:
|
| 481 |
+
st.markdown("---")
|
| 482 |
+
st.write(f"**Step 2: Translating text from {source_lang_selected} to {target_lang_selected}...**")
|
| 483 |
+
# Use the single text translation function - treat TXT as one block
|
| 484 |
+
status_text_txt = st.empty()
|
| 485 |
+
status_text_txt.text("Sending text to translation API...")
|
| 486 |
+
translated_text = translate_text_gemini(original_text, source_lang_selected, target_lang_selected, page_num_for_log="TXT content")
|
| 487 |
+
status_text_txt.text("Translation received.")
|
| 488 |
+
|
| 489 |
+
|
| 490 |
+
if translated_text is not None: # Check if translation call succeeded
|
| 491 |
+
st.markdown("---")
|
| 492 |
+
st.write("**Step 3: Creating Translated TXT file...**")
|
| 493 |
+
output_buffer = create_txt_from_text(translated_text)
|
| 494 |
+
if output_buffer:
|
| 495 |
+
output_filename = f"{Path(uploaded_file.name).stem}_translated_{target_lang_selected}.txt"
|
| 496 |
+
st.success("β
Translation and TXT creation successful!")
|
| 497 |
+
else:
|
| 498 |
+
st.error("Translation failed. Cannot create TXT file.")
|
| 499 |
+
else:
|
| 500 |
+
st.error("Reading TXT file failed. Cannot proceed.")
|
| 501 |
+
|
| 502 |
+
else:
|
| 503 |
+
st.error(f"Unsupported file type: {input_file_type}")
|
| 504 |
+
|
| 505 |
+
# --- Offer Download ---
|
| 506 |
+
if output_buffer:
|
| 507 |
+
st.markdown("---")
|
| 508 |
+
st.subheader("π₯ Download Result")
|
| 509 |
+
file_mime = "application/pdf" if output_filename.endswith(".pdf") else "text/plain"
|
| 510 |
+
st.download_button(
|
| 511 |
+
label=f"Download {output_filename}",
|
| 512 |
+
data=output_buffer,
|
| 513 |
+
file_name=output_filename,
|
| 514 |
+
mime=file_mime,
|
| 515 |
+
)
|
| 516 |
+
# Display a snippet of the translation (optional)
|
| 517 |
+
# try:
|
| 518 |
+
# if output_filename.endswith(".pdf"):
|
| 519 |
+
# st.info("PDF generated. Download to view content.")
|
| 520 |
+
# else: # TXT file
|
| 521 |
+
# output_buffer.seek(0)
|
| 522 |
+
# snippet = output_buffer.read(500).decode('utf-8', errors='ignore')
|
| 523 |
+
# st.text_area("Translation Snippet:", snippet + "...", height=200)
|
| 524 |
+
# except Exception as e:
|
| 525 |
+
# st.warning(f"Could not display snippet: {e}")
|
| 526 |
+
|
| 527 |
+
# --- Initial Instructions ---
|
| 528 |
+
if not translate_button:
|
| 529 |
+
st.markdown(
|
| 530 |
+
"""
|
| 531 |
+
## How to Use:
|
| 532 |
+
|
| 533 |
+
1. **Choose Input:**
|
| 534 |
+
* Check the box to use the **default Russian pharma PDF**.
|
| 535 |
+
* Or, **upload** your own PDF or TXT file using the uploader.
|
| 536 |
+
2. **Select Languages:**
|
| 537 |
+
* If uploading, select the **source language** of your file.
|
| 538 |
+
* Select the **target language** you want to translate to.
|
| 539 |
+
3. **Translate:** Click the "Translate Document" button in the sidebar.
|
| 540 |
+
4. **Download:** Once processed, a download button for the translated file will appear.
|
| 541 |
+
|
| 542 |
+
**Note:**
|
| 543 |
+
* PDF translation attempts to preserve page structure but loses original formatting (images, fonts, layout).
|
| 544 |
+
"""
|
| 545 |
+
)
|
default_pharma.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d9938187bec7b1f5757f089589057661a8a63ae36d731e1d4c28ee20f7e8076
|
| 3 |
+
size 151108
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
google-generativeai
|
| 3 |
+
pypdf
|
| 4 |
+
reportlab
|
zega_logo.PNG
ADDED
|
|
Git LFS Details
|