"""Streamlit app: translate the text of an uploaded PDF with OpenAI and export it as a PDF.

Urdu output is rendered through WeasyPrint (HTML/CSS, right-to-left layout);
every other language is drawn line by line with ReportLab.
"""

import html
import io
import os
import tempfile
from io import BytesIO
from itertools import groupby

import arabic_reshaper
import openai
import PyPDF2
import streamlit as st
from bidi.algorithm import get_display
from reportlab.lib.pagesizes import letter, A4
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfgen import canvas
from weasyprint import HTML, CSS
from weasyprint.text.fonts import FontConfiguration

# Get API key from Hugging Face secrets
api_key = os.environ.get('OPENAI_API_KEY')

# (ReportLab font name, TTF file) pairs registered at startup.
FONT_FILES = (
    ('NotoNastaliqUrdu', 'NafeesNastaleeqXX.ttf'),
    ('NotoNaskhArabic', 'NotoNaskhArabic-Regular.ttf'),
    ('NotoSans', 'NotoSans-Regular.ttf'),
)

# Font used by the ReportLab path when the preferred TTF font is unavailable.
FALLBACK_FONT = 'Helvetica'
FALLBACK_FONT_SIZE = 12

# Page margin, in points, used by the ReportLab path.
PAGE_MARGIN = 50

# Prefix of the string translate_text() returns when the API call fails.
TRANSLATION_ERROR_PREFIX = "Translation error: "

# Requirements appended to the system prompt sent with every translation.
TRANSLATION_RULES = (
    "1. Technical terms are accurately translated",
    "2. Maintain formal language and proper grammar",
    "3. Preserve formatting and structure",
    "4. Keep proper nouns and technical terms like 'AI', 'LLMs', 'Python' "
    "in English where appropriate",
    "5. Use culturally appropriate expressions",
    "6. For Urdu/Arabic, ensure proper character connections and diacritics",
    "7. Maintain professional and accurate technical translations",
    "8. Preserve line breaks and paragraph structure",
)

# Page-level stylesheet for the WeasyPrint (Urdu) path.
URDU_PAGE_CSS = '''
@page {
    size: A4;
    margin: 3cm 2.5cm;
    @top-right {
        content: "";
        margin: 1cm 0;
    }
    @bottom-center {
        content: counter(page);
        font-family: Arial, sans-serif;
    }
}
'''

# HTML wrapper for the Urdu document: right-to-left body, with left-to-right
# spans for embedded Latin text and numbers.
URDU_HTML_HEAD = '''<!DOCTYPE html>
<html lang="ur" dir="rtl">
<head>
<meta charset="utf-8">
<style>
body {
    font-family: "Noto Nastaliq Urdu", "Noto Naskh Arabic", serif;
    font-size: 14pt;
    line-height: 2.2;
    direction: rtl;
    text-align: right;
}
p {
    margin: 0 0 0.6em 0;
}
.ltr {
    direction: ltr;
    unicode-bidi: embed;
    font-family: "Noto Sans", Arial, sans-serif;
}
</style>
</head>
<body>
'''
URDU_HTML_TAIL = '''
</body>
</html>
'''

# Kept flush-left: Markdown would render indented lines as a code block.
TEXT_AREA_CSS = """
<style>
textarea {
    unicode-bidi: plaintext;
    text-align: start;
    line-height: 1.6;
}
</style>
"""

INSTRUCTIONS_MD = """
### Instructions:
1. Enter your OpenAI API key
2. Upload your PDF file
3. Select your target language
4. Click 'Translate' to get your translation
5. Review the translation
6. Click 'Download Translated PDF' to save as PDF
"""


def register_fonts():
    """Register the TTF fonts listed in FONT_FILES with ReportLab.

    Each font is registered independently, so one missing file does not
    prevent the remaining fonts from loading. A Streamlit warning is shown
    for every font that cannot be loaded.
    """
    for font_name, file_name in FONT_FILES:
        try:
            pdfmetrics.registerFont(TTFont(font_name, file_name))
        except Exception as e:
            st.warning(
                f"Font file '{file_name}' could not be loaded. "
                f"Default fonts will be used. Error: {str(e)}"
            )


def extract_text_from_pdf(pdf_file):
    """Extract text from uploaded PDF file.

    Args:
        pdf_file: A file-like object accepted by ``PyPDF2.PdfReader``.

    Returns:
        The text of all pages, joined with newlines. Pages for which no text
        is extracted contribute an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # Joining with a newline keeps the last line of one page from fusing with
    # the first line of the next.
    return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)


def _is_urdu_char(char):
    """Return True if *char* lies in the Unicode range U+0600-U+06FF."""
    return '\u0600' <= char <= '\u06FF'


def _urdu_line_to_html(line):
    """Convert one line of mixed text to HTML-escaped inline markup.

    Runs of characters outside U+0600-U+06FF that contain letters or digits
    are wrapped in a left-to-right span; whitespace/punctuation-only runs are
    emitted unwrapped so they stay part of the surrounding right-to-left text.
    """
    pieces = []
    for is_urdu, group in groupby(line, key=_is_urdu_char):
        run = "".join(group)
        escaped = html.escape(run)
        if is_urdu or not any(ch.isalnum() for ch in run):
            pieces.append(escaped)
        else:
            pieces.append(f'<span class="ltr" dir="ltr">{escaped}</span>')
    return "".join(pieces)


def _create_urdu_pdf(text):
    """Render *text* as a right-to-left A4 PDF with WeasyPrint.

    Every non-blank input line becomes its own paragraph.
    """
    font_config = FontConfiguration()
    paragraphs = "\n".join(
        f"<p>{_urdu_line_to_html(line)}</p>"
        for line in text.split('\n')
        if line.strip()
    )
    document = URDU_HTML_HEAD + paragraphs + URDU_HTML_TAIL

    buffer = BytesIO()
    # The document is passed as a string, so no temporary file is needed.
    HTML(string=document).write_pdf(
        buffer,
        font_config=font_config,
        stylesheets=[CSS(string=URDU_PAGE_CSS, font_config=font_config)],
    )
    buffer.seek(0)
    return buffer


def _pick_font(preferred, size):
    """Return ``(font_name, font_size)`` to draw with.

    Uses *preferred* at *size* when that font is registered with ReportLab;
    otherwise warns and returns the fallback font and size.
    """
    if preferred in pdfmetrics.getRegisteredFontNames():
        return preferred, size
    st.warning(
        f"Font '{preferred}' is not registered. "
        f"Falling back to {FALLBACK_FONT}; some characters may not render."
    )
    return FALLBACK_FONT, FALLBACK_FONT_SIZE


def _draw_lines(pdf, lines, font_name, font_size, right_align):
    """Draw *lines* top to bottom on A4 pages, starting a new page as needed.

    When *right_align* is true each line ends at the right margin; otherwise
    each line starts at the left margin. Lines are not wrapped.
    """
    page_width, page_height = A4
    line_height = font_size * 1.5
    y = page_height - PAGE_MARGIN
    pdf.setFont(font_name, font_size)
    for line in lines:
        if y < PAGE_MARGIN:
            pdf.showPage()
            # The font is re-applied after every page break.
            pdf.setFont(font_name, font_size)
            y = page_height - PAGE_MARGIN
        if right_align:
            line_width = pdf.stringWidth(line, font_name, font_size)
            x = page_width - PAGE_MARGIN - line_width
        else:
            x = PAGE_MARGIN
        pdf.drawString(x, y, line)
        y -= line_height


def create_pdf(text, target_language):
    """Build a PDF containing *text* and return it as an in-memory buffer.

    Args:
        text: The text to place in the PDF; line breaks are preserved.
        target_language: Language name. ``"Urdu"`` is rendered with
            WeasyPrint, ``"Arabic"`` is reshaped, bidi-reordered and
            right-aligned with ReportLab, and anything else is drawn
            left-aligned with ReportLab.

    Returns:
        A ``BytesIO`` positioned at the start of the PDF data.
    """
    if target_language == "Urdu":
        return _create_urdu_pdf(text)

    # Use ReportLab for other languages
    buffer = BytesIO()
    pdf = canvas.Canvas(buffer, pagesize=A4)

    is_arabic = target_language == "Arabic"
    if is_arabic:
        font_name, font_size = _pick_font('NotoNaskhArabic', 14)
        try:
            text = get_display(arabic_reshaper.reshape(text))
        except Exception as e:
            st.warning(f"Arabic rendering error: {str(e)}")
    else:
        font_name, font_size = _pick_font('NotoSans', 12)

    try:
        _draw_lines(pdf, text.split('\n'), font_name, font_size, is_arabic)
    except Exception as e:
        # Best effort: whatever was drawn before the failure is still saved.
        st.warning(f"Text rendering error: {str(e)}")

    pdf.save()
    buffer.seek(0)
    return buffer


def translate_text(text, target_language, api_key):
    """Translate text using OpenAI API with improved prompting.

    Args:
        text: The text to translate.
        target_language: Name of the language to translate into.
        api_key: OpenAI API key used to create the client.

    Returns:
        The translated text. Blank input returns ``""`` without calling the
        API. On any failure a string starting with
        ``TRANSLATION_ERROR_PREFIX`` is returned instead of raising.
    """
    if not text or not text.strip():
        return ""
    try:
        client = openai.OpenAI(api_key=api_key)

        # Enhanced prompt for better translation
        rules = "\n".join(TRANSLATION_RULES)
        system_prompt = (
            f"You are a professional translator specializing in {target_language}.\n"
            f"Translate the following text to {target_language}, ensuring:\n"
            f"{rules}\n"
        )

        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ],
            temperature=0.3,
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"{TRANSLATION_ERROR_PREFIX}{str(e)}"


# Set page config
st.set_page_config(page_title="PDF Translator", layout="wide")

# Try to register fonts at startup
register_fonts()

# Main app interface
st.title("PDF Document Translator")

# Add custom CSS for better text display: let each text-area paragraph pick
# its own direction so right-to-left translations read correctly.
st.markdown(TEXT_AREA_CSS, unsafe_allow_html=True)

# Language selection
languages = {
    "English": "English",
    "Urdu": "Urdu",
    "Arabic": "Arabic",
    "Roman English": "Roman English",
    "Roman Urdu": "Roman Urdu",
    "Hindi": "Hindi",
    "Spanish": "Spanish",
    "French": "French",
}

# File uploader
uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")

# API Key input field
api_key_input = st.text_input("Enter OpenAI API Key:", type="password", key="api_key_input")
if api_key_input:
    api_key = api_key_input

# Language selector
target_language = st.selectbox(
    "Select target language",
    options=list(languages.keys()),
)

# Create two columns for original and translated text
col1, col2 = st.columns(2)

if uploaded_file is not None and api_key:
    # Extract text from PDF
    with st.spinner("Extracting text from PDF..."):
        text = extract_text_from_pdf(uploaded_file)

    if not text.strip():
        st.warning("No extractable text was found in this PDF.")

    # Show original text
    with col1:
        st.subheader("Original Text")
        st.text_area("", value=text, height=400, key="original_text")

    # Initialize session state for the translation and the language it was
    # produced for. These keys are not shared with any widget.
    if 'translated_text' not in st.session_state:
        st.session_state.translated_text = None
    if 'translated_language' not in st.session_state:
        st.session_state.translated_language = None

    # Translate button
    if st.button("Translate"):
        with st.spinner("Translating..."):
            result = translate_text(text, languages[target_language], api_key)
        if result.startswith(TRANSLATION_ERROR_PREFIX):
            # Do not store a failure message as if it were a translation.
            st.error(result)
        else:
            st.session_state.translated_text = result
            st.session_state.translated_language = target_language

    translated_text = st.session_state.translated_text
    translated_language = st.session_state.translated_language

    # Show the stored translation on every run, not only right after the
    # button click, so it survives the rerun caused by the download button.
    if translated_text:
        with col2:
            st.subheader(f"Translated Text ({translated_language})")
            st.text_area("", value=translated_text, height=400)

        # Create PDF button; the PDF uses the language the translation was
        # made for, even if the selector has changed since.
        if st.download_button(
            label="Download Translated PDF",
            data=create_pdf(translated_text, translated_language),
            file_name=f"translated_{translated_language}.pdf",
            mime="application/pdf",
        ):
            st.success("PDF downloaded successfully!")
elif not api_key:
    st.warning("Please enter your OpenAI API key to proceed.")

# Add instructions and notes
st.markdown(INSTRUCTIONS_MD)