Spaces:

syed7
/

Translator

Build error

App Files Files Community

syed7 committed on Dec 4, 2024

Commit

e7fdc76

verified ·

1 Parent(s): aa5d775

Update app.py

Browse files

Files changed (1) hide show

app.py +356 -0

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import streamlit as st
 import PyPDF2
 import openai
 from io import BytesIO
@@ -7,6 +8,361 @@ from reportlab.pdfgen import canvas
 from reportlab.lib.pagesizes import letter, A4
 from reportlab.pdfbase import pdfmetrics
 from reportlab.pdfbase.ttfonts import TTFont
 from reportlab.lib.utils import simpleSplit
 from reportlab.lib.colors import black
 import arabic_reshaper

 import streamlit as st
+import streamlit as st
 import PyPDF2
 import openai
 from io import BytesIO
 from reportlab.lib.pagesizes import letter, A4
 from reportlab.pdfbase import pdfmetrics
 from reportlab.pdfbase.ttfonts import TTFont
+from weasyprint import HTML, CSS
+from weasyprint.text.fonts import FontConfiguration
+import arabic_reshaper
+from bidi.algorithm import get_display
+import os
+import tempfile
+# OpenAI API key taken from the OPENAI_API_KEY environment variable
+# (set through Hugging Face secrets); None when the variable is unset.
+api_key = os.getenv('OPENAI_API_KEY')
+def register_fonts():
+    """Register the TrueType fonts used for PDF output with ReportLab.
+
+    Each font is registered on its own, so a missing or unreadable font file
+    does not stop the remaining fonts from being registered. A Streamlit
+    warning is shown for every font that fails to load.
+    """
+    # (ReportLab font name, TTF file) pairs.
+    # NOTE(review): the 'NotoNastaliqUrdu' name is backed by a Nafees Nastaleeq
+    # file rather than a Noto font - confirm this is intended.
+    fonts = (
+        ('NotoNastaliqUrdu', 'NafeesNastaleeqXX.ttf'),  # Urdu
+        ('NotoNaskhArabic', 'NotoNaskhArabic-Regular.ttf'),  # Arabic
+        ('NotoSans', 'NotoSans-Regular.ttf'),  # all other languages
+    )
+    for font_name, font_file in fonts:
+        try:
+            pdfmetrics.registerFont(TTFont(font_name, font_file))
+        except Exception as e:
+            # Best effort: report the failure and keep going with the next font.
+            st.warning(f"Font files not found. Default fonts will be used. Error: {str(e)}")
+def extract_text_from_pdf(pdf_file):
+    """Return the text of every page of the uploaded PDF, concatenated in page order."""
+    reader = PyPDF2.PdfReader(pdf_file)
+    # Page texts are joined back to back, with no separator between pages.
+    return "".join(page.extract_text() for page in reader.pages)
+def _wrap_latin_runs(line):
+    """Return *line* as HTML text with every non-Urdu run wrapped in a 'latin' span."""
+    import html
+    from itertools import groupby
+    parts = []
+    # Characters in U+0600-U+06FF (this range already contains '۔' and '،')
+    # are treated as Urdu; everything else is a "latin" run.
+    for is_urdu, run in groupby(line, key=lambda ch: '\u0600' <= ch <= '\u06FF'):
+        # Escape so that '<', '>' or '&' in the translated text cannot break the markup.
+        chunk = html.escape(''.join(run), quote=False)
+        parts.append(chunk if is_urdu else f'<span class="latin">{chunk}</span>')
+    return ''.join(parts)
+def create_pdf(text, target_language):
+    """Render *text* as an A4 PDF and return it as a BytesIO rewound to offset 0.
+
+    For "Urdu" the text is turned into right-to-left HTML (one <p> per input
+    line, non-Urdu runs wrapped in a 'latin' span) and converted with WeasyPrint.
+    For every other language the lines are drawn on a ReportLab canvas; "Arabic"
+    text is reshaped, bidi-reordered and right-aligned first.
+
+    If drawing on the ReportLab canvas raises, a Streamlit warning is shown and
+    the PDF is saved with whatever had been drawn up to that point.
+    """
+    if target_language == "Urdu":
+        font_config = FontConfiguration()
+        processed_text = '\n'.join(
+            f'<p class="urdu-text">{_wrap_latin_runs(line)}</p>'
+            for line in text.split('\n')
+        )
+        # NOTE(review): the @font-face URL below is relative; because the HTML is
+        # loaded from a temporary file it presumably resolves against the temp
+        # directory - verify that the font is actually found at render time.
+        html_content = f"""
+        <!DOCTYPE html>
+        <html dir="rtl" lang="ur">
+        <head>
+            <meta charset="UTF-8">
+            <style>
+                @font-face {{
+                    font-family: 'NotoNastaliqUrdu';
+                    src: url('fonts/NotoNastaliqUrdu-Regular.ttf') format('truetype');
+                    font-weight: normal;
+                    font-style: normal;
+                }}
+                @page {{
+                    size: A4;
+                    margin: 3cm 2.5cm;
+                }}
+                body {{
+                    font-family: 'NotoNastaliqUrdu', serif;
+                    font-size: 16pt;
+                    line-height: 3;
+                    margin: 0;
+                    padding: 0;
+                    direction: rtl;
+                    text-align: right;
+                    text-rendering: optimizeLegibility;
+                    -webkit-font-smoothing: antialiased;
+                }}
+                .content {{
+                    width: 100%;
+                    max-width: 18cm;
+                    margin: 0 auto;
+                }}
+                .urdu-text {{
+                    margin: 0 0 2em 0;
+                    padding: 0;
+                    text-align: right;
+                    white-space: pre-wrap;
+                    word-wrap: break-word;
+                    font-feature-settings: "kern", "liga", "calt";
+                    letter-spacing: 0.02em;
+                }}
+                .latin {{
+                    font-family: Arial, sans-serif;
+                    direction: ltr;
+                    unicode-bidi: embed;
+                    font-size: 14pt;
+                }}
+                /* Improve spacing around punctuation */
+                .urdu-text::after {{
+                    content: "";
+                    display: block;
+                    height: 1.5em;
+                }}
+            </style>
+        </head>
+        <body>
+            <div class="content">
+                {processed_text}
+            </div>
+        </body>
+        </html>
+        """
+        # Write the markup to a temporary HTML file for WeasyPrint to load
+        with tempfile.NamedTemporaryFile(suffix='.html', mode='w', encoding='utf-8', delete=False) as f:
+            f.write(html_content)
+            temp_html = f.name
+        buffer = BytesIO()
+        try:
+            HTML(temp_html).write_pdf(
+                buffer,
+                font_config=font_config,
+                stylesheets=[CSS(string='''
+                @page {
+                    size: A4;
+                    margin: 3cm 2.5cm;
+                    @top-right {
+                        content: "";
+                        margin: 1cm 0;
+                    }
+                    @bottom-center {
+                        content: counter(page);
+                        font-family: Arial, sans-serif;
+                    }
+                }
+            ''')]
+            )
+        finally:
+            # Remove the temporary file even when the conversion fails
+            os.unlink(temp_html)
+        buffer.seek(0)
+        return buffer
+    # Every other language is drawn line by line with ReportLab
+    buffer = BytesIO()
+    c = canvas.Canvas(buffer, pagesize=A4)
+    width, height = A4
+    margin = 50
+    y = height - margin
+    is_arabic = target_language == "Arabic"
+    font_name, font_size = ('NotoNaskhArabic', 14) if is_arabic else ('NotoSans', 12)
+    error_label = "Arabic" if is_arabic else "Text"
+    try:
+        c.setFont(font_name, font_size)
+        if is_arabic:
+            # Join the letters into their contextual forms, then reorder for RTL display
+            text = get_display(arabic_reshaper.reshape(text))
+        line_height = font_size * 1.5
+        for line in text.split('\n'):
+            if y < margin:
+                # Out of vertical space: start a new page and restore the font
+                c.showPage()
+                y = height - margin
+                c.setFont(font_name, font_size)
+            if is_arabic:
+                # Right-align against the right margin
+                x = width - margin - c.stringWidth(line, font_name, font_size)
+            else:
+                x = margin
+            c.drawString(x, y, line)
+            y -= line_height
+    except Exception as e:
+        st.warning(f"{error_label} rendering error: {str(e)}")
+        c.setFont('Helvetica', 12)
+    c.save()
+    buffer.seek(0)
+    return buffer
+def translate_text(text, target_language, api_key):
+    """Translate *text* into *target_language* through the OpenAI chat completions API.
+
+    Returns the content of the first choice in the response. If any Exception
+    is raised along the way, a string starting with "Translation error:" is
+    returned instead of raising.
+    """
+    try:
+        translator = openai.OpenAI(api_key=api_key)
+        # System message carrying the translation guidelines
+        system_prompt = f"""You are a professional translator specializing in {target_language}.
+        Translate the following text to {target_language}, ensuring:
+        1. Technical terms are accurately translated
+        2. Maintain formal language and proper grammar
+        3. Preserve formatting and structure
+        4. Keep proper nouns and technical terms like 'AI', 'LLMs', 'Python' in English where appropriate
+        5. Use culturally appropriate expressions
+        6. For Urdu/Arabic, ensure proper character connections and diacritics
+        7. Maintain professional and accurate technical translations
+        8. Preserve line breaks and paragraph structure
+        """
+        chat_messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": text},
+        ]
+        completion = translator.chat.completions.create(
+            model="gpt-3.5-turbo",
+            messages=chat_messages,
+            temperature=0.3,
+        )
+        first_choice = completion.choices[0]
+        return first_choice.message.content
+    except Exception as err:
+        return f"Translation error: {err}"
+# Set page config
+st.set_page_config(page_title="PDF Translator", layout="wide")
+# Try to register fonts at startup
+register_fonts()
+# Main app interface
+st.title("PDF Document Translator")
+# Add custom CSS for better text display
+st.markdown("""
+<style>
+    .stTextArea textarea {
+        font-size: 16px !important;
+    }
+</style>
+""", unsafe_allow_html=True)
+# Language selection (display name -> name passed to the translator)
+languages = {
+    "English": "English",
+    "Urdu": "Urdu",
+    "Arabic": "Arabic",
+    "Roman English": "Roman English",
+    "Roman Urdu": "Roman Urdu",
+    "Hindi": "Hindi",
+    "Spanish": "Spanish",
+    "French": "French"
+}
+# File uploader
+uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
+# API Key input field; a key typed here overrides the one read from the environment
+api_key_input = st.text_input("Enter OpenAI API Key:", type="password", key="api_key_input")
+if api_key_input:
+    api_key = api_key_input
+# Language selector
+target_language = st.selectbox(
+    "Select target language",
+    options=list(languages.keys())
+)
+# Create two columns for original and translated text
+col1, col2 = st.columns(2)
+if uploaded_file is not None and api_key:
+    # Extract text from PDF
+    with st.spinner("Extracting text from PDF..."):
+        text = extract_text_from_pdf(uploaded_file)
+    # Show original text
+    with col1:
+        st.subheader("Original Text")
+        st.text_area("", value=text, height=400, key="original_text")
+    # Initialize session state for translated text
+    if 'translated_text' not in st.session_state:
+        st.session_state.translated_text = None
+    # Translate button
+    if st.button("Translate"):
+        with st.spinner("Translating..."):
+            translated_text = translate_text(text, languages[target_language], api_key)
+            st.session_state.translated_text = translated_text
+        # Show translated text
+        with col2:
+            st.subheader(f"Translated Text ({target_language})")
+            # The widget key must differ from the 'translated_text' session-state
+            # entry set above; sharing the key ties that entry to this widget.
+            st.text_area("", value=translated_text, height=400, key="translated_text_area")
+    # Show download button if translation exists
+    if st.session_state.translated_text:
+        # Create PDF button
+        if st.download_button(
+            label="Download Translated PDF",
+            data=create_pdf(st.session_state.translated_text, target_language),
+            file_name=f"translated_{target_language}.pdf",
+            mime="application/pdf"
+        ):
+            st.success("PDF downloaded successfully!")
+elif not api_key:
+    st.warning("Please enter your OpenAI API key to proceed.")
+# Add instructions and notes
+st.markdown("""
+### Instructions:
+1. Enter your OpenAI API key
+2. Upload your PDF file
+3. Select your target language
+4. Click 'Translate' to get your translation
+5. Review the translation
+6. Click 'Download Translated PDF' to save as PDF
+""")
+# NOTE(review): this import and the ones after it appear to repeat imports
+# already made at the top of the file.
+import PyPDF2
+import openai
+from io import BytesIO
+import io
+from reportlab.pdfgen import canvas
+from reportlab.lib.pagesizes import letter, A4
+from reportlab.pdfbase import pdfmetrics
+from reportlab.pdfbase.ttfonts import TTFont
 from reportlab.lib.utils import simpleSplit
 from reportlab.lib.colors import black
 import arabic_reshaper