syed7 committed on
Commit
e7fdc76
·
verified ·
1 Parent(s): aa5d775

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +356 -0
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import streamlit as st
 
2
  import PyPDF2
3
  import openai
4
  from io import BytesIO
@@ -7,6 +8,361 @@ from reportlab.pdfgen import canvas
7
  from reportlab.lib.pagesizes import letter, A4
8
  from reportlab.pdfbase import pdfmetrics
9
  from reportlab.pdfbase.ttfonts import TTFont
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  from reportlab.lib.utils import simpleSplit
11
  from reportlab.lib.colors import black
12
  import arabic_reshaper
 
1
import html
from io import BytesIO

import streamlit as st
import PyPDF2
import openai
 
8
  from reportlab.lib.pagesizes import letter, A4
9
  from reportlab.pdfbase import pdfmetrics
10
  from reportlab.pdfbase.ttfonts import TTFont
11
+ from weasyprint import HTML, CSS
12
+ from weasyprint.text.fonts import FontConfiguration
13
+ import arabic_reshaper
14
+ from bidi.algorithm import get_display
15
+ import os
16
+ import tempfile
17
+
18
# Default OpenAI API key, read from the environment (populated from
# Hugging Face secrets); None when the variable is not set.
api_key = os.getenv('OPENAI_API_KEY')
20
+
21
def register_fonts():
    """Register the TrueType fonts used by the ReportLab renderer.

    Each font is registered independently, so a missing or unreadable file
    only affects that one font instead of skipping every font listed after
    it. Failures are reported with ``st.warning`` and never raised.
    """
    # (ReportLab font name, TTF file) pairs.
    # NOTE(review): the Urdu entry is registered under a Noto name but loads
    # a Nafees Nastaleeq file -- confirm which font is actually shipped.
    fonts = (
        ('NotoNastaliqUrdu', 'NafeesNastaleeqXX.ttf'),
        ('NotoNaskhArabic', 'NotoNaskhArabic-Regular.ttf'),
        ('NotoSans', 'NotoSans-Regular.ttf'),
    )
    for font_name, font_file in fonts:
        try:
            pdfmetrics.registerFont(TTFont(font_name, font_file))
        except Exception as e:  # best effort: the app must still start
            st.warning(
                f"Font file '{font_file}' for {font_name} could not be loaded. "
                f"Default fonts will be used. Error: {str(e)}"
            )
32
+
33
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of an uploaded PDF.

    Args:
        pdf_file: A file-like object (or path) accepted by ``PyPDF2.PdfReader``.

    Returns:
        The text of all pages joined with newlines. Pages that yield no
        text contribute an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next; `or ""` guards against a page returning None.
    return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
40
+
41
def create_pdf(text, target_language):
    """Render translated text into a PDF held in memory.

    Urdu is laid out with WeasyPrint (right-to-left HTML/CSS). Every other
    language is drawn line by line on an A4 ReportLab canvas; Arabic is
    reshaped, bidi-reordered and right-aligned.

    Args:
        text: The translated text; newlines separate lines/paragraphs.
        target_language: Language name chosen in the UI (e.g. "Urdu", "Arabic").

    Returns:
        A ``BytesIO`` positioned at offset 0 that contains the PDF bytes.
    """
    if target_language == "Urdu":
        return _render_urdu_pdf(text)
    return _render_canvas_pdf(text, right_to_left=(target_language == "Arabic"))


def _is_urdu_char(char):
    """Return True when char lies in the Arabic Unicode block (U+0600-U+06FF)."""
    return '\u0600' <= char <= '\u06FF'


def _urdu_paragraphs_html(text):
    """Turn plain text into <p> elements, wrapping non-Urdu runs in latin spans."""
    paragraphs = []
    for line in text.split('\n'):
        # Group consecutive characters into (is_urdu, [chars]) runs.
        runs = []
        for char in line:
            urdu = _is_urdu_char(char)
            if runs and runs[-1][0] == urdu:
                runs[-1][1].append(char)
            else:
                runs.append((urdu, [char]))

        pieces = []
        for urdu, chars in runs:
            # Escape so that '<', '>' and '&' in the translated text cannot
            # break the markup or inject elements into the rendered document.
            chunk = html.escape(''.join(chars), quote=False)
            if urdu:
                pieces.append(chunk)
            else:
                pieces.append(f'<span class="latin">{chunk}</span>')

        paragraphs.append('<p class="urdu-text">' + ''.join(pieces) + '</p>')
    return '\n'.join(paragraphs)


def _render_urdu_pdf(text):
    """Lay out Urdu text with WeasyPrint and return the PDF buffer."""
    font_config = FontConfiguration()
    processed_text = _urdu_paragraphs_html(text)

    # NOTE(review): the @font-face URL names NotoNastaliqUrdu-Regular.ttf in a
    # fonts/ directory, while register_fonts loads a different Urdu font file
    # -- confirm which file is actually deployed.
    html_content = f"""
<!DOCTYPE html>
<html dir="rtl" lang="ur">
<head>
<meta charset="UTF-8">
<style>
@font-face {{
    font-family: 'NotoNastaliqUrdu';
    src: url('fonts/NotoNastaliqUrdu-Regular.ttf') format('truetype');
    font-weight: normal;
    font-style: normal;
}}

@page {{
    size: A4;
    margin: 3cm 2.5cm;
}}

body {{
    font-family: 'NotoNastaliqUrdu', serif;
    font-size: 16pt;
    line-height: 3;
    margin: 0;
    padding: 0;
    direction: rtl;
    text-align: right;
    text-rendering: optimizeLegibility;
    -webkit-font-smoothing: antialiased;
}}

.content {{
    width: 100%;
    max-width: 18cm;
    margin: 0 auto;
}}

.urdu-text {{
    margin: 0 0 2em 0;
    padding: 0;
    text-align: right;
    white-space: pre-wrap;
    word-wrap: break-word;
    font-feature-settings: "kern", "liga", "calt";
    letter-spacing: 0.02em;
}}

.latin {{
    font-family: Arial, sans-serif;
    direction: ltr;
    unicode-bidi: embed;
    font-size: 14pt;
}}

/* Improve spacing around punctuation */
.urdu-text::after {{
    content: "";
    display: block;
    height: 1.5em;
}}
</style>
</head>
<body>
<div class="content">
{processed_text}
</div>
</body>
</html>
"""

    # Render straight from the string: no temporary file is created, so
    # nothing can be left behind if rendering fails. base_url makes the
    # relative font URL resolve next to this script instead of inside the
    # system temp directory.
    base_url = os.path.dirname(os.path.abspath(__file__))

    buffer = BytesIO()
    HTML(string=html_content, base_url=base_url).write_pdf(
        buffer,
        font_config=font_config,
        stylesheets=[CSS(string='''
            @page {
                size: A4;
                margin: 3cm 2.5cm;
                @top-right {
                    content: "";
                    margin: 1cm 0;
                }
                @bottom-center {
                    content: counter(page);
                    font-family: Arial, sans-serif;
                }
            }
        ''')]
    )
    buffer.seek(0)
    return buffer


def _select_font(c, preferred, size, label):
    """Activate the preferred font on canvas c, falling back to Helvetica 12."""
    try:
        c.setFont(preferred, size)
        return preferred, size
    except Exception as e:  # font was never registered (e.g. file missing)
        st.warning(f"{label} rendering error: {str(e)}")
        c.setFont('Helvetica', 12)
        return 'Helvetica', 12


def _render_canvas_pdf(text, right_to_left):
    """Draw text line by line on an A4 ReportLab canvas and return the buffer."""
    from reportlab.lib.utils import simpleSplit

    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=A4)
    width, height = A4
    margin = 50
    top_y = height - 50
    max_width = width - 2 * margin
    label = "Arabic" if right_to_left else "Text"

    if right_to_left:
        font_name, font_size = _select_font(c, 'NotoNaskhArabic', 14, label)
    else:
        font_name, font_size = _select_font(c, 'NotoSans', 12, label)
    line_height = font_size * 1.5

    y = top_y
    try:
        for raw_line in text.split('\n'):
            # Shape Arabic letters first so widths are measured on the glyph
            # forms that will actually be drawn.
            logical = arabic_reshaper.reshape(raw_line) if right_to_left else raw_line

            # Lines that already fit are drawn unchanged; only over-wide
            # lines are word-wrapped so they no longer run off the page.
            if c.stringWidth(logical, font_name, font_size) <= max_width:
                parts = [logical]
            else:
                parts = simpleSplit(logical, font_name, font_size, max_width) or ['']

            for part in parts:
                if y < 50:
                    c.showPage()
                    y = top_y
                    # showPage resets canvas state, so select the font again.
                    c.setFont(font_name, font_size)

                if right_to_left:
                    # Reorder each visual line on its own, after wrapping.
                    part = get_display(part)
                    x = width - margin - c.stringWidth(part, font_name, font_size)
                else:
                    x = margin
                c.drawString(x, y, part)
                y -= line_height
    except Exception as e:
        # Best effort: keep whatever was drawn so far and report the problem.
        st.warning(f"{label} rendering error: {str(e)}")

    c.save()
    buffer.seek(0)
    return buffer
232
+
233
def translate_text(text, target_language, api_key):
    """Translate text into target_language with the OpenAI chat API.

    Args:
        text: Source text to translate.
        target_language: Name of the language to translate into.
        api_key: OpenAI API key used to create the client.

    Returns:
        The translated text. An empty string is returned when ``text`` is
        empty or whitespace-only (no API call is made) or when the response
        carries no content. On any failure the function does not raise; it
        returns a string starting with ``"Translation error: "``.
    """
    # Nothing to translate (e.g. a scanned PDF with no text layer): skip the
    # API call instead of sending an empty user message.
    if not text or not text.strip():
        return ""

    try:
        client = openai.OpenAI(api_key=api_key)

        # Enhanced prompt for better translation
        system_prompt = f"""You are a professional translator specializing in {target_language}.
        Translate the following text to {target_language}, ensuring:
        1. Technical terms are accurately translated
        2. Maintain formal language and proper grammar
        3. Preserve formatting and structure
        4. Keep proper nouns and technical terms like 'AI', 'LLMs', 'Python' in English where appropriate
        5. Use culturally appropriate expressions
        6. For Urdu/Arabic, ensure proper character connections and diacritics
        7. Maintain professional and accurate technical translations
        8. Preserve line breaks and paragraph structure
        """

        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text}
            ],
            temperature=0.3
        )
        # The message content may be None; normalise it to a string.
        return response.choices[0].message.content or ""
    except Exception as e:
        # Callers expect a string back, so the error is reported in-band.
        return f"Translation error: {str(e)}"
262
+
263
# Set page config
st.set_page_config(page_title="PDF Translator", layout="wide")

# Try to register fonts at startup (failures only produce warnings)
register_fonts()

# Main app interface
st.title("PDF Document Translator")

# Add custom CSS for better text display
st.markdown("""
<style>
.stTextArea textarea {
    font-size: 16px !important;
}
</style>
""", unsafe_allow_html=True)

# Language selection: UI label -> language name passed to the translator
languages = {
    "English": "English",
    "Urdu": "Urdu",
    "Arabic": "Arabic",
    "Roman English": "Roman English",
    "Roman Urdu": "Roman Urdu",
    "Hindi": "Hindi",
    "Spanish": "Spanish",
    "French": "French"
}

# File uploader
uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")

# API Key input field; a typed key overrides the one read from the environment
api_key_input = st.text_input("Enter OpenAI API Key:", type="password", key="api_key_input")
if api_key_input:
    api_key = api_key_input

# Language selector
target_language = st.selectbox(
    "Select target language",
    options=list(languages.keys())
)

# Create two columns for original and translated text
col1, col2 = st.columns(2)

if uploaded_file is not None and api_key:
    # Extract text from PDF
    with st.spinner("Extracting text from PDF..."):
        text = extract_text_from_pdf(uploaded_file)

    # Show original text
    with col1:
        st.subheader("Original Text")
        st.text_area("", value=text, height=400, key="original_text")

    # Session state keeps the translation, and the language it was produced
    # for, across the reruns Streamlit performs on every interaction.
    if 'translated_text' not in st.session_state:
        st.session_state.translated_text = None
    if 'translated_language' not in st.session_state:
        st.session_state.translated_language = None

    # Translate button
    if st.button("Translate"):
        with st.spinner("Translating..."):
            result = translate_text(text, languages[target_language], api_key)

        if result and result.startswith("Translation error:"):
            # translate_text reports failures in-band; show them as errors
            # instead of offering the message as a translated document.
            st.error(result)
            st.session_state.translated_text = None
            st.session_state.translated_language = None
        elif not result:
            st.warning("No translated text was returned.")
            st.session_state.translated_text = None
            st.session_state.translated_language = None
        else:
            st.session_state.translated_text = result
            st.session_state.translated_language = target_language

    # Show the stored translation on every rerun, not only on the run in
    # which the button was clicked.
    if st.session_state.translated_text:
        # Use the language the translation was made for, so the PDF layout
        # and file name stay correct if the selector is changed afterwards.
        shown_language = st.session_state.translated_language or target_language

        with col2:
            st.subheader(f"Translated Text ({shown_language})")
            # No widget key here: a key named "translated_text" would clash
            # with the session-state entry assigned above.
            st.text_area("", value=st.session_state.translated_text, height=400)

        # Create PDF button
        if st.download_button(
            label="Download Translated PDF",
            data=create_pdf(st.session_state.translated_text, shown_language),
            file_name=f"translated_{shown_language}.pdf",
            mime="application/pdf"
        ):
            st.success("PDF downloaded successfully!")

elif not api_key:
    st.warning("Please enter your OpenAI API key to proceed.")

# Add instructions and notes
st.markdown("""
### Instructions:
1. Enter your OpenAI API key
2. Upload your PDF file
3. Select your target language
4. Click 'Translate' to get your translation
5. Review the translation
6. Click 'Download Translated PDF' to save as PDF
""")
359
+ import openai
360
+ from io import BytesIO
361
+ import io
362
+ from reportlab.pdfgen import canvas
363
+ from reportlab.lib.pagesizes import letter, A4
364
+ from reportlab.pdfbase import pdfmetrics
365
+ from reportlab.pdfbase.ttfonts import TTFont
366
  from reportlab.lib.utils import simpleSplit
367
  from reportlab.lib.colors import black
368
  import arabic_reshaper