devatar commited on
Commit
594a457
·
verified ·
1 Parent(s): d2f7a6e

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +355 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import re
4
+ import requests
5
+ from flask import Flask, request, render_template, send_file, jsonify
6
+ import fitz # PyMuPDF
7
+ from werkzeug.utils import secure_filename
8
+
9
app = Flask(__name__)

# Configuration: working directories for uploads and generated PDFs.
UPLOAD_FOLDER = 'uploads'
OUTPUT_FOLDER = 'outputs'
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['OUTPUT_FOLDER'] = OUTPUT_FOLDER

# Hugging Face API Configuration.
# Read the token from the environment instead of hard-coding a secret in the
# source (the committed line was a bare `HF_API_KEY =`, which is a syntax
# error left behind by secret redaction).
HF_API_KEY = os.environ.get("HF_API_KEY", "")
HF_API_URL = "https://api-inference.huggingface.co/models/facebook/nllb-200-3.3B"
HEADERS = {
    "Authorization": f"Bearer {HF_API_KEY}",
    "X-Wait-For-Model": "180",  # Wait up to 3 minutes for a cold model
    "X-Use-Cache": "0"
}

# NLLB language metadata: forced BOS token id, NLLB language code, ISO 639-1.
# NOTE(review): the original comment says "token IDs for 1.3B model" while the
# URL targets the 3.3B checkpoint — confirm the ids match the deployed model.
LANGUAGES = {
    "Hindi": {"token_id": 256047, "code": "hin_Deva", "iso": "hi"},
    "Tamil": {"token_id": 256157, "code": "tam_Taml", "iso": "ta"},
    "Telugu": {"token_id": 256082, "code": "tel_Telu", "iso": "te"}
}
MAX_LENGTH_DEFAULT = 512

# Native digit glyphs (0-9) per target script, used by convert_numbers_to_script.
DIGIT_MAP = {
    "Hindi": "०१२३४५६७८९",
    "Tamil": "௦௧௨௩௪௫௬௭௮௯",
    "Telugu": "౦౧౨౩౪౫౬౭౮౯"
}
LATIN_DIGITS = "0123456789"
42
+
43
+ # Utility functions
44
def parse_user_entities(user_input):
    """Split a comma-separated entity string into unique, trimmed entries,
    longest first (so longer entities get matched before their substrings)."""
    unique_entities = {entry.strip() for entry in user_input.split(',')}
    unique_entities.discard('')
    return sorted(unique_entities, key=len, reverse=True)
46
+
47
def parse_user_languages(user_input):
    """Parse a comma-separated language list into supported language names.

    Unknown names are dropped; if nothing valid remains, every supported
    language is returned as the default.
    """
    requested = (name.strip().capitalize() for name in user_input.split(','))
    chosen = [name for name in requested if name in LANGUAGES]
    if chosen:
        return chosen
    return list(LANGUAGES.keys())
51
+
52
def replace_with_placeholders(text, entities):
    """Swap content that must survive translation for opaque placeholders.

    Emails, URLs, @usernames, symbol/number expressions and the caller's
    *entities* are replaced with __PRESERVEnnn__ tokens so the translator
    cannot mangle them.

    Returns (modified_text, placeholder_map) where placeholder_map maps each
    placeholder back to the original substring for later restoration.
    """
    placeholder_map = {}
    modified_text = text
    # Order matters: emails before bare @usernames, symbol+number before bare
    # symbols.  (The email TLD class was `[A-Z|a-z]` — the stray `|` matched a
    # literal pipe; fixed to `[A-Za-z]`.)
    patterns = [
        (re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'), "emails"),
        (re.compile(r'https?://\S+|www\.\S+'), "URLs"),
        (re.compile(r'@\w+'), "usernames"),
        (re.compile(r'[\$£€%#@&*]\s*\d+(?:\.\d+)?|\d+(?:\.\d+)?[\$£€%#@&*]'), "symbols_with_numbers"),
        (re.compile(r'[\$£€%#@&*]'), "symbols")
    ]

    for pattern, _ in patterns:
        # dict.fromkeys dedupes repeated matches while preserving order: a
        # string occurring twice gets a single placeholder instead of a second,
        # dangling map entry that never appears in the text.
        for match in dict.fromkeys(pattern.findall(modified_text)):
            placeholder = f"__PRESERVE{len(placeholder_map):03d}__"
            placeholder_map[placeholder] = match
            modified_text = modified_text.replace(match, placeholder)

    for entity in entities:
        pattern = re.compile(re.escape(entity), re.IGNORECASE)
        placeholder = f"__PRESERVE{len(placeholder_map):03d}__"
        modified_text, count = pattern.subn(placeholder, modified_text)
        # Only register the placeholder when the entity actually occurred;
        # the original unconditionally grew the map even for absent entities.
        if count:
            placeholder_map[placeholder] = entity

    return modified_text, placeholder_map
75
+
76
def convert_numbers_to_script(text, target_lang):
    """Rewrite every ASCII digit run in *text* using the target script's digits."""
    script_digits = DIGIT_MAP[target_lang]

    def _localize(match):
        return ''.join(script_digits[int(ch)] for ch in match.group())

    return re.sub(r'\d+', _localize, text)
79
+
80
def translate_batch(texts, target_lang, fast_mode=False):
    """Translate a list of English strings via the HF Inference API.

    Sends the texts in batches of 2 to the NLLB endpoint, retrying on
    model-loading responses and on HTTP 503, and returns the translations in
    submission order.  If a batch exhausts its retries without a list
    response, its translations are silently missing from the result, so the
    returned list may be shorter than *texts* — callers must tolerate that.

    NOTE(review): *fast_mode* is accepted but never read here — confirm
    whether it was meant to tune batch size or retries.
    """
    if not texts:
        return []

    translated_texts = []
    batch_size = 2
    max_retries = 3
    lang_data = LANGUAGES[target_lang]

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        # Allow roughly 2 output tokens per input word, never below the default.
        max_length = max(MAX_LENGTH_DEFAULT, max(len(t.split()) for t in batch) * 2)
        payload = {
            "inputs": batch,
            "parameters": {
                "max_length": max_length,
                # Force the decoder to start in the target language.
                "forced_bos_token_id": lang_data["token_id"],
                "src_lang": "eng_Latn",
                "tgt_lang": lang_data["code"]
            }
        }

        for attempt in range(max_retries):
            try:
                response = requests.post(HF_API_URL, headers=HEADERS, json=payload)
                response.raise_for_status()
                result = response.json()

                if isinstance(result, list):
                    # Success: one dict per input; strip leading ellipses and
                    # trailing runs of dots the model sometimes emits.
                    translated = [r.get("translation_text", "") for r in result]
                    translated_texts.extend([re.sub(r'^\s*…|\.+$', '', t.strip()) for t in translated])
                    break
                else:
                    # Dict response: either "model still loading" (retry after
                    # the server's own estimate) or an unexpected payload.
                    if "estimated_time" in result:
                        wait_time = result["estimated_time"] + 5
                        print(f"Model loading, retrying in {wait_time}s...")
                        time.sleep(wait_time)
                        continue
                    raise ValueError(f"Unexpected response format: {result}")

            except requests.exceptions.HTTPError as e:
                # 503 = server overloaded: back off linearly, then give up and
                # re-raise on the final attempt.
                if e.response.status_code == 503 and attempt < max_retries - 1:
                    wait_time = 10 * (attempt + 1)
                    print(f"Server overloaded, retrying in {wait_time}s...")
                    time.sleep(wait_time)
                    continue
                raise

        # Small pause between batches to be gentle on the rate limit.
        time.sleep(1)

    return translated_texts
131
+
132
# PDF processing helpers: extract_pdf_components, join_spans,
# split_block_into_subblocks, translate_chunk, rebuild_pdf,
# redistribute_translated_text.
134
+
135
def extract_pdf_components(pdf_path):
    """Read a PDF and return, per page, its text blocks with line geometry.

    Each element has the shape
    {"page_num": int, "text_blocks": [{"bbox", "lines": [...]}, ...],
     "size": (width, height)};
    each line records its joined text, origin, font size and bounding box so
    the translated text can later be laid back out in place.
    """
    print(f"\n📄 Extracting components from {pdf_path}...")
    doc = fitz.open(pdf_path)
    components = []
    for page_num, page in enumerate(doc):
        text_blocks = []
        for raw_block in page.get_text("dict")["blocks"]:
            if raw_block["type"] != 0:  # skip non-text (image) blocks
                continue
            lines = []
            for raw_line in raw_block["lines"]:
                spans = raw_line["spans"]
                if not spans:
                    continue
                joined = join_spans(spans)
                if not joined.strip():
                    continue
                first = spans[0]
                lines.append({
                    "text": joined,
                    "y_pos": first["origin"][1],
                    "x_pos": first["origin"][0],
                    "font_size": first["size"],
                    "line_bbox": raw_line["bbox"],
                })
            if lines:
                text_blocks.append({"bbox": raw_block["bbox"], "lines": lines})
        components.append({
            "page_num": page_num,
            "text_blocks": text_blocks,
            "size": (page.rect.width, page.rect.height),
        })
    doc.close()
    return components
161
+
162
def join_spans(spans):
    """Concatenate one line's spans left-to-right into a single string.

    A space is inserted between two spans only when the horizontal gap is at
    least half the smaller of the two spans' average glyph widths; tighter
    gaps mean the spans are fragments of one word and are glued together.

    Fix over the original: a span whose raw text is empty no longer causes a
    ZeroDivisionError when computing its average glyph width.
    """
    if not spans:
        return ""
    spans = sorted(spans, key=lambda s: s["bbox"][0])
    text_parts = [spans[0]["text"].strip()]
    for i in range(1, len(spans)):
        span1, span2 = spans[i - 1], spans[i]
        gap = span2["bbox"][0] - span1["bbox"][2]
        text2 = span2["text"].strip()
        if not text2:
            continue
        # Average glyph width per span; max(..., 1) guards empty span text.
        width1 = (span1["bbox"][2] - span1["bbox"][0]) / max(len(span1["text"]), 1)
        width2 = (span2["bbox"][2] - span2["bbox"][0]) / max(len(text2), 1)
        if gap < 0.5 * min(width1, width2):
            text_parts.append(text2)
        else:
            text_parts.append(" " + text2)
    return "".join(text_parts)
179
+
180
def split_block_into_subblocks(block):
    """Split a text block into translation-sized subblocks.

    A new subblock starts when the accumulated text would exceed ~50 words,
    when a line uses a large (>20pt) font (likely a heading), or when a
    vertical gap / horizontal shift suggests a new paragraph.  Each subblock
    keeps the source line dicts for later re-layout; "is_short" reflects the
    last appended line (<= 3 words).

    Fix over the original: empty subblocks are no longer emitted when a split
    triggers before any text has been accumulated (e.g. the very first line
    is a heading), and a double split no longer yields two empties.
    """
    lines = block["lines"]
    subblocks = []
    current = {"text": "", "lines": [], "is_short": False}
    max_words = 50

    def _flush():
        # Emit the current subblock only if it actually holds text.
        nonlocal current
        if current["text"]:
            subblocks.append(current)
        current = {"text": "", "lines": [], "is_short": False}

    for i, line in enumerate(lines):
        text = line["text"].strip()
        if not text:
            continue

        is_short = len(text.split()) <= 3
        font_size = line["font_size"]

        # Split on size budget or heading-sized fonts.
        if len(current["text"].split()) + len(text.split()) > max_words or font_size > 20:
            _flush()

        # Split on paragraph-like vertical gaps or indentation changes.
        if i > 0:
            gap = lines[i]["y_pos"] - lines[i - 1]["y_pos"] - lines[i - 1]["font_size"]
            x_shift = abs(line["x_pos"] - lines[i - 1]["x_pos"])
            if gap > font_size * 0.5 or x_shift > 10:
                _flush()

        current["text"] += " " + text if current["text"] else text
        current["lines"].append(line)
        current["is_short"] = is_short

    _flush()
    return subblocks
214
+
215
def translate_chunk(chunk, entities, target_lang, fast_mode=False):
    """Translate every subblock of every page in *chunk*, in place.

    Splits each block into subblocks, protects entities/emails/URLs with
    placeholders, batch-translates the texts, then restores placeholders and
    localizes digits, storing the result under each subblock's
    "translated_text" key.

    Fix over the original: translations are paired with the exact subblocks
    that were submitted.  The old code enumerated *all* subblocks while the
    texts list had skipped empty ones, so a single empty-text subblock
    shifted every following translation onto the wrong subblock.
    """
    all_subblocks = []
    for page in chunk:
        for block in page["text_blocks"]:
            subblocks = split_block_into_subblocks(block)
            block["subblocks"] = subblocks
            all_subblocks.extend(subblocks)

    pending = []            # subblocks actually sent for translation
    texts = []              # placeholder-protected input strings
    placeholder_maps = []   # per-text placeholder -> original mapping
    for subblock in all_subblocks:
        if not subblock["text"].strip():
            continue
        modified_text, ph_map = replace_with_placeholders(subblock["text"], entities)
        pending.append(subblock)
        texts.append(modified_text)
        placeholder_maps.append(ph_map)

    translated_texts = translate_batch(texts, target_lang, fast_mode)

    # zip() tolerates translate_batch returning fewer results than inputs
    # (its retry loop can drop a batch), matching the old length guard.
    for subblock, translated, ph_map in zip(pending, translated_texts, placeholder_maps):
        for placeholder, original in ph_map.items():
            translated = translated.replace(placeholder, original)
        subblock["translated_text"] = convert_numbers_to_script(translated, target_lang)
244
+
245
def rebuild_pdf(components, target_lang, output_path, original_pdf_path):
    """Write a translated copy of the original PDF to *output_path*.

    For each extracted text block: redact the original text, then insert the
    translated text line-by-line (as small HTML boxes tagged with the target
    language's ISO code) at the original line rectangles.  Page links are
    captured before redaction and re-inserted afterwards, since redaction can
    remove them.
    """
    doc = fitz.open(original_pdf_path)
    lang_iso = LANGUAGES[target_lang]["iso"]

    for page_data in components:
        page = doc[page_data["page_num"]]
        # Snapshot links up front; they are restored after all redactions.
        links = page.get_links()

        for block in page_data["text_blocks"]:
            page.add_redact_annot(block["bbox"])
            # NOTE(review): apply_redactions() is called once per block rather
            # than once per page — correct but quadratic in blocks; confirm
            # whether batching per page was intended.
            page.apply_redactions()

            for subblock in block.get("subblocks", []):
                # Skip subblocks that were never translated (empty or dropped).
                if not subblock.get("translated_text", "").strip():
                    continue

                # Re-wrap the translated text across the original line boxes.
                translated_lines = redistribute_translated_text(
                    subblock["translated_text"],
                    subblock["lines"]
                )

                for line, translated in zip(subblock["lines"], translated_lines):
                    rect = fitz.Rect(line["line_bbox"])
                    html = f'<p lang="{lang_iso}">{translated}</p>'
                    page.insert_htmlbox(rect, html)

        for link in links:
            page.insert_link(link)

    # garbage=4 + deflate compacts and compresses the rewritten file.
    doc.save(output_path, garbage=4, deflate=True)
    doc.close()
276
+
277
def redistribute_translated_text(translated_text, original_lines):
    """Re-wrap *translated_text* across the geometry of *original_lines*.

    Words are packed line by line using Helvetica text metrics until each
    original line's width is filled; any words left over after the last line
    are appended to the final produced line.  The result is padded with ""
    so its length is at least len(original_lines).

    Fixes over the original: an empty *original_lines* with non-empty text no
    longer raises IndexError on lines[-1], and the Font object is built once
    instead of once per line.
    """
    words = translated_text.split()
    lines = []
    font = fitz.Font("helv")  # hoisted: the font never changes per line

    for line in original_lines:
        if not words:
            break

        max_width = line["line_bbox"][2] - line["line_bbox"][0]
        font_size = line["font_size"]

        current_line = []
        current_width = 0
        while words:
            # Width of the next word plus a trailing space at this line's size.
            word_width = font.text_length(words[0] + " ", fontsize=font_size)
            if current_width + word_width <= max_width:
                current_line.append(words.pop(0))
                current_width += word_width
            else:
                break

        lines.append(" ".join(current_line))

    if words:
        overflow = " ".join(words)
        if lines:
            lines[-1] += " " + overflow
        else:
            # No line geometry to pack into: emit the text as a single line.
            lines.append(overflow)

    return lines + [""] * (len(original_lines) - len(lines))
307
+
308
+ # ... [Keep the Flask routes from previous code] ...
309
@app.route('/')
def index():
    """Render the upload form (templates/index.html)."""
    return render_template('index.html')
312
+
313
@app.route('/translate', methods=['POST'])
def translate_pdf():
    """Handle a PDF upload: translate it into each requested language.

    Form fields: 'pdf_file' (the PDF), 'entities' (comma-separated strings to
    preserve verbatim), 'languages' (comma-separated; defaults to all
    supported).  Returns JSON with a message and the generated file names, or
    a JSON error with status 400/500.
    """
    if 'pdf_file' not in request.files:
        return jsonify({'error': 'No PDF file uploaded'}), 400

    pdf_file = request.files['pdf_file']
    if pdf_file.filename == '':
        return jsonify({'error': 'No file selected'}), 400

    filename = secure_filename(pdf_file.filename)
    pdf_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    pdf_file.save(pdf_path)

    entities = parse_user_entities(request.form.get('entities', ''))
    languages = parse_user_languages(request.form.get('languages', 'Hindi,Tamil,Telugu'))

    try:
        components = extract_pdf_components(pdf_path)
        output_files = []

        for lang in languages:
            start_time = time.time()
            # Small documents (<= 5 pages) opt into fast mode.
            translate_chunk(components, entities, lang, fast_mode=len(components) <= 5)
            # Fix: embed the sanitized upload name in the output file name —
            # the committed code contained a literal "(unknown)" placeholder.
            output_path = os.path.join(app.config['OUTPUT_FOLDER'], f"translated_{lang}_{filename}")
            rebuild_pdf(components, lang, output_path, pdf_path)
            output_files.append(output_path)
            print(f"{lang} translation completed in {time.time()-start_time:.2f}s")

        return jsonify({
            'message': f"Translation completed for {', '.join(languages)}",
            'files': [os.path.basename(f) for f in output_files]
        })

    except Exception as e:
        # Top-level boundary: surface any failure as a JSON 500.
        return jsonify({'error': str(e)}), 500
348
+
349
@app.route('/download/<filename>')
def download_file(filename):
    """Serve a generated PDF from OUTPUT_FOLDER as an attachment.

    The URL-supplied name is sanitized with secure_filename before being
    joined with the output directory, so encoded traversal sequences cannot
    escape OUTPUT_FOLDER.  Returns JSON 404 when the file does not exist.
    """
    safe_name = secure_filename(filename)
    file_path = os.path.join(app.config['OUTPUT_FOLDER'], safe_name)
    if not safe_name or not os.path.exists(file_path):
        return jsonify({'error': 'File not found'}), 404
    return send_file(file_path, as_attachment=True)
353
+
354
# Entry point: bind to all interfaces; port taken from $PORT (default 5000).
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 5000)), debug=False)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
Flask
pymupdf
requests
werkzeug
# NOTE(review): googletrans appears unused by app.py (translation goes
# through the Hugging Face Inference API) — confirm before removing.
googletrans==4.0.0-rc1