Toadoum committed on
Commit
3c15094
Β·
verified Β·
1 Parent(s): e05701a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +177 -72
app.py CHANGED
@@ -66,8 +66,7 @@ def extract_text_from_docx(file_path: str) -> str:
66
 
67
  text = ''.join(texts)
68
  if text.strip():
69
- # Add paragraph breaks
70
- return text.replace(' ', '\n\n')
71
  except Exception as e:
72
  print(f"XML extraction failed: {e}")
73
 
@@ -80,64 +79,68 @@ def extract_text_from_docx(file_path: str) -> str:
80
  except Exception as e:
81
  print(f"python-docx failed: {e}")
82
 
83
- # Method 3: Read as binary and extract readable text
84
  try:
85
- with open(file_path, 'rb') as f:
86
- content = f.read()
87
-
88
- # Try to decode text portions
89
- text_parts = []
90
- try:
91
- # Look for XML text content
92
- import re
93
- # Find text between <w:t> tags
94
- matches = re.findall(b'<w:t[^>]*>([^<]+)</w:t>', content)
95
- for match in matches:
96
- try:
97
- text_parts.append(match.decode('utf-8'))
98
- except:
99
- pass
100
-
101
- if text_parts:
102
- return ' '.join(text_parts)
103
- except Exception as e:
104
- print(f"Binary extraction failed: {e}")
105
  except Exception as e:
106
- print(f"File read failed: {e}")
107
 
108
- raise ValueError("Could not extract text from this DOCX file. The file may be corrupted or in an unsupported format. Please try:\n1. Open in Word and Save As a new .docx\n2. Convert to PDF\n3. Copy text to a .txt file")
109
 
110
  def extract_text_from_doc(file_path: str) -> str:
111
- """Extract text from old .doc format."""
112
- import subprocess
113
-
114
- # Try antiword (if installed)
115
  try:
116
- result = subprocess.run(
117
- ['antiword', file_path],
118
- capture_output=True,
119
- text=True,
120
- timeout=30
121
- )
122
- if result.returncode == 0 and result.stdout.strip():
123
- return result.stdout.strip()
124
- except (FileNotFoundError, subprocess.TimeoutExpired):
125
- pass
126
 
127
- # Try catdoc (if installed)
128
  try:
129
- result = subprocess.run(
130
- ['catdoc', file_path],
131
- capture_output=True,
132
- text=True,
133
- timeout=30
134
- )
135
- if result.returncode == 0 and result.stdout.strip():
136
- return result.stdout.strip()
137
- except (FileNotFoundError, subprocess.TimeoutExpired):
138
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
- raise ValueError("Cannot read .doc files on this server. Please convert to .docx, .pdf, or .txt format.")
141
 
142
  def extract_text(file_path: str) -> str:
143
  """Extract text from uploaded file."""
@@ -153,7 +156,7 @@ def extract_text(file_path: str) -> str:
153
  with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
154
  return f.read()
155
  else:
156
- raise ValueError(f"Unsupported format: {ext}. Please use PDF, DOCX, or TXT.")
157
 
158
  # ============================================
159
  # LAZY MODEL LOADING
@@ -290,6 +293,8 @@ def format_time(seconds: float) -> str:
290
  # ============================================
291
  # MAIN PIPELINE
292
  # ============================================
 
 
293
  def process_document(file, progress=gr.Progress()):
294
  """Main pipeline: Document β†’ Translation β†’ TTS β†’ Audiobook"""
295
 
@@ -298,34 +303,126 @@ def process_document(file, progress=gr.Progress()):
298
 
299
  try:
300
  # Extract text
301
- progress(0.1, desc="πŸ“„ Extracting text...")
302
- text = extract_text(file.name)[:2000] # Limit for POC
 
 
 
 
 
 
 
 
 
 
 
 
303
 
304
- if not text:
305
- return None, "", "", "⚠️ No text found"
 
306
 
307
- # Translate
308
- progress(0.3, desc="🌍 Translating to Hausa...")
309
- translated = translate_text(text)
310
 
311
- # Generate audio
312
- progress(0.6, desc="πŸŽ™οΈ Generating audio...")
313
- audio, timestamps = generate_audio(translated)
314
 
315
- # Save
316
- progress(0.9, desc="πŸ’Ύ Saving...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
318
- wavfile.write(f.name, SAMPLE_RATE, (audio * 32767).astype(np.int16))
319
  audio_path = f.name
320
 
321
  # Format output
322
  timestamps_text = "\n".join([f"[{t['start']} β†’ {t['end']}] {t['text']}" for t in timestamps])
323
- transcript = f"## Original (English)\n{text[:500]}{'...' if len(text) > 500 else ''}\n\n## Translation (Hausa)\n{translated}"
 
 
 
 
 
 
 
 
 
 
 
 
 
324
 
325
  progress(1.0, desc="βœ… Done!")
326
- return audio_path, transcript, timestamps_text, "βœ… Audiobook generated!"
327
 
328
  except Exception as e:
 
 
329
  return None, "", "", f"❌ Error: {str(e)}"
330
 
331
  # ============================================
@@ -345,16 +442,24 @@ with gr.Blocks(
345
 
346
  with gr.Row():
347
  with gr.Column(scale=1):
348
- file_input = gr.File(label="πŸ“ Upload PDF, DOCX, or TXT", file_types=[".pdf", ".docx", ".txt"])
 
 
 
 
349
  btn = gr.Button("πŸš€ Generate Audiobook", variant="primary", size="lg")
350
  status = gr.Textbox(label="Status", interactive=False)
351
 
352
  gr.Markdown("""
353
  ### How it works
354
- 1. Upload English document
355
  2. AI translates to Hausa
356
- 3. TTS generates audio
357
- 4. Download with timestamps
 
 
 
 
358
  """)
359
 
360
  with gr.Column(scale=2):
@@ -363,7 +468,7 @@ with gr.Blocks(
363
  with gr.Tab("πŸ“œ Transcript"):
364
  transcript = gr.Markdown()
365
  with gr.Tab("⏱️ Timestamps"):
366
- timestamps = gr.Textbox(lines=8, interactive=False)
367
 
368
  gr.HTML("""<div style="text-align: center; padding: 1rem; background: #f8f9fa; border-radius: 8px; margin-top: 1rem;">
369
  <strong>PlotWeaver</strong> - AI for African Languages
 
66
 
67
  text = ''.join(texts)
68
  if text.strip():
69
+ return text
 
70
  except Exception as e:
71
  print(f"XML extraction failed: {e}")
72
 
 
79
  except Exception as e:
80
  print(f"python-docx failed: {e}")
81
 
82
+ # Method 3: Use PyMuPDF (can handle some docx too)
83
  try:
84
+ doc = fitz.open(file_path)
85
+ text = ""
86
+ for page in doc:
87
+ text += page.get_text() + "\n"
88
+ doc.close()
89
+ if text.strip():
90
+ return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  except Exception as e:
92
+ print(f"PyMuPDF failed: {e}")
93
 
94
+ raise ValueError("Could not extract text from this DOCX file. Please convert to PDF or TXT.")
95
 
96
def extract_text_from_doc(file_path: str) -> str:
    """Extract text from an old binary .doc file.

    Tries PyMuPDF first (it can open some .doc files directly), then falls
    back to a crude printable-ASCII scan of the OLE 'WordDocument' stream
    via olefile.

    Args:
        file_path: Path to the .doc file.

    Returns:
        The extracted text, stripped of surrounding whitespace.

    Raises:
        ValueError: If no extraction method produced any text.
    """
    # Method 1: PyMuPDF can open some .doc files.
    try:
        doc = fitz.open(file_path)
        text = ""
        for page in doc:
            text += page.get_text() + "\n"
        doc.close()
        if text.strip():
            return text.strip()
    except Exception as e:
        print(f"PyMuPDF .doc failed: {e}")

    # Method 2: Fallback — scan the OLE 'WordDocument' stream for runs of
    # printable ASCII. Crude, but recovers most body text from simple docs.
    try:
        import olefile
        ole = olefile.OleFileIO(file_path)
        try:
            # Try to find the WordDocument stream
            if ole.exists('WordDocument'):
                data = ole.openstream('WordDocument').read()

                # Collect runs of printable ASCII; any other byte ends a run.
                text_parts = []
                current_text = []
                for byte in data:
                    if 32 <= byte < 127:  # Printable ASCII
                        current_text.append(chr(byte))
                    elif current_text:
                        text_parts.append(''.join(current_text))
                        current_text = []
                if current_text:
                    text_parts.append(''.join(current_text))

                # Runs of <= 3 chars are almost always binary noise, not words.
                text = ' '.join([t for t in text_parts if len(t) > 3])
                if text.strip():
                    return text.strip()
        finally:
            # BUG FIX: the handle was previously closed only on the success
            # path, leaking it when the stream was missing or text was empty.
            ole.close()
    except ImportError:
        print("olefile not installed")
    except Exception as e:
        print(f"olefile failed: {e}")

    raise ValueError("Cannot read this .doc file. Please convert to .docx, .pdf, or .txt format.\n\nTip: Open in Microsoft Word or LibreOffice and 'Save As' a different format.")
144
 
145
  def extract_text(file_path: str) -> str:
146
  """Extract text from uploaded file."""
 
156
  with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
157
  return f.read()
158
  else:
159
+ raise ValueError(f"Unsupported format: {ext}. Please use PDF, DOCX, DOC, or TXT.")
160
 
161
  # ============================================
162
  # LAZY MODEL LOADING
 
293
  # ============================================
294
  # MAIN PIPELINE
295
  # ============================================
296
+ MAX_CHARS = 10000 # Max characters to process (increase for longer files)
297
+
298
  def process_document(file, progress=gr.Progress()):
299
  """Main pipeline: Document β†’ Translation β†’ TTS β†’ Audiobook"""
300
 
 
303
 
304
  try:
305
  # Extract text
306
+ progress(0.05, desc="πŸ“„ Extracting text...")
307
+ full_text = extract_text(file.name)
308
+
309
+ if not full_text or not full_text.strip():
310
+ return None, "", "", "⚠️ No text found in document"
311
+
312
+ # Limit text length with warning
313
+ original_length = len(full_text)
314
+ if original_length > MAX_CHARS:
315
+ text = full_text[:MAX_CHARS]
316
+ truncated_msg = f"\n\n⚠️ Text truncated from {original_length:,} to {MAX_CHARS:,} characters for demo."
317
+ else:
318
+ text = full_text
319
+ truncated_msg = ""
320
 
321
+ # Split into sentences for batch processing
322
+ sentences = re.split(r'(?<=[.!?])\s+', text)
323
+ total_sentences = len(sentences)
324
 
325
+ # Translate in batches
326
+ progress(0.1, desc=f"🌍 Translating {total_sentences} sentences...")
327
+ translated_sentences = []
328
 
329
+ model, tokenizer = get_translation_model()
330
+ device = "cuda" if torch.cuda.is_available() else "cpu"
331
+ tgt_lang_id = tokenizer.convert_tokens_to_ids(TGT_LANG)
332
 
333
+ with torch.no_grad():
334
+ for i, sentence in enumerate(sentences):
335
+ if not sentence.strip():
336
+ continue
337
+
338
+ # Update progress
339
+ prog = 0.1 + (0.4 * (i / total_sentences))
340
+ progress(prog, desc=f"🌍 Translating sentence {i+1}/{total_sentences}...")
341
+
342
+ inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=256)
343
+ if device == "cuda":
344
+ inputs = {k: v.cuda() for k, v in inputs.items()}
345
+
346
+ outputs = model.generate(
347
+ **inputs,
348
+ forced_bos_token_id=tgt_lang_id,
349
+ max_length=256,
350
+ num_beams=4,
351
+ )
352
+
353
+ translated_sentences.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
354
+
355
+ translated = " ".join(translated_sentences)
356
+
357
+ # Generate audio in batches
358
+ progress(0.5, desc="πŸŽ™οΈ Generating audio...")
359
+ chunks = split_text(translated)
360
+ total_chunks = len(chunks)
361
+
362
+ tts_model, tts_tokenizer = get_tts_model()
363
+ audio_segments = []
364
+ timestamps = []
365
+ current_time = 0.0
366
+
367
+ with torch.no_grad():
368
+ for i, chunk in enumerate(chunks):
369
+ if not chunk.strip():
370
+ continue
371
+
372
+ # Update progress
373
+ prog = 0.5 + (0.4 * (i / total_chunks))
374
+ progress(prog, desc=f"πŸŽ™οΈ Generating audio {i+1}/{total_chunks}...")
375
+
376
+ inputs = tts_tokenizer(chunk, return_tensors="pt")
377
+ if device == "cuda":
378
+ inputs = {k: v.cuda() for k, v in inputs.items()}
379
+
380
+ audio = tts_model(**inputs).waveform.squeeze().cpu().numpy()
381
+ audio_segments.append(audio)
382
+
383
+ duration = len(audio) / SAMPLE_RATE
384
+ timestamps.append({
385
+ "start": format_time(current_time),
386
+ "end": format_time(current_time + duration),
387
+ "text": chunk
388
+ })
389
+ current_time += duration
390
+
391
+ # Concatenate audio
392
+ if not audio_segments:
393
+ return None, "", "", "❌ No audio generated"
394
+
395
+ full_audio = np.concatenate(audio_segments)
396
+
397
+ # Save audio
398
+ progress(0.95, desc="πŸ’Ύ Saving audiobook...")
399
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
400
+ wavfile.write(f.name, SAMPLE_RATE, (full_audio * 32767).astype(np.int16))
401
  audio_path = f.name
402
 
403
  # Format output
404
  timestamps_text = "\n".join([f"[{t['start']} β†’ {t['end']}] {t['text']}" for t in timestamps])
405
+
406
+ # Calculate audio duration
407
+ audio_duration = len(full_audio) / SAMPLE_RATE
408
+ duration_str = f"{int(audio_duration // 60)}:{int(audio_duration % 60):02d}"
409
+
410
+ transcript = f"""## Original (English)
411
+ {text[:1000]}{'...' if len(text) > 1000 else ''}{truncated_msg}
412
+
413
+ ## Translation (Hausa)
414
+ {translated}
415
+
416
+ ---
417
+ πŸ“Š **Stats**: {len(text):,} chars β†’ {len(translated):,} chars | 🎡 Duration: {duration_str}
418
+ """
419
 
420
  progress(1.0, desc="βœ… Done!")
421
+ return audio_path, transcript, timestamps_text, f"βœ… Audiobook generated! Duration: {duration_str}"
422
 
423
  except Exception as e:
424
+ import traceback
425
+ traceback.print_exc()
426
  return None, "", "", f"❌ Error: {str(e)}"
427
 
428
  # ============================================
 
442
 
443
  with gr.Row():
444
  with gr.Column(scale=1):
445
+ file_input = gr.File(
446
+ label="πŸ“ Upload Document",
447
+ file_types=[".pdf", ".docx", ".doc", ".txt"],
448
+ type="filepath"
449
+ )
450
  btn = gr.Button("πŸš€ Generate Audiobook", variant="primary", size="lg")
451
  status = gr.Textbox(label="Status", interactive=False)
452
 
453
  gr.Markdown("""
454
  ### How it works
455
+ 1. Upload English document (PDF, DOCX, DOC, TXT)
456
  2. AI translates to Hausa
457
+ 3. TTS generates natural audio
458
+ 4. Download audiobook with timestamps
459
+
460
+ ---
461
+ ⏱️ **Processing time**: ~1-2 min per page
462
+ πŸ“„ **Max length**: 10,000 characters (~4 pages)
463
  """)
464
 
465
  with gr.Column(scale=2):
 
468
  with gr.Tab("πŸ“œ Transcript"):
469
  transcript = gr.Markdown()
470
  with gr.Tab("⏱️ Timestamps"):
471
+ timestamps = gr.Textbox(lines=10, interactive=False)
472
 
473
  gr.HTML("""<div style="text-align: center; padding: 1rem; background: #f8f9fa; border-radius: 8px; margin-top: 1rem;">
474
  <strong>PlotWeaver</strong> - AI for African Languages