Toadoum committed on
Commit
3c15094
Β·
verified Β·
1 Parent(s): e05701a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +177 -72
app.py CHANGED
@@ -66,8 +66,7 @@ def extract_text_from_docx(file_path: str) -> str:
66
 
67
  text = ''.join(texts)
68
  if text.strip():
69
- # Add paragraph breaks
70
- return text.replace(' ', '\n\n')
71
  except Exception as e:
72
  print(f"XML extraction failed: {e}")
73
 
@@ -80,64 +79,68 @@ def extract_text_from_docx(file_path: str) -> str:
80
  except Exception as e:
81
  print(f"python-docx failed: {e}")
82
 
83
- # Method 3: Read as binary and extract readable text
84
  try:
85
- with open(file_path, 'rb') as f:
86
- content = f.read()
87
-
88
- # Try to decode text portions
89
- text_parts = []
90
- try:
91
- # Look for XML text content
92
- import re
93
- # Find text between <w:t> tags
94
- matches = re.findall(b'<w:t[^>]*>([^<]+)</w:t>', content)
95
- for match in matches:
96
- try:
97
- text_parts.append(match.decode('utf-8'))
98
- except:
99
- pass
100
-
101
- if text_parts:
102
- return ' '.join(text_parts)
103
- except Exception as e:
104
- print(f"Binary extraction failed: {e}")
105
  except Exception as e:
106
- print(f"File read failed: {e}")
107
 
108
- raise ValueError("Could not extract text from this DOCX file. The file may be corrupted or in an unsupported format. Please try:\n1. Open in Word and Save As a new .docx\n2. Convert to PDF\n3. Copy text to a .txt file")
109
 
110
  def extract_text_from_doc(file_path: str) -> str:
111
- """Extract text from old .doc format."""
112
- import subprocess
113
-
114
- # Try antiword (if installed)
115
  try:
116
- result = subprocess.run(
117
- ['antiword', file_path],
118
- capture_output=True,
119
- text=True,
120
- timeout=30
121
- )
122
- if result.returncode == 0 and result.stdout.strip():
123
- return result.stdout.strip()
124
- except (FileNotFoundError, subprocess.TimeoutExpired):
125
- pass
126
 
127
- # Try catdoc (if installed)
128
  try:
129
- result = subprocess.run(
130
- ['catdoc', file_path],
131
- capture_output=True,
132
- text=True,
133
- timeout=30
134
- )
135
- if result.returncode == 0 and result.stdout.strip():
136
- return result.stdout.strip()
137
- except (FileNotFoundError, subprocess.TimeoutExpired):
138
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
- raise ValueError("Cannot read .doc files on this server. Please convert to .docx, .pdf, or .txt format.")
141
 
142
  def extract_text(file_path: str) -> str:
143
  """Extract text from uploaded file."""
@@ -153,7 +156,7 @@ def extract_text(file_path: str) -> str:
153
  with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
154
  return f.read()
155
  else:
156
- raise ValueError(f"Unsupported format: {ext}. Please use PDF, DOCX, or TXT.")
157
 
158
  # ============================================
159
  # LAZY MODEL LOADING
@@ -290,6 +293,8 @@ def format_time(seconds: float) -> str:
290
  # ============================================
291
  # MAIN PIPELINE
292
  # ============================================
 
 
293
  def process_document(file, progress=gr.Progress()):
294
  """Main pipeline: Document β†’ Translation β†’ TTS β†’ Audiobook"""
295
 
@@ -298,34 +303,126 @@ def process_document(file, progress=gr.Progress()):
298
 
299
  try:
300
  # Extract text
301
- progress(0.1, desc="πŸ“„ Extracting text...")
302
- text = extract_text(file.name)[:2000] # Limit for POC
 
 
 
 
 
 
 
 
 
 
 
 
303
 
304
- if not text:
305
- return None, "", "", "⚠️ No text found"
 
306
 
307
- # Translate
308
- progress(0.3, desc="🌍 Translating to Hausa...")
309
- translated = translate_text(text)
310
 
311
- # Generate audio
312
- progress(0.6, desc="πŸŽ™οΈ Generating audio...")
313
- audio, timestamps = generate_audio(translated)
314
 
315
- # Save
316
- progress(0.9, desc="πŸ’Ύ Saving...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
318
- wavfile.write(f.name, SAMPLE_RATE, (audio * 32767).astype(np.int16))
319
  audio_path = f.name
320
 
321
  # Format output
322
  timestamps_text = "\n".join([f"[{t['start']} β†’ {t['end']}] {t['text']}" for t in timestamps])
323
- transcript = f"## Original (English)\n{text[:500]}{'...' if len(text) > 500 else ''}\n\n## Translation (Hausa)\n{translated}"
 
 
 
 
 
 
 
 
 
 
 
 
 
324
 
325
  progress(1.0, desc="βœ… Done!")
326
- return audio_path, transcript, timestamps_text, "βœ… Audiobook generated!"
327
 
328
  except Exception as e:
 
 
329
  return None, "", "", f"❌ Error: {str(e)}"
330
 
331
  # ============================================
@@ -345,16 +442,24 @@ with gr.Blocks(
345
 
346
  with gr.Row():
347
  with gr.Column(scale=1):
348
- file_input = gr.File(label="πŸ“ Upload PDF, DOCX, or TXT", file_types=[".pdf", ".docx", ".txt"])
 
 
 
 
349
  btn = gr.Button("πŸš€ Generate Audiobook", variant="primary", size="lg")
350
  status = gr.Textbox(label="Status", interactive=False)
351
 
352
  gr.Markdown("""
353
  ### How it works
354
- 1. Upload English document
355
  2. AI translates to Hausa
356
- 3. TTS generates audio
357
- 4. Download with timestamps
 
 
 
 
358
  """)
359
 
360
  with gr.Column(scale=2):
@@ -363,7 +468,7 @@ with gr.Blocks(
363
  with gr.Tab("πŸ“œ Transcript"):
364
  transcript = gr.Markdown()
365
  with gr.Tab("⏱️ Timestamps"):
366
- timestamps = gr.Textbox(lines=8, interactive=False)
367
 
368
  gr.HTML("""<div style="text-align: center; padding: 1rem; background: #f8f9fa; border-radius: 8px; margin-top: 1rem;">
369
  <strong>PlotWeaver</strong> - AI for African Languages
 
66
 
67
  text = ''.join(texts)
68
  if text.strip():
69
+ return text
 
70
  except Exception as e:
71
  print(f"XML extraction failed: {e}")
72
 
 
79
  except Exception as e:
80
  print(f"python-docx failed: {e}")
81
 
82
+ # Method 3: Use PyMuPDF (can handle some docx too)
83
  try:
84
+ doc = fitz.open(file_path)
85
+ text = ""
86
+ for page in doc:
87
+ text += page.get_text() + "\n"
88
+ doc.close()
89
+ if text.strip():
90
+ return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  except Exception as e:
92
+ print(f"PyMuPDF failed: {e}")
93
 
94
+ raise ValueError("Could not extract text from this DOCX file. Please convert to PDF or TXT.")
95
 
96
def extract_text_from_doc(file_path: str) -> str:
    """Extract text from an old binary .doc file.

    Tries PyMuPDF first (it can open some .doc files directly), then falls
    back to a crude printable-ASCII scan of the OLE 'WordDocument' stream
    via olefile.

    Args:
        file_path: Path to the .doc file.

    Returns:
        The extracted text, stripped of surrounding whitespace.

    Raises:
        ValueError: If no extraction method produced any text.
    """
    # Method 1: PyMuPDF can open some .doc files.
    try:
        doc = fitz.open(file_path)
        text = ""
        for page in doc:
            text += page.get_text() + "\n"
        doc.close()
        if text.strip():
            return text.strip()
    except Exception as e:
        print(f"PyMuPDF .doc failed: {e}")

    # Method 2: Fallback — scan the OLE 'WordDocument' stream for runs of
    # printable ASCII. Crude, but recovers most body text from simple docs.
    try:
        import olefile
        ole = olefile.OleFileIO(file_path)
        try:
            # Try to find the WordDocument stream
            if ole.exists('WordDocument'):
                data = ole.openstream('WordDocument').read()

                # Collect runs of printable ASCII; any other byte ends a run.
                text_parts = []
                current_text = []
                for byte in data:
                    if 32 <= byte < 127:  # Printable ASCII
                        current_text.append(chr(byte))
                    elif current_text:
                        text_parts.append(''.join(current_text))
                        current_text = []
                if current_text:
                    text_parts.append(''.join(current_text))

                # Runs of <= 3 chars are almost always binary noise, not words.
                text = ' '.join([t for t in text_parts if len(t) > 3])
                if text.strip():
                    return text.strip()
        finally:
            # BUG FIX: the handle was previously closed only on the success
            # path, leaking it when the stream was missing or text was empty.
            ole.close()
    except ImportError:
        print("olefile not installed")
    except Exception as e:
        print(f"olefile failed: {e}")

    raise ValueError("Cannot read this .doc file. Please convert to .docx, .pdf, or .txt format.\n\nTip: Open in Microsoft Word or LibreOffice and 'Save As' a different format.")
144
 
145
  def extract_text(file_path: str) -> str:
146
  """Extract text from uploaded file."""
 
156
  with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
157
  return f.read()
158
  else:
159
+ raise ValueError(f"Unsupported format: {ext}. Please use PDF, DOCX, DOC, or TXT.")
160
 
161
  # ============================================
162
  # LAZY MODEL LOADING
 
293
  # ============================================
294
  # MAIN PIPELINE
295
  # ============================================
296
+ MAX_CHARS = 10000 # Max characters to process (increase for longer files)
297
+
298
  def process_document(file, progress=gr.Progress()):
299
  """Main pipeline: Document β†’ Translation β†’ TTS β†’ Audiobook"""
300
 
 
303
 
304
  try:
305
  # Extract text
306
+ progress(0.05, desc="πŸ“„ Extracting text...")
307
+ full_text = extract_text(file.name)
308
+
309
+ if not full_text or not full_text.strip():
310
+ return None, "", "", "⚠️ No text found in document"
311
+
312
+ # Limit text length with warning
313
+ original_length = len(full_text)
314
+ if original_length > MAX_CHARS:
315
+ text = full_text[:MAX_CHARS]
316
+ truncated_msg = f"\n\n⚠️ Text truncated from {original_length:,} to {MAX_CHARS:,} characters for demo."
317
+ else:
318
+ text = full_text
319
+ truncated_msg = ""
320
 
321
+ # Split into sentences for batch processing
322
+ sentences = re.split(r'(?<=[.!?])\s+', text)
323
+ total_sentences = len(sentences)
324
 
325
+ # Translate in batches
326
+ progress(0.1, desc=f"🌍 Translating {total_sentences} sentences...")
327
+ translated_sentences = []
328
 
329
+ model, tokenizer = get_translation_model()
330
+ device = "cuda" if torch.cuda.is_available() else "cpu"
331
+ tgt_lang_id = tokenizer.convert_tokens_to_ids(TGT_LANG)
332
 
333
+ with torch.no_grad():
334
+ for i, sentence in enumerate(sentences):
335
+ if not sentence.strip():
336
+ continue
337
+
338
+ # Update progress
339
+ prog = 0.1 + (0.4 * (i / total_sentences))
340
+ progress(prog, desc=f"🌍 Translating sentence {i+1}/{total_sentences}...")
341
+
342
+ inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=256)
343
+ if device == "cuda":
344
+ inputs = {k: v.cuda() for k, v in inputs.items()}
345
+
346
+ outputs = model.generate(
347
+ **inputs,
348
+ forced_bos_token_id=tgt_lang_id,
349
+ max_length=256,
350
+ num_beams=4,
351
+ )
352
+
353
+ translated_sentences.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
354
+
355
+ translated = " ".join(translated_sentences)
356
+
357
+ # Generate audio in batches
358
+ progress(0.5, desc="πŸŽ™οΈ Generating audio...")
359
+ chunks = split_text(translated)
360
+ total_chunks = len(chunks)
361
+
362
+ tts_model, tts_tokenizer = get_tts_model()
363
+ audio_segments = []
364
+ timestamps = []
365
+ current_time = 0.0
366
+
367
+ with torch.no_grad():
368
+ for i, chunk in enumerate(chunks):
369
+ if not chunk.strip():
370
+ continue
371
+
372
+ # Update progress
373
+ prog = 0.5 + (0.4 * (i / total_chunks))
374
+ progress(prog, desc=f"πŸŽ™οΈ Generating audio {i+1}/{total_chunks}...")
375
+
376
+ inputs = tts_tokenizer(chunk, return_tensors="pt")
377
+ if device == "cuda":
378
+ inputs = {k: v.cuda() for k, v in inputs.items()}
379
+
380
+ audio = tts_model(**inputs).waveform.squeeze().cpu().numpy()
381
+ audio_segments.append(audio)
382
+
383
+ duration = len(audio) / SAMPLE_RATE
384
+ timestamps.append({
385
+ "start": format_time(current_time),
386
+ "end": format_time(current_time + duration),
387
+ "text": chunk
388
+ })
389
+ current_time += duration
390
+
391
+ # Concatenate audio
392
+ if not audio_segments:
393
+ return None, "", "", "❌ No audio generated"
394
+
395
+ full_audio = np.concatenate(audio_segments)
396
+
397
+ # Save audio
398
+ progress(0.95, desc="πŸ’Ύ Saving audiobook...")
399
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
400
+ wavfile.write(f.name, SAMPLE_RATE, (full_audio * 32767).astype(np.int16))
401
  audio_path = f.name
402
 
403
  # Format output
404
  timestamps_text = "\n".join([f"[{t['start']} β†’ {t['end']}] {t['text']}" for t in timestamps])
405
+
406
+ # Calculate audio duration
407
+ audio_duration = len(full_audio) / SAMPLE_RATE
408
+ duration_str = f"{int(audio_duration // 60)}:{int(audio_duration % 60):02d}"
409
+
410
+ transcript = f"""## Original (English)
411
+ {text[:1000]}{'...' if len(text) > 1000 else ''}{truncated_msg}
412
+
413
+ ## Translation (Hausa)
414
+ {translated}
415
+
416
+ ---
417
+ πŸ“Š **Stats**: {len(text):,} chars β†’ {len(translated):,} chars | 🎡 Duration: {duration_str}
418
+ """
419
 
420
  progress(1.0, desc="βœ… Done!")
421
+ return audio_path, transcript, timestamps_text, f"βœ… Audiobook generated! Duration: {duration_str}"
422
 
423
  except Exception as e:
424
+ import traceback
425
+ traceback.print_exc()
426
  return None, "", "", f"❌ Error: {str(e)}"
427
 
428
  # ============================================
 
442
 
443
  with gr.Row():
444
  with gr.Column(scale=1):
445
+ file_input = gr.File(
446
+ label="πŸ“ Upload Document",
447
+ file_types=[".pdf", ".docx", ".doc", ".txt"],
448
+ type="filepath"
449
+ )
450
  btn = gr.Button("πŸš€ Generate Audiobook", variant="primary", size="lg")
451
  status = gr.Textbox(label="Status", interactive=False)
452
 
453
  gr.Markdown("""
454
  ### How it works
455
+ 1. Upload English document (PDF, DOCX, DOC, TXT)
456
  2. AI translates to Hausa
457
+ 3. TTS generates natural audio
458
+ 4. Download audiobook with timestamps
459
+
460
+ ---
461
+ ⏱️ **Processing time**: ~1-2 min per page
462
+ πŸ“„ **Max length**: 10,000 characters (~4 pages)
463
  """)
464
 
465
  with gr.Column(scale=2):
 
468
  with gr.Tab("πŸ“œ Transcript"):
469
  transcript = gr.Markdown()
470
  with gr.Tab("⏱️ Timestamps"):
471
+ timestamps = gr.Textbox(lines=10, interactive=False)
472
 
473
  gr.HTML("""<div style="text-align: center; padding: 1rem; background: #f8f9fa; border-radius: 8px; margin-top: 1rem;">
474
  <strong>PlotWeaver</strong> - AI for African Languages