Spaces:

Danielah17
/

itv

Sleeping

App Files Files Community

Danielah17 committed on Jan 7

Commit

8ae87a8

verified ·

1 Parent(s): caf8f9b

Upload app.py

Browse files

Files changed (1) hide show

app.py +266 -0

app.py ADDED Viewed

	@@ -0,0 +1,266 @@

+import gradio as gr
+from supertonic import TTS
+from transformers import pipeline
+import tempfile
+import os
+# Initialize the image-to-text pipeline
+image_to_text = pipeline("image-to-text")
+# Initialize the TTS model
+tts = TTS(auto_download=True)
+# Available voice styles (common Supertonic voices)
+VOICE_OPTIONS = [
+    ("M5 - Male Voice (Default)", "M5"),
+    ("M1 - Male Voice 1", "M1"),
+    ("M2 - Male Voice 2", "M2"),
+    ("M3 - Male Voice 3", "M3"),
+    ("M4 - Male Voice 4", "M4"),
+    ("F1 - Female Voice 1", "F1"),
+    ("F2 - Female Voice 2", "F2"),
+    ("F3 - Female Voice 3", "F3"),
+    ("F4 - Female Voice 4", "F4"),
+    ("F5 - Female Voice 5", "F5"),
+]
+def image_to_voice(image, voice_selection):
+    """
+    Convert an image to text, then text to speech.
+    Args:
+        image: Input image (PIL Image or numpy array)
+        voice_selection: Selected voice style from dropdown (e.g., "M5 - Male Voice (Default)")
+    Returns:
+        Path to the generated audio file and extracted text
+    """
+    if image is None:
+        return None, "Please upload an image to get started."
+    try:
+        # Extract voice name from selection (e.g., "M5 - Male Voice (Default)" -> "M5")
+        voice_name = None
+        for opt_label, opt_value in VOICE_OPTIONS:
+            if opt_label == voice_selection:
+                voice_name = opt_value
+                break
+        if voice_name is None:
+            # Fallback: try to extract from the selection if format is unexpected
+            voice_name = voice_selection.split(" - ")[0] if " - " in voice_selection else voice_selection
+        # Convert image to text
+        result = image_to_text(image)
+        generated_text = result[0]['generated_text']
+        # Get the selected voice style
+        style = tts.get_voice_style(voice_name=voice_name)
+        # Convert text to speech
+        wav, duration = tts.synthesize(generated_text, voice_style=style)
+        # Save to a temporary file
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+        tts.save_audio(wav, temp_file.name)
+        return temp_file.name, generated_text
+    except Exception as e:
+        return None, f"❌ Error: {str(e)}"
+# Custom CSS handed to gr.Blocks(css=custom_css) further down in this file.
+# The selectors line up with the class names used by the layout:
+#   - .header, .feature-box, .footer   -> class attributes in the gr.HTML snippets
+#   - .main-content, .upload-section, .output-section,
+#     .generate-btn, .section-title    -> elem_classes on Gradio components
+# NOTE(review): the `select` / `.gr-dropdown` rules target markup generated by
+# Gradio itself; confirm they still match for the installed Gradio version.
+custom_css = """
+    .gradio-container {
+        font-family: 'Inter', 'Segoe UI', system-ui, sans-serif !important;
+    }
+    .header {
+        text-align: center;
+        padding: 2rem 1rem;
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        border-radius: 12px;
+        margin-bottom: 2rem;
+        color: white;
+        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+    }
+    .header h1 {
+        margin: 0;
+        font-size: 2.5rem;
+        font-weight: 700;
+        letter-spacing: -0.02em;
+    }
+    .header p {
+        margin: 0.5rem 0 0 0;
+        opacity: 0.95;
+        font-size: 1.1rem;
+    }
+    .feature-box {
+        background: #f8f9fa;
+        border-radius: 10px;
+        padding: 1.5rem;
+        margin: 1rem 0;
+        border-left: 4px solid #667eea;
+    }
+    .feature-box h3 {
+        margin-top: 0;
+        color: #333;
+        font-size: 1.1rem;
+    }
+    .main-content {
+        max-width: 1200px;
+        margin: 0 auto;
+    }
+    .upload-section {
+        background: white;
+        border-radius: 12px;
+        padding: 2rem;
+        box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
+        margin-bottom: 1.5rem;
+    }
+    .output-section {
+        background: white;
+        border-radius: 12px;
+        padding: 2rem;
+        box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
+    }
+    .generate-btn {
+        width: 100%;
+        padding: 1rem !important;
+        font-size: 1.1rem !important;
+        font-weight: 600 !important;
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
+        border: none !important;
+        border-radius: 8px !important;
+        transition: transform 0.2s, box-shadow 0.2s !important;
+    }
+    .generate-btn:hover {
+        transform: translateY(-2px);
+        box-shadow: 0 6px 12px rgba(102, 126, 234, 0.4) !important;
+    }
+    .footer {
+        text-align: center;
+        padding: 2rem 1rem;
+        margin-top: 3rem;
+        color: #666;
+        font-size: 0.9rem;
+    }
+    .section-title {
+        margin-top: 1rem;
+        margin-bottom: 1rem;
+        color: #333;
+        font-weight: 600;
+    }
+    select, .gr-dropdown {
+        border-radius: 8px !important;
+        border: 2px solid #e0e0e0 !important;
+        padding: 0.75rem !important;
+        font-size: 1rem !important;
+        transition: border-color 0.2s !important;
+    }
+    select:focus, .gr-dropdown:focus {
+        border-color: #667eea !important;
+        outline: none !important;
+    }
+"""
+# Build the Gradio UI. Components are attached to the layout in the order they
+# are created inside the nested `with` blocks, so statement order here is the
+# on-screen order. The resulting app object is bound to `demo`.
+with gr.Blocks(title="Image to Voice Converter", theme=gr.themes.Soft(), css=custom_css) as demo:
+    # Header Section (styled by the .header rules in custom_css)
+    gr.HTML("""
+        <div class="header">
+            <h1>🎙️ Image to Voice Converter</h1>
+            <p>Transform images into speech with AI-powered technology</p>
+        </div>
+    """)
+    # Main Content Container
+    with gr.Column(elem_classes="main-content"):
+        # Instructions Section: three equally weighted columns, one per step
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.HTML("""
+                    <div class="feature-box">
+                        <h3>📷 Step 1: Upload Image</h3>
+                        <p>Upload any image containing text. Our AI will extract it automatically.</p>
+                    </div>
+                """)
+            with gr.Column(scale=1):
+                gr.HTML("""
+                    <div class="feature-box">
+                        <h3>🤖 Step 2: AI Processing</h3>
+                        <p>Advanced vision-language models analyze and extract text from your image.</p>
+                    </div>
+                """)
+            with gr.Column(scale=1):
+                gr.HTML("""
+                    <div class="feature-box">
+                        <h3>🔊 Step 3: Audio Generation</h3>
+                        <p>Text is converted to natural-sounding speech using Supertonic TTS.</p>
+                    </div>
+                """)
+        # Main Workflow Section
+        with gr.Row():
+            # Left Column - Input
+            with gr.Column(scale=1, elem_classes="upload-section"):
+                gr.Markdown("### 📤 Upload Your Image", elem_classes="section-title")
+                # type="pil" requests the upload as a PIL image for the handler;
+                # the label is hidden because the Markdown heading above
+                # already names the field.
+                image_input = gr.Image(
+                    label="",
+                    type="pil",
+                    height=350,
+                    show_label=False
+                )
+                gr.Markdown("### 🎚️ Voice Settings", elem_classes="section-title")
+                # Choices are the human-readable labels from VOICE_OPTIONS;
+                # image_to_voice maps the selected label back to a voice id.
+                # `value` must stay equal to one of those labels.
+                voice_dropdown = gr.Dropdown(
+                    choices=[opt[0] for opt in VOICE_OPTIONS],
+                    label="Select Voice Style",
+                    value="M5 - Male Voice (Default)",
+                    info="Choose a voice style for the generated audio"
+                )
+                generate_btn = gr.Button(
+                    "✨ Generate Audio",
+                    variant="primary",
+                    elem_classes="generate-btn",
+                    size="lg"
+                )
+            # Right Column - Output
+            with gr.Column(scale=1, elem_classes="output-section"):
+                gr.Markdown("### 📝 Extracted Text", elem_classes="section-title")
+                # Read-only box; also shows the message image_to_voice returns
+                # when no image was uploaded or processing failed.
+                text_output = gr.Textbox(
+                    label="",
+                    lines=6,
+                    show_label=False,
+                    placeholder="The extracted text will appear here...",
+                    interactive=False
+                )
+                gr.Markdown("### 🔊 Generated Audio", elem_classes="section-title")
+                # type="filepath" matches the handler, which returns the path
+                # of the generated .wav file (or None on failure).
+                audio_output = gr.Audio(
+                    label="",
+                    type="filepath",
+                    show_label=False
+                )
+        # Connection: the order of `outputs` must match the order of the tuple
+        # returned by image_to_voice, i.e. (audio path, text).
+        generate_btn.click(
+            fn=image_to_voice,
+            inputs=[image_input, voice_dropdown],
+            outputs=[audio_output, text_output],
+            show_progress="full"
+        )
+    # Footer (styled by the .footer rules in custom_css)
+    gr.HTML("""
+        <div class="footer">
+            <p>Powered by <strong>Hugging Face Transformers</strong> & <strong>Supertonic TTS</strong> |
+            Built with ❤️ using Gradio</p>
+        </div>
+    """)
+# Start the web server only when this file is executed as a script, not when
+# it is imported.
+if __name__ == "__main__":
+    demo.launch()