Update app.py
app.py
CHANGED
@@ -1,375 +1,370 @@
-import
-import
-import
-import json
import io
import base64
-from PIL import Image
import os
from datetime import datetime
-import time
-import re

-    print(f"Error initializing Gemini: {e}")
-    GEMINI_AVAILABLE = False
-
-HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HF_TOKEN")

-]

-    "

-    if HF_TOKEN:
-        headers["Authorization"] = f"Bearer {HF_TOKEN}"
-
    try:
-        if
-            return result
-        elif response.status_code == 503:
-            print(f"Model {model_name} is loading")
-            return None
        else:
-            return None
    except Exception as e:
        return None

-    if HF_TOKEN:
-        headers["Authorization"] = f"Bearer {HF_TOKEN}"
-
    try:
-        if response.status_code == 200:
-            return response.content
-        else:
-            print(f"Error with image model {model_name}")
-            return None
-
    except Exception as e:
        return None

-def
-        return "No audio file provided"
-
-    recognizer = sr.Recognizer()
-
    try:
    except Exception as e:
-        return f"Error

-def
-        return text, text
-
    try:
        prompt = f"""
-        Original: {text}
-        """

-        response

-        enhanced_image = image_match.group(1).strip() if image_match else text

    except Exception as e:

-def
-    content_templates = {
-        "blog": f"Write a blog post about: {prompt}\n\nPost:",
-        "social": f"Write a social media post about: {prompt}\n\nPost:",
-        "caption": f"Write a caption for: {prompt}\n\nCaption:",
-        "story": f"Write a story about: {prompt}\n\nStory:"
-    }
-
-    full_prompt = content_templates.get(content_type, prompt)
-
-    for model in TEXT_MODELS:
-        payload = {
-            "inputs": full_prompt,
-            "parameters": {
-                "max_length": 200,
-                "temperature": 0.7
-            }
-        }

-            generated_text = generated_text[len(full_prompt):].strip()

-        except Exception as e:
-            print(f"Error processing result: {e}")
-            continue
-
-    fallback_content = {
-        "blog": f"# About {prompt}\n\nThis is an interesting topic with many aspects to explore. Here are key points:\n\n• Main concepts and principles\n• Practical applications\n• Future possibilities\n\nThis topic offers great potential for discussion.",
-        "social": f"Excited to share thoughts about {prompt}! This is such an important topic. What are your thoughts? #inspiration",
-        "caption": f"✨ {prompt} ✨ Beautiful moments from simple ideas. #creativity #inspiration",
-        "story": f"There was something special about {prompt}. It captured everyone's imagination, leading to wonderful adventures and discoveries."
-    }
-
-    return fallback_content.get(content_type, f"Content about: {prompt}")
-
-def generate_image_from_text(prompt):
-    if GEMINI_AVAILABLE and GEMINI_API_KEY:
-        _, enhanced_image = enhance_prompt_with_gemini(prompt)
-        prompt = enhanced_image
-
-    enhanced_prompt = f"{prompt}, high quality, detailed, artistic"
-
-    for model in IMAGE_MODELS:
-        payload = {"inputs": enhanced_prompt}
-
-        image_bytes = query_huggingface_image(payload, model)

-        try:
-            image = Image.open(io.BytesIO(image_bytes))
-            if image.mode != 'RGB':
-                image = image.convert('RGB')
-            return image
-        except Exception as e:
-            print(f"Error opening image: {str(e)}")
-            continue
-
-    placeholder = Image.new('RGB', (512, 512), color='lightblue')
-    return placeholder

-def
-        return "Please record some audio first", None, ""
-
-    transcribed_text = transcribe_audio(audio_file)
-
-    if transcribed_text.startswith("Error") or transcribed_text.startswith("Could not"):
-        return transcribed_text, None, transcribed_text
-
-    try:
-        text_content = generate_text_content(transcribed_text, content_type)
-    except Exception as e:
-        text_content = f"Error generating text: {str(e)}"
-
    try:
    except Exception as e:
-    return text_content, image, transcribed_text

-def
-    with

-            )
-
-            content_type = gr.Dropdown(
-                choices=["blog", "social", "caption", "story"],
-                value="blog",
-                label="📝 Content Type"
-            )
-
-            voice_submit_btn = gr.Button("🚀 Generate from Voice", variant="primary")
-
-        with gr.Column():
-            transcribed_output = gr.Textbox(
-                label="📝 What You Said",
-                lines=3
-            )
-
-    with gr.Row():
-        with gr.Column():
-            text_output = gr.Textbox(
-                label="📝 Generated Content",
-                lines=8
-            )
-
-        with gr.Column():
-            image_output = gr.Image(
-                label="🎨 Generated Image",
-                type="pil"
-            )

-                lines=3
-            )
-
-            text_content_type = gr.Dropdown(
-                choices=["blog", "social", "caption", "story"],
-                value="blog",
-                label="📝 Content Type"
-            )
-
-            text_submit_btn = gr.Button("🚀 Generate from Text", variant="primary")
-
-    with gr.Row():
-        with gr.Column():
-            text_output_2 = gr.Textbox(
-                label="📝 Generated Content",
-                lines=8
-            )
-
-        with gr.Column():
-            image_output_2 = gr.Image(
-                label="🎨 Generated Image",
-                type="pil"
-            )

-    - **Social**: Social media content
-    - **Caption**: Image captions
-    - **Story**: Short stories
-
-    ### Tips:
-    - Speak clearly in a quiet environment
-    - Be specific with your ideas
-    - Try different content types
-
-    Made with free AI models from Hugging Face!
-    """)

if __name__ == "__main__":
-    app = create_interface()
-    app.launch(
-        server_name="0.0.0.0",
-        server_port=7860
-    )
+import streamlit as st
+import torch
+import numpy as np
import io
import base64
import os
+import tempfile
+from PIL import Image
+import requests
+import json
from datetime import datetime

+# Hugging Face imports
+from transformers import (
+    AutoProcessor,
+    AutoModelForSpeechSeq2Seq,
+    pipeline
+)
+from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline
+import torchaudio
+from scipy.io import wavfile
+import google.generativeai as genai

+# Configure page
+st.set_page_config(
+    page_title="VoiceCanvas - AI Content Studio",
+    page_icon="🎨",
+    layout="wide"
+)

+# Initialize session state
+if 'generated_images' not in st.session_state:
+    st.session_state.generated_images = []
+if 'generated_text' not in st.session_state:
+    st.session_state.generated_text = []
+if 'transcription' not in st.session_state:
+    st.session_state.transcription = ""
+if 'selected_image' not in st.session_state:
+    st.session_state.selected_image = None

+@st.cache_resource
+def load_whisper_model():
+    """Load Whisper model for speech-to-text"""
+    try:
+        model_name = "openai/whisper-small"
+        processor = AutoProcessor.from_pretrained(model_name)
+        model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
+        return processor, model
+    except Exception as e:
+        st.error(f"Error loading Whisper model: {e}")
+        return None, None

+@st.cache_resource
+def load_diffusion_model():
+    """Load Stable Diffusion model for image generation"""
    try:
+        model_name = "runwayml/stable-diffusion-v1-5"
+        pipe = StableDiffusionPipeline.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            safety_checker=None,
+            requires_safety_checker=False
+        )

+        if torch.cuda.is_available():
+            pipe = pipe.to("cuda")
        else:
+            pipe = pipe.to("cpu")

+        pipe.enable_attention_slicing()
+        return pipe
    except Exception as e:
+        st.error(f"Error loading Stable Diffusion model: {e}")
        return None

+@st.cache_resource
+def load_tts_model():
+    """Load TTS model for text-to-speech"""
    try:
+        tts_pipeline = pipeline("text-to-speech", model="microsoft/speecht5_tts")
+        return tts_pipeline
    except Exception as e:
+        st.error(f"Error loading TTS model: {e}")
        return None

+def setup_gemini():
+    """Setup Gemini API"""
    try:
+        api_key = os.getenv("GEMINI_API_KEY")
+        if not api_key:
+            st.error("Gemini API key not found in environment variables")
+            return False
+        genai.configure(api_key=api_key)
+        return True
+    except Exception as e:
+        st.error(f"Error setting up Gemini: {e}")
+        return False
+
+def transcribe_audio(audio_data, processor, model):
+    """Transcribe audio using Whisper"""
+    try:
+        if processor is None or model is None:
+            return "Error: Whisper model not loaded"
+
+        # Process audio
+        inputs = processor(audio_data, sampling_rate=16000, return_tensors="pt")

+        # Generate transcription
+        with torch.no_grad():
+            predicted_ids = model.generate(inputs["input_features"])
+        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+
+        return transcription
    except Exception as e:
+        return f"Error in transcription: {e}"

+def generate_creative_content(transcription):
+    """Generate creative copy and image prompts using Gemini"""
    try:
+        model = genai.GenerativeModel('gemini-pro')
+
        prompt = f"""
+        Based on this user request: "{transcription}"

+        Please generate:
+        1. Three marketing taglines/copy variations
+        2. Three detailed image prompt variations for AI image generation

+        Format your response as JSON:
+        {{
+            "taglines": ["tagline1", "tagline2", "tagline3"],
+            "image_prompts": ["prompt1", "prompt2", "prompt3"]
+        }}

+        Make the taglines catchy and marketing-focused.
+        Make the image prompts detailed and optimized for Stable Diffusion.
+        """

+        response = model.generate_content(prompt)

+        # Try to parse JSON from response
+        try:
+            content = json.loads(response.text)
+            return content["taglines"], content["image_prompts"]
+        except:
+            # Fallback if JSON parsing fails
+            taglines = [
+                f"Creative content based on: {transcription}",
+                f"Innovative solution for: {transcription}",
+                f"Experience the magic of: {transcription}"
+            ]
+            image_prompts = [
+                f"High quality, detailed illustration of {transcription}, professional art style",
+                f"Beautiful artistic rendering of {transcription}, vibrant colors",
+                f"Creative visual representation of {transcription}, modern design"
+            ]
+            return taglines, image_prompts
+
    except Exception as e:
+        st.error(f"Error with Gemini API: {e}")
+        # Fallback content
+        taglines = [
+            f"Discover: {transcription}",
+            f"Experience: {transcription}",
+            f"Explore: {transcription}"
+        ]
+        image_prompts = [
+            f"Artistic illustration of {transcription}",
+            f"Creative visualization of {transcription}",
+            f"Beautiful rendering of {transcription}"
+        ]
+        return taglines, image_prompts

+def generate_images(prompts, pipe):
+    """Generate images using Stable Diffusion"""
+    images = []
+    if pipe is None:
+        return images

+    try:
+        for prompt in prompts:
+            with st.spinner(f"Generating image for: {prompt[:50]}..."):
+                # Generate image
+                result = pipe(
+                    prompt,
+                    num_inference_steps=20,
+                    guidance_scale=7.5,
+                    height=512,
+                    width=512
+                )
+                images.append(result.images[0])

+    except Exception as e:
+        st.error(f"Error generating images: {e}")

+    return images

+def generate_tts(text, tts_pipeline):
+    """Generate text-to-speech audio"""
    try:
+        if tts_pipeline is None:
+            return None
+
+        # Generate speech
+        result = tts_pipeline(text)
+
+        # Convert to audio format
+        audio_data = result["audio"]
+        sample_rate = result["sampling_rate"]
+
+        # Save to temporary file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+            wavfile.write(tmp_file.name, sample_rate, (audio_data * 32767).astype(np.int16))
+            return tmp_file.name
+
    except Exception as e:
+        st.error(f"Error generating TTS: {e}")
+        return None

+def main():
+    st.title("🎨 VoiceCanvas - AI Content Studio")
+    st.markdown("*Transform your voice into visual and textual content*")

+    # Setup models and APIs
+    with st.spinner("Loading AI models..."):
+        whisper_processor, whisper_model = load_whisper_model()
+        diffusion_pipe = load_diffusion_model()
+        tts_pipeline = load_tts_model()
+        gemini_ready = setup_gemini()

+    # Sidebar for settings
+    with st.sidebar:
+        st.header("Settings")
+        st.info("💡 **How to use:**\n1. Record or upload audio\n2. Review transcription\n3. Generate content\n4. Download results")
+
+        # Model status
+        st.header("Model Status")
+        st.write("🎤 Whisper:", "✅" if whisper_model else "❌")
+        st.write("🎨 Stable Diffusion:", "✅" if diffusion_pipe else "❌")
+        st.write("🔊 TTS:", "✅" if tts_pipeline else "❌")
+        st.write("🤖 Gemini:", "✅" if gemini_ready else "❌")

+    # Main interface
+    col1, col2 = st.columns([1, 2])
+
+    with col1:
+        st.header("🎤 Voice Input")

+        # Audio input methods
+        audio_method = st.radio("Choose input method:", ["Upload Audio File", "Record Audio"])

+        audio_data = None

+        if audio_method == "Upload Audio File":
+            uploaded_file = st.file_uploader("Upload audio file", type=['wav', 'mp3', 'mp4'])
+            if uploaded_file:
+                # Load audio file
+                try:
+                    audio_data, sample_rate = torchaudio.load(io.BytesIO(uploaded_file.read()))
+                    # Convert to mono and resample to 16kHz
+                    if audio_data.shape[0] > 1:
+                        audio_data = torch.mean(audio_data, dim=0, keepdim=True)
+                    if sample_rate != 16000:
+                        resampler = torchaudio.transforms.Resample(sample_rate, 16000)
+                        audio_data = resampler(audio_data)
+                    audio_data = audio_data.squeeze().numpy()
+                except Exception as e:
+                    st.error(f"Error loading audio: {e}")

+        else:  # Record Audio
+            st.info("Audio recording requires browser permissions. Click the record button below.")
+            # Note: Streamlit doesn't have built-in audio recording,
+            # so we'll provide a text input as alternative
+            st.text_area("Or type your prompt directly:", key="direct_prompt", height=100)
+            if st.session_state.direct_prompt:
+                st.session_state.transcription = st.session_state.direct_prompt

+        # Transcription
+        if st.button("🎯 Transcribe Audio") and audio_data is not None:
+            with st.spinner("Transcribing audio..."):
+                transcription = transcribe_audio(audio_data, whisper_processor, whisper_model)
+                st.session_state.transcription = transcription

+        # Show transcription
+        if st.session_state.transcription:
+            st.subheader("📝 Transcription")
+            edited_transcription = st.text_area(
+                "Edit if needed:",
+                value=st.session_state.transcription,
+                height=100
+            )
+            st.session_state.transcription = edited_transcription
+
+    with col2:
+        st.header("🚀 Content Generation")

+        if st.session_state.transcription and st.button("✨ Generate Content"):
+            with st.spinner("Generating creative content..."):
+                # Generate taglines and image prompts
+                taglines, image_prompts = generate_creative_content(st.session_state.transcription)
+                st.session_state.generated_text = taglines
+
+                # Generate images
+                images = generate_images(image_prompts, diffusion_pipe)
+                st.session_state.generated_images = images

+        # Display generated content
+        if st.session_state.generated_text:
+            st.subheader("✍️ Generated Taglines")
+            for i, tagline in enumerate(st.session_state.generated_text):
+                st.write(f"**{i+1}.** {tagline}")
+
+        if st.session_state.generated_images:
+            st.subheader("🎨 Generated Images")
+            cols = st.columns(3)
+            for i, img in enumerate(st.session_state.generated_images):
+                with cols[i % 3]:
+                    st.image(img, caption=f"Variation {i+1}")
+                    if st.button(f"Select Image {i+1}", key=f"select_{i}"):
+                        st.session_state.selected_image = img

+    # Content export section
+    if st.session_state.generated_text or st.session_state.generated_images:
+        st.header("📦 Export Content")
+
+        col1, col2, col3 = st.columns(3)
+
+        with col1:
+            if st.session_state.generated_text and st.button("🔊 Generate Voiceover"):
+                selected_text = st.selectbox("Choose text for voiceover:", st.session_state.generated_text)
+                with st.spinner("Generating voiceover..."):
+                    audio_file = generate_tts(selected_text, tts_pipeline)
+                    if audio_file:
+                        st.audio(audio_file)
+                        with open(audio_file, "rb") as f:
+                            st.download_button(
+                                "Download Audio",
+                                f.read(),
+                                file_name=f"voiceover_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav",
+                                mime="audio/wav"
+                            )
+
+        with col2:
+            if st.session_state.selected_image:
+                st.write("Selected Image:")
+                st.image(st.session_state.selected_image, width=200)
+
+                # Convert image to bytes for download
+                img_buffer = io.BytesIO()
+                st.session_state.selected_image.save(img_buffer, format="PNG")
+                st.download_button(
+                    "Download Image",
+                    img_buffer.getvalue(),
+                    file_name=f"generated_image_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png",
+                    mime="image/png"
+                )
+
+        with col3:
+            if st.session_state.generated_text:
+                # Create text file with all taglines
+                text_content = "\n".join([f"{i+1}. {tagline}" for i, tagline in enumerate(st.session_state.generated_text)])
+                st.download_button(
+                    "Download Taglines",
+                    text_content,
+                    file_name=f"taglines_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
+                    mime="text/plain"
+                )

if __name__ == "__main__":
+    main()
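
A note on the TTS path above: generate_tts calls the SpeechT5 pipeline with the text alone, but the transformers documentation for microsoft/speecht5_tts passes a speaker x-vector through forward_params, and the bare call may fail or produce degraded audio. A minimal sketch of the documented pattern, assuming the datasets package and the Matthijs/cmu-arctic-xvectors dataset are available (neither is part of this commit):

    import torch
    from datasets import load_dataset
    from transformers import pipeline

    tts = pipeline("text-to-speech", model="microsoft/speecht5_tts")

    # SpeechT5 conditions on a speaker x-vector; index 7306 is the voice
    # used in the transformers documentation examples.
    xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embedding = torch.tensor(xvectors[7306]["xvector"]).unsqueeze(0)

    result = tts("Hello from VoiceCanvas",
                 forward_params={"speaker_embeddings": speaker_embedding})
    # result["audio"] and result["sampling_rate"] then feed wavfile.write
    # exactly as in generate_tts above.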
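
Similarly, the json.loads(response.text) call in generate_creative_content will often land in the fallback branch, because Gemini tends to wrap JSON answers in markdown code fences. One common workaround is to pull the first JSON object out of the reply before parsing; a sketch (the helper below is illustrative, not part of the commit):

    import json
    import re

    def parse_json_reply(text):
        """Extract and parse the first {...} object, ignoring any
        markdown fences the model wraps around it."""
        match = re.search(r"\{.*\}", text, re.DOTALL)
        if match is None:
            raise ValueError("no JSON object found in model reply")
        return json.loads(match.group(0))

Swapping content = json.loads(response.text) for content = parse_json_reply(response.text) keeps the existing fallback as a second line of defense.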
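
Finally, the "Record Audio" branch substitutes a text box because Streamlit had no built-in recorder when this was written. Recent Streamlit releases (1.40 and later) ship st.audio_input, which returns a WAV file-like object that can run through the same torchaudio path as uploads; a sketch under that version assumption, reusing the app's imports:

    # Assumes Streamlit >= 1.40, where st.audio_input is available.
    recording = st.audio_input("Record your prompt")
    if recording is not None:
        waveform, sample_rate = torchaudio.load(io.BytesIO(recording.read()))
        if waveform.shape[0] > 1:            # stereo -> mono
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        if sample_rate != 16000:             # Whisper expects 16 kHz input
            waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
        audio_data = waveform.squeeze().numpy()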