Spaces:

maria355
/

VoiceVision-Creative-AI

Sleeping

App Files Files Community

maria355 committed on Sep 14, 2025

Commit

69b204b

verified ·

1 Parent(s): 4741bb9

Update app.py

Browse files

Files changed (1) hide show

app.py +234 -119

app.py CHANGED Viewed

@@ -5,92 +5,151 @@ import json
 import io
 import base64
 from PIL import Image
-import google.generativeai as genai
 import os
 from datetime import datetime
 import time
 import re
-# Configure Gemini API
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
-if GEMINI_API_KEY:
     genai.configure(api_key=GEMINI_API_KEY)
-    gemini_model = genai.GenerativeModel('gemini-pro')
-# Hugging Face API endpoints for free models
-HF_TEXT_API_URL = "https://api-inference.huggingface.co/models/microsoft/DialoGPT-medium"
-HF_IMAGE_API_URL = "https://api-inference.huggingface.co/models/stabilityai/stable-diffusion-2-1"
 # Alternative text generation models to try
 TEXT_MODELS = [
     "microsoft/DialoGPT-medium",
-    "gpt2-medium",
-    "facebook/blenderbot-400M-distill"
 ]
 # Alternative image generation models to try
 IMAGE_MODELS = [
     "stabilityai/stable-diffusion-2-1",
     "runwayml/stable-diffusion-v1-5",
-    "CompVis/stable-diffusion-v1-4"
 ]
 def query_huggingface_text(payload, model_name):
-    """Query Hugging Face text generation API"""
     API_URL = f"https://api-inference.huggingface.co/models/{model_name}"
-    headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN', '')}"}
     try:
         response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
         if response.status_code == 200:
-            return response.json()
         else:
             return None
     except Exception as e:
         print(f"Error with model {model_name}: {str(e)}")
         return None
 def query_huggingface_image(payload, model_name):
-    """Query Hugging Face image generation API"""
     API_URL = f"https://api-inference.huggingface.co/models/{model_name}"
-    headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN', '')}"}
     try:
         response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
         if response.status_code == 200:
             return response.content
         else:
             return None
     except Exception as e:
         print(f"Error with image model {model_name}: {str(e)}")
         return None
 def transcribe_audio(audio_file):
-    """Convert speech to text using speech recognition"""
     if audio_file is None:
         return "No audio file provided"
     recognizer = sr.Recognizer()
     try:
-        # Load audio file
-        with sr.AudioFile(audio_file) as source:
             audio = recognizer.record(source)
-        # Recognize speech using Google Speech Recognition (free)
-        text = recognizer.recognize_google(audio)
-        return text
     except sr.UnknownValueError:
-        return "Could not understand the audio"
     except sr.RequestError as e:
-        return f"Error with speech recognition service: {str(e)}"
     except Exception as e:
         return f"Error processing audio: {str(e)}"
 def enhance_prompt_with_gemini(text):
     """Enhance the prompt using Gemini API for better results"""
-    if not GEMINI_API_KEY:
-        return text
     try:
         prompt = f"""
@@ -126,89 +185,101 @@ def generate_text_content(prompt, content_type="blog"):
     """Generate text content using Hugging Face models"""
     # Enhance prompt with Gemini if available
-    if GEMINI_API_KEY:
         enhanced_text, _ = enhance_prompt_with_gemini(prompt)
         prompt = enhanced_text
     # Adjust prompt based on content type
-    if content_type == "blog":
-        full_prompt = f"Write a detailed blog post about: {prompt}\n\nBlog post:"
-    elif content_type == "social":
-        full_prompt = f"Write an engaging social media post about: {prompt}\n\nPost:"
-    elif content_type == "caption":
-        full_prompt = f"Write a creative caption for: {prompt}\n\nCaption:"
-    elif content_type == "story":
-        full_prompt = f"Write a short story about: {prompt}\n\nStory:"
-    else:
-        full_prompt = prompt
     # Try different models until one works
     for model in TEXT_MODELS:
         payload = {
             "inputs": full_prompt,
             "parameters": {
-                "max_length": 500,
                 "temperature": 0.7,
                 "do_sample": True,
-                "top_p": 0.9
             }
         }
         result = query_huggingface_text(payload, model)
         if result and len(result) > 0:
-            if isinstance(result, list) and len(result) > 0:
-                generated_text = result[0].get("generated_text", "")
                 # Clean up the response
-                if generated_text.startswith(full_prompt):
-                    generated_text = generated_text[len(full_prompt):].strip()
-                return generated_text if generated_text else f"Generated content for: {prompt}"
-            elif isinstance(result, dict):
-                generated_text = result.get("generated_text", "")
-                if generated_text.startswith(full_prompt):
                     generated_text = generated_text[len(full_prompt):].strip()
-                return generated_text if generated_text else f"Generated content for: {prompt}"
     # Fallback content if all models fail
-    return f"""Here's some content about {prompt}:
-This is an interesting topic that deserves exploration. The concept of {prompt} has various applications and implications that are worth discussing.
-Key points to consider:
-• The fundamental aspects of this topic
-• Its practical applications
-• Potential future developments
-• Impact on relevant stakeholders
-This content was generated based on your voice input and can be further customized according to your specific needs."""
 def generate_image_from_text(prompt):
     """Generate image using Hugging Face Stable Diffusion models"""
     # Enhance prompt with Gemini if available
-    if GEMINI_API_KEY:
         _, enhanced_image = enhance_prompt_with_gemini(prompt)
         prompt = enhanced_image
     # Add some style enhancements to the prompt
-    enhanced_prompt = f"{prompt}, high quality, detailed, artistic, professional"
     # Try different image models until one works
     for model in IMAGE_MODELS:
-        payload = {"inputs": enhanced_prompt}
         image_bytes = query_huggingface_image(payload, model)
         if image_bytes:
             try:
                 image = Image.open(io.BytesIO(image_bytes))
                 return image
             except Exception as e:
                 print(f"Error opening image from {model}: {str(e)}")
                 continue
     # Return a placeholder image if all models fail
-    placeholder = Image.new('RGB', (512, 512), color='lightgray')
     return placeholder
 def process_voice_input(audio_file, content_type):
@@ -259,60 +330,79 @@ def process_text_input(text_input, content_type):
     return text_content, image
-# Create Gradio interface
 def create_interface():
-    """Create the main Gradio interface"""
-    with gr.Blocks(title="VociArt - Voice-Controlled AI Content Creator", theme=gr.themes.Soft()) as app:
-        gr.Markdown("""
-        # 🎙️ VociArt - Voice-Controlled AI Content Creator
-        Generate AI content and images using just your voice! Speak your ideas and watch them come to life as both text and visuals.
-        **Features:**
-        - 🗣️ Voice-to-text conversion
-        - 📝 AI text content generation
-        - 🎨 AI image generation
-        - 🌍 Multi-language support
-        - 💾 Save and share outputs
         """)
         with gr.Tab("🎙️ Voice Input"):
             with gr.Row():
-                with gr.Column():
                     audio_input = gr.Audio(
                         sources=["microphone"],
                         type="filepath",
-                        label="🎤 Record Your Voice"
                     )
                     content_type = gr.Dropdown(
                         choices=["blog", "social", "caption", "story"],
                         value="blog",
-                        label="📝 Content Type"
                     )
-                    voice_submit_btn = gr.Button("🚀 Generate Content from Voice", variant="primary")
-                with gr.Column():
                     transcribed_output = gr.Textbox(
-                        label="📝 Transcribed Text",
-                        placeholder="Your speech will appear here..."
                     )
             with gr.Row():
                 with gr.Column():
                     text_output = gr.Textbox(
-                        label="📄 Generated Text Content",
-                        lines=10,
-                        placeholder="Generated text content will appear here..."
                     )
                 with gr.Column():
                     image_output = gr.Image(
                         label="🎨 Generated Image",
-                        type="pil"
                     )
         with gr.Tab("⌨️ Text Input"):
@@ -320,7 +410,7 @@ def create_interface():
                 with gr.Column():
                     text_input = gr.Textbox(
                         label="💭 Enter Your Idea",
-                        placeholder="Type your content idea here...",
                         lines=3
                     )
@@ -330,76 +420,101 @@ def create_interface():
                         label="📝 Content Type"
                     )
-                    text_submit_btn = gr.Button("🚀 Generate Content from Text", variant="primary")
             with gr.Row():
                 with gr.Column():
                     text_output_2 = gr.Textbox(
-                        label="📄 Generated Text Content",
-                        lines=10,
-                        placeholder="Generated text content will appear here..."
                     )
                 with gr.Column():
                     image_output_2 = gr.Image(
                         label="🎨 Generated Image",
-                        type="pil"
                     )
-        with gr.Tab("ℹ️ About"):
             gr.Markdown("""
-            ## About VociArt
-            VociArt is a revolutionary voice-controlled AI content creator that transforms your spoken ideas into both text content and stunning visuals.
-            ### How it works:
-            1. **Record**: Speak your ideas using the microphone
-            2. **Process**: AI transcribes and enhances your prompt
-            3. **Generate**: Creates both text content and images
-            4. **Customize**: Choose from different content types
-            ### Content Types:
-            - **Blog**: Detailed blog posts and articles
-            - **Social**: Social media posts and updates
-            - **Caption**: Creative captions for images
-            - **Story**: Short stories and narratives
-            ### Technologies Used:
-            - Hugging Face Transformers (Free models)
-            - Google Speech Recognition
-            - Gemini AI for prompt enhancement
-            - Stable Diffusion for image generation
-            ### Tips for best results:
-            - Speak clearly and at a moderate pace
-            - Be specific about what you want
-            - Try different content types
-            - Use descriptive language for better images
             ---
-            *Made with ❤️ using free AI models*
             """)
-        # Event handlers
         voice_submit_btn.click(
             fn=process_voice_input,
             inputs=[audio_input, content_type],
-            outputs=[text_output, image_output, transcribed_output]
         )
         text_submit_btn.click(
             fn=process_text_input,
             inputs=[text_input, text_content_type],
-            outputs=[text_output_2, image_output_2]
         )
     return app
 # Launch the application
 if __name__ == "__main__":
     app = create_interface()
     app.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        share=True
     )

 import io
 import base64
 from PIL import Image
 import os
 from datetime import datetime
 import time
 import re
+import tempfile
+# Try to import optional dependencies
+try:
+    import google.generativeai as genai
+    GEMINI_AVAILABLE = True
+except ImportError:
+    GEMINI_AVAILABLE = False
+    print("Gemini AI not available - continuing without prompt enhancement")
+# Configure Gemini API if available
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 # Bind the name unconditionally so it exists even when Gemini is
 # unavailable, unconfigured, or fails to initialize.
 gemini_model = None
 if GEMINI_AVAILABLE and GEMINI_API_KEY:
     # Keep configure() inside the try as well: any failure during setup
     # should disable prompt enhancement instead of aborting module import.
     try:
         genai.configure(api_key=GEMINI_API_KEY)
         gemini_model = genai.GenerativeModel('gemini-pro')
     except Exception as e:
         print(f"Error initializing Gemini: {e}")
         GEMINI_AVAILABLE = False
 # Hugging Face token: HUGGINGFACE_TOKEN takes precedence, HF_TOKEN is the fallback.
 HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HF_TOKEN")
 # Alternative text generation models to try, in order
 TEXT_MODELS = [
     "microsoft/DialoGPT-medium",
     "gpt2",
     "facebook/blenderbot-400M-distill",
     "microsoft/DialoGPT-small",
 ]
 # Alternative image generation models to try, in order
 IMAGE_MODELS = [
     "stabilityai/stable-diffusion-2-1",
     "runwayml/stable-diffusion-v1-5",
     "CompVis/stable-diffusion-v1-4",
     "stabilityai/stable-diffusion-2-1-base",
 ]
 def query_huggingface_text(payload, model_name):
     """POST *payload* to the Hugging Face inference endpoint for *model_name*.

     Returns the decoded JSON body on HTTP 200. Every other outcome (503,
     any other status, a timeout, or any other exception) is reported with
     print() and yields None.
     """
     endpoint = f"https://api-inference.huggingface.co/models/{model_name}"
     # Only send an Authorization header when a token is configured.
     auth_headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
     try:
         reply = requests.post(endpoint, headers=auth_headers, json=payload, timeout=30)
         status = reply.status_code
         if status == 200:
             return reply.json()
         if status == 503:
             print(f"Model {model_name} is loading, trying next model...")
         else:
             print(f"Error {status} with model {model_name}: {reply.text}")
     except requests.exceptions.Timeout:
         print(f"Timeout with model {model_name}")
     except Exception as exc:
         print(f"Error with model {model_name}: {str(exc)}")
     # Every non-200 path falls through to here.
     return None
 def query_huggingface_image(payload, model_name):
     """POST *payload* to the Hugging Face inference endpoint for an image model.

     Returns the raw response bytes on HTTP 200. Every other outcome (503,
     any other status, a timeout, or any other exception) is reported with
     print() and yields None.
     """
     endpoint = f"https://api-inference.huggingface.co/models/{model_name}"
     # Only send an Authorization header when a token is configured.
     auth_headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
     try:
         reply = requests.post(endpoint, headers=auth_headers, json=payload, timeout=60)
         status = reply.status_code
         if status == 200:
             return reply.content
         if status == 503:
             print(f"Image model {model_name} is loading, trying next model...")
         else:
             print(f"Error {status} with image model {model_name}")
     except requests.exceptions.Timeout:
         print(f"Timeout with image model {model_name}")
     except Exception as exc:
         print(f"Error with image model {model_name}: {str(exc)}")
     # Every non-200 path falls through to here.
     return None
 def transcribe_audio(audio_file):
     """Convert a recorded audio file to text.

     Args:
         audio_file: Path (or any object whose str() is a path) of the
             recording, or None when nothing was recorded.

     Returns:
         The recognized text on success; otherwise a human-readable message
         describing why transcription failed. Exceptions are converted to
         messages rather than propagated.
     """
     if audio_file is None:
         return "No audio file provided"
     recognizer = sr.Recognizer()
     # Recognizer tuning kept from the previous revision.
     # NOTE(review): these thresholds presumably matter only for live
     # listening, not for record() on a file - confirm before removing.
     recognizer.energy_threshold = 300
     recognizer.dynamic_energy_threshold = True
     recognizer.pause_threshold = 0.8
     try:
         with sr.AudioFile(str(audio_file)) as source:
             # Deliberately no adjust_for_ambient_noise() here: on a file
             # source it reads the opening audio for calibration, so record()
             # would start after it and drop the beginning of the speech.
             audio = recognizer.record(source)
         # Try Google Speech Recognition first (free tier)
         try:
             return recognizer.recognize_google(audio, language='en-US')
         except sr.RequestError:
             # Service unreachable: fall back to offline recognition if available
             try:
                 return recognizer.recognize_sphinx(audio)
             except (sr.RequestError, sr.UnknownValueError):
                 # Best effort only; fall through to the generic message below.
                 pass
         return "Could not transcribe the audio. Please try speaking more clearly."
     except sr.UnknownValueError:
         return "Could not understand the audio. Please speak more clearly."
     except sr.RequestError as e:
         return f"Speech recognition service error: {str(e)}"
     except Exception as e:
         return f"Error processing audio: {str(e)}"
 def enhance_prompt_with_gemini(text):
     """Enhance the prompt using Gemini API for better results"""
+    if not (GEMINI_AVAILABLE and GEMINI_API_KEY):
+        return text, text
     try:
         prompt = f"""
     """Generate text content using Hugging Face models"""
     # Enhance prompt with Gemini if available
+    if GEMINI_AVAILABLE and GEMINI_API_KEY:
         enhanced_text, _ = enhance_prompt_with_gemini(prompt)
         prompt = enhanced_text
     # Adjust prompt based on content type
+    content_templates = {
+        "blog": f"Write a detailed blog post about: {prompt}\n\nBlog post:",
+        "social": f"Write an engaging social media post about: {prompt}\n\nPost:",
+        "caption": f"Write a creative caption for: {prompt}\n\nCaption:",
+        "story": f"Write a short story about: {prompt}\n\nStory:"
+    }
+    full_prompt = content_templates.get(content_type, prompt)
     # Try different models until one works
     for model in TEXT_MODELS:
         payload = {
             "inputs": full_prompt,
             "parameters": {
+                "max_length": 200,
                 "temperature": 0.7,
                 "do_sample": True,
+                "top_p": 0.9,
+                "repetition_penalty": 1.1
             }
         }
         result = query_huggingface_text(payload, model)
         if result and len(result) > 0:
+            try:
+                if isinstance(result, list) and len(result) > 0:
+                    generated_text = result[0].get("generated_text", "")
+                elif isinstance(result, dict):
+                    generated_text = result.get("generated_text", "")
+                else:
+                    continue
                 # Clean up the response
+                if generated_text and generated_text.startswith(full_prompt):
                     generated_text = generated_text[len(full_prompt):].strip()
+                if generated_text and len(generated_text) > 10:
+                    return generated_text
+            except Exception as e:
+                print(f"Error processing result from {model}: {e}")
+                continue
     # Fallback content if all models fail
+    fallback_content = {
+        "blog": f"# {prompt}\n\nThis is an interesting topic that deserves exploration. Here are some key points to consider:\n\n• The fundamental concepts and principles\n• Practical applications and use cases\n• Benefits and potential challenges\n• Future developments and trends\n\nThis topic offers many opportunities for further discussion and research.",
+        "social": f"🌟 Excited to share thoughts on {prompt}! This is such an important topic that deserves more attention. What are your thoughts? #AI #Innovation",
+        "caption": f"✨ {prompt} ✨ Sometimes the most beautiful moments come from the simplest ideas. 📸 #inspiration #creativity",
+        "story": f"Once upon a time, there was something special about {prompt}. It captured the imagination of everyone who encountered it, leading to unexpected adventures and new discoveries. The end was just the beginning of something even more wonderful."
+    }
+    return fallback_content.get(content_type, f"Content generated for: {prompt}")
 def generate_image_from_text(prompt):
     """Produce an RGB PIL image for *prompt* via the Hugging Face image models.

     Each entry of IMAGE_MODELS is tried in order and the first response that
     decodes as an image is returned. If none succeeds, a 512x512 light-blue
     placeholder image is returned instead.
     """
     # Swap in the Gemini-enhanced image prompt when Gemini is usable
     if GEMINI_AVAILABLE and GEMINI_API_KEY:
         prompt = enhance_prompt_with_gemini(prompt)[1]
     # Append fixed style keywords to the prompt
     styled_prompt = f"{prompt}, high quality, detailed, artistic, professional, masterpiece"
     for model in IMAGE_MODELS:
         raw = query_huggingface_image(
             {
                 "inputs": styled_prompt,
                 "parameters": {
                     "num_inference_steps": 20,
                     "guidance_scale": 7.5,
                 },
             },
             model,
         )
         if not raw:
             # Nothing usable from this model; move on to the next one.
             continue
         try:
             picture = Image.open(io.BytesIO(raw))
             # Normalize to RGB when the decoded image uses another mode
             return picture if picture.mode == 'RGB' else picture.convert('RGB')
         except Exception as e:
             print(f"Error opening image from {model}: {str(e)}")
     # Every model failed: hand back a placeholder image
     return Image.new('RGB', (512, 512), color='lightblue')
 def process_voice_input(audio_file, content_type):
     return text_content, image
 def create_interface():
+    """Create the main Gradio interface optimized for Hugging Face Spaces"""
+    # Custom CSS for better appearance
+    custom_css = """
+    .gradio-container {
+        max-width: 1200px !important;
+    }
+    .main-header {
+        text-align: center;
+        background: linear-gradient(45deg, #FF6B6B, #4ECDC4);
+        -webkit-background-clip: text;
+        -webkit-text-fill-color: transparent;
+        font-size: 2.5em;
+        font-weight: bold;
+        margin-bottom: 20px;
+    }
+    """
+    with gr.Blocks(title="VociArt - Voice AI Creator", theme=gr.themes.Soft(), css=custom_css) as app:
+        gr.HTML("""
+        <div class="main-header">
+            🎙️ VociArt - Voice AI Creator
+        </div>
+        """)
+        gr.Markdown("""
+        Transform your voice into AI-generated content and stunning visuals! 🚀
+        **✨ Features:** Voice-to-text • AI content generation • Image creation • Multiple content types
         """)
         with gr.Tab("🎙️ Voice Input"):
             with gr.Row():
+                with gr.Column(scale=1):
                     audio_input = gr.Audio(
                         sources=["microphone"],
                         type="filepath",
+                        label="🎤 Record Your Voice",
+                        show_download_button=False
                     )
                     content_type = gr.Dropdown(
                         choices=["blog", "social", "caption", "story"],
                         value="blog",
+                        label="📝 Content Type",
+                        info="Choose the type of content to generate"
                     )
+                    voice_submit_btn = gr.Button("🚀 Generate from Voice", variant="primary", size="lg")
+                with gr.Column(scale=1):
                     transcribed_output = gr.Textbox(
+                        label="📝 What You Said",
+                        placeholder="Your transcribed speech will appear here...",
+                        lines=3
                     )
             with gr.Row():
                 with gr.Column():
                     text_output = gr.Textbox(
+                        label="📄 Generated Content",
+                        lines=8,
+                        placeholder="AI-generated content will appear here...",
+                        show_copy_button=True
                     )
                 with gr.Column():
                     image_output = gr.Image(
                         label="🎨 Generated Image",
+                        type="pil",
+                        show_download_button=True
                     )
         with gr.Tab("⌨️ Text Input"):
                 with gr.Column():
                     text_input = gr.Textbox(
                         label="💭 Enter Your Idea",
+                        placeholder="Type your creative idea here...",
                         lines=3
                     )
                         label="📝 Content Type"
                     )
+                    text_submit_btn = gr.Button("🚀 Generate from Text", variant="primary", size="lg")
             with gr.Row():
                 with gr.Column():
                     text_output_2 = gr.Textbox(
+                        label="📄 Generated Content",
+                        lines=8,
+                        placeholder="AI-generated content will appear here...",
+                        show_copy_button=True
                     )
                 with gr.Column():
                     image_output_2 = gr.Image(
                         label="🎨 Generated Image",
+                        type="pil",
+                        show_download_button=True
                     )
+        with gr.Tab("ℹ️ About & Tips"):
             gr.Markdown("""
+            ## 🌟 About VociArt
+            VociArt transforms your spoken ideas into professional content and stunning visuals using cutting-edge AI technology.
+            ### 🎯 How to Use:
+            1. **Voice Tab**: Click the microphone, speak your idea clearly, select content type, then click generate
+            2. **Text Tab**: Type your idea directly, choose content type, and generate
+            ### 📝 Content Types:
+            - **📰 Blog**: Detailed articles and posts
+            - **📱 Social**: Engaging social media content
+            - **📸 Caption**: Creative image captions
+            - **📚 Story**: Short narratives and tales
+            ### 💡 Pro Tips:
+            - **Speak Clearly**: Use a quiet environment and speak at normal pace
+            - **Be Specific**: Detailed prompts create better results
+            - **Try Different Types**: Each content type has unique characteristics
+            - **Use Keywords**: Include relevant terms for better image generation
+            ### 🔧 Technical Features:
+            - **Free AI Models**: Powered by Hugging Face's free inference API
+            - **Speech Recognition**: Google Speech Recognition for transcription
+            - **Smart Fallbacks**: Multiple models ensure reliability
+            - **Gemini Enhancement**: Optional prompt improvement (if API key provided)
+            ### 🎨 Example Prompts:
+            - *"A futuristic city with flying cars at sunset"*
+            - *"Write about the benefits of morning meditation"*
+            - *"Create a social media post about healthy cooking"*
+            - *"A magical forest with glowing mushrooms"*
             ---
+            💝 **Made with love using free AI models** - Perfect for creators, marketers, and storytellers!
             """)
+        # Event handlers with better error handling
         voice_submit_btn.click(
             fn=process_voice_input,
             inputs=[audio_input, content_type],
+            outputs=[text_output, image_output, transcribed_output],
+            api_name="voice_generate"
         )
         text_submit_btn.click(
             fn=process_text_input,
             inputs=[text_input, text_content_type],
+            outputs=[text_output_2, image_output_2],
+            api_name="text_generate"
+        )
+        # Add examples
+        gr.Examples(
+            examples=[
+                ["A peaceful mountain landscape with a lake", "caption"],
+                ["The future of artificial intelligence in education", "blog"],
+                ["Delicious homemade pizza recipe", "social"],
+                ["A brave knight on a quest for the golden crown", "story"]
+            ],
+            inputs=[text_input, text_content_type],
+            outputs=[text_output_2, image_output_2],
+            fn=process_text_input,
+            cache_examples=False
         )
     return app
 # Entry point: build the interface and serve it when run as a script
 if __name__ == "__main__":
     print("🚀 Starting VociArt...")
     launch_options = {
         "server_name": "0.0.0.0",
         "server_port": 7860,
         "share": False,  # Set to False for Hugging Face Spaces
         "show_error": True,
         "quiet": False,
     }
     app = create_interface()
     app.launch(**launch_options)