import gradio as gr
import requests
import tempfile
import os
import io
import re
import subprocess
import textwrap
from faster_whisper import WhisperModel
from gtts import gTTS
from PIL import Image, ImageDraw, ImageFont
# === Config ===
# API keys are read from the environment; never hard-code secrets in source.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
GROQ_MODEL = "llama3-70b-8192"
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
IMAGE_GENERATION_API_URL = "https://api.openai.com/v1/images/generations"  # DALL-E 3 endpoint
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
# === Init Whisper ===
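# "base" with int8 quantization keeps CPU inference reasonably fast at some
# cost in accuracy; larger checkpoints such as "small" or "medium" are
# drop-in replacements when transcription quality matters more.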
whisper = WhisperModel("base", device="cpu", compute_type="int8")
# === Animation Functions ===
def get_audio_duration(audio_file):
"""Gets the duration of the audio using ffprobe."""
try:
command = [
"ffprobe",
"-v",
"error",
"-show_entries",
"format=duration",
"-of",
"default=noprint_wrappers=1:nokey=1",
audio_file,
]
        result = subprocess.run(command, capture_output=True, text=True, check=True)
        return float(result.stdout.strip())
    except Exception as e:
        print(f"Error getting audio duration: {e}")
        return 5.0  # Fall back to a default duration
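# Note: this helper assumes the ffprobe binary (shipped with FFmpeg) is on
# PATH; on a Debian-based image, `apt-get install ffmpeg` provides both
# ffprobe and the ffmpeg encoder used further below.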
def generate_images_with_openai(prompt, num_images=1):
"""
Generates images using OpenAI's API.
Args:
prompt (str): The prompt to use for image generation.
num_images (int, optional): The number of images to generate. Defaults to 1.
Returns:
list: A list of image URLs, or None on error.
"""
headers = {
"Authorization": f"Bearer {OPENAI_API_KEY}",
"Content-Type": "application/json",
}
    payload = {
        "model": "dall-e-3",  # DALL-E 3 accepts only n=1 per request
        "prompt": prompt,
        "n": num_images,
        "size": "1024x1024",  # 1792x1024 and 1024x1792 are also supported
        "response_format": "url",  # Return hosted URLs rather than base64 data
    }
try:
response = requests.post(IMAGE_GENERATION_API_URL, headers=headers, json=payload)
response.raise_for_status() # Raise an exception for bad status codes
data = response.json()
image_urls = [item["url"] for item in data["data"]]
return image_urls
except requests.exceptions.RequestException as e:
print(f"Error generating images with OpenAI: {e}")
return None
except KeyError:
print(f"Error: Unexpected response format from OpenAI: {data}")
return None
except Exception as e:
print(f"Error during image generation: {e}")
return None
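# Example usage (hypothetical prompt; requires a valid OPENAI_API_KEY):
#   urls = generate_images_with_openai("A watercolor fox reading a book")
#   if urls:
#       print(urls[0])  # Hosted image URL, valid only for a limited time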
def create_animated_explanation(text, audio_file):
"""
Generates a more professional animation with images and synchronized text.
Args:
text (str): The text to display and explain.
audio_file (str): The path to the audio file.
Returns:
str: The path to the generated video file (.mp4), or None on error.
"""
try:
# 1. Split text into meaningful chunks (sentences or phrases)
sentences = split_text_into_chunks(text)
        audio_duration = get_audio_duration(audio_file)
        fps = 10
        # Match the frame count to the audio length so video and narration stay in sync
        total_frames = max(1, int(audio_duration * fps))
        # 2. Generate one illustrative image per sentence and download each
        # once up front, so the frame loop below does not re-fetch per frame
        sentence_images = []
        for sentence in sentences:
            image_prompt = f"Illustrate the concept: {sentence}"
            urls = generate_images_with_openai(image_prompt)  # One image per sentence
            pasted = None
            if urls:
                try:
                    image_bytes = requests.get(urls[0], timeout=30).content
                    pasted = Image.open(io.BytesIO(image_bytes)).resize((200, 200))
                except Exception as e:
                    print(f"Error downloading generated image: {e}")
            sentence_images.append(pasted)  # None if generation or download failed
# 3. Create frames for the animation
frames = []
for i in range(total_frames):
frame_progress = i / total_frames
sentence_index = int(frame_progress * len(sentences))
sentence_index = min(sentence_index, len(sentences) - 1) #clamp
            color = (220, 220, 220)  # Light gray background
            img = Image.new("RGB", (640, 480), color=color)
            d = ImageDraw.Draw(img)
            try:
                font = ImageFont.truetype("DejaVuSans.ttf", 20)
            except OSError:
                font = ImageFont.load_default()  # Fall back if the font file is missing
current_sentence = sentences[sentence_index]
lines = textwrap.wrap(current_sentence, width=40)
y_start = (480 - len(lines) * 24) // 2
# Display sentence
for j, line in enumerate(lines):
try:
bbox = d.textbbox((0,0), line, font=font)
text_width = bbox[2] - bbox[0]
text_x = (640 - text_width) // 2
except AttributeError as e:
print(f"Pillow version error: {e}")
text_x = 10
d.text((text_x, y_start + j * 24), line, fill=(0, 0, 0), font=font)
            # Paste the pre-downloaded image if one is available
            if sentence_images[sentence_index] is not None:
                img.paste(sentence_images[sentence_index], (440, 280))
frames.append(img)
# 4. Save frames and create video
image_files = []
for i, frame in enumerate(frames):
image_file = f"frame_{i:04d}.png"
frame.save(image_file)
image_files.append(image_file)
video_file = "animated_explanation.mp4"
command = [
"ffmpeg",
"-framerate", str(fps),
"-i", "frame_%04d.png",
"-i", audio_file,
"-c:v", "libx264",
"-pix_fmt", "yuv420p",
"-y",
video_file,
]
subprocess.run(command, check=True, capture_output=True)
for image_file in image_files:
os.remove(image_file)
return video_file
except Exception as e:
print(f"Error creating animated explanation: {e}")
return None # Return None on error
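# Note: frames are written to the working directory under fixed names
# (frame_0000.png, ...), so concurrent requests would overwrite each other;
# a per-request tempfile.mkdtemp() directory would avoid the collision.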
def split_text_into_chunks(text):
    """Splits text into sentences or phrases, handling punctuation."""
    # Split on sentence-ending punctuation while avoiding breaks after
    # abbreviations (e.g. "e.g.") or initials (e.g. "J. Smith")
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s+', text)
    return sentences
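# Example:
#   split_text_into_chunks("Plants make sugar. They need light!")
#   -> ['Plants make sugar.', 'They need light!']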
def create_animation(text, audio_file):
"""
Selects and runs an animation function.
"""
return create_animated_explanation(text, audio_file)
def process_audio(audio_file):
    """Full pipeline: transcribe speech, ask Groq, speak the reply, animate it."""
    # 1. Speech to Text
    segments, _ = whisper.transcribe(audio_file)
    user_text = " ".join(segment.text for segment in segments)
# 2. Groq API Call
headers = {
"Authorization": f"Bearer {GROQ_API_KEY}",
"Content-Type": "application/json",
}
payload = {
"model": GROQ_MODEL,
"messages": [{"role": "user", "content": user_text}],
"temperature": 0.5,
}
response = requests.post(GROQ_API_URL, headers=headers, json=payload)
if response.status_code != 200:
return f"Groq API Error: {response.text}", None, None
reply = response.json()["choices"][0]["message"]["content"]
# 3. TTS using gTTS
tts = gTTS(reply)
    audio_output = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    audio_output.close()  # Release the handle so gTTS can write to the path
    tts.save(audio_output.name)
# 4. Create animation
video_file = create_animation(reply, audio_output.name)
return reply, audio_output.name, video_file
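# Example (hypothetical file; any audio format Whisper can decode works):
#   reply, audio_path, video_path = process_audio("question.wav")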
iface = gr.Interface(
fn=process_audio,
inputs=gr.Audio(type="filepath", label="🎤 Speak your question"),
outputs=[
gr.Textbox(label="🧠 Groq Response"),
gr.Audio(label="🔊 AI Voice Reply"),
gr.Video(label="🎬 Animation"),
],
title="🗣️ Voice AI Assistant with Professional Animation (Groq + Whisper + gTTS)",
description="🎙️ Whisper for STT, Groq for response, gTTS for voice output, and enhanced animation.",
live=True,
)
iface.launch()
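# Pass share=True to launch() for a temporary public URL when running outside
# Hugging Face Spaces.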