Spaces:

mobenta
/

pdf_audio

Build error

App Files Files Community

pdf_audio / app.py

mobenta

Update app.py

b3226f0 verified over 1 year ago

raw

history blame contribute delete

4.47 kB

	import cohere
	import gradio as gr
	from pypdf import PdfReader
	from gtts import gTTS # Import Google Text-to-Speech
	from io import BytesIO # To handle audio in memory
	import os
	from loguru import logger
	import tempfile # To create temporary files
	from dotenv import load_dotenv # To load environment variables from a .env file

	# Load environment variables from .env file (if you're using one)
	load_dotenv()

	# The Cohere API key is taken from the process environment, which the
	# load_dotenv() call above may have populated from a .env file.
	COHERE_API_KEY = os.environ.get("COHERE_API_KEY")

	# Stop at startup when the key is missing or empty, instead of failing
	# later on the first Cohere request.
	if not COHERE_API_KEY:
	    raise ValueError(
	        "Cohere API key not found. Please set the COHERE_API_KEY environment variable."
	    )

	# Shared client used by pdf_to_audio for every request.
	cohere_client = cohere.Client(COHERE_API_KEY)

	# Correct language codes for gTTS
	# (display name, gTTS language code) pairs offered in the language dropdown.
	language_options = [
	    ("English", "en"),
	    ("Spanish", "es"),
	    ("French", "fr"),
	    ("German", "de"),
	    ("Italian", "it"),
	    ("Chinese", "zh-CN"),
	    ("Japanese", "ja"),
	    ("Hindi", "hi"),
	]

	# Function to extract text from PDF
	def extract_text_from_pdf(pdf_file):
	    """Return the text of every page of *pdf_file* as a single string.

	    *pdf_file* is handed unchanged to ``PdfReader``. Pages whose extraction
	    yields nothing are skipped, and the remaining page texts are joined
	    with no separator between them.
	    """
	    page_texts = (page.extract_text() for page in PdfReader(pdf_file).pages)
	    return "".join(chunk for chunk in page_texts if chunk)

	# Function to convert text to speech using gTTS
	def text_to_speech(text, language_code):
	    """Synthesise *text* with gTTS and store the result in a temporary MP3.

	    Args:
	        text: The text to speak; must be a non-empty ``str``.
	        language_code: Language code passed to gTTS as ``lang``.

	    Returns:
	        The path of the MP3 file, or ``None`` when *text* is empty or not a
	        string, or when synthesis or writing the file raises. The file is
	        created with ``delete=False``, so it is not removed automatically.
	    """
	    if not (text and isinstance(text, str)):
	        logger.error("No valid text available for speech conversion.")
	        return None

	    try:
	        # Render the speech into memory first; the temporary file is only
	        # created once gTTS has produced the audio.
	        buffer = BytesIO()
	        gTTS(text, lang=language_code).write_to_fp(buffer)
	        mp3_bytes = buffer.getvalue()

	        # Gradio's audio output is given a file path, so persist the bytes.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as mp3_file:
	            mp3_file.write(mp3_bytes)
	            mp3_path = mp3_file.name
	        return mp3_path
	    except Exception as e:
	        logger.error(f"Error during text-to-speech conversion: {e}")
	        return None

	# Function to convert PDF text to audio via Cohere and gTTS
	def pdf_to_audio(pdf_file, language_code):
	    """Turn a PDF into Cohere-generated text plus an MP3 of that text.

	    The text extracted from the PDF is sent unchanged as the prompt to
	    ``cohere_client.generate``. The first generation returned is what gets
	    spoken, not the raw PDF text.

	    Args:
	        pdf_file: The uploaded PDF as delivered by the UI; passed straight
	            to ``extract_text_from_pdf``.
	        language_code: gTTS language code (for example ``"en"``) used for
	            the speech.

	    Returns:
	        A ``(text, audio_path)`` tuple. On success ``text`` is the generated
	        text and ``audio_path`` is the path of the MP3 file. On any failure
	        ``text`` is a user-facing error message and ``audio_path`` is
	        ``None``. No exception escapes once processing has started.
	    """
	    # Reject missing inputs up front: the user gets a specific message
	    # instead of the generic failure, and no Cohere request is spent on a
	    # call that could never produce audio.
	    if pdf_file is None:
	        return "No PDF file was provided. Please upload a PDF file.", None
	    if not language_code:
	        return "No language was selected. Please choose a language.", None

	    try:
	        text = extract_text_from_pdf(pdf_file)

	        # Scanned or image-only PDFs can yield no text at all.
	        if not text.strip():
	            logger.error("The PDF contains no extractable text.")
	            return "The PDF contains no extractable text. Please try a different file.", None

	        # Process the text with Cohere before audio generation.
	        response = cohere_client.generate(
	            model='c4ai-aya-23',
	            prompt=text,
	            max_tokens=500,
	        )

	        if not response or not response.generations:
	            logger.error("Cohere API did not return a valid response.")
	            return "Error: Cohere API did not return a valid response.", None

	        processed_text = response.generations[0].text.strip()

	        if not processed_text:
	            logger.error("Cohere generated an empty response.")
	            return "Error: Cohere generated an empty response.", None

	        # text_to_speech logs its own failures and signals them with None.
	        audio_file_path = text_to_speech(processed_text, language_code)

	        if audio_file_path is None:
	            return "Error: Failed to generate speech from the provided text.", None

	        return processed_text, audio_file_path
	    except Exception as e:
	        # UI boundary: keep the app responsive, but record the traceback so
	        # the underlying failure can be diagnosed from the logs.
	        logger.exception(f"Error during PDF to audio conversion: {e}")
	        return "An error occurred while processing the PDF.", None

	# Gradio interface
	def gradio_interface(pdf_file, language_code):
	    """Gradio callback: forward the uploaded PDF and language to pdf_to_audio.

	    Returns whatever ``pdf_to_audio`` returns, unchanged.
	    """
	    result = pdf_to_audio(pdf_file, language_code)
	    return result

	# Launch the Gradio interface with file input, language dropdown, text output, and audio output
	gr.Interface(
	    fn=gradio_interface,
	    inputs=[
	        "file",
	        # Preselect English so a submission made without touching the
	        # dropdown still carries a usable language code instead of None.
	        gr.Dropdown(choices=language_options, value="en", label="Select Language"),
	    ],
	    # Matches the (text, audio file path) pair returned by gradio_interface.
	    outputs=[
	        "text",
	        "audio",
	    ],
	    title="PDF to Audio using Cohere (Multi-language)",
	).launch(debug=True)