pdf2video / utils.py
Narayana02's picture
Update utils.py
05a01a3 verified
from groq import Groq
from pydantic import BaseModel, ValidationError
from typing import List, Literal
import os
import tiktoken
import json
import re
import tempfile
from gtts import gTTS
from bs4 import BeautifulSoup
import requests
from moviepy.editor import ImageClip, AudioFileClip
# Module-level singletons shared by the helpers below.
# The Groq client takes its API key from the GROQ_API_KEY environment
# variable; os.environ[...] raises KeyError at import time if it is unset.
groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])
# Tokenizer used by truncate_text to count and cut tokens.
# NOTE(review): "cl100k_base" is presumably not the tokenizer of the Llama
# model requested in generate_script, so counts are approximate — confirm.
tokenizer = tiktoken.get_encoding("cl100k_base")
# Data models for validating dialogue structure
class DialogueItem(BaseModel):
    """A single turn of the conversation: the speaker and their line."""

    # Restricted to the two host names; any other value fails validation.
    speaker: Literal["Priya", "Ananya"]
    # The words spoken in this turn.
    text: str
class Dialogue(BaseModel):
    """Top-level shape of the script: an object with a ``dialogue`` list."""

    # Turns in the order they appear in the validated data.
    dialogue: List[DialogueItem]
# Truncate text to meet the token limit
def truncate_text(text, max_tokens=2048):
    """Return *text* cut down to at most *max_tokens* tokens.

    The text is encoded with the module-level ``tokenizer``. If it already
    fits, the original string is returned untouched; otherwise the first
    ``max_tokens`` tokens are decoded back into a string.
    """
    token_ids = tokenizer.encode(text)
    # Guard clause: short inputs are returned as-is (no encode/decode round trip).
    if len(token_ids) <= max_tokens:
        return text
    kept = token_ids[:max_tokens]
    return tokenizer.decode(kept)
# Extract plain text content from a URL
def extract_text_from_url(url, timeout=10):
    """Download a web page and return its visible text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The page text with ``<script>``/``<style>`` content removed, one
        non-empty phrase per line.

    Raises:
        ValueError: If the request, the HTTP status check, or the parsing
            fails. The original exception is attached as ``__cause__``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Script and style bodies are not visible text; drop them entirely.
        for tag in soup(["script", "style"]):
            tag.decompose()
        lines = (line.strip() for line in soup.get_text().splitlines())
        # Split on runs of two spaces so that side-by-side fragments
        # (e.g. multi-part headlines) land on separate lines while ordinary
        # sentences stay intact. Splitting on a single space would put
        # every word on its own line.
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        return '\n'.join(chunk for chunk in chunks if chunk)
    except Exception as e:
        raise ValueError(f"Error extracting text from URL: {str(e)}") from e
# Generate a conversational script
def _parse_dialogue(content: str) -> Dialogue:
    """Parse model output into a validated ``Dialogue``.

    The whole string is tried first. If that fails, the span from the first
    ``{`` to the last ``}`` is tried, to tolerate prose around the JSON.

    Raises:
        ValueError: If no JSON object is found, or the candidate does not
            parse/validate as a ``Dialogue``.
    """
    try:
        return Dialogue.model_validate(json.loads(content))
    except (json.JSONDecodeError, ValidationError):
        # Deliberate fall-through: retry on the embedded {...} span below.
        pass

    match = re.search(r'\{.*\}', content, re.DOTALL)
    if match is None:
        raise ValueError(f"Failed to find valid JSON in the response: {content}")
    try:
        return Dialogue.model_validate(json.loads(match.group()))
    except (json.JSONDecodeError, ValidationError) as parse_error:
        raise ValueError(
            f"Failed to parse dialogue JSON: {parse_error}\nContent: {content}"
        ) from parse_error


def generate_script(
    system_prompt: str,
    input_text: str,
    tone: str,
    target_length: str,
    model: str = "llama-3.1-70b-versatile",
) -> Dialogue:
    """Ask the Groq chat API for a two-host podcast script and validate it.

    Args:
        system_prompt: Instructions placed at the top of the prompt.
        input_text: Source material; truncated with ``truncate_text`` first.
        tone: Desired tone, inserted verbatim into the prompt.
        target_length: Length label. ``"Short (1-2 min)"`` maps to roughly
            300 words; any other value maps to roughly 750.
        model: Groq model identifier.
            NOTE(review): the default id may have been retired by the
            provider — confirm and pass a current one if so.

    Returns:
        The validated ``Dialogue``.

    Raises:
        RuntimeError: On any failure (API error, empty response, or
            unparsable/invalid JSON). The cause is chained.
    """
    input_text = truncate_text(input_text)
    word_limit = 300 if target_length == "Short (1-2 min)" else 750
    prompt = (
        "\n"
        f"{system_prompt}\n"
        f"TONE: {tone}\n"
        f"TARGET LENGTH: {target_length} (approximately {word_limit} words)\n"
        f"INPUT TEXT: {input_text}\n"
        "Generate a complete, well-structured podcast script that:\n"
        "- Starts with a friendly introduction.\n"
        "- Features a conversational exchange between Priya (American accent) and Ananya (British accent).\n"
        "- Concludes with a summary and thanks listeners.\n"
        "- Matches the tone and target length specifications.\n"
    )
    try:
        response = groq_client.chat.completions.create(
            messages=[{"role": "system", "content": prompt}],
            model=model,
            max_tokens=2048,
            temperature=0.7,
        )
        content = response.choices[0].message.content
        if not content:
            # Fail with a clear message instead of a TypeError from re.sub.
            raise ValueError("Model returned an empty response")
        # Strip Markdown code fences the model may wrap around the JSON.
        content = re.sub(r'```json\s*|\s*```', '', content)
        return _parse_dialogue(content)
    except Exception as e:
        raise RuntimeError(f"Error generating script: {str(e)}") from e
# Generate audio for a dialogue line
def generate_audio(text: str, speaker: str) -> str:
    """Synthesize one dialogue line to an MP3 file with gTTS.

    Args:
        text: The words to speak.
        speaker: Host name. ``"Priya"`` uses the ``com`` Google TLD; any
            other value uses ``co.in``.
            NOTE(review): the prompt in generate_script describes Ananya as
            having a British accent, which ``co.in`` presumably does not
            produce — confirm which is intended.

    Returns:
        Path to a temporary ``.mp3`` file. The caller owns it and is
        responsible for deleting it.

    Raises:
        RuntimeError: If synthesis or saving fails. The temporary file is
            removed first so failed calls do not leave files behind.
    """
    tld = 'com' if speaker == "Priya" else 'co.in'
    audio_path = None
    try:
        tts = gTTS(text=text, lang='en', tld=tld)
        # Reserve a unique path, then close the handle before gTTS writes to it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
            audio_path = temp_audio.name
        tts.save(audio_path)
        return audio_path
    except Exception as e:
        if audio_path is not None:
            try:
                os.remove(audio_path)
            except OSError:
                # Best-effort cleanup; the original error matters more.
                pass
        raise RuntimeError(f"Error generating audio: {str(e)}") from e
# Generate video using audio and image
def generate_video(audio_path: str, image_path: str, fps: int = 24) -> str:
    """Combine an audio file and a still image into an MP4 video.

    Args:
        audio_path: Path to the audio file; its duration sets the video length.
        image_path: Path to the image shown for the whole video.
        fps: Frame rate passed to the encoder. An ``ImageClip`` has no frame
            rate of its own, so one is supplied explicitly here.

    Returns:
        Path to the generated temporary ``.mp4`` file. The caller owns it.

    Raises:
        RuntimeError: If loading, composing, or encoding fails. A partially
            written output file is removed first.
    """
    audio = None
    video = None
    output_path = None
    try:
        audio = AudioFileClip(audio_path)
        video = ImageClip(image_path, duration=audio.duration)
        video = video.set_audio(audio)
        # mkstemp creates the file atomically, unlike the deprecated,
        # race-prone tempfile.mktemp. Close our handle so the encoder can
        # overwrite the file by name.
        fd, output_path = tempfile.mkstemp(suffix=".mp4")
        os.close(fd)
        video.write_videofile(
            output_path, codec="libx264", audio_codec="aac", fps=fps
        )
        return output_path
    except Exception as e:
        if output_path is not None and os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                # Best-effort cleanup; the original error matters more.
                pass
        raise RuntimeError(f"Error generating video: {str(e)}") from e
    finally:
        # Release the readers held by the clips.
        for clip in (video, audio):
            if clip is not None:
                clip.close()