# VOICENEWSC / app.py
# Author: surajit2839 — "Update app.py" (commit 98a52ab, verified)
import os
import tempfile
import asyncio
from pathlib import Path
import gradio as gr
from huggingface_hub import InferenceClient
import edge_tts
from pydub import AudioSegment
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
# =================================================================
# 1. UI STYLING & PREMIUM MOVING ANIMATIONS
# =================================================================
# Custom stylesheet injected into the Gradio app: an animated pastel mesh
# background, frosted-glass panels (.glass-panel) and a gradient CTA button
# (.premium-btn). Must be passed to Gradio so it is inlined into the page.
CUSTOM_CSS = """
.gradio-container {
background: #ffffff;
background-image:
radial-gradient(at 0% 0%, rgba(147, 51, 234, 0.15) 0px, transparent 50%),
radial-gradient(at 100% 0%, rgba(249, 115, 22, 0.12) 0px, transparent 50%),
radial-gradient(at 100% 100%, rgba(147, 51, 234, 0.15) 0px, transparent 50%),
radial-gradient(at 0% 100%, rgba(249, 115, 22, 0.12) 0px, transparent 50%);
background-attachment: fixed;
animation: meshFlow 20s ease-in-out infinite alternate;
min-height: 100vh;
overflow-x: hidden;
}
@keyframes meshFlow {
0% { background-size: 100% 100%; background-position: 0% 0%; }
50% { background-size: 140% 140%; background-position: 50% 50%; }
100% { background-size: 100% 100%; background-position: 100% 100%; }
}
.glass-panel {
background: rgba(255, 255, 255, 0.5) !important;
backdrop-filter: blur(25px) saturate(160%);
-webkit-backdrop-filter: blur(25px) saturate(160%);
border: 1px solid rgba(255, 255, 255, 0.4) !important;
border-radius: 28px !important;
padding: 30px !important;
box-shadow: 0 20px 40px rgba(0, 0, 0, 0.03) !important;
transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1);
}
.glass-panel:hover {
transform: translateY(-8px);
background: rgba(255, 255, 255, 0.65) !important;
box-shadow: 0 35px 70px rgba(147, 51, 234, 0.12) !important;
}
.premium-btn {
background: linear-gradient(135deg, #f97316 0%, #9333ea 50%, #f97316 100%) !important;
background-size: 200% auto !important;
border: none !important;
color: white !important;
font-weight: 800 !important;
text-transform: uppercase;
letter-spacing: 1px;
border-radius: 15px !important;
box-shadow: 0 10px 25px rgba(147, 51, 234, 0.35) !important;
transition: 0.5s all !important;
}
.premium-btn:hover {
background-position: right center !important;
transform: scale(1.04);
box-shadow: 0 15px 35px rgba(147, 51, 234, 0.5) !important;
}
.gradio-container > * {
animation: fadeIn 1.2s ease-out;
}
@keyframes fadeIn {
from { opacity: 0; transform: translateY(20px); }
to { opacity: 1; transform: translateY(0); }
}
"""
# Hugging Face Inference API token — presumably configured as a Space secret;
# os.getenv returns None if the variable is unset (requests then run
# unauthenticated and may be rate-limited). TODO confirm deployment config.
SURAJIT_HF_TOKEN = os.getenv("CLONE_SURAJIT_TOKEN")
# Shared Inference API client used by generate_timed_script().
client = InferenceClient(token=SURAJIT_HF_TOKEN)
# Chat model used for all script generation.
MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
# =================================================================
# 2. CORE LOGIC
# =================================================================
def process_multiple_documents(files) -> str:
    """Extract and concatenate text from the uploaded documents.

    Args:
        files: Gradio file objects (each exposing a ``.name`` path), or a
            falsy value when nothing was uploaded.

    Returns:
        The combined document text, capped at 10,000 characters to keep the
        downstream LLM prompt bounded. Empty string when there is no input.
    """
    if not files:
        return ""
    parts = []
    for file in files:
        ext = Path(file.name).suffix.lower()
        try:
            if ext == ".pdf":
                loader = PyPDFLoader(file.name)
            elif ext == ".docx":
                loader = Docx2txtLoader(file.name)
            else:
                # Anything else (.txt and friends) is read as plain text.
                loader = TextLoader(file.name)
            docs = loader.load()
            parts.append(" ".join(d.page_content for d in docs))
        except Exception as e:
            # Best-effort: one unreadable file must not abort the whole batch.
            print(f"Error loading {file.name}: {e}")
    combined_text = "\n\n".join(parts)
    # BUGFIX: the original split with chunk_overlap=100 and then re-joined the
    # chunks, which duplicated ~100 characters at every chunk boundary in the
    # prompt. Splitting without overlap keeps the text faithful when rejoined.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    chunks = splitter.split_text(combined_text)
    return " ".join(chunks)[:10000]
def generate_timed_script(context: str, mode: str, duration: str) -> str:
    """Ask the LLM to write a script of roughly the requested spoken length.

    Args:
        context: Concatenated document text (already truncated upstream).
        mode: Script style, e.g. "Podcast", "Teaching", "Summary".
        duration: One of the UI duration labels; unknown values fall back to
            the medium (~750 word) target.

    Returns:
        The generated script text.
    """
    # Map each duration to (length instruction, completion-token budget).
    # English runs ~1.3 tokens/word, so the budgets include headroom.
    # BUGFIX: the original fixed max_tokens=2500 for every duration, which
    # silently truncated the 10-minute (~1500 word / ~2000 token) and
    # 20-minute (~3000 word / ~4000 token) scripts.
    duration_map = {
        "1 Minute (Short)": ("approx 150 words", 512),
        "5 Minutes (Medium)": ("approx 750 words", 1500),
        "10 Minutes (Detailed)": ("approx 1500 words", 2800),
        "20 Minutes (Deep Dive)": ("approx 3000 words", 4096),
    }
    target_len, max_tokens = duration_map.get(duration, ("750 words", 1500))
    messages = [
        {"role": "system", "content": f"You are a master scriptwriter. Mode: {mode}. Length: {target_len}. Use 'Host:' and 'Expert:' for dialogue."},
        {"role": "user", "content": f"Analyze these documents and write the script:\n\n{context}"}
    ]
    response = client.chat_completion(model=MODEL_ID, messages=messages, max_tokens=max_tokens)
    return response.choices[0].message.content
async def create_audio(script: str, mode: str, voice: str, speed: float):
    """Synthesize the script to an MP3 file with edge-tts.

    In "Podcast" mode, lines tagged "Host:" use the selected voice, lines
    tagged with any other speaker (e.g. "Expert:") use a fixed British voice,
    and untagged lines are narrated by the host. All other modes read the
    whole script with the single selected voice.

    Args:
        script: The generated script text.
        mode: Script style; only "Podcast" gets two-voice treatment.
        voice: edge-tts voice ID for the host / narrator.
        speed: Pace multiplier (1.0 = normal); converted to a signed
            percentage rate string, e.g. 1.2 -> "+20%".

    Returns:
        Path to the resulting temporary .mp3 file.
    """
    EXPERT_VOICE = "en-GB-SoniaNeural"
    rate_str = f"{int((speed - 1.0) * 100):+d}%"

    async def _synth_to_file(text: str, tts_voice: str) -> str:
        # Render one utterance to its own temp file and return the path.
        # Close the handle before edge-tts writes, so Windows can reopen it.
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        tmp.close()
        await edge_tts.Communicate(text, tts_voice, rate=rate_str).save(tmp.name)
        return tmp.name

    if mode != "Podcast":
        return await _synth_to_file(script, voice)

    combined = AudioSegment.empty()
    for line in script.split('\n'):
        line = line.strip()
        if not line:
            continue
        if ":" in line:
            speaker, text_to_speak = (part.strip() for part in line.split(":", 1))
            # BUGFIX: decide the voice from the speaker TAG only. The old
            # check ("Host" in line) scanned the dialogue text too, so a
            # line like "Expert: the Host mentioned..." got the host voice.
            current_voice = voice if "Host" in speaker else EXPERT_VOICE
        else:
            # Untagged lines are read by the host instead of being dropped.
            current_voice = voice
            text_to_speak = line
        if text_to_speak:
            t_path = await _synth_to_file(text_to_speak, current_voice)
            try:
                # 600 ms of silence between turns keeps the dialogue natural.
                combined += AudioSegment.from_mp3(t_path) + AudioSegment.silent(duration=600)
            finally:
                os.remove(t_path)  # clean up per-line temp file even on error
    out = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    out.close()
    combined.export(out.name, format="mp3")
    return out.name
# =================================================================
# 3. INTERFACE
# =================================================================
# BUGFIX: the stylesheet must be attached to gr.Blocks(css=...); Blocks.launch()
# has no `css` parameter, so the original `app.launch(css=CUSTOM_CSS)` never
# applied the theme (and raises TypeError on current Gradio versions).
with gr.Blocks(css=CUSTOM_CSS) as app:
    gr.HTML("<div style='text-align: center; padding: 20px;'><img src='https://cdn.pixabay.com/animation/2023/06/13/15/12/15-12-47-323_512.gif' style='width:50px;'><h1 style='color: #1f2937; font-weight: 900;'>AI Multi-Doc Studio</h1></div>")
    with gr.Row():
        # Left column: all user inputs (documents, duration, style, voice).
        with gr.Column(scale=1):
            with gr.Group(elem_classes="glass-panel"):
                gr.HTML("<h4>📚 Upload Documents</h4>")
                file_input = gr.File(label=None, file_count="multiple", file_types=[".pdf", ".docx", ".txt"])
                gr.HTML("<h4>⏱️ Duration & Style</h4>")
                duration_sel = gr.Dropdown(
                    ["1 Minute (Short)", "5 Minutes (Medium)", "10 Minutes (Detailed)", "20 Minutes (Deep Dive)"],
                    value="5 Minutes (Medium)", label="Target Audio Length"
                )
                mode_sel = gr.Dropdown(["Podcast", "Storytelling", "Teaching", "Summary"], value="Podcast", label="Script Style")
                gr.HTML("<h4>🗣️ Voice Settings</h4>")
                voice_sel = gr.Dropdown([
                    ("Andrew (US - Male)", "en-US-AndrewNeural"),
                    ("Ava (US - Female)", "en-US-AvaNeural"),
                    ("Emma (UK - Female)", "en-GB-SoniaNeural"),
                    ("Aditi (IN - Female)", "en-IN-NeerjaNeural")
                ], value="en-US-AndrewNeural", label="Voice Selection")
                speed_sld = gr.Slider(0.5, 1.5, value=1.0, label="Pace")
                btn = gr.Button("🚀 GENERATE STUDIO AUDIO", elem_classes="premium-btn")
        # Right column: generated script plus the rendered audio.
        with gr.Column(scale=1):
            with gr.Group(elem_classes="glass-panel"):
                gr.HTML("<h4>📝 Generated Script</h4>")
                out_txt = gr.Textbox(label=None, lines=15)
                gr.HTML("<h4>🔊 Audio Output</h4>")
                out_aud = gr.Audio(label=None)

    async def run_pipeline(files, dur, mode, voice, speed):
        """Full pipeline: documents -> script -> audio.

        Returns a (script_text, audio_path) tuple for the two outputs;
        audio_path is None when no files were uploaded.
        """
        if not files:
            return "Please upload at least one file.", None
        ctx = process_multiple_documents(files)
        sc = generate_timed_script(ctx, mode, dur)
        aud = await create_audio(sc, mode, voice, speed)
        return sc, aud

    btn.click(run_pipeline, inputs=[file_input, duration_sel, mode_sel, voice_sel, speed_sld], outputs=[out_txt, out_aud])

if __name__ == "__main__":
    app.launch()