# voice / app.py
# rahulrana0001's picture
# feat: unify React Dubbing Studio UI with Gradio Manga AI in single Docker container
# 337d4b1
import gradio as gr
import psutil
import logging
import os
import asyncio
import nest_asyncio
# --- NEURAL SILENCER: Fix for persistent "Invalid file descriptor: -1" logs ---
def _silence_asyncio_ghosts():
from asyncio.base_events import BaseEventLoop
original_del = BaseEventLoop.__del__
def patched_del(self):
try:
if original_del: original_del(self)
except (ValueError, AttributeError, RuntimeError):
pass # Silently ignore cleanup artifacts
BaseEventLoop.__del__ = patched_del
_silence_asyncio_ghosts()
# nest_asyncio.apply()
from pipeline.ocr import extract_text_from_image
from pipeline.translation import translate_to_tamil
from pipeline.tts import generate_tamil_speech
from pipeline.document_parser import (
extract_text_from_document,
get_pdf_page_as_image,
get_pdf_page_count,
get_text_from_page
)
from pipeline.maya_chat_engine import get_maya_response
import threading
from concurrent.futures import ThreadPoolExecutor
import re
import numpy as np
# Seconds of silence inserted between the audio of consecutive panels.
_PANEL_PAUSE_SECONDS = 1.5
# Zero-width split point in front of every "[Panel N]" marker.
_PANEL_SPLIT_RE = re.compile(r'(?=\[Panel\s*\d+\])', re.IGNORECASE)
# Captures the "[Panel N]" marker and everything that follows it.
_PANEL_HEADER_RE = re.compile(r'(\[Panel\s*\d+\])\s*(.*)', re.DOTALL | re.IGNORECASE)


def run_cinematic_pipeline(extracted_text, emotion_choice, spicy_mode):
    """Translate text panel by panel and stitch the per-panel speech together.

    The text is split in front of every ``[Panel N]`` marker (matched
    case-insensitively); text without markers is handled as a single panel.
    Each non-empty panel body is passed to ``translate_to_tamil`` and the
    translation to ``generate_tamil_speech``. Panel markers are kept in the
    returned text, and a short silence separates the audio of adjacent panels.

    Args:
        extracted_text: Source text, optionally containing ``[Panel N]`` markers.
        emotion_choice: Voice style forwarded to ``generate_tamil_speech``.
        spicy_mode: Flag forwarded to ``translate_to_tamil`` as ``spicy``.

    Returns:
        ``(tamil_text, audio)`` where ``audio`` is ``(sample_rate, samples)``
        or ``None`` when no speech was produced. On any exception a fixed
        apology string and ``None`` are returned instead.
    """
    final_tamil_text = []
    final_audio_chunks = []
    master_sample_rate = None
    try:
        # Splitting unconditionally keeps marker detection consistent with the
        # case-insensitive patterns; without markers the result is [text].
        for p_text in _PANEL_SPLIT_RE.split(extracted_text):
            p_text = p_text.strip()
            if not p_text:
                continue

            panel_header = ""
            content_to_translate = p_text
            match = _PANEL_HEADER_RE.match(p_text)
            if match:
                panel_header = match.group(1)
                content_to_translate = match.group(2)

            if not content_to_translate.strip():
                # A marker with no body is kept in the text but not spoken.
                if panel_header:
                    final_tamil_text.append(panel_header)
                continue

            p_tamil = translate_to_tamil(content_to_translate, spicy=spicy_mode)
            if panel_header:
                final_tamil_text.append(f"{panel_header}\n{p_tamil}")
            else:
                final_tamil_text.append(p_tamil)

            sr, a_data = generate_tamil_speech(p_tamil, emotion_choice)
            if sr and a_data is not None:
                # The last reported sample rate is used for the whole clip.
                # NOTE(review): assumes every chunk shares one sample rate — confirm.
                master_sample_rate = sr
                final_audio_chunks.append(a_data)

        tamil_translation = "\n\n".join(final_tamil_text)
        if not (master_sample_rate and final_audio_chunks):
            return tamil_translation, None

        # Build the pause with the dtype and trailing shape of the real audio
        # so concatenation neither up-casts the samples nor fails on
        # multi-channel chunks.
        first_chunk = np.asarray(final_audio_chunks[0])
        pause_samples = int(master_sample_rate * _PANEL_PAUSE_SECONDS)
        silence_array = np.zeros(
            (pause_samples,) + first_chunk.shape[1:], dtype=first_chunk.dtype
        )

        spliced_audio = []
        for chunk in final_audio_chunks:
            if spliced_audio:
                spliced_audio.append(silence_array)
            spliced_audio.append(chunk)
        audio_data = np.concatenate(spliced_audio)
        return tamil_translation, (master_sample_rate, audio_data)
    except Exception as e:
        print(f"CINEMATIC PIPELINE ERROR: {e}")
        return "Maya is having trouble with the cinematic flow.", None
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Global cache for processed pages (written by process_comic_page and prefetch_pages).
# Key: (pdf_path, page_num, voice_style, spicy) -> Value: (original, tamil, audio)
PAGE_CACHE = {}
# Held around every read and write of PAGE_CACHE.
CACHE_LOCK = threading.Lock()
# Single-worker pool that runs prefetch_pages jobs in the background.
PREFETCH_EXECUTOR = ThreadPoolExecutor(max_workers=1)
def check_resources():
    """Log the amount of available RAM and return it in gigabytes.

    A warning is logged as well when less than 1 GB is available.
    """
    bytes_per_gb = 1024**3
    available_gb = psutil.virtual_memory().available / bytes_per_gb
    logging.info(f"System Resources: {available_gb:.2f} GB RAM available.")
    memory_is_critical = available_gb < 1.0
    if memory_is_critical:
        logging.warning("EXTREMELY LOW MEMORY DETECTED! Application may crash.")
    return available_gb
# Background Atmosphere Sounds
# Maps the dropdown label to the URL of the audio clip; "None" maps to an
# empty string.
BGM_LINKS = {
    "None": "",
    "Soft Rain 🌧️": "https://www.soundjay.com/nature/sounds/rain-07.mp3",
    "Romantic Piano 🎹": "https://www.soundjay.com/misc/sounds/music-box-1.mp3",
    "Midnight Jazz 🎷": "https://www.soundjay.com/misc/sounds/bell-ringing-05.mp3",
    "Summer Night 🌙": "https://www.soundjay.com/nature/sounds/cricket-chirping-01.mp3",
    "Heartbeat 💓": "https://www.soundjay.com/misc/sounds/heartbeat-01.mp3"
}
# Expressive Voice Styles
# Choices offered by every voice dropdown; the first entry is the default.
VOICE_STYLES = [
    "Cheerful (Maya)",
    "Excited (Maya)",
    "Sad & Emotional (Sita)",
    "Dramatic Narrator (Sita)",
    "Old Wise Woman",
    "Playful Child",
    "Brave Heroine",
    "Deep & Serious",
    "Calm Storyteller",
    "Professional News"
]
def process_standard_pipeline(image, document, input_text, emotion_choice):
    """Collect text from the given inputs, translate it and synthesize speech.

    Text is gathered in this order: document, image (OCR with
    ``is_comic=False``), then the typed ``input_text``. Inputs that are
    ``None``/empty are skipped.

    Args:
        image: Image passed to ``extract_text_from_image``, or ``None``.
        document: Document passed to ``extract_text_from_document``, or ``None``.
        input_text: Text typed by the user; may be empty or ``None``.
        emotion_choice: Voice style forwarded to ``generate_tamil_speech``.

    Returns:
        ``(source_text, tamil_text, audio)`` where ``audio`` is
        ``(sample_rate, samples)`` or ``None`` when no speech was produced.
        When no text could be collected, ``("No text detected", "", None)``.
    """
    pieces = []
    if document is not None:
        # "or ''" guards against an extractor that yields None.
        pieces.append((extract_text_from_document(document) or "") + "\n")
    if image is not None:
        pieces.append((extract_text_from_image(image, is_comic=False) or "") + " ")
    if input_text:
        pieces.append(input_text)

    text_to_translate = "".join(pieces).strip()
    if not text_to_translate:
        return "No text detected", "", None

    tamil_translation = translate_to_tamil(text_to_translate)
    sample_rate, audio_data = generate_tamil_speech(tamil_translation, emotion_choice)
    # Same guard as the cinematic pipeline: report "no audio" as None rather
    # than handing a (None, None) pair to the audio component.
    if not sample_rate or audio_data is None:
        return text_to_translate, tamil_translation, None
    return text_to_translate, tamil_translation, (sample_rate, audio_data)
def load_comic_page(pdf_path, page_num):
    """Render one page and build its "Page X of Y" status line.

    Returns ``(image_path, status, page_num)``. Without a ``pdf_path`` the
    result is ``(None, "Upload a PDF first", 0)``.
    """
    if pdf_path:
        rendered_page = get_pdf_page_as_image(pdf_path, page_num)
        page_total = get_pdf_page_count(pdf_path)
        # page_num is zero-based; the status shown to the user is one-based.
        return rendered_page, f"Page {page_num + 1} of {page_total}", page_num
    return None, "Upload a PDF first", 0
def prefetch_pages(pdf_path, current_page, voice_style, spicy=False, count=5):
    """
    Background worker to process upcoming pages.

    Processes up to ``count`` pages after ``current_page`` and stores each
    result in ``PAGE_CACHE`` under ``(pdf_path, page, voice_style, spicy)``.
    Pages that are already cached, cannot be rendered, or contain no text are
    skipped. Errors are logged and never raised, because the Future returned
    by the executor is not inspected by the caller.

    Args:
        pdf_path: Path of the document being read.
        current_page: Zero-based page the user is on; prefetch starts after it.
        voice_style: Voice style forwarded to the cinematic pipeline.
        spicy: Translation flag forwarded to the cinematic pipeline.
        count: Maximum number of upcoming pages to process.
    """
    try:
        total_pages = get_pdf_page_count(pdf_path)
    except Exception as e:
        # Without this, the failure would disappear inside the unread Future.
        logging.error(f"PREFETCH ERROR: could not read page count: {e}")
        return

    for offset in range(1, count + 1):
        target_page = current_page + offset
        if target_page >= total_pages:
            break

        cache_key = (pdf_path, target_page, voice_style, spicy)
        with CACHE_LOCK:
            if cache_key in PAGE_CACHE:
                continue

        try:
            logging.info(f"PREFETCH: Processing Page {target_page+1} in background...")
            img_path = get_pdf_page_as_image(pdf_path, target_page)
            if not img_path:
                continue

            # Prefer the embedded text; fall back to OCR when it is (nearly) empty.
            text = get_text_from_page(pdf_path, target_page)
            if not text or len(text.strip()) < 5:
                text = extract_text_from_image(img_path)
            if not text or not text.strip():
                continue

            tam, aud = run_cinematic_pipeline(text, voice_style, spicy)
            with CACHE_LOCK:
                PAGE_CACHE[cache_key] = (text, tam, aud)
                # Keep at most 10 entries; drop the oldest inserted one.
                if len(PAGE_CACHE) > 10:
                    PAGE_CACHE.pop(next(iter(PAGE_CACHE)))
        except Exception as e:
            logging.error(f"PREFETCH ERROR on Page {target_page+1}: {e}")
def process_comic_page(pdf_path, page_num, emotion_choice, heat_level):
    """Read one comic page: extract its text, translate it and produce speech.

    Results are cached in ``PAGE_CACHE`` under
    ``(pdf_path, page_num, emotion_choice, spicy_mode)`` and a background
    prefetch of the following pages is scheduled after a fresh computation.

    Args:
        pdf_path: Path of the document being read.
        page_num: Zero-based page index.
        emotion_choice: Voice style forwarded to the cinematic pipeline.
        heat_level: Slider value; values above 70 enable spicy translation.

    Returns:
        ``(original_text, tamil_text, audio)`` where ``audio`` is
        ``(sample_rate, samples)`` or ``None``. Failures are reported through
        the first element instead of being raised.
    """
    try:
        if not pdf_path:
            return "No page loaded", "", None

        spicy_mode = heat_level > 70
        cache_key = (pdf_path, page_num, emotion_choice, spicy_mode)
        # Look in the cache before rendering: a hit needs no page image.
        with CACHE_LOCK:
            cached = PAGE_CACHE.get(cache_key)
        if cached is not None:
            return cached

        img_path = get_pdf_page_as_image(pdf_path, page_num)
        if not img_path:
            return "Failed to render image", "", None

        # --- STAGE 1: OCR ---
        ocr_failed = False
        try:
            # Prefer the embedded text; fall back to OCR when it is (nearly) empty.
            extracted_text = get_text_from_page(pdf_path, page_num)
            if not extracted_text or len(extracted_text.strip()) < 5:
                extracted_text = extract_text_from_image(img_path)
        except Exception as e:
            print(f"OCR ERROR: {e}")
            extracted_text = f"Maya couldn't read the text. (Error: {e})"
            ocr_failed = True

        if not extracted_text or not extracted_text.strip():
            extracted_text = "No text found on this page."

        # --- CINEMATIC STAGE 2 & 3: Translation & Audio ---
        tamil_translation, audio_tuple = run_cinematic_pipeline(extracted_text, emotion_choice, spicy_mode)
        result = (extracted_text, tamil_translation, audio_tuple)

        # An OCR failure may be transient, so it is not cached; the next
        # request for this page gets a fresh attempt.
        if not ocr_failed:
            with CACHE_LOCK:
                PAGE_CACHE[cache_key] = result
                # Same bound as the prefetch worker: keep at most 10 entries.
                if len(PAGE_CACHE) > 10:
                    PAGE_CACHE.pop(next(iter(PAGE_CACHE)))

        PREFETCH_EXECUTOR.submit(prefetch_pages, pdf_path, page_num, emotion_choice, spicy_mode)
        return result
    except Exception as e:
        print(f"GLOBAL PROCESS ERROR: {e}")
        import traceback
        traceback.print_exc()
        return f"CRITICAL CRASH: {e}", "", None
# Custom Premium CSS
# Stylesheet handed to gr.Blocks(css=...). Selectors used by the UI in this
# file: .glass (elem_classes of the two reader columns), #main_comic (comic
# image), #comic_audio_player (speech player) and .boss-active (toggled on
# the boss screen by the keydown script).
# NOTE(review): #maya_chat_log and #maya_audio_player are not assigned to any
# component in the code visible here — confirm they are still needed.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600&family=Outfit:wght@500;700&display=swap');
:root {
--primary: #6366f1;
--secondary: #a855f7;
--bg-dark: #0f172a;
--panel-bg: rgba(30, 41, 59, 0.7);
}
body {
background-color: var(--bg-dark);
color: #f1f5f9;
font-family: 'Inter', sans-serif;
}
.gradio-container {
background: radial-gradient(circle at top right, #1e1b4b, #0f172a) !important;
}
h1 {
font-family: 'Outfit', sans-serif;
background: linear-gradient(to right, #818cf8, #c084fc);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
font-weight: 700;
}
.glass {
background: var(--panel-bg) !important;
backdrop-filter: blur(12px);
border: 1px solid rgba(255, 255, 255, 0.1) !important;
border-radius: 16px !important;
box-shadow: 0 4px 30px rgba(0, 0, 0, 0.1);
transition: all 0.3s ease;
}
.glass:hover {
border: 1px solid rgba(255, 255, 255, 0.2) !important;
box-shadow: 0 8px 32px rgba(99, 102, 241, 0.2);
}
#maya_chat_log {
border-radius: 12px;
padding: 12px;
background: rgba(99, 102, 241, 0.1);
border: 1px solid rgba(99, 102, 241, 0.2);
margin-bottom: 10px;
animation: fadeIn 0.5s ease-out;
}
@keyframes fadeIn {
from { opacity: 0; transform: translateY(10px); }
to { opacity: 1; transform: translateY(0); }
}
#main_comic img {
border-radius: 12px;
box-shadow: 0 20px 25px -5px rgba(0, 0, 0, 0.3);
transition: transform 0.5s cubic-bezier(0.4, 0, 0.2, 1);
}
#main_comic img:hover {
transform: scale(1.02);
}
.gr-button-primary {
background: linear-gradient(135deg, var(--primary), var(--secondary)) !important;
border: none !important;
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
transition: all 0.3s ease !important;
font-weight: 600 !important;
}
.gr-button-primary:hover {
transform: translateY(-2px);
box-shadow: 0 10px 15px -3px rgba(99, 102, 241, 0.4);
}
#maya_chat_log::before {
content: "Maya is thinking...";
display: block;
font-size: 0.8em;
color: var(--primary);
margin-bottom: 5px;
opacity: 0.7;
}
#maya_audio_player { display: none; }
.boss-active { display: block !important; }
/* Fix for Audio Timeline Visibility */
#comic_audio_player .track {
background-color: rgba(0, 0, 0, 0.4) !important;
border-radius: 4px;
}
#comic_audio_player .time {
color: #818cf8 !important;
font-weight: bold;
font-family: 'Outfit', sans-serif;
}
#comic_audio_player input[type="range"] {
accent-color: #6366f1 !important;
}
"""
# UI
# Builds the three-tab Gradio interface (comic reader, text to speech, video
# dubbing) and wires its event handlers.
# NOTE(review): the nesting of the layout containers was reconstructed from a
# copy of this file that had lost its indentation — confirm the layout.
with gr.Blocks(title="Maya: Immersive Manga AI", css=CUSTOM_CSS) as demo:
    gr.Markdown("# 🎭 Maya: Immersive Tamil Manga AI")
    gr.Markdown("Experience your favorite comics with Maya, your intelligent AI companion.")
    # Per-session state: zero-based page index and path of the uploaded file.
    current_page = gr.State(0)
    comic_pdf_path = gr.State(None)
    with gr.Tabs():
        with gr.Tab("📖 Comic Reader Mode"):
            with gr.Row():
                with gr.Column(scale=5, min_width=300, elem_classes=["glass"]):
                    comic_display = gr.Image(label="Comic Page", type="filepath", height=600, elem_id="main_comic")
                    with gr.Row():
                        prev_btn = gr.Button("⬅️ Prev", scale=1)
                        page_status = gr.Label(value="Upload PDF", scale=2)
                        next_btn = gr.Button("Next ➡️", scale=1)
                with gr.Column(scale=4, min_width=300, elem_classes=["glass"]):
                    with gr.Group():
                        gr.Markdown("### ⚙️ Master Settings")
                        comic_upload = gr.File(label="Upload (PDF/EPUB)", file_types=[".pdf", ".epub"], height=80)
                        voice_style_comic = gr.Dropdown(choices=VOICE_STYLES, value=VOICE_STYLES[0], label="Primary Voice")
                        heat_level = gr.Slider(minimum=0, maximum=100, value=50, label="🌶️ Translation Heat Level")
                        share_btn = gr.Button("🔗 Share with Friends", variant="secondary", size="sm")
                        share_status = gr.Markdown("")
                        # Client-side only: copies the Space URL to the clipboard.
                        share_btn.click(None, None, None, js="""
() => {
const url = "https://huggingface.co/spaces/ranaspark/voice";
navigator.clipboard.writeText(url);
alert("Link Copied! Share it with your friends: " + url);
}
""")
                        auto_play = gr.Checkbox(label="🔄 Auto-Play Next Page", value=False)
                        read_page_btn = gr.Button("🔊 Read This Page", variant="primary")
                    with gr.Accordion("🎭 Character Memory", open=False):
                        # NOTE(review): the two character dropdowns are not
                        # used by any handler visible here.
                        char_a_voice = gr.Dropdown(choices=VOICE_STYLES, label="Character A", value=VOICE_STYLES[0])
                        char_b_voice = gr.Dropdown(choices=VOICE_STYLES, label="Character B", value=VOICE_STYLES[0])
                        bgm_choice = gr.Dropdown(choices=list(BGM_LINKS.keys()), value="None", label="Background Atmosphere")
                        bgm_player = gr.HTML(value="")
                    # Boss Key & Vibration JS
                    # The keydown handler ignores key presses made while typing
                    # in a text field, so entering a "b" does not toggle the
                    # boss screen.
                    gr.HTML("""
<div id="boss_screen" style="display:none; position:fixed; top:0; left:0; width:100%; height:100%; background:white; z-index:999999; overflow:hidden;">
<img src="https://i.imgur.com/8N6Rz7C.png" style="width:100%; height:100%; object-fit:cover;">
</div>
<script>
document.addEventListener('keydown', function(e) {
const target = e.target;
const typing = target && (target.tagName === 'INPUT' || target.tagName === 'TEXTAREA' || target.isContentEditable);
if (typing) return;
if (e.key === 'b' || e.key === 'B') {
const screen = document.getElementById('boss_screen');
if (screen) screen.classList.toggle('boss-active');
}
});
function triggerHaptic() {
if (navigator.vibrate) navigator.vibrate([100, 50, 100]);
return Array.from(arguments);
}
function updateTemp(level) {
const r = Math.floor(level * 2.55);
const b = 255 - r;
document.documentElement.style.setProperty('--bg-glow', `rgba(${r}, 50, ${b}, 0.3)`);
const comic = document.getElementById('main_comic');
if (comic) comic.style.border = `5px solid rgba(${r}, 50, ${b}, 0.5)`;
}
</script>
""")
                    comic_text = gr.Textbox(label="Original", lines=3)
                    comic_tamil = gr.Textbox(label="Tamil", lines=3)
                    comic_audio = gr.Audio(label="Speech", elem_id="comic_audio_player")
        with gr.Tab("✍️ Text to Speech"):
            with gr.Row():
                with gr.Column():
                    input_text = gr.Textbox(lines=10, label="✍️ Paste or Type your story here", placeholder="Enter English text...")
                    voice_style_std = gr.Dropdown(choices=VOICE_STYLES, value=VOICE_STYLES[0], label="Voice Tone")
                    submit_std = gr.Button("🚀 Generate Tamil Speech", variant="primary")
                with gr.Column():
                    out_text = gr.Textbox(label="Original Text (Cleaned)", lines=5)
                    out_tamil = gr.Textbox(label="Tamil Translation", lines=5)
                    out_audio = gr.Audio(label="Audio Output")
        with gr.Tab("🎥 Video Dubbing Studio"):
            gr.Markdown("### 🎬 Cinematic AI Video Dubbing")
            gr.Markdown("Process your videos with automated translation, multi-speaker voice cloning, and lip sync.")
            gr.HTML('<iframe src="/dubbing-ui/" width="100%" height="850px" style="border: none; border-radius: 12px; box-shadow: 0 4px 20px rgba(0,0,0,0.5); background: #0f172a;"></iframe>')

    # --- Dynamic Temperature & Heartbeat Speed Logic ---
    def update_mood(level, bgm):
        """Placeholder that returns a no-op update; not wired to any event here."""
        # JS to update color and potentially heartbeat speed if possible
        return gr.update()

    # Client-side only: recolours the comic border through updateTemp().
    heat_level.change(None, inputs=[heat_level], js="updateTemp")

    # --- BGM Logic ---
    def update_bgm(choice, level):
        """Return the HTML for the looping background track, or "" for none."""
        link = BGM_LINKS.get(choice, "")
        if not link:
            return ""
        # The playback rate is scaled by the heat level for every track.
        speed = 1.0 + (level / 100.0)  # 1.0x to 2.0x speed
        return f'<audio id="bgm_tag" autoplay loop><source src="{link}" type="audio/mpeg"></audio><script>const a=document.getElementById("bgm_tag"); a.volume=0.3; a.playbackRate={speed};</script>'

    bgm_choice.change(update_bgm, inputs=[bgm_choice, heat_level], outputs=[bgm_player])

    # Comic Logic
    def start_comic(file):
        """Show the first page of a freshly uploaded file and remember its path."""
        if not file:
            return None, "No file", 0, None
        img, status, page = load_comic_page(file.name, 0)
        return img, status, page, file.name

    comic_upload.change(start_comic, inputs=[comic_upload], outputs=[comic_display, page_status, current_page, comic_pdf_path])

    def go_next(pdf, page):
        """Advance one page, stopping at the last page of the document."""
        new_page = page + 1
        if pdf:
            # Clamp so repeated clicks cannot move the state past the end.
            last_page = max(get_pdf_page_count(pdf) - 1, 0)
            new_page = min(new_page, last_page)
        return load_comic_page(pdf, new_page)

    def go_prev(pdf, page):
        """Go back one page, stopping at the first page."""
        new_page = max(0, page - 1)
        return load_comic_page(pdf, new_page)

    # Navigation logic...
    next_btn.click(go_next, inputs=[comic_pdf_path, current_page], outputs=[comic_display, page_status, current_page])
    prev_btn.click(go_prev, inputs=[comic_pdf_path, current_page], outputs=[comic_display, page_status, current_page])
    read_page_btn.click(
        process_comic_page,
        inputs=[comic_pdf_path, current_page, voice_style_comic, heat_level],
        outputs=[comic_text, comic_tamil, comic_audio]
    )

    # --- Auto-Play Logic (JS Listener) ---
    # NOTE(review): relies on the invisible button still being present in the
    # DOM — confirm for the Gradio version in use.
    hidden_auto_next = gr.Button("Auto Next", visible=False, elem_id="hidden_auto_next")
    # This JS monitors the audio player and clicks the hidden button when it ends.
    # It is registered on several events, so a window-level flag makes sure
    # only one polling timer is ever started.
    js_listener = """
function() {
if (window.__mayaAutoNextTimer) return;
window.__mayaAutoNextTimer = setInterval(function() {
const audio = document.querySelector('#comic_audio_player audio');
if (audio && !audio.onended) {
audio.onended = function() {
const btn = document.querySelector('button#hidden_auto_next');
if (btn) btn.click();
};
}
}, 1000);
}
"""
    # Trigger the JS listener when audio is loaded
    comic_audio.change(None, None, None, js=js_listener)

    def handle_auto_play(is_enabled, pdf, page, voice, heat_level):
        """Load and read the next page when auto-play is enabled.

        Returns updates for (image, status, page state, original text, Tamil
        text, audio). Nothing changes when auto-play is off, no file is loaded
        or the current page is already the last one.
        """
        try:
            if not is_enabled or not pdf:
                return gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
            # 1. Go to next page
            new_page = page + 1
            if new_page >= get_pdf_page_count(pdf):
                # End of book: keep the page state on the last real page.
                return gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
            img, status, p_num = load_comic_page(pdf, new_page)
            if not img:  # Page could not be rendered
                return gr.update(), status, p_num, gr.update(), gr.update(), gr.update()
            # 2. Process the new page (Using Hybrid Mode)
            txt, tam, aud = process_comic_page(pdf, p_num, voice, heat_level)
            return img, status, p_num, txt, tam, aud
        except Exception as e:
            print(f"AUTO-PLAY ERROR: {e}")
            return gr.update(), f"Auto-Play Error: {e}", page, f"CRASH: {e}", "", None

    # The hidden button triggers the actual logic
    hidden_auto_next.click(
        handle_auto_play,
        inputs=[auto_play, comic_pdf_path, current_page, voice_style_comic, heat_level],
        outputs=[comic_display, page_status, current_page, comic_text, comic_tamil, comic_audio]
    )
    # Trigger JS listener on app start too
    demo.load(None, None, None, js=js_listener)

    # Standard Logic (Text Only)
    # The image and document inputs of the pipeline are fixed to None here.
    submit_std.click(
        process_standard_pipeline,
        inputs=[gr.State(None), gr.State(None), input_text, voice_style_std],
        outputs=[out_text, out_tamil, out_audio]
    )
if __name__ == "__main__":
    check_resources()

# The ASGI application is assembled at import time so that it is also
# reachable as "app:app" by an external server process.
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
import os
import gradio as gr
from dubbing_backend.main import app as api_app

app = FastAPI()
# Mount backend API
app.mount("/api", api_app)
# Mount React UI
# NOTE(review): "dist" is resolved against the current working directory —
# confirm the container starts the server from the project root.
if os.path.exists("dist"):
    app.mount("/dubbing-ui", StaticFiles(directory="dist", html=True))
# Mount Gradio at root
# Mounted last so the "/api" and "/dubbing-ui" mounts are matched first.
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    import uvicorn
    # Serve the object built above. The "app:app" import string would make
    # uvicorn import this file a second time (as module "app" next to
    # "__main__"), building the UI and the mounts twice.
    uvicorn.run(app, host="0.0.0.0", port=7860)