# Source: Hugging Face Space "yukee1992/Tts-api-new" (status: Paused).
# File size: 24,182 bytes.

import gradio as gr
import asyncio
import edge_tts
import tempfile
import os
import json
from pathlib import Path
from huggingface_hub import HfApi, upload_file
import uuid
from datetime import datetime
import shutil
import re
import requests
import threading
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
import subprocess
import mutagen.mp3

# Configuration
# All settings are read from environment variables at import time.
# HF_TOKEN has no default; the banner below reports whether it is set.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Dataset repository that receives the generated audio, SRT and metadata files.
DATASET_REPO = os.environ.get("DATASET_REPO", "yukee1992/video-project-images")
# Base URL of the status tracker; report_to_tracker() posts to "<TRACKER_URL>/update".
TRACKER_URL = os.environ.get("TRACKER_URL", "https://yukee1992-status-tracker.hf.space")

# Startup banner: prints the effective configuration. Only the presence of the
# token is shown, never its value.
print("=" * 60)
print("🚀 STARTING TTS SERVICE WITH API AND SRT CAPTIONS")
print("=" * 60)
print(f"📦 HF Dataset: {DATASET_REPO}")
print(f"🔑 HF Token: {'✅ Set' if HF_TOKEN else '❌ Missing'}")
print(f"📡 Tracker URL: {TRACKER_URL}")

# Initialize Hugging Face API
# The client is constructed even when HF_TOKEN is unset.
hf_api = HfApi(token=HF_TOKEN)

# =============================================
# Helper function to get audio duration
# =============================================
def get_audio_duration(audio_path):
    """Return the duration of an MP3 file in seconds, or None if unknown.

    The duration is read with mutagen first. If that fails for any reason,
    ffprobe is invoked as a fallback. Failures are logged and never raised,
    so callers can fall back to their own estimate.

    Args:
        audio_path: Path to the audio file on disk.

    Returns:
        The duration in seconds as a float, or None when neither mutagen
        nor ffprobe could determine it.
    """
    try:
        return mutagen.mp3.MP3(audio_path).info.length
    except Exception as exc:
        # Deliberately broad (but not bare): any mutagen failure should lead
        # to the ffprobe fallback, while KeyboardInterrupt/SystemExit still
        # propagate.
        print(f"⚠️ mutagen could not read duration ({exc}); trying ffprobe")

    # Fallback: use ffprobe if available
    try:
        result = subprocess.run(
            ['ffprobe', '-v', 'error', '-show_entries', 'format=duration',
             '-of', 'default=noprint_wrappers=1:nokey=1', audio_path],
            capture_output=True, text=True,
            timeout=30,  # never let a hung ffprobe block the request forever
        )
        return float(result.stdout.strip())
    except (OSError, ValueError, subprocess.SubprocessError) as exc:
        # OSError: ffprobe missing; ValueError: empty or non-numeric output;
        # SubprocessError: timeout.
        print(f"⚠️ ffprobe could not read duration ({exc})")
        return None

# =============================================
# SRT Generation Functions
# =============================================

def format_timestamp(seconds, max_seconds=3600):
    """Convert seconds to an SRT timestamp string (HH:MM:SS,mmm).

    The value is clamped to ``[0, max_seconds]`` and rounded to the nearest
    millisecond. All field arithmetic is done on integer milliseconds so
    that float representation error cannot drop a millisecond (for example
    1.001 s must render as ``,001`` rather than ``,000``).

    Args:
        seconds: Time offset in seconds (int or float).
        max_seconds: Upper clamp in seconds. Defaults to 3600 (one hour),
            the historical behaviour. Pass None to disable the upper clamp.

    Returns:
        The formatted timestamp, e.g. ``"00:01:01,250"``.
    """
    # Clamp first; negative (and NaN) inputs collapse to zero.
    seconds = max(0, seconds)
    if max_seconds is not None:
        seconds = min(seconds, max_seconds)

    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3600 * 1000)
    minutes, remainder = divmod(remainder, 60 * 1000)
    secs, milliseconds = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"

def create_short_srt_from_words(words_data, total_duration, words_per_subtitle=2):
    """Create SRT text with short subtitles built from word-boundary events.

    Each entry of ``words_data`` is a dict with ``text``, ``offset`` and
    ``duration``. Offsets and durations are divided by 1e7 to obtain
    seconds. When the last word ends more than 0.5 s away from
    ``total_duration``, all timestamps are scaled linearly so the subtitles
    span the measured audio length.

    Args:
        words_data: List of word-boundary dicts, in spoken order.
        total_duration: Measured audio duration in seconds. When it is None
            or not positive, the end of the last word is used instead.
        words_per_subtitle: Number of words grouped into one subtitle
            (default 2, minimum 1).

    Returns:
        The SRT document as a string, or "" when there are no words.
    """
    if not words_data:
        return ""

    # Find the maximum timestamp from words data
    max_word_time = max(
        (word['offset'] + word['duration']) / 1e7 for word in words_data
    )

    # Without a usable measured duration there is nothing to scale or clamp
    # against, so fall back to the word timeline itself.
    if not total_duration or total_duration <= 0:
        total_duration = max_word_time

    # If the max word time is close to total_duration, use it as is.
    # Otherwise scale timestamps to match the actual audio duration.
    if max_word_time > 0 and abs(max_word_time - total_duration) > 0.5:
        scale_factor = total_duration / max_word_time
        print(f"📊 Scaling timestamps: max_word_time={max_word_time:.2f}s, audio_duration={total_duration:.2f}s, scale={scale_factor:.3f}")
    else:
        scale_factor = 1.0

    group_size = max(1, int(words_per_subtitle))
    srt_entries = []
    counter = 1

    for i in range(0, len(words_data), group_size):
        chunk = words_data[i:i + group_size]

        # Start of the first word and end of the last word (scaled)
        start_time = (chunk[0]['offset'] / 1e7) * scale_factor
        end_time = ((chunk[-1]['offset'] + chunk[-1]['duration']) / 1e7) * scale_factor

        # Ensure end_time doesn't exceed total_duration, and that clamping
        # never produces a cue that ends before it starts.
        end_time = min(end_time, total_duration)
        start_time = min(start_time, end_time)

        # Combine text without spaces (Chinese)
        text = ''.join(word['text'] for word in chunk)

        srt_entries.append(str(counter))
        srt_entries.append(f"{format_timestamp(start_time)} --> {format_timestamp(end_time)}")
        srt_entries.append(text)
        srt_entries.append("")
        counter += 1

    print(f"✅ Created {counter-1} short subtitle entries")
    print(f"   First subtitle: {srt_entries[1]} -> {srt_entries[2]}")
    print(f"   Last subtitle ends at: {format_timestamp(total_duration)}")

    return "\n".join(srt_entries)

def create_fallback_srt(text, total_duration):
    """Build an SRT document without word timings.

    The text is split on Chinese and Latin punctuation, and any phrase
    longer than 15 characters is cut into 15-character pieces. The total
    duration is then divided evenly between the resulting phrases.

    Args:
        text: The text that was synthesized.
        total_duration: Audio duration in seconds. None or negative values
            are treated as 0.

    Returns:
        The SRT document as a string, or "" when ``text`` is empty.
    """
    # Nothing to caption. This also avoids dividing by zero phrases below.
    if not text:
        return ""

    total_duration = max(float(total_duration or 0), 0.0)

    # Split into smaller chunks (max 15 characters per subtitle for readability)
    max_chars = 15
    phrases = []

    # First split by punctuation, then cut long phrases by character count
    for phrase in re.split(r'[，,。！？.!?]', text):
        phrase = phrase.strip()
        if not phrase:
            continue
        phrases.extend(
            phrase[j:j + max_chars] for j in range(0, len(phrase), max_chars)
        )

    # Text made only of punctuation/whitespace: caption it verbatim in slices
    if not phrases:
        phrases = [text[i:i + max_chars] for i in range(0, len(text), max_chars)]

    duration_per_phrase = total_duration / len(phrases)
    last_index = len(phrases) - 1

    srt_entries = []
    for i, phrase in enumerate(phrases):
        # Derive times from the index instead of a running sum, so float
        # error does not accumulate; the last cue ends exactly at the total.
        start_time = i * duration_per_phrase
        end_time = total_duration if i == last_index else (i + 1) * duration_per_phrase

        srt_entries.append(str(i + 1))
        srt_entries.append(f"{format_timestamp(start_time)} --> {format_timestamp(end_time)}")
        srt_entries.append(phrase)
        srt_entries.append("")

    print(f"⚠️ Using fallback SRT: {len(phrases)} phrases over {total_duration:.1f}s")
    return "\n".join(srt_entries)

# =============================================
# Status Reporting Function
# =============================================
def report_to_tracker(project_id, service_type, status, file_urls=None, error=None):
    """Send a generation status update to the tracker without blocking.

    Nothing is sent when ``project_id`` is falsy. Otherwise the update is
    POSTed to ``{TRACKER_URL}/update`` from a daemon thread; any failure is
    printed and otherwise ignored.
    """
    if project_id:

        def _post_status():
            try:
                body = dict(
                    project_id=project_id,
                    service_type=service_type,
                    status=status,
                    file_urls=file_urls or [],
                )
                # The error field is only present on failures
                if error:
                    body["error"] = error

                reply = requests.post(f"{TRACKER_URL}/update", json=body, timeout=5)
                print(f"📤 Reported to tracker: {reply.status_code}")
            except Exception as exc:
                print(f"⚠️ Failed to report to tracker: {exc}")

        # Fire and forget: the caller does not wait for the tracker
        threading.Thread(target=_post_status, daemon=True).start()

# =============================================
# Chinese voice options (both female and male)
# =============================================
# Maps the integer voice_id accepted by the API to the voice name that is
# passed to edge_tts.Communicate in generate_speech().
VOICE_MAPPING = {
    # Female voices
    0: "zh-CN-XiaoxiaoNeural",
    1: "zh-CN-XiaoyiNeural",
    2: "zh-CN-XiaomengNeural",
    3: "zh-CN-XiaoxuanNeural",
    4: "zh-CN-XiaohanNeural",
    5: "zh-CN-XiaomoNeural",
    6: "zh-CN-XiaoruiNeural",
    
    # Male voices
    7: "zh-CN-YunxiNeural",
    8: "zh-CN-YunjianNeural",
    9: "zh-CN-YunyangNeural",
    10: "zh-CN-YunxiaNeural",
    11: "zh-CN-YunhaoNeural",
    12: "zh-CN-YunfengNeural",
    
    # Regional/dialect voices
    13: "zh-CN-liaoning-XiaobeiNeural",
    14: "zh-CN-shaanxi-XiaoniNeural",
    15: "zh-HK-HiuGaaiNeural",
    16: "zh-HK-HiuMaanNeural",
    17: "zh-HK-WanLungNeural",
    18: "zh-TW-HsiaoChenNeural",
    19: "zh-TW-HsiaoYuNeural",
    20: "zh-TW-YunJheNeural",
}

# Human-readable description for each voice_id; keys mirror VOICE_MAPPING.
# upload_to_dataset() uses the first space-separated word of a description
# as the voice name embedded in uploaded file names, so keep the short name
# first when editing an entry.
VOICE_DESCRIPTIONS = {
    0: "Xiaoxiao (Female) - Warm, caring sister",
    1: "Xiaoyi (Female) - Lively, cute sweet voice",
    2: "Xiaomeng (Female) - Childish, energetic loli voice",
    3: "Xiaoxuan (Female) - Mature, professional",
    4: "Xiaohan (Female) - Gentle, warm",
    5: "Xiaomo (Female) - Youthful, friendly",
    6: "Xiaorui (Female) - Kind, gentle senior",
    7: "Yunxi (Male) - Clear, professional broadcast",
    8: "Yunjian (Male) - Cool, calm, sports commentary",
    9: "Yunyang (Male) - Authoritative news anchor",
    10: "Yunxia (Male) - Lively, sunshine, anime style",
    11: "Yunhao (Male) - Warm, friendly, optimistic",
    12: "Yunfeng (Male) - Deep, mature, serious",
    13: "Xiaobei (Female) - Cheerful Liaoning dialect",
    14: "Xiaoni (Female) - Bright Shaanxi dialect",
    15: "HiuGaai (Female Cantonese) - Hong Kong style",
    16: "HiuMaan (Female Cantonese) - Hong Kong style",
    17: "WanLung (Male Cantonese) - Hong Kong style",
    18: "HsiaoChen (Female Taiwanese) - Taiwan Mandarin",
    19: "HsiaoYu (Female Taiwanese) - Taiwan Mandarin",
    20: "YunJhe (Male Taiwanese) - Taiwan Mandarin",
}

# Create FastAPI app
# The REST endpoints below are registered on this app; the Gradio UI is
# mounted onto the same app at the bottom of the file.
fastapi_app = FastAPI(title="TTS API")

# Add CORS middleware
# Every origin, method and header is allowed.
# NOTE(review): wildcard origins are combined with allow_credentials=True.
# Browsers do not accept a literal "*" origin on credentialed requests, so
# confirm whether credentials are needed at all or whether the origin list
# should be restricted.
fastapi_app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

def sanitize_folder_name(title):
    """Convert a video title to a safe folder name.

    Characters other than word characters, whitespace and hyphens are
    removed, runs of whitespace/hyphens become a single underscore, and
    leading/trailing underscores are stripped.

    Args:
        title: The video title. None is treated as an empty title.

    Returns:
        The sanitized name. When nothing usable remains (empty title, or a
        title made only of punctuation), ``"untitled"`` is returned so the
        name never produces an empty path segment in the dataset repo.
    """
    title = "" if title is None else str(title)
    safe_name = re.sub(r'[^\w\s-]', '', title)
    safe_name = re.sub(r'[-\s]+', '_', safe_name)
    return safe_name.strip('_') or "untitled"

def get_emotion_params(emotion_id):
    """Look up the rate/pitch/volume adjustments for an emotion ID.

    IDs 0-4 select a preset; any other ID yields the preset for ID 0.
    """
    fields = ("rate", "pitch", "volume")
    # One row per emotion ID, in ID order
    presets = (
        ("+0%", "+0Hz", "+0%"),
        ("+15%", "+30Hz", "+10%"),
        ("-10%", "-20Hz", "-10%"),
        ("+25%", "+50Hz", "+15%"),
        ("+5%", "+15Hz", "+5%"),
    )
    table = {index: dict(zip(fields, row)) for index, row in enumerate(presets)}

    try:
        return table[emotion_id]
    except KeyError:
        return table[0]

def upload_to_dataset(audio_path, srt_path, metadata, video_title, project_id=None):
    """Upload the audio, SRT and a metadata JSON file to the HF dataset.

    Files are stored under ``data/projects/<folder>/`` in ``DATASET_REPO``,
    where ``<folder>`` is ``project_id`` when given and otherwise the
    sanitized ``video_title``.

    Args:
        audio_path: Local path of the generated MP3.
        srt_path: Local path of the generated SRT file.
        metadata: Dict with ``text``, ``voice_id``, ``emotion_id``,
            ``speed`` and ``parameters``; ``duration_seconds`` and
            ``word_count`` are optional.
        video_title: Title of the video the audio belongs to.
        project_id: Optional project identifier used as the folder name.

    Returns:
        On success: ``{"success": True, "audio_file_url", "srt_file_url",
        "project_id", "metadata"}``. On any failure:
        ``{"success": False, "error": <message>}``. This function never
        raises.
    """
    try:
        # NOTE(review): project_id is used verbatim as a repo path segment.
        folder_name = project_id if project_id else sanitize_folder_name(video_title)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        file_id = str(uuid.uuid4())[:8]

        voice_name = VOICE_DESCRIPTIONS[metadata["voice_id"]].split(" ")[0]

        # Unknown or non-integral emotion IDs are labelled "neutral",
        # matching the fallback in get_emotion_params(), instead of raising.
        emotion_names = ["neutral", "happy", "sad", "excited", "frustrated"]
        emotion_id = metadata["emotion_id"]
        emotion_name = emotion_names[0]
        if (
            isinstance(emotion_id, (int, float))
            and 0 <= emotion_id < len(emotion_names)
            and int(emotion_id) == emotion_id
        ):
            emotion_name = emotion_names[int(emotion_id)]

        audio_filename = f"{timestamp}_{voice_name}_{emotion_name}_{file_id}.mp3"
        srt_filename = f"{timestamp}_{voice_name}_{emotion_name}_{file_id}.srt"

        audio_dataset_path = f"data/projects/{folder_name}/audio/{audio_filename}"
        srt_dataset_path = f"data/projects/{folder_name}/subtitles/{srt_filename}"

        # Upload files
        upload_file(
            path_or_fileobj=audio_path,
            path_in_repo=audio_dataset_path,
            repo_id=DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN
        )

        upload_file(
            path_or_fileobj=srt_path,
            path_in_repo=srt_dataset_path,
            repo_id=DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN
        )

        # NOTE(review): these are "/blob/main/" URLs; confirm consumers do
        # not expect a direct-download ("/resolve/main/") URL instead.
        audio_file_url = f"https://huggingface.co/datasets/{DATASET_REPO}/blob/main/{audio_dataset_path}"
        srt_file_url = f"https://huggingface.co/datasets/{DATASET_REPO}/blob/main/{srt_dataset_path}"

        metadata_entry = {
            "file_id": file_id,
            "type": "audio_with_subtitles",
            "audio_filename": audio_filename,
            "srt_filename": srt_filename,
            "audio_dataset_path": audio_dataset_path,
            "srt_dataset_path": srt_dataset_path,
            "audio_file_url": audio_file_url,
            "srt_file_url": srt_file_url,
            "video_title": video_title,
            "project_id": folder_name,
            "timestamp": timestamp,
            "text": metadata["text"],
            "voice_id": metadata["voice_id"],
            "voice_name": voice_name,
            "emotion_id": metadata["emotion_id"],
            "emotion_name": emotion_name,
            "speed": metadata["speed"],
            "duration_seconds": metadata.get("duration_seconds"),
            "word_count": metadata.get("word_count"),
            "parameters": metadata["parameters"]
        }

        audio_metadata_path = f"data/projects/{folder_name}/metadata/audio_{file_id}.json"
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
            json.dump(metadata_entry, f, indent=2)
            temp_meta_path = f.name

        try:
            upload_file(
                path_or_fileobj=temp_meta_path,
                path_in_repo=audio_metadata_path,
                repo_id=DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN
            )
        finally:
            # Remove the temp file even when the upload fails, so failed
            # requests do not leak files.
            try:
                os.unlink(temp_meta_path)
            except OSError:
                pass

        return {
            "success": True,
            "audio_file_url": audio_file_url,
            "srt_file_url": srt_file_url,
            "project_id": folder_name,
            "metadata": metadata_entry
        }

    except Exception as e:
        return {
            "success": False,
            "error": str(e)
        }

async def generate_speech(text, voice_id, emotion_id, speed, video_title, project_id=None, subtitle_style="short"):
    """Synthesize speech, build matching SRT captions and upload both.

    The audio is streamed from edge-tts while word-boundary events are
    collected. Captions are built from those events, or from a
    punctuation-based fallback when none arrive. Both files are uploaded
    via upload_to_dataset(), and when ``project_id`` is set the outcome is
    reported to the tracker.

    Args:
        text: Text to synthesize; must be a non-blank string.
        voice_id: Key into VOICE_MAPPING. Unknown IDs fall back to voice 7.
        emotion_id: Emotion preset ID (see get_emotion_params).
        speed: Speed multiplier; 1.0 is normal. Each 0.1 adds 5 percentage
            points to the emotion preset's rate.
        video_title: Title used for the dataset folder when no project_id.
        project_id: Optional project identifier (folder name, tracker key).
        subtitle_style: Accepted for interface compatibility; not used by
            this function.

    Returns:
        A dict with ``success: True`` plus URLs, duration and counts, or
        ``{"success": False, "error": <message>}``. This coroutine does not
        raise for synthesis or upload errors.
    """
    temp_dir = None
    try:
        # Reject blank input before any network or disk work is done.
        if not isinstance(text, str) or not text.strip():
            raise ValueError("No text provided")

        # Resolve the voice once, so the synthesized voice and the
        # description recorded in the metadata always refer to the same
        # voice. Previously an unknown ID was synthesized with the fallback
        # and then failed on the description lookup.
        if voice_id not in VOICE_MAPPING:
            print(f"⚠️ Unknown voice_id {voice_id!r}; falling back to 7")
            voice_id = 7
        voice = VOICE_MAPPING[voice_id]
        emotion_params = get_emotion_params(emotion_id)

        # int() accepts the explicit sign, e.g. "+15" or "-10"
        rate_percentage = int(emotion_params["rate"].rstrip("%"))
        adjusted_rate = rate_percentage + int((speed - 1.0) * 50)
        rate = f"{adjusted_rate:+d}%"

        temp_dir = tempfile.mkdtemp()
        local_audio_path = os.path.join(temp_dir, "temp_audio.mp3")
        local_srt_path = os.path.join(temp_dir, "temp_subtitles.srt")

        # Create communicate instance
        communicate = edge_tts.Communicate(
            text,
            voice,
            rate=rate,
            pitch=emotion_params["pitch"],
            volume=emotion_params["volume"]
        )

        # Collect word boundaries and audio bytes from the stream
        words_data = []
        audio_data = bytearray()

        print(f"🎤 Generating TTS and collecting word boundaries...")

        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                audio_data.extend(chunk["data"])
            elif chunk["type"] == "WordBoundary":
                word_info = {
                    'text': chunk['text'],
                    'offset': chunk['offset'],
                    'duration': chunk['duration']
                }
                words_data.append(word_info)

                # Log only the first few words to keep the output short
                if len(words_data) <= 5:
                    start_sec = word_info['offset'] / 1e7
                    print(f"  Word {len(words_data)}: '{word_info['text']}' at {start_sec:.2f}s")

        # Never upload an empty file as if it were valid audio
        if not audio_data:
            raise RuntimeError("No audio received from TTS service")

        # Save audio file
        with open(local_audio_path, "wb") as f:
            f.write(audio_data)

        print(f"✅ Audio saved. Total words collected: {len(words_data)}")

        # Get actual audio duration
        total_duration = get_audio_duration(local_audio_path)

        if total_duration is None or total_duration <= 0:
            # Fallback: estimate from last word
            if words_data:
                last_word = words_data[-1]
                total_duration = (last_word['offset'] + last_word['duration']) / 1e7
                print(f"📊 Estimated duration from words: {total_duration:.2f}s")
            else:
                total_duration = max(len(text) / 3.5, 5)
                print(f"📊 Estimated duration from text length: {total_duration:.2f}s")
        else:
            print(f"🎵 Actual audio duration: {total_duration:.2f} seconds")

        # Create SRT with proper timing
        if words_data:
            srt_content = create_short_srt_from_words(words_data, total_duration)
        else:
            print("⚠️ No word boundaries collected - using fallback")
            srt_content = create_fallback_srt(text, total_duration)

        # Count cues for both paths, so the fallback no longer reports 0
        caption_count = srt_content.count('-->')
        print(f"📝 Generated {caption_count} subtitle entries")
        print(f"   Last subtitle timestamp: {srt_content.split('-->')[-1].strip() if '-->' in srt_content else 'N/A'}")

        # Save SRT file
        with open(local_srt_path, "w", encoding="utf-8") as f:
            f.write(srt_content)

        metadata = {
            "text": text,
            "voice_id": voice_id,
            "voice_description": VOICE_DESCRIPTIONS[voice_id],
            "emotion_id": emotion_id,
            "speed": speed,
            "duration_seconds": total_duration,
            "word_count": len(words_data),
            "parameters": {
                "rate": rate,
                "pitch": emotion_params["pitch"],
                "volume": emotion_params["volume"]
            }
        }

        # Upload both files. The upload is blocking network I/O, so run it
        # in the default executor to keep the event loop responsive.
        loop = asyncio.get_running_loop()
        upload_result = await loop.run_in_executor(
            None, upload_to_dataset,
            local_audio_path, local_srt_path, metadata, video_title, project_id
        )

        if not upload_result["success"]:
            if project_id:
                report_to_tracker(
                    project_id=project_id,
                    service_type="tts",
                    status="failed",
                    error=upload_result["error"]
                )

            return {
                "success": False,
                "error": upload_result["error"]
            }

        result = {
            "success": True,
            "message": f"Audio ({total_duration:.1f}s) and SRT captions generated",
            "video_title": video_title,
            "project_id": upload_result["project_id"],
            "audio_url": upload_result["audio_file_url"],
            "srt_url": upload_result["srt_file_url"],
            "audio_duration": total_duration,
            "word_count": len(words_data),
            "subtitle_count": caption_count,
            "metadata": upload_result["metadata"]
        }

        if project_id:
            report_to_tracker(
                project_id=project_id,
                service_type="tts",
                status="completed",
                file_urls=[upload_result["audio_file_url"], upload_result["srt_file_url"]]
            )

        return result

    except Exception as e:
        print(f"❌ Error in generate_speech: {str(e)}")

        if project_id:
            report_to_tracker(
                project_id=project_id,
                service_type="tts",
                status="failed",
                error=str(e)
            )

        return {
            "success": False,
            "error": str(e)
        }

    finally:
        # Clean up on every path; a cleanup failure must not mask the result
        if temp_dir and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir, ignore_errors=True)

# =============================================
# FASTAPI ENDPOINTS
# =============================================

@fastapi_app.get("/")
async def root():
    """Describe the service: its name, version and available endpoints."""
    endpoints = {
        "generate": "POST /api/generate",
        "health": "GET /api/health"
    }
    service_info = {
        "name": "TTS API with SRT Captions",
        "version": "2.1",
        "endpoints": endpoints
    }
    return service_info

@fastapi_app.get("/api/health")
async def health():
    """Health probe: always reports the TTS service as healthy."""
    return dict(status="healthy", service="tts")

@fastapi_app.post("/api/generate")
async def generate_tts(request: dict):
    """API endpoint - returns permanent dataset URLs for audio and SRT.

    Request body (JSON object): ``text`` (required), ``voice_id``
    (default 7), ``emotion_id`` (default 0), ``speed`` (default 1.0),
    ``video_title`` (default "Untitled Video"), ``project_id`` and
    ``subtitle_style`` (default "short"). A JSON ``null`` is treated like a
    missing field.

    Returns:
        The generate_speech() result on success. On a validation or
        unexpected error: ``{"success": False, "status": "error",
        "error": <message>}``. The ``success`` key is always present, so
        clients can branch on it for every response; ``status`` is kept for
        existing clients.
    """
    def _error(message):
        return {"success": False, "status": "error", "error": message}

    def _field(name, default=None):
        # An explicit null behaves like an absent key
        value = request.get(name)
        return default if value is None else value

    try:
        text = _field("text", "")
        voice_id = int(_field("voice_id", 7))
        emotion_id = int(_field("emotion_id", 0))
        speed = float(_field("speed", 1.0))
        video_title = _field("video_title", "Untitled Video")
        project_id = _field("project_id")
        subtitle_style = _field("subtitle_style", "short")

        if voice_id not in VOICE_MAPPING:
            return _error(f"Invalid voice_id: {voice_id}")

        # Whitespace-only text cannot be synthesized either
        if not isinstance(text, str) or not text.strip():
            return _error("No text provided")

        result = await generate_speech(text, voice_id, emotion_id, speed, video_title, project_id, subtitle_style)
        return result

    except Exception as e:
        return _error(str(e))

# =============================================
# GRADIO INTERFACE
# =============================================
def _run_gradio_generation(text, voice_id, emotion_id, speed, video_title, project_id, style):
    """Run generate_speech() for the Gradio button and shape its outputs.

    The click event declares three outputs (audio, SRT file, JSON), so
    three values must be returned. generate_speech() removes its local
    temp files before returning, so there is no local file to hand to the
    audio/file components; they are cleared and the JSON panel carries the
    uploaded URLs.
    """
    result = asyncio.run(
        generate_speech(
            text,
            voice_id,
            int(emotion_id),  # sliders may deliver the value as a float
            speed,
            video_title,
            (project_id or "").strip() or None,  # blank textbox means "no project"
            style,
        )
    )
    return None, None, result


with gr.Blocks(title="TTS with SRT Captions") as demo:
    gr.Markdown("# 🎙️ TTS API with Accurate SRT Captions")
    gr.Markdown("Generates short, readable subtitles that exactly match the audio duration")
    
    with gr.Row():
        with gr.Column(scale=1):
            video_title_input = gr.Textbox(
                label="🎬 Video Title",
                placeholder="Enter video title...",
                value="My Video"
            )
            project_id_input = gr.Textbox(
                label="📁 Project ID (optional)",
                placeholder="Enter project ID if known..."
            )
            text_input = gr.Textbox(
                label="📝 Text to synthesize",
                placeholder="输入中文...",
                lines=4,
                value="乌鲁登嘉楼发生致命车祸，一辆休旅车疑未察觉前方路段因土崩已封闭，失控坠入约61公尺深山沟，导致一名男教师与其未婚妻被抛出车外，当场身亡。"
            )
            
            # type="value" passes the voice ID from each (label, value) pair
            # to the handler. With type="index" the handler received the
            # list position (0, 1, 2), which selected the wrong voices.
            voice_dropdown = gr.Dropdown(
                label="🎤 Voice Selection",
                choices=[
                    ("👩 Xiaoxiao - Warm Sister (Female)", 0),
                    ("👨 Yunxi - Professional (Male)", 7),
                    ("👨 Yunjian - Cool Commentary (Male)", 8),
                ],
                value=7,
                type="value"
            )
            
            subtitle_style = gr.Radio(
                label="📝 Subtitle Style",
                choices=[("Short (2 words per line)", "short")],
                value="short",
                type="value"
            )
            
            emotion_slider = gr.Slider(
                minimum=0, maximum=4, step=1, value=0, 
                label="😊 Emotion",
                info="0:Neutral 1:Happy 2:Sad 3:Excited 4:Frustrated"
            )
            speed_slider = gr.Slider(
                minimum=0.5, maximum=2.0, step=0.1, value=1.0, 
                label="⚡ Speed"
            )
            generate_btn = gr.Button("🎵 Generate Audio + SRT", variant="primary")
        
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Generated Audio", type="filepath")
            srt_output = gr.File(label="SRT Captions", file_types=[".srt"])
            json_output = gr.JSON(label="Response Data")
    
    # The handler returns one value per declared output component
    generate_btn.click(
        fn=_run_gradio_generation,
        inputs=[text_input, voice_dropdown, emotion_slider, speed_slider, video_title_input, project_id_input, subtitle_style],
        outputs=[audio_output, srt_output, json_output]
    )

# =============================================
# MAIN
# =============================================
# Serve the Gradio UI and the REST endpoints from one ASGI application.
# NOTE(review): the Gradio UI is mounted at "/" while a GET "/" API route is
# also declared above; confirm which one is meant to answer the root path.
app = gr.mount_gradio_app(fastapi_app, demo, path="/")

# When run as a script, listen on all interfaces on port 7860.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)