Ttspro2

Running

File size: 13,874 Bytes

eb4ceda
83ac874
db88be5
83ac874
 
db88be5
 
 
83ac874
 
 
 
 
eb4ceda
 
 
83ac874
 
42a6235
 
83ac874
d9cad98
4a1e151
 
 
 
 
 
af0842c
8150db4
eb4ceda
50ab425
 
83ac874
af0842c
8150db4
af0842c
 
e4cf6e1
af0842c
 
eb4ceda
af0842c
50ab425
 
eb4ceda
 
 
 
50ab425
 
 
 
 
 
eb4ceda
1ec58e2
eb4ceda
 
42a6235
eb4ceda
e4cf6e1
 
 
 
 
 
 
 
1ddb4e8
42a6235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb4ceda
 
 
42a6235
 
 
db88be5
e4cf6e1
 
42a6235
 
 
e4cf6e1
42a6235
4e27563
eb4ceda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50ab425
eb4ceda
 
 
 
 
 
 
 
 
 
cf1843b
eb4ceda
f7677f8
42a6235
 
 
 
50ab425
eb4ceda
42a6235
eb4ceda
42a6235
cf1843b
eb4ceda
 
3eb70c5
 
eb4ceda
 
42a6235
 
83ac874
eb4ceda
 
e4cf6e1
eb4ceda
 
 
 
 
e4cf6e1
eb4ceda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83ac874
42a6235
eb4ceda
 
 
e4cf6e1
42a6235
 
cf1843b
83ac874
af0842c
83ac874
42a6235
83ac874
 
 
 
 
eb4ceda
 
 
42a6235
83ac874
50ab425
83ac874
 
 
 
 
 
 
eb4ceda
 
 
 
83ac874
 
 
 
 
eb4ceda
83ac874
eb4ceda
83ac874
8150db4
83ac874
 
 
8150db4
73c0690
83ac874
50ab425

# app.py - Worker version without text splitting (No Splitting)

import os
import sys
import traceback
import re
import struct
import time
import uuid
import shutil
import logging
import mimetypes
import threading
import random
import asyncio
import wave
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from google import genai
from google.genai import types
import uvicorn

# pydub is an optional dependency: it is only needed by merge_audio_files_func.
# PYDUB_AVAILABLE records whether the import succeeded so callers can skip merging.
try:
    from pydub import AudioSegment
    PYDUB_AVAILABLE = True
except ImportError:
    PYDUB_AVAILABLE = False

# Configure the root logger once for the whole worker (timestamp - level - message).
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

# --- API key management settings ---
# Maps an API key string to the genai.Client created for it (filled lazily by
# get_random_api_key_and_client).
GEMINI_CLIENTS_CACHE = {}
# Guards every read/write of GEMINI_CLIENTS_CACHE.
CLIENT_CACHE_LOCK = threading.Lock()

# All configured Gemini API keys; populated by _init_api_keys() from the environment.
ALL_API_KEYS: list[str] = []

def _init_api_keys():
    """Load the Gemini API keys from the ``ALL_GEMINI_API_KEYS`` environment variable.

    The variable is read as a comma-separated list; surrounding whitespace is
    stripped and empty entries are discarded. The result replaces the
    module-level ``ALL_API_KEYS`` only when the variable is set and non-empty.
    The number of keys is always logged, and a warning is emitted when the
    list ends up empty.
    """
    global ALL_API_KEYS
    raw_value = os.environ.get("ALL_GEMINI_API_KEYS")
    if raw_value:
        stripped_entries = (entry.strip() for entry in raw_value.split(','))
        ALL_API_KEYS = [entry for entry in stripped_entries if entry]
    logging.info(f"✅ تعداد {len(ALL_API_KEYS)} کلید API جیمینای شناسایی و بارگذاری شد.")
    if not ALL_API_KEYS:
        logging.warning("⛔️ هشدار: هیچ Secret با نام ALL_GEMINI_API_KEYS یافت نشد!")

def get_random_api_key_and_client():
    """Pick a random configured API key and return it with its Gemini client.

    Clients are created on first use and kept in ``GEMINI_CLIENTS_CACHE``;
    cache access happens under ``CLIENT_CACHE_LOCK``.

    Returns:
        ``(api_key, client)``, or ``(None, None)`` when no keys are configured.
    """
    if not ALL_API_KEYS:
        return None, None
    chosen_key = random.choice(ALL_API_KEYS)
    with CLIENT_CACHE_LOCK:
        try:
            cached_client = GEMINI_CLIENTS_CACHE[chosen_key]
        except KeyError:
            # First time this key is used: build its client and remember it.
            cached_client = genai.Client(api_key=chosen_key)
            GEMINI_CLIENTS_CACHE[chosen_key] = cached_client
    return chosen_key, cached_client

# Model used by generate_audio_chunk_standard_with_retry (generate_content call).
FIXED_MODEL_NAME_STANDARD = "gemini-2.5-flash-preview-tts"
# Model used by generate_audio_live_with_retry (Live API session).
FIXED_MODEL_NAME_LIVE = "models/gemini-2.5-flash-native-audio-preview-12-2025"
# Passed to smart_text_split as max_size; that function currently ignores it.
DEFAULT_MAX_CHUNK_SIZE = 3800
# NOTE(review): not referenced anywhere in the visible code — confirm whether it is still needed.
DEFAULT_SLEEP_BETWEEN_REQUESTS = 5

def save_binary_file(file_name, data):
    """Write ``data`` to ``file_name`` in binary mode.

    Returns:
        ``file_name`` on success, or ``None`` if anything went wrong (the
        error is logged, never raised).
    """
    try:
        with open(file_name, "wb") as handle:
            handle.write(data)
    except Exception as e:
        logging.error(f"❌ خطا در ذخیره فایل {file_name}: {e}")
        return None
    return file_name

def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
    """Prefix raw audio bytes with a 44-byte RIFF/WAVE header.

    The sample width and sample rate are taken from ``mime_type`` via
    ``parse_audio_mime_type``; the channel count is fixed at one.

    Args:
        audio_data: Raw sample bytes to wrap.
        mime_type: MIME string describing the samples (e.g. ``audio/L16;rate=24000``).

    Returns:
        The header followed by ``audio_data`` unchanged.
    """
    audio_format = parse_audio_mime_type(mime_type)
    sample_bits = audio_format["bits_per_sample"]
    sample_rate = audio_format["rate"]
    channel_count = 1
    payload_length = len(audio_data)
    frame_bytes = channel_count * (sample_bits // 8)
    riff_header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF",
        36 + payload_length,       # RIFF chunk size: rest of header + payload
        b"WAVE",
        b"fmt ",
        16,                        # size of the fmt sub-chunk
        1,                         # format tag 1 (PCM in the RIFF/WAVE spec)
        channel_count,
        sample_rate,
        sample_rate * frame_bytes,  # byte rate
        frame_bytes,               # block align
        sample_bits,
        b"data",
        payload_length,
    )
    return riff_header + audio_data

def parse_audio_mime_type(mime_type: str) -> dict[str, int]:
    """Extract sample width and sample rate from an audio MIME string.

    The string is split on ``;``. A ``rate=<int>`` parameter sets the sample
    rate and an ``audio/L<int>`` media type sets the bits per sample. Both are
    matched case-insensitively, since MIME type and parameter names are not
    case-sensitive. Anything missing or unparsable keeps its default.

    Args:
        mime_type: e.g. ``"audio/L16;codec=pcm;rate=24000"``. ``None`` or an
            empty string yields the defaults.

    Returns:
        ``{"bits_per_sample": int, "rate": int}``; defaults are 16 and 24000.
    """
    bits, rate = 16, 24000
    linear_prefix = "audio/l"
    for param in (mime_type or "").split(";"):
        token = param.strip().lower()
        if token.startswith("rate="):
            try:
                rate = int(token.partition("=")[2])
            except ValueError:
                pass  # malformed rate: keep the default
        elif token.startswith(linear_prefix):
            try:
                bits = int(token[len(linear_prefix):])
            except ValueError:
                pass  # not an "audio/L<bits>" type: keep the default
    return {"bits_per_sample": bits, "rate": rate}

def smart_text_split(text, max_size=3800):
    """Return ``text`` wrapped in a one-element list.

    Text splitting has been removed on purpose: the whole input is handed to
    the model as a single chunk. ``max_size`` is accepted but not used.
    """
    single_chunk = [text]
    return single_chunk

def merge_audio_files_func(file_paths, output_path):
    """Concatenate audio files into one WAV file using pydub.

    Each existing file is appended in order; every file except the one at the
    last list position is followed by 150 ms of silence. Missing files are
    skipped with a warning.

    Returns:
        ``True`` if the export succeeded, ``False`` if pydub is unavailable or
        any error occurred (errors are logged, not raised).
    """
    if not PYDUB_AVAILABLE:
        logging.warning("⚠️ pydub برای ادغام در دسترس نیست.")
        return False
    try:
        merged = AudioSegment.empty()
        last_index = len(file_paths) - 1
        for index, path in enumerate(file_paths):
            if not os.path.exists(path):
                logging.warning(f"⚠️ فایل برای ادغام پیدا نشد: {path}")
                continue
            if index < last_index:
                gap = AudioSegment.silent(duration=150)
            else:
                gap = AudioSegment.empty()
            merged += AudioSegment.from_file(path) + gap
        merged.export(output_path, format="wav")
        return True
    except Exception as e:
        logging.error(f"❌ خطا در ادغام فایل‌های صوتی: {e}")
        return False

# --- Gemini Live logic ---
async def generate_audio_live_with_retry(text, prompt, voice, session_id):
    """Synthesize ``text`` through the Gemini Live API, retrying on failure.

    Each attempt picks a random API key, opens a Live session on
    ``FIXED_MODEL_NAME_LIVE`` and accumulates every audio payload received.
    Up to 50 attempts are made, pausing 0.5 s between them.

    Args:
        text: Text to be read aloud.
        prompt: Optional tone description; when truthy it is embedded in the
            instruction sent to the model.
        voice: Name of the prebuilt voice to use.
        session_id: Short identifier used only to tag log lines.

    Returns:
        A ``bytearray`` with the received audio bytes, or ``None`` when no key
        is configured or every attempt failed.
    """
    MAX_RETRIES = 50
    live_config = types.LiveConnectConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=voice)
            )
        ),
    )
    for attempt in range(MAX_RETRIES):
        selected_api_key, _ = get_random_api_key_and_client()
        if not selected_api_key:
            break
        # The Live API is reached through the v1beta endpoint, so a dedicated
        # client is built here instead of reusing the cached one.
        client = genai.Client(http_options={"api_version": "v1beta"}, api_key=selected_api_key)
        if prompt:
            # NOTE(review): this variant carries no per-request ID, unlike the
            # default one below — confirm that is intentional.
            tts_prompt = f"With a {prompt} tone, please read: '{text}'"
        else:
            unique_id_for_req = str(uuid.uuid4())[:8]
            tts_prompt = f"Please read the following text naturally: '{text}' [ID: {unique_id_for_req}]"
        try:
            logging.info(f"[{session_id}] (Live) تلاش {attempt+1} با کلید ...{selected_api_key[-4:]}")
            audio_buffer = bytearray()
            async with client.aio.live.connect(model=FIXED_MODEL_NAME_LIVE, config=live_config) as session:
                await session.send(input=tts_prompt, end_of_turn=True)
                async for response in session.receive():
                    if response.data:
                        audio_buffer.extend(response.data)
            if len(audio_buffer) > 0:
                logging.info(f"[{session_id}] ✅ (Live) موفقیت‌آمیز.")
                return audio_buffer
            raise Exception("بافر صوتی خالی بود.")
        except Exception as e:
            logging.warning(f"[{session_id}] ⚠️ (Live) خطا در تلاش {attempt+1}: {e}")
            # Yield to the event loop while backing off (time.sleep would block
            # it); there is nothing to wait for after the final attempt.
            if attempt + 1 < MAX_RETRIES:
                await asyncio.sleep(0.5)
    return None

def save_pcm_to_wav(pcm_data, output_path):
    """Write raw PCM bytes to ``output_path`` as a WAV file.

    The file is written as mono, 2 bytes per sample, 24000 frames per second.

    Returns:
        ``True`` on success, ``False`` if writing failed (the error is logged).
    """
    channel_count = 1
    sample_width_bytes = 2
    frames_per_second = 24000
    try:
        with wave.open(output_path, 'wb') as wav_out:
            wav_out.setnchannels(channel_count)
            wav_out.setsampwidth(sample_width_bytes)
            wav_out.setframerate(frames_per_second)
            wav_out.writeframes(pcm_data)
    except Exception as e:
        logging.error(f"خطا در تبدیل PCM به WAV: {e}")
        return False
    return True

# --- Gemini Standard logic (revised to honor retry_limit) ---
def generate_audio_chunk_standard_with_retry(chunk_text, prompt_text, voice, temp, session_id, retry_limit):
    """Synthesize one text chunk with the standard TTS model, retrying on failure.

    Each attempt picks a random API key/client and calls ``generate_content``
    on ``FIXED_MODEL_NAME_STANDARD``. An attempt fails when the call raises or
    when the response carries no inline audio; both cases are logged and
    followed by a 0.5 s pause before the next attempt.

    Args:
        chunk_text: Text to synthesize.
        prompt_text: Optional style hint; when non-blank it is appended to the
            text in parentheses.
        voice: Name of the prebuilt voice to use.
        temp: Sampling temperature passed to the model.
        session_id: Short identifier used only to tag log lines.
        retry_limit: Maximum number of attempts (supplied by the caller).

    Returns:
        The ``inline_data`` object of the first response part, or ``None`` if
        every attempt failed or no client could be obtained.

    Raises:
        Exception: If no API key is configured at all.
    """
    if not ALL_API_KEYS:
        raise Exception("هیچ کلید API در دسترس نیست.")

    # The attempt budget is dictated by the caller (the manager).
    MAX_RETRIES = retry_limit
    final_text = f'{chunk_text}({prompt_text})' if prompt_text and prompt_text.strip() else chunk_text

    for attempt in range(MAX_RETRIES):
        _, client = get_random_api_key_and_client()
        if not client:
            break
        try:
            contents = [types.Content(role="user", parts=[types.Part.from_text(text=final_text)])]
            config = types.GenerateContentConfig(
                temperature=temp,
                response_modalities=["audio"],
                speech_config=types.SpeechConfig(
                    voice_config=types.VoiceConfig(
                        prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=voice)
                    )
                ),
            )
            response = client.models.generate_content(model=FIXED_MODEL_NAME_STANDARD, contents=contents, config=config)

            # Walk down to the first part's inline audio, tolerating any missing level.
            candidates = response.candidates
            content = candidates[0].content if candidates else None
            parts = content.parts if content else None
            inline_data = parts[0].inline_data if parts else None
            if inline_data:
                logging.info(f"[{session_id}] ✅ (Standard) موفقیت در تلاش {attempt+1}.")
                return inline_data
            # Previously this case looped again silently and without any delay.
            logging.warning(f"[{session_id}] ⚠️ (Standard) پاسخ بدون داده صوتی در تلاش {attempt+1}.")
        except Exception as e:
            logging.warning(f"[{session_id}] ⚠️ (Standard) خطا در تلاش {attempt+1}: {e}")
        # Back off before the next attempt; pointless after the last one.
        if attempt + 1 < MAX_RETRIES:
            time.sleep(0.5)
    return None

def _generate_live_wav(text_input, prompt_input, selected_voice, session_id, output_path):
    """Run the Live model for the whole text and write the result to ``output_path``.

    Returns True only when audio was produced and the WAV file was written.
    """
    pcm_data = asyncio.run(
        generate_audio_live_with_retry(text_input, prompt_input, selected_voice, session_id)
    )
    return bool(pcm_data) and save_pcm_to_wav(pcm_data, output_path)


def core_generate_audio(text_input, prompt_input, selected_voice, temperature_val, session_id, use_live_model=False, retry_limit=50, fallback_to_live=False):
    """Generate speech for ``text_input`` and return the path of the output file.

    Flow:
      1. ``use_live_model`` set: synthesize with the Live model only.
      2. Otherwise: synthesize each chunk from ``smart_text_split`` (currently
         always a single chunk) with the standard model, storing parts in a
         per-session temp directory.
      3. If the standard model fails and ``fallback_to_live`` is set, retry
         the whole text with the Live model.

    The temp directory ``temp_<session_id>`` is always removed on exit. The
    returned file ``output_<session_id>.wav`` is left for the caller to
    consume and delete.

    Args:
        text_input: Text to synthesize.
        prompt_input: Optional style/tone hint forwarded to the models.
        selected_voice: Prebuilt voice name.
        temperature_val: Sampling temperature for the standard model.
        session_id: Identifier used for file names and log tags.
        use_live_model: Use the Live model directly.
        retry_limit: Attempt budget for the standard model.
        fallback_to_live: Try the Live model when the standard model fails.

    Returns:
        Path of the generated audio file.

    Raises:
        Exception: When no audio could be produced by the selected path(s).
    """
    logging.info(f"[{session_id}] 🚀 شروع: Live={use_live_model}, Retry={retry_limit}, Fallback={fallback_to_live}")
    temp_dir = f"temp_{session_id}"
    os.makedirs(temp_dir, exist_ok=True)
    output_base_name = f"{temp_dir}/audio_session_{session_id}"
    final_output_path = f"output_{session_id}.wav"

    try:
        # 1. Direct request for the Live model (e.g. free-tier users).
        if use_live_model:
            if _generate_live_wav(text_input, prompt_input, selected_voice, session_id, final_output_path):
                return final_output_path
            raise Exception("تولید صدا با مدل لایف ناموفق بود.")

        # 2. Standard model. Splitting is disabled, so this yields one chunk.
        text_chunks = smart_text_split(text_input, DEFAULT_MAX_CHUNK_SIZE)
        generated_files = []
        standard_failed = False

        for part_number, chunk in enumerate(text_chunks, start=1):
            inline_data = generate_audio_chunk_standard_with_retry(chunk, prompt_input, selected_voice, temperature_val, session_id, retry_limit)
            if not inline_data:
                # One failed chunk invalidates the whole standard-model run.
                standard_failed = True
                break

            data_buffer = inline_data.data
            ext = mimetypes.guess_extension(inline_data.mime_type) or ".wav"
            # Raw linear PCM ("audio/L<bits>") has no container: add a WAV header.
            if "audio/L" in inline_data.mime_type and ext == ".wav":
                data_buffer = convert_to_wav(data_buffer, inline_data.mime_type)
            if not ext.startswith("."):
                ext = "." + ext
            fpath = save_binary_file(f"{output_base_name}_part{part_number:03d}{ext}", data_buffer)
            if fpath:
                generated_files.append(fpath)

        # 3. Standard model failed: fall back to Live if allowed, else give up.
        if standard_failed:
            if not fallback_to_live:
                raise Exception(f"تولید صدا با مدل استاندارد پس از {retry_limit} تلاش ناموفق بود.")
            logging.info(f"[{session_id}] 🔄 مدل استاندارد شکست خورد. سوییچ به مدل لایف (Fallback)...")
            if _generate_live_wav(text_input, prompt_input, selected_voice, session_id, final_output_path):
                return final_output_path
            raise Exception("هم مدل استاندارد و هم مدل لایف (Fallback) شکست خوردند.")

        if not generated_files:
            raise Exception("هیچ فایلی تولید نشد.")

        # Merge multiple parts when possible; otherwise deliver the first part.
        has_multiple_parts = len(generated_files) > 1
        merged = has_multiple_parts and PYDUB_AVAILABLE and merge_audio_files_func(generated_files, final_output_path)
        if not merged:
            if has_multiple_parts:
                # Make the truncation visible instead of silently dropping parts.
                logging.warning(f"[{session_id}] ⚠️ ادغام ناموفق بود؛ فقط بخش اول صدا بازگردانده می‌شود.")
            shutil.move(generated_files[0], final_output_path)

        return final_output_path

    finally:
        # Best-effort cleanup: a failure here must not mask the real outcome.
        shutil.rmtree(temp_dir, ignore_errors=True)

# Load the API keys once at import time so every request handler sees them.
_init_api_keys()

# ASGI application object; routes are registered on it below.
app = FastAPI(title="Alpha TTS Worker API")

# Request body of POST /generate. Each field is forwarded to core_generate_audio.
class TTSRequest(BaseModel):
    # Text to synthesize (forwarded as text_input).
    text: str
    # Optional style/tone hint (forwarded as prompt_input).
    prompt: str | None = ""
    # Prebuilt voice name (forwarded as selected_voice).
    speaker: str
    # Sampling temperature for the standard model (forwarded as temperature_val).
    temperature: float
    # When true, the Live model is used directly instead of the standard model.
    use_live_model: bool = False
    retry_limit: int = 50       # new parameter: attempt budget for the standard model
    fallback_to_live: bool = False # new parameter: try the Live model if the standard model fails

@app.post("/generate")
def generate_audio_endpoint(request: TTSRequest):
    """Generate speech for the request and stream it back as a WAV file.

    A short random session id tags the log lines and file names of this
    request. The generated file is deleted by a background task after the
    response has been sent.

    Raises:
        HTTPException: 500 when generation fails or no output file exists.
    """
    session_id = str(uuid.uuid4())[:8]
    try:
        final_path = core_generate_audio(
            text_input=request.text,
            prompt_input=request.prompt,
            selected_voice=request.speaker,
            temperature_val=request.temperature,
            session_id=session_id,
            use_live_model=request.use_live_model,
            retry_limit=request.retry_limit,
            fallback_to_live=request.fallback_to_live
        )
    except Exception as e:
        logging.error(f"[{session_id}] ❌ خطا: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e

    # Kept outside the try block so this HTTPException is not caught and
    # re-wrapped (which used to turn its detail into "500: ...").
    if not (final_path and os.path.exists(final_path)):
        logging.error(f"[{session_id}] ❌ خطا: خطا در تولید فایل صوتی.")
        raise HTTPException(status_code=500, detail="خطا در تولید فایل صوتی.")

    from fastapi import BackgroundTasks
    from fastapi.responses import FileResponse

    def _remove_output_file():
        # Runs after the response body has been sent.
        try:
            os.remove(final_path)
        except OSError as exc:
            logging.warning(f"[{session_id}] ⚠️ حذف فایل خروجی ناموفق بود: {exc}")

    # The previous code evaluated shutil.rmtree(...) immediately and passed its
    # None result as `background`, so output files were never cleaned up.
    cleanup = BackgroundTasks()
    cleanup.add_task(_remove_output_file)
    return FileResponse(
        path=final_path,
        media_type='audio/wav',
        filename=os.path.basename(final_path),
        background=cleanup,
    )

@app.get("/")
def health_check():
    """Return a static payload signalling that the worker is up."""
    status_payload = dict(status="ok", message="TTS Worker is running.")
    return status_payload

if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces. The port comes from
    # the PORT environment variable and falls back to 7860.
    listen_port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, port=listen_port, host="0.0.0.0", reload=False)