File size: 3,701 Bytes
efbc3d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
941449c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import base64
import json
import streamlit as st
from sarvamai import SarvamAI
from typing import List, Dict, Optional


# Module-level Sarvam AI client shared by the TTS helpers below.
# It stays None when no API key is available or initialization fails, and the
# helpers treat a None client as "TTS unavailable".
client = None
try:
    # Accessing st.secrets can raise when no secrets file is configured (the
    # exact exception type depends on the Streamlit version). Treat that the
    # same as "key not in secrets" so the environment-variable fallback below
    # is still attempted instead of being skipped by the outer handler.
    try:
        api_key = st.secrets.get("SARVAM_API_KEY")
    except Exception:
        api_key = None

    api_key = api_key or os.getenv("SARVAM_API_KEY")
    if api_key:
        client = SarvamAI(api_subscription_key=api_key)
        print("βœ… Sarvam AI client for TTS (Bulbul) initialized successfully.")
    else:
        print("⚠️ Warning: SARVAM_API_KEY not found.")
except Exception as e:
    # Best-effort startup: never let a client failure break module import.
    print(f"❌ Error initializing Sarvam AI client: {e}")

# --- Language Mapping ---
# Lower-case language name -> language code passed to the TTS API as
# `target_language_code`.
LANGUAGE_CODE_MAP = {
    "hindi": "hi-IN",
    "bengali": "bn-IN",
    "tamil": "ta-IN",
    "telugu": "te-IN",
    "gujarati": "gu-IN",
    "kannada": "kn-IN",
    "malayalam": "ml-IN",
    "marathi": "mr-IN",
    "punjabi": "pa-IN",
    "odia": "od-IN",
    "english": "en-IN",
}

def get_language_code(language_name: str) -> Optional[str]:
    """
    Looks up the language code (e.g. "hi-IN") for a human-readable language name.

    The lookup ignores letter case and surrounding whitespace.

    Args:
        language_name: Language name such as "Hindi" or " tamil ".

    Returns:
        The matching code from LANGUAGE_CODE_MAP, or None when the name is
        not in the map or is not a string.
    """
    # Non-string input (e.g. None from an unset form field) is simply
    # "unsupported" rather than an AttributeError.
    if not isinstance(language_name, str):
        return None
    return LANGUAGE_CODE_MAP.get(language_name.strip().lower())

def generate_audio_from_text(
    text: str,
    language_name: str,
    gender: str,
    output_file_path: str
) -> bool:
    """
    Generates an audio file from a text string using the Sarvam "Bulbul" TTS API.

    The voice and speaking pace are derived from `gender`: "male"
    (case-insensitive) selects speaker "abhilash" at pace 1.0; any other value
    selects speaker "anushka" at pace 0.9.

    Args:
        text: The text to synthesize.
        language_name: Human-readable language name, resolved through
            get_language_code().
        gender: Voice selector, see above.
        output_file_path: Where the decoded audio bytes are written.

    Returns:
        True when audio bytes were written to `output_file_path`. False when
        the text is empty, the client is not initialized, the language is
        unsupported, the API returned no audio, or the API call / file write
        raised an exception.
    """
    # Nothing to synthesize: skip the API round-trip entirely.
    if not text or not text.strip():
        print("Empty text received. Skipping audio generation.")
        return False
    if not client:
        return False
    lang_code = get_language_code(language_name)
    if not lang_code:
        print(f"❌ Language '{language_name}' is not supported. Skipping.")
        return False

    # Tolerate a missing gender by falling through to the default voice.
    if (gender or "").lower() == "male":
        speaker_name = "abhilash"
        pace_value = 1.0
    else:
        speaker_name = "anushka"
        pace_value = 0.9

    print(f"--- 🎀 Generating audio for chunk: '{text[:50]}...' in {language_name} (Voice: {speaker_name}, Pace: {pace_value}) ---")

    try:
        response = client.text_to_speech.convert(
            text=text,
            model="bulbul:v2",
            target_language_code=lang_code,
            speaker=speaker_name,
            pace=pace_value,  # Pace chosen together with the speaker above
            speech_sample_rate=22050,
            enable_preprocessing=True
        )

        # Decode every base64 chunk on its own. Joining the encoded strings
        # first can make the decoder stop at the padding ("=") that ends an
        # earlier chunk, silently dropping the audio that follows it.
        audio_data = b"".join(
            base64.b64decode(chunk) for chunk in (response.audios or [])
        )
        if not audio_data:
            # Do not report success for (or leave behind) an empty file.
            print("❌ Sarvam TTS API returned no audio data.")
            return False

        with open(output_file_path, "wb") as f:
            f.write(audio_data)

        print(f"βœ… Audio saved to {output_file_path}")
        return True

    except Exception as e:
        # Best-effort: callers rely on a boolean result, not an exception.
        print(f"❌ An error occurred during the Sarvam TTS API call: {e}")
        return False

def generate_all_audio_from_file(
    json_path: str,
    target_language: str,
    gender: str,
    output_dir: str = "generated_audio",
    output_json_path: str = "multimedia_data_final.json"
) -> List[Dict[str, str]]:
    """
    Reads items from a JSON file, generates audio for each, and saves a final JSON.

    Every dict item gets an "audio_path" key: the path of the generated file,
    or None when the item has no "audio_text" or audio generation failed.
    The updated list is written to `output_json_path` and returned.

    Args:
        json_path: Path of the input JSON file; its top level must be a list.
        target_language: Language name forwarded to generate_audio_from_text().
        gender: Voice selector forwarded to generate_audio_from_text().
        output_dir: Directory for the audio files (created if missing).
        output_json_path: Path of the JSON file written with the results.

    Returns:
        The list of items with "audio_path" filled in, or an empty list when
        the input file is missing, is not valid JSON, or is not a JSON list.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            multimedia_data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"❌ Error reading or parsing {json_path}: {e}")
        return []

    # A JSON object or scalar at the top level cannot be processed item by
    # item; treat it like any other unreadable input instead of crashing.
    if not isinstance(multimedia_data, list):
        print(
            f"❌ Error reading or parsing {json_path}: "
            f"expected a JSON list, got {type(multimedia_data).__name__}"
        )
        return []

    os.makedirs(output_dir, exist_ok=True)

    for i, item in enumerate(multimedia_data):
        # Entries that are not objects cannot carry "audio_text"/"audio_path";
        # leave them untouched.
        if not isinstance(item, dict):
            continue

        audio_text = item.get("audio_text")
        if not audio_text:
            item["audio_path"] = None
            continue

        # NOTE(review): the file is named ".mp3" but holds whatever bytes the
        # TTS API returned — confirm the returned codec matches the extension.
        file_path = os.path.join(output_dir, f"audio_{i:03d}.mp3")
        success = generate_audio_from_text(audio_text, target_language, gender, file_path)
        item["audio_path"] = file_path if success else None

    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(multimedia_data, f, indent=2, ensure_ascii=False)
    print(f"\n--- βœ… Audio generation finished. Final data saved to {output_json_path}. ---")

    return multimedia_data