import os
import re
import math
import shutil
import tempfile
from datetime import timedelta

from pydub import AudioSegment
from pydub.utils import which
from openai import OpenAI
from dotenv import load_dotenv

import gradio as gr

# === CONFIG ===
# Length of each audio chunk, in minutes. The same value is used when slicing
# the audio and when computing each chunk's timestamp offset, so both stay in
# sync as long as only this constant is changed.
chunk_duration_min = 9
# Working directory for the exported chunk files; process_audio removes it
# (and everything in it) after every run.
chunk_dir = "temp_chunks"
# Point pydub at the ffmpeg executable located by which().
# NOTE(review): which() presumably yields None when ffmpeg is not installed —
# confirm how pydub behaves in that case.
AudioSegment.converter = which("ffmpeg")
# Must run before os.getenv below so the key can be picked up.
# NOTE(review): presumably reads variables from a local .env file — confirm.
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")
# Shared OpenAI client used by transcribe_chunks for every request.
client = OpenAI(api_key=api_key)


def split_audio_to_chunks(audio_file_path):
    """Cut an audio file into consecutive MP3 chunks inside ``chunk_dir``.

    The audio is converted to a single channel at a 16000 Hz frame rate, then
    sliced into pieces of ``chunk_duration_min`` minutes (the last piece may
    be shorter). Each piece is exported as ``chunk_<n>.mp3`` at a 32k bitrate,
    with ``n`` starting at 1.

    Args:
        audio_file_path: Path of the audio file to load.

    Returns:
        The exported chunk paths, in playback order. Empty when the audio has
        zero length.
    """
    os.makedirs(chunk_dir, exist_ok=True)

    prepared = (
        AudioSegment.from_file(audio_file_path)
        .set_channels(1)
        .set_frame_rate(16000)
    )
    # The segment length is compared against a millisecond step below.
    total_ms = len(prepared)
    step_ms = chunk_duration_min * 60 * 1000
    chunk_count = math.ceil(total_ms / step_ms)

    exported = []
    for idx in range(chunk_count):
        begin_ms = idx * step_ms
        # Never slice past the end of the audio on the final chunk.
        finish_ms = min(total_ms, begin_ms + step_ms)
        destination = os.path.join(chunk_dir, f"chunk_{idx + 1}.mp3")
        prepared[begin_ms:finish_ms].export(destination, format="mp3", bitrate="32k")
        exported.append(destination)
    return exported


def shift_srt_timestamps(srt_text, offset_seconds):
    """Return *srt_text* with every cue timecode moved by *offset_seconds*.

    Only lines containing the SRT arrow separator (`` --> ``) are examined,
    and within those lines only the ``H:M:S,ms`` timecodes are rewritten.
    Cue numbers, subtitle text, blank lines and anything else on a timing
    line pass through unchanged, so a subtitle line that merely contains an
    arrow no longer raises ``ValueError``.

    Args:
        srt_text: SRT-formatted text.
        offset_seconds: Seconds to add to each timecode (int or float). May
            be negative; a result that would fall before zero is clamped to
            ``00:00:00,000`` because SRT cannot express negative times.

    Returns:
        The shifted SRT text, with its lines joined by single newline
        characters. Timecodes are formatted as ``HH:MM:SS,mmm``.
    """
    timecode_re = re.compile(r"(\d+):(\d+):(\d+),(\d+)")
    offset = timedelta(seconds=offset_seconds)
    zero = timedelta(0)
    one_ms = timedelta(milliseconds=1)

    def shift_timecode(match):
        h, m, s, ms = (int(part) for part in match.groups())
        original = timedelta(hours=h, minutes=m, seconds=s, milliseconds=ms)
        # Clamp so a negative offset cannot produce a wrapped-around time.
        shifted = max(original + offset, zero)

        # Whole milliseconds; sub-millisecond remainders are dropped.
        total_seconds, ms = divmod(shifted // one_ms, 1000)
        total_minutes, s = divmod(total_seconds, 60)
        h, m = divmod(total_minutes, 60)
        return f"{h:02}:{m:02}:{s:02},{ms:03}"

    return "\n".join(
        timecode_re.sub(shift_timecode, line) if " --> " in line else line
        for line in srt_text.splitlines()
    )


def transcribe_chunks(chunk_paths):
    """Transcribe each chunk to SRT and stitch the results into one string.

    Every chunk is sent to the ``whisper-1`` model with ``response_format``
    set to ``"srt"``. The timecodes of chunk *i* (0-based) are then moved
    forward by ``i * chunk_duration_min * 60`` seconds, i.e. the offset
    assumes every preceding chunk is exactly ``chunk_duration_min`` long.

    Args:
        chunk_paths: Chunk file paths in playback order.

    Returns:
        The shifted per-chunk SRT texts joined by blank lines; an empty
        string when *chunk_paths* is empty.
    """

    def transcribe_one(position, path):
        # The file only needs to stay open for the duration of the request.
        with open(path, "rb") as handle:
            raw_srt = client.audio.transcriptions.create(
                model="whisper-1",
                file=handle,
                response_format="srt"
            )
        return shift_srt_timestamps(raw_srt, position * chunk_duration_min * 60)

    return "\n\n".join(
        transcribe_one(position, path)
        for position, path in enumerate(chunk_paths)
    )


def parse_srt_paragraphs(srt_str):
    """Group SRT cues into ``(start_timestamp, text)`` paragraphs.

    Cues are blocks separated by blank lines; a block with fewer than three
    lines is ignored. Cue text is appended to the open paragraph, and the
    paragraph is closed as soon as its accumulated text contains a ``.``,
    ``!`` or ``?`` (optionally followed by a quote character) that is
    followed by whitespace. Because each cue's text is stripped, a sentence
    that ends exactly at the end of a cue only closes the paragraph once the
    next cue has been appended.

    Args:
        srt_str: SRT-formatted text.

    Returns:
        A list of ``(timestamp, paragraph)`` tuples, where ``timestamp`` is
        the start time of the first cue in the paragraph. Any text still
        open after the last cue is emitted as a final paragraph.
    """
    sentence_break = re.compile(r'(?<=[.!?])["\']?\s')

    paragraphs = []
    pending_text = ""
    pending_start = ""

    for raw_block in srt_str.strip().split("\n\n"):
        rows = raw_block.strip().split("\n")
        if len(rows) < 3:
            # Not a complete cue (index, timing line, text) — skip it.
            continue

        cue_start = rows[1].split(" --> ")[0].strip()
        cue_text = " ".join(rows[2:]).strip()

        if pending_text:
            pending_text = pending_text + " " + cue_text
        else:
            # First cue of a new paragraph fixes the paragraph's timestamp.
            pending_start, pending_text = cue_start, cue_text

        if sentence_break.search(pending_text) is not None:
            paragraphs.append((pending_start, pending_text.strip()))
            pending_text = ""
            pending_start = ""

    # Emit whatever is left once the cues run out.
    if pending_text:
        paragraphs.append((pending_start, pending_text.strip()))

    return paragraphs


def _srt_timestamp_to_seconds(ts):
    """Convert an ``HH:MM:SS,mmm`` timestamp to whole seconds (ms dropped)."""
    hours, minutes, seconds_ms = ts.split(":")
    return int(hours) * 3600 + int(minutes) * 60 + int(seconds_ms.split(",")[0])


def process_audio(audio_path):
    """Produce a paragraph-level, timestamped transcript for an audio file.

    The audio is split into chunks, each chunk is transcribed, and the merged
    SRT is grouped into paragraphs. Every paragraph is labelled with the
    start time of its first cue, expressed in whole seconds.

    Args:
        audio_path: Path of the uploaded audio file, as supplied by the
            Gradio audio input.

    Returns:
        A ``(display_text, output_txt_path)`` tuple: Markdown text for
        on-screen display, and the path of a UTF-8 ``.txt`` file holding one
        ``"<seconds> <paragraph>"`` line per paragraph. The file is not
        deleted by this function.

    Raises:
        gr.Error: If no audio file was provided.
    """
    if not audio_path:
        # Surface a readable message instead of an obscure decoder failure.
        raise gr.Error("Please upload an audio file first.")

    try:
        chunk_paths = split_audio_to_chunks(audio_path)
        merged_srt = transcribe_chunks(chunk_paths)
        transcript = parse_srt_paragraphs(merged_srt)
    finally:
        # The chunk files are only needed while transcribing; always remove them.
        shutil.rmtree(chunk_dir, ignore_errors=True)

    timed_paragraphs = [
        (_srt_timestamp_to_seconds(ts), para) for ts, para in transcript
    ]
    output_lines = [f"{seconds} {para}" for seconds, para in timed_paragraphs]
    display_text = "".join(
        f"**{seconds}s** — {para}\n\n" for seconds, para in timed_paragraphs
    )

    # Write through the temp-file handle itself so it is closed
    # deterministically; delete=False keeps the file for the download output.
    with tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", suffix=".txt", delete=False
    ) as out_file:
        out_file.write("\n".join(output_lines))
        output_txt_path = out_file.name

    return display_text, output_txt_path


# === Gradio Interface ===
# Single-function UI around process_audio. The audio input is configured with
# type="filepath", and process_audio's two return values (Markdown text, then
# the TXT file path) line up with the two outputs listed below, in order.
demo = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath", label="🎧 Upload MP3 Audio"),
    outputs=[
        gr.Markdown(label="📜 Timestamped Transcript"),
        gr.File(label="📥 Download TXT File")
    ],
    title="🕓 Audio Timestamp Generator",
    # The interface description is currently disabled:
    # description="Upload an MP3 file. The tool splits the audio into chunks, transcribes them with Whisper, and returns a paragraph-wise timestamped transcript (timestamps in integer seconds).",
)

# Launch the app only when this file is executed directly, not on import.
if __name__ == "__main__":
    # NOTE(review): ssr_mode=False presumably turns off Gradio's server-side
    # rendering — confirm against the installed Gradio version.
    demo.launch(ssr_mode=False)