Spaces:

rohitdiwane
/

audio

Running

App Files Files Community

audio / app.py

rohitdiwane

Update app.py

09e1504 verified about 2 months ago

raw

history blame contribute delete

5.19 kB

	import os
	import re
	import math
	import shutil
	import tempfile
	from datetime import timedelta

	from pydub import AudioSegment
	from pydub.utils import which
	from openai import OpenAI
	from dotenv import load_dotenv

	import gradio as gr

	# === CONFIG ===
	# Length of each audio chunk in minutes; converted to milliseconds when
	# splitting and to seconds when offsetting per-chunk SRT timestamps.
	chunk_duration_min = 9
	# Working directory for exported chunk files; created by
	# split_audio_to_chunks and removed at the end of process_audio.
	chunk_dir = "temp_chunks"
	# Point pydub at the ffmpeg executable located by pydub.utils.which
	# (presumably None when ffmpeg is not on PATH — TODO confirm).
	AudioSegment.converter = which("ffmpeg")
	# Load variables from a .env file (if present) before reading the key below.
	load_dotenv()

	# OpenAI client shared by all transcription calls; key comes from the environment.
	api_key = os.getenv("OPENAI_API_KEY")
	client = OpenAI(api_key=api_key)


	def split_audio_to_chunks(audio_file_path):
	    """Split an audio file into fixed-length MP3 chunks on disk.

	    The audio is loaded with pydub, converted to one channel at a 16000
	    frame rate, and cut into consecutive pieces of ``chunk_duration_min``
	    minutes (the last piece may be shorter). Each piece is exported into
	    ``chunk_dir`` as ``chunk_<n>.mp3`` (1-based) with a "32k" bitrate.

	    Args:
	        audio_file_path: Path of the audio file to split.

	    Returns:
	        List of exported chunk file paths, in playback order.
	    """
	    os.makedirs(chunk_dir, exist_ok=True)

	    prepared = AudioSegment.from_file(audio_file_path)
	    prepared = prepared.set_channels(1).set_frame_rate(16000)

	    total_ms = len(prepared)
	    step_ms = chunk_duration_min * 60 * 1000
	    chunk_count = math.ceil(total_ms / step_ms)

	    exported = []
	    for number in range(1, chunk_count + 1):
	        begin_ms = (number - 1) * step_ms
	        # Clamp the final piece to the end of the recording.
	        finish_ms = min(total_ms, begin_ms + step_ms)
	        target = os.path.join(chunk_dir, f"chunk_{number}.mp3")
	        prepared[begin_ms:finish_ms].export(target, format="mp3", bitrate="32k")
	        exported.append(target)
	    return exported


	def shift_srt_timestamps(srt_text, offset_seconds):
	    """Return ``srt_text`` with every cue time moved by ``offset_seconds``.

	    Only lines containing the `` --> `` separator are rewritten (as
	    ``<start> --> <end>`` with surrounding whitespace removed); every other
	    line is passed through unchanged. Lines are re-joined with newlines.

	    Args:
	        srt_text: SRT content whose cue lines look like
	            ``HH:MM:SS,mmm --> HH:MM:SS,mmm``.
	        offset_seconds: Number of seconds to add to each timestamp.

	    Returns:
	        The SRT text with shifted cue lines.
	    """

	    def _moved(timecode):
	        # Parse "HH:MM:SS,mmm", apply the offset, and format it back.
	        hours, minutes, rest = timecode.split(":")
	        seconds, millis = rest.split(",")
	        parsed = timedelta(
	            hours=int(hours),
	            minutes=int(minutes),
	            seconds=int(seconds),
	            milliseconds=int(millis),
	        )
	        moved = parsed + timedelta(seconds=offset_seconds)

	        whole_seconds = int(moved.total_seconds())
	        out_ms = int(moved.microseconds / 1000)
	        whole_minutes, out_s = divmod(whole_seconds, 60)
	        out_h, out_m = divmod(whole_minutes, 60)
	        return f"{out_h:02}:{out_m:02}:{out_s:02},{out_ms:03}"

	    def _rewrite(line):
	        # Non-cue lines (indices, text, blanks) are left untouched.
	        if " --> " not in line:
	            return line
	        cue_start, cue_end = line.split(" --> ")
	        return f"{_moved(cue_start.strip())} --> {_moved(cue_end.strip())}"

	    return "\n".join(_rewrite(line) for line in srt_text.splitlines())


	def transcribe_chunks(chunk_paths):
	    """Transcribe each chunk to SRT and merge the results into one text.

	    Every file is sent to the ``whisper-1`` transcription endpoint with
	    ``response_format="srt"``. The cue times of chunk ``i`` (0-based) are
	    then shifted by ``i * chunk_duration_min * 60`` seconds, which assumes
	    each response is timed from the start of its own chunk.

	    Args:
	        chunk_paths: Chunk file paths in playback order.

	    Returns:
	        The shifted per-chunk SRT texts joined by a blank line.
	    """

	    def _request_srt(path):
	        # Keep the file open only for the duration of the request.
	        with open(path, "rb") as handle:
	            return client.audio.transcriptions.create(
	                model="whisper-1",
	                file=handle,
	                response_format="srt",
	            )

	    merged = []
	    for position, path in enumerate(chunk_paths):
	        raw_srt = _request_srt(path)
	        start_offset = position * chunk_duration_min * 60
	        merged.append(shift_srt_timestamps(raw_srt, start_offset))
	    return "\n\n".join(merged)


	def parse_srt_paragraphs(srt_str):
	    """Group SRT cues into ``(start_timestamp, text)`` paragraphs.

	    Cues are blank-line separated blocks; blocks with fewer than three
	    lines are ignored. Cue text is accumulated (space-joined) and a
	    paragraph is emitted as soon as the accumulated text contains ``.``,
	    ``!`` or ``?`` followed by an optional quote and then whitespace.
	    Text still pending after the last cue is emitted as a final paragraph.

	    Args:
	        srt_str: SRT-formatted text.

	    Returns:
	        List of ``(timestamp, paragraph)`` tuples, where ``timestamp`` is
	        the start time of the first cue in the paragraph.
	    """
	    # NOTE(review): the pattern needs whitespace *after* the punctuation, so
	    # text that merely ends in "." does not close a paragraph until a later
	    # cue has been appended — confirm this is intended.
	    sentence_break = re.compile(r'(?<=[.!?])["\']?\s')

	    result = []
	    pending_text = ""
	    pending_start = ""

	    for raw_block in srt_str.strip().split("\n\n"):
	        cue_lines = raw_block.strip().split("\n")
	        if len(cue_lines) < 3:
	            continue

	        # Line 0 is the cue index, line 1 the time range, the rest is text.
	        cue_start = cue_lines[1].partition(" --> ")[0].strip()
	        cue_text = " ".join(cue_lines[2:]).strip()

	        if pending_text:
	            pending_text = f"{pending_text} {cue_text}"
	        else:
	            pending_start, pending_text = cue_start, cue_text

	        if sentence_break.search(pending_text):
	            result.append((pending_start, pending_text.strip()))
	            pending_start = pending_text = ""

	    # Flush whatever is left after the last cue.
	    if pending_text:
	        result.append((pending_start, pending_text.strip()))

	    return result


	def _timestamp_to_seconds(ts):
	    """Convert an ``HH:MM:SS,mmm`` timestamp to whole seconds (milliseconds dropped)."""
	    hours, minutes, rest = ts.split(":")
	    seconds, _millis = rest.split(",")
	    return int(hours) * 3600 + int(minutes) * 60 + int(seconds)


	def process_audio(audio_path):
	    """Transcribe an uploaded audio file into a paragraph-level transcript.

	    Pipeline: split the file into chunks, transcribe the chunks into one
	    merged SRT, group the cues into paragraphs, and label each paragraph
	    with its start time in whole seconds. The chunk directory is removed
	    afterwards whether or not the pipeline succeeded.

	    Args:
	        audio_path: Path of the uploaded audio file on disk.

	    Returns:
	        Tuple ``(display_text, output_txt_path)``: the transcript as
	        ``"<seconds>s — <paragraph>"`` entries separated by blank lines, and
	        the path of a UTF-8 ``.txt`` file holding one
	        ``"<seconds> <paragraph>"`` line per paragraph.

	    Raises:
	        gr.Error: If no audio path was provided.
	    """
	    if not audio_path:
	        # Presumably Gradio passes None when nothing was uploaded; fail with
	        # a readable message instead of an obscure decoder error.
	        raise gr.Error("Please upload an audio file first.")

	    try:
	        chunk_paths = split_audio_to_chunks(audio_path)
	        merged_srt = transcribe_chunks(chunk_paths)
	        transcript = parse_srt_paragraphs(merged_srt)

	        timed = [(_timestamp_to_seconds(ts), para) for ts, para in transcript]
	        display_text = "".join(f"{seconds}s — {para}\n\n" for seconds, para in timed)
	        output_lines = [f"{seconds} {para}" for seconds, para in timed]

	        # Write through the temp-file handle itself so it is closed exactly
	        # once; delete=False keeps the file on disk after this function
	        # returns so the returned path stays valid.
	        with tempfile.NamedTemporaryFile(
	            "w", delete=False, suffix=".txt", encoding="utf-8"
	        ) as out_file:
	            out_file.write("\n".join(output_lines))
	            output_txt_path = out_file.name

	        return display_text, output_txt_path
	    finally:
	        # Always remove the chunk directory, even when a step above fails.
	        shutil.rmtree(chunk_dir, ignore_errors=True)


	# === Gradio Interface ===
	# Single-function UI: one audio upload in; a rendered transcript and a
	# downloadable text file out (process_audio's two return values, in order).
	demo = gr.Interface(
	    title="🕓 Audio Timestamp Generator",
	    # description="Upload an MP3 file. The tool splits the audio into chunks, transcribes them with Whisper, and returns a paragraph-wise timestamped transcript (timestamps in integer seconds).",
	    fn=process_audio,
	    # type="filepath": process_audio receives a path on disk, not raw samples.
	    inputs=gr.Audio(label="🎧 Upload MP3 Audio", type="filepath"),
	    outputs=[
	        gr.Markdown(label="📜 Timestamped Transcript"),
	        gr.File(label="📥 Download TXT File"),
	    ],
	)

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch(ssr_mode=False)