Spaces:

subashpoudel
/

Subtitle-Generator

Sleeping

App Files Files Community

Subtitle-Generator / app.py

subashpoudel

Update app.py

8e937fe verified 4 months ago

raw

history blame contribute delete

3.69 kB

	import os
	import time
	import gradio as gr
	import google.generativeai as genai
	from phi.agent import Agent
	from phi.model.google import Gemini
	from google.generativeai import upload_file, get_file
	from dotenv import load_dotenv
	from phi.model.groq import Groq

	# --- Load API Key ---
	# Pull settings via load_dotenv(), then refuse to start without a Gemini key
	# so the failure surfaces at import time instead of on the first request.
	load_dotenv()
	API_KEY = os.environ.get("GOOGLE_API_KEY")
	if API_KEY is None or API_KEY == "":
	    raise EnvironmentError("GOOGLE_API_KEY not found in environment.")
	genai.configure(api_key=API_KEY)

	# --- Fix SRT format ---
	def fix_srt_format(input_file, output_file):
	    """Insert the blank separator line that SRT requires before each cue.

	    A line is treated as a cue index only when it consists solely of digits
	    AND the line after it is a timestamp line (contains "-->"). The second
	    condition keeps a spoken number that appears as subtitle text (for
	    example "5" or "2024") from being mistaken for an index, which would
	    otherwise split that cue in two.

	    The input is read completely before anything is written, so
	    ``input_file`` and ``output_file`` may name the same path.

	    Args:
	        input_file: Path of the UTF-8 subtitle text to read.
	        output_file: Path the corrected UTF-8 text is written to.
	    """
	    with open(input_file, "r", encoding="utf-8") as infile:
	        lines = infile.readlines()

	    total = len(lines)
	    fixed_lines = []
	    for i, line in enumerate(lines):
	        is_cue_index = (
	            line.strip().isdigit()
	            and i + 1 < total
	            and "-->" in lines[i + 1]
	        )
	        # Only add a separator when the previous line is not already blank;
	        # the very first cue (i == 0) never needs one.
	        if is_cue_index and i > 0 and lines[i - 1].strip() != "":
	            fixed_lines.append("\n")
	        fixed_lines.append(line)

	    with open(output_file, "w", encoding="utf-8") as outfile:
	        outfile.writelines(fixed_lines)

	# --- Initialize Gemini Agent ---
	def initialize_agent():
	    """Build the agent that is asked to transcribe the video.

	    Returns:
	        An ``Agent`` named "Video AI Subtitle Generator" that uses the Gemini
	        model "gemini-2.0-flash-exp" and is created with ``markdown=True``.
	    """
	    # Alternative backend kept for reference:
	    #   Groq(id="llama-3.3-70b-versatile")
	    gemini_model = Gemini(id="gemini-2.0-flash-exp")
	    subtitle_agent = Agent(
	        name="Video AI Subtitle Generator",
	        model=gemini_model,
	        markdown=True,
	    )
	    return subtitle_agent

	# --- Prompt Template ---
	# Instruction text passed to the agent together with the uploaded video.
	# It asks for exactly one spoken word per SRT-style block (index line,
	# "start --> end" timestamp line, word line) and no other output.
	# This is a runtime string: edit the wording only deliberately.
	subtitle_prompt_tuned = '''
	You are given a video. Your task is to extract the spoken words along with the exact timestamps of when each word is spoken.

	Please follow these instructions strictly:

	1. For every single spoken word, include:
	- A unique line number
	- The exact start time and end time in the format: HH:MM:SS,mmm --> HH:MM:SS,mmm
	- The spoken word on the next line (exactly one word per block)
	2. Do not include more or fewer than one word per timestamp.
	3. Do not summarize, paraphrase, or skip any spoken content — include all spoken words verbatim.
	4. Do not include any sound effects or non-verbal cues like [Music], [Laughter], etc.
	5. Your output must be a raw transcription — no extra formatting, no explanations, no commentary.
	6. Maintain the exact chronological order as spoken in the video.

	*FINAL AND CRITICAL REMINDER: The timestamp accuracy is the highest priority. Focus on getting the precise start and end time for each word*.

	Example format:

	1
	00:00:01,000 --> 00:00:01,300
	Hello

	2
	00:00:01,310 --> 00:00:01,600
	everyone

	3
	00:00:01,610 --> 00:00:01,900
	welcome

	...

	Only output the transcription in the above format (like SRT file format). Do not return any additional text.
	'''

	# --- Gradio Interface Function ---
	def generate_subtitles(video):
	    """Produce a word-level .srt subtitle file for an uploaded video.

	    The video is uploaded with ``upload_file``, polled until it leaves the
	    PROCESSING state, then handed to the agent together with
	    ``subtitle_prompt_tuned``. The model's text is written to disk and passed
	    through ``fix_srt_format``.

	    Args:
	        video: Path of the video file supplied by the Gradio input. A falsy
	            value (nothing uploaded) short-circuits.

	    Returns:
	        Path of the corrected ``output_subtitles.srt`` file, or ``None`` when
	        no video was supplied.

	    Raises:
	        gr.Error: If the uploaded file ends in the FAILED state, or the model
	            response carries no content.
	    """
	    if not video:
	        return None

	    import tempfile  # local import: only this function needs it

	    # A private scratch directory per request keeps concurrent Gradio
	    # requests from overwriting each other's subtitle files. The file names
	    # themselves are unchanged, so the download keeps its familiar name.
	    work_dir = tempfile.mkdtemp(prefix="subtitles_")
	    output_txt = os.path.join(work_dir, "raw_subtitles.srt")
	    output_fixed = os.path.join(work_dir, "output_subtitles.srt")

	    agent = initialize_agent()

	    print("[INFO] Uploading video...")
	    uploaded_video = upload_file(video)

	    # Re-fetch the file record once a second until processing finishes.
	    while uploaded_video.state.name == "PROCESSING":
	        time.sleep(1)
	        uploaded_video = get_file(uploaded_video.name)

	    # Leaving PROCESSING does not imply success; stop here instead of
	    # sending an unusable file to the model.
	    if uploaded_video.state.name == "FAILED":
	        raise gr.Error("Video processing failed. Please try a different file.")

	    print("[INFO] Generating subtitles...")
	    response = agent.run(subtitle_prompt_tuned, videos=[uploaded_video])
	    if response.content is None:
	        # Previously this surfaced as an AttributeError on .strip().
	        raise gr.Error("The model returned no subtitle text.")
	    raw_text = response.content.strip()

	    with open(output_txt, "w", encoding="utf-8") as f:
	        f.write(raw_text)

	    fix_srt_format(output_txt, output_fixed)
	    return output_fixed

	# --- Launch Gradio App ---
	# Single-function UI: one video input, one downloadable file output, both
	# wired to generate_subtitles.
	demo = gr.Interface(
	    title="Subtitle Generator",
	    description=(
	        "Upload a video to extract precise subtitles using AI. "
	        "Output is a .srt file with exact timestamps."
	    ),
	    fn=generate_subtitles,
	    inputs=gr.Video(label="Upload MP4 Video"),
	    outputs=gr.File(label="Download .srt Subtitle File"),
	)

	# Start the server only when run as a script, not when imported.
	if __name__ == "__main__":
	    demo.launch(share=True)