Subtrans / app /tests /experimental /scratch_gemini_batch.py
arjun-ms's picture
Initial commit: Subtrans Subtitle Pipeline
57bbccb
import os
import sys
# Make the ``app`` package importable when this script is run directly.
# ``from app.<...> import ...`` needs the directory that CONTAINS ``app`` on
# sys.path. Three levels up from this file is the ``app`` directory itself
# (assuming the file lives at <root>/app/tests/experimental/ - confirm), so the
# project root is one level further up.
_app_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
_project_root = os.path.dirname(_app_dir)
# Keep the entry this script has always added, in case anything relies on it.
sys.path.append(_app_dir)
# Add the actual root so that ``import app`` resolves.
sys.path.append(_project_root)
import pysrt
import google.generativeai as genai
from app.services.translators.gemini_adapter import GeminiAdapter
from dotenv import load_dotenv
# Scratch experiment: send one batch of subtitle lines to Gemini as a single
# numbered block, then report the raw reply, the finish reason, and how many
# numbered translations could be parsed back out of it.

# Load variables from a .env file into the process environment before the
# adapter is constructed.
load_dotenv()

# NOTE(review): this script never calls genai.configure() itself, so it
# presumably relies on GeminiAdapter() to set up the API client - confirm.
# The adapter is otherwise only used below for its numbered-block parser.
adapter = GeminiAdapter()

# Input file (resolved against the current working directory) and the number
# of leading subtitle entries to include in the batch.
SRT_PATH = 'app/subtitles/08-52-AM--10-05-2026/nikhil kamath clip_test_hi.srt'
MAX_LINES = 30

subs = pysrt.open(SRT_PATH, encoding='utf-8')
lines = [sub.text for sub in subs[:MAX_LINES]]
print(f"Translating {len(lines)} lines")
lang_name = "Hindi"

# (original index, text) pairs for the lines that contain visible text;
# whitespace-only lines are left out of the prompt.
non_empty = [(i, line) for i, line in enumerate(lines) if line.strip()]
if not non_empty:
    # Nothing to translate: stop here instead of sending an empty prompt.
    raise SystemExit("No non-empty subtitle lines found; nothing to translate.")

# Lines are renumbered 1..N over the non-empty entries only, so the numbers in
# the prompt are contiguous even when blank lines were dropped.
numbered_block = "\n".join(
    f"[{number}] {line}" for number, (_, line) in enumerate(non_empty, start=1)
)

system_instruction = (
    f"You are an expert translator specializing in {lang_name}. "
    f"You will receive numbered English subtitle lines from a conversation. "
    f"Translate ALL {len(non_empty)} lines to natural, colloquial {lang_name}. "
    f"Use the surrounding lines as context to pick the right tone, pronouns, and expressions. "
    f"Return ONLY the translations in the exact same numbered format: [1] translation, [2] translation, etc. "
    f"Do NOT add explanations, notes, or extra text. "
    f"You MUST translate exactly {len(non_empty)} lines. Do not stop until you have output all of them."
)
user_prompt = f"Here are the lines:\n{numbered_block}"

model = genai.GenerativeModel("gemini-2.5-flash", system_instruction=system_instruction)
response = model.generate_content(
    user_prompt,
    generation_config=genai.types.GenerationConfig(
        temperature=0.3,
    ),
)

# ``response.text`` raises ValueError when the reply carries no text part
# (for example a blocked prompt or a candidate that stopped without content).
# That is exactly the case this script is meant to diagnose, so report the
# finish reason and prompt feedback instead of dying with a bare traceback.
try:
    raw_output = response.text.strip()
except ValueError as exc:
    print("No text returned:", exc)
    if response.candidates:
        print("Finish Reason:", response.candidates[0].finish_reason)
    else:
        # An empty candidate list means there is no finish reason to show.
        print("Finish Reason: <no candidates returned>")
    print("Prompt Feedback:", response.prompt_feedback)
    raise SystemExit(1) from exc

print("RAW OUTPUT:")
print("---")
print(raw_output)
print("---")
print("Finish Reason:", response.candidates[0].finish_reason)

# Uses the adapter's private parser so the experiment exercises the same
# parsing code as the real pipeline.
translated_dict = adapter._parse_numbered_block(raw_output)
print(f"Parsed {len(translated_dict)} lines.")
if len(translated_dict) != len(non_empty):
    # Make a short or over-long reply obvious without reading the raw output.
    print(
        f"WARNING: expected {len(non_empty)} translations "
        f"but parsed {len(translated_dict)}."
    )