# SmartScribe — app.py
# Import Libraries
import os
import gc
import torch
import numpy as np
import uuid
import pycountry
import yt_dlp
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
from huggingface_hub import login
from pydub import AudioSegment
from faster_whisper import WhisperModel
# Setup YouTube Cookies from Environment
def setup_cookies():
    """Persist YouTube cookies from the YOUTUBE_COOKIES env var to cookies.txt.

    Returns:
        bool: True when cookies were written, False when the variable is
        unset or empty (YouTube downloads may then fail).
    """
    payload = os.environ.get('YOUTUBE_COOKIES')
    if not payload:
        print("⚠️ No cookies found in environment - YouTube downloads may fail")
        return False
    with open('cookies.txt', 'w') as f:
        f.write(payload)
    print("✅ Cookies loaded successfully")
    return True
# Call cookie setup when app starts
setup_cookies()

# Hugging Face Login Setup
# HF_TOKEN is required to pull gated models (e.g. meta-llama); skip login
# entirely when the secret is absent so the app can still start locally.
hf_token = os.getenv('HF_TOKEN')
if hf_token:
    login(hf_token, add_to_git_credential=True)

# Model names
# Hugging Face model ids passed to the generate / translate dispatch
# functions; the UI dropdown uses the variable names as display labels.
LLAMA = "meta-llama/Llama-3.2-3B-Instruct"
QWEN = "Qwen/Qwen3-4B-Instruct-2507"
PHI = "microsoft/Phi-4-mini-instruct"
DEEPSEEK = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
Gemma = 'google/gemma-3-4b-it'
# YouTube Download Function
def _download_if_youtube(source):
if "youtube.com" in source or "youtu.be" in source:
unique = str(uuid.uuid4())[:8]
filename = f"audio_{unique}.%(ext)s"
ydl_opts = {
"format": "bestaudio/best",
"outtmpl": filename,
"quiet": True,
"extractor_args": {"youtube": {"player_client": ["default"]}},
"cookiefile": "cookies.txt",
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.extract_info(source, download=True)
return ydl.prepare_filename(info)
else:
return source
# Convert to WAV
def _to_wav(path):
    """Convert any audio/video file to a uniquely named WAV and return its path."""
    out_path = "audio_{}.wav".format(str(uuid.uuid4())[:8])
    audio = AudioSegment.from_file(path)
    audio.export(out_path, format="wav")
    return out_path
# Transcription Function
def transcription_whisper(source):
    """Transcribe an uploaded file or YouTube URL with faster-whisper.

    Args:
        source: local file path or YouTube URL.

    Returns:
        tuple[str, list[dict]]: a Markdown-formatted transcript with
        timestamps, and a list of {"start", "end", "text"} segment dicts.
    """
    torch.cuda.empty_cache()
    gc.collect()
    # float16 on GPU, int8 on CPU — the usual compute types for each device.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    compute = "float16" if device == "cuda" else "int8"
    model = WhisperModel('medium', device=device, compute_type=compute)
    file_path = _download_if_youtube(source)
    wav_path = _to_wav(file_path)
    try:
        segments, info = model.transcribe(wav_path)
        result = []
        formatted_output = "**TRANSCRIPTION**\n" + "="*50 + "\n\n"
        for seg in segments:
            text = seg.text.strip()
            result.append({
                "start": seg.start,
                "end": seg.end,
                "text": text
            })
            formatted_output += f"[{seg.start:.2f}s - {seg.end:.2f}s]\n{text}\n\n"
    finally:
        # Fix: temp files were never removed, leaking disk space on every
        # call. Always delete the intermediate WAV; delete the audio file
        # only when it was actually downloaded (never a user-supplied path).
        cleanup = [wav_path]
        if file_path != source:
            cleanup.append(file_path)
        for tmp in cleanup:
            try:
                os.remove(tmp)
            except OSError:
                pass
        del model
        gc.collect()
        torch.cuda.empty_cache()
    return formatted_output, result
# Prompts
# System prompt for the MOM-generation call: pins the output contract
# (clean Markdown, the four named sections) so the post-processing in
# generate() can rely on the "**Minutes of Meeting" / "**MINUTES" markers.
system_prompt = """
You are an expert assistant that generates clear, concise, and well-structured
Minutes of Meeting (MOM) documents from raw meeting transcripts.
Your output must be in clean Markdown format (without code blocks) and must include:
- **Meeting Summary:** A brief overview of the meeting context, agenda, and participants (if mentioned).
- **Key Discussion Points:** Major topics, decisions, or debates.
- **Takeaways:** Important insights, conclusions, and agreements.
- **Action Items:** Actionable tasks with responsible owners and deadlines
(e.g., "John will prepare the project plan by Monday").
Guidelines:
- Write in professional, easy-to-read language.
- Summarize meaningfully; avoid filler words or irrelevant content.
- Omit transcription artifacts (e.g., "um", "okay", "yeah").
- Do not include timestamps.
- Maintain a formal and factual tone while being concise.
- Focus entirely on clarity, structure, and readability.
"""
def user_prompt_for(source):
    """Build the MOM user prompt from a fresh transcription of `source`."""
    _, segments = transcription_whisper(source)
    transcript = " ".join([seg["text"] for seg in segments])
    return f"""
Please write well-structured **Minutes of Meeting (MOM)** in Markdown format (without code blocks), including:
- **Summary:** Include attendees, location, and date if mentioned.
- **Key Discussion Points:** List the main topics or discussions.
- **Takeaways:** Summaries of conclusions or insights.
- **Action Items:** Tasks with clear owners and deadlines.
Transcription:
{transcript}
"""
def messages_for(source):
    """Assemble the chat messages (system + user) for MOM generation."""
    return [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt_for(source)},
    ]
# Quantization Config
# 4-bit NF4 weights with double quantization and bfloat16 compute, so the
# 1.5B-4B models above fit on a small GPU. Shared by every
# AutoModelForCausalLM.from_pretrained call below.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type='nf4'
)
# Generate MOM / Summarization
def generate(model_name, source):
    """Generate Minutes of Meeting for `source` with the given HF model id.

    Transcribes the source, prompts the model, strips chat-template
    scaffolding, and yields a single cleaned Markdown string (a generator
    so Gradio can use it as a streaming event handler).

    Args:
        model_name: Hugging Face model id (e.g. LLAMA, QWEN, ...).
        source: local file path or YouTube URL.
    """
    messages = messages_for(source)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    # Fix: the original hard-coded .to('cuda'), which crashes on CPU-only
    # hosts; move the prompt to whichever device is actually available.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    inputs = tokenizer.apply_chat_template(messages, return_tensors='pt', add_generation_prompt=True).to(device)
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto', quantization_config=quant_config)
    streamer = TextStreamer(tokenizer)
    outputs = model.generate(inputs, streamer=streamer, max_new_tokens=5000)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    mom_output = result
    # Keep only the assistant turn: Llama-style headers first, a generic
    # 'assistant' marker as the fallback for other chat templates.
    if '<|start_header_id|>assistant<|end_header_id|>' in mom_output:
        mom_output = mom_output.split('<|start_header_id|>assistant<|end_header_id|>')[-1]
    elif 'assistant' in mom_output:
        parts = mom_output.split('assistant')
        if len(parts) > 1:
            mom_output = parts[-1]
    mom_output = mom_output.replace('<|eot_id|>', '').replace('<|end_header_id|>', '').strip()
    # Trim any preamble before the MOM heading itself.
    if '**Minutes of Meeting' in mom_output:
        mom_output = mom_output.split('**Minutes of Meeting')[1]
        mom_output = '**Minutes of Meeting' + mom_output
    elif '**MINUTES' in mom_output:
        mom_output = mom_output.split('**MINUTES')[1]
        mom_output = '**MINUTES' + mom_output
    # Free VRAM between requests.
    del model, inputs, tokenizer, outputs
    gc.collect()
    torch.cuda.empty_cache()
    yield mom_output.strip()
# Translation Functions : Valid Language or Not
def valid_language(lang):
    """Return True if `lang` is a recognizable language name or ISO 639 code."""
    candidates = (
        pycountry.languages.get(name=lang.capitalize()),
        pycountry.languages.get(alpha_2=lang.lower()),
        pycountry.languages.get(alpha_3=lang.lower()),
    )
    return any(c is not None for c in candidates)
# Translate Prompts
# System prompt for translation calls: demands translation-only output so
# the post-processing in translate_transcribe() has less chatter to strip.
system_prompt_translate = "You are a translation assistant. Given a target language and some content, translate the content accurately into that language, preserving meaning, tone, and style, and return only the translated text. Also maintain proper format."
def user_prompt_translate(source, lang):
    """Build the translation user prompt from `source`'s transcript.

    Returns an error-message string instead when `lang` is not a valid
    language name or code.
    """
    if not valid_language(lang):
        return f"Invalid language: {lang}. Please provide a valid language name or code."
    formatted, _ = transcription_whisper(source)
    # Keep only spoken text: drop the header (**...), the ==== separator,
    # [timestamp] lines, and blank lines from the formatted transcript.
    spoken = [
        line.strip()
        for line in formatted.split('\n')
        if line.strip() and not line.startswith(('**', '=', '['))
    ]
    transcript_text = " ".join(spoken)
    # Truncate to keep the prompt within a modest context budget.
    max_chars = 3000
    if len(transcript_text) > max_chars:
        transcript_text = transcript_text[:max_chars] + "..."
    return f"""Translate the following text into {lang}.
Instructions:
- Provide ONLY the translation in {lang}
- Do NOT add any explanations or comments
- Preserve the original meaning and tone
- Keep formatting simple and clean
Text to translate:
{transcript_text}
{lang} translation:"""
def messages_for_translate(source, lang):
    """Assemble the chat messages (system + user) for a translation request."""
    return [
        {'role': 'system', 'content': system_prompt_translate},
        {'role': 'user', 'content': user_prompt_translate(source, lang)},
    ]
def translate_transcribe(model_name, source, lang):
    """Translate the transcript of `source` into `lang` using `model_name`.

    Yields one cleaned, paragraph-formatted string (a generator so Gradio
    can use it as a streaming event handler).

    Args:
        model_name: Hugging Face model id.
        source: local file path or YouTube URL.
        lang: target language name or ISO code.
    """
    messages = messages_for_translate(source, lang)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    # Fix: the original hard-coded .to('cuda'), which crashes on CPU-only
    # hosts; move the prompt to whichever device is actually available.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    inputs = tokenizer.apply_chat_template(messages, return_tensors='pt', add_generation_prompt=True).to(device)
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto', quantization_config=quant_config)
    streamer = TextStreamer(tokenizer)
    outputs = model.generate(inputs, streamer=streamer, max_new_tokens=5000)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    translate_output = result
    # Keep only the assistant turn: Llama-style headers first, a generic
    # 'assistant' marker as the fallback for other chat templates.
    if '<|start_header_id|>assistant<|end_header_id|>' in translate_output:
        translate_output = translate_output.split('<|start_header_id|>assistant<|end_header_id|>')[-1]
    elif 'assistant' in translate_output:
        parts = translate_output.split('assistant')
        if len(parts) > 1:
            translate_output = parts[-1]
    translate_output = translate_output.replace('<|eot_id|>', '').replace('<|end_header_id|>', '').strip()
    # Drop the echoed "translation:" label and any trailing editor chatter.
    if 'translation:' in translate_output.lower():
        translate_output = translate_output.split('translation:')[-1].strip()
    if "Here's an edited version:" in translate_output:
        translate_output = translate_output.split("Here's an edited version:")[0].strip()
    translate_output = translate_output.replace('assistant', '').strip()
    # Format into paragraphs of ~4 sentences each for readability.
    sentences = translate_output.split('. ')
    paragraphs = []
    current_para = []
    sentence_count = 0
    for sentence in sentences:
        current_para.append(sentence.strip())
        sentence_count += 1
        if sentence_count >= 4:
            paragraphs.append('. '.join(current_para) + '.')
            current_para = []
            sentence_count = 0
    if current_para:
        paragraphs.append('. '.join(current_para) + ('.' if not current_para[-1].endswith('.') else ''))
    formatted_output = '\n\n'.join(paragraphs)
    # Free VRAM between requests.
    del model, inputs, tokenizer, outputs
    gc.collect()
    torch.cuda.empty_cache()
    yield formatted_output
def translate_transcribe_gemma(Gemma, source, lang):
    """Translate the transcript of `source` into `lang` with the Gemma model.

    Separate from translate_transcribe() because the system role is omitted
    here (only a user message is sent). Yields one cleaned string.

    Args:
        Gemma: Hugging Face model id for the Gemma model.
        source: local file path or YouTube URL.
        lang: target language name or ISO code.
    """
    messages = [{'role': 'user', 'content': user_prompt_translate(source, lang)}]
    tokenizer = AutoTokenizer.from_pretrained(Gemma, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    # Fix: the original hard-coded .to('cuda'), which crashes on CPU-only
    # hosts; move the prompt to whichever device is actually available.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    inputs = tokenizer.apply_chat_template(messages, return_tensors='pt', add_generation_prompt=True).to(device)
    model = AutoModelForCausalLM.from_pretrained(Gemma, device_map='auto', quantization_config=quant_config)
    streamer = TextStreamer(tokenizer)
    outputs = model.generate(inputs, streamer=streamer, max_new_tokens=5000)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    translate_output = result
    # Keep only the assistant turn: Llama-style headers first, a generic
    # 'assistant' marker as the fallback for other chat templates.
    if '<|start_header_id|>assistant<|end_header_id|>' in translate_output:
        translate_output = translate_output.split('<|start_header_id|>assistant<|end_header_id|>')[-1]
    elif 'assistant' in translate_output:
        parts = translate_output.split('assistant')
        if len(parts) > 1:
            translate_output = parts[-1]
    translate_output = translate_output.replace('<|eot_id|>', '').replace('<|end_header_id|>', '').strip()
    # Drop the echoed "translation:" label and any trailing editor chatter.
    if 'translation:' in translate_output.lower():
        translate_output = translate_output.split('translation:')[-1].strip()
    if "Here's an edited version:" in translate_output:
        translate_output = translate_output.split("Here's an edited version:")[0].strip()
    translate_output = translate_output.replace('assistant', '').strip()
    # Free VRAM between requests.
    del model, inputs, tokenizer, outputs
    gc.collect()
    torch.cuda.empty_cache()
    yield translate_output
# Optimization Functions for MOM
def optimize(model_name, source):
    """Dispatch MOM generation to the model selected in the UI dropdown.

    Fixes two defects in the original: the dropdown offers 'Gemma' but the
    chain had no matching branch, and an unmatched `model_name` left
    `result` unbound, raising UnboundLocalError instead of reporting the
    problem. Yields the generated MOM chunks (or one error string).
    """
    if model_name == 'LLAMA':
        result = generate(LLAMA, source)
    elif model_name == 'PHI':
        result = generate(PHI, source)
    elif model_name == 'QWEN':
        result = generate(QWEN, source)
    elif model_name == 'DEEPSEEK':
        result = generate(DEEPSEEK, source)
    elif model_name == 'Gemma':
        result = generate(Gemma, source)
    else:
        yield f"Unknown model: {model_name}"
        return
    for chunk in result:
        yield chunk
# Optimization Functions for Translation
def optimize_translate(model_name, source, lang):
    """Dispatch translation to the model selected in the UI dropdown.

    Fix: the original if/elif chain had no fallback, so an unexpected
    `model_name` raised UnboundLocalError instead of reporting the problem.
    Yields the translated chunks (or one error string).
    """
    if model_name == 'LLAMA':
        translate = translate_transcribe(LLAMA, source, lang)
    elif model_name == 'PHI':
        translate = translate_transcribe(PHI, source, lang)
    elif model_name == 'QWEN':
        translate = translate_transcribe(QWEN, source, lang)
    elif model_name == 'DEEPSEEK':
        translate = translate_transcribe(DEEPSEEK, source, lang)
    elif model_name == 'Gemma':
        translate = translate_transcribe_gemma(Gemma, source, lang)
    else:
        yield f"Unknown model: {model_name}"
        return
    for chunk_tr in translate:
        yield chunk_tr
# Helper Function for Gradio UI
def get_source_input(file, link):
    """Prefer the uploaded file over the YouTube link; return '' if neither.

    Gradio file objects expose the temp path via .name; plain string paths
    are returned as-is.
    """
    if file is None:
        return link or ""
    return getattr(file, 'name', file)
# CSS Styling
# Custom CSS injected into gr.Blocks below: enlarges the upload drop zone
# (#file-box) and sets a minimum height on the transcription/MOM textboxes
# (#box) so the columns line up.
css = """
#file-box {
min-height: 500px !important;
}
#file-box button {
height: 100% !important;
width: 100% !important;
display: flex !important;
flex-direction: column !important;
align-items: center !important;
justify-content: center !important;
margin: 0 !important;
padding: 0 !important;
}
#box {
min-height: 550px !important;
}
"""
# Gradio Interface
# Layout: four columns (input, transcription, MOM, translation) plus a
# shared model dropdown; button handlers are wired at the bottom.
with gr.Blocks(css=css) as ui:
    gr.Markdown("## Transcription & MOM Generator & Translator")
    gr.Markdown("""
### 📌 Note: YouTube Link Support
Due to YouTube's bot protection, only **direct file uploads** are guaranteed to work.
YouTube links may fail without authentication cookies.
**Workaround:** Upload your audio/video file directly for best results.
""")
    with gr.Row():
        # Column 1: file upload or YouTube link input.
        with gr.Column(scale=2):
            input_file = gr.File(label="Upload Audio/Video", file_types=["audio", "video"], elem_id="file-box")
            input_link = gr.Textbox(label="YouTube Link (optional)", lines=2)
        # Column 2: raw transcription output.
        with gr.Column(scale=2):
            output_transcription = gr.Textbox(label="Transcription", lines=25, elem_id='box')
            transcribe = gr.Button("Transcribe", variant="primary", scale=2)
        # Column 3: Minutes-of-Meeting output.
        with gr.Column(scale=2):
            output_summary = gr.Textbox(label="MOM Output", lines=25, elem_id='box')
            summarize = gr.Button("Summarize", variant="secondary", scale=2)
        # Column 4: translation output with target-language input.
        with gr.Column(scale=2):
            output_translate = gr.Textbox(label='Translation Output', lines=20)
            language_input = gr.Textbox(label="Target Language", value="English", lines=1)
            translate = gr.Button('Translate', scale=2)
    # Model choice shared by the Summarize and Translate buttons.
    with gr.Row():
        model = gr.Dropdown(
            ["LLAMA", "PHI", "QWEN", "DEEPSEEK", 'Gemma'],
            label="Choose Your Model",
            value="LLAMA"
        )
    # Wrapper functions to handle generators properly
    def summarize_wrapper(model, file, link):
        # Re-yield so Gradio streams partial output into the textbox.
        source = get_source_input(file, link)
        for result in optimize(model, source):
            yield result
    def translate_wrapper(model, file, link, lang):
        # Re-yield so Gradio streams partial output into the textbox.
        source = get_source_input(file, link)
        for result in optimize_translate(model, source, lang):
            yield result
    # Event handlers with file or link support
    transcribe.click(
        fn=lambda file, link: transcription_whisper(get_source_input(file, link))[0],
        inputs=[input_file, input_link],
        outputs=[output_transcription]
    )
    summarize.click(
        fn=summarize_wrapper,
        inputs=[model, input_file, input_link],
        outputs=[output_summary]
    )
    translate.click(
        fn=translate_wrapper,
        inputs=[model, input_file, input_link, language_input],
        outputs=[output_translate]
    )
# Launch the app
if __name__ == "__main__":
    # 0.0.0.0:7860 is the standard binding for Hugging Face Spaces.
    ui.launch(server_name="0.0.0.0", server_port=7860)