# Synced from deploy tool: tutorials/09-voice-summarizer (commit 2b29497, verified)
import os
import logging
import gradio as gr
from huggingface_hub import InferenceClient
# Configure logging: timestamped, level-tagged lines on the root logger.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)
# Environment variables. HF_TOKEN is optional; both model ids can be
# overridden per deployment and fall back to the defaults below.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
WHISPER_MODEL = os.environ.get("WHISPER_MODEL", "openai/whisper-large-v3-turbo")
LLM_MODEL = os.environ.get("LLM_MODEL", "Qwen/Qwen2.5-72B-Instruct")
# Log only whether a token is present — never the token value itself.
logger.info(f"HF_TOKEN configured: {bool(HF_TOKEN)}")
logger.info(f"WHISPER_MODEL: {WHISPER_MODEL}")
logger.info(f"LLM_MODEL: {LLM_MODEL}")
# Authenticated client when a token is set; otherwise an anonymous
# (more heavily rate-limited) client.
client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else InferenceClient()
logger.info("InferenceClient initialized")
def transcribe(audio) -> str:
    """Turn a recorded/uploaded audio file into text via the Whisper ASR API.

    Returns "" when no audio was supplied, the transcript on success, or an
    "❌"-prefixed message describing the failure (the UI displays it as-is).
    """
    if audio is None:
        return ""
    try:
        logger.info(f"Transcribing with {WHISPER_MODEL}...")
        asr_result = client.automatic_speech_recognition(audio, model=WHISPER_MODEL)
        logger.info(f"Transcription: {len(asr_result.text)} chars")
        return asr_result.text
    except Exception as err:
        logger.error(f"Transcription error: {err}")
        return f"❌ Transcription error: {err}"
def summarize(text: str, style: str) -> str:
    """Summarize *text* with the chat LLM using the prompt template for *style*.

    Passes empty/whitespace-only input and upstream "❌" error strings through
    unchanged. On API failure, returns an "❌"-prefixed error message rather
    than raising, matching transcribe()'s error convention.

    Fix: an unrecognized *style* previously raised KeyError inside the try
    block, which was caught and misreported as "❌ Summary error: '<style>'".
    The prompt is now resolved with a default before calling the API.
    """
    if not text.strip() or text.startswith("❌"):
        return text
    prompts = {
        "Brief Summary": f"Summarize this in 2-3 sentences:\n\n{text}",
        "Key Points": f"Extract the key points as bullet points:\n\n{text}",
        "Action Items": f"Extract any action items or tasks mentioned:\n\n{text}",
        "ELI5": f"Explain the main idea like I'm 5 years old:\n\n{text}",
    }
    # Unknown style -> fall back to the default prompt instead of a KeyError.
    prompt = prompts.get(style, prompts["Brief Summary"])
    try:
        logger.info(f"Summarizing with {LLM_MODEL} | style={style}")
        response = client.chat.completions.create(
            model=LLM_MODEL,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=300,
        )
        return response.choices[0].message.content
    except Exception as e:
        logger.error(f"Summary error: {e}")
        return f"❌ Summary error: {e}"
def process_audio(audio, style: str) -> tuple[str, str]:
    """Run the two-stage pipeline: audio -> transcript -> styled summary.

    Returns (transcript, summary). With no audio, a prompt message is placed
    in the transcript slot; if transcription fails, the error string is shown
    there and summarization is skipped.
    """
    logger.info(f"process_audio() called | style={style}")
    if audio is None:
        return "🎀 Record or upload audio first!", ""
    text = transcribe(audio)
    if text.startswith("❌"):
        # Transcription failed — surface the error, skip the LLM call.
        return text, ""
    return text, summarize(text, style)
logger.info("Building Gradio interface...")
# UI layout: two equal columns — inputs (audio, summary style, run button)
# on the left, read-only transcript/summary outputs on the right.
with gr.Blocks(title="Voice Summarizer") as demo:
    gr.Markdown("""# πŸŽ™οΈ Voice Summarizer
Record audio β†’ Get transcript β†’ Get AI summary!
*Pipeline: Whisper (transcription) β†’ Qwen (summarization)*
""")
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            # type="filepath" hands process_audio a path on disk, which the
            # InferenceClient ASR call accepts directly.
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="🎀 Record or Upload Audio"
            )
            # Choices must match the prompt keys in summarize().
            style = gr.Radio(
                choices=["Brief Summary", "Key Points", "Action Items", "ELI5"],
                value="Brief Summary",
                label="Summary Style"
            )
            btn = gr.Button("πŸš€ Process!", variant="primary", size="lg")
        with gr.Column(scale=1):
            # Outputs are display-only; users can't edit them.
            transcript = gr.Textbox(label="πŸ“ Transcript", lines=6, interactive=False)
            summary = gr.Textbox(label="✨ Summary", lines=6, interactive=False)
    # Wire the button to the full pipeline; outputs fill both textboxes.
    btn.click(process_audio, inputs=[audio_input, style], outputs=[transcript, summary])
    gr.Markdown("""
### How it works
1. **Whisper** transcribes your audio to text
2. **Qwen 2.5** summarizes based on your selected style
3. All serverless - no downloads needed!
""")
# queue() serializes/queues concurrent requests before launching the server.
demo.queue()
logger.info("Starting Gradio server...")
demo.launch()