File size: 3,980 Bytes
2b29497
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import logging
import gradio as gr
from huggingface_hub import InferenceClient

# Configure logging
# Timestamped console logging; the module logger below inherits this config.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

# Environment variables
# HF_TOKEN is optional: without it the InferenceClient is created
# unauthenticated. Model ids can be overridden via the environment and
# default to hosted Whisper / Qwen checkpoints.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
WHISPER_MODEL = os.environ.get("WHISPER_MODEL", "openai/whisper-large-v3-turbo")
LLM_MODEL = os.environ.get("LLM_MODEL", "Qwen/Qwen2.5-72B-Instruct")

# Log only whether a token exists — never the token value itself.
logger.info(f"HF_TOKEN configured: {bool(HF_TOKEN)}")
logger.info(f"WHISPER_MODEL: {WHISPER_MODEL}")
logger.info(f"LLM_MODEL: {LLM_MODEL}")

# One shared client serves both the ASR and the chat-completion calls.
client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else InferenceClient()
logger.info("InferenceClient initialized")


def transcribe(audio) -> str:
    """Convert an audio file into text via the configured Whisper model.

    Args:
        audio: Path to the recorded/uploaded audio file, or None.

    Returns:
        "" when no audio is given, the transcript on success, or an
        error string prefixed with "❌" when the inference call fails.
    """
    if audio is None:
        return ""

    try:
        logger.info(f"Transcribing with {WHISPER_MODEL}...")
        asr_output = client.automatic_speech_recognition(audio, model=WHISPER_MODEL)
        text = asr_output.text
        logger.info(f"Transcription: {len(text)} chars")
        return text
    except Exception as e:
        logger.error(f"Transcription error: {e}")
        return f"❌ Transcription error: {e}"


def summarize(text: str, style: str) -> str:
    """Summarize *text* with the configured LLM.

    Args:
        text: Transcript to summarize. Empty/whitespace-only input and
            upstream error strings (prefixed with "❌") are passed
            through unchanged.
        style: One of "Brief Summary", "Key Points", "Action Items",
            "ELI5". An unrecognized style falls back to "Brief Summary"
            instead of raising KeyError.

    Returns:
        The model's summary, or an error string prefixed with "❌" when
        the inference call fails.
    """
    if not text.strip() or text.startswith("❌"):
        return text

    prompts = {
        "Brief Summary": f"Summarize this in 2-3 sentences:\n\n{text}",
        "Key Points": f"Extract the key points as bullet points:\n\n{text}",
        "Action Items": f"Extract any action items or tasks mentioned:\n\n{text}",
        "ELI5": f"Explain the main idea like I'm 5 years old:\n\n{text}",
    }
    # Bug fix: prompts[style] raised KeyError for an unknown style, which the
    # except below turned into a confusing "❌ Summary error: '<style>'".
    # Fall back to the default prompt instead.
    prompt = prompts.get(style, prompts["Brief Summary"])

    try:
        logger.info(f"Summarizing with {LLM_MODEL} | style={style}")
        response = client.chat.completions.create(
            model=LLM_MODEL,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=300,
        )
        return response.choices[0].message.content
    except Exception as e:
        logger.error(f"Summary error: {e}")
        return f"❌ Summary error: {e}"


def process_audio(audio, style: str) -> tuple[str, str]:
    """Run the full pipeline: transcription, then summarization.

    Args:
        audio: Path to the recorded/uploaded audio file, or None.
        style: Summary style passed through to summarize().

    Returns:
        A (transcript, summary) pair. On missing audio or a
        transcription failure the summary slot is "".
    """
    logger.info(f"process_audio() called | style={style}")

    if audio is None:
        return "🎀 Record or upload audio first!", ""

    transcript_text = transcribe(audio)
    # A "❌"-prefixed transcript signals an upstream failure; skip the LLM.
    if transcript_text.startswith("❌"):
        return transcript_text, ""

    return transcript_text, summarize(transcript_text, style)


logger.info("Building Gradio interface...")

# UI layout: left column holds the inputs (audio source, summary style,
# submit button); right column holds the two read-only outputs.
with gr.Blocks(title="Voice Summarizer") as demo:
    gr.Markdown("""# πŸŽ™οΈ Voice Summarizer
Record audio β†’ Get transcript β†’ Get AI summary!

*Pipeline: Whisper (transcription) β†’ Qwen (summarization)*
""")

    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            # type="filepath" hands process_audio a path string, which is what
            # client.automatic_speech_recognition expects.
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="🎀 Record or Upload Audio"
            )
            
            # Choices must match the keys of the prompts dict in summarize().
            style = gr.Radio(
                choices=["Brief Summary", "Key Points", "Action Items", "ELI5"],
                value="Brief Summary",
                label="Summary Style"
            )
            
            btn = gr.Button("πŸš€ Process!", variant="primary", size="lg")
        
        with gr.Column(scale=1):
            # Outputs are non-interactive: filled only by process_audio.
            transcript = gr.Textbox(label="πŸ“ Transcript", lines=6, interactive=False)
            summary = gr.Textbox(label="✨ Summary", lines=6, interactive=False)
    
    # Single event wiring: button click runs the whole pipeline.
    btn.click(process_audio, inputs=[audio_input, style], outputs=[transcript, summary])

    gr.Markdown("""
### How it works
1. **Whisper** transcribes your audio to text
2. **Qwen 2.5** summarizes based on your selected style
3. All serverless - no downloads needed!
""")

# queue() enables request queuing before the blocking launch() call.
demo.queue()
logger.info("Starting Gradio server...")
demo.launch()