File size: 3,980 Bytes
2b29497
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import logging
import gradio as gr
from huggingface_hub import InferenceClient

# Configure logging
# Timestamped console logging; the module logger below inherits this config.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

# Environment variables
# HF_TOKEN is optional: without it the InferenceClient is created
# unauthenticated. Model ids can be overridden via the environment and
# default to hosted Whisper / Qwen checkpoints.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
WHISPER_MODEL = os.environ.get("WHISPER_MODEL", "openai/whisper-large-v3-turbo")
LLM_MODEL = os.environ.get("LLM_MODEL", "Qwen/Qwen2.5-72B-Instruct")

# Log only whether a token exists — never the token value itself.
logger.info(f"HF_TOKEN configured: {bool(HF_TOKEN)}")
logger.info(f"WHISPER_MODEL: {WHISPER_MODEL}")
logger.info(f"LLM_MODEL: {LLM_MODEL}")

# One shared client serves both the ASR and the chat-completion calls.
client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else InferenceClient()
logger.info("InferenceClient initialized")


def transcribe(audio) -> str:
    """Convert an audio file into text via the configured Whisper model.

    Args:
        audio: Path to the recorded/uploaded audio file, or None.

    Returns:
        "" when no audio is given, the transcript on success, or an
        error string prefixed with "❌" when the inference call fails.
    """
    if audio is None:
        return ""

    try:
        logger.info(f"Transcribing with {WHISPER_MODEL}...")
        asr_output = client.automatic_speech_recognition(audio, model=WHISPER_MODEL)
        text = asr_output.text
        logger.info(f"Transcription: {len(text)} chars")
        return text
    except Exception as e:
        logger.error(f"Transcription error: {e}")
        return f"❌ Transcription error: {e}"


def summarize(text: str, style: str) -> str:
    """Summarize *text* with the configured LLM.

    Args:
        text: Transcript to summarize. Empty/whitespace-only input and
            upstream error strings (prefixed with "❌") are passed
            through unchanged.
        style: One of "Brief Summary", "Key Points", "Action Items",
            "ELI5". An unrecognized style falls back to "Brief Summary"
            instead of raising KeyError.

    Returns:
        The model's summary, or an error string prefixed with "❌" when
        the inference call fails.
    """
    if not text.strip() or text.startswith("❌"):
        return text

    prompts = {
        "Brief Summary": f"Summarize this in 2-3 sentences:\n\n{text}",
        "Key Points": f"Extract the key points as bullet points:\n\n{text}",
        "Action Items": f"Extract any action items or tasks mentioned:\n\n{text}",
        "ELI5": f"Explain the main idea like I'm 5 years old:\n\n{text}",
    }
    # Bug fix: prompts[style] raised KeyError for an unknown style, which the
    # except below turned into a confusing "❌ Summary error: '<style>'".
    # Fall back to the default prompt instead.
    prompt = prompts.get(style, prompts["Brief Summary"])

    try:
        logger.info(f"Summarizing with {LLM_MODEL} | style={style}")
        response = client.chat.completions.create(
            model=LLM_MODEL,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=300,
        )
        return response.choices[0].message.content
    except Exception as e:
        logger.error(f"Summary error: {e}")
        return f"❌ Summary error: {e}"


def process_audio(audio, style: str) -> tuple[str, str]:
    """Run the full pipeline: transcription, then summarization.

    Args:
        audio: Path to the recorded/uploaded audio file, or None.
        style: Summary style passed through to summarize().

    Returns:
        A (transcript, summary) pair. On missing audio or a
        transcription failure the summary slot is "".
    """
    logger.info(f"process_audio() called | style={style}")

    if audio is None:
        return "🎀 Record or upload audio first!", ""

    transcript_text = transcribe(audio)
    # A "❌"-prefixed transcript signals an upstream failure; skip the LLM.
    if transcript_text.startswith("❌"):
        return transcript_text, ""

    return transcript_text, summarize(transcript_text, style)


logger.info("Building Gradio interface...")

# UI layout: left column holds the inputs (audio source, summary style,
# submit button); right column holds the two read-only outputs.
with gr.Blocks(title="Voice Summarizer") as demo:
    gr.Markdown("""# πŸŽ™οΈ Voice Summarizer
Record audio β†’ Get transcript β†’ Get AI summary!

*Pipeline: Whisper (transcription) β†’ Qwen (summarization)*
""")

    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            # type="filepath" hands process_audio a path string, which is what
            # client.automatic_speech_recognition expects.
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="🎀 Record or Upload Audio"
            )
            
            # Choices must match the keys of the prompts dict in summarize().
            style = gr.Radio(
                choices=["Brief Summary", "Key Points", "Action Items", "ELI5"],
                value="Brief Summary",
                label="Summary Style"
            )
            
            btn = gr.Button("πŸš€ Process!", variant="primary", size="lg")
        
        with gr.Column(scale=1):
            # Outputs are non-interactive: filled only by process_audio.
            transcript = gr.Textbox(label="πŸ“ Transcript", lines=6, interactive=False)
            summary = gr.Textbox(label="✨ Summary", lines=6, interactive=False)
    
    # Single event wiring: button click runs the whole pipeline.
    btn.click(process_audio, inputs=[audio_input, style], outputs=[transcript, summary])

    gr.Markdown("""
### How it works
1. **Whisper** transcribes your audio to text
2. **Qwen 2.5** summarizes based on your selected style
3. All serverless - no downloads needed!
""")

# queue() enables request queuing before the blocking launch() call.
demo.queue()
logger.info("Starting Gradio server...")
demo.launch()