# NOTE(review): the three lines below the title were Hugging Face Spaces page
# chrome ("Spaces: Sleeping") captured by the scrape, not part of the program.
| import os | |
| import logging | |
| import gradio as gr | |
| from huggingface_hub import InferenceClient | |
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

# Environment variables
# HF_TOKEN: Hugging Face API token; empty string falls back to anonymous access.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
# Model IDs are overridable via env vars so the app can be repointed without a code change.
WHISPER_MODEL = os.environ.get("WHISPER_MODEL", "openai/whisper-large-v3-turbo")
LLM_MODEL = os.environ.get("LLM_MODEL", "Qwen/Qwen2.5-72B-Instruct")

logger.info(f"HF_TOKEN configured: {bool(HF_TOKEN)}")  # log presence only, never the token value
logger.info(f"WHISPER_MODEL: {WHISPER_MODEL}")
logger.info(f"LLM_MODEL: {LLM_MODEL}")

# One shared client serves both the ASR call and the chat-completion call.
client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else InferenceClient()
logger.info("InferenceClient initialized")
def transcribe(audio) -> str:
    """Run speech-to-text on the given audio file and return the transcript.

    Returns an empty string when no audio was supplied, and an error-marker
    string (prefixed with "β") when the inference call fails.
    """
    if audio is None:
        return ""
    try:
        logger.info(f"Transcribing with {WHISPER_MODEL}...")
        asr_output = client.automatic_speech_recognition(audio, model=WHISPER_MODEL)
        text = asr_output.text
        logger.info(f"Transcription: {len(text)} chars")
        return text
    except Exception as e:
        # Surface the failure to the UI as text rather than crashing the app.
        logger.error(f"Transcription error: {e}")
        return f"β Transcription error: {e}"
def summarize(text: str, style: str) -> str:
    """Summarize *text* with the LLM in the requested presentation style.

    Empty/whitespace-only input and upstream error markers (strings starting
    with "β") are passed through unchanged. An unrecognized style falls back
    to "Brief Summary" instead of raising KeyError.
    """
    # Pass through missing/blank text (the original crashed on None via
    # text.strip()) and transcription error markers untouched.
    if not text or not text.strip() or text.startswith("β"):
        return text
    prompts = {
        "Brief Summary": f"Summarize this in 2-3 sentences:\n\n{text}",
        "Key Points": f"Extract the key points as bullet points:\n\n{text}",
        "Action Items": f"Extract any action items or tasks mentioned:\n\n{text}",
        "ELI5": f"Explain the main idea like I'm 5 years old:\n\n{text}",
    }
    # Resolve the prompt outside the try block so an unknown style can no
    # longer surface as a cryptic "β Summary error: '<style>'" KeyError.
    prompt = prompts.get(style, prompts["Brief Summary"])
    try:
        logger.info(f"Summarizing with {LLM_MODEL} | style={style}")
        response = client.chat.completions.create(
            model=LLM_MODEL,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=300,
        )
        return response.choices[0].message.content
    except Exception as e:
        # Surface the failure to the UI as text rather than crashing the app.
        logger.error(f"Summary error: {e}")
        return f"β Summary error: {e}"
def process_audio(audio, style: str) -> tuple[str, str]:
    """End-to-end pipeline: transcribe the audio, then summarize the transcript.

    Returns a (transcript, summary) pair; the summary slot is empty when no
    audio was provided or transcription failed.
    """
    logger.info(f"process_audio() called | style={style}")
    if audio is None:
        # Nothing recorded or uploaded yet - prompt the user instead of erroring.
        return "π€ Record or upload audio first!", ""
    text = transcribe(audio)
    if text.startswith("β"):
        # Transcription failed: show the error marker and skip summarization.
        return text, ""
    return text, summarize(text, style)
logger.info("Building Gradio interface...")

with gr.Blocks(title="Voice Summarizer") as demo:
    # Header / intro copy shown above the controls.
    gr.Markdown("""# ποΈ Voice Summarizer
Record audio β Get transcript β Get AI summary!
*Pipeline: Whisper (transcription) β Qwen (summarization)*
""")
    with gr.Row(equal_height=True):
        # Left column: inputs (audio source, summary style, trigger button).
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",  # hand transcribe() a file path, not raw samples
                label="π€ Record or Upload Audio"
            )
            # Choices must match the keys of the prompts dict in summarize().
            style = gr.Radio(
                choices=["Brief Summary", "Key Points", "Action Items", "ELI5"],
                value="Brief Summary",
                label="Summary Style"
            )
            btn = gr.Button("π Process!", variant="primary", size="lg")
        # Right column: read-only output boxes.
        with gr.Column(scale=1):
            transcript = gr.Textbox(label="π Transcript", lines=6, interactive=False)
            summary = gr.Textbox(label="β¨ Summary", lines=6, interactive=False)
    # Wire the button to the full transcribe-then-summarize pipeline.
    btn.click(process_audio, inputs=[audio_input, style], outputs=[transcript, summary])
    gr.Markdown("""
### How it works
1. **Whisper** transcribes your audio to text
2. **Qwen 2.5** summarizes based on your selected style
3. All serverless - no downloads needed!
""")

# Enable the request queue before launching the server.
demo.queue()
logger.info("Starting Gradio server...")
demo.launch()