import os
import logging
import gradio as gr
from huggingface_hub import InferenceClient
# Configure logging once at import time for the whole app.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

# Environment variables — all optional, with serverless-friendly defaults.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
WHISPER_MODEL = os.environ.get("WHISPER_MODEL", "openai/whisper-large-v3-turbo")
LLM_MODEL = os.environ.get("LLM_MODEL", "Qwen/Qwen2.5-72B-Instruct")

# Lazy %-style args: formatting is skipped when the level is disabled,
# and the token itself is never logged — only whether one is set.
logger.info("HF_TOKEN configured: %s", bool(HF_TOKEN))
logger.info("WHISPER_MODEL: %s", WHISPER_MODEL)
logger.info("LLM_MODEL: %s", LLM_MODEL)

# Authenticated client when a token is available, anonymous otherwise.
client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else InferenceClient()
logger.info("InferenceClient initialized")
def transcribe(audio) -> str:
    """Transcribe an audio file to text with the Whisper inference API.

    Args:
        audio: Filepath to the recorded/uploaded audio clip (Gradio
            ``type="filepath"``), or None when nothing was provided.

    Returns:
        The transcript text, ``""`` for no input, or a ``"β ..."``-prefixed
        error message — that prefix is the sentinel downstream steps check.
    """
    if audio is None:
        return ""
    try:
        # Lazy %-args instead of f-strings so formatting only happens
        # when INFO logging is enabled.
        logger.info("Transcribing with %s...", WHISPER_MODEL)
        result = client.automatic_speech_recognition(audio, model=WHISPER_MODEL)
        logger.info("Transcription: %d chars", len(result.text))
        return result.text
    except Exception as e:
        # Broad catch is deliberate at this UI boundary: surface any API
        # failure as a message rather than crash the request.
        logger.error("Transcription error: %s", e)
        return f"β Transcription error: {e}"
def summarize(text: str, style: str) -> str:
    """Summarize *text* with the LLM in the requested style.

    Args:
        text: Transcript to summarize. Blank text and ``"β ..."`` error
            sentinels are passed straight through unchanged.
        style: One of "Brief Summary", "Key Points", "Action Items", "ELI5".
            Unknown styles fall back to "Brief Summary".

    Returns:
        The model's summary, or a ``"β ..."``-prefixed error message.
    """
    if not text.strip() or text.startswith("β"):
        return text
    prompts = {
        "Brief Summary": f"Summarize this in 2-3 sentences:\n\n{text}",
        "Key Points": f"Extract the key points as bullet points:\n\n{text}",
        "Action Items": f"Extract any action items or tasks mentioned:\n\n{text}",
        "ELI5": f"Explain the main idea like I'm 5 years old:\n\n{text}",
    }
    # Bug fix: the original indexed prompts[style] unguarded, so an
    # unrecognized style raised an uncaught KeyError (the dict lookup sat
    # outside the try). Fall back to a brief summary instead.
    prompt = prompts.get(style, prompts["Brief Summary"])
    try:
        logger.info("Summarizing with %s | style=%s", LLM_MODEL, style)
        response = client.chat.completions.create(
            model=LLM_MODEL,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=300,
        )
        return response.choices[0].message.content
    except Exception as e:
        logger.error("Summary error: %s", e)
        return f"β Summary error: {e}"
def process_audio(audio, style: str) -> tuple[str, str]:
    """Full pipeline: transcribe the audio, then summarize the transcript.

    Args:
        audio: Filepath to the audio clip, or None when nothing was recorded.
        style: Summary style label forwarded to :func:`summarize`.

    Returns:
        ``(transcript, summary)``. On missing input or transcription failure
        the first element carries the message and the summary is ``""``.
    """
    logger.info("process_audio() called | style=%s", style)
    if audio is None:
        return "π€ Record or upload audio first!", ""
    transcript = transcribe(audio)
    # Error sentinel from transcribe(): show it, skip summarization.
    if transcript.startswith("β"):
        return transcript, ""
    summary = summarize(transcript, style)
    return transcript, summary
logger.info("Building Gradio interface...")

# UI layout: input controls on the left, transcript/summary outputs on the
# right; a single button drives the whole pipeline.
with gr.Blocks(title="Voice Summarizer") as demo:
    gr.Markdown("""# ποΈ Voice Summarizer
Record audio β Get transcript β Get AI summary!
*Pipeline: Whisper (transcription) β Qwen (summarization)*
""")
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            # Audio arrives as a filepath (see transcribe()).
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="π€ Record or Upload Audio"
            )
            # Choices must match the keys of the prompts dict in summarize().
            style = gr.Radio(
                choices=["Brief Summary", "Key Points", "Action Items", "ELI5"],
                value="Brief Summary",
                label="Summary Style"
            )
            btn = gr.Button("π Process!", variant="primary", size="lg")
        with gr.Column(scale=1):
            # Read-only outputs filled by process_audio().
            transcript = gr.Textbox(label="π Transcript", lines=6, interactive=False)
            summary = gr.Textbox(label="β¨ Summary", lines=6, interactive=False)
    btn.click(process_audio, inputs=[audio_input, style], outputs=[transcript, summary])
    gr.Markdown("""
### How it works
1. **Whisper** transcribes your audio to text
2. **Qwen 2.5** summarizes based on your selected style
3. All serverless - no downloads needed!
""")

# Queue requests so concurrent users don't collide, then start the server.
demo.queue()
logger.info("Starting Gradio server...")
demo.launch()