"""Ultra-fast streaming Q&A app.

Pipeline: Whisper-tiny STT -> DuckDuckGo web search -> Qwen2.5-0.5B-Instruct
streaming generation, exposed through a Gradio UI plus hidden API endpoints
(/call/transcribe_stt, /call/answer_ai_stream) for Pluely integration.
Everything runs on CPU.
"""

import base64
import os
import tempfile
import time
from itertools import islice
from threading import Thread

import gradio as gr
import torch
from duckduckgo_search import DDGS
from faster_whisper import WhisperModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Initialize models
print("Loading Whisper model...")
whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8")

print("Loading LLM...")
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    device_map="cpu",
    low_cpu_mem_usage=True,
)

# Initialize DuckDuckGo Search
ddgs = DDGS(timeout=3)


def search_web(query, max_results=2):
    """Perform a DuckDuckGo text search and return a numbered context string.

    Returns "No search results found." when the search yields nothing, or a
    "Search failed: ..." string (never raises) so callers can embed the result
    directly in an LLM prompt.
    """
    try:
        results = ddgs.text(
            keywords=query,
            region='wt-wt',
            safesearch='moderate',
            timelimit='m',
            max_results=max_results,
        )
        context = ""
        # islice is safe whether DDGS.text returns a list or a generator
        # (list slicing on a generator raises TypeError in some versions).
        for i, result in enumerate(islice(results, max_results), 1):
            title = result.get('title', '')
            body = result.get('body', '')
            context += f"\n[{i}] {title}\n{body}\n"
        return context.strip() if context else "No search results found."
    except Exception as e:
        return f"Search failed: {str(e)}"


def transcribe_audio_base64(audio_base64):
    """Transcribe base64-encoded WAV audio (Pluely STT endpoint).

    Returns {"text": ...} on success or {"error": ...} on failure.
    """
    temp_path = None
    try:
        # Decode base64 audio and stage it in a temp file for faster-whisper.
        audio_bytes = base64.b64decode(audio_base64)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            temp_audio.write(audio_bytes)
            temp_path = temp_audio.name

        segments, _ = whisper_model.transcribe(temp_path, language="en", beam_size=1)
        transcription = " ".join([seg.text for seg in segments])
        return {"text": transcription.strip()}
    except Exception as e:
        return {"error": f"Transcription failed: {str(e)}"}
    finally:
        # Always remove the temp file, even when transcription raised
        # (the original only unlinked on the success path, leaking files).
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)


def generate_answer_stream(text_input):
    """Yield progressively longer answer strings for *text_input*.

    Performs a web search, builds a chat prompt, then streams tokens from the
    LLM via TextIteratorStreamer (generation runs in a background thread).
    Yields an error string instead of raising.
    """
    try:
        if not text_input or text_input.strip() == "":
            yield "No input provided"
            return

        # Web search (non-streaming part)
        search_results = search_web(text_input, max_results=2)

        # Prepare messages
        messages = [
            {
                "role": "system",
                "content": (
                    "You are a helpful assistant. Answer briefly using "
                    "provided context. Keep responses under 40 words."
                ),
            },
            {
                "role": "user",
                "content": f"Context:\n{search_results}\n\nQuestion: {text_input}\n\nAnswer:",
            },
        ]

        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        inputs = tokenizer([text], return_tensors="pt").to("cpu")

        # Setup streaming
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        generation_kwargs = dict(
            inputs=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=80,
            temperature=0.2,
            do_sample=True,
            top_p=0.85,
            pad_token_id=tokenizer.eos_token_id,
            streamer=streamer,
        )

        # Start generation in a separate thread; the streamer iterator below
        # ends when generation finishes.
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

        # Stream tokens as they're generated
        generated_text = ""
        for new_text in streamer:
            generated_text += new_text
            yield generated_text

        # Reap the worker thread so it never outlives the request.
        thread.join()
    except Exception as e:
        yield f"Error: {str(e)}"


def generate_answer(text_input):
    """Generate a complete answer (non-streaming) by draining the stream."""
    try:
        if not text_input or text_input.strip() == "":
            return "No input provided"
        # The last chunk from the stream is the full answer.
        final_answer = ""
        for chunk in generate_answer_stream(text_input):
            final_answer = chunk
        return final_answer
    except Exception as e:
        return f"Error: {str(e)}"


def process_audio_stream(audio_path, question_text=None):
    """Streaming pipeline for the Gradio UI.

    Transcribes *audio_path* when given (else uses *question_text*), searches
    the web, then yields (partial_answer_with_timing, elapsed_seconds) tuples
    matching the two output components.
    """
    start_time = time.time()

    # Step 1: Transcribe audio if provided
    if audio_path:
        try:
            segments, _ = whisper_model.transcribe(audio_path, language="en", beam_size=1)
            question = " ".join([seg.text for seg in segments])
        except Exception as e:
            yield f"❌ Transcription error: {str(e)}", 0.0
            return
    else:
        question = question_text

    if not question or question.strip() == "":
        yield "❌ No input provided", 0.0
        return

    transcription_time = time.time() - start_time

    # Step 2: Web search
    search_start = time.time()
    search_results = search_web(question, max_results=2)
    search_time = time.time() - search_start

    # Step 3: Stream answer generation
    llm_start = time.time()
    for partial_answer in generate_answer_stream(question):
        current_time = time.time() - start_time
        time_emoji = "🟢" if current_time < 3.0 else "🟡" if current_time < 3.5 else "🔴"
        timing_info = (
            f"\n\n{time_emoji} **Timing:** Trans={transcription_time:.2f}s | "
            f"Search={search_time:.2f}s | LLM={(time.time()-llm_start):.2f}s | "
            f"**Total={current_time:.2f}s**"
        )
        # IMPORTANT: Must yield tuple (text, number) to match output components
        yield partial_answer + timing_info, current_time


def _text_query_stream(text):
    """Generator wrapper for the text tab.

    Gradio only streams when the handler itself is a generator function; a
    lambda *returning* a generator is not detected by
    inspect.isgeneratorfunction, so streaming would silently break.
    """
    yield from process_audio_stream(None, text)


# Create Gradio interface
with gr.Blocks(title="Fast Q&A - Streaming Enabled", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # ⚡ Ultra-Fast Political Q&A System
    **Streaming enabled** for instant feedback! Pluely compatible endpoints available.

    **Features:** Whisper-tiny + Qwen2.5-0.5B + DuckDuckGo + Real-time streaming
    """)

    with gr.Tab("🎙️ Audio Input"):
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Record or upload audio",
                )
                audio_submit = gr.Button("🚀 Submit Audio", variant="primary", size="lg")
            with gr.Column():
                audio_output = gr.Textbox(label="Answer (Streaming)", lines=8, show_copy_button=True)
                audio_time = gr.Number(label="Response Time (seconds)", precision=2)

        # Hidden textbox supplies question_text=None for the audio path.
        audio_submit.click(
            fn=process_audio_stream,
            inputs=[audio_input, gr.Textbox(value=None, visible=False)],
            outputs=[audio_output, audio_time],
            api_name="audio_query_stream",
        )

    with gr.Tab("✍️ Text Input"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Type your question",
                    placeholder="Who is the current US president?",
                    lines=3,
                )
                text_submit = gr.Button("🚀 Submit Text", variant="primary", size="lg")
            with gr.Column():
                text_output = gr.Textbox(label="Answer (Streaming)", lines=8, show_copy_button=True)
                text_time = gr.Number(label="Response Time (seconds)", precision=2)

        text_submit.click(
            fn=_text_query_stream,
            inputs=[text_input],
            outputs=[text_output, text_time],
            api_name="text_query_stream",
        )

        gr.Examples(
            examples=[
                ["Who won the 2024 US presidential election?"],
                ["What is the current inflation rate in India?"],
                ["Who is the prime minister of UK?"],
            ],
            inputs=text_input,
        )

    # API endpoints for Pluely
    with gr.Tab("🔌 Pluely Integration"):
        gr.Markdown("""
        ## Dedicated Endpoints for Pluely

        ### 1. STT Endpoint (Audio Transcription)
        ```
        curl -X POST https://archcoder-basic-app.hf.space/call/transcribe_stt \\
          -H "Content-Type: application/json" \\
          -d '{"data": ["BASE64_AUDIO_DATA"]}'
        ```
        **Response Format:** `{"data": [{"text": "transcribed text"}]}`

        ### 2. AI Endpoint - Streaming
        ```
        curl -X POST https://archcoder-basic-app.hf.space/call/answer_ai_stream \\
          -H "Content-Type: application/json" \\
          -d '{"data": ["Your question here"]}'
        ```
        **Response Format:** Streaming text chunks

        ---

        ## Pluely Configuration

        ### Custom STT Provider:
        **Curl Command:**
        ```
        curl https://archcoder-basic-app.hf.space/call/transcribe_stt -H "Content-Type: application/json" -d '{"data": ["{{AUDIO_BASE64}}"]}'
        ```
        **Response Content Path:** `data[0].text`
        **Streaming:** OFF

        ### Custom AI Provider (Streaming):
        **Curl Command:**
        ```
        curl https://archcoder-basic-app.hf.space/call/answer_ai_stream -H "Content-Type: application/json" -d '{"data": ["{{TEXT}}"]}'
        ```
        **Response Content Path:** `data`
        **Streaming:** ON ✅
        """)

    # Hidden interface components that create API endpoints
    with gr.Row(visible=False):
        stt_input = gr.Textbox()
        stt_output = gr.JSON()
        ai_stream_input = gr.Textbox()
        ai_stream_output = gr.Textbox()

    # These create the /call/transcribe_stt and /call/answer_ai_stream endpoints
    stt_button = gr.Button("STT", visible=False)
    stt_button.click(
        fn=transcribe_audio_base64,
        inputs=[stt_input],
        outputs=[stt_output],
        api_name="transcribe_stt",
    )

    ai_stream_button = gr.Button("AI Stream", visible=False)
    ai_stream_button.click(
        fn=generate_answer_stream,
        inputs=[ai_stream_input],
        outputs=[ai_stream_output],
        api_name="answer_ai_stream",
    )

    gr.Markdown("""
    ---
    🟢 = Under 3s | 🟡 = 3-3.5s | 🔴 = Over 3.5s

    **Streaming Mode:** Words appear as they're generated - much faster perceived response!
    """)

if __name__ == "__main__":
    demo.queue(max_size=5)
    demo.launch()