import gradio as gr
from faster_whisper import WhisperModel
from llama_cpp import Llama
from brave import Brave
import os
import time
# Initialize models
print("Loading models...")
whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8")
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF",
    filename="qwen2.5-0.5b-instruct-q4_k_m.gguf",
    n_ctx=2048,
    n_threads=4,
    verbose=False
)
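# Note: Llama.from_pretrained downloads the GGUF from the Hugging Face Hub on
# the first run and caches it locally, so the first startup is the slowest.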
# Initialize Brave Search client (expects BRAVE_API_KEY in the environment,
# e.g. set as a Space secret; with an empty key every search will fail)
brave_client = Brave(api_key=os.getenv("BRAVE_API_KEY", ""))

def search_web(query, max_results=3):
    """Perform a web search using the Brave API and format the top results."""
    try:
        results = brave_client.search(q=query, count=max_results)
        web_results = results.web_results if hasattr(results, 'web_results') else []
        context = ""
        for i, result in enumerate(web_results[:max_results], 1):
            context += f"\n[{i}] {result.title}\n{result.description}\n"
        return context.strip()
    except Exception as e:
        return f"Search failed: {str(e)}"

def process_audio(audio_path, question_text=None):
    """Main pipeline: audio -> text -> web search -> LLM answer."""
    start_time = time.time()
    # Step 1: Transcribe the audio if a file was provided; otherwise fall
    # back to the typed question
    if audio_path:
        segments, _ = whisper_model.transcribe(audio_path, language="en")
        question = " ".join([seg.text for seg in segments])
    else:
        question = question_text
    if not question:
        return "No input provided", 0.0
    transcription_time = time.time() - start_time
    # Step 2: Web search for political/current info
    search_start = time.time()
    search_results = search_web(question)
    search_time = time.time() - search_start
    # Step 3: Generate an answer with the LLM, grounded in the search results
    llm_start = time.time()
    prompt = f"""You are a helpful assistant. Answer the question based on the context below.
Context from web search:
{search_results}
Question: {question}
Answer briefly and accurately:"""
    response = llm(
        prompt,
        max_tokens=150,
        temperature=0.3,
        top_p=0.9,
        stop=["Question:", "\n\n"],
        echo=False
    )
    answer = response['choices'][0]['text'].strip()
    llm_time = time.time() - llm_start
    total_time = time.time() - start_time
    timing_info = f"\n\n⏱️ Timing: Transcription={transcription_time:.2f}s | Search={search_time:.2f}s | LLM={llm_time:.2f}s | Total={total_time:.2f}s"
    return answer + timing_info, total_time
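
# Local smoke test (hypothetical; bypasses the UI entirely):
#   answer, secs = process_audio(None, "Who won the 2024 US presidential election?")
#   print(answer)
# `answer` is the LLM reply plus the timing footer; `secs` is the total
# wall-clock time that the UI surfaces in its "Response Time" field.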

# Create the Gradio interface
with gr.Blocks(title="Fast Q&A with Web Search") as demo:
    gr.Markdown("# 🎤 Fast Political Q&A System\nAsk questions via audio or text. Answers in ~3 seconds!")
    with gr.Tab("Audio Input"):
        audio_input = gr.Audio(type="filepath", label="Record or upload audio question")
        audio_submit = gr.Button("Submit Audio", variant="primary")
        audio_output = gr.Textbox(label="Answer", lines=6)
        audio_time = gr.Number(label="Response Time (seconds)")
        # Audio tab routes the recorded/uploaded file through the full pipeline
        audio_submit.click(
            fn=lambda x: process_audio(x, None),
            inputs=[audio_input],
            outputs=[audio_output, audio_time],
            api_name="audio_query"
        )
    with gr.Tab("Text Input"):
        text_input = gr.Textbox(label="Type your question", placeholder="Who won the 2024 elections?")
        text_submit = gr.Button("Submit Text", variant="primary")
        text_output = gr.Textbox(label="Answer", lines=6)
        text_time = gr.Number(label="Response Time (seconds)")
        # Text tab skips transcription and goes straight to search + LLM
        text_submit.click(
            fn=lambda x: process_audio(None, x),
            inputs=[text_input],
            outputs=[text_output, text_time],
            api_name="text_query"
        )
gr.Markdown("""
### 📡 API Usage
```
# Upload audio file
curl -F "files=@audio.mp3" https://YOUR-SPACE-URL/upload
# Make query
curl -X POST https://YOUR-SPACE-URL/call/audio_query \\
-H "Content-Type: application/json" \\
-d '{"data": [{"path": "/tmp/uploaded_audio.mp3"}]}'
```
""")
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
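
# Hedged client-side sketch (not part of the app): querying the running Space
# from Python with the `gradio_client` package. "YOUR-SPACE-URL" and the local
# "audio.mp3" file are placeholders.
#
#   from gradio_client import Client, handle_file
#
#   client = Client("https://YOUR-SPACE-URL")
#   # api_name values match the .click() registrations above
#   answer, secs = client.predict("Who won the 2024 elections?", api_name="/text_query")
#   answer, secs = client.predict(handle_file("audio.mp3"), api_name="/audio_query")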