Spaces:

ArchCoder
/

basic_app

Sleeping

App Files Files Community

ArchCoder committed on Oct 10, 2025

Commit

990db9b

verified ·

1 Parent(s): b53f1b1

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -76

app.py CHANGED Viewed

@@ -1,34 +1,35 @@
 import gradio as gr
 from faster_whisper import WhisperModel
-from llama_cpp import Llama
 from duckduckgo_search import DDGS
 import time
 # Initialize models
 print("Loading Whisper model...")
 whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8")
 print("Loading LLM...")
-llm = Llama.from_pretrained(
-    repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF",
-    filename="qwen2.5-0.5b-instruct-q4_k_m.gguf",
-    n_ctx=2048,
-    n_threads=4,
-    verbose=False
 )
-# Initialize DuckDuckGo Search (no API key needed!)
 ddgs = DDGS(timeout=3)
-def search_web(query, max_results=3):
     """Perform web search using DuckDuckGo (FREE & UNLIMITED)"""
     try:
-        # Use text search for fast results
         results = ddgs.text(
             keywords=query,
-            region='wt-wt',  # Worldwide results
             safesearch='moderate',
-            timelimit='m',  # Last month for freshness
             max_results=max_results
         )
@@ -62,53 +63,59 @@ def process_audio(audio_path, question_text=None):
     transcription_time = time.time() - start_time
-    # Step 2: Web search for current info
     search_start = time.time()
-    search_results = search_web(question, max_results=2)  # Reduced to 2 for speed
     search_time = time.time() - search_start
     # Step 3: Generate answer with LLM
     llm_start = time.time()
-    prompt = f"""Answer the question briefly using the context below.
-Context:
-{search_results}
-Question: {question}
-Answer:"""
     try:
-        response = llm(
-            prompt,
-            max_tokens=120,  # Reduced for faster generation
-            temperature=0.2,  # Lower for faster, more focused responses
-            top_p=0.85,
-            stop=["Question:", "\n\n\n"],
-            echo=False
         )
-        answer = response['choices'][0]['text'].strip()
     except Exception as e:
         answer = f"❌ LLM error: {str(e)}"
     llm_time = time.time() - llm_start
     total_time = time.time() - start_time
-    # Color code timing (green if under 3s, yellow if close, red if over)
     time_emoji = "🟢" if total_time < 3.0 else "🟡" if total_time < 3.5 else "🔴"
     timing_info = f"\n\n{time_emoji} **Timing:** Trans={transcription_time:.2f}s | Search={search_time:.2f}s | LLM={llm_time:.2f}s | **Total={total_time:.2f}s**"
     return answer + timing_info, total_time
-# Create Gradio interface
-with gr.Blocks(title="Fast Q&A - FREE Unlimited Search", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
     # ⚡ Ultra-Fast Political Q&A System
-    Ask questions via audio or text. **FREE unlimited web search** with DuckDuckGo!
-    **Features:** Whisper-tiny + Qwen2.5-0.5B + DuckDuckGo (No API Key!)
     """)
     with gr.Tab("🎙️ Audio Input"):
@@ -122,11 +129,7 @@ with gr.Blocks(title="Fast Q&A - FREE Unlimited Search", theme=gr.themes.Soft())
                 audio_submit = gr.Button("🚀 Submit Audio", variant="primary", size="lg")
             with gr.Column():
-                audio_output = gr.Textbox(
-                    label="Answer",
-                    lines=8,
-                    show_copy_button=True
-                )
                 audio_time = gr.Number(label="Response Time (seconds)", precision=2)
         audio_submit.click(
@@ -147,11 +150,7 @@ with gr.Blocks(title="Fast Q&A - FREE Unlimited Search", theme=gr.themes.Soft())
                 text_submit = gr.Button("🚀 Submit Text", variant="primary", size="lg")
             with gr.Column():
-                text_output = gr.Textbox(
-                    label="Answer",
-                    lines=8,
-                    show_copy_button=True
-                )
                 text_time = gr.Number(label="Response Time (seconds)", precision=2)
         text_submit.click(
@@ -165,44 +164,16 @@ with gr.Blocks(title="Fast Q&A - FREE Unlimited Search", theme=gr.themes.Soft())
             examples=[
                 ["Who won the 2024 US presidential election?"],
                 ["What is the current inflation rate in India?"],
-                ["Who is the prime minister of UK?"],
-                ["What is the latest news about AI?"]
             ],
             inputs=text_input
         )
-    with gr.Accordion("📡 API Usage via curl", open=False):
-        gr.Markdown("""
-        ### Text Query (Simplest):
-        ```
-        curl -X POST https://archcoder-basic-app.hf.space/call/text_query \\
-          -H "Content-Type: application/json" \\
-          -d '{"data": ["Who is the current US president?"]}'
-        ```
-        ### Audio Query:
-        ```
-        # Upload audio
-        curl -F "files=@audio.mp3" https://archcoder-basic-app.hf.space/upload
-        # Query (replace path from upload response)
-        curl -X POST https://archcoder-basic-app.hf.space/call/audio_query \\
-          -H "Content-Type: application/json" \\
-          -d '{"data": [{"path": "/tmp/gradio/YOUR_FILE.mp3"}]}'
-        ```
-        """)
     gr.Markdown("""
     ---
-    ### 🎯 System Specs
-    - **Search:** DuckDuckGo (FREE, unlimited, no API key!)
-    - **Transcription:** Whisper-tiny (optimized for speed)
-    - **LLM:** Qwen2.5-0.5B Q4 (fast factual answers)
-    - **Target:** Sub-3s total response time
     🟢 = Under 3s | 🟡 = 3-3.5s | 🔴 = Over 3.5s
     """)
 if __name__ == "__main__":
-    demo.queue(max_size=5)  # Limit queue for consistent performance
     demo.launch()

 import gradio as gr
 from faster_whisper import WhisperModel
+from transformers import AutoTokenizer, AutoModelForCausalLM
 from duckduckgo_search import DDGS
 import time
+import torch
 # Initialize models
 print("Loading Whisper model...")
 whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8")
 print("Loading LLM...")
+model_name = "Qwen/Qwen2.5-0.5B-Instruct"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.float32,
+    device_map="cpu",
+    low_cpu_mem_usage=True
 )
+# Initialize DuckDuckGo Search
 ddgs = DDGS(timeout=3)
+def search_web(query, max_results=2):
     """Perform web search using DuckDuckGo (FREE & UNLIMITED)"""
     try:
         results = ddgs.text(
             keywords=query,
+            region='wt-wt',
             safesearch='moderate',
+            timelimit='m',
             max_results=max_results
         )
     transcription_time = time.time() - start_time
+    # Step 2: Web search
     search_start = time.time()
+    search_results = search_web(question, max_results=2)
     search_time = time.time() - search_start
     # Step 3: Generate answer with LLM
     llm_start = time.time()
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant. Answer questions briefly using the provided context."},
+        {"role": "user", "content": f"Context:\n{search_results}\n\nQuestion: {question}\n\nAnswer:"}
+    ]
     try:
+        text = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
         )
+        inputs = tokenizer([text], return_tensors="pt").to("cpu")
+        with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=120,
+                temperature=0.2,
+                do_sample=True,
+                top_p=0.85,
+                pad_token_id=tokenizer.eos_token_id
+            )
+        response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
+        answer = response.strip()
     except Exception as e:
         answer = f"❌ LLM error: {str(e)}"
     llm_time = time.time() - llm_start
     total_time = time.time() - start_time
     time_emoji = "🟢" if total_time < 3.0 else "🟡" if total_time < 3.5 else "🔴"
     timing_info = f"\n\n{time_emoji} **Timing:** Trans={transcription_time:.2f}s | Search={search_time:.2f}s | LLM={llm_time:.2f}s | **Total={total_time:.2f}s**"
     return answer + timing_info, total_time
+# Create Gradio interface (same as before)
+with gr.Blocks(title="Fast Q&A - No Building Required!", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
     # ⚡ Ultra-Fast Political Q&A System
+    **No wheel building** - Fast deployment with transformers!
+    **Features:** Whisper-tiny + Qwen2.5-0.5B + DuckDuckGo (FREE unlimited search)
     """)
     with gr.Tab("🎙️ Audio Input"):
                 audio_submit = gr.Button("🚀 Submit Audio", variant="primary", size="lg")
             with gr.Column():
+                audio_output = gr.Textbox(label="Answer", lines=8, show_copy_button=True)
                 audio_time = gr.Number(label="Response Time (seconds)", precision=2)
         audio_submit.click(
                 text_submit = gr.Button("🚀 Submit Text", variant="primary", size="lg")
             with gr.Column():
+                text_output = gr.Textbox(label="Answer", lines=8, show_copy_button=True)
                 text_time = gr.Number(label="Response Time (seconds)", precision=2)
         text_submit.click(
             examples=[
                 ["Who won the 2024 US presidential election?"],
                 ["What is the current inflation rate in India?"],
+                ["Who is the prime minister of UK?"]
             ],
             inputs=text_input
         )
     gr.Markdown("""
     ---
     🟢 = Under 3s | 🟡 = 3-3.5s | 🔴 = Over 3.5s
     """)
 if __name__ == "__main__":
+    demo.queue(max_size=5)
     demo.launch()