Update app.py
app.py CHANGED
@@ -3,204 +3,106 @@ from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os

-# --- Configuration
+# --- Configuration ---
 MODEL_REPO = "Kezovic/iris-q4gguf-v2"
 MODEL_FILE = "llama-3.2-1b-instruct.Q4_K_M.gguf"
 CONTEXT_WINDOW = 4096
 MAX_NEW_TOKENS = 512
 TEMPERATURE = 0.7

-# --- Model Loading Function
+# --- Model Loading Function ---
+# Initialize llm as None to avoid the Llama.__del__ 'NoneType' error
+llm = None
 def load_llm():
     """Downloads the GGUF model and initializes LlamaCPP."""
+    global llm  # Use the global variable
     print("Downloading model...")
-
-
-
-
-
-
-
-
-
-
-
-
-
+    try:
+        model_path = hf_hub_download(
+            repo_id=MODEL_REPO,
+            filename=MODEL_FILE
+        )
+
+        llm = Llama(
+            model_path=model_path,
+            n_ctx=CONTEXT_WINDOW,
+            n_threads=2,
+            verbose=False
+        )
+        print("Model loaded successfully!")
+        return llm
+    except Exception as e:
+        print(f"Error loading model: {e}")
+        return None

-# Load the model only once
+# Load the model only once
 llm = load_llm()
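The `llm = None` initialization above guards a real llama-cpp-python footgun, the one the commit comment names: if `Llama.__init__` raises (missing file, bad GGUF), Python still runs `Llama.__del__` on the half-built object during cleanup, which surfaces as a confusing 'NoneType' attribute error. A minimal sketch of the failure mode, using an illustrative class rather than llama_cpp's actual internals:

# Illustrative only: mimics an __init__ that fails before its fields exist.
class FragileModel:
    def __init__(self):
        raise RuntimeError("model file missing")   # nothing set up yet

    def __del__(self):
        self.ctx.close()   # AttributeError at cleanup: 'ctx' was never assigned

try:
    FragileModel()
except RuntimeError as e:
    # The constructor error is catchable; the __del__ error is only printed
    # as "Exception ignored in ...". Keeping a module-level fallback such as
    # llm = None makes later code fail with a clear message instead.
    print(f"init failed: {e}")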
-# ---
-def
+# --- Inference Function ---
+def generate_and_speak(text_prompt):
     """
-
-
-    3. Returns the transcribed text and the generated response text (which Gradio will TTS).
-
-    Args:
-        audio_file_path (str): The local path to the recorded audio file.
-
-    Returns:
-        tuple: (Transcribed Text, Generated Text Response)
+    Generates a text response using the Llama model.
+    The output text is automatically synthesized into speech by Gradio's Audio component.
     """
-    if
-        return "
+    if llm is None:
+        return "Error: LLM failed to load. Please check model configuration.", None

-
-
-    # if the 'type' parameter is set to "filepath" and the
-    # 'label' is set to "Microphone with Whisper".
-    # However, since we are not using the ChatInterface directly,
-    # we simulate the transcription by asking the user to speak clearly.
-    # In a real deployed Space, the user would see a transcript in the UI.
-    # For a fully audio-only demo, we'll focus on the TTS part.
-
-    # ***IMPORTANT***: The Gradio `gr.Audio(type="filepath", sources=["microphone"])` component
-    # returns the path to the recorded audio file. For true STT, you would need an
-    # additional STT model (like OpenAI Whisper or similar) here.
-    # To keep it simple and focus on the UI change, we'll prompt the user for the text
-    # they want to "transcribe" in the UI setup below.
-
-    # 2. Use the "transcribed" text for generation
-    # For a placeholder, let's assume the user's intent is in the file name or we use a static prompt
-    # Since we can't run Whisper here, we'll rely on the UI component structure.
-
-    # To make this function testable, let's assume the user's text input is passed via a separate text box
-    # and the audio file is just the trigger.
-    # MODIFICATION: Let's adjust the UI to use the *text* output from an STT component
-    # that's often paired with an audio recorder.
-
-    # For the purpose of providing a functional script:
-    # If using gr.Interface, we can pass the transcription as a separate input.
-    # If using gr.Blocks, we have full control.
-
-    # Let's adjust the function to accept the transcribed text directly (as in a common Gradio STT flow)
-    # and remove the audio_file_path argument for simplicity.
-    return "Error: Function signature needs adjustment for Gradio STT/TTS components."
-
-
-# --- NEW: Modified Inference Function for Audio Interface ---
-def generate_and_speak(transcribed_text):
-    """
-    Generates a response using the Llama model based on transcribed text
-    and returns the text output for Gradio's TTS feature.
-    """
-    if not transcribed_text or transcribed_text.strip() == "":
-        return "Please speak clearly into the microphone."
+    if not text_prompt or text_prompt.strip() == "":
+        return "Please enter a query.", None

     # Use a basic prompt template
-    full_prompt = f"### Human: {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # 1. Input: Audio recorder with automatic Speech-to-Text (STT) via Whisper (if available)
-    audio_input = gr.Audio(
-        sources=["microphone"],
-        type="text", # IMPORTANT: This tells Gradio to return the transcribed text (STT)
-        label="Speak Your Question Here"
-    )
-
-    # 2. Output: Text box to show the LLM response, which is automatically converted to speech (TTS)
-    audio_output = gr.Textbox(
-        label="Assistant Response (Text)",
-        value="The model's response will appear here."
-    )
-
-    # 3. Text-to-Speech Output: This component will automatically read the text from 'audio_output'
-    tts_output = gr.Audio(
-        label="Assistant Response (Audio)",
-        autoplay=True
-    )
+    full_prompt = f"### Human: {text_prompt}\n### Assistant:"
+
+    try:
+        output = llm(
+            prompt=full_prompt,
+            max_tokens=MAX_NEW_TOKENS,
+            temperature=TEMPERATURE,
+            stop=["### Human:"],
+            echo=False
+        )
+
+        response_text = output['choices'][0]['text'].strip()
+        # Return the text. It will update the Textbox AND the Audio component.
+        return response_text, response_text
+    except Exception as e:
+        return f"LLM Generation Error: {e}", None
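A quick smoke test for the new inference path, run outside Gradio (the prompt here is just an example, and it assumes load_llm() succeeded):

# Hypothetical local check; both returned values are the same string,
# the second being destined for the Audio output wired up below.
text, speech_text = generate_and_speak("In one sentence, what is a GGUF file?")
print(text)

Note that the "### Human:" template is a generic convention; Llama 3.2 Instruct GGUFs embed their own chat template, which llama-cpp-python can apply automatically via llm.create_chat_completion(messages=[...]), likely a safer default for this model.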
-#
+# --- Gradio Interface (TTS Flow) ---
 with gr.Blocks(title=f"Audio Chat with {MODEL_FILE}") as demo:
-    gr.Markdown(f"##
-    gr.Markdown("
+    gr.Markdown(f"## 🗣️ LLM Chat with Text-to-Speech (TTS)")
+    gr.Markdown("Type your query (Text Input) and the LLM will reply in both text and auto-generated audio (TTS).")

-    # Row for Input and Output
-    with gr.Row():
-        # Column for Input (Audio Recording + STT)
-        with gr.Column(scale=1):
-            audio_recorder = gr.Audio(
-                sources=["microphone"],
-                type="filepath",
-                label="1. Record Your Query"
-            )
-            # Placeholder for Transcription (Whisper STT is often run on the recorded file)
-            transcribed_text = gr.Textbox(
-                label="2. Transcribed Text",
-                placeholder="Transcription appears here (Simulated or by an STT model)"
-            )
-            # The Button triggers the generation
-            generate_button = gr.Button("3. Generate Response")
-
-        # Column for Output (Generation + TTS)
-        with gr.Column(scale=2):
-            text_response = gr.Textbox(
-                label="LLM Text Response",
-                lines=5
-            )
-            gr.Markdown("### Assistant Audio Response")
-            # The Audio component reads the text from text_response and speaks it.
-            audio_playback = gr.Audio(
-                label="",
-                autoplay=True,
-                # This ensures the audio is generated from the text_response
-                # and doesn't rely on a separate audio file path.
-                interactive=False
-            )
-
-    # --- Interaction Logic ---
-
-    # Step 1: When audio is recorded, we simulate transcription (or run an actual STT model here)
-    # For a working Gradio flow without an STT model, we need the user to type the text.
-    # Since we can't assume a separate STT model, we'll streamline the flow:
-
-    # Instead of a complex multi-step STT workflow, we use a simple text input
-    # that is *read* by the TTS component for the model's response.
-
-    # **Simpler Audio Flow (Text Input -> LLM -> TTS Output)**
-    # This is the most reliable way to demonstrate TTS without adding a separate STT model.
-    gr.Markdown("---")
-    gr.Markdown("### Simpler Flow: Text Input to Audio Output (TTS)")
-
     with gr.Row():
         text_input = gr.Textbox(
-            label="
-            lines=
+            label="Your Query (Text Input)",
+            lines=2,
             scale=3
         )
         audio_btn = gr.Button("Generate and Speak")

-
-
+    # Outputs
+    text_output = gr.Textbox(label="LLM Response Text")
+    audio_output = gr.Audio(
+        label="Assistant Audio Playback (TTS)",
+        autoplay=True,
+        # Gradio automatically synthesizes the text output received by this Audio component
+        # into speech. We set it as an 'update' target.
+        interactive=False
+    )
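One caveat in this hunk: gr.Audio plays audio data (a file path, URL, or (sample_rate, waveform) tuple); it does not synthesize speech from a string by itself, so feeding response_text straight into audio_output will not produce sound. A hedged sketch of one way to close the gap; gTTS is an assumption here, not part of this commit, and any TTS library that writes an audio file would work:

# Hypothetical TTS bridge (pip install gTTS), not part of the commit.
import tempfile
from gtts import gTTS

def synthesize_speech(text):
    """Render text to an MP3 file and return its path for gr.Audio."""
    out_path = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
    gTTS(text).save(out_path)
    return out_path

def generate_and_speak_with_tts(text_prompt):
    text, _ = generate_and_speak(text_prompt)
    # (Textbox value, audio file path for the gr.Audio component)
    return text, synthesize_speech(text)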
-    # Set up the event listener
+    # Set up the event listener: Button click triggers the function.
     audio_btn.click(
         fn=generate_and_speak,
         inputs=[text_input],
-        outputs=[
-    )
-
-
-
-
-        inputs=[
-        outputs=[
+        outputs=[text_output, audio_output]
+    )
+
+    # Enable enter key to submit
+    text_input.submit(
+        fn=generate_and_speak,
+        inputs=[text_input],
+        outputs=[text_output, audio_output]
     )

 demo.launch()
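Since the button click and the Enter-key submit bind the same function, inputs, and outputs, newer Gradio releases (4.x) can register both triggers in one call; a possible consolidation, written inside the same gr.Blocks context:

# Sketch assuming Gradio 4.x; would replace the two identical bindings above.
gr.on(
    triggers=[audio_btn.click, text_input.submit],
    fn=generate_and_speak,
    inputs=[text_input],
    outputs=[text_output, audio_output],
)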