Spaces: Ryanus/GPA

Runtime error

App Files Community

Ryanus committed about 21 hours ago

Commit

d224b23

verified ·

1 Parent(s): 4a42dd9

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -239

app.py CHANGED Viewed

@@ -1,240 +1,25 @@
-import os
-import torch
-import torchaudio
 import gradio as gr
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-from diffusers import StableAudioPipeline
-import numpy as np
-from scipy.io import wavfile
-# --- IMPLEMENT MODEL LOADING AND FUNCTIONS ACCORDING TO GPA REPO ---
-# Global variable to store the model and processor after initial load
-model_instance = None
-processor_instance = None
-tts_pipeline_instance = None
-def load_gpa_model():
-    global model_instance, processor_instance, tts_pipeline_instance
-    # Use environment variables for cache directory if available (useful for Spaces)
-    cache_dir = os.getenv('HF_HOME', './hf_cache')
-    # --- ASR Model Loading ---
-    print("Loading ASR Model...")
-    asr_model_id = "AutoArk-AI/GPA-0.9B-preview-ASR"
-    try:
-        model_instance = AutoModelForSpeechSeq2Seq.from_pretrained(
-            asr_model_id,
-            torch_dtype=torch.float16, # Use float16 for efficiency if supported
-            low_cpu_mem_usage=True,
-            use_safetensors=True,
-            cache_dir=cache_dir
-        ).to("cuda" if torch.cuda.is_available() else "cpu") # Move to GPU if available
-        processor_instance = AutoProcessor.from_pretrained(asr_model_id, cache_dir=cache_dir)
-        print("ASR Model loaded successfully.")
-    except Exception as e:
-        print(f"Error loading ASR model: {e}")
-        raise gr.Error(f"Failed to load ASR model: {e}")
-    # --- TTS Pipeline Loading ---
-    print("Loading TTS Pipeline...")
-    tts_model_id = "AutoArk-AI/GPA-0.9B-preview-TTS"
-    try:
-        # The TTS model appears to be based on Stable Audio Open Repo
-        tts_pipeline_instance = StableAudioPipeline.from_pretrained(
-            tts_model_id,
-            torch_dtype=torch.float16,
-            cache_dir=cache_dir
-        ).to("cuda" if torch.cuda.is_available() else "cpu")
-        print("TTS Pipeline loaded successfully.")
-    except Exception as e:
-        print(f"Error loading TTS pipeline: {e}")
-        raise gr.Error(f"Failed to load TTS pipeline: {e}")
-    print("All models loaded successfully!")
-    return model_instance, processor_instance, tts_pipeline_instance
-def run_tts(text, pipe, device):
-    """Run TTS using the StableAudioPipeline."""
-    if not text.strip():
-        raise gr.Error("Text input cannot be empty.")
-    try:
-        # Generate audio using the pipeline
-        # The exact parameters might need fine-tuning based on the model's expected prompt format
-        output = pipe(
-            prompt=text,
-            negative_prompt="", # You might want to adjust this
-            num_inference_steps=100, # Adjust steps as needed
-            audio_end_size=1024 * 48000 // 32, # Example: ~10 seconds at 48kHz, adjust as needed
-            generator=torch.Generator().manual_seed(42), # For reproducibility
-        )
-        # Extract audio tensor
-        audio_tensor = output.audios[0] # Shape: [channels, time_steps]
-        # Convert to numpy array and then to the expected format for Gradio (float32 [-1, 1])
-        audio_np = audio_tensor.cpu().numpy()
-        # Ensure shape is (time_steps,) for mono or (time_steps, channels) for stereo
-        if audio_np.ndim > 1 and audio_np.shape[0] == 1:
-             audio_np = audio_np[0] # Flatten if it's (1, time_steps)
-        elif audio_np.ndim > 1 and audio_np.shape[0] == 2:
-             audio_np = audio_np.T # Transpose if it's (2, time_steps) -> (time_steps, 2)
-        # Normalize if values are outside [-1, 1] range (depends on model output scale)
-        if np.max(np.abs(audio_np)) > 1.0:
-            audio_np = audio_np / np.max(np.abs(audio_np))
-        # Create a temporary file to save the audio
-        temp_filename = "temp_tts_output.wav"
-        # Gradio expects int16 wav files for filepath mode, but accepts float32 for numpy arrays.
-        # Saving as int16 wav for compatibility.
-        scaled_audio = np.int16(audio_np * 32767)
-        wavfile.write(temp_filename, 48000, scaled_audio) # Assuming 48kHz sample rate
-        print(f"TTS completed, saved to {temp_filename}")
-        return temp_filename
-    except Exception as e:
-        print(f"TTS Error: {e}")
-        raise gr.Error(f"TTS generation failed: {e}")
-def run_asr(audio_path, model, processor, device):
-    """Run ASR using the Whisper-based model."""
-    if not audio_path:
-        raise gr.Error("Audio input is required for ASR.")
-    try:
-        # Load and preprocess audio
-        audio_input, sr = torchaudio.load(audio_path)
-        # Resample to 16kHz if needed (Whisper typically uses 16kHz)
-        if sr != 16000:
-             resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
-             audio_input = resampler(audio_input)
-        # Take the mean along the channel axis if stereo
-        audio_array = audio_input.mean(dim=0).numpy()
-        # Create the pipeline using the loaded model and processor
-        pipe = pipeline(
-            "automatic-speech-recognition",
-            model=model,
-            tokenizer=processor.tokenizer,
-            feature_extractor=processor.feature_extractor,
-            max_new_tokens=128,
-            chunk_length_s=15,
-            batch_size=16,
-            torch_dtype=torch.float16,
-            device=device,
-        )
-        # Perform transcription
-        result = pipe(audio_array)
-        print(f"ASR completed: {result['text']}")
-        return result["text"]
-    except Exception as e:
-        print(f"ASR Error: {e}")
-        raise gr.Error(f"ASR transcription failed: {e}")
-# Attempt to load the model when the app starts
-print("Starting model loading process...")
-try:
-    model_instance, processor_instance, tts_pipeline_instance = load_gpa_model()
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    print(f"Models loaded successfully on {device}.")
-except Exception as e:
-    print(f"Critical Error during startup: {e}")
-    model_instance = None
-    processor_instance = None
-    tts_pipeline_instance = None
-    device = None
-def tts_interface(text):
-    if tts_pipeline_instance is None:
-        raise gr.Error("TTS model not loaded. Cannot perform TTS.")
-    try:
-        output_path = run_tts(text, tts_pipeline_instance, device)
-        return output_path
-    except Exception as e:
-        print(f"TTS Interface Error: {e}")
-        raise gr.Error(f"TTS failed: {e}")
-def asr_interface(audio):
-    if model_instance is None or processor_instance is None:
-        raise gr.Error("ASR model not loaded. Cannot perform ASR.")
-    try:
-        transcription = run_asr(audio, model_instance, processor_instance, device)
-        return transcription
-    except Exception as e:
-        print(f"ASR Interface Error: {e}")
-        raise gr.Error(f"ASR failed: {e}")
-# VC is not explicitly detailed as a separate model in the latest info found, so it's omitted for now
-# If a specific VC model exists later, it can be added similarly.
-with gr.Blocks(title="GPA Model Demo") as demo:
-    gr.Markdown(
-        """
-        # GPA Model Demo (0.9B Preview)
-        Unified TTS and ASR powered by AutoArk-AI's GPA model.
-        """
-    )
-    with gr.Tab("Text-to-Speech (TTS)"):
-        with gr.Row():
-            with gr.Column():
-                text_input_tts = gr.Textbox(
-                    label="Input Text",
-                    placeholder="Enter text to convert to speech...",
-                    lines=5
-                )
-                tts_button = gr.Button("Generate Speech", variant="primary")
-            with gr.Column():
-                audio_output_tts = gr.Audio(
-                    label="Generated Audio",
-                    type="filepath"
-                )
-        tts_button.click(
-            fn=tts_interface,
-            inputs=text_input_tts,
-            outputs=audio_output_tts
-        )
-    with gr.Tab("Automatic Speech Recognition (ASR)"):
-        with gr.Row():
-            with gr.Column():
-                audio_input_asr = gr.Audio(
-                    label="Upload Audio File",
-                    type="filepath",
-                    sources=["upload"],
-                )
-                asr_button = gr.Button("Transcribe Speech", variant="primary")
-            with gr.Column():
-                text_output_asr = gr.Textbox(
-                    label="Transcribed Text",
-                    placeholder="Transcription will appear here...",
-                    interactive=False
-                )
-        asr_button.click(
-            fn=asr_interface,
-            inputs=audio_input_asr,
-            outputs=text_output_asr
-        )
-    gr.Markdown(
-        """
-        ---
-        *Powered by [AutoArk-AI/GPA](https://huggingface.co/AutoArk-AI/GPA). Deployed on Hugging Face Spaces.*
-        """
-    )
-if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))

 import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer
+# 加載模型和分詞器
+model_name = "AutoArk-AI/GPA"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")  # 如果使用 GPU
+def generate_text(input_text):
+    # 將輸入文本進行分詞並生成輸出
+    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")  # 如果使用 GPU
+    outputs = model.generate(**inputs, max_length=50)
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
+# 創建 Gradio 界面
+interface = gr.Interface(
+    fn=generate_text,
+    inputs=gr.Textbox(lines=5, placeholder="輸入你的文本..."),
+    outputs="text",
+    title="AutoArk-AI/GPA 模型演示",
+    description="輸入文本，模型將生成回覆。"
+)
+# 啟動界面
+interface.launch()