Spaces:

akshatOP
/

nuera

Runtime error

App Files Files Community

akshatOP committed on Feb 28, 2025

Commit

2f38e4a

1 Parent(s): b76224d

Initial upload of TTS, SST, and LLM models with API

Browse files

Files changed (8) hide show

README.md +41 -0
app.py +74 -0
download_and_finetune_sst.py +0 -0
download_and_finetune_tts.py +0 -0
models/sst_model/download_and_finetune_sst.py +48 -0
models/tts_model/download_and_finetune_tts.py +44 -0
nuera/models/llama.gguf +0 -0
requirements.txt +9 -0

README.md CHANGED Viewed

@@ -10,3 +10,44 @@ pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# My AI Models Space
+This Hugging Face Space hosts text-to-speech (TTS), speech-to-text (SST), and large language model (LLM) models, each exposed through an API endpoint.
+## Setup
+1. **Clone the repository** to your Hugging Face Space.
+2. **Install dependencies**: `pip install -r requirements.txt`.
+3. **Prepare models**:
+   - **TTS**: Run `download_and_finetune_tts.py` externally, then upload `./tts_finetuned` to `models/tts_model`. If not uploaded, uses `parler-tts/parler-tts-mini-v1`.
+   - **SST**: Run `download_and_finetune_sst.py` externally, then upload `./sst_finetuned` to `models/sst_model`. If not uploaded, uses `facebook/wav2vec2-base-960h`.
+   - **LLM**: Download a Llama GGUF file (e.g., from `TheBloke/Llama-2-7B-GGUF` on Hugging Face Hub) and upload it as `models/llama.gguf`. This file is required: `app.py` raises an error at startup if it is missing.
+4. **Deploy**: Push to your Space, and it will run `app.py`.
+## API Endpoints
+- **POST /tts**
+  - **Request**: `{"text": "Your text here"}`
+  - **Response**: Audio file (WAV)
+  - **Example**: `curl -X POST -H "Content-Type: application/json" -d '{"text":"Hello"}' http://your-space.hf.space/tts --output output.wav`
+- **POST /sst**
+  - **Request**: Audio file upload
+  - **Response**: `{"text": "transcribed text"}`
+  - **Example**: `curl -X POST -F "file=@audio.wav" http://your-space.hf.space/sst`
+- **POST /llm**
+  - **Request**: `{"prompt": "Your prompt here"}`
+  - **Response**: `{"text": "generated text"}`
+  - **Example**: `curl -X POST -H "Content-Type: application/json" -d '{"prompt":"Tell me a story"}' http://your-space.hf.space/llm`
+## Fine-Tuning
+- **TTS**: Edit `download_and_finetune_tts.py` with your dataset, run externally, and upload the result.
+- **SST**: Edit `download_and_finetune_sst.py` with your dataset, run externally, and upload the result.
+- **LLM**: Llama.cpp is used for inference only. To fine-tune, train the model externally (e.g., with LoRA using Transformers), convert the result to GGUF, and upload it.
+## Notes
+- Ensure the GGUF file for the LLM is small enough for your Space's memory and storage (e.g., use a quantized version such as `llama-2-7b.Q4_K_M.gguf`).
+- Fine-tuning requires significant resources; perform it outside Spaces.

app.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import io
+import os
+
+import soundfile as sf
+import torch
+from fastapi import FastAPI, File, HTTPException, Response, UploadFile
+from llama_cpp import Llama
+from pydantic import BaseModel
+from transformers import ParlerTTSForConditionalGeneration, AutoTokenizer
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+app = FastAPI()
+# Load models
+# TTS: Use local fine-tuned model if available, else load from Hub
+if os.path.exists("./models/tts_model"):
+    tts_model = ParlerTTSForConditionalGeneration.from_pretrained("./models/tts_model")
+    tts_tokenizer = AutoTokenizer.from_pretrained("./models/tts_model")
+else:
+    tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-v1")
+    tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
+# SST: Use local fine-tuned model if available, else load from Hub
+if os.path.exists("./models/sst_model"):
+    sst_model = Wav2Vec2ForCTC.from_pretrained("./models/sst_model")
+    sst_processor = Wav2Vec2Processor.from_pretrained("./models/sst_model")
+else:
+    sst_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+    sst_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
+# LLM: Use local GGUF file if available, else raise error (must be uploaded)
+if os.path.exists("./models/llama.gguf"):
+    llm = Llama("./models/llama.gguf")
+else:
+    raise FileNotFoundError("Please upload llama.gguf to models/ directory")
+# Request models: pydantic schemas for the JSON bodies of the POST endpoints.
+class TTSRequest(BaseModel):
+    """JSON body accepted by ``POST /tts``."""
+    # Text that the /tts endpoint reads and converts to speech.
+    text: str
+class LLMRequest(BaseModel):
+    """JSON body accepted by ``POST /llm``."""
+    # Prompt that the /llm endpoint passes to the language model.
+    prompt: str
+# API Endpoints
+@app.post("/tts")
+async def tts_endpoint(request: TTSRequest):
+    """Convert text to speech and return audio."""
+    text = request.text
+    inputs = tts_tokenizer(text, return_tensors="pt")
+    with torch.no_grad():
+        audio = tts_model.generate(**inputs)
+    audio = audio.squeeze().cpu().numpy()
+    buffer = io.BytesIO()
+    sf.write(buffer, audio, 22050, format="WAV")
+    buffer.seek(0)
+    return Response(content=buffer.getvalue(), media_type="audio/wav")
+@app.post("/sst")
+async def sst_endpoint(file: UploadFile = File(...)):
+    """Convert speech to text and return transcription."""
+    audio_bytes = await file.read()
+    audio, sr = sf.read(io.BytesIO(audio_bytes))
+    inputs = sst_processor(audio, sampling_rate=sr, return_tensors="pt")
+    with torch.no_grad():
+        logits = sst_model(inputs.input_values).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    transcription = sst_processor.batch_decode(predicted_ids)[0]
+    return {"text": transcription}
+@app.post("/llm")
+async def llm_endpoint(request: LLMRequest):
+    """Generate text from a prompt using Llama.cpp."""
+    prompt = request.prompt
+    output = llm(prompt, max_tokens=50)
+    return {"text": output["choices"][0]["text"]}

download_and_finetune_sst.py ADDED Viewed

File without changes

download_and_finetune_tts.py ADDED Viewed

File without changes

models/sst_model/download_and_finetune_sst.py ADDED Viewed

	@@ -0,0 +1,48 @@

+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments
+from datasets import load_dataset
+# Download model
+model_name = "facebook/wav2vec2-base-960h"
+model = Wav2Vec2ForCTC.from_pretrained(model_name)
+processor = Wav2Vec2Processor.from_pretrained(model_name)
+# Load dataset (replace with your dataset)
+dataset = load_dataset("librispeech_asr", "clean", split="train.100")  # Example dataset
+# Preprocess function
+def preprocess_function(examples):
+    audio = examples["audio"]
+    inputs = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt", padding=True)
+    with processor.as_target_processor():
+        labels = processor(examples["text"], return_tensors="pt", padding=True)
+    return {
+        "input_values": inputs["input_values"][0],
+        "labels": labels["input_ids"][0]
+    }
+train_dataset = dataset.map(preprocess_function, remove_columns=dataset.column_names)
+# Training arguments
+training_args = TrainingArguments(
+    output_dir="./sst_finetuned",
+    per_device_train_batch_size=8,
+    num_train_epochs=3,
+    save_steps=500,
+    logging_steps=10,
+)
+# Initialize Trainer
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+)
+# Fine-tune
+trainer.train()
+# Save fine-tuned model
+trainer.save_model("./sst_finetuned")
+processor.save_pretrained("./sst_finetuned")
+print("SST model fine-tuned and saved to './sst_finetuned'. Upload to models/sst_model in your Space.")

models/tts_model/download_and_finetune_tts.py ADDED Viewed

	@@ -0,0 +1,44 @@

+from transformers import ParlerTTSForConditionalGeneration, AutoTokenizer, Trainer, TrainingArguments
+from datasets import load_dataset
+# Download the base checkpoint and its tokenizer from the Hub.
+model_name = "parler-tts/parler-tts-mini-v1"
+model = ParlerTTSForConditionalGeneration.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+# Load dataset (replace with your dataset)
+dataset = load_dataset("lj_speech")  # Example dataset; adjust as needed
+# Preprocess function (customize based on your dataset)
+def preprocess_function(examples):
+    """Tokenize the ``text`` column of a batch of dataset rows.
+
+    Returns a dict holding only ``input_ids`` and ``attention_mask``.
+    """
+    # Only the text is tokenized here; no audio is read or encoded.
+    inputs = tokenizer(examples["text"], return_tensors="pt", padding=True, truncation=True)
+    # NOTE(review): the returned features carry no audio targets or labels,
+    # so presumably Trainer has nothing to compute a loss from -- add the
+    # audio processing your dataset needs and confirm before running.
+    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"]}
+# Apply the preprocessing to the "train" split in batches.
+train_dataset = dataset["train"].map(preprocess_function, batched=True)
+# Training arguments: checkpoints are written under ./tts_finetuned every
+# 500 steps and metrics are logged every 10 steps.
+training_args = TrainingArguments(
+    output_dir="./tts_finetuned",
+    per_device_train_batch_size=8,
+    num_train_epochs=3,
+    save_steps=500,
+    logging_steps=10,
+)
+# Initialize Trainer with the default data collator.
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+)
+# Fine-tune
+trainer.train()
+# Save the fine-tuned model and its tokenizer into the same directory.
+trainer.save_model("./tts_finetuned")
+tokenizer.save_pretrained("./tts_finetuned")
+print("TTS model fine-tuned and saved to './tts_finetuned'. Upload to models/tts_model in your Space.")

nuera/models/llama.gguf ADDED Viewed

File without changes

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+fastapi
+uvicorn
+transformers
+torch
+soundfile
+numpy
+llama-cpp-python
+pydantic
+datasets