Upload folder using huggingface_hub
Browse files- .gitattributes +2 -0
- README.md +82 -0
- notebooks/01_data_preprocessing.ipynb +65 -0
- requirements.txt +13 -0
- src/models/hard_speech_nuriddin.mp3 +3 -0
- src/models/rubaistt.py +48 -0
- src/models/sarahai.py +35 -0
- src/models/some_audio_max_30_sec.wav +3 -0
- src/train.py +89 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
src/models/hard_speech_nuriddin.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
src/models/some_audio_max_30_sec.wav filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# NLP Lab Project
|
| 2 |
+
|
| 3 |
+
This is a Natural Language Processing (NLP) project with a structured codebase for data preprocessing, model training, and experimentation.
|
| 4 |
+
|
| 5 |
+
## Project Structure
|
| 6 |
+
|
| 7 |
+
```
|
| 8 |
+
nlp/
|
| 9 |
+
├── data/
|
| 10 |
+
│ ├── raw/ # Raw, unprocessed datasets
|
| 11 |
+
│ └── processed/ # Cleaned and preprocessed data
|
| 12 |
+
├── notebooks/
|
| 13 |
+
│ └── 01_data_preprocessing.ipynb # Jupyter notebook for data exploration and preprocessing
|
| 14 |
+
├── src/
|
| 15 |
+
│ ├── models/ # Model definitions and architectures
|
| 16 |
+
│ ├── preprocessing/ # Data preprocessing utilities
|
| 17 |
+
│ └── train.py # Main training script
|
| 18 |
+
├── requirements.txt # Python dependencies
|
| 19 |
+
└── README.md # This file
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
## Setup
|
| 23 |
+
|
| 24 |
+
1. **Create a virtual environment:**
|
| 25 |
+
```bash
|
| 26 |
+
python -m venv nlp-env
|
| 27 |
+
source nlp-env/bin/activate # On Windows: nlp-env\Scripts\activate
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
2. **Install dependencies:**
|
| 31 |
+
```bash
|
| 32 |
+
pip install -r requirements.txt
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
3. **Download NLTK data (if using NLTK):**
|
| 36 |
+
```python
|
| 37 |
+
import nltk
|
| 38 |
+
nltk.download('punkt')
|
| 39 |
+
nltk.download('stopwords')
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
## Usage
|
| 43 |
+
|
| 44 |
+
### Data Preprocessing
|
| 45 |
+
1. Place your raw data files in the `data/raw/` directory
|
| 46 |
+
2. Use the Jupyter notebook `notebooks/01_data_preprocessing.ipynb` for initial data exploration and preprocessing
|
| 47 |
+
3. Save processed data to `data/processed/` directory
|
| 48 |
+
|
| 49 |
+
### Model Training
|
| 50 |
+
Run the training script with default parameters:
|
| 51 |
+
```bash
|
| 52 |
+
python src/train.py
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
Or with custom parameters:
|
| 56 |
+
```bash
|
| 57 |
+
python src/train.py --epochs 20 --lr 0.0001 --batch_size 64
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
## Directory Descriptions
|
| 61 |
+
|
| 62 |
+
- **`data/raw/`**: Store your original, unmodified datasets here
|
| 63 |
+
- **`data/processed/`**: Store cleaned and preprocessed data ready for training
|
| 64 |
+
- **`notebooks/`**: Jupyter notebooks for data exploration, visualization, and experimentation
|
| 65 |
+
- **`src/models/`**: Python modules containing model definitions (e.g., neural network architectures)
|
| 66 |
+
- **`src/preprocessing/`**: Utility functions for data cleaning, tokenization, and feature extraction
|
| 67 |
+
- **`src/train.py`**: Main training script with command-line interface
|
| 68 |
+
|
| 69 |
+
## Getting Started
|
| 70 |
+
|
| 71 |
+
1. Add your dataset to `data/raw/`
|
| 72 |
+
2. Open `notebooks/01_data_preprocessing.ipynb` to explore and preprocess your data
|
| 73 |
+
3. Implement your model in `src/models/`
|
| 74 |
+
4. Create preprocessing utilities in `src/preprocessing/`
|
| 75 |
+
5. Run training with `python src/train.py`
|
| 76 |
+
|
| 77 |
+
## Contributing
|
| 78 |
+
|
| 79 |
+
1. Follow PEP 8 style guidelines
|
| 80 |
+
2. Add docstrings to all functions and classes
|
| 81 |
+
3. Write unit tests for your code
|
| 82 |
+
4. Update this README when adding new features
|
notebooks/01_data_preprocessing.ipynb
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "0327aa17",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"# Data Preprocessing for NLP Project\n",
|
| 9 |
+
"\n",
|
| 10 |
+
"This notebook contains the data preprocessing steps for our NLP project."
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"cell_type": "code",
|
| 15 |
+
"execution_count": null,
|
| 16 |
+
"id": "a4c43313",
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"outputs": [],
|
| 19 |
+
"source": [
|
| 20 |
+
"# Import necessary libraries\n",
|
| 21 |
+
"import pandas as pd\n",
|
| 22 |
+
"import numpy as np\n",
|
| 23 |
+
"import nltk\n",
|
| 24 |
+
"from sklearn.model_selection import train_test_split\n",
|
| 25 |
+
"import re\n",
|
| 26 |
+
"import string"
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"cell_type": "markdown",
|
| 31 |
+
"id": "1d4c2a2b",
|
| 32 |
+
"metadata": {},
|
| 33 |
+
"source": [
|
| 34 |
+
"## Load Raw Data\n",
|
| 35 |
+
"\n",
|
| 36 |
+
"Load the raw data from the data/raw directory."
|
| 37 |
+
]
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"cell_type": "code",
|
| 41 |
+
"execution_count": null,
|
| 42 |
+
"id": "9b2784d5",
|
| 43 |
+
"metadata": {},
|
| 44 |
+
"outputs": [],
|
| 45 |
+
"source": [
|
| 46 |
+
"# Load your raw data here\n",
|
| 47 |
+
"df = pd.read_csv('../data/raw/your_dataset.csv')\n",
|
| 48 |
+
"print(df.head())"
|
| 49 |
+
]
|
| 50 |
+
}
|
| 51 |
+
],
|
| 52 |
+
"metadata": {
|
| 53 |
+
"kernelspec": {
|
| 54 |
+
"display_name": "base",
|
| 55 |
+
"language": "python",
|
| 56 |
+
"name": "python3"
|
| 57 |
+
},
|
| 58 |
+
"language_info": {
|
| 59 |
+
"name": "python",
|
| 60 |
+
"version": "3.13.5"
|
| 61 |
+
}
|
| 62 |
+
},
|
| 63 |
+
"nbformat": 4,
|
| 64 |
+
"nbformat_minor": 5
|
| 65 |
+
}
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch>=1.9.0
|
| 2 |
+
transformers>=4.20.0
|
| 3 |
+
pandas>=1.3.0
|
| 4 |
+
numpy>=1.21.0
|
| 5 |
+
scikit-learn>=1.0.0
|
| 6 |
+
nltk>=3.7
|
| 7 |
+
matplotlib>=3.5.0
|
| 8 |
+
seaborn>=0.11.0
|
| 9 |
+
jupyter>=1.0.0
|
| 10 |
+
datasets>=2.0.0
|
| 11 |
+
tokenizers>=0.12.0
|
| 12 |
+
tqdm>=4.60.0
|
| 13 |
+
wandb>=0.12.0
|
src/models/hard_speech_nuriddin.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:400cc80a8121ce40fb971e1bd31cbb267fb1ba8c724ce12e24800081935fd457
|
| 3 |
+
size 723112
|
src/models/rubaistt.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torchaudio
|
| 3 |
+
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
| 4 |
+
|
| 5 |
+
# Always use CPU (safer for low-memory GPUs)
device = torch.device("cpu")

# Clear any leftover CUDA cache
# NOTE: this is a harmless no-op when CUDA was never initialized in this process.
torch.cuda.empty_cache()

# Load model and processor (using smaller model recommended)
# Both are fetched from the Hugging Face hub on first use and cached locally.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
|
| 14 |
+
|
| 15 |
+
def transcribe_audio(audio_path):
    """Transcribe an audio file to Uzbek text with Whisper (CPU inference).

    Args:
        audio_path: Path to an audio file readable by torchaudio.

    Returns:
        The transcribed text, stripped of leading/trailing whitespace.
    """
    # Load and preprocess audio (reads module-level `model`/`processor`;
    # no `global` needed since they are never reassigned here).
    waveform, sample_rate = torchaudio.load(audio_path)
    if sample_rate != 16000:
        # Whisper's feature extractor expects 16 kHz input.
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)

    # Convert stereo to mono if needed
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Process audio into log-mel input features.
    # BUG FIX: `language` is not a feature-extractor argument, so passing it
    # here had no effect on transcription; the language hint belongs to
    # `generate()` below.
    input_features = processor(
        waveform.squeeze().numpy(),
        sampling_rate=16000,
        return_tensors="pt",
    ).input_features.to(device)

    # Generate transcription (CPU inference), forcing Uzbek transcription
    # instead of letting Whisper auto-detect the language (or translate).
    with torch.no_grad():
        predicted_ids = model.generate(input_features, language="uz", task="transcribe")

    # Decode token ids back to text
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription.strip()
|
| 42 |
+
|
| 43 |
+
# Example usage
if __name__ == "__main__":
    # Path is resolved relative to the current working directory.
    audio_file = "some_audio_max_30_sec.wav"
    print("Transcribing on CPU, please wait...")
    text = transcribe_audio(audio_file)
    print(f"Transcription:\n{text}")
|
src/models/sarahai.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers.models.wav2vec2 import Wav2Vec2Processor, Wav2Vec2ForCTC # pip install transformers
|
| 2 |
+
import torch
|
| 3 |
+
import torchaudio
|
| 4 |
+
|
| 5 |
+
# Hugging Face model id for an Uzbek speech-to-text (Wav2Vec2 CTC) checkpoint.
model_name = "sarahai/uzbek-stt-3"
model = Wav2Vec2ForCTC.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)

# Run on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
|
| 11 |
+
|
| 12 |
+
def load_and_preprocess_audio(file_path):
    """Load an audio file and return a mono 16 kHz waveform as a NumPy array.

    Args:
        file_path: Path to an audio file readable by torchaudio.

    Returns:
        1-D float NumPy array of samples at 16 kHz, as expected by Wav2Vec2.
    """
    speech_array, sampling_rate = torchaudio.load(file_path)
    if sampling_rate != 16000:
        # Wav2Vec2 models are trained on 16 kHz audio.
        resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
        speech_array = resampler(speech_array)
    # BUG FIX: multi-channel input previously passed through as a 2-D array
    # after squeeze(); downmix to mono so the processor always gets a 1-D
    # signal (matches the handling in rubaistt.py).
    if speech_array.shape[0] > 1:
        speech_array = speech_array.mean(dim=0, keepdim=True)
    return speech_array.squeeze().numpy()
|
| 18 |
+
|
| 19 |
+
def replace_unk(transcription):
    """Map the tokenizer's literal "[UNK]" marker to the Uzbek modifier apostrophe."""
    cleaned = transcription.replace("[UNK]", "ʼ")
    return cleaned
|
| 21 |
+
|
| 22 |
+
# Colab-style input path; adjust when running locally.
audio_file = "/content/audio_2024-08-13_15-20-53.ogg"
speech_array = load_and_preprocess_audio(audio_file)

# Normalize the raw waveform into model input tensors on the chosen device.
input_values = processor(speech_array, sampling_rate=16000, return_tensors="pt").input_values.to(device)

# CTC inference: no gradients needed.
with torch.no_grad():
    logits = model(input_values).logits

# Greedy decoding: take the highest-scoring token at each frame, then
# collapse repeats/blanks via the processor's batch decode.
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)

# Restore apostrophes the vocabulary rendered as "[UNK]".
transcription_text = replace_unk(transcription[0])

print("Transcription:", transcription_text)
|
src/models/some_audio_max_30_sec.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:919ca611f6dd17dca72567c6caed6b5c2b0282e5645220e2e306aeb4d2eaea93
|
| 3 |
+
size 3177662
|
src/train.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Training script for NLP models.
|
| 3 |
+
|
| 4 |
+
This module contains the main training loop and model training functions.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import torch.nn as nn
|
| 9 |
+
import torch.optim as optim
|
| 10 |
+
from torch.utils.data import DataLoader
|
| 11 |
+
import argparse
|
| 12 |
+
import logging
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
|
| 15 |
+
# Import your custom modules here
|
| 16 |
+
# from models.model import YourModel
|
| 17 |
+
# from preprocessing.data_loader import YourDataLoader
|
| 18 |
+
|
| 19 |
+
logging.basicConfig(level=logging.INFO)
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def train_model(model, train_loader, val_loader, epochs=10, lr=0.001):
    """
    Train the NLP model.

    Args:
        model: The neural network model
        train_loader: Training data loader
        val_loader: Validation data loader
        epochs: Number of training epochs
        lr: Learning rate
    """
    # Same logger object as the module-level one (getLogger is a registry).
    log = logging.getLogger(__name__)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0

        for batch_idx, (data, target) in enumerate(train_loader):
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            if batch_idx % 100 == 0:
                log.info(f'Epoch: {epoch}, Batch: {batch_idx}, Loss: {loss.item():.4f}')

        # BUG FIX: train_loss was accumulated but never reported; log the
        # per-batch average so training progress is actually visible.
        avg_train_loss = train_loss / max(len(train_loader), 1)

        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0

        with torch.no_grad():
            for data, target in val_loader:
                output = model(data)
                val_loss += criterion(output, target).item()
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()

        val_accuracy = correct / len(val_loader.dataset)
        log.info(f'Epoch {epoch}: Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def main():
    """Main training function."""
    # Declarative CLI spec: (flag, type, default, help text).
    cli_options = [
        ('--epochs', int, 10, 'Number of epochs'),
        ('--lr', float, 0.001, 'Learning rate'),
        ('--batch_size', int, 32, 'Batch size'),
    ]
    parser = argparse.ArgumentParser(description='Train NLP Model')
    for flag, kind, default, text in cli_options:
        parser.add_argument(flag, type=kind, default=default, help=text)

    args = parser.parse_args()

    # Initialize your model, data loaders here
    # model = YourModel()
    # train_loader = YourDataLoader(batch_size=args.batch_size, split='train')
    # val_loader = YourDataLoader(batch_size=args.batch_size, split='val')

    logger.info("Starting training...")
    # train_model(model, train_loader, val_loader, args.epochs, args.lr)
    logger.info("Training completed!")
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
if __name__ == "__main__":
    # Script entry point: `python src/train.py [--epochs N --lr F --batch_size N]`.
    main()
|