davronbekdev commited on
Commit
e077904
·
verified ·
1 Parent(s): b805a26

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ src/models/hard_speech_nuriddin.mp3 filter=lfs diff=lfs merge=lfs -text
37
+ src/models/some_audio_max_30_sec.wav filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # NLP Lab Project
2
+
3
+ This is a Natural Language Processing (NLP) project with a structured codebase for data preprocessing, model training, and experimentation.
4
+
5
+ ## Project Structure
6
+
7
+ ```
8
+ nlp/
9
+ ├── data/
10
+ │ ├── raw/ # Raw, unprocessed datasets
11
+ │ └── processed/ # Cleaned and preprocessed data
12
+ ├── notebooks/
13
+ │ └── 01_data_preprocessing.ipynb # Jupyter notebook for data exploration and preprocessing
14
+ ├── src/
15
+ │ ├── models/ # Model definitions and architectures
16
+ │ ├── preprocessing/ # Data preprocessing utilities
17
+ │ └── train.py # Main training script
18
+ ├── requirements.txt # Python dependencies
19
+ └── README.md # This file
20
+ ```
21
+
22
+ ## Setup
23
+
24
+ 1. **Create a virtual environment:**
25
+ ```bash
26
+ python -m venv nlp-env
27
+ source nlp-env/bin/activate # On Windows: nlp-env\Scripts\activate
28
+ ```
29
+
30
+ 2. **Install dependencies:**
31
+ ```bash
32
+ pip install -r requirements.txt
33
+ ```
34
+
35
+ 3. **Download NLTK data (if using NLTK):**
36
+ ```python
37
+ import nltk
38
+ nltk.download('punkt')
39
+ nltk.download('stopwords')
40
+ ```
41
+
42
+ ## Usage
43
+
44
+ ### Data Preprocessing
45
+ 1. Place your raw data files in the `data/raw/` directory
46
+ 2. Use the Jupyter notebook `notebooks/01_data_preprocessing.ipynb` for initial data exploration and preprocessing
47
+ 3. Save processed data to `data/processed/` directory
48
+
49
+ ### Model Training
50
+ Run the training script with default parameters:
51
+ ```bash
52
+ python src/train.py
53
+ ```
54
+
55
+ Or with custom parameters:
56
+ ```bash
57
+ python src/train.py --epochs 20 --lr 0.0001 --batch_size 64
58
+ ```
59
+
60
+ ## Directory Descriptions
61
+
62
+ - **`data/raw/`**: Store your original, unmodified datasets here
63
+ - **`data/processed/`**: Store cleaned and preprocessed data ready for training
64
+ - **`notebooks/`**: Jupyter notebooks for data exploration, visualization, and experimentation
65
+ - **`src/models/`**: Python modules containing model definitions (e.g., neural network architectures)
66
+ - **`src/preprocessing/`**: Utility functions for data cleaning, tokenization, and feature extraction
67
+ - **`src/train.py`**: Main training script with command-line interface
68
+
69
+ ## Getting Started
70
+
71
+ 1. Add your dataset to `data/raw/`
72
+ 2. Open `notebooks/01_data_preprocessing.ipynb` to explore and preprocess your data
73
+ 3. Implement your model in `src/models/`
74
+ 4. Create preprocessing utilities in `src/preprocessing/`
75
+ 5. Run training with `python src/train.py`
76
+
77
+ ## Contributing
78
+
79
+ 1. Follow PEP 8 style guidelines
80
+ 2. Add docstrings to all functions and classes
81
+ 3. Write unit tests for your code
82
+ 4. Update this README when adding new features
notebooks/01_data_preprocessing.ipynb ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "0327aa17",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Data Preprocessing for NLP Project\n",
9
+ "\n",
10
+ "This notebook contains the data preprocessing steps for our NLP project."
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "id": "a4c43313",
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "# Import necessary libraries\n",
21
+ "import pandas as pd\n",
22
+ "import numpy as np\n",
23
+ "import nltk\n",
24
+ "from sklearn.model_selection import train_test_split\n",
25
+ "import re\n",
26
+ "import string"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "markdown",
31
+ "id": "1d4c2a2b",
32
+ "metadata": {},
33
+ "source": [
34
+ "## Load Raw Data\n",
35
+ "\n",
36
+ "Load the raw data from the data/raw directory."
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": null,
42
+ "id": "9b2784d5",
43
+ "metadata": {},
44
+ "outputs": [],
45
+ "source": [
46
+ "# Load your raw data here\n",
47
+ "df = pd.read_csv('../data/raw/your_dataset.csv')\n",
48
+ "print(df.head())"
49
+ ]
50
+ }
51
+ ],
52
+ "metadata": {
53
+ "kernelspec": {
54
+ "display_name": "base",
55
+ "language": "python",
56
+ "name": "python3"
57
+ },
58
+ "language_info": {
59
+ "name": "python",
60
+ "version": "3.13.5"
61
+ }
62
+ },
63
+ "nbformat": 4,
64
+ "nbformat_minor": 5
65
+ }
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch>=1.9.0
2
+ transformers>=4.20.0
3
+ pandas>=1.3.0
4
+ numpy>=1.21.0
5
+ scikit-learn>=1.0.0
6
+ nltk>=3.7
7
+ matplotlib>=3.5.0
8
+ seaborn>=0.11.0
9
+ jupyter>=1.0.0
10
+ datasets>=2.0.0
11
+ tokenizers>=0.12.0
12
+ tqdm>=4.60.0
13
+ wandb>=0.12.0
src/models/hard_speech_nuriddin.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:400cc80a8121ce40fb971e1bd31cbb267fb1ba8c724ce12e24800081935fd457
3
+ size 723112
src/models/rubaistt.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Whisper-based speech-to-text for Uzbek audio, running on the CPU."""

import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Always use CPU (safer for low-memory GPUs)
device = torch.device("cpu")

# Clear any leftover CUDA cache (no-op when CUDA was never initialized)
torch.cuda.empty_cache()

# Load model and processor (using smaller model recommended)
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)


def transcribe_audio(audio_path, language="uz"):
    """Transcribe an audio file (up to ~30 s) with Whisper on the CPU.

    Args:
        audio_path: Path to an audio file readable by torchaudio.
        language: Language code for Whisper's decoder prompt
            (defaults to Uzbek, as in the original script).

    Returns:
        The transcription text with surrounding whitespace stripped.
    """
    # Load audio and resample to the 16 kHz rate Whisper expects.
    waveform, sample_rate = torchaudio.load(audio_path)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)

    # Convert stereo to mono if needed.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Extract log-mel input features. Bug fixed here: the original passed
    # language="uz" to the processor call, but `language` is a decoding
    # option, not a feature-extraction option — it belongs on generate().
    input_features = processor(
        waveform.squeeze().numpy(),
        sampling_rate=16000,
        return_tensors="pt",
    ).input_features.to(device)

    # Generate transcription (CPU inference), forcing language and task
    # so the model does not have to auto-detect them.
    with torch.no_grad():
        predicted_ids = model.generate(
            input_features, language=language, task="transcribe"
        )

    # Decode token ids back to text.
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription.strip()


# Example usage
if __name__ == "__main__":
    audio_file = "some_audio_max_30_sec.wav"
    print("Transcribing on CPU, please wait...")
    text = transcribe_audio(audio_file)
    print(f"Transcription:\n{text}")
src/models/sarahai.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Wav2Vec2-based Uzbek speech-to-text using the sarahai/uzbek-stt-3 model."""

from transformers.models.wav2vec2 import Wav2Vec2Processor, Wav2Vec2ForCTC  # pip install transformers
import torch
import torchaudio

# Default input kept from the original script (a Google Colab upload path).
DEFAULT_AUDIO_FILE = "/content/audio_2024-08-13_15-20-53.ogg"

model_name = "sarahai/uzbek-stt-3"
model = Wav2Vec2ForCTC.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)

# Run on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def load_and_preprocess_audio(file_path):
    """Load an audio file and return a 16 kHz float waveform as a numpy array."""
    speech_array, sampling_rate = torchaudio.load(file_path)
    if sampling_rate != 16000:
        # Wav2Vec2 checkpoints are trained on 16 kHz audio.
        resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
        speech_array = resampler(speech_array)
    return speech_array.squeeze().numpy()


def replace_unk(transcription):
    """Replace the tokenizer's [UNK] marker with the Uzbek modifier apostrophe."""
    return transcription.replace("[UNK]", "ʼ")


def main(audio_file=DEFAULT_AUDIO_FILE):
    """Transcribe *audio_file* with greedy CTC decoding and print the result."""
    speech_array = load_and_preprocess_audio(audio_file)

    input_values = processor(
        speech_array, sampling_rate=16000, return_tensors="pt"
    ).input_values.to(device)

    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: take the argmax token at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)

    transcription_text = replace_unk(transcription[0])
    print("Transcription:", transcription_text)


if __name__ == "__main__":
    # Defect fixed: the original ran the whole transcription at import time
    # against a hard-coded Colab path; it now only runs as a script.
    main()
src/models/some_audio_max_30_sec.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:919ca611f6dd17dca72567c6caed6b5c2b0282e5645220e2e306aeb4d2eaea93
3
+ size 3177662
src/train.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Training script for NLP models.
3
+
4
+ This module contains the main training loop and model training functions.
5
+ """
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.optim as optim
10
+ from torch.utils.data import DataLoader
11
+ import argparse
12
+ import logging
13
+ from pathlib import Path
14
+
15
+ # Import your custom modules here
16
+ # from models.model import YourModel
17
+ # from preprocessing.data_loader import YourDataLoader
18
+
19
+ logging.basicConfig(level=logging.INFO)
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
def train_model(model, train_loader, val_loader, epochs=10, lr=0.001):
    """
    Train the NLP model with Adam and cross-entropy loss.

    Args:
        model: The neural network model (must output class logits).
        train_loader: Training data loader yielding (data, target) batches.
        val_loader: Validation data loader yielding (data, target) batches.
        epochs: Number of training epochs.
        lr: Learning rate.
    """
    # Resolve the module logger locally so the function is self-contained.
    log = logging.getLogger(__name__)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0

        for batch_idx, (data, target) in enumerate(train_loader):
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            if batch_idx % 100 == 0:
                log.info(f'Epoch: {epoch}, Batch: {batch_idx}, Loss: {loss.item():.4f}')

        # Defect fixed: train_loss was accumulated but never reported.
        # max(..., 1) guards against an empty loader.
        avg_train_loss = train_loss / max(len(train_loader), 1)

        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0

        with torch.no_grad():
            for data, target in val_loader:
                output = model(data)
                val_loss += criterion(output, target).item()
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()

        # Defect fixed: the original logged the raw sum of per-batch losses,
        # which scales with the number of batches; report the average instead.
        avg_val_loss = val_loss / max(len(val_loader), 1)
        val_accuracy = correct / len(val_loader.dataset)
        log.info(
            f'Epoch {epoch}: Train Loss: {avg_train_loss:.4f}, '
            f'Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}'
        )
67
+
68
+
69
def main():
    """Parse command-line arguments and run the training pipeline."""
    arg_parser = argparse.ArgumentParser(description='Train NLP Model')
    # Table-driven registration keeps the CLI options in one place.
    for flag, kind, default, help_text in (
        ('--epochs', int, 10, 'Number of epochs'),
        ('--lr', float, 0.001, 'Learning rate'),
        ('--batch_size', int, 32, 'Batch size'),
    ):
        arg_parser.add_argument(flag, type=kind, default=default, help=help_text)

    args = arg_parser.parse_args()

    # Same logger object as the module-level one (identical __name__).
    log = logging.getLogger(__name__)

    # Initialize your model, data loaders here
    # model = YourModel()
    # train_loader = YourDataLoader(batch_size=args.batch_size, split='train')
    # val_loader = YourDataLoader(batch_size=args.batch_size, split='val')

    log.info("Starting training...")
    # train_model(model, train_loader, val_loader, args.epochs, args.lr)
    log.info("Training completed!")


if __name__ == "__main__":
    main()