Upload folder using huggingface_hub
Browse files- .gitattributes +2 -0
- README.md +82 -0
- notebooks/01_data_preprocessing.ipynb +65 -0
- requirements.txt +13 -0
- src/models/hard_speech_nuriddin.mp3 +3 -0
- src/models/rubaistt.py +48 -0
- src/models/sarahai.py +35 -0
- src/models/some_audio_max_30_sec.wav +3 -0
- src/train.py +89 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
src/models/hard_speech_nuriddin.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
src/models/some_audio_max_30_sec.wav filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# NLP Lab Project
|
| 2 |
+
|
| 3 |
+
This is a Natural Language Processing (NLP) project with a structured codebase for data preprocessing, model training, and experimentation.
|
| 4 |
+
|
| 5 |
+
## Project Structure
|
| 6 |
+
|
| 7 |
+
```
|
| 8 |
+
nlp/
|
| 9 |
+
├── data/
|
| 10 |
+
│ ├── raw/ # Raw, unprocessed datasets
|
| 11 |
+
│ └── processed/ # Cleaned and preprocessed data
|
| 12 |
+
├── notebooks/
|
| 13 |
+
│ └── 01_data_preprocessing.ipynb # Jupyter notebook for data exploration and preprocessing
|
| 14 |
+
├── src/
|
| 15 |
+
│ ├── models/ # Model definitions and architectures
|
| 16 |
+
│ ├── preprocessing/ # Data preprocessing utilities
|
| 17 |
+
│ └── train.py # Main training script
|
| 18 |
+
├── requirements.txt # Python dependencies
|
| 19 |
+
└── README.md # This file
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
## Setup
|
| 23 |
+
|
| 24 |
+
1. **Create a virtual environment:**
|
| 25 |
+
```bash
|
| 26 |
+
python -m venv nlp-env
|
| 27 |
+
source nlp-env/bin/activate # On Windows: nlp-env\Scripts\activate
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
2. **Install dependencies:**
|
| 31 |
+
```bash
|
| 32 |
+
pip install -r requirements.txt
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
3. **Download NLTK data (if using NLTK):**
|
| 36 |
+
```python
|
| 37 |
+
import nltk
|
| 38 |
+
nltk.download('punkt')
|
| 39 |
+
nltk.download('stopwords')
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
## Usage
|
| 43 |
+
|
| 44 |
+
### Data Preprocessing
|
| 45 |
+
1. Place your raw data files in the `data/raw/` directory
|
| 46 |
+
2. Use the Jupyter notebook `notebooks/01_data_preprocessing.ipynb` for initial data exploration and preprocessing
|
| 47 |
+
3. Save processed data to `data/processed/` directory
|
| 48 |
+
|
| 49 |
+
### Model Training
|
| 50 |
+
Run the training script with default parameters:
|
| 51 |
+
```bash
|
| 52 |
+
python src/train.py
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
Or with custom parameters:
|
| 56 |
+
```bash
|
| 57 |
+
python src/train.py --epochs 20 --lr 0.0001 --batch_size 64
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
## Directory Descriptions
|
| 61 |
+
|
| 62 |
+
- **`data/raw/`**: Store your original, unmodified datasets here
|
| 63 |
+
- **`data/processed/`**: Store cleaned and preprocessed data ready for training
|
| 64 |
+
- **`notebooks/`**: Jupyter notebooks for data exploration, visualization, and experimentation
|
| 65 |
+
- **`src/models/`**: Python modules containing model definitions (e.g., neural network architectures)
|
| 66 |
+
- **`src/preprocessing/`**: Utility functions for data cleaning, tokenization, and feature extraction
|
| 67 |
+
- **`src/train.py`**: Main training script with command-line interface
|
| 68 |
+
|
| 69 |
+
## Getting Started
|
| 70 |
+
|
| 71 |
+
1. Add your dataset to `data/raw/`
|
| 72 |
+
2. Open `notebooks/01_data_preprocessing.ipynb` to explore and preprocess your data
|
| 73 |
+
3. Implement your model in `src/models/`
|
| 74 |
+
4. Create preprocessing utilities in `src/preprocessing/`
|
| 75 |
+
5. Run training with `python src/train.py`
|
| 76 |
+
|
| 77 |
+
## Contributing
|
| 78 |
+
|
| 79 |
+
1. Follow PEP 8 style guidelines
|
| 80 |
+
2. Add docstrings to all functions and classes
|
| 81 |
+
3. Write unit tests for your code
|
| 82 |
+
4. Update this README when adding new features
|
notebooks/01_data_preprocessing.ipynb
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "0327aa17",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"# Data Preprocessing for NLP Project\n",
|
| 9 |
+
"\n",
|
| 10 |
+
"This notebook contains the data preprocessing steps for our NLP project."
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"cell_type": "code",
|
| 15 |
+
"execution_count": null,
|
| 16 |
+
"id": "a4c43313",
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"outputs": [],
|
| 19 |
+
"source": [
|
| 20 |
+
"# Import necessary libraries\n",
|
| 21 |
+
"import pandas as pd\n",
|
| 22 |
+
"import numpy as np\n",
|
| 23 |
+
"import nltk\n",
|
| 24 |
+
"from sklearn.model_selection import train_test_split\n",
|
| 25 |
+
"import re\n",
|
| 26 |
+
"import string"
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"cell_type": "markdown",
|
| 31 |
+
"id": "1d4c2a2b",
|
| 32 |
+
"metadata": {},
|
| 33 |
+
"source": [
|
| 34 |
+
"## Load Raw Data\n",
|
| 35 |
+
"\n",
|
| 36 |
+
"Load the raw data from the data/raw directory."
|
| 37 |
+
]
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"cell_type": "code",
|
| 41 |
+
"execution_count": null,
|
| 42 |
+
"id": "9b2784d5",
|
| 43 |
+
"metadata": {},
|
| 44 |
+
"outputs": [],
|
| 45 |
+
"source": [
|
| 46 |
+
"# Load your raw data here\n",
|
| 47 |
+
"df = pd.read_csv('../data/raw/your_dataset.csv')\n",
|
| 48 |
+
"print(df.head())"
|
| 49 |
+
]
|
| 50 |
+
}
|
| 51 |
+
],
|
| 52 |
+
"metadata": {
|
| 53 |
+
"kernelspec": {
|
| 54 |
+
"display_name": "base",
|
| 55 |
+
"language": "python",
|
| 56 |
+
"name": "python3"
|
| 57 |
+
},
|
| 58 |
+
"language_info": {
|
| 59 |
+
"name": "python",
|
| 60 |
+
"version": "3.13.5"
|
| 61 |
+
}
|
| 62 |
+
},
|
| 63 |
+
"nbformat": 4,
|
| 64 |
+
"nbformat_minor": 5
|
| 65 |
+
}
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch>=1.9.0
|
| 2 |
+
transformers>=4.20.0
|
| 3 |
+
pandas>=1.3.0
|
| 4 |
+
numpy>=1.21.0
|
| 5 |
+
scikit-learn>=1.0.0
|
| 6 |
+
nltk>=3.7
|
| 7 |
+
matplotlib>=3.5.0
|
| 8 |
+
seaborn>=0.11.0
|
| 9 |
+
jupyter>=1.0.0
|
| 10 |
+
datasets>=2.0.0
|
| 11 |
+
tokenizers>=0.12.0
|
| 12 |
+
tqdm>=4.60.0
|
| 13 |
+
wandb>=0.12.0
|
src/models/hard_speech_nuriddin.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:400cc80a8121ce40fb971e1bd31cbb267fb1ba8c724ce12e24800081935fd457
|
| 3 |
+
size 723112
|
src/models/rubaistt.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torchaudio
|
| 3 |
+
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
| 4 |
+
|
| 5 |
+
# Always use CPU (safer for low-memory GPUs)
device = torch.device("cpu")

# Clear any leftover CUDA cache
# NOTE: this is a harmless no-op when CUDA was never initialized in this process.
torch.cuda.empty_cache()

# Load model and processor (using smaller model recommended)
# Both are fetched from the Hugging Face hub on first use and cached locally.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
|
| 14 |
+
|
| 15 |
+
def transcribe_audio(audio_path):
    """Transcribe an audio file to Uzbek text with Whisper (CPU inference).

    Args:
        audio_path: Path to an audio file readable by torchaudio.

    Returns:
        The transcribed text, stripped of leading/trailing whitespace.
    """
    # Load and preprocess audio (reads module-level `model`/`processor`;
    # no `global` needed since they are never reassigned here).
    waveform, sample_rate = torchaudio.load(audio_path)
    if sample_rate != 16000:
        # Whisper's feature extractor expects 16 kHz input.
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)

    # Convert stereo to mono if needed
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Process audio into log-mel input features.
    # BUG FIX: `language` is not a feature-extractor argument, so passing it
    # here had no effect on transcription; the language hint belongs to
    # `generate()` below.
    input_features = processor(
        waveform.squeeze().numpy(),
        sampling_rate=16000,
        return_tensors="pt",
    ).input_features.to(device)

    # Generate transcription (CPU inference), forcing Uzbek transcription
    # instead of letting Whisper auto-detect the language (or translate).
    with torch.no_grad():
        predicted_ids = model.generate(input_features, language="uz", task="transcribe")

    # Decode token ids back to text
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription.strip()
|
| 42 |
+
|
| 43 |
+
# Example usage
if __name__ == "__main__":
    # Path is resolved relative to the current working directory.
    audio_file = "some_audio_max_30_sec.wav"
    print("Transcribing on CPU, please wait...")
    text = transcribe_audio(audio_file)
    print(f"Transcription:\n{text}")
|
src/models/sarahai.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers.models.wav2vec2 import Wav2Vec2Processor, Wav2Vec2ForCTC # pip install transformers
|
| 2 |
+
import torch
|
| 3 |
+
import torchaudio
|
| 4 |
+
|
| 5 |
+
# Hugging Face model id for an Uzbek speech-to-text (Wav2Vec2 CTC) checkpoint.
model_name = "sarahai/uzbek-stt-3"
model = Wav2Vec2ForCTC.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)

# Run on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
|
| 11 |
+
|
| 12 |
+
def load_and_preprocess_audio(file_path):
    """Load an audio file and return a mono 16 kHz waveform as a NumPy array.

    Args:
        file_path: Path to an audio file readable by torchaudio.

    Returns:
        1-D float NumPy array of samples at 16 kHz, as expected by Wav2Vec2.
    """
    speech_array, sampling_rate = torchaudio.load(file_path)
    if sampling_rate != 16000:
        # Wav2Vec2 models are trained on 16 kHz audio.
        resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
        speech_array = resampler(speech_array)
    # BUG FIX: multi-channel input previously passed through as a 2-D array
    # after squeeze(); downmix to mono so the processor always gets a 1-D
    # signal (matches the handling in rubaistt.py).
    if speech_array.shape[0] > 1:
        speech_array = speech_array.mean(dim=0, keepdim=True)
    return speech_array.squeeze().numpy()
|
| 18 |
+
|
| 19 |
+
def replace_unk(transcription):
    """Map the tokenizer's literal "[UNK]" marker to the Uzbek modifier apostrophe."""
    cleaned = transcription.replace("[UNK]", "ʼ")
    return cleaned
|
| 21 |
+
|
| 22 |
+
# Colab-style input path; adjust when running locally.
audio_file = "/content/audio_2024-08-13_15-20-53.ogg"
speech_array = load_and_preprocess_audio(audio_file)

# Normalize the raw waveform into model input tensors on the chosen device.
input_values = processor(speech_array, sampling_rate=16000, return_tensors="pt").input_values.to(device)

# CTC inference: no gradients needed.
with torch.no_grad():
    logits = model(input_values).logits

# Greedy decoding: take the highest-scoring token at each frame, then
# collapse repeats/blanks via the processor's batch decode.
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)

# Restore apostrophes the vocabulary rendered as "[UNK]".
transcription_text = replace_unk(transcription[0])

print("Transcription:", transcription_text)
|
src/models/some_audio_max_30_sec.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:919ca611f6dd17dca72567c6caed6b5c2b0282e5645220e2e306aeb4d2eaea93
|
| 3 |
+
size 3177662
|
src/train.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Training script for NLP models.
|
| 3 |
+
|
| 4 |
+
This module contains the main training loop and model training functions.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import torch.nn as nn
|
| 9 |
+
import torch.optim as optim
|
| 10 |
+
from torch.utils.data import DataLoader
|
| 11 |
+
import argparse
|
| 12 |
+
import logging
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
|
| 15 |
+
# Import your custom modules here
|
| 16 |
+
# from models.model import YourModel
|
| 17 |
+
# from preprocessing.data_loader import YourDataLoader
|
| 18 |
+
|
| 19 |
+
logging.basicConfig(level=logging.INFO)
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def train_model(model, train_loader, val_loader, epochs=10, lr=0.001):
    """
    Train the NLP model.

    Args:
        model: The neural network model
        train_loader: Training data loader
        val_loader: Validation data loader
        epochs: Number of training epochs
        lr: Learning rate
    """
    # Same logger object as the module-level one (getLogger is a registry).
    log = logging.getLogger(__name__)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0

        for batch_idx, (data, target) in enumerate(train_loader):
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            if batch_idx % 100 == 0:
                log.info(f'Epoch: {epoch}, Batch: {batch_idx}, Loss: {loss.item():.4f}')

        # BUG FIX: train_loss was accumulated but never reported; log the
        # per-batch average so training progress is actually visible.
        avg_train_loss = train_loss / max(len(train_loader), 1)

        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0

        with torch.no_grad():
            for data, target in val_loader:
                output = model(data)
                val_loss += criterion(output, target).item()
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()

        val_accuracy = correct / len(val_loader.dataset)
        log.info(f'Epoch {epoch}: Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def main():
    """Main training function."""
    # Declarative CLI spec: (flag, type, default, help text).
    cli_options = [
        ('--epochs', int, 10, 'Number of epochs'),
        ('--lr', float, 0.001, 'Learning rate'),
        ('--batch_size', int, 32, 'Batch size'),
    ]
    parser = argparse.ArgumentParser(description='Train NLP Model')
    for flag, kind, default, text in cli_options:
        parser.add_argument(flag, type=kind, default=default, help=text)

    args = parser.parse_args()

    # Initialize your model, data loaders here
    # model = YourModel()
    # train_loader = YourDataLoader(batch_size=args.batch_size, split='train')
    # val_loader = YourDataLoader(batch_size=args.batch_size, split='val')

    logger.info("Starting training...")
    # train_model(model, train_loader, val_loader, args.epochs, args.lr)
    logger.info("Training completed!")
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
if __name__ == "__main__":
    # Script entry point: `python src/train.py [--epochs N --lr F --batch_size N]`.
    main()
|