cyberspyde committed on
Commit
c00bf70
·
1 Parent(s): f213e6f
Files changed (4) hide show
  1. .gitignore +46 -0
  2. README.md +82 -3
  3. app.py +134 -4
  4. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual environments
24
+ venv/
25
+ env/
26
+ ENV/
27
+ .venv
28
+
29
+ # IDE
30
+ .vscode/
31
+ .idea/
32
+ *.swp
33
+ *.swo
34
+ *~
35
+
36
+ # OS
37
+ .DS_Store
38
+ Thumbs.db
39
+
40
+ # Gradio
41
+ gradio_cached_examples/
42
+ flagged/
43
+
44
+ # Model cache
45
+ .cache/
46
+ models/
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
- title: Whisper
3
- emoji: 😻
4
  colorFrom: gray
5
  colorTo: indigo
6
  sdk: gradio
@@ -10,4 +10,83 @@ pinned: false
10
  license: apache-2.0
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Whisper Uzbek STT
3
+ emoji: 🎙️
4
  colorFrom: gray
5
  colorTo: indigo
6
  sdk: gradio
 
10
  license: apache-2.0
11
  ---
12
 
13
+ # 🎙️ Whisper Uzbek Speech-to-Text
14
+
15
+ This Hugging Face Space provides automatic speech recognition (ASR) for the Uzbek language using the Whisper model.
16
+
17
+ ## 🚀 Features
18
+
19
+ - **Uzbek Language Support**: Optimized for Uzbek speech recognition
20
+ - **Easy to Use**: Simple interface for recording or uploading audio
21
+ - **Real-time Progress**: Visual feedback during transcription
22
+ - **CPU-Optimized**: Runs efficiently on CPU infrastructure
23
+ - **Comprehensive Logging**: Full logging system for monitoring and debugging
24
+
25
+ ## 🛠️ Technical Details
26
+
27
+ - **Model**: `jmshd/whisper-uz`
28
+ - **Framework**: Gradio 6.1.0
29
+ - **Backend**: PyTorch + Transformers
30
+ - **Processing**: CPU-only (HF Spaces)
31
+
32
+ ## 📝 Usage
33
+
34
+ 1. **Record Audio**: Click the microphone icon to record directly in your browser
35
+ 2. **Upload Audio**: Or upload an existing audio file
36
+ 3. **Transcribe**: Click the "Transcribe" button to convert speech to text
37
+ 4. **View Results**: The transcribed text will appear in the output box
38
+
39
+ ## 🔧 Local Development
40
+
41
+ To run this application locally:
42
+
43
+ ```bash
44
+ # Clone the repository
45
+ git clone <your-repo-url>
46
+ cd whisper
47
+
48
+ # Install dependencies
49
+ pip install -r requirements.txt
50
+
51
+ # Run the application
52
+ python app.py
53
+ ```
54
+
55
+ The application will be available at `http://localhost:7860`
56
+
57
+ ## 📦 Requirements
58
+
59
+ - Python 3.8+
60
+ - gradio==6.1.0
61
+ - transformers>=4.30.0
62
+ - torch>=2.0.0
63
+ - torchaudio>=2.0.0
64
+ - accelerate>=0.20.0
65
+ - huggingface_hub>=0.16.0
66
+
67
+ ## 📊 Logging
68
+
69
+ The application includes comprehensive logging:
70
+ - Environment information (PyTorch version, CUDA availability)
71
+ - Model loading status
72
+ - Audio processing details
73
+ - Transcription results and errors
74
+
75
+ Logs can be viewed in the Hugging Face Spaces logs tab.
76
+
77
+ ## 🤝 Contributing
78
+
79
+ Contributions are welcome! Feel free to:
80
+ - Report bugs
81
+ - Suggest features
82
+ - Submit pull requests
83
+
84
+ ## 📄 License
85
+
86
+ This project is licensed under the Apache 2.0 License.
87
+
88
+ ## 🔗 Resources
89
+
90
+ - [Hugging Face Spaces Documentation](https://huggingface.co/docs/hub/spaces-config-reference)
91
+ - [Gradio Documentation](https://gradio.app/docs)
92
+ - [Whisper Model Card](https://huggingface.co/jmshd/whisper-uz)
app.py CHANGED
@@ -1,7 +1,137 @@
1
  import gradio as gr
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
 
 
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import logging
import os
from datetime import datetime
from huggingface_hub import HfApi

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Hugging Face Hub id of the fine-tuned Uzbek Whisper checkpoint.
MODEL_NAME = "jmshd/whisper-uz"

# Log environment info. Lazy %-style arguments keep formatting out of the
# logging call itself (the originals were f-strings, one with no fields).
logger.info("Starting Whisper Uzbek STT application")
logger.info("PyTorch version: %s", torch.__version__)
logger.info("CUDA available: %s", torch.cuda.is_available())
logger.info("Model: %s", MODEL_NAME)

# Load model and processor once at import time so every request reuses them.
try:
    logger.info("Loading processor...")
    processor = WhisperProcessor.from_pretrained(MODEL_NAME)
    logger.info("Loading model...")
    model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
    model.eval()  # inference-only service: disable dropout/training behavior
    logger.info("Model and processor loaded successfully")
except Exception:
    # Fail fast: the app is useless without the model. logger.exception
    # records the full traceback in the Spaces logs, not just the message.
    logger.exception("Error loading model")
    raise
34
+
35
def transcribe(audio, progress=gr.Progress()):
    """
    Transcribe audio to text using the Whisper model.

    Args:
        audio: Audio input from Gradio as a (sample_rate, audio_data) tuple;
            audio_data is a numpy array that may be integer PCM (e.g. int16)
            and/or multi-channel, at an arbitrary device sample rate.
        progress: Gradio progress tracker for UI feedback.

    Returns:
        str: Transcribed text, or a user-facing warning/error message.
    """
    # Local import keeps this block self-contained; numpy is already an
    # indirect dependency of gradio and torch.
    import numpy as np

    try:
        if audio is None:
            logger.warning("No audio input provided")
            return "⚠️ No audio provided. Please upload or record audio."

        progress(0.1, desc="Processing audio...")
        sample_rate, audio_data = audio

        logger.info("Processing audio - Sample rate: %s, Shape: %s",
                    sample_rate, audio_data.shape)

        if audio_data.size == 0:
            logger.warning("Empty audio input")
            return "⚠️ The recording is empty. Please try again."

        # Whisper's feature extractor expects mono float32 at 16 kHz, but
        # Gradio delivers raw device audio (often int16, sometimes stereo,
        # at an arbitrary rate). Normalize before feature extraction —
        # feeding raw int16 / wrong-rate audio produces garbage or errors.
        if np.issubdtype(audio_data.dtype, np.integer):
            # Scale integer PCM into [-1.0, 1.0].
            audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
        else:
            audio_data = audio_data.astype(np.float32)
        if audio_data.ndim > 1:
            # Downmix multi-channel audio to mono.
            audio_data = audio_data.mean(axis=1)

        target_sr = processor.feature_extractor.sampling_rate  # 16000 for Whisper
        if sample_rate != target_sr:
            # Linear-interpolation resample; adequate for speech and avoids
            # pulling in an extra resampling dependency.
            src_len = audio_data.shape[0]
            dst_len = int(round(src_len * target_sr / sample_rate))
            src_t = np.linspace(0.0, 1.0, num=src_len, endpoint=False)
            dst_t = np.linspace(0.0, 1.0, num=dst_len, endpoint=False)
            audio_data = np.interp(dst_t, src_t, audio_data).astype(np.float32)
            sample_rate = target_sr

        progress(0.3, desc="Preparing input features...")
        inputs = processor(
            audio_data,
            sampling_rate=sample_rate,
            return_tensors="pt"
        )

        progress(0.5, desc="Generating transcription...")
        with torch.no_grad():  # inference only: skip autograd bookkeeping
            predicted_ids = model.generate(inputs.input_features)

        progress(0.8, desc="Decoding text...")
        text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        progress(1.0, desc="Complete!")
        logger.info("Transcription successful - Length: %d characters", len(text))

        return text

    except Exception as e:
        error_msg = f"❌ Error during transcription: {str(e)}"
        # logger.exception also captures the traceback for the Spaces logs.
        logger.exception(error_msg)
        return error_msg
79
+
80
+ # Enhanced Gradio interface
81
# UI definition. `iface` is the Blocks app launched by the __main__ guard.
with gr.Blocks(theme=gr.themes.Soft()) as iface:
    gr.Markdown(
        """
        # 🎙️ Whisper Uzbek Speech-to-Text

        Transcribe Uzbek audio to text using the Whisper model. This application runs on CPU and supports Uzbek language.

        **Model:** `jmshd/whisper-uz`
        """
    )

    with gr.Row():
        # Left column: audio capture plus the action buttons.
        with gr.Column():
            audio_in = gr.Audio(
                label="Upload or Record Audio",
                type="numpy",
                sources=["microphone", "upload"],
            )
            run_btn = gr.Button("🎯 Transcribe", variant="primary")
            reset_btn = gr.ClearButton([audio_in])

        # Right column: transcription output.
        with gr.Column():
            transcript_box = gr.Textbox(
                label="Transcription",
                placeholder="Your transcribed text will appear here...",
                lines=10,
            )

    gr.Markdown(
        """
        ### 📝 Usage Instructions:
        1. Click the microphone icon to record audio or upload an audio file
        2. Click the "Transcribe" button to convert speech to text
        3. The transcribed text will appear in the output box

        ### ℹ️ Information:
        - Supported language: Uzbek
        - Processing: CPU-only (may be slower than GPU)
        - Model size: Small
        """
    )

    # Wire the button to the transcription function.
    run_btn.click(fn=transcribe, inputs=audio_in, outputs=transcript_box)
128
+
129
+ # Launch configuration for Hugging Face Spaces
130
# Entry point when executed directly (Hugging Face Spaces runs app.py).
if __name__ == "__main__":
    logger.info("Launching Gradio interface...")
    launch_kwargs = {
        "share": False,            # Spaces provides the public URL itself
        "show_error": True,        # surface tracebacks in the UI
        "server_name": "0.0.0.0",  # bind all interfaces inside the container
        "server_port": 7860,       # standard HF Spaces port
    }
    iface.launch(**launch_kwargs)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio==6.1.0
2
+ transformers>=4.30.0
3
+ torch>=2.0.0
4
+ torchaudio>=2.0.0
5
+ accelerate>=0.20.0
6
+ huggingface_hub>=0.16.0