eaysu committed on
Commit
92a5582
·
1 Parent(s): 03996b3

initial commit

Browse files
Files changed (8) hide show
  1. .gitignore +41 -0
  2. .gradio/certificate.pem +31 -0
  3. README.md +66 -5
  4. app.py +166 -0
  5. requirements.txt +10 -0
  6. speech_brain_text.py +17 -0
  7. voice_temp_1.wav +1 -0
  8. voice_temp_2.wav +1 -0
.gitignore ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ env/
8
+ venv/
9
+ ENV/
10
+ build/
11
+ develop-eggs/
12
+ dist/
13
+ downloads/
14
+ eggs/
15
+ .eggs/
16
+ lib/
17
+ lib64/
18
+ parts/
19
+ sdist/
20
+ var/
21
+ wheels/
22
+ *.egg-info/
23
+ .installed.cfg
24
+ *.egg
25
+
26
+ # Model cache
27
+ ecapa/
28
+
29
+ # Gradio
30
+ flagged/
31
+
32
+ # IDE
33
+ .vscode/
34
+ .idea/
35
+ *.swp
36
+ *.swo
37
+ *~
38
+
39
+ # OS
40
+ .DS_Store
41
+ Thumbs.db
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README.md CHANGED
@@ -1,12 +1,73 @@
1
  ---
2
  title: Voice Similarity Checker
3
- emoji:
4
- colorFrom: yellow
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 6.0.2
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: Voice Similarity Checker
3
+ emoji: 🎙️
4
+ colorFrom: purple
5
+ colorTo: violet
6
  sdk: gradio
7
+ sdk_version: 4.0.0
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
  ---
12
 
13
+ # 🎙️ Voice Similarity Checker
14
+
15
+ Compare two voice samples using advanced speaker recognition AI powered by SpeechBrain's ECAPA-TDNN model.
16
+
17
+ ## 🚀 Quick Start
18
+
19
+ ### Run Locally
20
+ ```bash
21
+ # Install dependencies
22
+ pip install -r requirements.txt
23
+
24
+ # Launch the Gradio interface
25
+ python app.py
26
+ ```
27
+
28
+ ### Run Original Script
29
+ ```bash
30
+ python speech_brain_text.py
31
+ ```
32
+
33
+ ## ✨ Features
34
+
35
+ - 🎤 **Compare Any Audio Files**: Upload two voice samples in any format (WAV, MP3, FLAC, etc.)
36
+ - 📊 **Similarity Score**: Get a precise similarity score and speaker verification
37
+ - ⚡ **Performance Metrics**: View detailed metrics including:
38
+ - Elapsed processing time
39
+ - Memory usage statistics
40
+ - CPU utilization
41
+ - 🎨 **Modern UI**: Beautiful, responsive Gradio interface with gradient themes
42
+
43
+ ## 📖 How to Use
44
+
45
+ 1. Upload your first audio file in the "Voice Sample 1" section
46
+ 2. Upload your second audio file in the "Voice Sample 2" section
47
+ 3. Click the "Compare Voices" button
48
+ 4. View the similarity results and performance metrics
49
+
50
+ ## 🤖 Model Information
51
+
52
+ This application uses the **SpeechBrain ECAPA-TDNN** model:
53
+ - Model: `speechbrain/spkrec-ecapa-voxceleb`
54
+ - Architecture: ECAPA-TDNN
55
+ - Training Data: VoxCeleb corpus
56
+
57
+ ## 📦 Project Structure
58
+
59
+ - `app.py` - Modern Gradio web interface
60
+ - `speech_brain_text.py` - Original CLI script
61
+ - `requirements.txt` - Python dependencies
62
+ - `README.md` - This file
63
+
64
+ ## 🌐 Deploy to Hugging Face Spaces
65
+
66
+ This app is ready to deploy to Hugging Face Spaces. Simply:
67
+ 1. Create a new Space on Hugging Face
68
+ 2. Upload all files from this repository
69
+ 3. The app will automatically launch!
70
+
71
+ ## 📄 License
72
+
73
+ MIT License
app.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from speechbrain.inference.speaker import SpeakerRecognition
3
+ import time
4
+ import psutil
5
+ import os
6
+ import tracemalloc
7
+ import tempfile
8
+ import shutil
9
+
10
# Load the pretrained speaker-verification model a single time, at import,
# so compare_voices() can reuse the same module-level instance on every call.
print("Loading SpeechBrain model...")
model = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="ecapa")
print("Model loaded successfully!")
17
+
18
def format_bytes(bytes_value):
    """Convert a byte count to a human-readable string.

    Args:
        bytes_value: Number of bytes. May be negative, e.g. a memory delta
            computed as "after - before" when memory was released.

    Returns:
        The value scaled by powers of 1024, formatted with two decimals and
        a unit from B to TB, e.g. ``"1.50 KB"`` or ``"-2.00 MB"``.
    """
    # Choose the unit from the magnitude. Comparing the signed value against
    # 1024 would match on the first iteration for every negative number and
    # report it in raw bytes regardless of size.
    sign = "-" if bytes_value < 0 else ""
    magnitude = abs(bytes_value)
    for unit in ("B", "KB", "MB", "GB"):
        if magnitude < 1024.0:
            return f"{sign}{magnitude:.2f} {unit}"
        magnitude /= 1024.0
    return f"{sign}{magnitude:.2f} TB"
25
+
26
def compare_voices(audio1, audio2):
    """
    Compare two voice files and return similarity score with metrics.

    Args:
        audio1: Filesystem path of the first audio sample, or None when
            nothing was uploaded.
        audio2: Filesystem path of the second audio sample, or None when
            nothing was uploaded.

    Returns:
        A 9-tuple of strings, in this order: similarity score, same-speaker
        flag ("Yes"/"No"), interpretation, name of file 1, name of file 2,
        elapsed time, resident-memory delta, peak traced memory, CPU usage.
        When an input is missing or any step fails, the first element holds
        the message and the remaining eight are empty strings.
    """
    blanks = ("",) * 8
    if audio1 is None or audio2 is None:
        return ("Please upload both audio files",) + blanks

    temp_paths = []
    tracing = False

    try:
        # Get original filenames for display
        original_name1 = os.path.basename(audio1)
        original_name2 = os.path.basename(audio2)

        # Copy each input to a uniquely named temp file that keeps the
        # original extension. Fixed names in the shared temp directory would
        # let overlapping requests overwrite or delete each other's files.
        for index, source in enumerate((audio1, audio2), start=1):
            handle, temp_path = tempfile.mkstemp(
                prefix=f"voice_temp_{index}_",
                suffix=os.path.splitext(source)[1],
            )
            os.close(handle)
            temp_paths.append(temp_path)
            shutil.copy2(source, temp_path)
        temp_file1, temp_file2 = temp_paths

        # Start tracking metrics
        process = psutil.Process(os.getpid())
        # The first non-blocking cpu_percent() call only sets the baseline
        # and returns 0.0; the call after the comparison reports real usage.
        process.cpu_percent(interval=None)
        tracemalloc.start()
        tracing = True
        mem_before = process.memory_info().rss
        start_time = time.perf_counter()

        # Perform voice comparison using temp files
        score, prediction = model.verify_files(temp_file1, temp_file2)

        # Calculate metrics
        elapsed_time = time.perf_counter() - start_time
        cpu_percent = process.cpu_percent(interval=None)
        _, peak = tracemalloc.get_traced_memory()
        mem_used = process.memory_info().rss - mem_before

        # Format results
        similarity_score = score.item()
        is_same_speaker = "Yes" if prediction.item() else "No"

        # Determine interpretation based on score
        # The model uses 0.25 as threshold for same/different speaker decision
        if similarity_score > 0.25:
            interpretation = f"✅ Same Speaker (Score above threshold: {similarity_score:.4f} > 0.25)"
        else:
            interpretation = f"❌ Different Speakers (Score below threshold: {similarity_score:.4f} ≤ 0.25)"

        # Return individual values
        return (
            f"{similarity_score:.4f}",
            is_same_speaker,
            interpretation,
            original_name1,
            original_name2,
            f"{elapsed_time:.3f} seconds",
            format_bytes(mem_used),
            format_bytes(peak),
            f"{cpu_percent:.1f}%",
        )

    except Exception as e:
        # UI boundary: report the failure in the first output field instead
        # of raising into the Gradio event handler.
        return (f"Error: {e}",) + blanks

    finally:
        # Stop tracing even when the comparison raised, so tracemalloc does
        # not stay enabled for the rest of the process lifetime.
        if tracing:
            tracemalloc.stop()
        # Clean up temporary files (best effort; a file that is already gone
        # or cannot be removed must not mask the real result).
        for temp_path in temp_paths:
            try:
                os.remove(temp_path)
            except OSError:
                pass
110
+
111
# Build the Gradio UI: two audio inputs, a compare button, a short guide to
# reading the score, and read-only text boxes for results and metrics.
def _result_box(label):
    """Create a read-only text box used to display one output value."""
    return gr.Textbox(label=label, interactive=False)


with gr.Blocks() as demo:
    gr.Markdown("# Voice Similarity Checker")
    gr.Markdown("Compare two voice samples using SpeechBrain ECAPA-TDNN speaker recognition")

    # Inputs are passed to the handler as file paths (type="filepath").
    with gr.Row():
        audio1 = gr.Audio(label="Voice Sample 1", type="filepath")
        audio2 = gr.Audio(label="Voice Sample 2", type="filepath")

    compare_btn = gr.Button("Compare Voices", variant="primary")

    gr.Markdown("""
    ## Score Interpretation Guide
    The model uses **cosine similarity** with a threshold of **0.25**:
    - **Score > 0.25**: ✅ **Same Speaker** (voices match)
    - **Score ≤ 0.25**: ❌ **Different Speakers** (voices don't match)

    *Higher scores indicate greater similarity. Scores range from -1 to 1, but typically fall between 0 and 1 for voice comparisons.*
    """)

    gr.Markdown("## Results")

    with gr.Row():
        # Left column: the comparison outcome.
        with gr.Column():
            gr.Markdown("### Similarity Analysis")
            similarity_score = _result_box("Similarity Score")
            same_speaker = _result_box("Same Speaker (Model Prediction)")
            interpretation = _result_box("Interpretation")
            file1_name = _result_box("Audio File 1")
            file2_name = _result_box("Audio File 2")

        # Right column: resource usage of the comparison.
        with gr.Column():
            gr.Markdown("### Performance Metrics")
            elapsed_time = _result_box("Elapsed Time")
            memory_used = _result_box("Memory Used")
            peak_memory = _result_box("Peak Memory")
            cpu_usage = _result_box("CPU Usage")

    # The output order must match the tuple returned by compare_voices().
    result_boxes = [
        similarity_score,
        same_speaker,
        interpretation,
        file1_name,
        file2_name,
        elapsed_time,
        memory_used,
        peak_memory,
        cpu_usage,
    ]
    compare_btn.click(fn=compare_voices, inputs=[audio1, audio2], outputs=result_boxes)
158
+
159
# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    launch_options = {
        "share": True,  # ask Gradio for a public share link
        "server_name": "0.0.0.0",  # listen on all network interfaces
        "server_port": 7860,
        "show_error": True,
    }
    demo.launch(**launch_options)
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ speechbrain==1.0.0
3
+ torch==2.1.0
4
+ torchaudio==2.1.0
5
+ numpy<2.0.0
6
+ huggingface-hub<1.0.0
7
+ psutil>=5.9.0
8
+ soundfile>=0.12.1
9
+ librosa>=0.10.0
10
+ requests
speech_brain_text.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Command-line speaker verification with the SpeechBrain ECAPA-TDNN model.

Usage:
    python speech_brain_text.py [FILE1 FILE2]

With no arguments the two built-in default paths are compared, which is the
script's original behaviour. Passing two paths compares those files instead,
so the script is usable on machines where the default paths do not exist.
"""
import os
import sys

from speechbrain.inference.speaker import SpeakerRecognition

# Default inputs, used when no paths are given on the command line.
DEFAULT_FILE1 = "/Users/enesaysu/Desktop/voices/enes_angry.wav"
DEFAULT_FILE2 = "/Users/enesaysu/Desktop/voices/enes_sad.wav"

# Resolve the inputs before loading the model so a usage error fails fast.
cli_args = sys.argv[1:]
if not cli_args:
    file1, file2 = DEFAULT_FILE1, DEFAULT_FILE2
elif len(cli_args) == 2:
    file1, file2 = cli_args
else:
    sys.exit(f"Usage: python {os.path.basename(sys.argv[0])} [FILE1 FILE2]")

model = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="ecapa"
)

# verify_files returns the similarity score and the same-speaker decision.
score, prediction = model.verify_files(file1, file2)

print(f"\nVoice File 1: {os.path.basename(file1)}")
print(f"Voice File 2: {os.path.basename(file2)}")
print(f"\nSimilarity Score: {score.item():.4f}")
print(f"Same Speaker: {'Yes' if prediction.item() else 'No'}")
voice_temp_1.wav ADDED
@@ -0,0 +1 @@
 
 
1
+ /var/folders/gt/qsgfz6rd2rz8ssq4jtdxsgdc0000gn/T/voice_temp_1.wav
voice_temp_2.wav ADDED
@@ -0,0 +1 @@
 
 
1
+ /var/folders/gt/qsgfz6rd2rz8ssq4jtdxsgdc0000gn/T/voice_temp_2.wav