Commit: ignore cache

Files changed:
- .gitignore (+3, -0)
- chunck_time.py (+0, -261)
.gitignore CHANGED

@@ -106,3 +106,6 @@ env/
 # Misc
 *.bak
 *.swp
+
+chunk_time.py
+analyze.txt
chunck_time.py DELETED

@@ -1,261 +0,0 @@

import os
import sys
import warnings
import time
import statistics
from collections import Counter

import torch
import torchaudio
from speechbrain.inference.classifiers import EncoderClassifier

from audio_extractor import extract_audio_from_video_url

warnings.filterwarnings("ignore")
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'
def create_chunks_by_size(waveform, sample_rate, chunk_length_sec):
    """Create chunks of specific size"""
    chunk_samples = chunk_length_sec * sample_rate
    total_samples = waveform.size(1)
    chunks = []

    for start in range(0, total_samples, chunk_samples):
        end = min(start + chunk_samples, total_samples)
        chunk = waveform[:, start:end]
        if chunk.size(1) > sample_rate * 2:  # minimum 2 seconds
            chunks.append(chunk)
    return chunks
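Review note (not part of the commit): a minimal smoke test for the deleted chunker, assuming create_chunks_by_size is available as defined above. A 65-second mono waveform at 16 kHz split into 30-second chunks yields two full chunks plus a 5-second remainder, which passes the 2-second minimum:

import torch

# Hypothetical check: 65 s of silence at 16 kHz, 30 s chunks.
waveform = torch.zeros(1, 65 * 16000)
chunks = create_chunks_by_size(waveform, 16000, 30)
assert len(chunks) == 3                 # 30 s + 30 s + 5 s remainder
assert chunks[0].size(1) == 30 * 16000  # full-size chunk
assert chunks[-1].size(1) == 5 * 16000  # remainder kept: longer than 2 s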
def predict_chunks_timing(chunks, classifier):
    """Time the prediction process for chunks"""
    if not chunks:
        return [], 0.0

    start_time = time.time()

    # Pad to same length
    max_len = max(chunk.size(1) for chunk in chunks)
    padded_chunks = [torch.nn.functional.pad(chunk, (0, max_len - chunk.size(1))) for chunk in chunks]
    batch = torch.cat(padded_chunks, dim=0).unsqueeze(1)
    batch = batch.squeeze(1)

    out_prob, score, index, text_lab = classifier.classify_batch(batch)

    end_time = time.time()
    prediction_time = end_time - start_time

    results = []
    for i in range(len(chunks)):
        results.append({
            "accent": text_lab[i],
            "confidence": score[i].item(),
        })

    return results, prediction_time
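Review note (not part of the commit): the helper zero-pads every chunk to the longest length and classifies the padded audio as-is, so the model hears the padding. A hedged alternative sketch, assuming SpeechBrain's classify_batch accepts relative lengths via its wav_lens argument:

import torch

def predict_chunks_masked(chunks, classifier):
    """Variant of predict_chunks_timing that passes relative lengths so the
    zero padding can be masked (assumes classify_batch(wavs, wav_lens))."""
    max_len = max(chunk.size(1) for chunk in chunks)
    batch = torch.cat(
        [torch.nn.functional.pad(chunk, (0, max_len - chunk.size(1))) for chunk in chunks],
        dim=0,
    )
    wav_lens = torch.tensor([chunk.size(1) / max_len for chunk in chunks])
    return classifier.classify_batch(batch, wav_lens)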
def analyze_chunk_size_performance(video_url, chunk_sizes=[10, 15, 20, 30, 60]):
    """Analyze performance for different chunk sizes"""
    print("🔍 Starting Chunk Size Performance Analysis")
    print("=" * 60)

    # Extract and prepare audio once
    print("🎵 Extracting and preparing audio...")
    audio_start = time.time()

    audio_path = extract_audio_from_video_url(video_url)
    waveform, sample_rate = torchaudio.load(audio_path)

    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
        sample_rate = 16000

    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # # Apply VAD
    # waveform = simple_vad(waveform, sample_rate)

    audio_end = time.time()
    audio_prep_time = audio_end - audio_start

    duration_minutes = waveform.size(1) / sample_rate / 60
    print(f"✅ Audio prepared in {audio_prep_time:.2f}s | Duration: {duration_minutes:.1f} minutes")

    # Load model once
    print("🧠 Loading model...")
    model_start = time.time()
    classifier = EncoderClassifier.from_hparams(source="Jzuluaga/accent-id-commonaccent_ecapa")
    model_end = time.time()
    model_load_time = model_end - model_start
    print(f"✅ Model loaded in {model_load_time:.2f}s")

    print("\n" + "=" * 60)
    print("📊 CHUNK SIZE ANALYSIS RESULTS")
    print("=" * 60)

    results = []

    for chunk_size in chunk_sizes:
        print(f"\n🧩 Testing {chunk_size}-second chunks...")

        # Create chunks
        chunk_start = time.time()
        chunks = create_chunks_by_size(waveform, sample_rate, chunk_size)
        chunk_end = time.time()
        chunking_time = chunk_end - chunk_start

        if not chunks:
            print(f"❌ No valid chunks created for {chunk_size}s size")
            continue

        # Predict
        predictions, prediction_time = predict_chunks_timing(chunks, classifier)

        # Calculate statistics
        confidences = [p["confidence"] for p in predictions]
        accents = [p["accent"] for p in predictions]

        avg_confidence = statistics.mean(confidences) if confidences else 0
        max_confidence = max(confidences) if confidences else 0
        min_confidence = min(confidences) if confidences else 0
        std_confidence = statistics.stdev(confidences) if len(confidences) > 1 else 0

        # Most common accent
        accent_counts = Counter(accents)
        most_common_accent = accent_counts.most_common(1)[0] if accent_counts else ("Unknown", 0)

        # Calculate processing rates
        total_processing_time = chunking_time + prediction_time
        chunks_per_second = len(chunks) / total_processing_time if total_processing_time > 0 else 0
        seconds_per_chunk = total_processing_time / len(chunks) if len(chunks) > 0 else 0

        result = {
            "chunk_size": chunk_size,
            "num_chunks": len(chunks),
            "chunking_time": chunking_time,
            "prediction_time": prediction_time,
            "total_time": total_processing_time,
            "avg_confidence": avg_confidence,
            "max_confidence": max_confidence,
            "min_confidence": min_confidence,
            "std_confidence": std_confidence,
            "most_common_accent": most_common_accent[0],
            "accent_occurrence": most_common_accent[1],
            "chunks_per_second": chunks_per_second,
            "seconds_per_chunk": seconds_per_chunk,
            "confidence_consistency": 1 - (std_confidence / avg_confidence) if avg_confidence > 0 else 0
        }

        results.append(result)

        # Print results for this chunk size
        print(f" 📦 Chunks created: {len(chunks)}")
        print(f" ⏱️ Chunking time: {chunking_time:.3f}s")
        print(f" 🧠 Prediction time: {prediction_time:.3f}s")
        print(f" 🔄 Total processing: {total_processing_time:.3f}s")
        print(f" ⚡ Processing rate: {chunks_per_second:.1f} chunks/sec")
        print(f" 📈 Avg confidence: {avg_confidence:.3f}")
        print(f" 🎯 Most common: {most_common_accent[0]} ({most_common_accent[1]} times)")
        print(f" 📊 Confidence range: {min_confidence:.3f} - {max_confidence:.3f}")

    # Print summary comparison
    print("\n" + "=" * 80)
    print("📈 PERFORMANCE COMPARISON SUMMARY")
    print("=" * 80)

    if results:
        print(f"{'Size':<6} {'Chunks':<8} {'Total Time':<12} {'Rate':<12} {'Avg Conf':<10} {'Consistency':<12} {'Winner'}")
        print("-" * 80)

        for r in results:
            consistency = f"{r['confidence_consistency']:.2f}"
            print(f"{r['chunk_size']:<6} {r['num_chunks']:<8} {r['total_time']:<12.3f} {r['chunks_per_second']:<12.1f} {r['avg_confidence']:<10.3f} {consistency:<12} {r['most_common_accent']}")

    # Recommendations
    print("\n" + "=" * 60)
    print("🏆 RECOMMENDATIONS")
    print("=" * 60)

    if results:
        # Find best for speed
        fastest = min(results, key=lambda x: x['total_time'])
        print(f"⚡ Fastest processing: {fastest['chunk_size']}s chunks ({fastest['total_time']:.2f}s total)")

        # Find best for accuracy (highest average confidence)
        most_accurate = max(results, key=lambda x: x['avg_confidence'])
        print(f"🎯 Highest accuracy: {most_accurate['chunk_size']}s chunks ({most_accurate['avg_confidence']:.3f} avg confidence)")

        # Find most consistent
        most_consistent = max(results, key=lambda x: x['confidence_consistency'])
        print(f"📊 Most consistent: {most_consistent['chunk_size']}s chunks ({most_consistent['confidence_consistency']:.3f} consistency)")

        # Find best balance (speed + accuracy)
        for r in results:
            r['balance_score'] = (r['chunks_per_second'] * 0.4) + (r['avg_confidence'] * 100 * 0.6)

        best_balance = max(results, key=lambda x: x['balance_score'])
        print(f"⚖️ Best balance: {best_balance['chunk_size']}s chunks (score: {best_balance['balance_score']:.1f})")

    return results
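Review note (not part of the commit): the balance score mixes throughput and confidence on different scales, chunks_per_second * 0.4 + avg_confidence * 100 * 0.6, so confidence dominates unless throughput reaches tens of chunks per second. A worked example with hypothetical numbers:

# Hypothetical: 2.5 chunks/sec at 0.80 average confidence.
balance_score = (2.5 * 0.4) + (0.80 * 100 * 0.6)
print(balance_score)  # 1.0 + 48.0 = 49.0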
def quick_test_multiple_videos(video_urls, chunk_sizes=[10, 15, 20, 30]):
    """Quick test on multiple videos to get average performance"""
    print("🔍 MULTI-VIDEO CHUNK SIZE ANALYSIS")
    print("=" * 60)

    all_results = {size: [] for size in chunk_sizes}

    for i, video_url in enumerate(video_urls, 1):
        print(f"\n📹 Testing Video {i}/{len(video_urls)}")
        try:
            video_results = analyze_chunk_size_performance(video_url, chunk_sizes)
            for result in video_results:
                all_results[result['chunk_size']].append(result)
        except Exception as e:
            print(f"❌ Error with video {i}: {str(e)}")
            continue

    # Calculate averages
    print("\n" + "=" * 60)
    print("📊 AVERAGE PERFORMANCE ACROSS ALL VIDEOS")
    print("=" * 60)

    avg_results = []
    for chunk_size in chunk_sizes:
        if all_results[chunk_size]:
            results = all_results[chunk_size]
            avg_result = {
                'chunk_size': chunk_size,
                'avg_total_time': statistics.mean([r['total_time'] for r in results]),
                'avg_chunks_per_sec': statistics.mean([r['chunks_per_second'] for r in results]),
                'avg_confidence': statistics.mean([r['avg_confidence'] for r in results]),
                'avg_consistency': statistics.mean([r['confidence_consistency'] for r in results]),
                'sample_count': len(results)
            }
            avg_results.append(avg_result)

    if avg_results:
        print(f"{'Size':<6} {'Samples':<8} {'Avg Time':<10} {'Avg Rate':<10} {'Avg Conf':<10} {'Consistency'}")
        print("-" * 60)
        for r in avg_results:
            print(f"{r['chunk_size']:<6} {r['sample_count']:<8} {r['avg_total_time']:<10.2f} {r['avg_chunks_per_sec']:<10.1f} {r['avg_confidence']:<10.3f} {r['avg_consistency']:.3f}")

    return avg_results
if __name__ == "__main__":
    # Test with single video
    video_url = "https://www.youtube.com/watch?v=-JTq1BFBwmo&list=PLDN4rrl48XKpZkf03iYFl-O29szjTrs_O&index=2"

    print("🚀 Starting Single Video Analysis...")
    results = analyze_chunk_size_performance(video_url)

    # Uncomment below to test multiple videos
    # print("\n" + "="*60)
    # print("🚀 Starting Multi-Video Analysis...")
    # video_urls = [
    #     "https://www.youtube.com/watch?v=VIDEO1",
    #     "https://www.youtube.com/watch?v=VIDEO2",
    #     # Add more video URLs here
    # ]
    # multi_results = quick_test_multiple_videos(video_urls)