SoSolaris committed
Commit 5cf4223 · verified · 1 parent: 9c587bc

Upload handler.py

Files changed (1)
handler.py +150 -0
handler.py ADDED
@@ -0,0 +1,150 @@
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
+ import torch
+ import librosa
+ import io
+ import base64
+ from typing import Dict, Any
+
+
+ class EndpointHandler:
+     def __init__(self, path=""):
+         """
+         Initialize the handler for Hugging Face Inference Endpoints.
+         """
+         print("Loading Whisper model...")
+
+         try:
+             # Try Flash Attention 2 first (requires the flash-attn package)
+             try:
+                 self.model = WhisperForConditionalGeneration.from_pretrained(
+                     path,
+                     torch_dtype=torch.bfloat16,
+                     device_map={"": 0},
+                     attn_implementation="flash_attention_2",
+                 )
+                 print("✅ Flash Attention 2 activated!")
+             except ImportError:
+                 print("⚠️ Flash Attention not available, falling back to eager attention")
+                 self.model = WhisperForConditionalGeneration.from_pretrained(
+                     path,
+                     torch_dtype=torch.float16,
+                     device_map="auto",
+                 )
+
+             self.processor = WhisperProcessor.from_pretrained(path)
+
+             # Set to evaluation mode
+             self.model.eval()
+
+             # Compile the model for faster inference where supported
+             if hasattr(torch, "compile"):
+                 try:
+                     self.model = torch.compile(self.model, mode="max-autotune")
+                     print("Model compiled with max-autotune!")
+                 except Exception as e:
+                     print(f"Max-autotune compilation failed, falling back: {e}")
+                     try:
+                         self.model = torch.compile(self.model, mode="reduce-overhead")
+                         print("Model compiled with reduce-overhead!")
+                     except Exception as e2:
+                         print(f"Compilation failed: {e2}")
+
+             # Pre-compute the French decoder prompt IDs once
+             self.french_decoder_ids = self.processor.get_decoder_prompt_ids(
+                 language="french",
+                 task="transcribe",
+             )
+
+             print("Model loaded and optimized successfully!")
+
+         except Exception as e:
+             print(f"Error loading model: {e}")
+             raise
+
+     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
+         """
+         Process the request.
+
+         Args:
+             data (Dict): The request payload containing:
+                 - "inputs": base64-encoded audio file or raw audio bytes
+                 - "parameters": optional generation parameters
+
+         Returns:
+             Dict: The transcription result, or an error message.
+         """
+         try:
+             # Extract inputs
+             inputs = data.get("inputs", "")
+             parameters = data.get("parameters", {})
+
+             # Handle the supported input formats
+             if isinstance(inputs, str):
+                 # Assume base64-encoded audio
+                 try:
+                     audio_bytes = base64.b64decode(inputs)
+                 except Exception:
+                     return {"error": "Invalid base64-encoded audio"}
+             elif isinstance(inputs, bytes):
+                 audio_bytes = inputs
+             else:
+                 return {"error": "Invalid input format. Expected base64 string or bytes"}
+
+             # Validate file size (max 25 MB)
+             if len(audio_bytes) > 25 * 1024 * 1024:
+                 return {"error": "File too large (max 25MB)"}
+
+             # Load audio from bytes, resampled to the 16 kHz mono Whisper expects
+             audio_array, _ = librosa.load(
+                 io.BytesIO(audio_bytes),
+                 sr=16000,
+                 mono=True,
+                 duration=30,  # limit to 30 seconds max
+             )
+
+             # Validate audio
+             if len(audio_array) == 0:
+                 return {"error": "Invalid or empty audio file"}
+
+             # Extract log-mel features for the model
+             model_inputs = self.processor(
+                 audio_array,
+                 sampling_rate=16000,
+                 return_tensors="pt",
+             )
+
+             # Move inputs to the model's device and dtype (bfloat16 or float16,
+             # depending on which load path succeeded above)
+             model_inputs = {
+                 k: v.to(self.model.device, dtype=self.model.dtype)
+                 if v.dtype == torch.float32
+                 else v.to(self.model.device)
+                 for k, v in model_inputs.items()
+             }
+
+             # Extract generation parameters
+             max_length = parameters.get("max_length", 256)
+             num_beams = parameters.get("num_beams", 6)
+
+             # Generate the transcription with anti-hallucination settings
+             # (deterministic beam search, so no temperature is passed)
+             with torch.inference_mode():
+                 predicted_ids = self.model.generate(
+                     **model_inputs,
+                     max_length=max_length,
+                     num_beams=num_beams,
+                     do_sample=False,
+                     early_stopping=True,
+                     no_repeat_ngram_size=3,
+                     repetition_penalty=1.1,
+                     length_penalty=1.0,
+                     use_cache=True,
+                     pad_token_id=self.processor.tokenizer.eos_token_id,
+                     forced_decoder_ids=self.french_decoder_ids,
+                     suppress_tokens=[],
+                     begin_suppress_tokens=[]
+                 )
+
+             # Decode the token IDs into text
+             transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)
+
+             return {"transcription": transcription[0]}
+
+         except Exception as e:
+             return {"error": f"Transcription error: {str(e)}"}
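For reference, the payload contract this handler implements is a JSON object with a base64-encoded "inputs" field and an optional "parameters" object, returning {"transcription": ...} on success or {"error": ...} on failure. Below is a minimal client sketch under that contract; the endpoint URL, token, and audio file name are placeholders, not part of the commit.

import base64
import requests

ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"  # placeholder
HF_TOKEN = "hf_..."  # placeholder token

# Read and base64-encode a local audio file (kept under the handler's 25 MB cap)
with open("sample.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode("utf-8")

response = requests.post(
    ENDPOINT_URL,
    headers={
        "Authorization": f"Bearer {HF_TOKEN}",
        "Content-Type": "application/json",
    },
    json={
        "inputs": audio_b64,
        "parameters": {"max_length": 256, "num_beams": 6},
    },
)
print(response.json())  # {"transcription": "..."} or {"error": "..."}

The same payload dict can also be used to exercise the handler locally, e.g. EndpointHandler(path=...)({"inputs": audio_b64}).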