import gc
import logging
import time

import gradio as gr
import torch
from transformers import (
    pipeline,
    AutoProcessor,
    AutoModelForSpeechSeq2Seq,
    AutoModelForCTC,
    WhisperForConditionalGeneration,
    WhisperProcessor,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class MultiASRApp:
    """Manage a single active ASR pipeline, swapping models on demand.

    Supports two model families: Whisper-style seq2seq models and
    XLS-R (CTC) models. Only one pipeline is kept in memory at a time;
    switching models frees the previous one first.
    """

    def __init__(self):
        self.pipe = None           # active transformers ASR pipeline, or None
        self.current_model = None  # hub id of the currently loaded model
        self.current_kind = None   # "whisper" | "ctc"
        self.available_models = [
            "openai/whisper-tiny",
            "openai/whisper-base",
            "openai/whisper-small",
            "openai/whisper-medium",
            "openai/whisper-large-v2",
            "openai/whisper-large-v3",
            "ilsp/whisper_greek_dialect_of_lesbos",
            "ilsp/xls-r-greek-cretan",
        ]

    # ------------------------
    # Model detection
    # ------------------------
    def detect_model_kind(self, model_name):
        """Classify a hub id as "ctc" (XLS-R family) or "whisper"."""
        lowered = model_name.lower()
        if "xls-r" in lowered or "xlsr" in lowered:
            return "ctc"
        return "whisper"

    def is_fine_tuned_whisper(self, model_name):
        """True for the ilsp/ dialect fine-tunes of Whisper."""
        lowered = model_name.lower()
        return "ilsp/" in lowered and "whisper" in lowered

    # ------------------------
    # Device & dtype
    # ------------------------
    def pick_device(self, conservative=True):
        """Return (device, dtype).

        On CUDA, *conservative* keeps float32 (safer for fine-tuned
        checkpoints); otherwise float16 is used to save memory.
        CPU always uses float32.
        """
        if torch.cuda.is_available():
            return "cuda:0", torch.float32 if conservative else torch.float16
        return "cpu", torch.float32

    # ------------------------
    # Pipeline creation
    # ------------------------
    def create_whisper_pipe(self, model_name):
        """Build an ASR pipeline for a Whisper-style model.

        Tries the Whisper-specific classes first and falls back to the
        generic Auto* classes for checkpoints that don't load as Whisper.
        """
        conservative = self.is_fine_tuned_whisper(model_name)
        device, dtype = self.pick_device(conservative)
        try:
            model = WhisperForConditionalGeneration.from_pretrained(
                model_name,
                torch_dtype=dtype,
                low_cpu_mem_usage=True,
            )
            processor = WhisperProcessor.from_pretrained(model_name)
        except Exception:
            # Fallback: some seq2seq checkpoints aren't Whisper-class.
            model = AutoModelForSpeechSeq2Seq.from_pretrained(
                model_name,
                torch_dtype=dtype,
                low_cpu_mem_usage=True,
            )
            processor = AutoProcessor.from_pretrained(model_name)
        model.to(device)
        return pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            device=device,
            torch_dtype=dtype,
            chunk_length_s=30,
        )

    def create_ctc_pipe(self, model_name):
        """Build an ASR pipeline for an XLS-R (CTC) model."""
        device, dtype = self.pick_device(conservative=True)
        processor = AutoProcessor.from_pretrained(model_name)
        model = AutoModelForCTC.from_pretrained(
            model_name,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
        )
        model.to(device)
        return pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=getattr(processor, "tokenizer", None),
            feature_extractor=getattr(processor, "feature_extractor", None),
            device=device,
            torch_dtype=dtype,
            chunk_length_s=20,
            stride_length_s=(4, 2),
        )

    def load_model(self, model_name):
        """Load *model_name*, replacing any previously loaded model.

        Returns True on success, False on failure (the failed state is
        cleaned up so a later attempt can retry).
        """
        if self.current_model == model_name and self.pipe is not None:
            return True  # already loaded — nothing to do

        self.clear_model()
        kind = self.detect_model_kind(model_name)
        try:
            if kind == "ctc":
                self.pipe = self.create_ctc_pipe(model_name)
            else:
                self.pipe = self.create_whisper_pipe(model_name)
            self.current_model = model_name
            self.current_kind = kind
            return True
        except Exception as e:
            # FIX: the original passed the exception object as the log
            # message (logger.error(e, ...)); use a proper lazy %-format.
            logger.error("Failed to load model %s: %s", model_name, e,
                         exc_info=True)
            self.clear_model()
            return False

    def clear_model(self):
        """Release the current pipeline and reclaim GPU/host memory."""
        if self.pipe is not None:
            del self.pipe
        self.pipe = None
        self.current_model = None
        self.current_kind = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

    # ------------------------
    # Transcription
    # ------------------------
    def transcribe(self, audio, model_name):
        """Transcribe *audio* with *model_name*.

        Returns (text, info) strings; on error the first element is a
        user-facing (Greek) error message and the second is empty.
        """
        if audio is None:
            return "Ανέβασε ένα ηχητικό αρχείο.", ""

        start = time.time()
        if not self.load_model(model_name):
            return "Σφάλμα φόρτωσης μοντέλου.", ""

        # 🔒 FORCE GREEK FOR ALL WHISPER MODELS
        if self.current_kind == "whisper":
            result = self.pipe(
                audio,
                generate_kwargs={
                    "language": "greek",
                    "task": "transcribe",
                },
            )
        else:
            # XLS-R (CTC) — no generation arguments apply.
            result = self.pipe(audio)

        text = result.get("text", "")
        info = (
            f"Μοντέλο: {model_name}\n"
            f"Χρόνος επεξεργασίας: {time.time() - start:.2f} δευτ."
        )
        return text.strip(), info

    def status(self):
        """Human-readable description of the currently loaded model."""
        if not self.current_model:
            return "Δεν έχει φορτωθεί μοντέλο"
        return f"✔ {self.current_model}"


# ------------------------
# Gradio App
# ------------------------
app = MultiASRApp()


def run(audio, model):
    return app.transcribe(audio, model)


def status():
    return app.status()


with gr.Blocks(title="Ίντα λαλείς;", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """ # Ίντα λαλείς; ## Η Τεχνητή Νοημοσύνη μαθαίνει ελληνικές διαλέκτους 🎧 Ανέβασε ένα ηχητικό αρχείο και δες πώς η Τεχνητή Νοημοσύνη αναγνωρίζει την ελληνική γλώσσα και τις διαλέκτους της. 📍 Athens Science Festival 2025 🏛 Ωδείο Αθηνών | 18–21 Δεκεμβρίου 2025 """
    )

    model_status = gr.Textbox(
        label="Κατάσταση μοντέλου",
        value=status(),
        interactive=False,
    )

    with gr.Row():
        with gr.Column():
            audio = gr.Audio(
                label="🎵 Ανέβασε ηχητικό αρχείο",
                type="filepath",
            )
            model = gr.Dropdown(
                choices=app.available_models,
                value="openai/whisper-small",
                label="Μοντέλο αναγνώρισης ομιλίας",
            )
            btn = gr.Button(
                "🗣️ Μετατροπή ομιλίας σε κείμενο",
                variant="primary",
            )
        with gr.Column():
            text_out = gr.Textbox(
                label="📄 Κείμενο",
                lines=8,
                show_copy_button=True,
            )
            info_out = gr.Textbox(
                label="Πληροφορίες",
                lines=4,
            )

    btn.click(
        run,
        inputs=[audio, model],
        outputs=[text_out, info_out],
    ).then(
        # Refresh the status box once transcription has (possibly)
        # loaded a new model — previously it only updated at app start.
        status,
        outputs=model_status,
    )

    # BUG FIX: the original registered `lambda _: status()` with no
    # `inputs=`, so Gradio called the lambda with zero arguments and
    # raised TypeError on every dropdown change. Wire the dropdown
    # value through explicitly (it is ignored; status() reads app state).
    model.change(
        lambda _selected: status(),
        inputs=model,
        outputs=model_status,
    )

    gr.Markdown(
        """ 🔬 Έρευνα & τεχνολογία για τη γλωσσική ποικιλία 🎙️ Η φωνή ως πολιτιστική κληρονομιά """
    )


if __name__ == "__main__":
    demo.launch()