"""OFOQ audio analysis engine.

Loads three Hugging Face models at import time (an audio classifier used for
whisper detection, an Arabic speech-to-text pipeline and a text classifier)
and exposes ``run_ofoq_logic`` which runs them in sequence on an audio clip.
"""

import io

import librosa
import numpy as np
import torch
from pydub import AudioSegment
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    pipeline,
)

# Sample rate (Hz) the audio is loaded at and that is declared to the
# detection feature extractor.
TARGET_SAMPLE_RATE = 16000

# Device configuration (Hugging Face Free Spaces use CPU by default)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 OFOQ Engine is running on: {device.upper()}")

# 1. Load Models from Hugging Face Hub (instead of local folders)
print("⏳ Loading OFOQ Models from Hugging Face Hub...")

# --- Whisper Detection (Your Fine-tuned Model) ---
model_path_detection = "GannaEslam38/OFOQ-Whisper-Detection"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_path_detection)
detection_model = AutoModelForAudioClassification.from_pretrained(
    model_path_detection
).to(device)

# --- Arabic Whisper (Speech-to-Text) ---
whisper_pipeline = pipeline(
    "automatic-speech-recognition",
    model="MohamedRashad/Arabic-Whisper-CodeSwitching-Edition",
    device=0 if torch.cuda.is_available() else -1,
)

# --- MARBERT Cheating Classifier (Your Fine-tuned Model) ---
model_path_cheating = "GannaEslam38/OFOQ-Cheating-Classifier"
bert_tokenizer = AutoTokenizer.from_pretrained(model_path_cheating)
bert_model = AutoModelForSequenceClassification.from_pretrained(
    model_path_cheating
).to(device)
text_classifier = pipeline(
    "text-classification",
    model=bert_model,
    tokenizer=bert_tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)


def _decode_audio(audio_bytes):
    """Decode an encoded audio payload into a waveform at TARGET_SAMPLE_RATE."""
    # pydub sniffs the container format; re-exporting as WAV gives librosa a
    # format it can read from an in-memory buffer.
    segment = AudioSegment.from_file(io.BytesIO(audio_bytes))
    wav_buffer = io.BytesIO()
    segment.export(wav_buffer, format="wav")
    wav_buffer.seek(0)
    waveform, _ = librosa.load(wav_buffer, sr=TARGET_SAMPLE_RATE)
    return waveform


def run_ofoq_logic(audio_bytes: bytes) -> dict:
    """Run the three-phase OFOQ analysis on one encoded audio clip.

    Phases:
        1. The detection model classifies the clip; class ``0`` is reported
           as ``"Normal"`` and ends the analysis.
        2. Otherwise the peak-normalised audio is transcribed with the
           speech-to-text pipeline (language forced to Arabic).
        3. A non-empty transcript is passed to the text classifier; the
           label ``"LABEL_1"`` is reported as ``"Cheating"``, anything else
           as ``"Safe_Whisper"``.

    Args:
        audio_bytes: Raw bytes of an audio file in any container that
            ``pydub.AudioSegment.from_file`` can decode.

    Returns:
        A dict whose ``"status"`` is one of:

        * ``"Error"`` - decoding failed or produced no samples; carries
          ``"message"``.
        * ``"Normal"`` - carries ``"label": 0``.
        * ``"Silent_Whisper"`` - empty transcript; carries ``"label": 0``
          and ``"whisper_text": ""``.
        * ``"Cheating"`` / ``"Safe_Whisper"`` - carries ``"whisper_text"``,
          ``"confidence"``, ``"label"`` (1 or 0) and ``"device_used"``.
    """
    try:
        # Preprocessing
        audio_data = _decode_audio(audio_bytes)
    except Exception as e:
        # Boundary handler: any decoding failure becomes an error result
        # instead of propagating to the caller.
        print(f"❌ Audio Processing Error: {e}")
        return {
            "status": "Error",
            "message": f"Could not process audio format: {e}",
        }

    # A container that decodes to zero samples would otherwise crash later in
    # np.max(), which raises ValueError on a zero-size array.
    if audio_data.size == 0:
        print("❌ Audio Processing Error: decoded audio contains no samples")
        return {
            "status": "Error",
            "message": "Could not process audio format: decoded audio contains no samples",
        }

    # Phase 1: Whisper/Hiss Detection
    inputs = feature_extractor(
        audio_data, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        logits = detection_model(**inputs).logits
    prediction_label = torch.argmax(logits, dim=-1).item()

    if prediction_label == 0:
        return {"status": "Normal", "label": 0}

    # Phase 2: Speech-to-Text via Whisper
    # Peak-normalise to boost quiet audio; the epsilon avoids dividing by
    # zero when the clip is digital silence.
    boosted_audio = audio_data / (np.max(np.abs(audio_data)) + 1e-9)

    # Transcription
    # NOTE(review): a bare ndarray carries no sample rate, so this relies on
    # the ASR model expecting TARGET_SAMPLE_RATE input - confirm.
    stt_res = whisper_pipeline(boosted_audio, generate_kwargs={"language": "arabic"})
    raw_text = stt_res["text"].strip()

    # Phase 3: Intent Classification (Cheating vs Safe)
    if not raw_text:
        return {"status": "Silent_Whisper", "label": 0, "whisper_text": ""}

    # Inference for text classification. truncation=True clips transcripts
    # that exceed the tokenizer's maximum length instead of letting the
    # model fail on an over-long sequence.
    bert_res = text_classifier(raw_text, truncation=True)[0]
    # NOTE(review): assumes the fine-tuned classifier keeps the default
    # "LABEL_0"/"LABEL_1" names, with LABEL_1 meaning cheating - confirm
    # against the model's id2label config.
    is_cheating = 1 if bert_res["label"] == "LABEL_1" else 0

    return {
        "status": "Cheating" if is_cheating == 1 else "Safe_Whisper",
        "whisper_text": raw_text,
        "confidence": float(bert_res["score"]),
        "label": is_cheating,
        "device_used": device,
    }