# OFOQ-AI-Engine / utils.py
# Author: GannaEslam38
# Commit: "Update utils.py" (e3a77d9, verified)
import torch
import librosa
import numpy as np
import io
from pydub import AudioSegment
from transformers import (
AutoFeatureExtractor,
AutoModelForAudioClassification,
pipeline,
AutoTokenizer,
AutoModelForSequenceClassification
)
# Device configuration (Hugging Face Free Spaces use CPU by default).
# The CUDA probe is evaluated once and reused for every component below.
_cuda_available = torch.cuda.is_available()
device = "cuda" if _cuda_available else "cpu"
# transformers pipelines take a device index: 0 = first GPU, -1 = CPU.
_pipeline_device = 0 if _cuda_available else -1
print(f"🚀 OFOQ Engine is running on: {device.upper()}")

# 1. Load models from the Hugging Face Hub (replaces local folders).
print("⏳ Loading OFOQ Models from Hugging Face Hub...")

# --- Whisper detection (fine-tuned audio classifier) ---
model_path_detection = "GannaEslam38/OFOQ-Whisper-Detection"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_path_detection)
detection_model = AutoModelForAudioClassification.from_pretrained(
    model_path_detection
)
detection_model = detection_model.to(device)

# --- Arabic Whisper (speech-to-text) ---
whisper_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="MohamedRashad/Arabic-Whisper-CodeSwitching-Edition",
    device=_pipeline_device,
)

# --- MARBERT cheating classifier (fine-tuned text classifier) ---
model_path_cheating = "GannaEslam38/OFOQ-Cheating-Classifier"
bert_tokenizer = AutoTokenizer.from_pretrained(model_path_cheating)
bert_model = AutoModelForSequenceClassification.from_pretrained(
    model_path_cheating
)
bert_model = bert_model.to(device)
text_classifier = pipeline(
    task="text-classification",
    model=bert_model,
    tokenizer=bert_tokenizer,
    device=_pipeline_device,
)
def run_ofoq_logic(audio_bytes: bytes) -> dict:
    """Run the three-phase OFOQ check on one encoded audio clip.

    Phases:
      1. Decode the clip, resample it to 16 kHz and classify it with the
         whisper-detection model. Class 0 is reported as "Normal" and
         ends the run early.
      2. Peak-normalise the waveform and transcribe it with the Arabic
         Whisper pipeline.
      3. Classify the transcript with the text classifier; the label
         "LABEL_1" is treated as cheating.

    Args:
        audio_bytes: Raw bytes of an audio file that pydub's
            ``AudioSegment.from_file`` can decode.

    Returns:
        A dict that always has a "status" key:
          * "Error": decoding failed or yielded no samples; "message"
            carries the reason.
          * "Normal": phase 1 predicted class 0; "label" is 0.
          * "Silent_Whisper": the transcript was empty; "label" is 0 and
            "whisper_text" is "".
          * "Cheating" / "Safe_Whisper": phase 3 ran; the dict also has
            "whisper_text", "confidence", "label" (1 or 0) and
            "device_used".

    Raises:
        Exceptions raised by the models during phases 1-3 are not caught
        here and propagate to the caller.
    """
    sample_rate = 16000

    try:
        # Preprocessing: pydub decodes the incoming container, then the clip
        # is re-exported as in-memory WAV for librosa to load and resample.
        segment = AudioSegment.from_file(io.BytesIO(audio_bytes))
        wav_io = io.BytesIO()
        segment.export(wav_io, format="wav")
        wav_io.seek(0)
        audio_data, _ = librosa.load(wav_io, sr=sample_rate)
    except Exception as e:
        # Deliberately broad: any decoding failure on caller-supplied bytes
        # is reported as a structured error instead of crashing the caller.
        print(f"❌ Audio Processing Error: {e}")
        return {"status": "Error", "message": f"Could not process audio format: {e}"}

    # A clip that decodes to zero samples would make np.max() below raise
    # ValueError, so report it through the same error shape.
    if audio_data.size == 0:
        reason = "decoded audio contains no samples"
        print(f"❌ Audio Processing Error: {reason}")
        return {"status": "Error", "message": f"Could not process audio format: {reason}"}

    # Phase 1: Whisper/Hiss Detection
    inputs = feature_extractor(
        audio_data, sampling_rate=sample_rate, return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        logits = detection_model(**inputs).logits
    prediction_label = torch.argmax(logits, dim=-1).item()
    if prediction_label == 0:
        return {"status": "Normal", "label": 0}

    # Phase 2: Speech-to-Text via Whisper
    # Scale so the loudest sample has magnitude ~1; the epsilon keeps an
    # all-zero (silent) clip from dividing by zero.
    peak = np.max(np.abs(audio_data))
    boosted_audio = audio_data / (peak + 1e-9)
    stt_res = whisper_pipeline(boosted_audio, generate_kwargs={"language": "arabic"})
    raw_text = stt_res["text"].strip()

    # Phase 3: Intent Classification (Cheating vs Safe)
    if not raw_text:
        return {"status": "Silent_Whisper", "label": 0, "whisper_text": ""}

    # Truncate long transcripts so they cannot exceed the BERT-style
    # classifier's 512-token input limit and fail at inference time.
    bert_res = text_classifier(raw_text, truncation=True, max_length=512)[0]
    is_cheating = 1 if bert_res["label"] == "LABEL_1" else 0
    return {
        "status": "Cheating" if is_cheating == 1 else "Safe_Whisper",
        "whisper_text": raw_text,
        "confidence": float(bert_res["score"]),
        "label": is_cheating,
        "device_used": device,
    }