import torch
import librosa
import numpy as np
import io
from pydub import AudioSegment
from transformers import (
    AutoFeatureExtractor, 
    AutoModelForAudioClassification, 
    pipeline, 
    AutoTokenizer, 
    AutoModelForSequenceClassification
)

# Device configuration (Hugging Face Free Spaces use CPU by default).
# `device` is the torch device string shared by the models loaded below.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"🚀 OFOQ Engine is running on: {device.upper()}")

# 1. Load Models from Hugging Face Hub (instead of local folders)
print("⏳ Loading OFOQ Models from Hugging Face Hub...")

# --- Whisper Detection (Your Fine-tuned Model) ---
# The feature extractor and the audio classifier are both loaded from the
# same Hub repository; the classifier is then moved onto `device`.
model_path_detection = "GannaEslam38/OFOQ-Whisper-Detection"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_path_detection)
detection_model = AutoModelForAudioClassification.from_pretrained(model_path_detection)
detection_model = detection_model.to(device)

# --- Arabic Whisper (Speech-to-Text) ---
# The pipeline takes a device index: 0 when CUDA is available, otherwise -1.
whisper_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="MohamedRashad/Arabic-Whisper-CodeSwitching-Edition",
    device=-1 if not torch.cuda.is_available() else 0,
)

# --- MARBERT Cheating Classifier (Your Fine-tuned Model) ---
# Tokenizer and sequence classifier are loaded from the same Hub repository,
# then wrapped in a text-classification pipeline for one-call inference.
model_path_cheating = "GannaEslam38/OFOQ-Cheating-Classifier"
bert_tokenizer = AutoTokenizer.from_pretrained(model_path_cheating)
bert_model = AutoModelForSequenceClassification.from_pretrained(model_path_cheating)
bert_model = bert_model.to(device)

text_classifier = pipeline(
    task="text-classification",
    tokenizer=bert_tokenizer,
    model=bert_model,
    device=-1 if not torch.cuda.is_available() else 0,
)

def run_ofoq_logic(audio_bytes):
    """Analyse one encoded audio clip and report whether it looks like cheating.

    The clip goes through up to three phases, stopping as soon as a verdict
    is reached:

    1. Decode the bytes with pydub, re-export as WAV and load with librosa at
       16 kHz, then classify with ``detection_model``. Predicted class 0 ends
       the analysis with a "Normal" result.
    2. Peak-normalise the waveform and transcribe it with
       ``whisper_pipeline`` (language forced to Arabic).
    3. Classify the transcript with ``text_classifier``; the label
       ``"LABEL_1"`` is treated as cheating.

    Args:
        audio_bytes: Contents of an encoded audio file, in any format
            ``pydub.AudioSegment.from_file`` can read.

    Returns:
        A dict whose ``"status"`` key is one of:

        * ``"Error"`` - the payload was empty, could not be decoded, or
          decoded to zero samples; ``"message"`` holds the reason.
        * ``"Normal"`` - the detection model predicted class 0
          (``"label": 0``).
        * ``"Silent_Whisper"`` - the detector fired but the transcript was
          empty (``"label": 0``, ``"whisper_text": ""``).
        * ``"Cheating"`` / ``"Safe_Whisper"`` - text classifier verdict, with
          ``"whisper_text"``, ``"confidence"``, ``"label"`` (1 or 0) and
          ``"device_used"``.
    """
    sample_rate = 16000

    try:
        # Reject empty payloads up front instead of relying on the decoder to
        # fail with a less helpful message.
        if not audio_bytes:
            raise ValueError("empty audio payload")

        # Preprocessing: decode whatever container was uploaded, re-export as
        # WAV in memory, then let librosa resample to the target rate.
        audio_segment = AudioSegment.from_file(io.BytesIO(audio_bytes))

        wav_io = io.BytesIO()
        audio_segment.export(wav_io, format="wav")
        wav_io.seek(0)

        audio_data, _ = librosa.load(wav_io, sr=sample_rate)

        # A zero-length waveform would make np.max() below raise on an empty
        # array, so report it through the normal error path instead.
        if audio_data.size == 0:
            raise ValueError("decoded audio contains no samples")

    except Exception as e:
        print(f"❌ Audio Processing Error: {e}")
        return {"status": "Error", "message": f"Could not process audio format: {str(e)}"}

    # Phase 1: Whisper/Hiss Detection
    inputs = feature_extractor(
        audio_data, sampling_rate=sample_rate, return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        logits = detection_model(**inputs).logits
        prediction_label = torch.argmax(logits, dim=-1).item()

    if prediction_label == 0:
        return {"status": "Normal", "label": 0}

    # Phase 2: Speech-to-Text via Whisper
    # Peak-normalise so quiet speech uses the full [-1, 1] range; the epsilon
    # keeps an all-zero clip from dividing by zero.
    boosted_audio = audio_data / (np.max(np.abs(audio_data)) + 1e-9)

    # Transcription
    stt_res = whisper_pipeline(boosted_audio, generate_kwargs={"language": "arabic"})
    raw_text = stt_res["text"].strip()

    # Phase 3: Intent Classification (Cheating vs Safe)
    if not raw_text:
        return {"status": "Silent_Whisper", "label": 0, "whisper_text": ""}

    # Truncate so an unusually long (or runaway) transcript cannot exceed the
    # encoder's position limit and crash inference.
    # NOTE(review): 512 assumes a standard BERT-sized encoder - confirm
    # against the fine-tuned model's config.
    bert_res = text_classifier(raw_text, truncation=True, max_length=512)[0]

    # NOTE(review): relies on the classifier exposing default "LABEL_<id>"
    # names; verify id2label in the model config.
    is_cheating = 1 if bert_res['label'] == "LABEL_1" else 0

    return {
        "status": "Cheating" if is_cheating == 1 else "Safe_Whisper",
        "whisper_text": raw_text,
        "confidence": float(bert_res['score']),
        "label": is_cheating,
        "device_used": device
    }