# ==============================
# MODEL + MODALITY PROCESSING
# ==============================
from transformers import BlipProcessor, BlipForConditionalGeneration
from database import save_analysis
from datetime import datetime
from transformers import pipeline
from fusion import compute_fusion
import torch
import time
import os

# Inference-only module: disable autograd globally so no computation
# graphs are built (saves memory and time on every pipeline call).
torch.set_grad_enabled(False)

# transformers pipelines use 0 for the first CUDA device, -1 for CPU.
device = 0 if torch.cuda.is_available() else -1

print("Loading AI models...")

text_pipeline = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    device=device,
    batch_size=8
)

emotion_pipeline = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    device=device
)

topic_pipeline = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device
)

image_pipeline = pipeline(
    "image-classification",
    model="google/vit-base-patch16-224",
    device=device,
    batch_size=8
)

print("Loading image caption model...")

# Captioning is done directly through the BLIP processor + model
# (see multimodal_analyze); the processor/model pair gives us control
# over device placement for generate().
caption_processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
caption_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
caption_model.eval()

# NOTE(review): this pipeline duplicates the BLIP model above and is not
# used by multimodal_analyze — kept because other modules may import it.
caption_pipeline = pipeline(
    "image-text-to-text",
    model="Salesforce/blip-image-captioning-base",
    device=device
)

audio_pipeline = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",
    device=device
)

keyword_pipeline = pipeline(
    "token-classification",
    model="ml6team/keyphrase-extraction-kbir-inspec",
    aggregation_strategy="simple",
    device=device
)

print("Models loaded successfully.")

# Put every underlying model into eval mode (disables dropout etc.).
text_pipeline.model.eval()
image_pipeline.model.eval()
audio_pipeline.model.eval()
caption_pipeline.model.eval()
emotion_pipeline.model.eval()
topic_pipeline.model.eval()
keyword_pipeline.model.eval()


# ================= SCORE NORMALIZATION =================
def normalize_score(score):
    """Normalize a model confidence to a 0-100 percentage.

    Scores in [0, 1] are treated as probabilities and scaled by 100;
    anything above 1 is assumed to already be a percentage.
    """
    if score <= 1:
        return round(score * 100, 2)
    return round(score, 2)


def multimodal_analyze(text, image, audio):
    """Run sentiment/emotion/topic, image and audio analysis and fuse the results.

    Parameters
    ----------
    text : str or None
        Free-form text to analyze (may be empty/None).
    image : PIL image / path or None
        Image accepted by the HF image pipelines.
    audio : path / array or None
        Audio accepted by the ASR pipeline.

    Returns
    -------
    tuple of str
        (fusion_summary, text_result_display, image_result_display,
        audio_result_display) — markdown-formatted panels.
    """
    start_time = time.time()

    text_label = None
    text_conf = 0

    topics = [
        "technology", "business", "education", "politics",
        "entertainment", "health", "science", "sports",
        "Lifestyle", "Personal Development", "History & Humanities",
        "Finance", "Case Studies", "News & Analysis", "Interviews"
    ]

    # Keyword extraction is best-effort: any failure (including None text)
    # simply yields no keywords.
    keywords = []
    try:
        kw_results = keyword_pipeline(text)
        keywords = [k["word"] for k in kw_results[:5]]
    except Exception:
        keywords = []

    image_label = None
    image_conf = 0
    audio_label = None
    audio_conf = 0
    transcription = None
    # FIX: initialize everything the fusion-reasoning section reads, so a
    # missing modality can never raise NameError (previously topic_label,
    # emotion_label and caption were only bound inside their try blocks).
    topic_label = None
    emotion_label = None
    caption = None

    text_result_display = "No text provided."
    image_result_display = "No image provided."
    audio_result_display = "No audio provided."

    # ================= TEXT =================
    if text and text.strip():
        try:
            res = text_pipeline(text)[0]
            emotion_res = emotion_pipeline(text)[0]
            topic_res = topic_pipeline(text, topics)

            topic_label = topic_res["labels"][0]
            topic_conf = round(topic_res["scores"][0] * 100, 2)

            emotion_label = emotion_res["label"]
            emotion_conf = round(emotion_res["score"] * 100, 2)

            # cardiffnlp model emits LABEL_0/1/2; map to readable names.
            label_map = {
                "LABEL_0": "NEGATIVE",
                "LABEL_1": "NEUTRAL",
                "LABEL_2": "POSITIVE"
            }
            text_label = label_map.get(res["label"], res["label"])
            text_conf = normalize_score(res["score"])

            text_result_display = f"""
## 📝 Text Intelligence
Sentiment: **{text_label}**
Confidence: **{text_conf}%**
Emotion: **{emotion_label}**
Emotion Confidence: **{emotion_conf}%**
Topic: **{topic_label}**
Topic Confidence: **{topic_conf}%**

### 🔑 Key Concepts
{", ".join(keywords) if keywords else "None detected"}
"""
        except Exception as e:
            text_result_display = f"Text error: {str(e)}"

    # ================= IMAGE =================
    if image is not None:
        try:
            results = image_pipeline(image)

            inputs = caption_processor(image, return_tensors="pt")
            if device == 0:
                inputs = {k: v.to("cuda") for k, v in inputs.items()}
            out = caption_model.generate(**inputs)
            caption = caption_processor.decode(out[0], skip_special_tokens=True)

            image_result_display = "## 🖼 Image Classification\n\n"
            image_result_display += "### Objects Detected\n"
            for r in results[:3]:
                label = r["label"]
                conf = round(r["score"] * 100, 2)
                image_result_display += f"- **{label}** ({conf}%)\n"
            image_result_display += f"\n### Caption\n\"{caption}\""

            image_label = results[0]["label"]
            # FIX: was normalize_score(res["score"]) — `res` is the TEXT
            # pipeline result (wrong value, NameError when no text given).
            image_conf = normalize_score(results[0]["score"])
        except Exception as e:
            image_result_display = f"Image error: {str(e)}"

    # ================= AUDIO =================
    if audio is not None:
        try:
            res = audio_pipeline(audio)
            transcription = res["text"]

            # Sentiment on the transcription gives the spoken "tone".
            sent = text_pipeline(transcription)[0]
            label_map = {
                "LABEL_0": "NEGATIVE",
                "LABEL_1": "NEUTRAL",
                "LABEL_2": "POSITIVE"
            }
            audio_label = label_map.get(sent["label"], sent["label"])
            audio_conf = round(sent["score"] * 100, 2)

            audio_result_display = f"""
## 🎙 Audio Intelligence
Transcription: "{transcription}"
Tone: **{audio_label}**
Confidence: **{audio_conf}%**
"""
        except Exception as e:
            audio_result_display = f"Audio error: {str(e)}"

    # ================= FUSION REASONING =================
    reasoning_lines = []

    if text_label:
        reasoning_lines.append(
            f"📝 Text Analysis: "
            f"The text expresses a {text_label.lower()} sentiment with emotion "
            f"'{emotion_label}'. The topic appears related to {topic_label}."
        )
        if keywords:
            reasoning_lines.append(
                f"🔑 Key Concepts: "
                f"{', '.join(keywords)}."
            )

    # ================= IMAGE REASONING =================
    if image_label:
        reasoning_lines.append(
            f"🖼 Image Analysis: "
            f"Classifier predicted '{image_label}' ({image_conf}% confidence), "
            "though this may be an approximate category."
        )
        if caption:
            reasoning_lines.append(
                f"📷 Scene Description: "
                f"'{caption}', providing a clearer interpretation of the image."
            )

    if audio_label:
        reasoning_lines.append(
            f"🎙 Audio Tone: "
            f"Spoken content carries a {audio_label.lower()} tone ({audio_conf}%)."
        )

    # ================= CONTRADICTION DETECTION =================
    if text_label and audio_label:
        if text_label == "POSITIVE" and audio_label == "NEGATIVE":
            reasoning_lines.append(
                "⚠️ Cross-Modal Conflict: "
                " the text expresses positivity "
                "but the voice tone suggests negativity."
            )
        if text_label == "NEGATIVE" and audio_label == "POSITIVE":
            reasoning_lines.append(
                "⚠ Cross-Modal Conflict: "
                "Text expresses negativity while the voice tone appears positive."
            )

    if image_label and topic_label:
        if topic_label == "technology":
            reasoning_lines.append(
                "💻🤖 Technology: "
                "The textual topic relates to technology, which aligns with the detected visual content."
            )

    # ================= FUSION SCORE =================
    # Weighted, signed combination: text sentiment contributes with sign,
    # image/audio confidences contribute positively.
    text_weight = 0.4
    image_weight = 0.3
    audio_weight = 0.3

    fusion_score = 0
    if text_label:
        if text_label == "POSITIVE":
            fusion_score += text_conf * text_weight
        elif text_label == "NEGATIVE":
            fusion_score -= text_conf * text_weight
    if image_label:
        fusion_score += image_conf * image_weight
    if audio_label:
        fusion_score += audio_conf * audio_weight

    # ================= INTERPRETATION =================
    if fusion_score >= 60:
        alignment_message = "Multimodal signals align toward a positive and confident contextual interpretation."
        color = "#22c55e"  # GREEN
    elif fusion_score >= 20:
        alignment_message = "Multimodal signals show moderate contextual alignment."
        color = "#f59e0b"  # ORANGE
    else:
        alignment_message = "Multimodal signals are weak, mixed, or uncertain."
        color = "#ef4444"  # RED

    # ================= CONSISTENCY SCORE =================
    tech_words = [
        "robot", "computer", "ai", "machine", "device",
        "tablet", "software", "screen", "technology"
    ]

    alignment_detected = False
    if topic_label and caption:
        if topic_label == "technology":
            for word in tech_words:
                if word in caption.lower():
                    alignment_detected = True
                    break

    consistency = 0
    checks = 0
    if text_label and audio_label:
        checks += 1
        if text_label == audio_label:
            consistency += 1
    if caption and topic_label:
        checks += 1
        if topic_label in caption.lower() or alignment_detected:
            consistency += 1

    if checks > 0:
        consistency_score = round((consistency / checks) * 100, 2)
    else:
        consistency_score = 0

    # ================= SEMANTIC ALIGNMENT =================
    if consistency_score > 70:
        consistency_text = "High consistency across modalities."
    elif consistency_score > 40:
        consistency_text = "Moderate alignment between modalities."
    else:
        consistency_text = "Low consistency detected across modalities."

    if alignment_detected:
        reasoning_lines.append(
            "🤖 Technology Alignment: "
            "Visual content contains technology-related objects matching the text topic."
        )

    # ================= DISPLAY SUMMARY =================
    if not reasoning_lines:
        reasoning_lines.append("No multimodal signals detected from the provided inputs.")

    processing_time = round(time.time() - start_time, 2)

    # FIX: join hoisted out of the f-string — a newline inside an f-string
    # expression is a SyntaxError before Python 3.12.
    reasoning_text = "\n".join(reasoning_lines)

    fusion_summary = f"""
🔎 Multimodal Intelligence Summary

{reasoning_text}

📊 Fusion Score

{round(fusion_score, 2)}

🔗 Multimodal Consistency

{consistency_score}%
{consistency_text}

🧠 Interpretation

{alignment_message}
⏱ Processing Time: {processing_time} sec
"""

    # ================= SAVE HISTORY =================
    save_analysis({
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "text": text,
        "image": image_label,
        "audio": audio_label,
        "transcription": transcription,
        "fusion_score": round(fusion_score, 2)
    })

    return fusion_summary, text_result_display, image_result_display, audio_result_display