# ==============================
# MODEL + MODALITY PROCESSING
# ==============================
from transformers import BlipProcessor, BlipForConditionalGeneration
from database import save_analysis
from datetime import datetime
from transformers import pipeline
from fusion import compute_fusion
import torch
import time
import os

# Inference-only module: disable autograd globally so no computation
# graphs are built (saves memory and time on every pipeline call).
torch.set_grad_enabled(False)

# transformers pipelines use 0 for the first CUDA device, -1 for CPU.
device = 0 if torch.cuda.is_available() else -1

print("Loading AI models...")

text_pipeline = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    device=device,
    batch_size=8
)

emotion_pipeline = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    device=device
)

topic_pipeline = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device
)

image_pipeline = pipeline(
    "image-classification",
    model="google/vit-base-patch16-224",
    device=device,
    batch_size=8
)

print("Loading image caption model...")

# Captioning is done directly through the BLIP processor + model
# (see multimodal_analyze); the processor/model pair gives us control
# over device placement for generate().
caption_processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
caption_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
caption_model.eval()

# NOTE(review): this pipeline duplicates the BLIP model above and is not
# used by multimodal_analyze — kept because other modules may import it.
caption_pipeline = pipeline(
    "image-text-to-text",
    model="Salesforce/blip-image-captioning-base",
    device=device
)

audio_pipeline = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",
    device=device
)

keyword_pipeline = pipeline(
    "token-classification",
    model="ml6team/keyphrase-extraction-kbir-inspec",
    aggregation_strategy="simple",
    device=device
)

print("Models loaded successfully.")

# Put every underlying model into eval mode (disables dropout etc.).
text_pipeline.model.eval()
image_pipeline.model.eval()
audio_pipeline.model.eval()
caption_pipeline.model.eval()
emotion_pipeline.model.eval()
topic_pipeline.model.eval()
keyword_pipeline.model.eval()


# ================= SCORE NORMALIZATION =================
def normalize_score(score):
    """Normalize a model confidence to a 0-100 percentage.

    Scores in [0, 1] are treated as probabilities and scaled by 100;
    anything above 1 is assumed to already be a percentage.
    """
    if score <= 1:
        return round(score * 100, 2)
    return round(score, 2)


def multimodal_analyze(text, image, audio):
    """Run sentiment/emotion/topic, image and audio analysis and fuse the results.

    Parameters
    ----------
    text : str or None
        Free-form text to analyze (may be empty/None).
    image : PIL image / path or None
        Image accepted by the HF image pipelines.
    audio : path / array or None
        Audio accepted by the ASR pipeline.

    Returns
    -------
    tuple of str
        (fusion_summary, text_result_display, image_result_display,
        audio_result_display) — markdown-formatted panels.
    """
    start_time = time.time()

    text_label = None
    text_conf = 0

    topics = [
        "technology", "business", "education", "politics",
        "entertainment", "health", "science", "sports",
        "Lifestyle", "Personal Development", "History & Humanities",
        "Finance", "Case Studies", "News & Analysis", "Interviews"
    ]

    # Keyword extraction is best-effort: any failure (including None text)
    # simply yields no keywords.
    keywords = []
    try:
        kw_results = keyword_pipeline(text)
        keywords = [k["word"] for k in kw_results[:5]]
    except Exception:
        keywords = []

    image_label = None
    image_conf = 0
    audio_label = None
    audio_conf = 0
    transcription = None
    # FIX: initialize everything the fusion-reasoning section reads, so a
    # missing modality can never raise NameError (previously topic_label,
    # emotion_label and caption were only bound inside their try blocks).
    topic_label = None
    emotion_label = None
    caption = None

    text_result_display = "No text provided."
    image_result_display = "No image provided."
    audio_result_display = "No audio provided."

    # ================= TEXT =================
    if text and text.strip():
        try:
            res = text_pipeline(text)[0]
            emotion_res = emotion_pipeline(text)[0]
            topic_res = topic_pipeline(text, topics)

            topic_label = topic_res["labels"][0]
            topic_conf = round(topic_res["scores"][0] * 100, 2)

            emotion_label = emotion_res["label"]
            emotion_conf = round(emotion_res["score"] * 100, 2)

            # cardiffnlp model emits LABEL_0/1/2; map to readable names.
            label_map = {
                "LABEL_0": "NEGATIVE",
                "LABEL_1": "NEUTRAL",
                "LABEL_2": "POSITIVE"
            }
            text_label = label_map.get(res["label"], res["label"])
            text_conf = normalize_score(res["score"])

            text_result_display = f"""
## 📝 Text Intelligence
Sentiment: **{text_label}**
Confidence: **{text_conf}%**
Emotion: **{emotion_label}**
Emotion Confidence: **{emotion_conf}%**
Topic: **{topic_label}**
Topic Confidence: **{topic_conf}%**

### 🔑 Key Concepts
{", ".join(keywords) if keywords else "None detected"}
"""
        except Exception as e:
            text_result_display = f"Text error: {str(e)}"

    # ================= IMAGE =================
    if image is not None:
        try:
            results = image_pipeline(image)

            inputs = caption_processor(image, return_tensors="pt")
            if device == 0:
                inputs = {k: v.to("cuda") for k, v in inputs.items()}
            out = caption_model.generate(**inputs)
            caption = caption_processor.decode(out[0], skip_special_tokens=True)

            image_result_display = "## 🖼 Image Classification\n\n"
            image_result_display += "### Objects Detected\n"
            for r in results[:3]:
                label = r["label"]
                conf = round(r["score"] * 100, 2)
                image_result_display += f"- **{label}** ({conf}%)\n"
            image_result_display += f"\n### Caption\n\"{caption}\""

            image_label = results[0]["label"]
            # FIX: was normalize_score(res["score"]) — `res` is the TEXT
            # pipeline result (wrong value, NameError when no text given).
            image_conf = normalize_score(results[0]["score"])
        except Exception as e:
            image_result_display = f"Image error: {str(e)}"

    # ================= AUDIO =================
    if audio is not None:
        try:
            res = audio_pipeline(audio)
            transcription = res["text"]

            # Sentiment on the transcription gives the spoken "tone".
            sent = text_pipeline(transcription)[0]
            label_map = {
                "LABEL_0": "NEGATIVE",
                "LABEL_1": "NEUTRAL",
                "LABEL_2": "POSITIVE"
            }
            audio_label = label_map.get(sent["label"], sent["label"])
            audio_conf = round(sent["score"] * 100, 2)

            audio_result_display = f"""
## 🎙 Audio Intelligence
Transcription: "{transcription}"
Tone: **{audio_label}**
Confidence: **{audio_conf}%**
"""
        except Exception as e:
            audio_result_display = f"Audio error: {str(e)}"

    # ================= FUSION REASONING =================
    reasoning_lines = []

    if text_label:
        reasoning_lines.append(
            f"📝 Text Analysis: "
            f"The text expresses a {text_label.lower()} sentiment with emotion "
            f"'{emotion_label}'. The topic appears related to {topic_label}."
        )
        if keywords:
            reasoning_lines.append(
                f"🔑 Key Concepts: "
                f"{', '.join(keywords)}."
            )

    # ================= IMAGE REASONING =================
    if image_label:
        reasoning_lines.append(
            f"🖼 Image Analysis: "
            f"Classifier predicted '{image_label}' ({image_conf}% confidence), "
            "though this may be an approximate category."
        )
        if caption:
            reasoning_lines.append(
                f"📷 Scene Description: "
                f"'{caption}', providing a clearer interpretation of the image."
            )

    if audio_label:
        reasoning_lines.append(
            f"🎙 Audio Tone: "
            f"Spoken content carries a {audio_label.lower()} tone ({audio_conf}%)."
        )

    # ================= CONTRADICTION DETECTION =================
    if text_label and audio_label:
        if text_label == "POSITIVE" and audio_label == "NEGATIVE":
            reasoning_lines.append(
                "⚠️ Cross-Modal Conflict: "
                " the text expresses positivity "
                "but the voice tone suggests negativity."
            )
        if text_label == "NEGATIVE" and audio_label == "POSITIVE":
            reasoning_lines.append(
                "⚠ Cross-Modal Conflict: "
                "Text expresses negativity while the voice tone appears positive."
            )

    if image_label and topic_label:
        if topic_label == "technology":
            reasoning_lines.append(
                "💻🤖 Technology: "
                "The textual topic relates to technology, which aligns with the detected visual content."
            )

    # ================= FUSION SCORE =================
    # Weighted, signed combination: text sentiment contributes with sign,
    # image/audio confidences contribute positively.
    text_weight = 0.4
    image_weight = 0.3
    audio_weight = 0.3

    fusion_score = 0
    if text_label:
        if text_label == "POSITIVE":
            fusion_score += text_conf * text_weight
        elif text_label == "NEGATIVE":
            fusion_score -= text_conf * text_weight
    if image_label:
        fusion_score += image_conf * image_weight
    if audio_label:
        fusion_score += audio_conf * audio_weight

    # ================= INTERPRETATION =================
    if fusion_score >= 60:
        alignment_message = "Multimodal signals align toward a positive and confident contextual interpretation."
        color = "#22c55e"  # GREEN
    elif fusion_score >= 20:
        alignment_message = "Multimodal signals show moderate contextual alignment."
        color = "#f59e0b"  # ORANGE
    else:
        alignment_message = "Multimodal signals are weak, mixed, or uncertain."
        color = "#ef4444"  # RED

    # ================= CONSISTENCY SCORE =================
    tech_words = [
        "robot", "computer", "ai", "machine", "device",
        "tablet", "software", "screen", "technology"
    ]

    alignment_detected = False
    if topic_label and caption:
        if topic_label == "technology":
            for word in tech_words:
                if word in caption.lower():
                    alignment_detected = True
                    break

    consistency = 0
    checks = 0
    if text_label and audio_label:
        checks += 1
        if text_label == audio_label:
            consistency += 1
    if caption and topic_label:
        checks += 1
        if topic_label in caption.lower() or alignment_detected:
            consistency += 1

    if checks > 0:
        consistency_score = round((consistency / checks) * 100, 2)
    else:
        consistency_score = 0

    # ================= SEMANTIC ALIGNMENT =================
    if consistency_score > 70:
        consistency_text = "High consistency across modalities."
    elif consistency_score > 40:
        consistency_text = "Moderate alignment between modalities."
    else:
        consistency_text = "Low consistency detected across modalities."

    if alignment_detected:
        reasoning_lines.append(
            "🤖 Technology Alignment: "
            "Visual content contains technology-related objects matching the text topic."
        )

    # ================= DISPLAY SUMMARY =================
    if not reasoning_lines:
        reasoning_lines.append("No multimodal signals detected from the provided inputs.")

    processing_time = round(time.time() - start_time, 2)

    # FIX: join hoisted out of the f-string — a newline inside an f-string
    # expression is a SyntaxError before Python 3.12.
    reasoning_text = "\n".join(reasoning_lines)

    fusion_summary = f"""
🔎 Multimodal Intelligence Summary

{reasoning_text}

📊 Fusion Score

{round(fusion_score, 2)}

🔗 Multimodal Consistency

{consistency_score}%
{consistency_text}

🧠 Interpretation

{alignment_message}
⏱ Processing Time: {processing_time} sec
"""

    # ================= SAVE HISTORY =================
    save_analysis({
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "text": text,
        "image": image_label,
        "audio": audio_label,
        "transcription": transcription,
        "fusion_score": round(fusion_score, 2)
    })

    return fusion_summary, text_result_display, image_result_display, audio_result_display