# NOTE: The three lines that were here ("Spaces:" / "Running" / "Running") are
# Hugging Face Spaces page-status banner text captured along with the source,
# not Python code; kept as a comment so the module stays importable.
| # ============================== | |
| # MODEL + MODALITY PROCESSING | |
| # ============================== | |
| from transformers import BlipProcessor, BlipForConditionalGeneration | |
| from database import save_analysis | |
| from datetime import datetime | |
| from transformers import pipeline | |
| from fusion import compute_fusion | |
| import torch | |
| import time | |
| import os | |
# Inference-only service: disable autograd globally so no computation graphs
# or gradients are kept in memory.
torch.set_grad_enabled(False)

# transformers' pipeline() device convention: 0 = first CUDA GPU, -1 = CPU.
device = 0 if torch.cuda.is_available() else -1

print("Loading AI models...")

# --- Text models -----------------------------------------------------------
text_pipeline = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    device=device,
    batch_size=8
)
emotion_pipeline = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    device=device
)
topic_pipeline = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device
)

# --- Vision models ---------------------------------------------------------
image_pipeline = pipeline(
    "image-classification",
    model="google/vit-base-patch16-224",
    device=device,
    batch_size=8
)

print("Loading image caption model...")
caption_processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
caption_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
caption_model.eval()
# BUG FIX: callers move caption inputs to "cuda" when a GPU is visible, but
# the model itself was never moved off the CPU — put it on the GPU too.
if torch.cuda.is_available():
    caption_model = caption_model.to("cuda")

# BUG FIX: BlipForConditionalGeneration is an image-captioning model; its
# pipeline task is "image-to-text".  The previous task string,
# "image-text-to-text", names a different (chat-style VLM) task and does not
# load this checkpoint.
# NOTE(review): this pipeline loads the same weights as `caption_model` a
# second time; if nothing outside this file uses `caption_pipeline`, drop it
# to halve the captioner's memory footprint — confirm before removing.
caption_pipeline = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-base",
    device=device
)

# --- Audio / keyword models ------------------------------------------------
audio_pipeline = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",
    device=device
)
keyword_pipeline = pipeline(
    "token-classification",
    model="ml6team/keyphrase-extraction-kbir-inspec",
    aggregation_strategy="simple",  # merge sub-tokens into whole keyphrases
    device=device
)

print("Models loaded successfully.")

# Put every underlying model in eval mode (disables dropout etc.); pipelines
# already do this on load, but being explicit is harmless.
text_pipeline.model.eval()
image_pipeline.model.eval()
audio_pipeline.model.eval()
caption_pipeline.model.eval()
emotion_pipeline.model.eval()
topic_pipeline.model.eval()
keyword_pipeline.model.eval()
| # ================= SCORE NORMALIZATION ================= | |
def normalize_score(score):
    """Return *score* as a percentage rounded to two decimal places.

    Values in [0, 1] are treated as probabilities and scaled by 100;
    anything above 1 is assumed to already be a percentage.
    """
    as_percent = score * 100 if score <= 1 else score
    return round(as_percent, 2)
def multimodal_analyze(text, image, audio):
    """Analyze up to three modalities and fuse them into one summary.

    Parameters
    ----------
    text : str or None
        Free text for the sentiment / emotion / topic / keyword models.
    image : image input or None
        Anything the transformers image pipelines accept (PIL image, path, ...)
        — TODO confirm against the UI caller.
    audio : audio input or None
        Anything the ASR pipeline accepts (path, waveform, ...).

    Returns
    -------
    tuple[str, str, str, str]
        (fusion summary HTML, text markdown, image markdown, audio markdown).

    Missing modalities are skipped; a model failure inside one modality is
    reported in that modality's display string instead of raising.
    """
    start_time = time.time()

    # Raw roberta sentiment labels -> readable names (shared by text & audio).
    sentiment_label_map = {
        "LABEL_0": "NEGATIVE",
        "LABEL_1": "NEUTRAL",
        "LABEL_2": "POSITIVE"
    }

    # Candidate labels for zero-shot topic classification.
    topics = [
        "technology",
        "business",
        "education",
        "politics",
        "entertainment",
        "health",
        "science",
        "sports",
        "Lifestyle",
        "Personal Development",
        "History & Humanities",
        "Finance",
        "Case Studies",
        "News & Analysis",
        "Interviews"
    ]

    # Every per-modality result defaults to "absent" so the fusion / reasoning
    # code below can test them unconditionally.  (BUG FIX: the original left
    # topic_label / emotion_label / caption unassigned unless the text or
    # image branch ran, raising NameError when that modality was missing.)
    text_label = None
    text_conf = 0
    emotion_label = None
    emotion_conf = 0
    topic_label = None
    topic_conf = 0
    keywords = []
    image_label = None
    image_conf = 0
    caption = None
    audio_label = None
    audio_conf = 0
    transcription = None

    text_result_display = "No text provided."
    image_result_display = "No image provided."
    audio_result_display = "No audio provided."

    # ================= TEXT =================
    if text and text.strip():
        # Keyword extraction is best-effort; a failure must not abort the run.
        try:
            kw_results = keyword_pipeline(text)
            keywords = [k["word"] for k in kw_results[:5]]
        except Exception:
            keywords = []
        try:
            res = text_pipeline(text)[0]
            emotion_res = emotion_pipeline(text)[0]
            topic_res = topic_pipeline(text, topics)
            topic_label = topic_res["labels"][0]
            topic_conf = round(topic_res["scores"][0] * 100, 2)
            emotion_label = emotion_res["label"]
            emotion_conf = round(emotion_res["score"] * 100, 2)
            text_label = sentiment_label_map.get(res["label"], res["label"])
            text_conf = normalize_score(res["score"])
            text_result_display = f"""
## π Text Intelligence
Sentiment: **{text_label}**
Confidence: **{text_conf}%**
Emotion: **{emotion_label}**
Emotion Confidence: **{emotion_conf}%**
Topic: **{topic_label}**
Topic Confidence: **{topic_conf}%**
### π Key Concepts
{", ".join(keywords) if keywords else "None detected"}
"""
        except Exception as e:
            text_result_display = f"Text error: {str(e)}"

    # ================= IMAGE =================
    if image is not None:
        try:
            results = image_pipeline(image)
            # BLIP captioning: run processor + model directly.
            inputs = caption_processor(image, return_tensors="pt")
            # BUG FIX: inputs were unconditionally pushed to "cuda" whenever a
            # GPU was visible, but caption_model may live on the CPU; follow
            # the model's actual device instead.
            model_device = caption_model.device
            inputs = {k: v.to(model_device) for k, v in inputs.items()}
            out = caption_model.generate(**inputs)
            caption = caption_processor.decode(out[0], skip_special_tokens=True)
            image_result_display = "## πΌ Image Classification\n\n"
            image_result_display += "### Objects Detected\n"
            for r in results[:3]:
                label = r["label"]
                conf = round(r["score"] * 100, 2)
                image_result_display += f"- **{label}** ({conf}%)\n"
            image_result_display += f"\n### Caption\n\"{caption}\""
            image_label = results[0]["label"]
            # BUG FIX: was `normalize_score(res["score"])` — the TEXT
            # sentiment result: NameError when no text was given, and the
            # wrong confidence otherwise.
            image_conf = normalize_score(results[0]["score"])
        except Exception as e:
            image_result_display = f"Image error: {str(e)}"

    # ================= AUDIO =================
    if audio is not None:
        try:
            asr = audio_pipeline(audio)
            transcription = asr["text"]
            # Sentiment of the spoken words, via the same text model.
            sent = text_pipeline(transcription)[0]
            audio_label = sentiment_label_map.get(sent["label"], sent["label"])
            audio_conf = round(sent["score"] * 100, 2)
            audio_result_display = f"""
## π Audio Intelligence
Transcription:
"{transcription}"
Tone: **{audio_label}**
Confidence: **{audio_conf}%**
"""
        except Exception as e:
            audio_result_display = f"Audio error: {str(e)}"

    # ================= FUSION REASONING =================
    reasoning_lines = []
    if text_label:
        reasoning_lines.append(
            f"<b style='color:#60a5fa;'>π Text Analysis:</b> "
            f"The text expresses a {text_label.lower()} sentiment with emotion "
            f"'{emotion_label}'. The topic appears related to {topic_label}."
        )
    if keywords:
        reasoning_lines.append(
            f"<b style='color:#38bdf8;'>π Key Concepts:</b> "
            f"{', '.join(keywords)}."
        )
    if image_label:
        reasoning_lines.append(
            f"<b style='color:#22c55e;'>πΌ Image Analysis:</b> "
            f"Classifier predicted '{image_label}' ({image_conf}% confidence), "
            "though this may be an approximate category."
        )
    if caption:
        reasoning_lines.append(
            f"<b style='color:#a78bfa;'>π· Scene Description:</b> "
            f"'{caption}', providing a clearer interpretation of the image."
        )
    if audio_label:
        reasoning_lines.append(
            f"<b style='color:#f59e0b;'>π Audio Tone:</b> "
            f"Spoken content carries a {audio_label.lower()} tone ({audio_conf}%)."
        )

    # ---- Cross-modal contradiction detection ----
    if text_label and audio_label:
        if text_label == "POSITIVE" and audio_label == "NEGATIVE":
            reasoning_lines.append(
                "<b style='color:#ef4444;'>β οΈ Cross-Modal Conflict:</b> "
                " the text expresses positivity "
                "but the voice tone suggests negativity."
            )
        if text_label == "NEGATIVE" and audio_label == "POSITIVE":
            reasoning_lines.append(
                "<b style='color:#ef4444;'>β Cross-Modal Conflict:</b> "
                "Text expresses negativity while the voice tone appears positive."
            )
    if image_label and topic_label == "technology":
        reasoning_lines.append(
            "<b style='color:#ef4444;'>π»π€ Technology:</b> "
            "The textual topic relates to technology, which aligns with the detected visual content."
        )

    # ================= FUSION SCORE =================
    # Weighted blend: text sentiment is signed, image / audio confidences only
    # ever add.  NOTE(review): a NEGATIVE audio tone still contributes
    # positively — preserved as-is so existing scores don't shift.
    text_weight = 0.4
    image_weight = 0.3
    audio_weight = 0.3
    fusion_score = 0
    if text_label:
        if text_label == "POSITIVE":
            fusion_score += text_conf * text_weight
        elif text_label == "NEGATIVE":
            fusion_score -= text_conf * text_weight
    if image_label:
        fusion_score += image_conf * image_weight
    if audio_label:
        fusion_score += audio_conf * audio_weight

    # ================= INTERPRETATION =================
    if fusion_score >= 60:
        alignment_message = "Multimodal signals align toward a positive and confident contextual interpretation."
        color = "#22c55e"  # green
    elif fusion_score >= 20:
        alignment_message = "Multimodal signals show moderate contextual alignment."
        color = "#f59e0b"  # orange
    else:
        alignment_message = "Multimodal signals are weak, mixed, or uncertain."
        color = "#ef4444"  # red

    # ================= CONSISTENCY SCORE =================
    # Loose text/image agreement check: does the caption mention anything
    # technology-flavored when the topic model said "technology"?
    tech_words = [
        "robot", "computer", "ai", "machine", "device",
        "tablet", "software", "screen", "technology"
    ]
    alignment_detected = False
    if topic_label == "technology" and caption:
        caption_lower = caption.lower()
        alignment_detected = any(word in caption_lower for word in tech_words)

    # Consistency = fraction of applicable cross-modal checks that agree.
    consistency = 0
    checks = 0
    if text_label and audio_label:
        checks += 1
        if text_label == audio_label:
            consistency += 1
    if caption and topic_label:
        checks += 1
        if topic_label in caption.lower() or alignment_detected:
            consistency += 1
    consistency_score = round((consistency / checks) * 100, 2) if checks else 0

    if consistency_score > 70:
        consistency_text = "High consistency across modalities."
    elif consistency_score > 40:
        consistency_text = "Moderate alignment between modalities."
    else:
        consistency_text = "Low consistency detected across modalities."

    if alignment_detected:
        reasoning_lines.append(
            "<b style='color:#22c55e;'>π€ Technology Alignment:</b> "
            "Visual content contains technology-related objects matching the text topic."
        )

    # ================= DISPLAY SUMMARY =================
    if not reasoning_lines:
        reasoning_lines.append("No multimodal signals detected from the provided inputs.")
    processing_time = round(time.time() - start_time, 2)
    fusion_summary = f"""
<div style="padding:20px;border-radius:16px;
background:linear-gradient(135deg,#0f172a,#1e293b);
border:1px solid #1f2a44;">
<h2>π Multimodal Intelligence Summary</h2>
{"<br>".join(reasoning_lines)}
<hr>
<h3>π Fusion Score</h3>
<span style="color:{color}; font-size:34px; font-weight:800;">
{round(fusion_score,2)}
</span>
<hr>
<h3>π Multimodal Consistency</h3>
<span style="font-size:28px;font-weight:700;">
{consistency_score}%
</span>
<br>
{consistency_text}
<hr>
<h3>π§ Interpretation</h3>
{alignment_message}
<br>
β± Processing Time: {processing_time} sec
</div>
"""

    # ================= SAVE HISTORY =================
    # Persist a compact record of this run for the history view.
    save_analysis({
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "text": text,
        "image": image_label,
        "audio": audio_label,
        "transcription": transcription,
        "fusion_score": round(fusion_score, 2)
    })
    return fusion_summary, text_result_display, image_result_display, audio_result_display