# Source: model.py, uploaded by pathananas (revision 647f372, verified)
# ==============================
# MODEL + MODALITY PROCESSING
# ==============================
from transformers import BlipProcessor, BlipForConditionalGeneration
from database import save_analysis
from datetime import datetime
from transformers import pipeline
from fusion import compute_fusion
import torch
import time
import os
# This module is inference-only, so autograd is switched off process-wide.
torch.set_grad_enabled(False)

# Device index in the form the `pipeline(...)` factory accepts:
# 0 selects the first CUDA GPU, -1 selects the CPU.
device = 0 if torch.cuda.is_available() else -1

print("Loading AI models...")

# Sentiment classifier; its raw labels are LABEL_0 / LABEL_1 / LABEL_2.
text_pipeline = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    device=device,
    batch_size=8
)

# Emotion classifier applied to the same input text.
emotion_pipeline = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    device=device
)

# Zero-shot classifier used to pick a topic from a candidate list.
topic_pipeline = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device
)

# Image classifier.
image_pipeline = pipeline(
    "image-classification",
    model="google/vit-base-patch16-224",
    device=device,
    batch_size=8
)

print("Loading image caption model...")

# Stand-alone BLIP processor/model pair used for caption generation.
caption_processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
caption_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
# Keep the caption model on the same device as the pipelines. Without this
# the model stays on the CPU while its inputs are moved to CUDA, and
# generation fails with a device-mismatch error on GPU hosts.
caption_model.to("cuda" if device == 0 else "cpu")
caption_model.eval()

# NOTE(review): this pipeline loads the same BLIP checkpoint a second time and
# is not referenced elsewhere in the visible part of this file. It is kept
# because other modules may import it; confirm before removing.
caption_pipeline = pipeline(
    "image-text-to-text",
    model="Salesforce/blip-image-captioning-base",
    device=device
)

# Speech-to-text model.
audio_pipeline = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",
    device=device
)

# Keyphrase extractor; "simple" aggregation merges sub-word tokens into spans.
keyword_pipeline = pipeline(
    "token-classification",
    model="ml6team/keyphrase-extraction-kbir-inspec",
    aggregation_strategy="simple",
    device=device
)

print("Models loaded successfully.")

# Explicitly put every pipeline model into evaluation mode.
text_pipeline.model.eval()
image_pipeline.model.eval()
audio_pipeline.model.eval()
caption_pipeline.model.eval()
emotion_pipeline.model.eval()
topic_pipeline.model.eval()
keyword_pipeline.model.eval()
# ================= SCORE NORMALIZATION =================
def normalize_score(score):
    """Return *score* as a percentage rounded to two decimal places.

    Values less than or equal to 1 are treated as fractions and multiplied
    by 100; anything larger is taken to be a percentage already and is only
    rounded.
    """
    percent = score * 100 if score <= 1 else score
    return round(percent, 2)
# Maps the sentiment model's raw labels to readable sentiment names.
_SENTIMENT_LABELS = {
    "LABEL_0": "NEGATIVE",
    "LABEL_1": "NEUTRAL",
    "LABEL_2": "POSITIVE",
}

# Candidate labels handed to the zero-shot topic classifier.
_TOPICS = [
    "technology",
    "business",
    "education",
    "politics",
    "entertainment",
    "health",
    "science",
    "sports",
    "Lifestyle",
    "Personal Development",
    "History & Humanities",
    "Finance",
    "Case Studies",
    "News & Analysis",
    "Interviews",
]

# Caption words that count as evidence of technology-related imagery.
_TECH_WORDS = (
    "robot", "computer", "ai", "machine", "device",
    "tablet", "software", "screen", "technology",
)


def _signed_confidence(label, confidence):
    """Return +confidence for POSITIVE, -confidence for NEGATIVE, else 0."""
    if label == "POSITIVE":
        return confidence
    if label == "NEGATIVE":
        return -confidence
    return 0


def _caption_mentions(caption, words):
    """Return True if *caption* contains any of *words* as a whole word or its plural."""
    tokens = {tok.strip(".,;:!?\"'()") for tok in caption.lower().split()}
    return any(word in tokens or f"{word}s" in tokens for word in words)


def multimodal_analyze(text, image, audio):
    """Analyze optional text, image and audio inputs and fuse the results.

    Each modality is processed independently; a failure in one is reported
    in that modality's display string and does not abort the others.

    Args:
        text: Input text, or None / blank to skip text analysis.
        image: Image passed to the image classifier and caption model, or
            None to skip (presumably a PIL image; confirm with the caller).
        audio: Audio passed to the speech-recognition pipeline, or None to
            skip (presumably a file path or waveform; confirm with the caller).

    Returns:
        A 4-tuple ``(fusion_summary, text_result_display,
        image_result_display, audio_result_display)``. The first item is an
        HTML fragment; the other three are Markdown strings.

    Side effects:
        Calls ``save_analysis`` with a record of this run.
    """
    # Local import: needed only to escape free-form text in the HTML summary.
    import html

    start_time = time.time()

    # Initialise every per-modality result up front so later sections can
    # test them safely even when a modality was skipped or failed.
    text_label = None
    text_conf = 0
    emotion_label = None
    topic_label = None
    keywords = []
    image_label = None
    image_conf = 0
    caption = None
    audio_label = None
    audio_conf = 0
    transcription = None
    text_result_display = "No text provided."
    image_result_display = "No image provided."
    audio_result_display = "No audio provided."

    # ================= TEXT =================
    if text and text.strip():
        # Keyword extraction is best-effort: failure leaves the list empty.
        try:
            kw_results = keyword_pipeline(text)
            keywords = [k["word"] for k in kw_results[:5]]
        except Exception:
            keywords = []

        try:
            sentiment = text_pipeline(text)[0]
            emotion_res = emotion_pipeline(text)[0]
            topic_res = topic_pipeline(text, _TOPICS)
            topic_label = topic_res["labels"][0]
            topic_conf = round(topic_res["scores"][0] * 100, 2)
            emotion_label = emotion_res["label"]
            emotion_conf = round(emotion_res["score"] * 100, 2)
            text_label = _SENTIMENT_LABELS.get(sentiment["label"], sentiment["label"])
            text_conf = normalize_score(sentiment["score"])
            keyword_text = ", ".join(keywords) if keywords else "None detected"
            text_result_display = f"""
## 📝 Text Intelligence
Sentiment: **{text_label}**
Confidence: **{text_conf}%**
Emotion: **{emotion_label}**
Emotion Confidence: **{emotion_conf}%**
Topic: **{topic_label}**
Topic Confidence: **{topic_conf}%**
### 🔑 Key Concepts
{keyword_text}
"""
        except Exception as e:
            text_result_display = f"Text error: {str(e)}"

    # ================= IMAGE =================
    if image is not None:
        try:
            results = image_pipeline(image)
            inputs = caption_processor(image, return_tensors="pt")
            # Follow the caption model's actual device instead of assuming CUDA.
            inputs = {k: v.to(caption_model.device) for k, v in inputs.items()}
            out = caption_model.generate(**inputs)
            caption = caption_processor.decode(out[0], skip_special_tokens=True)
            image_result_display = "## 🖼 Image Classification\n\n"
            image_result_display += "### Objects Detected\n"
            for r in results[:3]:
                label = r["label"]
                conf = round(r["score"] * 100, 2)
                image_result_display += f"- **{label}** ({conf}%)\n"
            image_result_display += f"\n### Caption\n\"{caption}\""
            image_label = results[0]["label"]
            # Use the top image prediction's own score, not the text result.
            image_conf = normalize_score(results[0]["score"])
        except Exception as e:
            image_result_display = f"Image error: {str(e)}"

    # ================= AUDIO =================
    if audio is not None:
        try:
            asr_result = audio_pipeline(audio)
            transcription = asr_result["text"]
            # Tone is the sentiment of the transcribed words.
            sent = text_pipeline(transcription)[0]
            audio_label = _SENTIMENT_LABELS.get(sent["label"], sent["label"])
            audio_conf = normalize_score(sent["score"])
            audio_result_display = f"""
## 🎙 Audio Intelligence
Transcription:
"{transcription}"
Tone: **{audio_label}**
Confidence: **{audio_conf}%**
"""
        except Exception as e:
            audio_result_display = f"Audio error: {str(e)}"

    # ================= FUSION REASONING =================
    reasoning_lines = []
    if text_label:
        reasoning_lines.append(
            f"<b style='color:#60a5fa;'>📝 Text Analysis:</b> "
            f"The text expresses a {text_label.lower()} sentiment with emotion "
            f"'{emotion_label}'. The topic appears related to {topic_label}."
        )
    if keywords:
        # Keywords are taken from user text, so escape them for HTML.
        reasoning_lines.append(
            f"<b style='color:#38bdf8;'>🔑 Key Concepts:</b> "
            f"{html.escape(', '.join(keywords))}."
        )

    # ================= IMAGE REASONING =================
    if image_label:
        reasoning_lines.append(
            f"<b style='color:#22c55e;'>🖼 Image Analysis:</b> "
            f"Classifier predicted '{image_label}' ({image_conf}% confidence), "
            "though this may be an approximate category."
        )
    if caption:
        # The caption is generated free text, so escape it for HTML.
        reasoning_lines.append(
            f"<b style='color:#a78bfa;'>📷 Scene Description:</b> "
            f"'{html.escape(caption)}', providing a clearer interpretation of the image."
        )
    if audio_label:
        reasoning_lines.append(
            f"<b style='color:#f59e0b;'>🎙 Audio Tone:</b> "
            f"Spoken content carries a {audio_label.lower()} tone ({audio_conf}%)."
        )

    # ================= CONTRADICTION DETECTION =================
    if text_label and audio_label:
        if text_label == "POSITIVE" and audio_label == "NEGATIVE":
            reasoning_lines.append(
                "<b style='color:#ef4444;'>⚠️ Cross-Modal Conflict:</b> "
                " the text expresses positivity "
                "but the voice tone suggests negativity."
            )
        if text_label == "NEGATIVE" and audio_label == "POSITIVE":
            reasoning_lines.append(
                "<b style='color:#ef4444;'>⚠ Cross-Modal Conflict:</b> "
                "Text expresses negativity while the voice tone appears positive."
            )
    # NOTE(review): this line claims visual alignment whenever an image exists
    # and the topic is "technology", without inspecting the image. The checked
    # variant is appended further down when the caption contains tech words.
    if image_label and topic_label:
        if topic_label == "technology":
            reasoning_lines.append(
                "<b style='color:#ef4444;'>💻🤖 Technology:</b> "
                "The textual topic relates to technology, which aligns with the detected visual content."
            )

    # ================= FUSION SCORE =================
    text_weight = 0.4
    image_weight = 0.3
    audio_weight = 0.3
    fusion_score = 0
    # Text and audio are sentiment signals: positive adds, negative subtracts,
    # neutral contributes nothing. Image confidence carries no sign.
    if text_label:
        fusion_score += _signed_confidence(text_label, text_conf) * text_weight
    if image_label:
        fusion_score += image_conf * image_weight
    if audio_label:
        fusion_score += _signed_confidence(audio_label, audio_conf) * audio_weight

    # ================= INTERPRETATION =================
    if fusion_score >= 60:
        alignment_message = "Multimodal signals align toward a positive and confident contextual interpretation."
        color = "#22c55e"  # GREEN
    elif fusion_score >= 20:
        alignment_message = "Multimodal signals show moderate contextual alignment."
        color = "#f59e0b"  # ORANGE
    else:
        alignment_message = "Multimodal signals are weak, mixed, or uncertain."
        color = "#ef4444"  # RED

    # ================= CONSISTENCY SCORE =================
    # Whole-word matching avoids false hits such as "ai" inside "chair".
    alignment_detected = bool(
        caption
        and topic_label == "technology"
        and _caption_mentions(caption, _TECH_WORDS)
    )
    consistency = 0
    checks = 0
    if text_label and audio_label:
        checks += 1
        if text_label == audio_label:
            consistency += 1
    if caption and topic_label:
        checks += 1
        # Compare case-insensitively; several topic labels are capitalised.
        if topic_label.lower() in caption.lower() or alignment_detected:
            consistency += 1
    if checks > 0:
        consistency_score = round((consistency / checks) * 100, 2)
    else:
        consistency_score = 0

    # ================= SEMANTIC ALIGNMENT =================
    if consistency_score > 70:
        consistency_text = "High consistency across modalities."
    elif consistency_score > 40:
        consistency_text = "Moderate alignment between modalities."
    else:
        consistency_text = "Low consistency detected across modalities."
    if alignment_detected:
        reasoning_lines.append(
            "<b style='color:#22c55e;'>🤖 Technology Alignment:</b> "
            "Visual content contains technology-related objects matching the text topic."
        )

    # ================= DISPLAY SUMMARY =================
    if not reasoning_lines:
        reasoning_lines.append("No multimodal signals detected from the provided inputs.")
    processing_time = round(time.time() - start_time, 2)
    reasoning_html = "<br>".join(reasoning_lines)
    fusion_summary = f"""
<div style="padding:20px;border-radius:16px;
background:linear-gradient(135deg,#0f172a,#1e293b);
border:1px solid #1f2a44;">
<h2>🔎 Multimodal Intelligence Summary</h2>
{reasoning_html}
<hr>
<h3>📊 Fusion Score</h3>
<span style="color:{color}; font-size:34px; font-weight:800;">
{round(fusion_score,2)}
</span>
<hr>
<h3>🔗 Multimodal Consistency</h3>
<span style="font-size:28px;font-weight:700;">
{consistency_score}%
</span>
<br>
{consistency_text}
<hr>
<h3>🧠 Interpretation</h3>
{alignment_message}
<br>
⏱ Processing Time: {processing_time} sec
</div>
"""

    # ================= SAVE HISTORY =================
    save_analysis({
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "text": text,
        "image": image_label,
        "audio": audio_label,
        "transcription": transcription,
        "fusion_score": round(fusion_score, 2)
    })
    return fusion_summary, text_result_display, image_result_display, audio_result_display