Spaces:

pathananas
/

multimodal-ai-engine

Running

App Files Files Community

multimodal-ai-engine / model.py

pathananas

model.py

647f372 verified 1 day ago

raw

history blame contribute delete

12.2 kB

	# ==============================
	# MODEL + MODALITY PROCESSING
	# ==============================
	from transformers import BlipProcessor, BlipForConditionalGeneration
	from database import save_analysis
	from datetime import datetime
	from transformers import pipeline
	from fusion import compute_fusion
	import torch
	import time
	import os
	torch.set_grad_enabled(False)

	device = 0 if torch.cuda.is_available() else -1

	print("Loading AI models...")

	text_pipeline = pipeline(
	"sentiment-analysis",
	model="cardiffnlp/twitter-roberta-base-sentiment",
	device=device,
	batch_size=8
	)
	emotion_pipeline = pipeline(
	"text-classification",
	model="j-hartmann/emotion-english-distilroberta-base",
	device=device
	)
	topic_pipeline = pipeline(
	"zero-shot-classification",
	model="facebook/bart-large-mnli",
	device=device
	)

	image_pipeline = pipeline(
	"image-classification",
	model="google/vit-base-patch16-224",
	device=device,
	batch_size=8
	)
	print("Loading image caption model...")

	caption_processor = BlipProcessor.from_pretrained(
	"Salesforce/blip-image-captioning-base"
	)

	caption_model = BlipForConditionalGeneration.from_pretrained(
	"Salesforce/blip-image-captioning-base"
	)

	caption_model.eval()
	caption_pipeline = pipeline(
	"image-text-to-text",
	model="Salesforce/blip-image-captioning-base",
	device=device
	)

	audio_pipeline = pipeline(
	"automatic-speech-recognition",
	model="facebook/wav2vec2-base-960h",
	device=device
	)
	keyword_pipeline = pipeline(
	"token-classification",
	model="ml6team/keyphrase-extraction-kbir-inspec",
	aggregation_strategy="simple",
	device=device
	)
	print("Models loaded successfully.")
	text_pipeline.model.eval()
	image_pipeline.model.eval()
	audio_pipeline.model.eval()
	caption_pipeline.model.eval()
	emotion_pipeline.model.eval()
	topic_pipeline.model.eval()
	keyword_pipeline.model.eval()
	# ================= SCORE NORMALIZATION =================

	def normalize_score(score):
	if score <= 1:
	return round(score * 100, 2)
	return round(score, 2)


	def multimodal_analyze(text, image, audio):

	start_time = time.time()

	text_label = None
	text_conf = 0
	topics = [
	"technology",
	"business",
	"education",
	"politics",
	"entertainment",
	"health",
	"science",
	"sports",
	"Lifestyle",
	"Personal Development",
	"History & Humanities",
	"Finance",
	"Case Studies",
	"News & Analysis",
	"Interviews"
	]
	keywords = []
	try:
	kw_results = keyword_pipeline(text)
	keywords = [k["word"] for k in kw_results[:5]]
	except:
	keywords = []
	image_label = None
	image_conf = 0
	audio_label = None
	audio_conf = 0
	transcription = None

	text_result_display = "No text provided."
	image_result_display = "No image provided."
	audio_result_display = "No audio provided."

	# ================= TEXT =================
	if text and text.strip():
	try:
	res = text_pipeline(text)[0]
	emotion_res = emotion_pipeline(text)[0]
	topic_res = topic_pipeline(text, topics)
	topic_label = topic_res["labels"][0]
	topic_conf = round(topic_res["scores"][0] * 100, 2)
	emotion_label = emotion_res["label"]
	emotion_conf = round(emotion_res["score"] * 100, 2)

	label_map = {
	"LABEL_0": "NEGATIVE",
	"LABEL_1": "NEUTRAL",
	"LABEL_2": "POSITIVE"
	}

	text_label = label_map.get(res["label"], res["label"])
	text_conf = normalize_score(res["score"])
	text_result_display = f"""
	## 📝 Text Intelligence

	Sentiment: {text_label}
	Confidence: {text_conf}%

	Emotion: {emotion_label}
	Emotion Confidence: {emotion_conf}%

	Topic: {topic_label}
	Topic Confidence: {topic_conf}%

	### 🔑 Key Concepts
	{", ".join(keywords) if keywords else "None detected"}
	"""
	except Exception as e:
	text_result_display = f"Text error: {str(e)}"

	# ================= IMAGE =================
	if image is not None:
	try:
	results = image_pipeline(image)
	inputs = caption_processor(image, return_tensors="pt")
	if device == 0:
	inputs = {k: v.to("cuda") for k, v in inputs.items()}

	out = caption_model.generate(**inputs)
	caption = caption_processor.decode(out[0], skip_special_tokens=True)


	image_result_display = "## 🖼 Image Classification\n\n"
	image_result_display += "### Objects Detected\n"

	for r in results[:3]:
	label = r["label"]
	conf = round(r["score"] * 100, 2)
	image_result_display += f"- {label} ({conf}%)\n"

	image_result_display += f"\n### Caption\n\"{caption}\""

	image_label = results[0]["label"]
	image_conf = normalize_score(res["score"])

	except Exception as e:
	image_result_display = f"Image error: {str(e)}"

	# ================= AUDIO =================
	if audio is not None:
	try:
	res = audio_pipeline(audio)
	transcription = res["text"]

	sent = text_pipeline(transcription)[0]

	label_map = {
	"LABEL_0": "NEGATIVE",
	"LABEL_1": "NEUTRAL",
	"LABEL_2": "POSITIVE"
	}

	audio_label = label_map.get(sent["label"], sent["label"])
	audio_conf = round(sent["score"]*100,2)

	audio_result_display = f"""
	## 🎙 Audio Intelligence

	Transcription:
	"{transcription}"

	Tone: {audio_label}

	Confidence: {audio_conf}%
	"""

	except Exception as e:
	audio_result_display = f"Audio error: {str(e)}"

	# ================= FUSION REASONING =================

	caption = caption if 'caption' in locals() else None
	reasoning_lines = []# ================= CONTRADICTION DETECTION =================


	if text_label:
	reasoning_lines.append(
	f"<b style='color:#60a5fa;'>📝 Text Analysis:</b> "
	f"The text expresses a {text_label.lower()} sentiment with emotion "
	f"'{emotion_label}'. The topic appears related to {topic_label}."
	)
	if keywords:
	reasoning_lines.append(
	f"<b style='color:#38bdf8;'>🔑 Key Concepts:</b> "
	f"{', '.join(keywords)}."
	)
	# ================= IMAGE REASONING =================
	if image_label:
	reasoning_lines.append(
	f"<b style='color:#22c55e;'>🖼 Image Analysis:</b> "
	f"Classifier predicted '{image_label}' ({image_conf}% confidence), "
	"though this may be an approximate category."
	)
	if caption:
	reasoning_lines.append(
	f"<b style='color:#a78bfa;'>📷 Scene Description:</b> "
	f"'{caption}', providing a clearer interpretation of the image."
	)

	if audio_label:
	reasoning_lines.append(
	f"<b style='color:#f59e0b;'>🎙 Audio Tone:</b> "
	f"Spoken content carries a {audio_label.lower()} tone ({audio_conf}%)."
	)
	if text_label and audio_label:
	if text_label == "POSITIVE" and audio_label == "NEGATIVE":
	reasoning_lines.append(
	"<b style='color:#ef4444;'>⚠️ Cross-Modal Conflict:</b> "
	" the text expresses positivity "
	"but the voice tone suggests negativity."
	)
	if text_label == "NEGATIVE" and audio_label == "POSITIVE":
	reasoning_lines.append(
	"<b style='color:#ef4444;'>⚠ Cross-Modal Conflict:</b> "
	"Text expresses negativity while the voice tone appears positive."
	)
	if image_label and topic_label:
	if topic_label == "technology":
	reasoning_lines.append(
	"<b style='color:#ef4444;'>💻🤖 Technology:</b> "
	"The textual topic relates to technology, which aligns with the detected visual content."
	)




	# ================= FUSION SCORE =================

	# ================= CONFIDENCE CALIBRATION =================
	text_weight = 0.4
	image_weight = 0.3
	audio_weight = 0.3

	fusion_score = 0
	if text_label:
	if text_label == "POSITIVE":
	fusion_score += text_conf * text_weight
	elif text_label == "NEGATIVE":
	fusion_score -= text_conf * text_weight

	if image_label:
	fusion_score += image_conf * image_weight
	if audio_label:
	fusion_score += audio_conf * audio_weight

	# ================= INTERPRETATION =================
	# ================= INTERPRETATION =================
	if fusion_score >= 60:
	alignment_message = "Multimodal signals align toward a positive and confident contextual interpretation."
	color = "#22c55e" # GREEN
	elif fusion_score >= 20:
	alignment_message = "Multimodal signals show moderate contextual alignment."
	color = "#f59e0b" # ORANGE
	else:
	alignment_message = "Multimodal signals are weak, mixed, or uncertain."
	color = "#ef4444" # RED
	# ================= CONSISTENCY SCORE =================
	caption = caption if 'caption' in locals() else None
	tech_words = [
	"robot", "computer", "ai", "machine", "device",
	"tablet", "software", "screen", "technology"
	]
	alignment_detected = False
	if topic_label and caption:
	if topic_label == "technology":
	for word in tech_words:
	if word in caption.lower():
	alignment_detected = True
	break
	consistency = 0
	checks = 0

	if text_label and audio_label:
	checks += 1
	if text_label == audio_label:
	consistency += 1

	if caption and topic_label:
	checks += 1
	if topic_label in caption.lower() or alignment_detected:
	consistency += 1

	if checks > 0:
	consistency_score = round((consistency / checks) * 100, 2)
	else:
	consistency_score = 0

	# ================= SEMANTIC ALIGNMENT =================



	if consistency_score > 70:
	consistency_text = "High consistency across modalities."
	elif consistency_score > 40:
	consistency_text = "Moderate alignment between modalities."
	else:
	consistency_text = "Low consistency detected across modalities."

	if alignment_detected:
	reasoning_lines.append(
	"<b style='color:#22c55e;'>🤖 Technology Alignment:</b> "
	"Visual content contains technology-related objects matching the text topic."
	)

	# ================= DISPLAY SUMMARY =================
	if not reasoning_lines:
	reasoning_lines.append("No multimodal signals detected from the provided inputs.")

	processing_time = round(time.time() - start_time, 2)
	fusion_summary = f"""
	<div style="padding:20px;border-radius:16px;
	background:linear-gradient(135deg,#0f172a,#1e293b);
	border:1px solid #1f2a44;">

	<h2>🔎 Multimodal Intelligence Summary</h2>

	{"<br>".join(reasoning_lines)}

	<hr>

	<h3>📊 Fusion Score</h3>

	<span style="color:{color}; font-size:34px; font-weight:800;">
	{round(fusion_score,2)}
	</span>

	<hr>

	<h3>🔗 Multimodal Consistency</h3>

	<span style="font-size:28px;font-weight:700;">
	{consistency_score}%
	</span>

	<br>

	{consistency_text}
	<hr>
	<h3>🧠 Interpretation</h3>

	{alignment_message}

	<br>
	⏱ Processing Time: {processing_time} sec

	</div>
	"""

	# ================= SAVE HISTORY =================

	save_analysis({
	"timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
	"text": text,
	"image": image_label,
	"audio": audio_label,
	"transcription": transcription,
	"fusion_score": round(fusion_score, 2)
	})

	return fusion_summary, text_result_display, image_result_display, audio_result_display