Spaces:

gamaly
/

ArticleClassifier

Sleeping

App Files Files Community

ArticleClassifier / app.py

gamaly

Update app.py

a9d4f37 verified about 1 month ago

raw

history blame contribute delete

10.7 kB

	"""Gradio app for Maritime Intelligence Classifier + Entity Extraction."""
	import gradio as gr
	from setfit import SetFitModel
	from transformers import pipeline
	from pathlib import Path
	import os

	# ============================================================
	# MODEL PATHS
	# ============================================================
	# Each model has a configurable location (overridable through an environment
	# variable of the same name) and a fixed on-disk directory.

	# SetFit classification model.
	CLASSIFIER_PATH = os.environ.get("CLASSIFIER_PATH", "gamaly/maritime-intelligence-classifier")
	LOCAL_CLASSIFIER_PATH = "./maritime_classifier"

	# BERT NER model. The default below is a placeholder repo id; point NER_PATH
	# at your own repository when deploying elsewhere.
	NER_PATH = os.environ.get("NER_PATH", "gamaly/bert-vessel-ner")
	LOCAL_NER_PATH = "./models/bert-vessel-ner"

	# ============================================================
	# LOAD MODELS
	# ============================================================
	def _resolve_model_source(configured_path, local_fallback):
	    """Decide which location a model is loaded from.

	    Args:
	        configured_path: Hub repo id or filesystem path (the env-configurable value).
	        local_fallback: Fixed on-disk directory used instead of a configured
	            value that does not look like a Hub repo id, when that directory exists.

	    Returns:
	        A ``(source, origin)`` tuple: ``source`` is the string to hand to the
	        loader and ``origin`` is ``"local"`` or ``"HuggingFace"`` for log output.
	    """
	    configured_on_disk = Path(configured_path).exists()

	    # A value containing "/" that is not a path on disk is treated as a Hub repo id.
	    if "/" in configured_path and not configured_on_disk:
	        return configured_path, "HuggingFace"

	    if Path(local_fallback).exists():
	        return local_fallback, "local"

	    # No fallback directory: load the configured value and label it by where
	    # it actually lives, so the log does not claim "HuggingFace" for a local path.
	    return configured_path, ("local" if configured_on_disk else "HuggingFace")


	print("=" * 60)
	print("Loading models...")
	print("=" * 60)

	# Load Classification Model. A failure is logged, not raised, so the app can
	# still start; `classifier` stays None in that case.
	classifier = None
	try:
	    _source, _origin = _resolve_model_source(CLASSIFIER_PATH, LOCAL_CLASSIFIER_PATH)
	    print(f"Loading classifier from {_origin}: {_source}")
	    classifier = SetFitModel.from_pretrained(_source)
	    print("✓ Classifier loaded")
	except Exception as e:
	    print(f"❌ Classifier failed to load: {e}")

	# Load NER Model. Same best-effort policy; `ner_model` stays None on failure.
	ner_model = None
	try:
	    _source, _origin = _resolve_model_source(NER_PATH, LOCAL_NER_PATH)
	    print(f"Loading NER from {_origin}: {_source}")
	    ner_model = pipeline("ner", model=_source, aggregation_strategy="simple")
	    print("✓ NER model loaded")
	except Exception as e:
	    print(f"❌ NER model failed to load: {e}")

	print("=" * 60)
	# Compare against None explicitly so the summary does not depend on how the
	# model objects define truthiness.
	if classifier is not None and ner_model is not None:
	    print("✅ All models loaded successfully!")
	else:
	    print("⚠️ Some models failed to load. Check logs above.")
	print("=" * 60)

	# ============================================================
	# HELPER FUNCTIONS
	# ============================================================
	def truncate_text(text, max_tokens=256):
	    """Shorten ``text`` to roughly ``max_tokens`` tokens using a word count.

	    The token budget is converted to a word budget at 0.75 words per token.
	    Empty/None input and text within the budget are returned unchanged.
	    Longer text is cut to the first words of the budget, re-joined with
	    single spaces, and suffixed with "... [truncated]".
	    """
	    if not text:
	        return text

	    pieces = text.split()
	    word_budget = int(max_tokens * 0.75)

	    if len(pieces) > word_budget:
	        kept = pieces[:word_budget]
	        return " ".join(kept) + "... [truncated]"

	    return text

	def extract_entities(text, min_score=0.5):
	    """Extract VESSEL and ORG entities from text.

	    Args:
	        text: Article text to run through the module-level ``ner_model``.
	        min_score: Entities scoring below this value are discarded.

	    Returns:
	        A ``(vessels, orgs)`` tuple of lists of ``{"text", "score"}`` dicts.
	        Each distinct entity text appears once, in first-seen order, carrying
	        the highest score among its mentions. Both lists are empty when the
	        NER model is not loaded, the text is blank, or extraction fails.
	    """
	    if ner_model is None:
	        return [], []

	    if not text or not text.strip():
	        return [], []

	    try:
	        # One ordered {entity text -> record} map per entity type we report.
	        best_by_type = {"VESSEL": {}, "ORG": {}}

	        for entity in ner_model(text):
	            score = entity["score"]
	            if score < min_score:
	                continue

	            bucket = best_by_type.get(entity["entity_group"])
	            if bucket is None:
	                continue  # entity types other than VESSEL/ORG are not reported

	            # Remove WordPiece continuation markers first, then trim, so the
	            # cleanup cannot leave stray edge whitespace behind.
	            name = entity["word"].replace(" ##", "").replace("##", "").strip()
	            if not name:
	                continue  # nothing left after cleanup

	            seen = bucket.get(name)
	            # Re-assigning an existing key keeps its first-seen position.
	            if seen is None or score > seen["score"]:
	                bucket[name] = {"text": name, "score": score}

	        vessels = list(best_by_type["VESSEL"].values())
	        orgs = list(best_by_type["ORG"].values())
	        return vessels, orgs
	    except Exception as e:
	        # Best effort: an NER failure must not break classification output.
	        print(f"NER error: {e}")
	        return [], []

	def predict_text(text):
	    """Predict whether text is actionable.

	    Uses the module-level ``classifier``. Input whose estimated token count
	    (words / 0.75) exceeds 300 is first shortened with ``truncate_text`` to a
	    256-token budget.

	    Args:
	        text: Article text to classify.

	    Returns:
	        A ``(label, confidence, status)`` tuple. ``label`` is the readable
	        prediction or a message, ``confidence`` is a plain ``float``
	        percentage (0.0 when no prediction was made), and ``status`` is one
	        of "actionable", "not_actionable", "error" or "neutral".
	    """
	    if classifier is None:
	        return "Error: Classifier not loaded.", 0.0, "error"

	    if not text or not text.strip():
	        return "Please enter some text to classify.", 0.0, "neutral"

	    try:
	        # Rough token estimate from the word count.
	        token_estimate = int(len(text.split()) / 0.75)
	        if token_estimate > 300:
	            processed_text = truncate_text(text, max_tokens=256)
	        else:
	            processed_text = text

	        prediction = classifier.predict([processed_text])[0]

	        try:
	            probabilities = classifier.predict_proba([processed_text])[0]
	            # Coerce to a built-in float so callers and UI components never
	            # receive a library-specific scalar type.
	            confidence = float(probabilities[prediction]) * 100
	        except AttributeError:
	            # NOTE(review): fixed placeholder used when the model exposes no
	            # predict_proba; it is not a measured probability.
	            confidence = 85.0

	        if prediction == 1:
	            return "YES (Actionable)", confidence, "actionable"
	        return "NO (Not Actionable)", confidence, "not_actionable"
	    except Exception as e:
	        # Report the failure to the caller instead of raising into the UI.
	        print(f"Classification error: {e}")
	        return f"Error: {e}", 0.0, "error"

	def format_entities(vessels, orgs):
	    """Render extracted entities as a Markdown string.

	    Each non-empty group gets a heading followed by one bullet per entity
	    showing its text and score as a whole-number percentage. The vessels
	    section, when present, ends with a blank line. Returns a fixed notice
	    when both groups are empty.
	    """
	    if not (vessels or orgs):
	        return "No entities detected."

	    parts = []

	    if vessels:
	        parts.append("### 🚢 Vessels\n")
	        parts.extend(f"- {item['text']} ({item['score']:.0%})\n" for item in vessels)
	        parts.append("\n")

	    if orgs:
	        parts.append("### 🏢 Organizations\n")
	        parts.extend(f"- {item['text']} ({item['score']:.0%})\n" for item in orgs)

	    return "".join(parts)

	def get_explanation(status):
	    """Return the user-facing sentence for a prediction status.

	    Known statuses are "actionable", "not_actionable", "error" and "neutral";
	    any other status yields an empty string.
	    """
	    messages = dict(
	        actionable="✓ This text contains actionable vessel-specific evidence.",
	        not_actionable="✗ This text does not contain actionable vessel-specific evidence.",
	        error="⚠️ An error occurred. Please check the model is properly loaded.",
	        neutral="",
	    )
	    try:
	        return messages[status]
	    except KeyError:
	        return ""

	# ============================================================
	# GRADIO APP
	# ============================================================
	# Layout: article input on the left, classification and entity results on the
	# right, example texts underneath, then an "About" footer.
	with gr.Blocks(title="Maritime Intelligence Classifier") as app:
	    gr.Markdown(
	        """
	# 🚢 Maritime Intelligence Classifier

	Two-stage analysis:
	1. Classification - Is this article actionable?
	2. Entity Extraction - What vessels and organizations are mentioned?
	"""
	    )

	    with gr.Row():
	        with gr.Column(scale=2):
	            text_input = gr.Textbox(
	                label="Article Text",
	                placeholder="Paste or type the maritime news article text here...",
	                lines=10,
	                max_lines=20,
	            )

	            submit_btn = gr.Button("Analyze", variant="primary", size="lg")

	        with gr.Column(scale=1):
	            # Classification results
	            gr.Markdown("### 📊 Classification")
	            prediction_output = gr.Label(
	                label="Prediction",
	                value={"YES (Actionable)": 0.0, "NO (Not Actionable)": 0.0},
	            )

	            confidence_output = gr.Number(
	                label="Confidence",
	                value=0.0,
	                precision=1,
	            )

	            explanation_output = gr.Markdown()

	            # Entity extraction results
	            gr.Markdown("---")
	            entities_output = gr.Markdown(
	                label="Extracted Entities",
	                value="### 🔍 Extracted Entities\nNo entities detected yet.",
	            )

	    # Example texts
	    gr.Markdown("### 📝 Example Texts")
	    with gr.Row():
	        example_yes = gr.Examples(
	            examples=[
	                ["The fishing vessel Marine 707 was involved in the disappearance of fisheries observer Samuel Abayateye in Ghanaian waters. The observer's decapitated body was found weeks later."],
	                ["Authorities detained the Meng Xin 15 after discovering evidence of illegal saiko transshipment. Pacific Seafood Inc. was identified as the vessel operator."],
	            ],
	            inputs=text_input,
	            label="Actionable Examples",
	        )

	        example_no = gr.Examples(
	            examples=[
	                ["A new maritime museum opened in the port city, showcasing historical ships and ocean exploration artifacts."],
	                ["Marine scientists are studying the effects of ocean acidification on coral reefs in tropical waters."],
	            ],
	            inputs=text_input,
	            label="Non-Actionable Examples",
	        )

	    # Main analysis function
	    def analyze_text(text):
	        """Run classification and entity extraction for one article.

	        Returns the values for the four output components, in order: the
	        label-to-probability dict, the confidence percentage, the
	        explanation Markdown, and the entities Markdown.
	        """
	        # Classification
	        label, confidence, status = predict_text(text)

	        # Create label dict: the predicted class gets `confidence`, the other
	        # class the remainder; no prediction means both stay at zero.
	        if status == "actionable":
	            label_dict = {"YES (Actionable)": confidence / 100, "NO (Not Actionable)": (100 - confidence) / 100}
	        elif status == "not_actionable":
	            label_dict = {"YES (Actionable)": (100 - confidence) / 100, "NO (Not Actionable)": confidence / 100}
	        else:
	            label_dict = {"YES (Actionable)": 0.0, "NO (Not Actionable)": 0.0}

	        explanation = get_explanation(status)
	        if status not in ("actionable", "not_actionable"):
	            # No prediction was made, so `label` holds predict_text's message
	            # (input prompt or error detail). Show it rather than dropping it.
	            explanation = "\n\n".join(part for part in (explanation, label) if part)

	        # Entity extraction
	        vessels, orgs = extract_entities(text)
	        entities_md = "### 🔍 Extracted Entities\n" + format_entities(vessels, orgs)

	        return label_dict, confidence, explanation, entities_md

	    # The button click and pressing Enter in the textbox run the same analysis.
	    analysis_outputs = [prediction_output, confidence_output, explanation_output, entities_output]
	    for register in (submit_btn.click, text_input.submit):
	        register(
	            fn=analyze_text,
	            inputs=text_input,
	            outputs=analysis_outputs,
	        )

	    gr.Markdown(
	        """
	---
	### ℹ️ About

	Classification: SetFit model identifies actionable maritime intelligence.

	Entity Extraction: BERT-NER model extracts vessel names and organizations.

	Built for The Outlaw Ocean Project.
	"""
	    )

	if __name__ == "__main__":
	    # Script entry point: start the Gradio server with share=False and the Soft theme.
	    launch_options = {"share": False, "theme": gr.themes.Soft()}
	    app.launch(**launch_options)