File size: 10,661 Bytes
a9d4f37
5bcf32b
 
a9d4f37
5bcf32b
 
 
a9d4f37
 
 
 
 
 
5bcf32b
a9d4f37
 
 
327be00
a9d4f37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5bcf32b
a9d4f37
 
 
 
 
 
5bcf32b
a9d4f37
 
 
5bcf32b
a9d4f37
5bcf32b
a9d4f37
 
 
327be00
a9d4f37
 
327be00
a9d4f37
 
 
3c76e95
a9d4f37
3c76e95
 
 
 
 
 
 
 
 
 
 
 
a9d4f37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5bcf32b
a9d4f37
 
 
5bcf32b
 
 
 
 
a9d4f37
3c76e95
 
 
a9d4f37
3c76e95
 
 
 
5bcf32b
a9d4f37
5bcf32b
a9d4f37
3c76e95
a9d4f37
3c76e95
a9d4f37
 
5bcf32b
a9d4f37
5bcf32b
 
 
 
a9d4f37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5bcf32b
 
 
 
a9d4f37
 
5bcf32b
 
 
 
 
a9d4f37
 
 
3c76e95
5bcf32b
 
 
 
a9d4f37
 
 
5bcf32b
 
 
 
 
 
 
 
 
 
 
 
a9d4f37
5bcf32b
 
a9d4f37
 
5bcf32b
 
 
 
 
 
 
 
 
 
 
 
a9d4f37
 
 
 
 
 
 
5bcf32b
 
 
 
 
 
 
a9d4f37
5bcf32b
 
a9d4f37
5bcf32b
 
 
 
 
 
 
 
a9d4f37
5bcf32b
 
a9d4f37
 
 
5bcf32b
 
a9d4f37
5bcf32b
 
 
 
 
 
 
 
 
a9d4f37
 
 
 
 
5bcf32b
 
a9d4f37
5bcf32b
a9d4f37
5bcf32b
 
 
a9d4f37
5bcf32b
a9d4f37
5bcf32b
 
 
 
 
 
 
a9d4f37
 
 
5bcf32b
a9d4f37
5bcf32b
 
 
 
a9d4f37
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
"""Gradio app for Maritime Intelligence Classifier + Entity Extraction."""
import gradio as gr
from setfit import SetFitModel
from transformers import pipeline
from pathlib import Path
import os

# ============================================================
# MODEL PATHS
# ============================================================
# Each model path may be either a HuggingFace Hub repo id ("user/repo")
# or a local directory; the loading logic below prefers the Hub when the
# configured value looks like a repo id and no local path shadows it.
# Classification model (SetFit)
CLASSIFIER_PATH = os.getenv("CLASSIFIER_PATH", "gamaly/maritime-intelligence-classifier")
LOCAL_CLASSIFIER_PATH = "./maritime_classifier"

# NER model (BERT) - UPDATE THIS WITH YOUR HF REPO
NER_PATH = os.getenv("NER_PATH", "gamaly/bert-vessel-ner")  # ← Change to your repo!
LOCAL_NER_PATH = "./models/bert-vessel-ner"

# ============================================================
# LOAD MODELS
# ============================================================
print("="*60)
print("Loading models...")
print("="*60)

# Load Classification Model.
# Left as None on failure; predict_text() checks for None and reports an
# error message instead of crashing the app.
classifier = None
try:
    # A "/" in the path with no matching local file is treated as a Hub id.
    if "/" in CLASSIFIER_PATH and not Path(CLASSIFIER_PATH).exists():
        print(f"Loading classifier from HuggingFace: {CLASSIFIER_PATH}")
        classifier = SetFitModel.from_pretrained(CLASSIFIER_PATH)
    elif Path(LOCAL_CLASSIFIER_PATH).exists():
        print(f"Loading classifier from local: {LOCAL_CLASSIFIER_PATH}")
        classifier = SetFitModel.from_pretrained(LOCAL_CLASSIFIER_PATH)
    else:
        # Fall back to the Hub even when the id heuristic above did not match.
        print(f"Loading classifier from HuggingFace: {CLASSIFIER_PATH}")
        classifier = SetFitModel.from_pretrained(CLASSIFIER_PATH)
    print(f"βœ“ Classifier loaded")
except Exception as e:
    print(f"❌ Classifier failed to load: {e}")

# Load NER Model (same path-resolution strategy as the classifier above).
# Left as None on failure; extract_entities() returns empty results then.
ner_model = None
try:
    if "/" in NER_PATH and not Path(NER_PATH).exists():
        print(f"Loading NER from HuggingFace: {NER_PATH}")
        # aggregation_strategy="simple" merges word-piece tokens into
        # whole-entity spans with an aggregated score.
        ner_model = pipeline("ner", model=NER_PATH, aggregation_strategy="simple")
    elif Path(LOCAL_NER_PATH).exists():
        print(f"Loading NER from local: {LOCAL_NER_PATH}")
        ner_model = pipeline("ner", model=LOCAL_NER_PATH, aggregation_strategy="simple")
    else:
        print(f"Loading NER from HuggingFace: {NER_PATH}")
        ner_model = pipeline("ner", model=NER_PATH, aggregation_strategy="simple")
    print(f"βœ“ NER model loaded")
except Exception as e:
    print(f"❌ NER model failed to load: {e}")

print("="*60)
if classifier and ner_model:
    print("βœ… All models loaded successfully!")
else:
    print("⚠️  Some models failed to load. Check logs above.")
print("="*60)

# ============================================================
# HELPER FUNCTIONS
# ============================================================
def truncate_text(text, max_tokens=256, *, words_per_token=0.75):
    """Truncate *text* to approximately *max_tokens* tokens.

    Uses a word-count heuristic instead of a real tokenizer: on average
    ``words_per_token`` words correspond to one token for English text,
    so the cut point is approximate.

    Args:
        text: Input string; falsy values (None, "") are returned as-is.
        max_tokens: Approximate token budget for the output.
        words_per_token: Heuristic ratio of words per token (keyword-only;
            default 0.75 preserves the original behavior).

    Returns:
        The original text if it fits the budget, otherwise the leading
        words re-joined with a "... [truncated]" marker appended.
    """
    if not text:
        return text

    max_words = int(max_tokens * words_per_token)
    words = text.split()

    if len(words) <= max_words:
        return text

    return " ".join(words[:max_words]) + "... [truncated]"

def extract_entities(text, min_score=0.5):
    """Extract VESSEL and ORG entities from *text* via the global NER pipeline.

    Args:
        text: Article text to analyze; blank/None yields empty results.
        min_score: Minimum confidence for an entity to be kept
            (default 0.5 preserves the original hard-coded threshold).

    Returns:
        Tuple ``(vessels, orgs)`` of lists of ``{"text": str, "score": float}``
        dicts, deduplicated by entity text (last occurrence's score wins).
        Both lists are empty when the model is unavailable, the text is
        blank, or the pipeline raises.
    """
    if ner_model is None:
        return [], []

    if not text or not text.strip():
        return [], []

    try:
        raw_entities = ner_model(text)
    except Exception as e:
        # Best-effort boundary: a NER failure must not break classification.
        print(f"NER error: {e}")
        return [], []

    vessels = []
    orgs = []

    for ent in raw_entities:
        # Skip low-confidence predictions.
        if ent['score'] < min_score:
            continue

        # Clean up residual WordPiece markers the aggregator may leave behind.
        entity_text = ent['word'].strip().replace(" ##", "").replace("##", "")
        record = {"text": entity_text, "score": ent['score']}

        if ent['entity_group'] == 'VESSEL':
            vessels.append(record)
        elif ent['entity_group'] == 'ORG':
            orgs.append(record)

    # Deduplicate by surface text, keeping the last-seen record.
    vessels = list({v['text']: v for v in vessels}.values())
    orgs = list({o['text']: o for o in orgs}.values())

    return vessels, orgs

def predict_text(text):
    """Classify *text* as actionable maritime intelligence.

    Returns:
        Tuple of (label string, confidence percentage, status key), where
        status is one of "actionable", "not_actionable", "neutral", "error".
    """
    if classifier is None:
        return "Error: Classifier not loaded.", 0.0, "error"

    if not text or not text.strip():
        return "Please enter some text to classify.", 0.0, "neutral"

    try:
        # Rough token estimate from the word count; shorten long articles
        # so they fit the encoder's context window.
        estimated_tokens = int(len(text.split()) / 0.75)
        processed_text = (
            truncate_text(text, max_tokens=256) if estimated_tokens > 300 else text
        )

        prediction = classifier.predict([processed_text])[0]

        try:
            confidence = classifier.predict_proba([processed_text])[0][prediction] * 100
        except AttributeError:
            # Model exposes no predict_proba; report a fixed placeholder.
            confidence = 85.0

        if prediction == 1:
            return "YES (Actionable)", confidence, "actionable"
        return "NO (Not Actionable)", confidence, "not_actionable"
    except Exception as e:
        print(f"Classification error: {e}")
        return f"Error: {str(e)}", 0.0, "error"

def format_entities(vessels, orgs):
    """Render extracted vessel/org entity lists as a markdown bullet list."""
    if not (vessels or orgs):
        return "No entities detected."

    sections = []

    if vessels:
        lines = ["### 🚒 Vessels"]
        lines.extend(f"- **{item['text']}** ({item['score']:.0%})" for item in vessels)
        sections.append("\n".join(lines) + "\n\n")

    if orgs:
        lines = ["### 🏒 Organizations"]
        lines.extend(f"- **{item['text']}** ({item['score']:.0%})" for item in orgs)
        sections.append("\n".join(lines) + "\n")

    return "".join(sections)

def get_explanation(status):
    """Map a prediction status key to a user-facing explanation string.

    "neutral" and any unknown status map to the empty string.
    """
    if status == "actionable":
        return "βœ“ This text contains actionable vessel-specific evidence."
    if status == "not_actionable":
        return "βœ— This text does not contain actionable vessel-specific evidence."
    if status == "error":
        return "⚠️ An error occurred. Please check the model is properly loaded."
    return ""

# ============================================================
# GRADIO APP
# ============================================================
with gr.Blocks(title="Maritime Intelligence Classifier") as app:
    gr.Markdown(
        """
        # 🚒 Maritime Intelligence Classifier
        
        **Two-stage analysis:**
        1. **Classification** - Is this article actionable?
        2. **Entity Extraction** - What vessels and organizations are mentioned?
        """
    )
    
    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Article Text",
                placeholder="Paste or type the maritime news article text here...",
                lines=10,
                max_lines=20
            )
            
            submit_btn = gr.Button("Analyze", variant="primary", size="lg")
        
        with gr.Column(scale=1):
            # Classification results
            gr.Markdown("### πŸ“Š Classification")
            prediction_output = gr.Label(
                label="Prediction",
                value={"YES (Actionable)": 0.0, "NO (Not Actionable)": 0.0}
            )
            
            confidence_output = gr.Number(
                label="Confidence",
                value=0.0,
                precision=1
            )
            
            explanation_output = gr.Markdown()
            
            # Entity extraction results
            gr.Markdown("---")
            entities_output = gr.Markdown(
                label="Extracted Entities",
                value="### πŸ” Extracted Entities\nNo entities detected yet."
            )
    
    # Example texts
    gr.Markdown("### πŸ“ Example Texts")
    with gr.Row():
        example_yes = gr.Examples(
            examples=[
                ["The fishing vessel Marine 707 was involved in the disappearance of fisheries observer Samuel Abayateye in Ghanaian waters. The observer's decapitated body was found weeks later."],
                ["Authorities detained the Meng Xin 15 after discovering evidence of illegal saiko transshipment. Pacific Seafood Inc. was identified as the vessel operator."],
            ],
            inputs=text_input,
            label="Actionable Examples"
        )
        
        example_no = gr.Examples(
            examples=[
                ["A new maritime museum opened in the port city, showcasing historical ships and ocean exploration artifacts."],
                ["Marine scientists are studying the effects of ocean acidification on coral reefs in tropical waters."],
            ],
            inputs=text_input,
            label="Non-Actionable Examples"
        )
    
    # Main analysis function
    def analyze_text(text):
        # Classification
        label, confidence, status = predict_text(text)
        
        # Create label dict
        if status == "actionable":
            label_dict = {"YES (Actionable)": confidence / 100, "NO (Not Actionable)": (100 - confidence) / 100}
        elif status == "not_actionable":
            label_dict = {"YES (Actionable)": (100 - confidence) / 100, "NO (Not Actionable)": confidence / 100}
        else:
            label_dict = {"YES (Actionable)": 0.0, "NO (Not Actionable)": 0.0}
        
        explanation = get_explanation(status)
        
        # Entity extraction
        vessels, orgs = extract_entities(text)
        entities_md = "### πŸ” Extracted Entities\n" + format_entities(vessels, orgs)
        
        return label_dict, confidence, explanation, entities_md
    
    submit_btn.click(
        fn=analyze_text,
        inputs=text_input,
        outputs=[prediction_output, confidence_output, explanation_output, entities_output]
    )
    
    text_input.submit(
        fn=analyze_text,
        inputs=text_input,
        outputs=[prediction_output, confidence_output, explanation_output, entities_output]
    )
    
    gr.Markdown(
        """
        ---
        ### ℹ️ About
        
        **Classification**: SetFit model identifies actionable maritime intelligence.
        
        **Entity Extraction**: BERT-NER model extracts vessel names and organizations.
        
        Built for The Outlaw Ocean Project.
        """
    )

if __name__ == "__main__":
    # NOTE: `theme` is a gr.Blocks(...) constructor argument, not a
    # Blocks.launch() parameter — passing it to launch() raises a
    # TypeError at startup, so it must not appear here.
    app.launch(share=False)