gamaly committed on
Commit
a9d4f37
·
verified Β·
1 Parent(s): 7bfc262

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +167 -115
app.py CHANGED
@@ -1,161 +1,201 @@
1
- """Gradio app for Maritime Intelligence Classifier."""
2
  import gradio as gr
3
  from setfit import SetFitModel
 
4
  from pathlib import Path
5
  import os
6
 
7
- # Try to load model from Hugging Face Hub first, then fall back to local
8
- # Set MODEL_PATH environment variable or update this line with your Hugging Face repo ID
9
- MODEL_PATH = os.getenv("MODEL_PATH", "gamaly/maritime-intelligence-classifier")
10
- LOCAL_MODEL_PATH = "./maritime_classifier"
 
 
11
 
12
- # Load model
13
- print("Loading model...")
14
- print(f"MODEL_PATH: {MODEL_PATH}")
15
- print(f"LOCAL_MODEL_PATH: {LOCAL_MODEL_PATH}")
16
- model = None
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  try:
19
- # Check if MODEL_PATH is a Hugging Face repo (contains "/" and doesn't exist locally)
20
- if "/" in MODEL_PATH and not Path(MODEL_PATH).exists():
21
- print(f"Loading from Hugging Face Hub: {MODEL_PATH}")
22
- model = SetFitModel.from_pretrained(MODEL_PATH)
23
- print(f"βœ“ Successfully loaded model from Hugging Face: {MODEL_PATH}")
24
- # Check if local model path exists
25
- elif Path(LOCAL_MODEL_PATH).exists():
26
- print(f"Loading from local path: {LOCAL_MODEL_PATH}")
27
- model = SetFitModel.from_pretrained(LOCAL_MODEL_PATH)
28
- print(f"βœ“ Successfully loaded model from local path: {LOCAL_MODEL_PATH}")
29
- # If MODEL_PATH is a local path that exists
30
- elif Path(MODEL_PATH).exists():
31
- print(f"Loading from local path: {MODEL_PATH}")
32
- model = SetFitModel.from_pretrained(MODEL_PATH)
33
- print(f"βœ“ Successfully loaded model from local path: {MODEL_PATH}")
34
- # Default: try MODEL_PATH as Hugging Face repo
35
  else:
36
- print(f"Attempting to load from Hugging Face Hub: {MODEL_PATH}")
37
- model = SetFitModel.from_pretrained(MODEL_PATH)
38
- print(f"βœ“ Successfully loaded model from Hugging Face: {MODEL_PATH}")
39
  except Exception as e:
40
- print(f"❌ Error loading model: {e}")
41
- print(f" Attempted paths:")
42
- print(f" - Hugging Face: {MODEL_PATH}")
43
- print(f" - Local: {LOCAL_MODEL_PATH}")
44
- import traceback
45
- print("\nFull traceback:")
46
- traceback.print_exc()
47
- model = None
48
 
49
- if model is None:
50
- print("\n⚠️ WARNING: Model failed to load. The app will not work correctly.")
51
- print(" Please check:")
52
- print(f" 1. Model exists at: https://huggingface.co/{MODEL_PATH}")
53
- print(" 2. Internet connection is available")
54
- print(" 3. All dependencies are installed (setfit, sentence-transformers, etc.)")
55
  else:
56
- print("\nβœ… Model loaded successfully! Ready for inference.")
 
57
 
 
 
 
58
  def truncate_text(text, max_tokens=256):
59
- """
60
- Truncate text to approximately max_tokens.
61
- Uses a simple word-based approximation (roughly 1 token = 0.75 words).
62
- """
63
  if not text:
64
  return text
65
 
66
- # Rough approximation: 1 token β‰ˆ 0.75 words (conservative estimate)
67
  max_words = int(max_tokens * 0.75)
68
  words = text.split()
69
 
70
  if len(words) <= max_words:
71
  return text
72
 
73
- # Truncate and add ellipsis
74
  truncated = " ".join(words[:max_words])
75
  return truncated + "... [truncated]"
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  def predict_text(text):
78
- """Predict whether text is actionable (YES) or not (NO)."""
79
- if model is None:
80
- return "Error: Model not loaded. Please check the console logs.", 0.0, "error"
81
 
82
  if not text or not text.strip():
83
  return "Please enter some text to classify.", 0.0, "neutral"
84
 
85
  try:
86
- # Note: SetFit uses the base model's max_length (256 tokens for all-MiniLM-L6-v2)
87
- # The model will automatically truncate longer texts, but we can pre-truncate
88
- # to ensure we're using the most relevant part (beginning of text)
89
- # For longer articles, the beginning usually contains the most important info
90
-
91
- # Check approximate length (rough estimate: 1 token β‰ˆ 0.75 words)
92
  word_count = len(text.split())
93
  token_estimate = int(word_count / 0.75)
94
 
95
- # If text is significantly longer than 256 tokens, truncate intelligently
96
- # (SetFit will truncate anyway, but we can control which part)
97
- if token_estimate > 300: # Give some buffer
98
- # For news articles, the beginning usually has the key info
99
- # But we could also try: beginning + end, or just beginning
100
  processed_text = truncate_text(text, max_tokens=256)
101
- print(f"⚠️ Text truncated from ~{token_estimate} tokens to ~256 tokens")
102
  else:
103
  processed_text = text
104
 
105
  # Make prediction
106
- prediction = model.predict([processed_text])[0]
107
 
108
- # Get probabilities (handle version compatibility)
109
  try:
110
- probabilities = model.predict_proba([processed_text])[0]
111
  confidence = probabilities[prediction] * 100
112
- except AttributeError as e:
113
- # Fallback if predict_proba fails due to version mismatch
114
- # Use a simple confidence estimate based on prediction
115
- print(f"Warning: predict_proba failed ({e}), using fallback confidence")
116
- # For binary classification, we can estimate confidence from the decision function
117
- # or just use a default high confidence
118
- confidence = 85.0 # Default confidence when we can't get probabilities
119
-
120
- # Convert to labels
121
- label = "YES (Actionable)" if prediction == 1 else "NO (Not Actionable)"
122
 
123
- # Determine status for styling
124
  status = "actionable" if prediction == 1 else "not_actionable"
125
 
126
  return label, confidence, status
127
  except Exception as e:
128
- error_msg = f"Error during prediction: {str(e)}"
129
- print(error_msg)
130
- import traceback
131
- traceback.print_exc()
132
- return error_msg, 0.0, "error"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
  def get_explanation(status):
135
  """Get explanation based on prediction status."""
136
  explanations = {
137
- "actionable": "βœ“ This text contains actionable vessel-specific evidence (e.g., specific vessel names, crimes, incidents).",
138
- "not_actionable": "βœ— This text does not contain actionable vessel-specific evidence (e.g., general maritime news, non-specific information).",
139
  "error": "⚠️ An error occurred. Please check the model is properly loaded.",
140
  "neutral": ""
141
  }
142
  return explanations.get(status, "")
143
 
144
- # Create Gradio interface
145
- # Note: theme parameter moved to launch() in Gradio 6.0+
 
146
  with gr.Blocks(title="Maritime Intelligence Classifier") as app:
147
  gr.Markdown(
148
  """
149
  # 🚒 Maritime Intelligence Classifier
150
 
151
- Classify maritime news articles as containing **actionable vessel-specific evidence** (YES) or not (NO).
152
-
153
- **Actionable articles** typically include:
154
- - Specific vessel names
155
- - Specific crimes or incidents
156
- - Evidence that can be used for investigation
157
-
158
- **Non-actionable articles** are general maritime news without specific vessel details.
159
  """
160
  )
161
 
@@ -168,9 +208,11 @@ with gr.Blocks(title="Maritime Intelligence Classifier") as app:
168
  max_lines=20
169
  )
170
 
171
- submit_btn = gr.Button("Classify", variant="primary", size="lg")
172
 
173
  with gr.Column(scale=1):
 
 
174
  prediction_output = gr.Label(
175
  label="Prediction",
176
  value={"YES (Actionable)": 0.0, "NO (Not Actionable)": 0.0}
@@ -183,6 +225,13 @@ with gr.Blocks(title="Maritime Intelligence Classifier") as app:
183
  )
184
 
185
  explanation_output = gr.Markdown()
 
 
 
 
 
 
 
186
 
187
  # Example texts
188
  gr.Markdown("### πŸ“ Example Texts")
@@ -190,10 +239,10 @@ with gr.Blocks(title="Maritime Intelligence Classifier") as app:
190
  example_yes = gr.Examples(
191
  examples=[
192
  ["The fishing vessel Marine 707 was involved in the disappearance of fisheries observer Samuel Abayateye in Ghanaian waters. The observer's decapitated body was found weeks later."],
193
- ["Authorities detained the Meng Xin 15 after discovering evidence of illegal saiko transshipment and threats against fisheries observers."],
194
  ],
195
  inputs=text_input,
196
- label="YES Examples (Actionable)"
197
  )
198
 
199
  example_no = gr.Examples(
@@ -202,14 +251,15 @@ with gr.Blocks(title="Maritime Intelligence Classifier") as app:
202
  ["Marine scientists are studying the effects of ocean acidification on coral reefs in tropical waters."],
203
  ],
204
  inputs=text_input,
205
- label="NO Examples (Not Actionable)"
206
  )
207
 
208
- # Connect the prediction function
209
- def update_prediction(text):
 
210
  label, confidence, status = predict_text(text)
211
 
212
- # Create label dict for gradio Label component
213
  if status == "actionable":
214
  label_dict = {"YES (Actionable)": confidence / 100, "NO (Not Actionable)": (100 - confidence) / 100}
215
  elif status == "not_actionable":
@@ -219,18 +269,22 @@ with gr.Blocks(title="Maritime Intelligence Classifier") as app:
219
 
220
  explanation = get_explanation(status)
221
 
222
- return label_dict, confidence, explanation
 
 
 
 
223
 
224
  submit_btn.click(
225
- fn=update_prediction,
226
  inputs=text_input,
227
- outputs=[prediction_output, confidence_output, explanation_output]
228
  )
229
 
230
  text_input.submit(
231
- fn=update_prediction,
232
  inputs=text_input,
233
- outputs=[prediction_output, confidence_output, explanation_output]
234
  )
235
 
236
  gr.Markdown(
@@ -238,15 +292,13 @@ with gr.Blocks(title="Maritime Intelligence Classifier") as app:
238
  ---
239
  ### ℹ️ About
240
 
241
- This classifier uses SetFit to identify maritime news articles containing actionable vessel-specific evidence.
242
- Built for The Outlaw Ocean Project.
 
243
 
244
- **Model**: SetFit (sentence-transformers/all-MiniLM-L6-v2 base)
245
  """
246
  )
247
 
248
  if __name__ == "__main__":
249
- app.launch(share=False, theme=gr.themes.Soft())
250
-
251
-
252
-
 
1
+ """Gradio app for Maritime Intelligence Classifier + Entity Extraction."""
2
  import gradio as gr
3
  from setfit import SetFitModel
4
+ from transformers import pipeline
5
  from pathlib import Path
6
  import os
7
 
8
+ # ============================================================
9
+ # MODEL PATHS
10
+ # ============================================================
11
+ # Classification model (SetFit)
12
+ CLASSIFIER_PATH = os.getenv("CLASSIFIER_PATH", "gamaly/maritime-intelligence-classifier")
13
+ LOCAL_CLASSIFIER_PATH = "./maritime_classifier"
14
 
15
+ # NER model (BERT) - UPDATE THIS WITH YOUR HF REPO
16
+ NER_PATH = os.getenv("NER_PATH", "gamaly/bert-vessel-ner") # ← Change to your repo!
17
+ LOCAL_NER_PATH = "./models/bert-vessel-ner"
 
 
18
 
19
+ # ============================================================
20
+ # LOAD MODELS
21
+ # ============================================================
22
+ print("="*60)
23
+ print("Loading models...")
24
+ print("="*60)
25
+
26
+ # Load Classification Model
27
+ classifier = None
28
+ try:
29
+ if "/" in CLASSIFIER_PATH and not Path(CLASSIFIER_PATH).exists():
30
+ print(f"Loading classifier from HuggingFace: {CLASSIFIER_PATH}")
31
+ classifier = SetFitModel.from_pretrained(CLASSIFIER_PATH)
32
+ elif Path(LOCAL_CLASSIFIER_PATH).exists():
33
+ print(f"Loading classifier from local: {LOCAL_CLASSIFIER_PATH}")
34
+ classifier = SetFitModel.from_pretrained(LOCAL_CLASSIFIER_PATH)
35
+ else:
36
+ print(f"Loading classifier from HuggingFace: {CLASSIFIER_PATH}")
37
+ classifier = SetFitModel.from_pretrained(CLASSIFIER_PATH)
38
+ print(f"βœ“ Classifier loaded")
39
+ except Exception as e:
40
+ print(f"❌ Classifier failed to load: {e}")
41
+
42
+ # Load NER Model
43
+ ner_model = None
44
  try:
45
+ if "/" in NER_PATH and not Path(NER_PATH).exists():
46
+ print(f"Loading NER from HuggingFace: {NER_PATH}")
47
+ ner_model = pipeline("ner", model=NER_PATH, aggregation_strategy="simple")
48
+ elif Path(LOCAL_NER_PATH).exists():
49
+ print(f"Loading NER from local: {LOCAL_NER_PATH}")
50
+ ner_model = pipeline("ner", model=LOCAL_NER_PATH, aggregation_strategy="simple")
 
 
 
 
 
 
 
 
 
 
51
  else:
52
+ print(f"Loading NER from HuggingFace: {NER_PATH}")
53
+ ner_model = pipeline("ner", model=NER_PATH, aggregation_strategy="simple")
54
+ print(f"βœ“ NER model loaded")
55
  except Exception as e:
56
+ print(f"❌ NER model failed to load: {e}")
 
 
 
 
 
 
 
57
 
58
+ print("="*60)
59
+ if classifier and ner_model:
60
+ print("βœ… All models loaded successfully!")
 
 
 
61
  else:
62
+ print("⚠️ Some models failed to load. Check logs above.")
63
+ print("="*60)
64
 
65
+ # ============================================================
66
+ # HELPER FUNCTIONS
67
+ # ============================================================
68
def truncate_text(text, max_tokens=256):
    """Clip *text* to roughly *max_tokens* tokens.

    Uses a word-count heuristic (~0.75 words per token); falsy input is
    returned unchanged, and clipped text gets a "... [truncated]" suffix.
    """
    if not text:
        return text

    word_budget = int(max_tokens * 0.75)
    tokens = text.split()

    if len(tokens) <= word_budget:
        return text

    return " ".join(tokens[:word_budget]) + "... [truncated]"
81
 
82
def extract_entities(text):
    """Extract VESSEL and ORG entities from *text* via the NER pipeline.

    Returns:
        (vessels, orgs): two lists of dicts {"text": str, "score": float}.
        Entities scoring below 0.5 are dropped. Duplicate entity names are
        collapsed, keeping the highest-scoring mention (the original kept
        whichever mention happened to come last). Both lists are empty when
        the NER model is unavailable, the input is blank, or inference fails.
    """
    if ner_model is None:
        return [], []

    if not text or not text.strip():
        return [], []

    try:
        # Best mention per entity name, bucketed by entity type.
        buckets = {"VESSEL": {}, "ORG": {}}

        for ent in ner_model(text):
            score = ent["score"]
            # Skip low confidence predictions
            if score < 0.5:
                continue

            # Clean up WordPiece artifacts that survive aggregation
            name = ent["word"].strip().replace(" ##", "").replace("##", "")

            group = ent["entity_group"]
            if group not in buckets:
                continue

            best = buckets[group].get(name)
            # Deduplicate: keep the highest-scoring mention of each name.
            if best is None or score > best["score"]:
                buckets[group][name] = {"text": name, "score": score}

        return list(buckets["VESSEL"].values()), list(buckets["ORG"].values())
    except Exception as e:
        print(f"NER error: {e}")
        return [], []
121
+
122
def predict_text(text):
    """Classify *text* as actionable (YES) or not actionable (NO).

    Note: despite the app's two-stage design, this function only classifies;
    entity extraction is handled separately by extract_entities().

    Returns:
        (label, confidence, status): a display label, a confidence percentage
        in [0, 100], and a status string ("actionable", "not_actionable",
        "error", or "neutral") used for downstream styling.
    """
    if classifier is None:
        return "Error: Classifier not loaded.", 0.0, "error"

    if not text or not text.strip():
        return "Please enter some text to classify.", 0.0, "neutral"

    try:
        # Pre-truncate long inputs so the most informative part (the start of
        # the article) is what the encoder sees. The 300 threshold leaves a
        # little headroom over the 256-token target before truncating.
        word_count = len(text.split())
        token_estimate = int(word_count / 0.75)

        if token_estimate > 300:
            processed_text = truncate_text(text, max_tokens=256)
        else:
            processed_text = text

        # Make prediction
        prediction = classifier.predict([processed_text])[0]

        # predict_proba can be missing across setfit versions; fall back to a
        # fixed confidence rather than failing the whole request.
        try:
            probabilities = classifier.predict_proba([processed_text])[0]
            confidence = probabilities[prediction] * 100
        except AttributeError:
            confidence = 85.0

        label = "YES (Actionable)" if prediction == 1 else "NO (Not Actionable)"
        status = "actionable" if prediction == 1 else "not_actionable"

        return label, confidence, status
    except Exception as e:
        print(f"Classification error: {e}")
        return f"Error: {str(e)}", 0.0, "error"
157
+
158
def format_entities(vessels, orgs):
    """Render the extracted vessel/org entity lists as a markdown summary.

    Each entry is a dict with "text" and "score" keys; scores are shown as
    whole percentages. Returns a placeholder string when both lists are empty.
    """
    if not (vessels or orgs):
        return "No entities detected."

    sections = []

    if vessels:
        rows = ["### 🚒 Vessels"]
        rows.extend(f"- **{item['text']}** ({item['score']:.0%})" for item in vessels)
        sections.append("\n".join(rows) + "\n\n")

    if orgs:
        rows = ["### 🏒 Organizations"]
        rows.extend(f"- **{item['text']}** ({item['score']:.0%})" for item in orgs)
        sections.append("\n".join(rows) + "\n")

    return "".join(sections)
177
 
178
def get_explanation(status):
    """Return the markdown explanation matching a prediction *status*.

    Unknown statuses (including "neutral") map to an empty string.
    """
    if status == "actionable":
        return "βœ“ This text contains actionable vessel-specific evidence."
    if status == "not_actionable":
        return "βœ— This text does not contain actionable vessel-specific evidence."
    if status == "error":
        return "⚠️ An error occurred. Please check the model is properly loaded."
    return ""
187
 
188
+ # ============================================================
189
+ # GRADIO APP
190
+ # ============================================================
191
  with gr.Blocks(title="Maritime Intelligence Classifier") as app:
192
  gr.Markdown(
193
  """
194
  # 🚒 Maritime Intelligence Classifier
195
 
196
+ **Two-stage analysis:**
197
+ 1. **Classification** - Is this article actionable?
198
+ 2. **Entity Extraction** - What vessels and organizations are mentioned?
 
 
 
 
 
199
  """
200
  )
201
 
 
208
  max_lines=20
209
  )
210
 
211
+ submit_btn = gr.Button("Analyze", variant="primary", size="lg")
212
 
213
  with gr.Column(scale=1):
214
+ # Classification results
215
+ gr.Markdown("### πŸ“Š Classification")
216
  prediction_output = gr.Label(
217
  label="Prediction",
218
  value={"YES (Actionable)": 0.0, "NO (Not Actionable)": 0.0}
 
225
  )
226
 
227
  explanation_output = gr.Markdown()
228
+
229
+ # Entity extraction results
230
+ gr.Markdown("---")
231
+ entities_output = gr.Markdown(
232
+ label="Extracted Entities",
233
+ value="### πŸ” Extracted Entities\nNo entities detected yet."
234
+ )
235
 
236
  # Example texts
237
  gr.Markdown("### πŸ“ Example Texts")
 
239
  example_yes = gr.Examples(
240
  examples=[
241
  ["The fishing vessel Marine 707 was involved in the disappearance of fisheries observer Samuel Abayateye in Ghanaian waters. The observer's decapitated body was found weeks later."],
242
+ ["Authorities detained the Meng Xin 15 after discovering evidence of illegal saiko transshipment. Pacific Seafood Inc. was identified as the vessel operator."],
243
  ],
244
  inputs=text_input,
245
+ label="Actionable Examples"
246
  )
247
 
248
  example_no = gr.Examples(
 
251
  ["Marine scientists are studying the effects of ocean acidification on coral reefs in tropical waters."],
252
  ],
253
  inputs=text_input,
254
+ label="Non-Actionable Examples"
255
  )
256
 
257
+ # Main analysis function
258
+ def analyze_text(text):
259
+ # Classification
260
  label, confidence, status = predict_text(text)
261
 
262
+ # Create label dict
263
  if status == "actionable":
264
  label_dict = {"YES (Actionable)": confidence / 100, "NO (Not Actionable)": (100 - confidence) / 100}
265
  elif status == "not_actionable":
 
269
 
270
  explanation = get_explanation(status)
271
 
272
+ # Entity extraction
273
+ vessels, orgs = extract_entities(text)
274
+ entities_md = "### πŸ” Extracted Entities\n" + format_entities(vessels, orgs)
275
+
276
+ return label_dict, confidence, explanation, entities_md
277
 
278
  submit_btn.click(
279
+ fn=analyze_text,
280
  inputs=text_input,
281
+ outputs=[prediction_output, confidence_output, explanation_output, entities_output]
282
  )
283
 
284
  text_input.submit(
285
+ fn=analyze_text,
286
  inputs=text_input,
287
+ outputs=[prediction_output, confidence_output, explanation_output, entities_output]
288
  )
289
 
290
  gr.Markdown(
 
292
  ---
293
  ### ℹ️ About
294
 
295
+ **Classification**: SetFit model identifies actionable maritime intelligence.
296
+
297
+ **Entity Extraction**: BERT-NER model extracts vessel names and organizations.
298
 
299
+ Built for The Outlaw Ocean Project.
300
  """
301
  )
302
 
303
  if __name__ == "__main__":
304
+ app.launch(share=False, theme=gr.themes.Soft())