Spaces:

BILALfym
/

SkimLit_NLP

Sleeping

File size: 7,957 Bytes

import html
import re
from pathlib import Path

import numpy as np
import streamlit as st
import tensorflow as tf

# Browser-tab title/icon and overall page layout.
_PAGE_SETTINGS = {
    "page_title": "SkimLit - Abstract Classifier",
    "page_icon": "📄",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)

# Stylesheet injected into the page: a shared look for section bodies plus
# one left-border colour per abstract section (class name = section name,
# lower-cased).
_SECTION_STYLES = """
<style>
    .section-title {
        font-size: 1.5em;
        font-weight: bold;
        margin-top: 1.5em;
        margin-bottom: 0.5em;
    }
    .section-content {
        padding: 1em;
        border-left: 4px solid #ccc;
        margin-bottom: 1em;
        line-height: 1.6;
    }
    .background { border-left-color: #FFB347; }
    .objective { border-left-color: #87CEEB; }
    .methods { border-left-color: #90EE90; }
    .results { border-left-color: #FFD700; }
    .conclusions { border-left-color: #DDA0DD; }
</style>
"""
st.markdown(_SECTION_STYLES, unsafe_allow_html=True)

@st.cache_resource
def _load_model_and_encoder_cached():
    """Load the Keras classifier and the sentence encoder, raising on failure.

    This is the cached half of ``load_model_and_encoder``. It deliberately
    lets exceptions propagate: a raised exception is not stored by
    ``st.cache_resource``, so a transient failure (e.g. a network error while
    downloading) is retried on the next script run instead of being cached.

    Returns:
        A ``(model, encoder)`` tuple.
    """
    from sentence_transformers import SentenceTransformer
    import urllib.request
    import os

    script_dir = Path(__file__).parent
    model_path = script_dir / 'model_5.keras'

    # Load sentence encoder
    encoder = SentenceTransformer("all-MiniLM-L6-v2")

    # Use the local copy of the model when present, otherwise download it.
    if not model_path.exists():
        st.info("Downloading model... (first time only)")
        # Download from HF Hub
        model_url = "https://huggingface.co/BILALfym/skimlit-model/resolve/main/model_5.keras"
        # Download next to the target and rename only once complete, so an
        # interrupted transfer never leaves a truncated file at model_path
        # (which would pass the exists() check above on every later run).
        partial_path = model_path.with_name(model_path.name + '.part')
        try:
            urllib.request.urlretrieve(model_url, str(partial_path))
            os.replace(partial_path, model_path)
        finally:
            # After a successful rename the partial file is gone; after a
            # failure, remove whatever was written.
            if partial_path.exists():
                partial_path.unlink()

    model = tf.keras.models.load_model(str(model_path))
    return model, encoder


def load_model_and_encoder():
    """Load the trained model and sentence encoder.

    Returns:
        ``(model, encoder)`` on success. On any failure an error message is
        shown in the app and ``(None, None)`` is returned; the failure itself
        is not cached, so the next script run tries again.
    """
    try:
        return _load_model_and_encoder_cached()
    except Exception as e:
        st.error(f"Error loading: {e}")
        return None, None


# Keep the cache-clearing handle available under the public name.
load_model_and_encoder.clear = _load_model_and_encoder_cached.clear

def encode_line_number(line_number, max_value=15):
    """Encode a zero-based line index as a one-hot vector.

    Args:
        line_number: Index of the slot to set.
        max_value: Length of the returned vector.

    Returns:
        A NumPy array of ``max_value`` zeros with a single 1 at
        ``line_number``. When ``line_number`` lies outside
        ``[0, max_value)`` the vector is all zeros.
    """
    vec = np.zeros(max_value)
    # The lower bound matters: a negative index would otherwise wrap around
    # and set a slot counted from the end of the vector.
    if 0 <= line_number < max_value:
        vec[line_number] = 1
    return vec

def encode_total_lines(total_lines, max_value=20):
    """Encode a total-line count as a one-hot vector.

    Args:
        total_lines: Index of the slot to set.
        max_value: Length of the returned vector.

    Returns:
        A NumPy array of ``max_value`` zeros with a single 1 at
        ``total_lines``. When ``total_lines`` lies outside
        ``[0, max_value)`` the vector is all zeros.
    """
    vec = np.zeros(max_value)
    # The lower bound matters: a negative index would otherwise wrap around
    # and set a slot counted from the end of the vector.
    if 0 <= total_lines < max_value:
        vec[total_lines] = 1
    return vec

def predict_labels(sentences, model, encoder):
    """Predict a section label for every sentence of an abstract.

    All sentences are embedded in one ``encoder.encode`` call and classified
    in one ``model.predict`` call (one row per sentence) rather than invoking
    ``predict`` once per sentence.

    Args:
        sentences: Sequence of sentence strings, in abstract order.
        model: Model exposing ``predict`` and accepting the inputs
            ``token_inputs``, ``char_inputs``, ``line_number_inputs`` and
            ``total_lines_inputs``.
        encoder: Sentence encoder exposing ``encode``.

    Returns:
        A list with one dict per sentence containing ``sentence``,
        ``label_id``, ``confidence`` and ``probabilities``. An empty list is
        returned when the model or encoder is missing, when there are no
        sentences, or when encoding/prediction fails (a message is shown in
        the app in the failure cases).
    """
    if model is None or encoder is None:
        return []

    total_sentences = len(sentences)
    if total_sentences == 0:
        return []

    # Encode all sentences at once
    try:
        embeddings = encoder.encode(sentences, batch_size=32, show_progress_bar=False)
    except Exception as e:
        st.error(f"Error encoding sentences: {e}")
        return []

    try:
        token_batch = np.asarray(embeddings, dtype=np.float32)

        # Character input: each sentence as space-separated characters.
        char_batch = [" ".join(sentence) for sentence in sentences]

        # Positional inputs: one one-hot row per sentence for its index...
        line_batch = np.stack(
            [encode_line_number(idx, max_value=15) for idx in range(total_sentences)]
        ).astype(np.float32)
        # ...and the same total-lines row repeated for every sentence.
        # NOTE(review): the raw sentence count is used here; if the training
        # pipeline defined this feature differently (e.g. count - 1), this
        # needs adjusting -- confirm against the training code.
        total_row = encode_total_lines(total_sentences, max_value=20).astype(np.float32)
        total_batch = np.tile(total_row, (total_sentences, 1))

        # Predict - convert all to TensorFlow tensors with correct dtypes
        batch_probs = model.predict(
            {
                'token_inputs': tf.constant(token_batch, dtype=tf.float32),
                'char_inputs': tf.constant(char_batch, dtype=tf.string),
                'line_number_inputs': tf.constant(line_batch, dtype=tf.float32),
                'total_lines_inputs': tf.constant(total_batch, dtype=tf.float32)
            },
            verbose=0
        )
    except Exception as e:
        st.warning(f"Error predicting: {str(e)[:80]}")
        return []

    return [
        {
            'sentence': sentence,
            'label_id': int(np.argmax(pred_probs)),
            'confidence': float(np.max(pred_probs)),
            'probabilities': [float(p) for p in pred_probs]
        }
        for sentence, pred_probs in zip(sentences, batch_probs)
    ]

def get_label_name(label_id):
    """Translate a numeric label ID into its section name.

    The names are listed in alphabetical order (the ordering produced by
    sklearn's LabelEncoder). IDs outside the table yield 'Unknown'.
    """
    names = ['Background', 'Conclusions', 'Methods', 'Objective', 'Results']
    if not (0 <= label_id < len(names)):
        return 'Unknown'
    return names[label_id]

def get_emoji(label_name):
    """Return the emoji shown beside a section heading.

    Labels without a dedicated emoji fall back to the generic page icon.
    """
    icon_by_label = dict(
        Background='📚',
        Objective='🎯',
        Methods='🔬',
        Results='📊',
        Conclusions='✅',
    )
    try:
        return icon_by_label[label_name]
    except KeyError:
        return '📄'

# ---- Page header ----
st.title("📄 SkimLit - Abstract Section Classifier")
st.write("Organize your scientific abstract into structured sections")

# ---- Model loading: halt the script when either component is missing ----
model, encoder = load_model_and_encoder()
if model is None or encoder is None:
    st.stop()

# ---- Abstract input ----
st.markdown("---")

input_choices = ["Sample abstract", "Enter your text"]
input_method = st.radio("Choose input:", input_choices)

if input_method != "Sample abstract":
    # Free-form entry: start from an empty box with a hint.
    text = st.text_area(
        "Paste your abstract:",
        height=200,
        placeholder="Enter scientific abstract...",
    )
else:
    # Pre-fill the box with a built-in example the user can still edit.
    sample = """Background: Cardiovascular disease remains a leading cause of mortality globally. Early detection through biomarkers can improve patient outcomes. Objective: This study aims to identify novel cardiovascular biomarkers. Methods: We conducted a prospective cohort study of 500 participants over 5 years, collecting blood samples for mass spectrometry analysis. Results: We identified three novel biomarkers with 85% sensitivity and 90% specificity for early cardiovascular disease detection. Conclusions: These biomarkers show significant promise and warrant further validation in independent cohorts."""
    text = st.text_area("Abstract:", value=sample, height=200)

# Classify button: split the abstract into sentences, label each one, and
# render the sentences grouped by predicted section.
if st.button("🚀 Classify", use_container_width=True):
    abstract_text = text.strip()
    # Split on whitespace that follows sentence-ending punctuation.
    sentences = [
        part.strip()
        for part in re.split(r'(?<=[.!?])\s+', abstract_text)
        if part.strip()
    ]

    if not abstract_text:
        st.warning("Please enter some text.")
    elif not sentences:
        st.warning("No sentences found.")
    else:
        with st.spinner("Classifying..."):
            predictions = predict_labels(sentences, model, encoder)

        if not predictions:
            st.error("Could not generate predictions.")
        else:
            st.markdown("---")
            st.subheader("📋 Classified Abstract")

            # Group sentences by label. The dict is seeded in display order;
            # dicts keep insertion order, so iterating it later renders the
            # sections in this order.
            section_order = ['Background', 'Objective', 'Methods', 'Results', 'Conclusions']
            sections = {section_name: [] for section_name in section_order}

            for pred in predictions:
                label = get_label_name(pred['label_id'])
                # get_label_name may return 'Unknown' for an out-of-range ID;
                # setdefault keeps such sentences (rendered after the known
                # sections) instead of raising KeyError.
                sections.setdefault(label, []).append(pred['sentence'])

            for section_name, sentences_in_section in sections.items():
                if not sentences_in_section:
                    continue

                emoji = get_emoji(section_name)
                st.markdown(f"### {emoji} {section_name}")

                # Join sentences in this section. The text is user input and
                # is rendered as raw HTML below, so escape it: otherwise
                # content such as "p < 0.05" is parsed as markup.
                section_text = html.escape(" ".join(sentences_in_section))

                # Display with styling
                st.markdown(f"<div class='section-content {section_name.lower()}'>{section_text}</div>",
                           unsafe_allow_html=True)

st.markdown("---")
st.caption("🔬 SkimLit | Scientific Abstract Classifier")