Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import numpy as np | |
| import tensorflow as tf | |
| import re | |
| from pathlib import Path | |
| # Set page config | |
| st.set_page_config( | |
| page_title="SkimLit - Abstract Classifier", | |
| page_icon="📄", | |
| layout="wide", | |
| ) | |
| # Custom CSS | |
| st.markdown(""" | |
| <style> | |
| .section-title { | |
| font-size: 1.5em; | |
| font-weight: bold; | |
| margin-top: 1.5em; | |
| margin-bottom: 0.5em; | |
| } | |
| .section-content { | |
| padding: 1em; | |
| border-left: 4px solid #ccc; | |
| margin-bottom: 1em; | |
| line-height: 1.6; | |
| } | |
| .background { border-left-color: #FFB347; } | |
| .objective { border-left-color: #87CEEB; } | |
| .methods { border-left-color: #90EE90; } | |
| .results { border-left-color: #FFD700; } | |
| .conclusions { border-left-color: #DDA0DD; } | |
| </style> | |
| """, unsafe_allow_html=True) | |
| def load_model_and_encoder(): | |
| """Load the trained model and sentence encoder""" | |
| try: | |
| from sentence_transformers import SentenceTransformer | |
| import urllib.request | |
| import os | |
| script_dir = Path(__file__).parent | |
| model_path = script_dir / 'model_5.keras' | |
| # Load sentence encoder | |
| encoder = SentenceTransformer("all-MiniLM-L6-v2") | |
| # Load the model - try local first, then download | |
| if model_path.exists(): | |
| model = tf.keras.models.load_model(str(model_path)) | |
| else: | |
| st.info("Downloading model... (first time only)") | |
| # Download from HF Hub | |
| model_url = "https://huggingface.co/BILALfym/skimlit-model/resolve/main/model_5.keras" | |
| urllib.request.urlretrieve(model_url, str(model_path)) | |
| model = tf.keras.models.load_model(str(model_path)) | |
| return model, encoder | |
| except Exception as e: | |
| st.error(f"Error loading: {e}") | |
| return None, None | |
| def encode_line_number(line_number, max_value=15): | |
| """Encode line number as a one-hot vector""" | |
| vec = np.zeros(max_value) | |
| if line_number < max_value: | |
| vec[line_number] = 1 | |
| return vec | |
| def encode_total_lines(total_lines, max_value=20): | |
| """Encode total lines as a one-hot vector""" | |
| vec = np.zeros(max_value) | |
| if total_lines < max_value: | |
| vec[total_lines] = 1 | |
| return vec | |
| def predict_labels(sentences, model, encoder): | |
| """Predict labels for sentences""" | |
| if not model or not encoder: | |
| return [] | |
| predictions = [] | |
| total_sentences = len(sentences) | |
| # Encode all sentences at once | |
| try: | |
| embeddings = encoder.encode(sentences, batch_size=32, show_progress_bar=False) | |
| except Exception as e: | |
| st.error(f"Error encoding sentences: {e}") | |
| return [] | |
| for idx, sentence in enumerate(sentences): | |
| try: | |
| # Prepare character input (space-separated chars) | |
| char_text = " ".join(list(sentence)) | |
| # Get embedding for this sentence | |
| token_embedding = embeddings[idx:idx+1].astype(np.float32) | |
| # Prepare positional inputs | |
| line_input = encode_line_number(idx, max_value=15).astype(np.float32) | |
| total_input = encode_total_lines(total_sentences, max_value=20).astype(np.float32) | |
| # Predict - convert all to TensorFlow tensors with correct dtypes | |
| pred = model.predict( | |
| { | |
| 'token_inputs': tf.constant(token_embedding, dtype=tf.float32), | |
| 'char_inputs': tf.constant([char_text], dtype=tf.string), | |
| 'line_number_inputs': tf.constant([line_input], dtype=tf.float32), | |
| 'total_lines_inputs': tf.constant([total_input], dtype=tf.float32) | |
| }, | |
| verbose=0 | |
| ) | |
| pred_probs = pred[0] | |
| pred_label = np.argmax(pred_probs) | |
| confidence = np.max(pred_probs) | |
| predictions.append({ | |
| 'sentence': sentence, | |
| 'label_id': int(pred_label), | |
| 'confidence': float(confidence), | |
| 'probabilities': [float(p) for p in pred_probs] | |
| }) | |
| except Exception as e: | |
| st.warning(f"Error predicting: {str(e)[:80]}") | |
| continue | |
| return predictions | |
| def get_label_name(label_id): | |
| """Map label ID to name — ordre alphabétique sklearn LabelEncoder""" | |
| labels = ['Background', 'Conclusions', 'Methods', 'Objective', 'Results'] | |
| return labels[label_id] if 0 <= label_id < len(labels) else 'Unknown' | |
| def get_emoji(label_name): | |
| """Get emoji for label""" | |
| emojis = { | |
| 'Background': '📚', | |
| 'Objective': '🎯', | |
| 'Methods': '🔬', | |
| 'Results': '📊', | |
| 'Conclusions': '✅' | |
| } | |
| return emojis.get(label_name, '📄') | |
| # Main app | |
| st.title("📄 SkimLit - Abstract Section Classifier") | |
| st.write("Organize your scientific abstract into structured sections") | |
| # Load model | |
| model, encoder = load_model_and_encoder() | |
| if model is None or encoder is None: | |
| st.stop() | |
| # Input section | |
| st.markdown("---") | |
| input_method = st.radio( | |
| "Choose input:", | |
| ["Sample abstract", "Enter your text"] | |
| ) | |
| if input_method == "Sample abstract": | |
| sample = """Background: Cardiovascular disease remains a leading cause of mortality globally. Early detection through biomarkers can improve patient outcomes. Objective: This study aims to identify novel cardiovascular biomarkers. Methods: We conducted a prospective cohort study of 500 participants over 5 years, collecting blood samples for mass spectrometry analysis. Results: We identified three novel biomarkers with 85% sensitivity and 90% specificity for early cardiovascular disease detection. Conclusions: These biomarkers show significant promise and warrant further validation in independent cohorts.""" | |
| text = st.text_area("Abstract:", value=sample, height=200) | |
| else: | |
| text = st.text_area( | |
| "Paste your abstract:", | |
| height=200, | |
| placeholder="Enter scientific abstract..." | |
| ) | |
| # Classify button | |
| if st.button("🚀 Classify", use_container_width=True): | |
| if text.strip(): | |
| sentences = re.split(r'(?<=[.!?])\s+', text.strip()) | |
| sentences = [s.strip() for s in sentences if s.strip()] | |
| if sentences: | |
| with st.spinner("Classifying..."): | |
| predictions = predict_labels(sentences, model, encoder) | |
| if predictions: | |
| st.markdown("---") | |
| st.subheader("📋 Classified Abstract") | |
| # Group sentences by label | |
| sections = { | |
| 'Background': [], | |
| 'Objective': [], | |
| 'Methods': [], | |
| 'Results': [], | |
| 'Conclusions': [] | |
| } | |
| for pred in predictions: | |
| label = get_label_name(pred['label_id']) | |
| sections[label].append(pred['sentence']) | |
| # Display sections in order | |
| section_order = ['Background', 'Objective', 'Methods', 'Results', 'Conclusions'] | |
| for section_name in section_order: | |
| sentences_in_section = sections[section_name] | |
| if sentences_in_section: | |
| emoji = get_emoji(section_name) | |
| st.markdown(f"### {emoji} {section_name}") | |
| # Join sentences in this section | |
| section_text = " ".join(sentences_in_section) | |
| # Display with styling | |
| st.markdown(f"<div class='section-content {section_name.lower()}'>{section_text}</div>", | |
| unsafe_allow_html=True) | |
| else: | |
| st.error("Could not generate predictions.") | |
| else: | |
| st.warning("No sentences found.") | |
| else: | |
| st.warning("Please enter some text.") | |
| st.markdown("---") | |
| st.caption("🔬 SkimLit | Scientific Abstract Classifier") | |