Spaces:
Runtime error
Runtime error
Commit
Β·
0f21e9b
1
Parent(s):
1f12230
Deploy drug-causality-bert v1 with BioBERT model and caching optimizations
Browse files- app/.ipynb_checkpoints/requirements-checkpoint.txt +9 -0
- app/.ipynb_checkpoints/streamlit_app-checkpoint.py +344 -0
- app/requirements.txt +9 -0
- app/streamlit_app.py +352 -0
- models/production_model_final/config.json +25 -0
- models/production_model_final/model.safetensors +3 -0
- models/production_model_final/special_tokens_map.json +7 -0
- models/production_model_final/tokenizer.json +0 -0
- models/production_model_final/tokenizer_config.json +58 -0
- models/production_model_final/training_args.bin +3 -0
- models/production_model_final/training_config.json +16 -0
- models/production_model_final/vocab.txt +0 -0
- requirements.txt +10 -2
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-313.pyc +0 -0
- src/__pycache__/inference.cpython-313.pyc +0 -0
- src/inference.py +169 -0
- streamlit_app.py +352 -0
app/.ipynb_checkpoints/requirements-checkpoint.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch>=2.1.0
|
| 2 |
+
transformers>=4.35.0
|
| 3 |
+
pandas
|
| 4 |
+
numpy
|
| 5 |
+
scikit-learn
|
| 6 |
+
nltk>=3.7
|
| 7 |
+
PyPDF2>=3.0.1
|
| 8 |
+
streamlit>=1.22.0
|
| 9 |
+
safetensors>=0.4.0
|
app/.ipynb_checkpoints/streamlit_app-checkpoint.py
ADDED
|
@@ -0,0 +1,344 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import tempfile
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
import nltk
|
| 7 |
+
|
| 8 |
+
nltk.download('punkt')
|
| 9 |
+
|
| 10 |
+
# Add parent directory to Python path for imports
|
| 11 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
| 12 |
+
|
| 13 |
+
from src.inference import CausalityClassifier, extract_text_from_pdf, classify_causality, process_pdf_file, process_multiple_pdfs
|
| 14 |
+
|
| 15 |
+
# App Configuration
|
| 16 |
+
st.set_page_config(
|
| 17 |
+
page_title="Drug Causality Classifier",
|
| 18 |
+
page_icon="π",
|
| 19 |
+
layout="wide",
|
| 20 |
+
initial_sidebar_state="expanded"
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
# Main Title
|
| 24 |
+
st.title("π Drug Causality Classifier")
|
| 25 |
+
st.caption("BioBERT Model | F1 Score: 97.59% | Sensitivity: 98.68% | Specificity: 96.50%")
|
| 26 |
+
|
| 27 |
+
# Load model (cached)
|
| 28 |
+
@st.cache_resource
|
| 29 |
+
def load_model():
|
| 30 |
+
try:
|
| 31 |
+
return CausalityClassifier("models/production_model_final")
|
| 32 |
+
except Exception as e:
|
| 33 |
+
st.error(f"Failed to load model: {e}")
|
| 34 |
+
return None
|
| 35 |
+
|
| 36 |
+
classifier = load_model()
|
| 37 |
+
|
| 38 |
+
# Sidebar Configuration
|
| 39 |
+
st.sidebar.header("βοΈ Configuration")
|
| 40 |
+
threshold = st.sidebar.slider(
|
| 41 |
+
"Classification Threshold",
|
| 42 |
+
min_value=0.0,
|
| 43 |
+
max_value=1.0,
|
| 44 |
+
value=0.5,
|
| 45 |
+
step=0.05,
|
| 46 |
+
help="Higher threshold = stricter causality detection"
|
| 47 |
+
)
|
| 48 |
+
|
| 49 |
+
st.sidebar.info(
|
| 50 |
+
"**Threshold Guide:**\n"
|
| 51 |
+
"- 0.3-0.4: High sensitivity (catch all events)\n"
|
| 52 |
+
"- 0.5: Balanced performance\n"
|
| 53 |
+
"- 0.7-0.8: High precision (reduce false alarms)"
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
# Main Content
|
| 57 |
+
tab1, tab2, tab3 = st.tabs(["π Single Text", "π PDF Analysis", "π Batch Processing"])
|
| 58 |
+
|
| 59 |
+
# TAB 1: Single Text Classification
|
| 60 |
+
with tab1:
|
| 61 |
+
st.header("π Single Statement Classification")
|
| 62 |
+
st.write("Enter medical text to classify drug-adverse event causality:")
|
| 63 |
+
|
| 64 |
+
text_input = st.text_area(
|
| 65 |
+
"Medical Text:",
|
| 66 |
+
height=150,
|
| 67 |
+
placeholder="e.g., Patient developed severe nausea and vomiting 2 hours after taking Drug X. Clinical assessment confirmed drug-related causality."
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
col1, col2 = st.columns([2, 1])
|
| 71 |
+
|
| 72 |
+
with col1:
|
| 73 |
+
if st.button("π Classify Text", type="primary", use_container_width=True):
|
| 74 |
+
if text_input and classifier:
|
| 75 |
+
with st.spinner("Analyzing text..."):
|
| 76 |
+
result = classifier.predict(text_input, threshold)
|
| 77 |
+
|
| 78 |
+
# Display Results
|
| 79 |
+
st.subheader("π Results")
|
| 80 |
+
|
| 81 |
+
result_col1, result_col2 = st.columns(2)
|
| 82 |
+
|
| 83 |
+
with result_col1:
|
| 84 |
+
classification = result['prediction'].upper()
|
| 85 |
+
color = "green" if result['prediction'] == 'related' else "red"
|
| 86 |
+
st.markdown(f"**Classification:** :{color}[{classification}]")
|
| 87 |
+
|
| 88 |
+
with result_col2:
|
| 89 |
+
confidence_pct = result['confidence'] * 100
|
| 90 |
+
st.metric("Confidence", f"{confidence_pct:.1f}%")
|
| 91 |
+
|
| 92 |
+
# Probability Distribution
|
| 93 |
+
st.subheader("π Probability Distribution")
|
| 94 |
+
probs = result['probabilities']
|
| 95 |
+
|
| 96 |
+
# Progress bars
|
| 97 |
+
st.write("**Related (Drug-Caused):**")
|
| 98 |
+
st.progress(probs['related'], text=f"{probs['related']:.2%}")
|
| 99 |
+
|
| 100 |
+
st.write("**Not Related:**")
|
| 101 |
+
st.progress(probs['not_related'], text=f"{probs['not_related']:.2%}")
|
| 102 |
+
|
| 103 |
+
# Raw JSON Output
|
| 104 |
+
with st.expander("π View Raw Results"):
|
| 105 |
+
st.json(result)
|
| 106 |
+
|
| 107 |
+
elif not classifier:
|
| 108 |
+
st.error("Model not loaded properly.")
|
| 109 |
+
else:
|
| 110 |
+
st.warning("Please enter text to classify.")
|
| 111 |
+
|
| 112 |
+
with col2:
|
| 113 |
+
st.info(
|
| 114 |
+
"**Example Inputs:**\n\n"
|
| 115 |
+
"**Related:** _Patient developed rash after taking aspirin. Symptoms resolved after discontinuation._\n\n"
|
| 116 |
+
"**Not Related:** _Patient has a history of diabetes and hypertension. Takes metformin daily._"
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
# TAB 2: PDF Analysis
|
| 120 |
+
with tab2:
|
| 121 |
+
st.header("π PDF Document Analysis")
|
| 122 |
+
st.write("Upload a PDF document for comprehensive drug-adverse event analysis:")
|
| 123 |
+
|
| 124 |
+
pdf_file = st.file_uploader(
|
| 125 |
+
"Choose a PDF file",
|
| 126 |
+
type=["pdf"],
|
| 127 |
+
help="Upload medical documents, case reports, or clinical notes"
|
| 128 |
+
)
|
| 129 |
+
|
| 130 |
+
if pdf_file and classifier:
|
| 131 |
+
# Save uploaded file temporarily
|
| 132 |
+
temp_dir = tempfile.gettempdir()
|
| 133 |
+
temp_path = os.path.join(temp_dir, pdf_file.name)
|
| 134 |
+
|
| 135 |
+
with open(temp_path, "wb") as tmp_f:
|
| 136 |
+
tmp_f.write(pdf_file.getbuffer())
|
| 137 |
+
|
| 138 |
+
# Analysis Button
|
| 139 |
+
if st.button("π Analyze PDF", type="primary", use_container_width=True):
|
| 140 |
+
with st.spinner(f"Processing {pdf_file.name}..."):
|
| 141 |
+
try:
|
| 142 |
+
# Extract and classify
|
| 143 |
+
pdf_text = extract_text_from_pdf(temp_path)
|
| 144 |
+
results = classify_causality(pdf_text, threshold=threshold)
|
| 145 |
+
|
| 146 |
+
# Display Summary
|
| 147 |
+
st.subheader("π Analysis Summary")
|
| 148 |
+
|
| 149 |
+
summary_col1, summary_col2, summary_col3 = st.columns(3)
|
| 150 |
+
|
| 151 |
+
with summary_col1:
|
| 152 |
+
classification = results['final_classification'].upper()
|
| 153 |
+
color = "green" if results['final_classification'] == 'related' else "red"
|
| 154 |
+
st.markdown(f"**Overall:** :{color}[{classification}]")
|
| 155 |
+
|
| 156 |
+
with summary_col2:
|
| 157 |
+
confidence_pct = results['confidence_score'] * 100
|
| 158 |
+
st.metric("Confidence", f"{confidence_pct:.1f}%")
|
| 159 |
+
|
| 160 |
+
with summary_col3:
|
| 161 |
+
st.metric("Total Sentences", results['total_sentences'])
|
| 162 |
+
|
| 163 |
+
# Sentence Breakdown
|
| 164 |
+
st.subheader("π Sentence Analysis")
|
| 165 |
+
|
| 166 |
+
breakdown_col1, breakdown_col2 = st.columns(2)
|
| 167 |
+
|
| 168 |
+
with breakdown_col1:
|
| 169 |
+
st.metric("Related Sentences", results['related_sentences'])
|
| 170 |
+
|
| 171 |
+
with breakdown_col2:
|
| 172 |
+
st.metric("Not Related", results['not_related_sentences'])
|
| 173 |
+
|
| 174 |
+
# Top Related Sentences
|
| 175 |
+
if results['related_sentences'] > 0:
|
| 176 |
+
st.subheader("π― Top Related Sentences")
|
| 177 |
+
|
| 178 |
+
for i, sent_detail in enumerate(results.get('top_related_sentences', []), 1):
|
| 179 |
+
confidence = sent_detail['probability_related']
|
| 180 |
+
confidence_color = "green" if confidence > 0.7 else "orange" if confidence > 0.5 else "red"
|
| 181 |
+
|
| 182 |
+
st.markdown(f"**{i}.** ({confidence:.1%} confidence)")
|
| 183 |
+
st.markdown(f":{confidence_color}[{sent_detail['sentence']}]")
|
| 184 |
+
st.write("")
|
| 185 |
+
|
| 186 |
+
# Download Button
|
| 187 |
+
st.subheader("πΎ Download Report")
|
| 188 |
+
|
| 189 |
+
import json
|
| 190 |
+
report_json = json.dumps(results, indent=2)
|
| 191 |
+
|
| 192 |
+
st.download_button(
|
| 193 |
+
label="π₯ Download JSON Report",
|
| 194 |
+
data=report_json,
|
| 195 |
+
file_name=f"{pdf_file.name}_causality_report.json",
|
| 196 |
+
mime="application/json"
|
| 197 |
+
)
|
| 198 |
+
|
| 199 |
+
# Raw Results Expander
|
| 200 |
+
with st.expander("π View Full Results"):
|
| 201 |
+
st.json(results)
|
| 202 |
+
|
| 203 |
+
except Exception as e:
|
| 204 |
+
st.error(f"Error processing PDF: {str(e)}")
|
| 205 |
+
st.info("Please ensure the PDF contains readable text and try again.")
|
| 206 |
+
|
| 207 |
+
# Clean up temp file
|
| 208 |
+
finally:
|
| 209 |
+
try:
|
| 210 |
+
os.remove(temp_path)
|
| 211 |
+
except:
|
| 212 |
+
pass
|
| 213 |
+
|
| 214 |
+
# TAB 3: Batch Processing
|
| 215 |
+
with tab3:
|
| 216 |
+
st.header("π Batch PDF Processing")
|
| 217 |
+
st.write("Upload multiple PDF files for batch causality analysis:")
|
| 218 |
+
|
| 219 |
+
batch_files = st.file_uploader(
|
| 220 |
+
"Choose PDF files",
|
| 221 |
+
type=["pdf"],
|
| 222 |
+
accept_multiple_files=True,
|
| 223 |
+
help="Upload multiple medical documents for batch analysis"
|
| 224 |
+
)
|
| 225 |
+
|
| 226 |
+
if batch_files and classifier:
|
| 227 |
+
st.write(f"**Selected files:** {len(batch_files)} PDFs")
|
| 228 |
+
|
| 229 |
+
for i, file in enumerate(batch_files, 1):
|
| 230 |
+
st.write(f"{i}. {file.name}")
|
| 231 |
+
|
| 232 |
+
if st.button("π Process All PDFs", type="primary", use_container_width=True):
|
| 233 |
+
# Create temporary paths for all files
|
| 234 |
+
batch_temp_paths = []
|
| 235 |
+
temp_dir = tempfile.gettempdir()
|
| 236 |
+
|
| 237 |
+
try:
|
| 238 |
+
# Save all files temporarily
|
| 239 |
+
for batch_file in batch_files:
|
| 240 |
+
temp_path = os.path.join(temp_dir, batch_file.name)
|
| 241 |
+
with open(temp_path, "wb") as tmp_f:
|
| 242 |
+
tmp_f.write(batch_file.getbuffer())
|
| 243 |
+
batch_temp_paths.append(temp_path)
|
| 244 |
+
|
| 245 |
+
# Process all files
|
| 246 |
+
with st.spinner(f"Processing {len(batch_files)} files..."):
|
| 247 |
+
batch_results = process_multiple_pdfs(batch_temp_paths, threshold=threshold)
|
| 248 |
+
|
| 249 |
+
# Display Batch Summary
|
| 250 |
+
st.subheader("π Batch Analysis Summary")
|
| 251 |
+
|
| 252 |
+
# Overall stats
|
| 253 |
+
total_files = len(batch_results)
|
| 254 |
+
successful = len([r for r in batch_results if 'error' not in r])
|
| 255 |
+
related_count = len([r for r in batch_results if r.get('final_classification') == 'related'])
|
| 256 |
+
|
| 257 |
+
stat_col1, stat_col2, stat_col3 = st.columns(3)
|
| 258 |
+
|
| 259 |
+
with stat_col1:
|
| 260 |
+
st.metric("Total Files", total_files)
|
| 261 |
+
|
| 262 |
+
with stat_col2:
|
| 263 |
+
st.metric("Successfully Processed", successful)
|
| 264 |
+
|
| 265 |
+
with stat_col3:
|
| 266 |
+
st.metric("Drug-Related Files", related_count)
|
| 267 |
+
|
| 268 |
+
# Individual Results
|
| 269 |
+
st.subheader("π Individual Results")
|
| 270 |
+
|
| 271 |
+
for i, res in enumerate(batch_results, 1):
|
| 272 |
+
if 'error' in res:
|
| 273 |
+
st.error(f"**{i}. {res['pdf_file']}:** Error - {res['error']}")
|
| 274 |
+
else:
|
| 275 |
+
classification = res['final_classification'].upper()
|
| 276 |
+
confidence = res.get('confidence_score', 0) * 100
|
| 277 |
+
color = "green" if res['final_classification'] == 'related' else "red"
|
| 278 |
+
|
| 279 |
+
st.markdown(f"**{i}. {res['pdf_file']}:** :{color}[{classification}] ({confidence:.1f}% confidence)")
|
| 280 |
+
|
| 281 |
+
# Download Batch Summary
|
| 282 |
+
st.subheader("πΎ Download Batch Report")
|
| 283 |
+
|
| 284 |
+
import json
|
| 285 |
+
batch_report = {
|
| 286 |
+
'summary': {
|
| 287 |
+
'total_files': total_files,
|
| 288 |
+
'successful': successful,
|
| 289 |
+
'related_count': related_count,
|
| 290 |
+
'threshold_used': threshold
|
| 291 |
+
},
|
| 292 |
+
'individual_results': batch_results
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
batch_json = json.dumps(batch_report, indent=2)
|
| 296 |
+
|
| 297 |
+
st.download_button(
|
| 298 |
+
label="π₯ Download Batch Summary",
|
| 299 |
+
data=batch_json,
|
| 300 |
+
file_name="batch_causality_summary.json",
|
| 301 |
+
mime="application/json"
|
| 302 |
+
)
|
| 303 |
+
|
| 304 |
+
# Raw Results Expander
|
| 305 |
+
with st.expander("π View Full Batch Results"):
|
| 306 |
+
st.json(batch_results)
|
| 307 |
+
|
| 308 |
+
except Exception as e:
|
| 309 |
+
st.error(f"Batch processing error: {str(e)}")
|
| 310 |
+
|
| 311 |
+
finally:
|
| 312 |
+
# Clean up all temp files
|
| 313 |
+
for temp_path in batch_temp_paths:
|
| 314 |
+
try:
|
| 315 |
+
os.remove(temp_path)
|
| 316 |
+
except:
|
| 317 |
+
pass
|
| 318 |
+
|
| 319 |
+
# Footer
|
| 320 |
+
st.markdown("---")
|
| 321 |
+
st.markdown(
|
| 322 |
+
"**Built with BioBERT for Pharmacovigilance** | "
|
| 323 |
+
"Developed for clinical decision support and regulatory compliance"
|
| 324 |
+
)
|
| 325 |
+
|
| 326 |
+
# Sidebar additional info
|
| 327 |
+
st.sidebar.markdown("---")
|
| 328 |
+
st.sidebar.markdown("### π Model Performance")
|
| 329 |
+
st.sidebar.markdown(
|
| 330 |
+
"- **F1 Score:** 97.59%\n"
|
| 331 |
+
"- **Accuracy:** 97.59%\n"
|
| 332 |
+
"- **Sensitivity:** 98.68%\n"
|
| 333 |
+
"- **Specificity:** 96.50%"
|
| 334 |
+
)
|
| 335 |
+
|
| 336 |
+
st.sidebar.markdown("### π₯ Clinical Use")
|
| 337 |
+
st.sidebar.markdown(
|
| 338 |
+
"This tool assists in:\n"
|
| 339 |
+
"- Adverse event detection\n"
|
| 340 |
+
"- Pharmacovigilance screening\n"
|
| 341 |
+
"- Clinical report analysis\n"
|
| 342 |
+
"- Regulatory compliance"
|
| 343 |
+
)
|
| 344 |
+
|
app/requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch>=2.1.0
|
| 2 |
+
transformers>=4.35.0
|
| 3 |
+
pandas
|
| 4 |
+
numpy
|
| 5 |
+
scikit-learn
|
| 6 |
+
nltk>=3.7
|
| 7 |
+
PyPDF2>=3.0.1
|
| 8 |
+
streamlit>=1.22.0
|
| 9 |
+
safetensors>=0.4.0
|
app/streamlit_app.py
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import tempfile
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import json
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import nltk
|
| 9 |
+
|
| 10 |
+
nltk.download('punkt')
|
| 11 |
+
|
| 12 |
+
# Add parent directory to path
|
| 13 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 14 |
+
|
| 15 |
+
# NOW THIS IMPORT WILL WORK!
|
| 16 |
+
from src.inference import (
|
| 17 |
+
CausalityClassifier,
|
| 18 |
+
extract_text_from_pdf,
|
| 19 |
+
classify_causality,
|
| 20 |
+
process_pdf_file,
|
| 21 |
+
process_multiple_pdfs
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
# SINGLE load_model function with caching
|
| 25 |
+
@st.cache_resource
|
| 26 |
+
def load_model():
|
| 27 |
+
"""Load CausalityClassifier model once and reuse across sessions"""
|
| 28 |
+
try:
|
| 29 |
+
return CausalityClassifier("models/production_model_final")
|
| 30 |
+
except Exception as e:
|
| 31 |
+
st.error(f"Failed to load model: {e}")
|
| 32 |
+
return None
|
| 33 |
+
|
| 34 |
+
# App Configuration
|
| 35 |
+
st.set_page_config(
|
| 36 |
+
page_title="Drug Causality Classifier",
|
| 37 |
+
page_icon="π",
|
| 38 |
+
layout="wide",
|
| 39 |
+
initial_sidebar_state="expanded"
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
# Main Title
|
| 43 |
+
st.title("π Drug Causality Classifier")
|
| 44 |
+
st.caption("BioBERT Model | F1 Score: 97.59% | Sensitivity: 98.68% | Specificity: 96.50%")
|
| 45 |
+
|
| 46 |
+
# Load model (cached)
|
| 47 |
+
classifier = load_model()
|
| 48 |
+
|
| 49 |
+
# Sidebar Configuration
|
| 50 |
+
st.sidebar.header("βοΈ Configuration")
|
| 51 |
+
threshold = st.sidebar.slider(
|
| 52 |
+
"Classification Threshold",
|
| 53 |
+
min_value=0.0,
|
| 54 |
+
max_value=1.0,
|
| 55 |
+
value=0.5,
|
| 56 |
+
step=0.05,
|
| 57 |
+
help="Higher threshold = stricter causality detection"
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
st.sidebar.info(
|
| 61 |
+
"**Threshold Guide:**\n"
|
| 62 |
+
"- 0.3-0.4: High sensitivity (catch all events)\n"
|
| 63 |
+
"- 0.5: Balanced performance\n"
|
| 64 |
+
"- 0.7-0.8: High precision (reduce false alarms)"
|
| 65 |
+
)
|
| 66 |
+
|
| 67 |
+
# Main Content
|
| 68 |
+
tab1, tab2, tab3 = st.tabs(["π Single Text", "π PDF Analysis", "π Batch Processing"])
|
| 69 |
+
|
| 70 |
+
# TAB 1: Single Text Classification
|
| 71 |
+
with tab1:
|
| 72 |
+
st.header("π Single Statement Classification")
|
| 73 |
+
st.write("Enter medical text to classify drug-adverse event causality:")
|
| 74 |
+
|
| 75 |
+
text_input = st.text_area(
|
| 76 |
+
"Medical Text:",
|
| 77 |
+
height=150,
|
| 78 |
+
placeholder="e.g., Patient developed severe nausea and vomiting 2 hours after taking Drug X. Clinical assessment confirmed drug-related causality."
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
col1, col2 = st.columns([2, 1])
|
| 82 |
+
|
| 83 |
+
with col1:
|
| 84 |
+
if st.button("π Classify Text", type="primary", use_container_width=True):
|
| 85 |
+
if text_input and classifier:
|
| 86 |
+
with st.spinner("Analyzing text..."):
|
| 87 |
+
result = classifier.predict(text_input, threshold)
|
| 88 |
+
|
| 89 |
+
# Display Results
|
| 90 |
+
st.subheader("π Results")
|
| 91 |
+
|
| 92 |
+
result_col1, result_col2 = st.columns(2)
|
| 93 |
+
|
| 94 |
+
with result_col1:
|
| 95 |
+
classification = result['prediction'].upper()
|
| 96 |
+
color = "green" if result['prediction'] == 'related' else "red"
|
| 97 |
+
st.markdown(f"**Classification:** :{color}[{classification}]")
|
| 98 |
+
|
| 99 |
+
with result_col2:
|
| 100 |
+
confidence_pct = result['confidence'] * 100
|
| 101 |
+
st.metric("Confidence", f"{confidence_pct:.1f}%")
|
| 102 |
+
|
| 103 |
+
# Probability Distribution
|
| 104 |
+
st.subheader("π Probability Distribution")
|
| 105 |
+
probs = result['probabilities']
|
| 106 |
+
|
| 107 |
+
# Progress bars
|
| 108 |
+
st.write("**Related (Drug-Caused):**")
|
| 109 |
+
st.progress(probs['related'], text=f"{probs['related']:.2%}")
|
| 110 |
+
|
| 111 |
+
st.write("**Not Related:**")
|
| 112 |
+
st.progress(probs['not_related'], text=f"{probs['not_related']:.2%}")
|
| 113 |
+
|
| 114 |
+
# Raw JSON Output
|
| 115 |
+
with st.expander("π View Raw Results"):
|
| 116 |
+
st.json(result)
|
| 117 |
+
|
| 118 |
+
elif not classifier:
|
| 119 |
+
st.error("Model not loaded properly.")
|
| 120 |
+
else:
|
| 121 |
+
st.warning("Please enter text to classify.")
|
| 122 |
+
|
| 123 |
+
with col2:
|
| 124 |
+
st.info(
|
| 125 |
+
"**Example Inputs:**\n\n"
|
| 126 |
+
"**Related:** _Patient developed rash after taking aspirin. Symptoms resolved after discontinuation._\n\n"
|
| 127 |
+
"**Not Related:** _Patient has a history of diabetes and hypertension. Takes metformin daily._"
|
| 128 |
+
)
|
| 129 |
+
|
| 130 |
+
# TAB 2: PDF Analysis
|
| 131 |
+
with tab2:
|
| 132 |
+
st.header("π PDF Document Analysis")
|
| 133 |
+
st.write("Upload a PDF document for comprehensive drug-adverse event analysis:")
|
| 134 |
+
|
| 135 |
+
pdf_file = st.file_uploader(
|
| 136 |
+
"Choose a PDF file",
|
| 137 |
+
type=["pdf"],
|
| 138 |
+
help="Upload medical documents, case reports, or clinical notes"
|
| 139 |
+
)
|
| 140 |
+
|
| 141 |
+
if pdf_file and classifier:
|
| 142 |
+
# Save uploaded file temporarily
|
| 143 |
+
temp_dir = tempfile.gettempdir()
|
| 144 |
+
temp_path = os.path.join(temp_dir, pdf_file.name)
|
| 145 |
+
|
| 146 |
+
with open(temp_path, "wb") as tmp_f:
|
| 147 |
+
tmp_f.write(pdf_file.getbuffer())
|
| 148 |
+
|
| 149 |
+
# Analysis Button
|
| 150 |
+
if st.button("π Analyze PDF", type="primary", use_container_width=True):
|
| 151 |
+
with st.spinner(f"Processing {pdf_file.name}..."):
|
| 152 |
+
try:
|
| 153 |
+
# Extract and classify
|
| 154 |
+
pdf_text = extract_text_from_pdf(temp_path)
|
| 155 |
+
results = classify_causality(pdf_text, threshold=threshold)
|
| 156 |
+
|
| 157 |
+
# Display Summary
|
| 158 |
+
st.subheader("π Analysis Summary")
|
| 159 |
+
|
| 160 |
+
summary_col1, summary_col2, summary_col3 = st.columns(3)
|
| 161 |
+
|
| 162 |
+
with summary_col1:
|
| 163 |
+
classification = results['final_classification'].upper()
|
| 164 |
+
color = "green" if results['final_classification'] == 'related' else "red"
|
| 165 |
+
st.markdown(f"**Overall:** :{color}[{classification}]")
|
| 166 |
+
|
| 167 |
+
with summary_col2:
|
| 168 |
+
confidence_pct = results['confidence_score'] * 100
|
| 169 |
+
st.metric("Confidence", f"{confidence_pct:.1f}%")
|
| 170 |
+
|
| 171 |
+
with summary_col3:
|
| 172 |
+
st.metric("Total Sentences", results['total_sentences'])
|
| 173 |
+
|
| 174 |
+
# Sentence Breakdown
|
| 175 |
+
st.subheader("π Sentence Analysis")
|
| 176 |
+
|
| 177 |
+
breakdown_col1, breakdown_col2 = st.columns(2)
|
| 178 |
+
|
| 179 |
+
with breakdown_col1:
|
| 180 |
+
st.metric("Related Sentences", results['related_sentences'])
|
| 181 |
+
|
| 182 |
+
with breakdown_col2:
|
| 183 |
+
st.metric("Not Related", results['not_related_sentences'])
|
| 184 |
+
|
| 185 |
+
# Top Related Sentences
|
| 186 |
+
if results['related_sentences'] > 0:
|
| 187 |
+
st.subheader("π― Top Related Sentences")
|
| 188 |
+
|
| 189 |
+
for i, sent_detail in enumerate(results.get('top_related_sentences', []), 1):
|
| 190 |
+
confidence = sent_detail['probability_related']
|
| 191 |
+
confidence_color = "green" if confidence > 0.7 else "orange" if confidence > 0.5 else "red"
|
| 192 |
+
|
| 193 |
+
st.markdown(f"**{i}.** ({confidence:.1%} confidence)")
|
| 194 |
+
st.markdown(f":{confidence_color}[{sent_detail['sentence']}]")
|
| 195 |
+
st.write("")
|
| 196 |
+
|
| 197 |
+
# Download Button
|
| 198 |
+
st.subheader("πΎ Download Report")
|
| 199 |
+
|
| 200 |
+
report_json = json.dumps(results, indent=2)
|
| 201 |
+
|
| 202 |
+
st.download_button(
|
| 203 |
+
label="π₯ Download JSON Report",
|
| 204 |
+
data=report_json,
|
| 205 |
+
file_name=f"{pdf_file.name}_causality_report.json",
|
| 206 |
+
mime="application/json"
|
| 207 |
+
)
|
| 208 |
+
|
| 209 |
+
# Raw Results Expander
|
| 210 |
+
with st.expander("π View Full Results"):
|
| 211 |
+
st.json(results)
|
| 212 |
+
|
| 213 |
+
except Exception as e:
|
| 214 |
+
st.error(f"Error processing PDF: {str(e)}")
|
| 215 |
+
st.info("Please ensure the PDF contains readable text and try again.")
|
| 216 |
+
|
| 217 |
+
# Clean up temp file
|
| 218 |
+
finally:
|
| 219 |
+
try:
|
| 220 |
+
os.remove(temp_path)
|
| 221 |
+
except:
|
| 222 |
+
pass
|
| 223 |
+
|
| 224 |
+
# TAB 3: Batch Processing
|
| 225 |
+
with tab3:
|
| 226 |
+
st.header("π Batch PDF Processing")
|
| 227 |
+
st.write("Upload multiple PDF files for batch causality analysis:")
|
| 228 |
+
|
| 229 |
+
batch_files = st.file_uploader(
|
| 230 |
+
"Choose PDF files",
|
| 231 |
+
type=["pdf"],
|
| 232 |
+
accept_multiple_files=True,
|
| 233 |
+
help="Upload multiple medical documents for batch analysis"
|
| 234 |
+
)
|
| 235 |
+
|
| 236 |
+
if batch_files and classifier:
|
| 237 |
+
st.write(f"**Selected files:** {len(batch_files)} PDFs")
|
| 238 |
+
|
| 239 |
+
for i, file in enumerate(batch_files, 1):
|
| 240 |
+
st.write(f"{i}. {file.name}")
|
| 241 |
+
|
| 242 |
+
if st.button("π Process All PDFs", type="primary", use_container_width=True):
|
| 243 |
+
# Create temporary paths for all files
|
| 244 |
+
batch_temp_paths = []
|
| 245 |
+
temp_dir = tempfile.gettempdir()
|
| 246 |
+
|
| 247 |
+
try:
|
| 248 |
+
# Save all files temporarily
|
| 249 |
+
for batch_file in batch_files:
|
| 250 |
+
temp_path = os.path.join(temp_dir, batch_file.name)
|
| 251 |
+
with open(temp_path, "wb") as tmp_f:
|
| 252 |
+
tmp_f.write(batch_file.getbuffer())
|
| 253 |
+
batch_temp_paths.append(temp_path)
|
| 254 |
+
|
| 255 |
+
# Process all files
|
| 256 |
+
with st.spinner(f"Processing {len(batch_files)} files..."):
|
| 257 |
+
batch_results = process_multiple_pdfs(batch_temp_paths, threshold=threshold)
|
| 258 |
+
|
| 259 |
+
# Display Batch Summary
|
| 260 |
+
st.subheader("π Batch Analysis Summary")
|
| 261 |
+
|
| 262 |
+
# Overall stats
|
| 263 |
+
total_files = len(batch_results)
|
| 264 |
+
successful = len([r for r in batch_results if 'error' not in r])
|
| 265 |
+
related_count = len([r for r in batch_results if r.get('final_classification') == 'related'])
|
| 266 |
+
|
| 267 |
+
stat_col1, stat_col2, stat_col3 = st.columns(3)
|
| 268 |
+
|
| 269 |
+
with stat_col1:
|
| 270 |
+
st.metric("Total Files", total_files)
|
| 271 |
+
|
| 272 |
+
with stat_col2:
|
| 273 |
+
st.metric("Successfully Processed", successful)
|
| 274 |
+
|
| 275 |
+
with stat_col3:
|
| 276 |
+
st.metric("Drug-Related Files", related_count)
|
| 277 |
+
|
| 278 |
+
# Individual Results
|
| 279 |
+
st.subheader("π Individual Results")
|
| 280 |
+
|
| 281 |
+
for i, res in enumerate(batch_results, 1):
|
| 282 |
+
if 'error' in res:
|
| 283 |
+
st.error(f"**{i}. {res['pdf_file']}:** Error - {res['error']}")
|
| 284 |
+
else:
|
| 285 |
+
classification = res['final_classification'].upper()
|
| 286 |
+
confidence = res.get('confidence_score', 0) * 100
|
| 287 |
+
color = "green" if res['final_classification'] == 'related' else "red"
|
| 288 |
+
|
| 289 |
+
st.markdown(f"**{i}. {res['pdf_file']}:** :{color}[{classification}] ({confidence:.1f}% confidence)")
|
| 290 |
+
|
| 291 |
+
# Download Batch Summary
|
| 292 |
+
st.subheader("πΎ Download Batch Report")
|
| 293 |
+
|
| 294 |
+
batch_report = {
|
| 295 |
+
'summary': {
|
| 296 |
+
'total_files': total_files,
|
| 297 |
+
'successful': successful,
|
| 298 |
+
'related_count': related_count,
|
| 299 |
+
'threshold_used': threshold
|
| 300 |
+
},
|
| 301 |
+
'individual_results': batch_results
|
| 302 |
+
}
|
| 303 |
+
|
| 304 |
+
batch_json = json.dumps(batch_report, indent=2)
|
| 305 |
+
|
| 306 |
+
st.download_button(
|
| 307 |
+
label="π₯ Download Batch Summary",
|
| 308 |
+
data=batch_json,
|
| 309 |
+
file_name="batch_causality_summary.json",
|
| 310 |
+
mime="application/json"
|
| 311 |
+
)
|
| 312 |
+
|
| 313 |
+
# Raw Results Expander
|
| 314 |
+
with st.expander("π View Full Batch Results"):
|
| 315 |
+
st.json(batch_results)
|
| 316 |
+
|
| 317 |
+
except Exception as e:
|
| 318 |
+
st.error(f"Batch processing error: {str(e)}")
|
| 319 |
+
|
| 320 |
+
finally:
|
| 321 |
+
# Clean up all temp files
|
| 322 |
+
for temp_path in batch_temp_paths:
|
| 323 |
+
try:
|
| 324 |
+
os.remove(temp_path)
|
| 325 |
+
except:
|
| 326 |
+
pass
|
| 327 |
+
|
| 328 |
+
# Footer
|
| 329 |
+
st.markdown("---")
|
| 330 |
+
st.markdown(
|
| 331 |
+
"**Built with BioBERT for Pharmacovigilance** | "
|
| 332 |
+
"Developed for clinical decision support and regulatory compliance"
|
| 333 |
+
)
|
| 334 |
+
|
| 335 |
+
# Sidebar additional info
|
| 336 |
+
st.sidebar.markdown("---")
|
| 337 |
+
st.sidebar.markdown("### π Model Performance")
|
| 338 |
+
st.sidebar.markdown(
|
| 339 |
+
"- **F1 Score:** 97.59%\n"
|
| 340 |
+
"- **Accuracy:** 97.59%\n"
|
| 341 |
+
"- **Sensitivity:** 98.68%\n"
|
| 342 |
+
"- **Specificity:** 96.50%"
|
| 343 |
+
)
|
| 344 |
+
|
| 345 |
+
st.sidebar.markdown("### π₯ Clinical Use")
|
| 346 |
+
st.sidebar.markdown(
|
| 347 |
+
"This tool assists in:\n"
|
| 348 |
+
"- Adverse event detection\n"
|
| 349 |
+
"- Pharmacovigilance screening\n"
|
| 350 |
+
"- Clinical report analysis\n"
|
| 351 |
+
"- Regulatory compliance"
|
| 352 |
+
)
|
models/production_model_final/config.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"BertForSequenceClassification"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"classifier_dropout": null,
|
| 7 |
+
"dtype": "float32",
|
| 8 |
+
"hidden_act": "gelu",
|
| 9 |
+
"hidden_dropout_prob": 0.1,
|
| 10 |
+
"hidden_size": 768,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"intermediate_size": 3072,
|
| 13 |
+
"layer_norm_eps": 1e-12,
|
| 14 |
+
"max_position_embeddings": 512,
|
| 15 |
+
"model_type": "bert",
|
| 16 |
+
"num_attention_heads": 12,
|
| 17 |
+
"num_hidden_layers": 12,
|
| 18 |
+
"pad_token_id": 0,
|
| 19 |
+
"position_embedding_type": "absolute",
|
| 20 |
+
"problem_type": "single_label_classification",
|
| 21 |
+
"transformers_version": "4.57.1",
|
| 22 |
+
"type_vocab_size": 2,
|
| 23 |
+
"use_cache": true,
|
| 24 |
+
"vocab_size": 30522
|
| 25 |
+
}
|
models/production_model_final/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7f73202c120a52a4288c045e5713eeecbfe7b3431b5e15dafa3ded35b8ba18e4
|
| 3 |
+
size 437958648
|
models/production_model_final/special_tokens_map.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": "[CLS]",
|
| 3 |
+
"mask_token": "[MASK]",
|
| 4 |
+
"pad_token": "[PAD]",
|
| 5 |
+
"sep_token": "[SEP]",
|
| 6 |
+
"unk_token": "[UNK]"
|
| 7 |
+
}
|
models/production_model_final/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/production_model_final/tokenizer_config.json
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "[PAD]",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "[UNK]",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "[CLS]",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"3": {
|
| 28 |
+
"content": "[SEP]",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"4": {
|
| 36 |
+
"content": "[MASK]",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"clean_up_tokenization_spaces": true,
|
| 45 |
+
"cls_token": "[CLS]",
|
| 46 |
+
"do_basic_tokenize": true,
|
| 47 |
+
"do_lower_case": true,
|
| 48 |
+
"extra_special_tokens": {},
|
| 49 |
+
"mask_token": "[MASK]",
|
| 50 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 51 |
+
"never_split": null,
|
| 52 |
+
"pad_token": "[PAD]",
|
| 53 |
+
"sep_token": "[SEP]",
|
| 54 |
+
"strip_accents": null,
|
| 55 |
+
"tokenize_chinese_chars": true,
|
| 56 |
+
"tokenizer_class": "BertTokenizer",
|
| 57 |
+
"unk_token": "[UNK]"
|
| 58 |
+
}
|
models/production_model_final/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca6a57bb3665602515473f6c3e6aa96cb5505d7b8642beb6c8604c4a00aec451
|
| 3 |
+
size 5777
|
models/production_model_final/training_config.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_params": {
|
| 3 |
+
"learning_rate": 3.758180918998249e-05,
|
| 4 |
+
"num_train_epochs": 1,
|
| 5 |
+
"batch_size": 4,
|
| 6 |
+
"gradient_accumulation_steps": 4
|
| 7 |
+
},
|
| 8 |
+
"final_results": {
|
| 9 |
+
"accuracy": 0.9758909853249476,
|
| 10 |
+
"f1": 0.9758881040529953,
|
| 11 |
+
"precision": 0.9761185621296976,
|
| 12 |
+
"recall": 0.9758909853249476
|
| 13 |
+
},
|
| 14 |
+
"optuna_source": "Trial 1",
|
| 15 |
+
"training_date": "2025-10-25T16:06:34.368403"
|
| 16 |
+
}
|
models/production_model_final/vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
requirements.txt
CHANGED
|
@@ -1,3 +1,11 @@
|
|
| 1 |
-
|
|
|
|
| 2 |
pandas
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch>=2.1.0
|
| 2 |
+
transformers>=4.35.0
|
| 3 |
pandas
|
| 4 |
+
numpy
|
| 5 |
+
scikit-learn
|
| 6 |
+
nltk>=3.7
|
| 7 |
+
PyPDF2>=3.0.1
|
| 8 |
+
streamlit>=1.22.0
|
| 9 |
+
safetensors>=0.4.0
|
| 10 |
+
boto3
|
| 11 |
+
# removed invalid line: "pip freeze > requirements.txt" is a shell command, not a requirement
|
src/__init__.py
ADDED
|
File without changes
|
src/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (147 Bytes). View file
|
|
|
src/__pycache__/inference.cpython-313.pyc
ADDED
|
Binary file (20.3 kB). View file
|
|
|
src/inference.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from pathlib import Path
import PyPDF2
import json
from datetime import datetime
from typing import Union, List, Dict
import re

# NLTK with robust error handling
import nltk
import ssl

# Some deployment environments have a broken/missing certificate store which
# makes nltk.download() fail with SSL errors. Fall back to unverified HTTPS
# only when the private CPython helper exists (AttributeError otherwise).
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Enhanced NLTK data download with retry
def download_nltk_data_robust():
    """Download NLTK data with multiple attempts and fallbacks.

    Ensures the 'punkt' and 'punkt_tab' tokenizer data are present, trying up
    to three times per package. All failures are logged and tolerated: callers
    fall back to the regex tokenizer defined below, so this never raises.
    """
    import os

    # Set NLTK data path explicitly
    # NOTE(review): hard-coded container home dir (HF Spaces 'appuser');
    # confirm this path if the app is deployed elsewhere.
    nltk_data_dir = '/home/appuser/nltk_data'
    if not os.path.exists(nltk_data_dir):
        try:
            os.makedirs(nltk_data_dir, exist_ok=True)
        except:
            # Best effort: if the dir cannot be created, NLTK's default
            # search paths are still used.
            pass

    # Make our directory the first place NLTK looks for data.
    if nltk_data_dir not in nltk.data.path:
        nltk.data.path.insert(0, nltk_data_dir)

    # 'punkt_tab' is required by newer NLTK releases in addition to 'punkt'.
    packages = ['punkt', 'punkt_tab']
    for package in packages:
        for attempt in range(3):  # Try 3 times
            try:
                # Skip the download entirely when the data is already present.
                nltk.data.find(f'tokenizers/{package}')
                print(f"β {package} already available")
                break
            except LookupError:
                try:
                    print(f"Downloading {package} (attempt {attempt + 1})...")
                    nltk.download(package, download_dir=nltk_data_dir, quiet=False)
                    print(f"β {package} downloaded successfully")
                    break
                except Exception as e:
                    # Network/SSL failures are tolerated; safe_sent_tokenize()
                    # below degrades to the regex splitter.
                    print(f"Warning: Could not download {package}: {e}")
                    if attempt == 2:
                        print(f"Failed to download {package} after 3 attempts")

# Download on import
download_nltk_data_robust()

# Fallback sentence tokenizer using regex
def simple_sentence_tokenize(text):
    """Regex-based sentence splitter used when NLTK is unavailable.

    Breaks the input on whitespace that follows '.', '!' or '?', strips each
    piece, and drops empty pieces.
    """
    out = []
    for chunk in re.split(r'(?<=[.!?])\s+', text):
        stripped = chunk.strip()
        if stripped:
            out.append(stripped)
    return out

# Safe sentence tokenization with fallback
def safe_sent_tokenize(text):
    """Split *text* into sentences with NLTK; degrade to regex on any failure.

    Any exception (missing punkt data, import failure, tokenizer error) is
    reported and answered with simple_sentence_tokenize().
    """
    try:
        from nltk.tokenize import sent_tokenize
        result = sent_tokenize(text)
    except Exception as e:
        print(f"NLTK tokenization failed ({e}), using fallback...")
        result = simple_sentence_tokenize(text)
    return result
class CausalityClassifier:
    """Binary drug-causality classifier wrapping a fine-tuned BioBERT checkpoint.

    Label 1 = "related" (drug-caused adverse event), label 0 = "not related".
    """

    # Max token length used during fine-tuning; longer inputs are truncated.
    MAX_LENGTH = 96

    def __init__(self, model_path='./models/production_model_final', threshold=0.5):
        """Load tokenizer and model from *model_path*.

        Args:
            model_path: directory containing a HF-format checkpoint.
            threshold: default probability of the "related" class above
                which an input is flagged as related.
        """
        self.model_path = Path(model_path)
        self.threshold = threshold
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_path)
        self.model.eval()  # inference only: disable dropout etc.

    def predict(self, text, return_probs=False, threshold=None):
        """Classify a single text.

        Args:
            text: input sentence or short paragraph.
            return_probs: include the per-class probabilities in the result.
            threshold: optional per-call override of ``self.threshold``
                (backward-compatible addition; defaults to the instance value).

        Returns:
            dict with 'prediction' ('related'/'not related'), 'confidence'
            (probability of the predicted class) and integer 'label', plus
            a 'probabilities' dict when *return_probs* is true.
        """
        active_threshold = self.threshold if threshold is None else threshold
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True,
                                padding=True, max_length=self.MAX_LENGTH)
        with torch.no_grad():
            outputs = self.model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1).numpy()[0]
        # Decision is taken on the "related" probability only.
        pred = 1 if probs[1] > active_threshold else 0
        result = {
            'prediction': 'related' if pred == 1 else 'not related',
            'confidence': float(probs[pred]),
            'label': int(pred)
        }
        if return_probs:
            result['probabilities'] = {
                'not_related': float(probs[0]),
                'related': float(probs[1])
            }
        return result
def extract_text_from_pdf(pdf_path):
    """Extract and concatenate the text of every page of *pdf_path*.

    BUG FIX: ``PageObject.extract_text()`` may return None (e.g. scanned
    image pages); the original ``text += page.extract_text()`` raised
    TypeError in that case. Such pages are now skipped. Also builds the
    result with ''.join() instead of quadratic string concatenation.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        parts = []
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            if page_text:
                parts.append(page_text)
        return "".join(parts)
# Cache of loaded classifiers keyed by model path so that repeated document
# analyses do not reload the (~400 MB) checkpoint from disk on every call.
_CLASSIFIER_CACHE = {}


def classify_causality(pdf_text, model_path='./models/production_model_final', threshold=0.5, verbose=False):
    """Sentence-level causality screen of a document.

    Every sentence is classified independently; the document is labelled
    'related' as soon as at least one sentence is predicted related at the
    given *threshold*.

    Args:
        pdf_text: full document text.
        model_path: checkpoint directory for CausalityClassifier.
        threshold: probability cutoff for the "related" class.
        verbose: print the sentence count when true.

    Returns:
        dict with the overall classification, the fraction of related
        sentences ('confidence_score'), sentence counts, the top-5 related
        sentences, and the threshold used.
    """
    # PERF FIX: reuse a cached classifier instead of reloading the model on
    # every call; the caller's threshold is applied to the cached instance.
    classifier = _CLASSIFIER_CACHE.get(model_path)
    if classifier is None:
        classifier = CausalityClassifier(model_path, threshold)
        _CLASSIFIER_CACHE[model_path] = classifier
    classifier.threshold = threshold

    # Use safe tokenization with fallback
    sentences = safe_sent_tokenize(pdf_text)

    if verbose:
        print(f"Tokenized {len(sentences)} sentences")

    related_count = 0
    sentence_details = []

    for sent in sentences:
        if not sent.strip():
            continue

        result = classifier.predict(sent, return_probs=True)
        if result['label'] == 1:
            related_count += 1
            sentence_details.append({
                'sentence': sent[:100],  # truncate for report readability
                'probability_related': result['probabilities']['related'],
                'confidence': result['confidence']
            })

    # Most confident related sentences first.
    sentence_details.sort(key=lambda x: x['probability_related'], reverse=True)

    return {
        'final_classification': 'related' if related_count > 0 else 'not related',
        'confidence_score': related_count / len(sentences) if sentences else 0,
        'related_sentences': related_count,
        'total_sentences': len(sentences),
        'top_related_sentences': sentence_details[:5],
        'threshold_used': threshold
    }
def process_pdf_file(pdf_path, model_path='./models/production_model_final', threshold=0.5, save_report=False, output_dir='./results'):
    """Analyze one PDF and optionally persist the JSON report to *output_dir*."""
    document_text = extract_text_from_pdf(pdf_path)
    report = classify_causality(document_text, model_path, threshold)
    report['pdf_file'] = str(Path(pdf_path).name)
    if save_report:
        out_dir = Path(output_dir)
        out_dir.mkdir(parents=True, exist_ok=True)
        report_path = out_dir / f"{Path(pdf_path).stem}_report.json"
        with open(report_path, 'w') as f:
            json.dump(report, f, indent=2)
    return report
def process_multiple_pdfs(pdf_paths, model_path='./models/production_model_final', threshold=0.5, save_reports=False, output_dir='./results'):
    """Analyze several PDFs; per-file failures are recorded, never raised."""
    reports = []
    for path in pdf_paths:
        try:
            reports.append(process_pdf_file(path, model_path, threshold, save_reports, output_dir))
        except Exception as exc:
            # Keep processing the remaining files; surface the failure in
            # the report entry for this one.
            reports.append({
                'pdf_file': str(Path(path).name),
                'error': str(exc),
                'final_classification': 'error'
            })
    return reports
streamlit_app.py
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import tempfile
import os
import sys
import pandas as pd
import json
from pathlib import Path
import nltk

# ROBUSTNESS FIX: the original unconditional nltk.download('punkt') crashed
# the whole app when offline, and newer NLTK versions also need 'punkt_tab'.
# Make the download best-effort; src.inference has its own fallback tokenizer.
for _pkg in ('punkt', 'punkt_tab'):
    try:
        nltk.download(_pkg, quiet=True)
    except Exception:
        pass

# Add parent directory to path so the 'src' package is importable when this
# file is run from a repository subdirectory.
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.inference import (
    CausalityClassifier,
    extract_text_from_pdf,
    classify_causality,
    process_pdf_file,
    process_multiple_pdfs
)

# SINGLE load_model function with caching
@st.cache_resource
def load_model():
    """Load CausalityClassifier model once and reuse across sessions.

    Returns None (after showing a Streamlit error) when loading fails so the
    rest of the UI can degrade gracefully instead of crashing.
    """
    try:
        # Relative path: assumes the app is started from the repository root.
        return CausalityClassifier("models/production_model_final")
    except Exception as e:
        st.error(f"Failed to load model: {e}")
        return None
# App Configuration
st.set_page_config(
    page_title="Drug Causality Classifier",
    page_icon="π",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Main Title
st.title("π Drug Causality Classifier")
st.caption("BioBERT Model | F1 Score: 97.59% | Sensitivity: 98.68% | Specificity: 96.50%")

# Load model (cached across sessions by st.cache_resource)
classifier = load_model()

# Sidebar Configuration
st.sidebar.header("βοΈ Configuration")
# Probability-of-"related" cutoff shared by all three tabs below.
threshold = st.sidebar.slider(
    "Classification Threshold",
    min_value=0.0,
    max_value=1.0,
    value=0.5,
    step=0.05,
    help="Higher threshold = stricter causality detection"
)

st.sidebar.info(
    "**Threshold Guide:**\n"
    "- 0.3-0.4: High sensitivity (catch all events)\n"
    "- 0.5: Balanced performance\n"
    "- 0.7-0.8: High precision (reduce false alarms)"
)
# Main Content
tab1, tab2, tab3 = st.tabs(["π Single Text", "π PDF Analysis", "π Batch Processing"])

# TAB 1: Single Text Classification
with tab1:
    st.header("π Single Statement Classification")
    st.write("Enter medical text to classify drug-adverse event causality:")

    text_input = st.text_area(
        "Medical Text:",
        height=150,
        placeholder="e.g., Patient developed severe nausea and vomiting 2 hours after taking Drug X. Clinical assessment confirmed drug-related causality."
    )

    col1, col2 = st.columns([2, 1])

    with col1:
        if st.button("π Classify Text", type="primary", use_container_width=True):
            if text_input and classifier:
                with st.spinner("Analyzing text..."):
                    # BUG FIX: the second positional argument of predict() is
                    # return_probs, not the threshold -- the sidebar threshold
                    # was silently ignored (and only accidentally enabled
                    # probabilities because 0.5 is truthy). Apply the threshold
                    # explicitly and request probabilities by keyword.
                    classifier.threshold = threshold
                    result = classifier.predict(text_input, return_probs=True)

                # Display Results
                st.subheader("π Results")

                result_col1, result_col2 = st.columns(2)

                with result_col1:
                    classification = result['prediction'].upper()
                    color = "green" if result['prediction'] == 'related' else "red"
                    st.markdown(f"**Classification:** :{color}[{classification}]")

                with result_col2:
                    confidence_pct = result['confidence'] * 100
                    st.metric("Confidence", f"{confidence_pct:.1f}%")

                # Probability Distribution
                st.subheader("π Probability Distribution")
                probs = result['probabilities']

                # Progress bars
                st.write("**Related (Drug-Caused):**")
                st.progress(probs['related'], text=f"{probs['related']:.2%}")

                st.write("**Not Related:**")
                st.progress(probs['not_related'], text=f"{probs['not_related']:.2%}")

                # Raw JSON Output
                with st.expander("π View Raw Results"):
                    st.json(result)

            elif not classifier:
                st.error("Model not loaded properly.")
            else:
                st.warning("Please enter text to classify.")

    with col2:
        st.info(
            "**Example Inputs:**\n\n"
            "**Related:** _Patient developed rash after taking aspirin. Symptoms resolved after discontinuation._\n\n"
            "**Not Related:** _Patient has a history of diabetes and hypertension. Takes metformin daily._"
        )
# TAB 2: PDF Analysis
with tab2:
    st.header("π PDF Document Analysis")
    st.write("Upload a PDF document for comprehensive drug-adverse event analysis:")

    pdf_file = st.file_uploader(
        "Choose a PDF file",
        type=["pdf"],
        help="Upload medical documents, case reports, or clinical notes"
    )

    if pdf_file and classifier:
        # Save uploaded file temporarily so PyPDF2 can open it by path.
        temp_dir = tempfile.gettempdir()
        temp_path = os.path.join(temp_dir, pdf_file.name)

        with open(temp_path, "wb") as tmp_f:
            tmp_f.write(pdf_file.getbuffer())

        # Analysis Button
        if st.button("π Analyze PDF", type="primary", use_container_width=True):
            with st.spinner(f"Processing {pdf_file.name}..."):
                try:
                    # Extract and classify with the sidebar threshold.
                    pdf_text = extract_text_from_pdf(temp_path)
                    results = classify_causality(pdf_text, threshold=threshold)

                    # Display Summary
                    st.subheader("π Analysis Summary")

                    summary_col1, summary_col2, summary_col3 = st.columns(3)

                    with summary_col1:
                        classification = results['final_classification'].upper()
                        color = "green" if results['final_classification'] == 'related' else "red"
                        st.markdown(f"**Overall:** :{color}[{classification}]")

                    with summary_col2:
                        confidence_pct = results['confidence_score'] * 100
                        st.metric("Confidence", f"{confidence_pct:.1f}%")

                    with summary_col3:
                        st.metric("Total Sentences", results['total_sentences'])

                    # Sentence Breakdown
                    st.subheader("π Sentence Analysis")

                    breakdown_col1, breakdown_col2 = st.columns(2)

                    with breakdown_col1:
                        st.metric("Related Sentences", results['related_sentences'])

                    with breakdown_col2:
                        # BUG FIX: classify_causality() never returns a
                        # 'not_related_sentences' key; the original lookup
                        # raised KeyError. Derive the count instead.
                        st.metric("Not Related", results['total_sentences'] - results['related_sentences'])

                    # Top Related Sentences
                    if results['related_sentences'] > 0:
                        st.subheader("π― Top Related Sentences")

                        for i, sent_detail in enumerate(results.get('top_related_sentences', []), 1):
                            confidence = sent_detail['probability_related']
                            confidence_color = "green" if confidence > 0.7 else "orange" if confidence > 0.5 else "red"

                            st.markdown(f"**{i}.** ({confidence:.1%} confidence)")
                            st.markdown(f":{confidence_color}[{sent_detail['sentence']}]")
                            st.write("")

                    # Download Button
                    st.subheader("πΎ Download Report")

                    report_json = json.dumps(results, indent=2)

                    st.download_button(
                        label="π₯ Download JSON Report",
                        data=report_json,
                        file_name=f"{pdf_file.name}_causality_report.json",
                        mime="application/json"
                    )

                    # Raw Results Expander
                    with st.expander("π View Full Results"):
                        st.json(results)

                except Exception as e:
                    st.error(f"Error processing PDF: {str(e)}")
                    st.info("Please ensure the PDF contains readable text and try again.")

                # Clean up temp file regardless of success/failure.
                finally:
                    try:
                        os.remove(temp_path)
                    except OSError:
                        # File may never have been created or already removed.
                        pass
# TAB 3: Batch Processing
with tab3:
    st.header("π Batch PDF Processing")
    st.write("Upload multiple PDF files for batch causality analysis:")

    batch_files = st.file_uploader(
        "Choose PDF files",
        type=["pdf"],
        accept_multiple_files=True,
        help="Upload multiple medical documents for batch analysis"
    )

    if batch_files and classifier:
        st.write(f"**Selected files:** {len(batch_files)} PDFs")

        # Echo the selection so the user can verify before processing.
        for i, file in enumerate(batch_files, 1):
            st.write(f"{i}. {file.name}")

        if st.button("π Process All PDFs", type="primary", use_container_width=True):
            # Create temporary paths for all files (PyPDF2 reads from disk).
            batch_temp_paths = []
            temp_dir = tempfile.gettempdir()

            try:
                # Save all files temporarily
                # NOTE(review): files are keyed by their original name, so two
                # uploads with the same filename would overwrite each other.
                for batch_file in batch_files:
                    temp_path = os.path.join(temp_dir, batch_file.name)
                    with open(temp_path, "wb") as tmp_f:
                        tmp_f.write(batch_file.getbuffer())
                    batch_temp_paths.append(temp_path)

                # Process all files; per-file errors come back as entries
                # with an 'error' key rather than raising.
                with st.spinner(f"Processing {len(batch_files)} files..."):
                    batch_results = process_multiple_pdfs(batch_temp_paths, threshold=threshold)

                # Display Batch Summary
                st.subheader("π Batch Analysis Summary")

                # Overall stats
                total_files = len(batch_results)
                successful = len([r for r in batch_results if 'error' not in r])
                related_count = len([r for r in batch_results if r.get('final_classification') == 'related'])

                stat_col1, stat_col2, stat_col3 = st.columns(3)

                with stat_col1:
                    st.metric("Total Files", total_files)

                with stat_col2:
                    st.metric("Successfully Processed", successful)

                with stat_col3:
                    st.metric("Drug-Related Files", related_count)

                # Individual Results
                st.subheader("π Individual Results")

                for i, res in enumerate(batch_results, 1):
                    if 'error' in res:
                        st.error(f"**{i}. {res['pdf_file']}:** Error - {res['error']}")
                    else:
                        classification = res['final_classification'].upper()
                        confidence = res.get('confidence_score', 0) * 100
                        color = "green" if res['final_classification'] == 'related' else "red"

                        st.markdown(f"**{i}. {res['pdf_file']}:** :{color}[{classification}] ({confidence:.1f}% confidence)")

                # Download Batch Summary
                st.subheader("πΎ Download Batch Report")

                batch_report = {
                    'summary': {
                        'total_files': total_files,
                        'successful': successful,
                        'related_count': related_count,
                        'threshold_used': threshold
                    },
                    'individual_results': batch_results
                }

                batch_json = json.dumps(batch_report, indent=2)

                st.download_button(
                    label="π₯ Download Batch Summary",
                    data=batch_json,
                    file_name="batch_causality_summary.json",
                    mime="application/json"
                )

                # Raw Results Expander
                with st.expander("π View Full Batch Results"):
                    st.json(batch_results)

            except Exception as e:
                st.error(f"Batch processing error: {str(e)}")

            finally:
                # Clean up all temp files; ignore individual removal failures.
                for temp_path in batch_temp_paths:
                    try:
                        os.remove(temp_path)
                    except:
                        pass
# Footer
st.markdown("---")
st.markdown(
    "**Built with BioBERT for Pharmacovigilance** | "
    "Developed for clinical decision support and regulatory compliance"
)

# Sidebar additional info: static metrics reported from the training run
# (see models/production_model_final/training_config.json).
st.sidebar.markdown("---")
st.sidebar.markdown("### π Model Performance")
st.sidebar.markdown(
    "- **F1 Score:** 97.59%\n"
    "- **Accuracy:** 97.59%\n"
    "- **Sensitivity:** 98.68%\n"
    "- **Specificity:** 96.50%"
)

st.sidebar.markdown("### π₯ Clinical Use")
st.sidebar.markdown(
    "This tool assists in:\n"
    "- Adverse event detection\n"
    "- Pharmacovigilance screening\n"
    "- Clinical report analysis\n"
    "- Regulatory compliance"
)