Spaces:

mlkorra
/

Product-doc-classifier

Sleeping

App Files Files Community

mlkorra committed on Jan 11, 2025

Commit

dcb2841

verified ·

1 Parent(s): 357c0c8

Add pages

Browse files

Files changed (3) hide show

pages/Classifier.py +288 -0
pages/Home.py +54 -0
pages/Project_Wiki.py +274 -0

pages/Classifier.py ADDED Viewed

	@@ -0,0 +1,288 @@

+import streamlit as st
+from utils.util_classifier import TextClassificationPipeline
+import time
+import requests
+import io
+import pdfplumber
+from urllib.parse import urlparse
+import plotly.graph_objects as go
+import plotly.express as px
+def validate_url(url):
+    try:
+        result = urlparse(url)
+        return all([result.scheme, result.netloc])
+    except:
+        return False
+def download_pdf(url):
+    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': 'application/pdf,*/*',
+            'Referer': 'https://www.inter-lux.com/'
+        }
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()
+        # Verify content type is PDF
+        content_type = response.headers.get('content-type', '')
+        if 'application/pdf' not in content_type.lower():
+            raise ValueError(f"URL does not point to a PDF file. Content-Type: {content_type}")
+        return io.BytesIO(response.content)
+    except Exception as e:
+        st.error(f"Download error: {str(e)}")
+        return None
+def extract_text(pdf_file):
+    try:
+        # Reset file pointer
+        pdf_file.seek(0)
+        with pdfplumber.open(pdf_file) as pdf:
+            text = ""
+            for page in pdf.pages:
+                extracted = page.extract_text()
+                if extracted:
+                    text += extracted + "\n"
+            if not text.strip():
+                raise ValueError("No text could be extracted from the PDF")
+            return text.strip()
+    except Exception as e:
+        st.error(f"Text extraction error: {str(e)}")
+        return None
+def main():
+    st.title("🎯 Document Classifier")
+    # Model selection
+    method = "bertbased"
+    # Initialize classifier
+    classifier = TextClassificationPipeline(method=method)
+    # File input tabs
+    tab1, tab2 = st.tabs(["🔗 URL Input", "📁 File Upload"])
+    with tab1:
+        url = st.text_input("Enter PDF URL")
+        process_btn = st.button("Classify Document", key="url_classify")
+        if process_btn and url:
+            if not validate_url(url):
+                st.error("Please enter a valid URL")
+                return
+            progress_container = st.container()
+            with progress_container:
+                # Step 1: Downloading
+                with st.spinner("Downloading PDF..."):
+                    pdf_file = download_pdf(url)
+                    if pdf_file is None:
+                        return
+                    st.success("PDF downloaded successfully!")
+                # Step 2: Extracting Text
+                with st.spinner("Extracting text from PDF..."):
+                    text = extract_text(pdf_file)
+                    if text is None or len(text.strip()) == 0:
+                        return
+                    st.success("Text extracted successfully!")
+                    with st.expander("View Extracted Text"):
+                        st.text(text[:500] + "..." if len(text) > 500 else text)
+                # Step 3: Classification
+                with st.spinner("Classifying document..."):
+                    result = classifier.predict(text, return_probability=True)
+                    if isinstance(result, list):
+                        result = result[0]
+                # Display results
+                def create_gauge_chart(confidence):
+                    """Create a gauge chart for confidence score"""
+                    fig = go.Figure(go.Indicator(
+                        mode = "gauge+number+delta",
+                        value = confidence * 100,
+                        domain = {'x': [0, 1], 'y': [0, 1]},
+                        gauge = {
+                            'axis': {'range': [None, 100], 'tickwidth': 1, 'tickcolor': "darkblue"},
+                            'bar': {'color': "darkblue"},
+                            'bgcolor': "white",
+                            'borderwidth': 2,
+                            'bordercolor': "gray",
+                            'steps': [
+                                {'range': [0, 50], 'color': '#FF9999'},
+                                {'range': [50, 75], 'color': '#FFCC99'},
+                                {'range': [75, 100], 'color': '#99FF99'}
+                            ],
+                        },
+                        title = {'text': "Confidence Score"}
+                    ))
+                    fig.update_layout(
+                        height=300,
+                        margin=dict(l=10, r=10, t=50, b=10),
+                        paper_bgcolor='rgba(0,0,0,0)',
+                        font={'color': "darkblue", 'family': "Arial"}
+                    )
+                    return fig
+                def create_probability_chart(probabilities):
+                    """Create a horizontal bar chart for probability distribution"""
+                    labels = list(probabilities.keys())
+                    values = list(probabilities.values())
+                    fig = go.Figure()
+                    # Add bars
+                    fig.add_trace(go.Bar(
+                        y=labels,
+                        x=[v * 100 for v in values],
+                        orientation='h',
+                        marker=dict(
+                            color=[px.colors.sequential.Blues[i] for i in range(2, len(labels) + 2)],
+                            line=dict(color='rgba(0,0,0,0.8)', width=2)
+                        ),
+                        text=[f'{v:.1f}%' for v in [v * 100 for v in values]],
+                        textposition='auto',
+                    ))
+                    # Update layout
+                    fig.update_layout(
+                        title=dict(
+                            text='Probability Distribution',
+                            y=0.95,
+                            x=0.5,
+                            xanchor='center',
+                            yanchor='top',
+                            font=dict(size=20, color='darkblue')
+                        ),
+                        xaxis_title="Probability (%)",
+                        yaxis_title="Categories",
+                        height=400,
+                        margin=dict(l=20, r=20, t=70, b=20),
+                        paper_bgcolor='rgba(0,0,0,0)',
+                        plot_bgcolor='rgba(0,0,0,0)',
+                        font=dict(family="Arial", size=14),
+                        showlegend=False
+                    )
+                    # Update axes
+                    fig.update_xaxes(
+                        range=[0, 100],
+                        gridcolor='rgba(0,0,0,0.1)',
+                        zerolinecolor='rgba(0,0,0,0.2)'
+                    )
+                    fig.update_yaxes(
+                        gridcolor='rgba(0,0,0,0.1)',
+                        zerolinecolor='rgba(0,0,0,0.2)'
+                    )
+                    return fig
+                # Update the results display section
+                def display_results(result):
+                    """Display classification results with modern visualizations"""
+                    # Create three columns for the results
+                    col1, col2 = st.columns([1, 2])
+                    with col1:
+                        # Predicted Category Card
+                        st.markdown("""
+                            <div style='
+                                background-color: white;
+                                padding: 20px;
+                                border-radius: 10px;
+                                box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+                                text-align: center;
+                                margin-bottom: 20px;
+                            '>
+                                <h4 style='color: #1f77b4; margin-bottom: 10px;'>Predicted Category</h4>
+                                <p style='
+                                    font-size: 24px;
+                                    font-weight: bold;
+                                    color: #2c3e50;
+                                    margin: 0;
+                                    padding: 10px;
+                                    background-color: #f8f9fa;
+                                    border-radius: 5px;
+                                '>{}</p>
+                            </div>
+                        """.format(result['predicted_label']), unsafe_allow_html=True)
+                        # Confidence Gauge
+                        st.plotly_chart(create_gauge_chart(result['confidence']), use_container_width=True)
+                    with col2:
+                        # Probability Distribution
+                        st.plotly_chart(create_probability_chart(result['probabilities']), use_container_width=True)
+                    # Add metadata section
+                    with st.expander("📊 Classification Details"):
+                        st.markdown(f"""
+                            - **Model Type**: {result['model_type'].title()}
+                            - **Document Length**: {len(result['text'])} characters
+                        """)
+                # Update the main classification results section
+                # Replace the existing results display with:
+                st.markdown("### 📊 Classification Results")
+                display_results(result)
+    with tab2:
+        uploaded_file = st.file_uploader("Upload PDF file", type="pdf")
+        process_btn = st.button("Classify Document", key="file_classify")
+        if process_btn and uploaded_file:
+            with st.spinner("Processing uploaded PDF..."):
+                text = extract_text(uploaded_file)
+                if text is None:
+                    return
+                result = classifier.predict(text, return_probability=True)
+                if isinstance(result, list):
+                    result = result[0]
+                # Display results (same as URL tab)
+                st.markdown("### 📊 Classification Results")
+                confidence = result['confidence']
+                st.markdown(f"""
+                    <div class="confidence-meter">
+                        <div class="meter-fill" style="width: {confidence*100}%"></div>
+                        <span class="meter-text">{confidence:.1%} Confident</span>
+                    </div>
+                """, unsafe_allow_html=True)
+                st.markdown(f"""
+                    <div class="result-card">
+                        <h4>Predicted Category</h4>
+                        <p class="prediction">{result['predicted_label']}</p>
+                    </div>
+                """, unsafe_allow_html=True)
+                st.markdown("#### Probability Distribution")
+                for label, prob in result['probabilities'].items():
+                    st.markdown(f"""
+                        <div class="prob-bar">
+                            <span class="label">{label}</span>
+                            <div class="bar">
+                                <div class="fill" style="width: {prob*100}%"></div>
+                            </div>
+                            <span class="value">{prob:.1%}</span>
+                        </div>
+                    """, unsafe_allow_html=True)
+main()

pages/Home.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import streamlit as st
+from pathlib import Path
+st.title("🏗️ ConstructAI - Smart Document Classifier")
+# Hero section
+st.markdown("""
+    <div class="hero-section">
+        <h4>Automate your construction document classification with AI-powered accuracy</h4>
+    </div>
+""", unsafe_allow_html=True)
+# Key Features
+st.markdown("### 🚀 Key Features")
+col1, col2, col3 = st.columns(3)
+with col1:
+    st.markdown("""
+        <div class="feature-card">
+            <h4>🎯 Precise Classification</h4>
+            <p>Advanced AI models for accurate document categorization</p>
+        </div>
+    """, unsafe_allow_html=True)
+with col2:
+    st.markdown("""
+        <div class="feature-card">
+            <h4>⚡ Instant Results</h4>
+            <p>Get classifications in seconds, not hours</p>
+        </div>
+    """, unsafe_allow_html=True)
+with col3:
+    st.markdown("""
+        <div class="feature-card">
+            <h4>📊 Detailed Analytics</h4>
+            <p>Confidence scores and detailed predictions</p>
+        </div>
+    """, unsafe_allow_html=True)
+# Use Cases
+st.divider()
+# Call to Action
+st.markdown("""
+    <div class="cta-section">
+        <h3>Ready to Get Started?</h3>
+        <p>Try our classifier now and experience the power of AI in construction document management.</p>
+    </div>
+""", unsafe_allow_html=True)
+if st.button("Try Classifier Now →", key="cta_button"):
+    st.switch_page("pages/Classifier.py")

pages/Project_Wiki.py ADDED Viewed

	@@ -0,0 +1,274 @@

+import streamlit as st
+import pandas as pd
+import plotly.express as px
+def main():
+    """Render the project documentation page.
+    Emits a page title, a block of inline CSS, six static question/answer
+    cards (Q1-Q6) written as raw HTML, a highlights box, and a grouped bar
+    chart comparing the baseline and BERT scores. All figures shown are
+    hard-coded in this function; nothing is computed at runtime.
+    """
+    st.title("📚 Project Documentation")
+    # Custom CSS for better styling
+    # Defines the .question-card / .question / .answer classes used by the
+    # HTML cards below.
+    st.markdown("""
+        <style>
+        .question-card {
+            background-color: #f8f9fa;
+            padding: 20px;
+            border-radius: 10px;
+            border-left: 5px solid #1f77b4;
+            margin: 20px 0;
+        }
+        .question {
+            color: #1f77b4;
+            font-size: 1.2em;
+            font-weight: bold;
+            margin-bottom: 15px;
+        }
+        .answer {
+            color: #2c3e50;
+            line-height: 1.6;
+        }
+        </style>
+    """, unsafe_allow_html=True)
+    # Q1: Development Timeline
+    st.markdown("""
+        <div class="question-card">
+            <div class="question">⏱️ Q1: How long did it take to solve the problem?</div>
+            <div class="answer">
+                The solution was developed in approximately <b>5 hours</b> (excluding data collection and model training phases).
+            </div>
+        </div>
+    """, unsafe_allow_html=True)
+    # Q2: Solution Explanation
+    st.markdown("""
+        <div class="question-card">
+            <div class="question">🔍 Q2: Can you explain your solution approach?</div>
+            <div class="answer">
+                The solution implements a multi-stage document classification pipeline:
+                <br><br>
+                <b>1. Direct URL Text Approach:</b>
+                <ul>
+                    <li>Initially considered direct URL text extraction</li>
+                    <li>Found limitations in accuracy and reliability</li>
+                </ul>
+                <br>
+                <b>2. Baseline Approach (ML Model):</b>
+                <ul>
+                    <li>Implemented TF-IDF vectorization</li>
+                    <li>Used Logistic Regression for classification</li>
+                    <li>Provided quick and efficient results</li>
+                </ul>
+                <br>
+                <b>3. (DL Model):</b>
+                <ul>
+                    <li>Utilized BERT-based model architecture</li>
+                    <li>Fine-tuned on construction document dataset</li>
+                    <li>Achieved superior accuracy and context understanding</li>
+                </ul>
+            </div>
+        </div>
+    """, unsafe_allow_html=True)
+    # Q3: Model Selection
+    st.markdown("""
+        <div class="question-card">
+            <div class="question">🤖 Q3: Which models did you use and why?</div>
+            <div class="answer">
+                Implemented baseline using TF-IDF and Logistic Regression and then used BERT-based model:
+                <br><br>
+                <b>Baseline Model:</b>
+                <ul>
+                    <li>TF-IDF + Logistic Regression</li>
+                    <li>Quick inference time</li>
+                    <li>Resource-efficient</li>
+                </ul>
+                <br>
+                <b>BERT Model:</b>
+                <ul>
+                    <li>Fine-tuned on 1800 samples text</li>
+                    <li>Better context understanding</li>
+                    <li>Better handling of complex documents</li>
+                </ul>
+            </div>
+        </div>
+    """, unsafe_allow_html=True)
+    # Q4: Limitations and Improvements
+    st.markdown("""
+        <div class="question-card">
+            <div class="question">⚠️ Q4: What are the current limitations and potential improvements?</div>
+            <div class="answer">
+                <b>Current Implementation & Limitations:</b>
+                <ul>
+                    <li>~25% of dataset URLs were inaccessible</li>
+                    <li>Used Threadpooling for parallel downloading of train and test documents</li>
+                </ul>
+                <br>
+                <b>Proposed Improvements:</b>
+                <ul>
+                    <li>Use latest LLMs like GPT-4o, Claude 3.5 Sonnet etc with few shot prompting to speed up the development process</li>
+                    <li>Optimize inference pipeline for faster processing using distilled models like DistilBERT, or the last BERT based model - ModernBERT to compare the performance</li>
+                    <li>Add support for more document formats</li>
+                </ul>
+            </div>
+        </div>
+    """, unsafe_allow_html=True)
+    # Q5: Model Performance
+    # Static per-category precision/recall/F1/support table written as HTML.
+    st.markdown("""
+            <div class="question-card">
+            <div class="question">📊 Q5: What is the model's performance on test data?</div>
+            <div class="answer">
+                <b>BERT Model Performance:</b>
+                <br><br>
+                <div style="overflow-x: auto;">
+                    <table style="
+                        width: 100%;
+                        border-collapse: collapse;
+                        margin: 20px 0;
+                        font-size: 0.9em;
+                        font-family: sans-serif;
+                        box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
+                        border-radius: 5px;
+                    ">
+                        <thead>
+                            <tr style="
+                                background-color: #1f77b4;
+                                color: white;
+                                text-align: left;
+                            ">
+                                <th style="padding: 12px 15px;">Category</th>
+                                <th style="padding: 12px 15px;">Precision</th>
+                                <th style="padding: 12px 15px;">Recall</th>
+                                <th style="padding: 12px 15px;">F1-Score</th>
+                                <th style="padding: 12px 15px;">Support</th>
+                            </tr>
+                        </thead>
+                        <tbody>
+                            <tr style="border-bottom: 1px solid #dddddd;">
+                                <td style="padding: 12px 15px;"><b>Cable</b></td>
+                                <td style="padding: 12px 15px;">1.00</td>
+                                <td style="padding: 12px 15px;">1.00</td>
+                                <td style="padding: 12px 15px;">1.00</td>
+                                <td style="padding: 12px 15px;">92</td>
+                            </tr>
+                            <tr style="border-bottom: 1px solid #dddddd; background-color: #f3f3f3;">
+                                <td style="padding: 12px 15px;"><b>Fuses</b></td>
+                                <td style="padding: 12px 15px;">0.95</td>
+                                <td style="padding: 12px 15px;">1.00</td>
+                                <td style="padding: 12px 15px;">0.98</td>
+                                <td style="padding: 12px 15px;">42</td>
+                            </tr>
+                            <tr style="border-bottom: 1px solid #dddddd;">
+                                <td style="padding: 12px 15px;"><b>Lighting</b></td>
+                                <td style="padding: 12px 15px;">0.94</td>
+                                <td style="padding: 12px 15px;">1.00</td>
+                                <td style="padding: 12px 15px;">0.97</td>
+                                <td style="padding: 12px 15px;">74</td>
+                            </tr>
+                            <tr style="border-bottom: 1px solid #dddddd; background-color: #f3f3f3;">
+                                <td style="padding: 12px 15px;"><b>Others</b></td>
+                                <td style="padding: 12px 15px;">1.00</td>
+                                <td style="padding: 12px 15px;">0.92</td>
+                                <td style="padding: 12px 15px;">0.96</td>
+                                <td style="padding: 12px 15px;">83</td>
+                            </tr>
+                        </tbody>
+                        <tfoot>
+                            <tr style="background-color: #f8f9fa; font-weight: bold; border-top: 2px solid #dddddd;">
+                                <td style="padding: 12px 15px;">Accuracy</td>
+                                <td style="padding: 12px 15px;" colspan="3">0.98</td>
+                                <td style="padding: 12px 15px;">291</td>
+                            </tr>
+                            <tr style="background-color: #f8f9fa; color: #666;">
+                                <td style="padding: 12px 15px;">Macro Avg</td>
+                                <td style="padding: 12px 15px;">0.97</td>
+                                <td style="padding: 12px 15px;">0.98</td>
+                                <td style="padding: 12px 15px;">0.98</td>
+                                <td style="padding: 12px 15px;">291</td>
+                            </tr>
+                            <tr style="background-color: #f8f9fa; color: #666;">
+                                <td style="padding: 12px 15px;">Weighted Avg</td>
+                                <td style="padding: 12px 15px;">0.98</td>
+                                <td style="padding: 12px 15px;">0.98</td>
+                                <td style="padding: 12px 15px;">0.98</td>
+                                <td style="padding: 12px 15px;">291</td>
+                            </tr>
+                        </tfoot>
+                    </table>
+                </div>
+            </div>
+        </div>
+    """, unsafe_allow_html=True)
+    # Highlights box summarising the Q5 table; styled inline rather than with
+    # the .question-card class.
+    st.markdown("""
+    <div style='
+        background-color: #f8f9fa;
+        padding: 20px;
+        border-radius: 10px;
+        border-left: 5px solid #1f77b4;
+        margin: 20px 0;
+    '>
+        ✨ Perfect performance (1.00) for Cable category<br>
+        📈 High recall (1.00) across most categories<br>
+        🎯 Overall accuracy of 98%<br>
+        ⚖️ Balanced performance across all metrics
+    </div>
+    """, unsafe_allow_html=True)
+    # Q6: Metric Selection
+    st.markdown("""
+        <div class="question-card">
+            <div class="question">📈 Q6: Why did you choose these particular metrics?</div>
+            <div class="answer">
+                Our metric selection was driven by the dataset characteristics:
+                <br><br>
+                <b>Key Considerations:</b>
+                <ul>
+                    <li>Dataset has mild class imbalance (Imbalance Ratio: 2.36)</li>
+                    <li>Need for balanced evaluation across all classes</li>
+                </ul>
+                <br>
+                <b>Selected Metrics:</b>
+                <ul>
+                    <li><b>Precision:</b> Critical for minimizing false positives</li>
+                    <li><b>Recall:</b> Important for catching all instances of each class</li>
+                    <li><b>F1-Score:</b> Provides balanced evaluation of both metrics</li>
+                    <li><b>Weighted Average:</b> Accounts for class imbalance</li>
+                </ul>
+            </div>
+        </div>
+    """, unsafe_allow_html=True)
+    # Performance Visualization
+    st.markdown("### 📊 Model Performance Comparison")
+    # Hard-coded scores. The BERT values match the accuracy and macro-average
+    # rows of the Q5 table above; the baseline values appear only here.
+    metrics = {
+        'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score'],
+        'Baseline': [0.85, 0.83, 0.84, 0.83],
+        'BERT': [0.98, 0.97, 0.98, 0.98]
+    }
+    df = pd.DataFrame(metrics)
+    # Passing two column names as y draws one bar series per model, grouped
+    # side by side for each metric.
+    fig = px.bar(
+        df,
+        x='Metric',
+        y=['Baseline', 'BERT'],
+        barmode='group',
+        title='Model Performance Comparison',
+        color_discrete_sequence=['#2ecc71', '#3498db'],
+        template='plotly_white'
+    )
+    fig.update_layout(
+        title_x=0.5,
+        title_font_size=20,
+        legend_title_text='Model Type',
+        xaxis_title="Evaluation Metric",
+        yaxis_title="Score",
+        bargap=0.2,
+        height=500
+    )
+    st.plotly_chart(fig, use_container_width=True)
+# Executed at import time: the page is rendered whenever this script runs.
+main()