Spaces:

mlkorra
/

Product-doc-classifier

Sleeping

App Files Files Community

mlkorra committed on Jan 11, 2025

Commit

a20a7ca

verified ·

1 Parent(s): dcb2841

Add app, utils classifier

Browse files

Files changed (3) hide show

app.py +18 -0
assets/style.css +80 -0
utils/util_classifier.py +264 -0

app.py ADDED Viewed

	@@ -0,0 +1,18 @@

+import streamlit as st
+def main():
+    st.set_page_config(
+        page_title="ConstructAI",
+        page_icon="🏗️",
+        layout="wide"
+    )
+    home_page = st.Page("pages/Home.py",icon="🏠")
+    classifier_page = st.Page('pages/Classifier.py',title='Classifier',icon="🛠️")
+    project_wiki_page = st.Page('pages/Project_Wiki.py',title = 'Project Wiki', icon=":material/dashboard:")
+    pg = st.navigation([home_page, classifier_page, project_wiki_page])
+    pg.run()
+if __name__ == "__main__":
+    main()

assets/style.css ADDED Viewed

	@@ -0,0 +1,80 @@

+/* General Styles: green call-to-action look for Streamlit buttons */
+.stButton>button {
+    background-color: #4CAF50;
+    color: white;
+    padding: 0.5rem 1rem;
+    border-radius: 5px;
+    border: none;
+    transition: all 0.3s;
+}
+/* Darken and lift the button slightly on hover */
+.stButton>button:hover {
+    background-color: #45a049;
+    transform: translateY(-2px);
+}
+/* Hero Section */
+.hero-section {
+    background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
+    padding: 2rem;
+    border-radius: 10px;
+    margin: 2rem 0;
+    text-align: center;
+}
+/* Cards: feature cards and result cards share one declaration block */
+.feature-card,
+.result-card {
+    background: white;
+    padding: 1.5rem;
+    border-radius: 8px;
+    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+    margin: 1rem 0;
+    text-align: center;
+}
+/* Results Display: grey track; presumably wraps a .meter-fill element */
+.confidence-meter {
+    background: #f0f0f0;
+    border-radius: 10px;
+    height: 20px;
+    position: relative;
+    margin: 1rem 0;
+}
+/* Green fill; width is expected to be set inline and animates on change */
+.meter-fill {
+    background: linear-gradient(90deg, #4CAF50, #45a049);
+    height: 100%;
+    border-radius: 10px;
+    transition: width 0.5s ease-in-out;
+}
+/* Probability Bars: one flex row per bar */
+.prob-bar {
+    display: flex;
+    align-items: center;
+    margin: 0.5rem 0;
+}
+/* Grey track that takes the remaining row width and clips its fill */
+.bar {
+    flex-grow: 1;
+    height: 20px;
+    background: #f0f0f0;
+    margin: 0 1rem;
+    border-radius: 10px;
+    overflow: hidden;
+}
+/* Green fill inside .bar; width animates on change */
+.fill {
+    height: 100%;
+    background: #4CAF50;
+    transition: width 0.5s ease-in-out;
+}

utils/util_classifier.py ADDED Viewed

	@@ -0,0 +1,264 @@

+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import torch
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+import joblib
+import pandas as pd
+from datetime import datetime
+import logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+class TextClassificationPipeline:
+    def __init__(self, model_path='./models', method='bertbased'):
+        """
+        Initialize the classification pipeline
+        Args:
+            model_path: Path to saved models
+            method: 'bertbased' or 'baseline'
+        """
+        try:
+            self.method = method
+            if method == 'bertbased':
+                logger.info("Loading BERT model...")
+                self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+                self.model = AutoModelForSequenceClassification.from_pretrained(f"{model_path}/bert-model")
+                self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+                self.model.to(self.device)
+                self.model.eval()
+                logger.info(f"BERT model loaded successfully. Using device: {self.device}")
+            else:
+                logger.info("Loading baseline model...")
+                self.tfidf = joblib.load(f"{model_path}/baseline-model/tfidf_vectorizer.pkl")
+                self.baseline_model = joblib.load(f"{model_path}/baseline-model/baseline_model.pkl")
+                logger.info("Baseline model loaded successfully")
+            # Load label encoder for both methods
+            self.label_encoder = joblib.load(f"{model_path}/label_encoder.pkl")
+        except Exception as e:
+            logger.error(f"Error initializing model: {str(e)}")
+            raise
+    # def preprocess_text(self, text):
+    #     """Clean and preprocess text"""
+    #     if isinstance(text, str):
+    #         # Basic cleaning
+    #         text = text.strip()
+    #         text = ' '.join(text.split())  # Remove extra whitespace
+    #         return text
+    #     return text
+    def preprocess_text(self, text):
+        """Clean and preprocess text"""
+        if isinstance(text, str):
+            # Basic cleaning
+            text = text.strip()
+            text = ' '.join(text.split())  # Remove extra whitespace
+            # Capitalize first letter to match training data format
+            text = text.title()  # This will capitalize first letter of each word
+            return text
+        return text
+    def preprocess(self, text):
+        """
+        Preprocess the input text based on method
+        """
+        try:
+            # Clean text first
+            text = self.preprocess_text(text)
+            if self.method == 'bertbased':
+                # BERT preprocessing
+                encodings = self.tokenizer(
+                    text,
+                    truncation=True,
+                    padding=True,
+                    max_length=512,
+                    return_tensors='pt'
+                )
+                encodings = {k: v.to(self.device) for k, v in encodings.items()}
+                return encodings
+            else:
+                # Baseline preprocessing
+                return self.tfidf.transform([text] if isinstance(text, str) else text)
+        except Exception as e:
+            logger.error(f"Error in preprocessing: {str(e)}")
+            raise
+    def predict(self, text, return_probability=False):
+        """
+        Predict using either BERT or baseline model
+        Args:
+            text: Input text or list of texts
+            return_probability: Whether to return probability scores
+        Returns:
+            Predictions with metadata
+        """
+        try:
+            # Handle both single string and list of strings
+            if isinstance(text, str):
+                text = [text]
+            # Preprocess
+            inputs = self.preprocess(text)
+            if self.method == 'bertbased':
+                # BERT predictions
+                with torch.no_grad():
+                    outputs = self.model(**inputs)
+                    probabilities = torch.softmax(outputs.logits, dim=-1)
+                    predictions = torch.argmax(probabilities, dim=-1)
+                predictions = predictions.cpu().numpy()
+                probabilities = probabilities.cpu().numpy()
+            else:
+                # Baseline predictions
+                predictions = self.baseline_model.predict(inputs)
+                probabilities = self.baseline_model.predict_proba(inputs)
+            # Convert numeric predictions to original labels
+            predicted_labels = self.label_encoder.inverse_transform(predictions)
+            # Ensure consistent casing with training data
+            predicted_labels = [label.title() for label in predicted_labels]
+            if return_probability:
+                results = []
+                for t, label, prob, probs in zip(text, predicted_labels,
+                                            probabilities.max(axis=1),
+                                            probabilities):
+                    result = {
+                        'text': t[:200] + '...' if len(t) > 200 else t,
+                        'predicted_label': label.title(),  # Ensure consistent casing
+                        'confidence': float(prob),
+                        'model_type': self.method,
+                        'probabilities': {
+                            self.label_encoder.inverse_transform([i])[0].title(): float(p)  # Consistent casing
+                            for i, p in enumerate(probs)
+                        },
+                        # ... rest of the result dictionary ...
+                    }
+                    results.append(result)
+                return results[0] if len(text) == 1 else results
+            return predicted_labels[0] if len(text) == 1 else predicted_labels
+        except Exception as e:
+            logger.error(f"Error in prediction: {str(e)}")
+            raise
+    def predict_old(self, text, return_probability=False):
+        """
+        Predict using either BERT or baseline model
+        Args:
+            text: Input text or list of texts
+            return_probability: Whether to return probability scores
+        Returns:
+            Predictions with metadata
+        """
+        try:
+            # Handle both single string and list of strings
+            if isinstance(text, str):
+                text = [text]
+            # Preprocess
+            inputs = self.preprocess(text)
+            if self.method == 'bertbased':
+                # BERT predictions
+                with torch.no_grad():
+                    outputs = self.model(**inputs)
+                    probabilities = torch.softmax(outputs.logits, dim=-1)
+                    predictions = torch.argmax(probabilities, dim=-1)
+                predictions = predictions.cpu().numpy()
+                probabilities = probabilities.cpu().numpy()
+            else:
+                # Baseline predictions
+                predictions = self.baseline_model.predict(inputs)
+                probabilities = self.baseline_model.predict_proba(inputs)
+            # Convert numeric predictions to original labels
+            predicted_labels = self.label_encoder.inverse_transform(predictions)
+            if return_probability:
+                results = []
+                for t, label, prob, probs in zip(text, predicted_labels,
+                                               probabilities.max(axis=1),
+                                               probabilities):
+                    # Create detailed result dictionary
+                    result = {
+                        'text': t[:200] + '...' if len(t) > 200 else t,  # Truncate long text
+                        'predicted_label': label,
+                        'confidence': float(prob),
+                        'model_type': self.method,
+                        'probabilities': {
+                            self.label_encoder.inverse_transform([i])[0]: float(p)
+                            for i, p in enumerate(probs)
+                        },
+                        'timestamp': datetime.now().isoformat(),
+                        'metadata': {
+                            'model_name': 'BERT' if self.method == 'bertbased' else 'Baseline',
+                            'text_length': len(t),
+                            'preprocessing_steps': ['cleaning', 'tokenization']
+                        }
+                    }
+                    results.append(result)
+                return results[0] if len(text) == 1 else results
+            return predicted_labels[0] if len(text) == 1 else predicted_labels
+        except Exception as e:
+            logger.error(f"Error in prediction: {str(e)}")
+            raise
+    def get_model_info(self):
+        """Return model information"""
+        return {
+            'model_type': self.method,
+            'model_name': 'BERT' if self.method == 'bertbased' else 'Baseline',
+            'device': str(self.device) if self.method == 'bertbased' else 'CPU',
+            'max_sequence_length': 512 if self.method == 'bertbased' else None,
+            'number_of_classes': len(self.label_encoder.classes_),
+            'classes': list(self.label_encoder.classes_)
+        }
+def load_and_process_pdf(url_or_file):
+    """
+    Load and process PDF from URL or file
+    Returns extracted text
+    """
+    try:
+        # Your PDF processing code here
+        # Return extracted text
+        pass
+    except Exception as e:
+        logger.error(f"Error processing PDF: {str(e)}")
+        raise
+# Example usage
+if __name__ == "__main__":
+    # Test the pipeline
+    classifier = TextClassificationPipeline()
+    # Test single prediction
+    text = "Example construction document text"
+    result = classifier.predict(text, return_probability=True)
+    print("\nSingle Prediction Result:")
+    print(result)
+    # Test batch prediction
+    texts = ["First document", "Second document"]
+    results = classifier.predict(texts, return_probability=True)
+    print("\nBatch Prediction Results:")
+    for result in results:
+        print(f"\nText: {result['text']}")
+        print(f"Prediction: {result['predicted_label']}")
+        print(f"Confidence: {result['confidence']:.4f}")