Spaces:

ayushsahu45
/

Multi-AI-Analytics-Platform

Sleeping

App Files Files Community

ayushsahu45 committed on Mar 23

Commit

82dccf5

verified ·

1 Parent(s): 6c1a9ca

Upload 4 files

Browse files

Files changed (4) hide show

models/dl_module.py +256 -0
models/generative_ai.py +196 -0
models/ml_models.py +487 -0
models/nlp_module.py +327 -0

models/dl_module.py ADDED Viewed

	@@ -0,0 +1,256 @@

+"""
+dl_module.py - Deep Learning Module
+Image classification using pretrained MobileNetV2/ResNet50 + OpenCV object detection
+"""
+import streamlit as st
+import numpy as np
+import cv2
+import io
+import warnings
+warnings.filterwarnings("ignore")
+from PIL import Image
+# ─── Lazy imports ────────────────────────────────────────────────────────────
+def _load_tf_model(model_name):
+    """Load a Keras pretrained model."""
+    import tensorflow as tf
+    from tensorflow.keras.applications import MobileNetV2, ResNet50, VGG16
+    from tensorflow.keras.applications.mobilenet_v2 import preprocess_input as mn_pre, decode_predictions as mn_dec
+    from tensorflow.keras.applications.resnet50  import preprocess_input as rn_pre, decode_predictions as rn_dec
+    from tensorflow.keras.applications.vgg16     import preprocess_input as vg_pre, decode_predictions as vg_dec
+    models_map = {
+        "MobileNetV2": (MobileNetV2, mn_pre, mn_dec, (224, 224)),
+        "ResNet50":    (ResNet50,    rn_pre, rn_dec, (224, 224)),
+        "VGG16":       (VGG16,       vg_pre, vg_dec, (224, 224)),
+    }
+    ModelClass, preprocess, decode, size = models_map[model_name]
+    model = ModelClass(weights="imagenet")
+    return model, preprocess, decode, size
+def _classify_image_tf(image_pil, model_name):
+    """Classify an image using TF/Keras pretrained model."""
+    import numpy as np
+    from tensorflow.keras.preprocessing.image import img_to_array
+    model, preprocess, decode, (h, w) = _load_tf_model(model_name)
+    img = image_pil.convert("RGB").resize((w, h))
+    arr = img_to_array(img)
+    arr = np.expand_dims(arr, axis=0)
+    arr = preprocess(arr)
+    preds = model.predict(arr, verbose=0)
+    top = decode(preds, top=5)[0]
+    results = [{"Rank": i+1, "Label": label.replace("_", " ").title(),
+                "Confidence": f"{prob*100:.2f}%", "Score": round(prob, 4)}
+               for i, (_, label, prob) in enumerate(top)]
+    return results
+def _classify_image_torch(image_pil, model_name):
+    """Classify an image using PyTorch pretrained model."""
+    import torch
+    import torchvision.transforms as T
+    import torchvision.models as models_tv
+    import json
+    import urllib.request
+    # Load imagenet class labels
+    LABELS_URL = "https://raw.githubusercontent.com/anishathalye/imagenet-simple-labels/master/imagenet-simple-labels.json"
+    try:
+        with urllib.request.urlopen(LABELS_URL, timeout=5) as r:
+            class_labels = json.load(r)
+    except Exception:
+        class_labels = [str(i) for i in range(1000)]
+    torch_models = {
+        "MobileNetV2": models_tv.mobilenet_v2,
+        "ResNet50":    models_tv.resnet50,
+    }
+    model_fn = torch_models.get(model_name, models_tv.mobilenet_v2)
+    model = model_fn(pretrained=True)
+    model.eval()
+    transform = T.Compose([
+        T.Resize(256),
+        T.CenterCrop(224),
+        T.ToTensor(),
+        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+    ])
+    img = image_pil.convert("RGB")
+    tensor = transform(img).unsqueeze(0)
+    with torch.no_grad():
+        output = model(tensor)
+        probs = torch.nn.functional.softmax(output[0], dim=0)
+    top_probs, top_idxs = torch.topk(probs, 5)
+    results = []
+    for i, (prob, idx) in enumerate(zip(top_probs, top_idxs)):
+        label = class_labels[idx.item()] if idx.item() < len(class_labels) else str(idx.item())
+        results.append({
+            "Rank": i+1,
+            "Label": label.replace("_", " ").title(),
+            "Confidence": f"{prob.item()*100:.2f}%",
+            "Score": round(prob.item(), 4),
+        })
+    return results
+def detect_edges_opencv(image_pil):
+    """Apply Canny edge detection using OpenCV."""
+    img_array = np.array(image_pil.convert("RGB"))
+    gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
+    edges = cv2.Canny(blurred, threshold1=50, threshold2=150)
+    return edges
+def detect_faces_opencv(image_pil):
+    """Detect faces using Haar Cascade classifier."""
+    img_array = np.array(image_pil.convert("RGB"))
+    img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
+    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+    cascade_path = cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
+    face_cascade = cv2.CascadeClassifier(cascade_path)
+    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
+    result_img = img_array.copy()
+    for (x, y, w, h) in faces:
+        cv2.rectangle(result_img, (x, y), (x+w, y+h), (0, 200, 255), 2)
+        cv2.putText(result_img, "Face", (x, y-8), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 200, 255), 2)
+    return result_img, len(faces)
+def apply_image_filters(image_pil):
+    """Apply various OpenCV image processing filters and return dict of results."""
+    img = np.array(image_pil.convert("RGB"))
+    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+    blurred = cv2.GaussianBlur(img, (15, 15), 0)
+    sharpened = cv2.addWeighted(img, 1.5, blurred, -0.5, 0)
+    thresh = cv2.adaptiveThreshold(
+        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
+    )
+    contours_img = img.copy()
+    contours, _ = cv2.findContours(
+        cv2.Canny(gray, 50, 150), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+    )
+    cv2.drawContours(contours_img, contours, -1, (0, 255, 120), 1)
+    return {
+        "Grayscale": gray,
+        "Blurred": blurred,
+        "Sharpened": sharpened,
+        "Threshold": thresh,
+        "Contours": contours_img,
+    }
+# ─── Streamlit UI ─────────────────────────────────────────────────────────────
+def render_dl_module():
+    st.header("🧠 Deep Learning Module")
+    st.markdown("Upload an image to classify it with pretrained CNNs or run OpenCV computer vision pipelines.")
+    uploaded = st.file_uploader("Upload Image (JPG/PNG)", type=["jpg", "jpeg", "png"], key="dl_upload")
+    if uploaded is None:
+        st.info("👆 Upload an image (JPG or PNG) to begin. Try uploading a photo of an animal, vehicle, or everyday object.")
+        return
+    image_pil = Image.open(uploaded)
+    st.image(image_pil, caption="Uploaded Image", use_column_width=True)
+    tabs = st.tabs(["🏷️ Image Classification", "👁️ OpenCV Analysis", "🎨 Image Filters"])
+    # ── Tab 1: Classification ─────────────────────────────────────────────────
+    with tabs[0]:
+        st.subheader("Image Classification (ImageNet)")
+        backend = st.radio("Choose Backend", ["TensorFlow/Keras", "PyTorch"], horizontal=True)
+        if backend == "TensorFlow/Keras":
+            model_choice = st.selectbox("Model", ["MobileNetV2", "ResNet50", "VGG16"])
+        else:
+            model_choice = st.selectbox("Model", ["MobileNetV2", "ResNet50"])
+        if st.button("🔍 Classify Image", type="primary"):
+            with st.spinner(f"Running {model_choice} inference..."):
+                try:
+                    if backend == "TensorFlow/Keras":
+                        results = _classify_image_tf(image_pil, model_choice)
+                    else:
+                        results = _classify_image_torch(image_pil, model_choice)
+                    import pandas as pd
+                    import matplotlib.pyplot as plt
+                    st.success(f"✅ Top prediction: **{results[0]['Label']}** ({results[0]['Confidence']})")
+                    st.subheader("Top 5 Predictions")
+                    df_preds = pd.DataFrame(results)
+                    st.dataframe(df_preds, use_container_width=True)
+                    # Bar chart of confidences
+                    fig, ax = plt.subplots(figsize=(8, 4))
+                    labels = [r["Label"][:30] for r in results]
+                    scores = [r["Score"] for r in results]
+                    colors = ["#0ea5e9" if i == 0 else "#334155" for i in range(len(scores))]
+                    bars = ax.barh(labels[::-1], scores[::-1], color=colors[::-1])
+                    ax.set_xlabel("Confidence Score")
+                    ax.set_title("Top 5 Predictions")
+                    ax.set_xlim(0, max(scores) * 1.2)
+                    for bar, score in zip(bars, scores[::-1]):
+                        ax.text(bar.get_width() + 0.005, bar.get_y() + bar.get_height()/2,
+                                f"{score*100:.1f}%", va="center", fontsize=9)
+                    plt.tight_layout()
+                    st.pyplot(fig)
+                except Exception as e:
+                    st.error(f"Classification failed: {e}")
+                    st.info("Make sure TensorFlow or PyTorch is installed. Run: `pip install tensorflow` or `pip install torch torchvision`")
+    # ── Tab 2: OpenCV Analysis ────────────────────────────────────────────────
+    with tabs[1]:
+        st.subheader("OpenCV Computer Vision")
+        cv_task = st.selectbox("Select Analysis", ["Edge Detection", "Face Detection"])
+        if st.button("▶ Run OpenCV Analysis", type="primary"):
+            with st.spinner("Processing with OpenCV..."):
+                if cv_task == "Edge Detection":
+                    edges = detect_edges_opencv(image_pil)
+                    col1, col2 = st.columns(2)
+                    with col1:
+                        st.image(image_pil, caption="Original", use_column_width=True)
+                    with col2:
+                        st.image(edges, caption="Canny Edge Detection", use_column_width=True, clamp=True)
+                    st.info(f"Detected approximately **{np.sum(edges > 0):,}** edge pixels.")
+                elif cv_task == "Face Detection":
+                    result_img, face_count = detect_faces_opencv(image_pil)
+                    col1, col2 = st.columns(2)
+                    with col1:
+                        st.image(image_pil, caption="Original", use_column_width=True)
+                    with col2:
+                        st.image(result_img, caption="Face Detection", use_column_width=True)
+                    if face_count > 0:
+                        st.success(f"✅ Detected **{face_count}** face(s).")
+                    else:
+                        st.warning("No faces detected. Try a clear portrait photo.")
+    # ── Tab 3: Image Filters ──────────────────────────────────────────────────
+    with tabs[2]:
+        st.subheader("OpenCV Image Processing Filters")
+        if st.button("🎨 Apply All Filters", type="primary"):
+            with st.spinner("Applying filters..."):
+                filters = apply_image_filters(image_pil)
+                cols = st.columns(3)
+                for i, (name, img) in enumerate(filters.items()):
+                    with cols[i % 3]:
+                        if len(img.shape) == 2:
+                            st.image(img, caption=name, use_column_width=True, clamp=True)
+                        else:
+                            st.image(img, caption=name, use_column_width=True)

models/generative_ai.py ADDED Viewed

	@@ -0,0 +1,196 @@

+"""
+generative_ai.py - Generative AI Module
+Supports OpenAI GPT, Google Gemini, Anthropic Claude, and Smart AI fallback
+"""
+import warnings
+warnings.filterwarnings("ignore")
+OPENAI_OK = False
+GOOGLE_OK = False
+ANTHROPIC_OK = False
+try:
+    import openai
+    OPENAI_OK = True
+except ImportError:
+    pass
+try:
+    import google.generativeai as genai
+    GOOGLE_OK = True
+except ImportError:
+    pass
+try:
+    import anthropic
+    ANTHROPIC_OK = True
+except ImportError:
+    pass
+def _smart_respond(prompt: str, history: list) -> str:
+    """Instant smart AI response without API calls - keyword-based fallback."""
+    p = prompt.lower()
+    if any(w in p for w in ["hello", "hi", "hey", "greetings"]):
+        return "Hello! I'm your AI assistant. How can I help you today?"
+    if "machine learning" in p or " ml " in p or "machine learning" in p:
+        return (
+            "**Machine Learning** enables systems to learn from data without explicit programming. "
+            "Types: Supervised, Unsupervised, Reinforcement Learning. "
+            "Popular libraries: scikit-learn, XGBoost, LightGBM, PyTorch, TensorFlow."
+        )
+    if "deep learning" in p or "neural network" in p or "cnn" in p:
+        return (
+            "**Deep Learning** uses multi-layer neural networks to learn complex patterns. "
+            "Best for: images (CNNs), sequences (RNNs/LSTMs), Transformers. "
+            "Frameworks: PyTorch, TensorFlow/Keras."
+        )
+    if "xgboost" in p or "gradient boosting" in p:
+        return (
+            "**XGBoost** builds trees sequentially, each correcting prior errors. "
+            "Key parameters: n_estimators, max_depth, learning_rate, subsample. "
+            "Extremely fast and accurate for tabular data."
+        )
+    if "lightgbm" in p:
+        return (
+            "**LightGBM** uses histogram-based gradient boosting for speed. "
+            "Great for large datasets. Uses leaf-wise tree growth vs level-wise."
+        )
+    if "overfitting" in p or "underfitting" in p:
+        return (
+            "**Overfitting** = model memorizes training noise, fails on new data. "
+            "Fixes: cross-validation, regularization (L1/L2), dropout, more data, simpler model. "
+            "**Underfitting** = model too simple to capture patterns. Fixes: more features, complex model."
+        )
+    if "python" in p:
+        return (
+            "**Python** dominates AI/ML thanks to: NumPy, Pandas, scikit-learn, "
+            "PyTorch, TensorFlow, HuggingFace Transformers. "
+            "Use virtual environments (venv/conda) to manage dependencies."
+        )
+    if "nlp" in p or "natural language" in p or "text" in p:
+        return (
+            "**NLP** (Natural Language Processing) enables machines to understand text. "
+            "Key tasks: sentiment analysis, NER, classification, summarization, translation. "
+            "Modern approach: HuggingFace Transformers (BERT, GPT, T5), spaCy."
+        )
+    if "data" in p and ("clean" in p or "preprocess" in p):
+        return (
+            "**Data Preprocessing** steps: 1) Handle missing values (mean/median/mode), "
+            "2) Encode categoricals (LabelEncoder, OneHot), 3) Scale numeric features, "
+            "4) Remove outliers, 5) Feature engineering."
+        )
+    if "random forest" in p or "rf " in p:
+        return (
+            "**Random Forest** is an ensemble of decision trees. "
+            "Uses bagging and random feature selection. "
+            "Key params: n_estimators, max_depth, min_samples_split. "
+            "Good for feature importance and handling missing values."
+        )
+    if "classification" in p:
+        return (
+            "**Classification** predicts categorical labels. "
+            "Algorithms: Logistic Regression, Decision Trees, Random Forest, SVM, XGBoost. "
+            "Metrics: Accuracy, Precision, Recall, F1-Score, ROC-AUC."
+        )
+    if "regression" in p:
+        return (
+            "**Regression** predicts continuous values. "
+            "Algorithms: Linear Regression, Ridge, Lasso, Random Forest, XGBoost. "
+            "Metrics: MSE, RMSE, MAE, R² Score."
+        )
+    if "api" in p or "key" in p or "openai" in p or "gpt" in p:
+        return (
+            "To use GPT models, set OPENAI_API_KEY environment variable or pass api_key parameter. "
+            "Get your key from https://platform.openai.com/api-keys"
+        )
+    if "help" in p or "what can you do" in p:
+        return (
+            "I can help with: Machine Learning, Deep Learning, NLP, Data Science, "
+            "Python programming, XGBoost, scikit-learn, TensorFlow, PyTorch, "
+            "model evaluation, and more! Ask me anything."
+        )
+    return (
+        f"I understand you're asking about: '{prompt[:50]}...'. "
+        "Try asking about: machine learning, neural networks, XGBoost, Python, "
+        "NLP, data preprocessing, classification, regression, or specific algorithms!"
+    )
+class GenerativeAI:
+    def __init__(self, api_key: str = "", provider: str = "smart"):
+        self.api_key = api_key
+        self.provider = provider
+        self._provider = provider
+        self._provider_config = self._get_provider_config(provider)
+        self.client = None
+        if provider == "openai" and OPENAI_OK and api_key:
+            openai.api_key = api_key
+            self.client = openai
+        elif provider == "google" and GOOGLE_OK and api_key:
+            genai.configure(api_key=api_key)
+            self.client = genai
+        elif provider == "anthropic" and ANTHROPIC_OK and api_key:
+            self.client = anthropic.Anthropic(api_key=api_key)
+    def _get_provider_config(self, provider: str) -> dict:
+        configs = {
+            "smart": {"name": "Smart AI", "status": "✅", "desc": "Instant responses - no API key needed"},
+            "openai": {"name": "OpenAI GPT-4o", "status": "🟢" if OPENAI_OK else "❌", "desc": "Requires API key"},
+            "google": {"name": "Google Gemini", "status": "🔵" if GOOGLE_OK else "❌", "desc": "Requires API key"},
+            "anthropic": {"name": "Anthropic Claude", "status": "🟣" if ANTHROPIC_OK else "❌", "desc": "Requires API key"},
+        }
+        return configs.get(provider, configs["smart"])
+    def generate(self, prompt: str, history: list = None) -> str:
+        """Generate response based on provider."""
+        if self.provider == "smart" or not self.client:
+            return _smart_respond(prompt, history or [])
+        try:
+            if self.provider == "openai":
+                messages = [{"role": "user", "content": prompt}]
+                if history:
+                    for h in history:
+                        messages.append(h)
+                response = self.client.chat.completions.create(
+                    model="gpt-4o",
+                    messages=messages,
+                )
+                return response.choices[0].message.content
+            elif self.provider == "google":
+                model = self.client.GenerativeModel("gemini-pro")
+                chat = model.start_chat(history=[])
+                response = chat.send_message(prompt)
+                return response.text
+            elif self.provider == "anthropic":
+                response = self.client.messages.create(
+                    model="claude-3-opus-20240229",
+                    max_tokens=1024,
+                    messages=[{"role": "user", "content": prompt}]
+                )
+                return response.content[0].text
+        except Exception as e:
+            return f"Error with {self.provider}: {str(e)}. Falling back to smart AI.\n\n" + _smart_respond(prompt, history or [])
+        return _smart_respond(prompt, history or [])

models/ml_models.py ADDED Viewed

	@@ -0,0 +1,487 @@

+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
+from sklearn.ensemble import (
+    RandomForestClassifier, RandomForestRegressor,
+    GradientBoostingClassifier, GradientBoostingRegressor,
+    VotingClassifier, VotingRegressor,
+)
+from sklearn.linear_model import LogisticRegression, Ridge, Lasso
+from sklearn.svm import SVC, SVR
+from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
+from sklearn.metrics import (
+    accuracy_score, classification_report, mean_squared_error,
+    r2_score, f1_score, roc_auc_score, confusion_matrix,
+    mean_absolute_error,
+)
+from sklearn.pipeline import Pipeline
+from sklearn.impute import SimpleImputer
+from typing import Dict, Any, Tuple, Optional, List
+import warnings
+warnings.filterwarnings('ignore')
+try:
+    import xgboost as xgb
+    XGB_AVAILABLE = True
+except ImportError:
+    XGB_AVAILABLE = False
+try:
+    import lightgbm as lgb
+    LGB_AVAILABLE = True
+except ImportError:
+    LGB_AVAILABLE = False
+class MLPipeline:
+    """
+    A powerful, production-ready Machine Learning pipeline supporting
+    classification and regression with ensemble methods, cross-validation,
+    feature importance, and detailed metrics.
+    """
+    def __init__(self, task_type: str = "classification", model_name: str = "Random Forest"):
+        self.task_type = task_type
+        self.model_name = model_name
+        self.model = None
+        self.scaler = StandardScaler()
+        self.imputer = SimpleImputer(strategy='median')
+        self.label_encoder = LabelEncoder()
+        self.is_fitted = False
+        self.feature_names: List[str] = []
+        self.metrics: Dict[str, Any] = {}
+        self.X_test = None
+        self.y_test = None
+        self.y_pred = None
+        self.classes_: Optional[np.ndarray] = None
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+    def _build_model(self):
+        name = self.model_name
+        if self.task_type == "classification":
+            models = {
+                "Random Forest": RandomForestClassifier(
+                    n_estimators=200, max_depth=None, min_samples_split=2,
+                    random_state=42, n_jobs=-1, class_weight='balanced'
+                ),
+                "Gradient Boosting": GradientBoostingClassifier(
+                    n_estimators=150, learning_rate=0.1, max_depth=5,
+                    random_state=42
+                ),
+                "Logistic Regression": LogisticRegression(
+                    max_iter=1000, random_state=42, class_weight='balanced'
+                ),
+                "SVM": SVC(probability=True, kernel='rbf', random_state=42, class_weight='balanced'),
+            }
+            return models.get(name, models["Random Forest"])
+        else:
+            models = {
+                "Random Forest": RandomForestRegressor(
+                    n_estimators=200, max_depth=None, random_state=42, n_jobs=-1
+                ),
+                "Gradient Boosting": GradientBoostingRegressor(
+                    n_estimators=150, learning_rate=0.1, max_depth=5, random_state=42
+                ),
+                "Ridge Regression": Ridge(alpha=1.0),
+                "Lasso Regression": Lasso(alpha=1.0, max_iter=5000),
+                "SVM": SVR(kernel='rbf'),
+            }
+            return models.get(name, models["Random Forest"])
+    def _preprocess_X(self, df: pd.DataFrame, fit: bool = True) -> np.ndarray:
+        df = df.copy()
+        # Encode categoricals
+        for col in df.select_dtypes(include=['object', 'category']).columns:
+            le = LabelEncoder()
+            df[col] = le.fit_transform(df[col].astype(str))
+        # Boolean → int
+        for col in df.select_dtypes(include=['bool']).columns:
+            df[col] = df[col].astype(int)
+        arr = df.values.astype(float)
+        if fit:
+            arr = self.imputer.fit_transform(arr)
+            arr = self.scaler.fit_transform(arr)
+        else:
+            arr = self.imputer.transform(arr)
+            arr = self.scaler.transform(arr)
+        return arr
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    def preprocess(
+        self, df: pd.DataFrame, target_col: Optional[str] = None
+    ) -> Tuple[np.ndarray, Optional[np.ndarray]]:
+        df = df.copy()
+        if target_col and target_col in df.columns:
+            y_raw = df[target_col]
+            if self.task_type == "classification":
+                self.label_encoder = LabelEncoder()
+                y = self.label_encoder.fit_transform(y_raw.astype(str))
+                self.classes_ = self.label_encoder.classes_
+            else:
+                y = y_raw.values.astype(float)
+            df = df.drop(columns=[target_col])
+        else:
+            y = None
+        # One-hot for remaining categoricals after splitting target
+        df = pd.get_dummies(df, drop_first=True)
+        self.feature_names = df.columns.tolist()
+        X = self._preprocess_X(df, fit=True)
+        return X, y
+    def train(
+        self,
+        X: np.ndarray,
+        y: np.ndarray,
+        test_size: float = 0.2,
+    ) -> Dict[str, Any]:
+        """Train the model and return comprehensive metrics."""
+        if isinstance(X, pd.DataFrame):
+            X = self._preprocess_X(X, fit=True)
+        # Stratified split for classification when possible
+        stratify = None
+        if self.task_type == "classification":
+            unique, counts = np.unique(y, return_counts=True)
+            if len(unique) >= 2 and all(c >= 2 for c in counts):
+                stratify = y
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=test_size, random_state=42, stratify=stratify
+        )
+        self.X_test = X_test
+        self.y_test = y_test
+        self.model = self._build_model()
+        self.model.fit(X_train, y_train)
+        self.is_fitted = True
+        y_pred = self.model.predict(X_test)
+        self.y_pred = y_pred
+        self.metrics = self._compute_metrics(y_test, y_pred, X, y)
+        return self.metrics
+    def _compute_metrics(
+        self,
+        y_test: np.ndarray,
+        y_pred: np.ndarray,
+        X_full: np.ndarray,
+        y_full: np.ndarray,
+    ) -> Dict[str, Any]:
+        metrics: Dict[str, Any] = {}
+        if self.task_type == "classification":
+            metrics["accuracy"] = round(float(accuracy_score(y_test, y_pred)), 4)
+            metrics["f1_score"] = round(float(f1_score(y_test, y_pred, average='weighted')), 4)
+            # ROC-AUC (binary only)
+            if len(np.unique(y_full)) == 2 and hasattr(self.model, 'predict_proba'):
+                try:
+                    proba = self.model.predict_proba(self.X_test)[:, 1]
+                    metrics["roc_auc"] = round(float(roc_auc_score(y_test, proba)), 4)
+                except Exception:
+                    pass
+            # Cross-validation
+            try:
+                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
+                cv_scores = cross_val_score(self.model, X_full, y_full, cv=cv, scoring='accuracy', n_jobs=-1)
+                metrics["cv_mean_accuracy"] = round(float(cv_scores.mean()), 4)
+                metrics["cv_std"] = round(float(cv_scores.std()), 4)
+            except Exception:
+                pass
+            # Classification report as string
+            try:
+                class_names = [str(c) for c in self.classes_] if self.classes_ is not None else None
+                metrics["classification_report"] = classification_report(
+                    y_test, y_pred, target_names=class_names
+                )
+            except Exception:
+                pass
+            # Confusion matrix
+            try:
+                cm = confusion_matrix(y_test, y_pred)
+                metrics["confusion_matrix"] = cm.tolist()
+            except Exception:
+                pass
+        else:  # regression
+            metrics["mse"] = round(float(mean_squared_error(y_test, y_pred)), 4)
+            metrics["rmse"] = round(float(np.sqrt(mean_squared_error(y_test, y_pred))), 4)
+            metrics["mae"] = round(float(mean_absolute_error(y_test, y_pred)), 4)
+            metrics["r2_score"] = round(float(r2_score(y_test, y_pred)), 4)
+            # Cross-validation
+            try:
+                cv = KFold(n_splits=5, shuffle=True, random_state=42)
+                cv_scores = cross_val_score(self.model, X_full, y_full, cv=cv, scoring='r2', n_jobs=-1)
+                metrics["cv_mean_r2"] = round(float(cv_scores.mean()), 4)
+                metrics["cv_std"] = round(float(cv_scores.std()), 4)
+            except Exception:
+                pass
+        return metrics
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        if not self.is_fitted:
+            raise ValueError("Model must be trained before prediction")
+        if isinstance(X, pd.DataFrame):
+            X = self._preprocess_X(X, fit=False)
+        return self.model.predict(X)
+    def predict_proba(self, X: np.ndarray) -> np.ndarray:
+        if not self.is_fitted:
+            raise ValueError("Model must be trained before prediction")
+        if self.task_type != "classification":
+            raise ValueError("predict_proba only available for classification")
+        if not hasattr(self.model, 'predict_proba'):
+            raise ValueError(f"{self.model_name} does not support probability estimates")
+        if isinstance(X, pd.DataFrame):
+            X = self._preprocess_X(X, fit=False)
+        return self.model.predict_proba(X)
+    def get_feature_importance(self) -> pd.DataFrame:
+        if not self.is_fitted:
+            raise ValueError("Model must be trained first")
+        if hasattr(self.model, 'feature_importances_'):
+            importance = self.model.feature_importances_
+        elif hasattr(self.model, 'coef_'):
+            coef = self.model.coef_
+            importance = np.abs(coef).mean(axis=0) if coef.ndim > 1 else np.abs(coef)
+        else:
+            # Fallback: permutation-style zeros
+            importance = np.zeros(len(self.feature_names))
+        return pd.DataFrame({
+            "feature": self.feature_names[:len(importance)],
+            "importance": importance,
+        }).sort_values("importance", ascending=False).reset_index(drop=True)
+    def get_predictions_df(self, df_original: pd.DataFrame) -> pd.DataFrame:
+        """Returns original df with predictions appended."""
+        if not self.is_fitted:
+            raise ValueError("Model not trained yet")
+        result = df_original.copy()
+        # Preprocess same features used in training
+        feature_df = df_original[[f for f in self.feature_names if f in df_original.columns]]
+        preds = self.predict(feature_df)
+        result["prediction"] = preds
+        return result
+# ---------------------------------------------------------------------------
+# XGBoost Pipeline
+# ---------------------------------------------------------------------------
+class XGBoostPipeline(MLPipeline):
+    """XGBoost-based pipeline with early stopping and full metrics."""
+    def __init__(self, task_type: str = "classification"):
+        super().__init__(task_type=task_type, model_name="XGBoost")
+    def _build_xgb_model(self, n_classes: int = 2):
+        if self.task_type == "classification":
+            objective = "multi:softprob" if n_classes > 2 else "binary:logistic"
+            return xgb.XGBClassifier(
+                n_estimators=200,
+                max_depth=6,
+                learning_rate=0.05,
+                subsample=0.8,
+                colsample_bytree=0.8,
+                eval_metric='logloss',
+                random_state=42,
+                n_jobs=-1,
+                objective=objective,
+            )
+        else:
+            return xgb.XGBRegressor(
+                n_estimators=200,
+                max_depth=6,
+                learning_rate=0.05,
+                subsample=0.8,
+                colsample_bytree=0.8,
+                random_state=42,
+                n_jobs=-1,
+            )
+    def train(self, X: np.ndarray, y: np.ndarray, test_size: float = 0.2) -> Dict[str, Any]:
+        if not XGB_AVAILABLE:
+            raise ImportError("xgboost is not installed. Run: pip install xgboost")
+        if isinstance(X, pd.DataFrame):
+            X = self._preprocess_X(X, fit=True)
+        stratify = None
+        if self.task_type == "classification":
+            unique, counts = np.unique(y, return_counts=True)
+            if len(unique) >= 2 and all(c >= 2 for c in counts):
+                stratify = y
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=test_size, random_state=42, stratify=stratify
+        )
+        self.X_test = X_test
+        self.y_test = y_test
+        n_classes = len(np.unique(y)) if self.task_type == "classification" else 2
+        self.model = self._build_xgb_model(n_classes=n_classes)
+        self.model.fit(
+            X_train, y_train,
+            eval_set=[(X_test, y_test)],
+            verbose=False,
+        )
+        self.is_fitted = True
+        y_pred = self.model.predict(X_test)
+        self.y_pred = y_pred
+        self.metrics = self._compute_metrics(y_test, y_pred, X, y)
+        return self.metrics
+# ---------------------------------------------------------------------------
+# LightGBM Pipeline
+# ---------------------------------------------------------------------------
+class LightGBMPipeline(MLPipeline):
+    """LightGBM pipeline — fastest gradient boosting for large datasets."""
+    def __init__(self, task_type: str = "classification"):
+        super().__init__(task_type=task_type, model_name="LightGBM")
+    def train(self, X: np.ndarray, y: np.ndarray, test_size: float = 0.2) -> Dict[str, Any]:
+        if not LGB_AVAILABLE:
+            raise ImportError("lightgbm is not installed. Run: pip install lightgbm")
+        if isinstance(X, pd.DataFrame):
+            X = self._preprocess_X(X, fit=True)
+        stratify = None
+        if self.task_type == "classification":
+            unique, counts = np.unique(y, return_counts=True)
+            if all(c >= 2 for c in counts):
+                stratify = y
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=test_size, random_state=42, stratify=stratify
+        )
+        self.X_test = X_test
+        self.y_test = y_test
+        if self.task_type == "classification":
+            n_classes = len(np.unique(y))
+            objective = "multiclass" if n_classes > 2 else "binary"
+            self.model = lgb.LGBMClassifier(
+                n_estimators=200, learning_rate=0.05,
+                num_leaves=31, random_state=42,
+                objective=objective, n_jobs=-1,
+                class_weight='balanced',
+                verbose=-1,
+            )
+        else:
+            self.model = lgb.LGBMRegressor(
+                n_estimators=200, learning_rate=0.05,
+                num_leaves=31, random_state=42,
+                n_jobs=-1, verbose=-1,
+            )
+        self.model.fit(X_train, y_train, eval_set=[(X_test, y_test)])
+        self.is_fitted = True
+        y_pred = self.model.predict(X_test)
+        self.y_pred = y_pred
+        self.metrics = self._compute_metrics(y_test, y_pred, X, y)
+        return self.metrics
+# ---------------------------------------------------------------------------
+# Ensemble / AutoML-style pipeline
+# ---------------------------------------------------------------------------
+class EnsemblePipeline(MLPipeline):
+    """
+    Voting ensemble of Random Forest + Gradient Boosting (+ XGBoost if available).
+    Best overall accuracy across most datasets.
+    """
+    def __init__(self, task_type: str = "classification"):
+        super().__init__(task_type=task_type, model_name="Ensemble")
+    def train(self, X: np.ndarray, y: np.ndarray, test_size: float = 0.2) -> Dict[str, Any]:
+        if isinstance(X, pd.DataFrame):
+            X = self._preprocess_X(X, fit=True)
+        stratify = None
+        if self.task_type == "classification":
+            unique, counts = np.unique(y, return_counts=True)
+            if all(c >= 2 for c in counts):
+                stratify = y
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=test_size, random_state=42, stratify=stratify
+        )
+        self.X_test = X_test
+        self.y_test = y_test
+        if self.task_type == "classification":
+            estimators = [
+                ("rf", RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=-1, class_weight='balanced')),
+                ("gb", GradientBoostingClassifier(n_estimators=100, random_state=42)),
+            ]
+            if XGB_AVAILABLE:
+                estimators.append(("xgb", xgb.XGBClassifier(
+                    n_estimators=100,
+                    eval_metric='logloss', random_state=42, n_jobs=-1,
+                )))
+            self.model = VotingClassifier(estimators=estimators, voting='soft', n_jobs=-1)
+        else:
+            estimators = [
+                ("rf", RandomForestRegressor(n_estimators=150, random_state=42, n_jobs=-1)),
+                ("gb", GradientBoostingRegressor(n_estimators=100, random_state=42)),
+            ]
+            if XGB_AVAILABLE:
+                estimators.append(("xgb", xgb.XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1)))
+            self.model = VotingRegressor(estimators=estimators, n_jobs=-1)
+        self.model.fit(X_train, y_train)
+        self.is_fitted = True
+        y_pred = self.model.predict(X_test)
+        self.y_pred = y_pred
+        self.metrics = self._compute_metrics(y_test, y_pred, X, y)
+        return self.metrics
+    def get_feature_importance(self) -> pd.DataFrame:
+        """Average feature importances from sub-estimators that support it."""
+        importances = []
+        estimators = self.model.estimators_
+        for est in estimators:
+            if hasattr(est, 'feature_importances_'):
+                importances.append(est.feature_importances_)
+        if not importances:
+            return pd.DataFrame({"feature": self.feature_names, "importance": 0.0})
+        avg_importance = np.mean(importances, axis=0)
+        return pd.DataFrame({
+            "feature": self.feature_names[:len(avg_importance)],
+            "importance": avg_importance,
+        }).sort_values("importance", ascending=False).reset_index(drop=True)

models/nlp_module.py ADDED Viewed

	@@ -0,0 +1,327 @@

+"""
+nlp_module.py  —  NLP Module (v2.1 Clean)
+Models:
+  - DistilBERT SST-2     → sentiment analysis  (~250 MB, downloads on first use)
+  - spaCy en_core_web_sm → named entity recognition (~15 MB, auto-downloads)
+  - TF-IDF              → zero-shot classification (no download)
+  - Extractive          → summarization (no download)
+  - Smart AI (built-in)  → chatbot, zero downloads
+"""
+import warnings
+warnings.filterwarnings("ignore")
+import streamlit as st
+# ══════════════════════════════════════════════════════════════════════════════
+#  Cached pipeline loaders
+# ══════════════════════════════════════════════════════════════════════════════
+@st.cache_resource(show_spinner=False)
+def load_sentiment_pipeline():
+    """DistilBERT SST-2 — ~250 MB, fast and accurate."""
+    from transformers import pipeline  # type: ignore[import-untyped]
+    return pipeline(  # type: ignore[call-overload]
+        "sentiment-analysis",
+        model="distilbert-base-uncased-finetuned-sst-2-english",
+    )
+@st.cache_resource(show_spinner=False)
+def load_ner_pipeline():
+    """
+    spaCy en_core_web_sm (~15 MB) for NER.
+    Falls back to regex-based NER if spaCy is not installed.
+    Install: pip install spacy && python -m spacy download en_core_web_sm
+    """
+    try:
+        import spacy
+        try:
+            return ("spacy", spacy.load("en_core_web_sm"))
+        except OSError:
+            from spacy.cli.download import download as spacy_download  # type: ignore[import]
+            spacy_download("en_core_web_sm")
+            return ("spacy", spacy.load("en_core_web_sm"))
+    except ImportError:
+        return ("regex", None)
+@st.cache_resource(show_spinner=False)
+def load_zero_shot_pipeline():
+    """
+    Lightweight zero-shot classification using TF-IDF cosine similarity.
+    Zero model downloads, zero RAM overhead — works on any machine.
+    Falls back gracefully without any internet or large model requirement.
+    """
+    return "tfidf"   # sentinel value — actual logic is in run_text_classification
+@st.cache_resource(show_spinner=False)
+def load_summarization_pipeline():
+    """
+    Extractive summarizer — word-frequency scoring, zero model download.
+    Picks the most informative sentences from the input text.
+    """
+    return "extractive"   # sentinel — actual logic in run_summarization
+# ══════════════════════════════════════════════════════════════════════════════
+#  Business logic
+# ══════════════════════════════════════════════════════════════════════════════
+def run_sentiment(texts: list) -> list:
+    """
+    Sentiment analysis on a list of strings.
+    Returns list of dicts: Text, Sentiment, Confidence, Score.
+    """
+    pipe    = load_sentiment_pipeline()
+    results = []
+    for text in texts:
+        if text.strip():
+            r = pipe(text[:512], truncation=True, max_length=512)[0]
+            results.append({
+                "Text":       text[:80],
+                "Sentiment":  r["label"],
+                "Confidence": f"{r['score'] * 100:.1f}%",
+                "Score":      round(r["score"], 4),
+            })
+    return results
+def run_ner(text: str) -> list:
+    """
+    Named Entity Recognition using spaCy (15 MB) or regex fallback.
+    Returns list of dicts: Entity, Type, Score, Start, End.
+    """
+    backend, model = load_ner_pipeline()
+    if backend == "spacy" and model is not None:
+        doc = model(text[:1000])
+        return [
+            {
+                "Entity": ent.text,
+                "Type":   ent.label_,
+                "Score":  "100.0%",
+                "Start":  ent.start_char,
+                "End":    ent.end_char,
+            }
+            for ent in doc.ents
+        ]
+    # ── Regex fallback — works with zero extra installs ──────────────────────
+    import re
+    patterns = [
+        (
+            r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+'
+            r'(?:Inc|Corp|Ltd|LLC|Co|Group|Foundation|Institute|University|'
+            r'College|School|Hospital|Bank|Technologies|Solutions|Systems|Services)\.?)\b',
+            "ORG",
+        ),
+        (
+            r'\b([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*)\b'
+            r'(?=\s+(?:City|State|Country|Street|Avenue|Road|Park|Lake|River|'
+            r'Mountain|Valley|Island|Bay|County|District|Province|Region))',
+            "LOC",
+        ),
+        (
+            r'\b([A-Z][a-z]{2,}\s+[A-Z][a-z]{2,}(?:\s+[A-Z][a-z]{2,})?)\b',
+            "PER",
+        ),
+        (r'\b([A-Z]{2,6})\b', "ORG"),
+    ]
+    seen, results = set(), []
+    for pattern, label in patterns:
+        for m in re.finditer(pattern, text):
+            entity = m.group(1).strip()
+            key    = (entity, label)
+            if key not in seen and len(entity) > 1:
+                seen.add(key)
+                results.append({
+                    "Entity": entity,
+                    "Type":   label,
+                    "Score":  "~",
+                    "Start":  m.start(),
+                    "End":    m.end(),
+                })
+    return sorted(results, key=lambda x: x["Start"])
+def _tfidf_cosine(text: str, label: str) -> float:
+    """Compute TF-IDF cosine similarity between text and a label string."""
+    import re
+    from collections import Counter
+    import math
+    _stop = {"the","a","an","is","are","was","were","be","been","being","have",
+             "has","had","do","does","did","will","would","could","should","may",
+             "might","can","to","of","in","for","on","with","at","by","from","as",
+             "and","but","or","not","it","its","this","that","i","we","you","he",
+             "she","they","all","any","more","so","very","also","just","about"}
+    def _tokens(s: str) -> list:
+        return [w for w in re.findall(r"[a-z]+", s.lower()) if w not in _stop and len(w) > 1]
+    t_tokens = _tokens(text)
+    l_tokens = _tokens(label)
+    if not t_tokens or not l_tokens:
+        return 0.0
+    # TF of text
+    tf_t = Counter(t_tokens)
+    tf_l = Counter(l_tokens)
+    # Vocabulary union
+    vocab = set(tf_t) | set(tf_l)
+    # Simple IDF weight: log(1 + 1/freq_ratio) — single-doc approximation
+    def vec(tf: Counter) -> dict:
+        total = sum(tf.values()) or 1
+        return {w: tf[w] / total for w in vocab}
+    vt = vec(tf_t)
+    vl = vec(tf_l)
+    dot    = sum(vt[w] * vl[w] for w in vocab)
+    norm_t = math.sqrt(sum(v * v for v in vt.values())) or 1e-9
+    norm_l = math.sqrt(sum(v * v for v in vl.values())) or 1e-9
+    return dot / (norm_t * norm_l)
+def run_text_classification(text: str, labels: list) -> list:
+    """
+    Zero-shot text classification using TF-IDF cosine similarity.
+    No model download required — works instantly on any machine.
+    Returns list of dicts: Label, Score, Confidence — sorted by score desc.
+    """
+    if not labels:
+        return []
+    scores = []
+    for label in labels:
+        # Boost: also compare text against expanded label description
+        sim = _tfidf_cosine(text, label)
+        scores.append((label, sim))
+    # Normalise scores so they sum to 1 (softmax-like)
+    import math
+    exp_scores = [(lbl, math.exp(s * 8)) for lbl, s in scores]   # temperature=8 sharpens
+    total = sum(s for _, s in exp_scores) or 1.0
+    normalised = sorted(
+        [{"Label": lbl, "Score": round(s / total, 4), "Confidence": f"{s / total * 100:.1f}%"}
+         for lbl, s in exp_scores],
+        key=lambda x: x["Score"], reverse=True,
+    )
+    return normalised
+def run_summarization(text: str) -> str:
+    """
+    Extractive summarization using word-frequency scoring.
+    Zero model download — works on any machine, any RAM size.
+    Picks the top 3 most informative sentences.
+    """
+    import re
+    from collections import Counter
+    text = text.strip()
+    # Split into sentences
+    sentences = re.split(r"(?<=[.!?])\s+", text)
+    sentences = [s.strip() for s in sentences if len(s.split()) > 4]
+    if len(sentences) <= 2:
+        return text[:400] + ("…" if len(text) > 400 else "")
+    # Stop words to ignore when computing importance
+    stop = {"the","a","an","is","are","was","were","be","been","being","have",
+            "has","had","do","does","did","will","would","could","should","may",
+            "might","can","to","of","in","for","on","with","at","by","from",
+            "as","into","and","but","or","not","it","its","this","that","i",
+            "we","you","he","she","they","all","any","each","more","most","so",
+            "very","also","just","about","than","other","such","when","which"}
+    words  = re.findall(r"[a-z]+", text.lower())
+    freq   = Counter(w for w in words if w not in stop and len(w) > 2)
+    max_f  = max(freq.values(), default=1)
+    freq   = {w: v / max_f for w, v in freq.items()}
+    # Score sentences
+    scores: dict = {}
+    for i, sent in enumerate(sentences):
+        score = sum(freq.get(w, 0) for w in re.findall(r"[a-z]+", sent.lower()))
+        score = score / max(len(sent.split()), 1)
+        if i == 0:
+            score *= 1.3    # slight boost for the opening sentence
+        scores[i] = score
+    # Pick top N sentences (preserve original order)
+    n   = max(1, min(4, len(sentences) // 3))
+    top = sorted(sorted(scores, key=lambda k: scores[k], reverse=True)[:n])
+    return " ".join(sentences[i] for i in top)
+def chat_with_model(prompt: str, history: list) -> str:
+    """
+    Instant chatbot using Smart AI — no model download, zero RAM.
+    Falls back to simple keyword responses if the import fails.
+    """
+    try:
+        import sys
+        from pathlib import Path
+        # Support both flat and models/ directory layouts
+        sys.path.insert(0, str(Path(__file__).parent))
+        sys.path.insert(0, str(Path(__file__).parent.parent))
+        from generative_ai import _smart_respond
+        # Convert (user, bot) tuple history to dict format
+        hist_dicts = []
+        for u, b in history[-4:]:
+            hist_dicts.append({"role": "user",      "content": u})
+            hist_dicts.append({"role": "assistant",  "content": b})
+        return _smart_respond(prompt, hist_dicts)
+    except Exception:
+        # Ultra-safe fallback if generative_ai import fails
+        p = prompt.lower()
+        if any(w in p for w in ["hello", "hi", "hey"]):
+            return "Hello! Ask me anything about ML, data science, or AI. 😊"
+        if "machine learning" in p or " ml " in p:
+            return (
+                "**Machine Learning** enables systems to learn patterns from data without "
+                "explicit programming. Types: Supervised, Unsupervised, Reinforcement. "
+                "Libraries: scikit-learn, XGBoost, LightGBM."
+            )
+        if "deep learning" in p or "neural" in p:
+            return (
+                "**Deep Learning** uses multi-layer neural networks to learn complex features. "
+                "Best for images (CNNs), sequences (Transformers), and unstructured data. "
+                "Frameworks: PyTorch, TensorFlow."
+            )
+        if "xgboost" in p or "gradient boosting" in p:
+            return (
+                "**XGBoost** builds trees sequentially, each correcting errors of the prior. "
+                "Key params: n_estimators, max_depth, learning_rate. Extremely fast and accurate."
+            )
+        if "overfitting" in p:
+            return (
+                "**Overfitting** = model memorises training noise, fails on new data. "
+                "Fixes: cross-validation, regularisation (L1/L2), dropout, more data, simpler model."
+            )
+        if "python" in p:
+            return (
+                "**Python** dominates AI/ML thanks to: NumPy, Pandas, scikit-learn, "
+                "PyTorch, TensorFlow, HuggingFace Transformers. "
+                "Use virtual environments to manage dependencies."
+            )
+        if "nlp" in p or "natural language" in p:
+            return (
+                "**NLP** (Natural Language Processing) enables machines to understand text. "
+                "Key tasks: sentiment, NER, classification, summarisation, translation. "
+                "Modern approach: HuggingFace Transformers (BERT, GPT, T5)."
+            )
+        return (
+            "I'm your AI assistant. Try asking about: machine learning, neural networks, "
+            "XGBoost, overfitting, Python, NLP, or data science topics!"
+        )