ayushsahu45 committed on
Commit
82dccf5
·
verified ·
1 Parent(s): 6c1a9ca

Upload 4 files

Browse files
models/dl_module.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ dl_module.py - Deep Learning Module
3
+ Image classification using pretrained MobileNetV2/ResNet50 + OpenCV object detection
4
+ """
5
+
6
+ import streamlit as st
7
+ import numpy as np
8
+ import cv2
9
+ import io
10
+ import warnings
11
+ warnings.filterwarnings("ignore")
12
+
13
+ from PIL import Image
14
+
15
+ # ─── Lazy imports ────────────────────────────────────────────────────────────
16
+
17
+ def _load_tf_model(model_name):
18
+ """Load a Keras pretrained model."""
19
+ import tensorflow as tf
20
+ from tensorflow.keras.applications import MobileNetV2, ResNet50, VGG16
21
+ from tensorflow.keras.applications.mobilenet_v2 import preprocess_input as mn_pre, decode_predictions as mn_dec
22
+ from tensorflow.keras.applications.resnet50 import preprocess_input as rn_pre, decode_predictions as rn_dec
23
+ from tensorflow.keras.applications.vgg16 import preprocess_input as vg_pre, decode_predictions as vg_dec
24
+
25
+ models_map = {
26
+ "MobileNetV2": (MobileNetV2, mn_pre, mn_dec, (224, 224)),
27
+ "ResNet50": (ResNet50, rn_pre, rn_dec, (224, 224)),
28
+ "VGG16": (VGG16, vg_pre, vg_dec, (224, 224)),
29
+ }
30
+ ModelClass, preprocess, decode, size = models_map[model_name]
31
+ model = ModelClass(weights="imagenet")
32
+ return model, preprocess, decode, size
33
+
34
+
35
def _classify_image_tf(image_pil, model_name):
    """Classify an image with a pretrained TF/Keras model.

    Args:
        image_pil: PIL image; it is converted to RGB and resized to the
            model's registered input size before inference.
        model_name: Name understood by ``_load_tf_model``.

    Returns:
        A list of five dicts (best first) with the keys ``"Rank"``,
        ``"Label"``, ``"Confidence"`` (percentage string) and ``"Score"``
        (builtin ``float`` rounded to 4 decimals).
    """
    from tensorflow.keras.preprocessing.image import img_to_array

    model, preprocess, decode, (h, w) = _load_tf_model(model_name)

    # PIL's resize takes (width, height).
    resized = image_pil.convert("RGB").resize((w, h))
    batch = np.expand_dims(img_to_array(resized), axis=0)
    batch = preprocess(batch)

    preds = model.predict(batch, verbose=0)
    top = decode(preds, top=5)[0]

    results = []
    for rank, (_, label, prob) in enumerate(top, start=1):
        results.append({
            "Rank": rank,
            "Label": label.replace("_", " ").title(),
            "Confidence": f"{prob*100:.2f}%",
            # Cast to a builtin float so the value has the same type as the
            # PyTorch backend's "Score" (which comes from ``Tensor.item()``).
            "Score": round(float(prob), 4),
        })
    return results
51
+
52
+
53
def _classify_image_torch(image_pil, model_name):
    """Classify an image with a pretrained torchvision model.

    Args:
        image_pil: PIL image; converted to RGB before preprocessing.
        model_name: ``"ResNet50"`` selects ResNet-50; ``"MobileNetV2"`` and
            any unrecognised name select MobileNetV2.

    Returns:
        A list of five dicts (best first) with the keys ``"Rank"``,
        ``"Label"``, ``"Confidence"`` and ``"Score"``.
    """
    import torch
    import torchvision.transforms as T
    import torchvision.models as models_tv
    import json
    import urllib.request

    # Human-readable ImageNet labels are fetched at call time; on any failure
    # the class index itself is used as the label.
    LABELS_URL = "https://raw.githubusercontent.com/anishathalye/imagenet-simple-labels/master/imagenet-simple-labels.json"
    try:
        with urllib.request.urlopen(LABELS_URL, timeout=5) as response:
            class_labels = json.load(response)
    except Exception:
        class_labels = list(map(str, range(1000)))

    available = {
        "MobileNetV2": models_tv.mobilenet_v2,
        "ResNet50": models_tv.resnet50,
    }
    build = available[model_name] if model_name in available else models_tv.mobilenet_v2
    net = build(pretrained=True)
    net.eval()

    # Resize / centre-crop / normalise pipeline applied before inference.
    steps = [
        T.Resize(256),
        T.CenterCrop(224),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
    to_tensor = T.Compose(steps)

    batch = to_tensor(image_pil.convert("RGB")).unsqueeze(0)
    with torch.no_grad():
        logits = net(batch)
    probs = torch.nn.functional.softmax(logits[0], dim=0)

    best_probs, best_idxs = torch.topk(probs, 5)
    results = []
    for position in range(len(best_probs)):
        class_id = best_idxs[position].item()
        score = best_probs[position].item()
        # Guard against a label list shorter than the model's class count.
        if class_id < len(class_labels):
            name = class_labels[class_id]
        else:
            name = str(class_id)
        results.append({
            "Rank": position + 1,
            "Label": name.replace("_", " ").title(),
            "Confidence": f"{score*100:.2f}%",
            "Score": round(score, 4),
        })
    return results
101
+
102
+
103
def detect_edges_opencv(image_pil):
    """Return the Canny edge map of a PIL image.

    The image is converted to RGB, reduced to grayscale, smoothed with a
    5x5 Gaussian blur and passed to Canny with thresholds 50 and 150.
    """
    rgb = np.array(image_pil.convert("RGB"))
    smoothed = cv2.GaussianBlur(cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY), (5, 5), 0)
    return cv2.Canny(smoothed, threshold1=50, threshold2=150)
110
+
111
+
112
def detect_faces_opencv(image_pil):
    """Detect frontal faces with OpenCV's bundled Haar cascade.

    Args:
        image_pil: PIL image; converted to RGB before processing.

    Returns:
        A tuple ``(annotated, count)`` where ``annotated`` is a copy of the
        RGB image array with a rectangle and a "Face" label drawn per
        detection, and ``count`` is the number of detections.

    Raises:
        RuntimeError: If the Haar cascade file cannot be loaded.
    """
    img_array = np.array(image_pil.convert("RGB"))
    # Convert straight from RGB; the intermediate BGR copy was unnecessary.
    gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)

    cascade_path = cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
    face_cascade = cv2.CascadeClassifier(cascade_path)
    if face_cascade.empty():
        # Fail with a clear message instead of an opaque cv2 error later on.
        raise RuntimeError(f"Could not load Haar cascade from {cascade_path}")

    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

    result_img = img_array.copy()
    box_color = (0, 200, 255)  # interpreted as RGB because result_img is an RGB array
    for (x, y, w, h) in faces:
        x, y, w, h = int(x), int(y), int(w), int(h)
        cv2.rectangle(result_img, (x, y), (x + w, y + h), box_color, 2)
        # Keep the label baseline inside the image when the face touches the top edge.
        label_y = max(y - 8, 15)
        cv2.putText(result_img, "Face", (x, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.6, box_color, 2)
    return result_img, len(faces)
127
+
128
+
129
def apply_image_filters(image_pil):
    """Run a fixed set of OpenCV filters over an image.

    Returns:
        A dict mapping, in this order, ``"Grayscale"``, ``"Blurred"``,
        ``"Sharpened"``, ``"Threshold"`` and ``"Contours"`` to the resulting
        arrays.
    """
    rgb = np.array(image_pil.convert("RGB"))
    grayscale = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)

    soft = cv2.GaussianBlur(rgb, (15, 15), 0)
    # Unsharp mask: boost the original and subtract part of the blurred copy.
    crisp = cv2.addWeighted(rgb, 1.5, soft, -0.5, 0)

    binary = cv2.adaptiveThreshold(
        grayscale, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )

    # Outer contours of the Canny edge map, drawn on a copy of the image.
    edge_map = cv2.Canny(grayscale, 50, 150)
    found, _ = cv2.findContours(edge_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    outlined = rgb.copy()
    cv2.drawContours(outlined, found, -1, (0, 255, 120), 1)

    outputs = {}
    outputs["Grayscale"] = grayscale
    outputs["Blurred"] = soft
    outputs["Sharpened"] = crisp
    outputs["Threshold"] = binary
    outputs["Contours"] = outlined
    return outputs
151
+
152
+
153
+ # ─── Streamlit UI ─────────────────────────────────────────────────────────────
154
+
155
def render_dl_module():
    """Render the Streamlit page for image classification and OpenCV tools.

    The page offers an image uploader and three tabs: pretrained-CNN
    classification (TensorFlow/Keras or PyTorch), OpenCV edge/face
    analysis, and a gallery of OpenCV filters. Nothing is returned; all
    output goes through Streamlit calls.
    """
    st.header("🧠 Deep Learning Module")
    st.markdown("Upload an image to classify it with pretrained CNNs or run OpenCV computer vision pipelines.")

    uploaded = st.file_uploader("Upload Image (JPG/PNG)", type=["jpg", "jpeg", "png"], key="dl_upload")

    if uploaded is None:
        st.info("πŸ‘† Upload an image (JPG or PNG) to begin. Try uploading a photo of an animal, vehicle, or everyday object.")
        return

    image_pil = Image.open(uploaded)
    st.image(image_pil, caption="Uploaded Image", use_column_width=True)

    tabs = st.tabs(["🏷️ Image Classification", "πŸ‘οΈ OpenCV Analysis", "🎨 Image Filters"])

    # ── Tab 1: Classification ─────────────────────────────────────────────────
    with tabs[0]:
        st.subheader("Image Classification (ImageNet)")

        backend = st.radio("Choose Backend", ["TensorFlow/Keras", "PyTorch"], horizontal=True)
        if backend == "TensorFlow/Keras":
            model_choice = st.selectbox("Model", ["MobileNetV2", "ResNet50", "VGG16"])
        else:
            model_choice = st.selectbox("Model", ["MobileNetV2", "ResNet50"])

        if st.button("πŸ” Classify Image", type="primary"):
            with st.spinner(f"Running {model_choice} inference..."):
                try:
                    if backend == "TensorFlow/Keras":
                        results = _classify_image_tf(image_pil, model_choice)
                    else:
                        results = _classify_image_torch(image_pil, model_choice)

                    import pandas as pd
                    import matplotlib.pyplot as plt

                    st.success(f"βœ… Top prediction: **{results[0]['Label']}** ({results[0]['Confidence']})")
                    st.subheader("Top 5 Predictions")
                    df_preds = pd.DataFrame(results)
                    st.dataframe(df_preds, use_container_width=True)

                    # Bar chart of confidences
                    fig, ax = plt.subplots(figsize=(8, 4))
                    try:
                        labels = [r["Label"][:30] for r in results]
                        scores = [r["Score"] for r in results]
                        colors = ["#0ea5e9" if i == 0 else "#334155" for i in range(len(scores))]
                        # Reverse so the best prediction ends up at the top of the chart.
                        bars = ax.barh(labels[::-1], scores[::-1], color=colors[::-1])
                        ax.set_xlabel("Confidence Score")
                        ax.set_title("Top 5 Predictions")
                        ax.set_xlim(0, max(scores) * 1.2)
                        for bar, score in zip(bars, scores[::-1]):
                            ax.text(bar.get_width() + 0.005, bar.get_y() + bar.get_height()/2,
                                    f"{score*100:.1f}%", va="center", fontsize=9)
                        plt.tight_layout()
                        st.pyplot(fig)
                    finally:
                        # pyplot keeps every figure alive until it is closed;
                        # without this each click would leave one more figure open.
                        plt.close(fig)

                except Exception as e:
                    st.error(f"Classification failed: {e}")
                    st.info("Make sure TensorFlow or PyTorch is installed. Run: `pip install tensorflow` or `pip install torch torchvision`")

    # ── Tab 2: OpenCV Analysis ────────────────────────────────────────────────
    with tabs[1]:
        st.subheader("OpenCV Computer Vision")

        cv_task = st.selectbox("Select Analysis", ["Edge Detection", "Face Detection"])

        if st.button("β–Ά Run OpenCV Analysis", type="primary"):
            with st.spinner("Processing with OpenCV..."):
                if cv_task == "Edge Detection":
                    edges = detect_edges_opencv(image_pil)
                    col1, col2 = st.columns(2)
                    with col1:
                        st.image(image_pil, caption="Original", use_column_width=True)
                    with col2:
                        st.image(edges, caption="Canny Edge Detection", use_column_width=True, clamp=True)
                    st.info(f"Detected approximately **{np.sum(edges > 0):,}** edge pixels.")

                elif cv_task == "Face Detection":
                    result_img, face_count = detect_faces_opencv(image_pil)
                    col1, col2 = st.columns(2)
                    with col1:
                        st.image(image_pil, caption="Original", use_column_width=True)
                    with col2:
                        st.image(result_img, caption="Face Detection", use_column_width=True)
                    if face_count > 0:
                        st.success(f"βœ… Detected **{face_count}** face(s).")
                    else:
                        st.warning("No faces detected. Try a clear portrait photo.")

    # ── Tab 3: Image Filters ──────────────────────────────────────────────────
    with tabs[2]:
        st.subheader("OpenCV Image Processing Filters")
        if st.button("🎨 Apply All Filters", type="primary"):
            with st.spinner("Applying filters..."):
                filters = apply_image_filters(image_pil)
                cols = st.columns(3)
                # Lay the results out round-robin over three columns.
                for i, (name, img) in enumerate(filters.items()):
                    with cols[i % 3]:
                        if len(img.shape) == 2:
                            # Single-channel outputs are shown with clamp=True, as before.
                            st.image(img, caption=name, use_column_width=True, clamp=True)
                        else:
                            st.image(img, caption=name, use_column_width=True)
models/generative_ai.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ generative_ai.py - Generative AI Module
3
+ Supports OpenAI GPT, Google Gemini, Anthropic Claude, and Smart AI fallback
4
+ """
5
+
6
+ import warnings
7
+ warnings.filterwarnings("ignore")
8
+
9
+ OPENAI_OK = False
10
+ GOOGLE_OK = False
11
+ ANTHROPIC_OK = False
12
+
13
+ try:
14
+ import openai
15
+ OPENAI_OK = True
16
+ except ImportError:
17
+ pass
18
+
19
+ try:
20
+ import google.generativeai as genai
21
+ GOOGLE_OK = True
22
+ except ImportError:
23
+ pass
24
+
25
+ try:
26
+ import anthropic
27
+ ANTHROPIC_OK = True
28
+ except ImportError:
29
+ pass
30
+
31
+
32
+ def _smart_respond(prompt: str, history: list) -> str:
33
+ """Instant smart AI response without API calls - keyword-based fallback."""
34
+ p = prompt.lower()
35
+
36
+ if any(w in p for w in ["hello", "hi", "hey", "greetings"]):
37
+ return "Hello! I'm your AI assistant. How can I help you today?"
38
+
39
+ if "machine learning" in p or " ml " in p or "machine learning" in p:
40
+ return (
41
+ "**Machine Learning** enables systems to learn from data without explicit programming. "
42
+ "Types: Supervised, Unsupervised, Reinforcement Learning. "
43
+ "Popular libraries: scikit-learn, XGBoost, LightGBM, PyTorch, TensorFlow."
44
+ )
45
+
46
+ if "deep learning" in p or "neural network" in p or "cnn" in p:
47
+ return (
48
+ "**Deep Learning** uses multi-layer neural networks to learn complex patterns. "
49
+ "Best for: images (CNNs), sequences (RNNs/LSTMs), Transformers. "
50
+ "Frameworks: PyTorch, TensorFlow/Keras."
51
+ )
52
+
53
+ if "xgboost" in p or "gradient boosting" in p:
54
+ return (
55
+ "**XGBoost** builds trees sequentially, each correcting prior errors. "
56
+ "Key parameters: n_estimators, max_depth, learning_rate, subsample. "
57
+ "Extremely fast and accurate for tabular data."
58
+ )
59
+
60
+ if "lightgbm" in p:
61
+ return (
62
+ "**LightGBM** uses histogram-based gradient boosting for speed. "
63
+ "Great for large datasets. Uses leaf-wise tree growth vs level-wise."
64
+ )
65
+
66
+ if "overfitting" in p or "underfitting" in p:
67
+ return (
68
+ "**Overfitting** = model memorizes training noise, fails on new data. "
69
+ "Fixes: cross-validation, regularization (L1/L2), dropout, more data, simpler model. "
70
+ "**Underfitting** = model too simple to capture patterns. Fixes: more features, complex model."
71
+ )
72
+
73
+ if "python" in p:
74
+ return (
75
+ "**Python** dominates AI/ML thanks to: NumPy, Pandas, scikit-learn, "
76
+ "PyTorch, TensorFlow, HuggingFace Transformers. "
77
+ "Use virtual environments (venv/conda) to manage dependencies."
78
+ )
79
+
80
+ if "nlp" in p or "natural language" in p or "text" in p:
81
+ return (
82
+ "**NLP** (Natural Language Processing) enables machines to understand text. "
83
+ "Key tasks: sentiment analysis, NER, classification, summarization, translation. "
84
+ "Modern approach: HuggingFace Transformers (BERT, GPT, T5), spaCy."
85
+ )
86
+
87
+ if "data" in p and ("clean" in p or "preprocess" in p):
88
+ return (
89
+ "**Data Preprocessing** steps: 1) Handle missing values (mean/median/mode), "
90
+ "2) Encode categoricals (LabelEncoder, OneHot), 3) Scale numeric features, "
91
+ "4) Remove outliers, 5) Feature engineering."
92
+ )
93
+
94
+ if "random forest" in p or "rf " in p:
95
+ return (
96
+ "**Random Forest** is an ensemble of decision trees. "
97
+ "Uses bagging and random feature selection. "
98
+ "Key params: n_estimators, max_depth, min_samples_split. "
99
+ "Good for feature importance and handling missing values."
100
+ )
101
+
102
+ if "classification" in p:
103
+ return (
104
+ "**Classification** predicts categorical labels. "
105
+ "Algorithms: Logistic Regression, Decision Trees, Random Forest, SVM, XGBoost. "
106
+ "Metrics: Accuracy, Precision, Recall, F1-Score, ROC-AUC."
107
+ )
108
+
109
+ if "regression" in p:
110
+ return (
111
+ "**Regression** predicts continuous values. "
112
+ "Algorithms: Linear Regression, Ridge, Lasso, Random Forest, XGBoost. "
113
+ "Metrics: MSE, RMSE, MAE, RΒ² Score."
114
+ )
115
+
116
+ if "api" in p or "key" in p or "openai" in p or "gpt" in p:
117
+ return (
118
+ "To use GPT models, set OPENAI_API_KEY environment variable or pass api_key parameter. "
119
+ "Get your key from https://platform.openai.com/api-keys"
120
+ )
121
+
122
+ if "help" in p or "what can you do" in p:
123
+ return (
124
+ "I can help with: Machine Learning, Deep Learning, NLP, Data Science, "
125
+ "Python programming, XGBoost, scikit-learn, TensorFlow, PyTorch, "
126
+ "model evaluation, and more! Ask me anything."
127
+ )
128
+
129
+ return (
130
+ f"I understand you're asking about: '{prompt[:50]}...'. "
131
+ "Try asking about: machine learning, neural networks, XGBoost, Python, "
132
+ "NLP, data preprocessing, classification, regression, or specific algorithms!"
133
+ )
134
+
135
+
136
class GenerativeAI:
    """Thin wrapper that routes a prompt to one of several text generators.

    Supported providers are ``"smart"`` (the local keyword responder),
    ``"openai"``, ``"google"`` and ``"anthropic"``. A remote client is only
    created when the matching SDK imported successfully and an API key was
    supplied; otherwise every request is answered by the local responder.
    """

    def __init__(self, api_key: str = "", provider: str = "smart"):
        """Store the settings and, when possible, create the provider client.

        Args:
            api_key: Credential for the selected provider; may be empty.
            provider: One of ``"smart"``, ``"openai"``, ``"google"``,
                ``"anthropic"``.
        """
        self.api_key = api_key
        self.provider = provider
        self._provider = provider  # kept alongside ``provider`` for existing callers
        self._provider_config = self._get_provider_config(provider)
        self.client = None

        if provider == "openai" and OPENAI_OK and api_key:
            openai.api_key = api_key
            self.client = openai
        elif provider == "google" and GOOGLE_OK and api_key:
            genai.configure(api_key=api_key)
            self.client = genai
        elif provider == "anthropic" and ANTHROPIC_OK and api_key:
            self.client = anthropic.Anthropic(api_key=api_key)

    def _get_provider_config(self, provider: str) -> dict:
        """Return display metadata (name, status icon, description) for a provider.

        Unknown provider names fall back to the ``"smart"`` entry.
        """
        configs = {
            "smart": {"name": "Smart AI", "status": "βœ…", "desc": "Instant responses - no API key needed"},
            "openai": {"name": "OpenAI GPT-4o", "status": "🟒" if OPENAI_OK else "❌", "desc": "Requires API key"},
            "google": {"name": "Google Gemini", "status": "πŸ”΅" if GOOGLE_OK else "❌", "desc": "Requires API key"},
            "anthropic": {"name": "Anthropic Claude", "status": "🟣" if ANTHROPIC_OK else "❌", "desc": "Requires API key"},
        }
        return configs.get(provider, configs["smart"])

    def generate(self, prompt: str, history: list = None) -> str:
        """Generate a reply to ``prompt`` with the configured provider.

        Args:
            prompt: The user's message.
            history: Optional earlier messages. For the OpenAI provider they
                are sent as-is ahead of the new prompt, so they are assumed
                to be chat-message dicts with ``role``/``content`` keys
                (TODO confirm against the caller). The list is not modified.

        Returns:
            The provider's reply text. If the provider call raises, an error
            note followed by the local keyword-based answer is returned
            instead of propagating the exception.
        """
        if self.provider == "smart" or not self.client:
            return _smart_respond(prompt, history or [])

        try:
            if self.provider == "openai":
                # Earlier turns must come first and the new prompt last;
                # the opposite order presents the conversation out of sequence.
                messages = list(history or [])
                messages.append({"role": "user", "content": prompt})
                response = self.client.chat.completions.create(
                    model="gpt-4o",
                    messages=messages,
                )
                return response.choices[0].message.content

            elif self.provider == "google":
                # NOTE(review): history is not forwarded for this provider.
                model = self.client.GenerativeModel("gemini-pro")
                chat = model.start_chat(history=[])
                response = chat.send_message(prompt)
                return response.text

            elif self.provider == "anthropic":
                # NOTE(review): history is not forwarded for this provider.
                response = self.client.messages.create(
                    model="claude-3-opus-20240229",
                    max_tokens=1024,
                    messages=[{"role": "user", "content": prompt}]
                )
                return response.content[0].text

        except Exception as e:
            # Best-effort: never surface a provider failure as a crash in the UI.
            return f"Error with {self.provider}: {str(e)}. Falling back to smart AI.\n\n" + _smart_respond(prompt, history or [])

        return _smart_respond(prompt, history or [])
models/ml_models.py ADDED
@@ -0,0 +1,487 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
4
+ from sklearn.ensemble import (
5
+ RandomForestClassifier, RandomForestRegressor,
6
+ GradientBoostingClassifier, GradientBoostingRegressor,
7
+ VotingClassifier, VotingRegressor,
8
+ )
9
+ from sklearn.linear_model import LogisticRegression, Ridge, Lasso
10
+ from sklearn.svm import SVC, SVR
11
+ from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
12
+ from sklearn.metrics import (
13
+ accuracy_score, classification_report, mean_squared_error,
14
+ r2_score, f1_score, roc_auc_score, confusion_matrix,
15
+ mean_absolute_error,
16
+ )
17
+ from sklearn.pipeline import Pipeline
18
+ from sklearn.impute import SimpleImputer
19
+ from typing import Dict, Any, Tuple, Optional, List
20
+ import warnings
21
+ warnings.filterwarnings('ignore')
22
+
23
+ try:
24
+ import xgboost as xgb
25
+ XGB_AVAILABLE = True
26
+ except ImportError:
27
+ XGB_AVAILABLE = False
28
+
29
+ try:
30
+ import lightgbm as lgb
31
+ LGB_AVAILABLE = True
32
+ except ImportError:
33
+ LGB_AVAILABLE = False
34
+
35
+
36
class MLPipeline:
    """
    Machine-learning pipeline for classification and regression.

    Wraps preprocessing (categorical encoding, median imputation, standard
    scaling), a train/test split, model fitting, metric computation
    (including 5-fold cross-validation) and feature-importance reporting
    around a scikit-learn estimator chosen by name.
    """

    def __init__(self, task_type: str = "classification", model_name: str = "Random Forest"):
        """Create an unfitted pipeline.

        Args:
            task_type: ``"classification"``; any other value selects regression.
            model_name: Key into the model table of ``_build_model``; unknown
                names fall back to Random Forest.
        """
        self.task_type = task_type
        self.model_name = model_name
        self.model = None
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='median')
        self.label_encoder = LabelEncoder()
        self.is_fitted = False
        self.feature_names: List[str] = []
        self.metrics: Dict[str, Any] = {}
        self.X_test = None
        self.y_test = None
        self.y_pred = None
        self.classes_: Optional[np.ndarray] = None
        # Sorted category values learned per raw categorical column during
        # fitting, so prediction-time data receives the same integer codes.
        self._category_maps: Dict[str, List[str]] = {}

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _build_model(self):
        """Instantiate the estimator selected by ``task_type`` and ``model_name``."""
        name = self.model_name
        if self.task_type == "classification":
            models = {
                "Random Forest": RandomForestClassifier(
                    n_estimators=200, max_depth=None, min_samples_split=2,
                    random_state=42, n_jobs=-1, class_weight='balanced'
                ),
                "Gradient Boosting": GradientBoostingClassifier(
                    n_estimators=150, learning_rate=0.1, max_depth=5,
                    random_state=42
                ),
                "Logistic Regression": LogisticRegression(
                    max_iter=1000, random_state=42, class_weight='balanced'
                ),
                "SVM": SVC(probability=True, kernel='rbf', random_state=42, class_weight='balanced'),
            }
            return models.get(name, models["Random Forest"])
        else:
            models = {
                "Random Forest": RandomForestRegressor(
                    n_estimators=200, max_depth=None, random_state=42, n_jobs=-1
                ),
                "Gradient Boosting": GradientBoostingRegressor(
                    n_estimators=150, learning_rate=0.1, max_depth=5, random_state=42
                ),
                "Ridge Regression": Ridge(alpha=1.0),
                "Lasso Regression": Lasso(alpha=1.0, max_iter=5000),
                "SVM": SVR(kernel='rbf'),
            }
            return models.get(name, models["Random Forest"])

    def _preprocess_X(self, df: pd.DataFrame, fit: bool = True) -> np.ndarray:
        """Turn a feature frame into an imputed, scaled float array.

        Args:
            df: Feature columns only (no target).
            fit: When true, learn the categorical codes, imputer and scaler
                from ``df``; when false, reuse what was learned earlier.

        Returns:
            2-D float array with one row per row of ``df``.
        """
        df = df.copy()

        # Encode categoricals. Codes are the position of each value in the
        # sorted list of values seen while fitting. That list is stored so a
        # later call with fit=False maps the same value to the same code
        # (re-fitting an encoder at prediction time would not).
        category_maps = {} if fit else getattr(self, "_category_maps", {})
        for col in df.select_dtypes(include=['object', 'category']).columns:
            values = df[col].astype(str)
            if fit:
                category_maps[col] = sorted(values.unique())
            known = category_maps.get(col)
            if known is None:
                # Column was not categorical (or absent) during fitting:
                # fall back to an encoding local to this frame.
                known = sorted(values.unique())
            codes = pd.Categorical(values, categories=known).codes.astype(float)
            # Values never seen during fitting get code -1; mark them missing
            # so the imputer fills them instead of inventing a category.
            codes[codes < 0] = np.nan
            df[col] = codes
        if fit:
            self._category_maps = category_maps

        # Boolean -> int
        for col in df.select_dtypes(include=['bool']).columns:
            df[col] = df[col].astype(int)

        arr = df.values.astype(float)

        if fit:
            arr = self.imputer.fit_transform(arr)
            arr = self.scaler.fit_transform(arr)
        else:
            arr = self.imputer.transform(arr)
            arr = self.scaler.transform(arr)

        return arr

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def preprocess(
        self, df: pd.DataFrame, target_col: Optional[str] = None
    ) -> Tuple[np.ndarray, Optional[np.ndarray]]:
        """Split off the target, one-hot encode the features and fit the transforms.

        Args:
            df: Raw data including the target column, if any.
            target_col: Name of the target column; ignored when missing from ``df``.

        Returns:
            ``(X, y)`` where ``y`` is ``None`` when no target column was used.
            For classification ``y`` holds label-encoded class indices.
        """
        df = df.copy()

        if target_col and target_col in df.columns:
            y_raw = df[target_col]
            if self.task_type == "classification":
                self.label_encoder = LabelEncoder()
                y = self.label_encoder.fit_transform(y_raw.astype(str))
                self.classes_ = self.label_encoder.classes_
            else:
                y = y_raw.values.astype(float)
            df = df.drop(columns=[target_col])
        else:
            y = None

        # One-hot for remaining categoricals after splitting target
        df = pd.get_dummies(df, drop_first=True)
        self.feature_names = df.columns.tolist()

        X = self._preprocess_X(df, fit=True)
        return X, y

    def train(
        self,
        X: np.ndarray,
        y: np.ndarray,
        test_size: float = 0.2,
    ) -> Dict[str, Any]:
        """Train the model and return comprehensive metrics.

        Args:
            X: Preprocessed feature array, or a DataFrame that is
                preprocessed here.
            y: Target values aligned with ``X``.
            test_size: Fraction of rows held out for evaluation.

        Returns:
            The metrics dict produced by ``_compute_metrics``.
        """

        if isinstance(X, pd.DataFrame):
            # Remember the column names so feature importances can be labelled.
            self.feature_names = X.columns.tolist()
            X = self._preprocess_X(X, fit=True)

        # Stratified split for classification when possible
        stratify = None
        if self.task_type == "classification":
            unique, counts = np.unique(y, return_counts=True)
            if len(unique) >= 2 and all(c >= 2 for c in counts):
                stratify = y

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=stratify
        )
        self.X_test = X_test
        self.y_test = y_test

        self.model = self._build_model()
        self.model.fit(X_train, y_train)
        self.is_fitted = True

        y_pred = self.model.predict(X_test)
        self.y_pred = y_pred

        self.metrics = self._compute_metrics(y_test, y_pred, X, y)
        return self.metrics

    def _compute_metrics(
        self,
        y_test: np.ndarray,
        y_pred: np.ndarray,
        X_full: np.ndarray,
        y_full: np.ndarray,
    ) -> Dict[str, Any]:
        """Compute hold-out and cross-validated metrics for the fitted model.

        Optional metrics (ROC-AUC, cross-validation, report, confusion
        matrix) are best-effort: if one fails it is simply left out of the
        returned dict.
        """
        metrics: Dict[str, Any] = {}

        if self.task_type == "classification":
            metrics["accuracy"] = round(float(accuracy_score(y_test, y_pred)), 4)
            metrics["f1_score"] = round(float(f1_score(y_test, y_pred, average='weighted')), 4)

            # ROC-AUC (binary only)
            if len(np.unique(y_full)) == 2 and hasattr(self.model, 'predict_proba'):
                try:
                    proba = self.model.predict_proba(self.X_test)[:, 1]
                    metrics["roc_auc"] = round(float(roc_auc_score(y_test, proba)), 4)
                except Exception:
                    pass

            # Cross-validation
            try:
                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
                cv_scores = cross_val_score(self.model, X_full, y_full, cv=cv, scoring='accuracy', n_jobs=-1)
                metrics["cv_mean_accuracy"] = round(float(cv_scores.mean()), 4)
                metrics["cv_std"] = round(float(cv_scores.std()), 4)
            except Exception:
                pass

            # Classification report as string
            try:
                class_names = [str(c) for c in self.classes_] if self.classes_ is not None else None
                metrics["classification_report"] = classification_report(
                    y_test, y_pred, target_names=class_names
                )
            except Exception:
                pass

            # Confusion matrix
            try:
                cm = confusion_matrix(y_test, y_pred)
                metrics["confusion_matrix"] = cm.tolist()
            except Exception:
                pass

        else:  # regression
            metrics["mse"] = round(float(mean_squared_error(y_test, y_pred)), 4)
            metrics["rmse"] = round(float(np.sqrt(mean_squared_error(y_test, y_pred))), 4)
            metrics["mae"] = round(float(mean_absolute_error(y_test, y_pred)), 4)
            metrics["r2_score"] = round(float(r2_score(y_test, y_pred)), 4)

            # Cross-validation
            try:
                cv = KFold(n_splits=5, shuffle=True, random_state=42)
                cv_scores = cross_val_score(self.model, X_full, y_full, cv=cv, scoring='r2', n_jobs=-1)
                metrics["cv_mean_r2"] = round(float(cv_scores.mean()), 4)
                metrics["cv_std"] = round(float(cv_scores.std()), 4)
            except Exception:
                pass

        return metrics

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict with the fitted model.

        DataFrames are run through the already-fitted preprocessing first;
        arrays are passed to the model unchanged.

        Raises:
            ValueError: If the model has not been trained.
        """
        if not self.is_fitted:
            raise ValueError("Model must be trained before prediction")
        if isinstance(X, pd.DataFrame):
            X = self._preprocess_X(X, fit=False)
        return self.model.predict(X)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return class probabilities from the fitted classifier.

        Raises:
            ValueError: If the model is not trained, the task is not
                classification, or the estimator has no ``predict_proba``.
        """
        if not self.is_fitted:
            raise ValueError("Model must be trained before prediction")
        if self.task_type != "classification":
            raise ValueError("predict_proba only available for classification")
        if not hasattr(self.model, 'predict_proba'):
            raise ValueError(f"{self.model_name} does not support probability estimates")
        if isinstance(X, pd.DataFrame):
            X = self._preprocess_X(X, fit=False)
        return self.model.predict_proba(X)

    def get_feature_importance(self) -> pd.DataFrame:
        """Return features ranked by importance (highest first).

        Uses ``feature_importances_`` when the estimator has it, otherwise
        the absolute value of ``coef_`` (averaged over rows when 2-D),
        otherwise zeros. Importances without a known column name are
        labelled ``feature_<index>``.

        Raises:
            ValueError: If the model has not been trained.
        """
        if not self.is_fitted:
            raise ValueError("Model must be trained first")

        if hasattr(self.model, 'feature_importances_'):
            importance = self.model.feature_importances_
        elif hasattr(self.model, 'coef_'):
            coef = self.model.coef_
            importance = np.abs(coef).mean(axis=0) if coef.ndim > 1 else np.abs(coef)
        else:
            # Fallback: the estimator exposes no importances, so report zeros
            importance = np.zeros(len(self.feature_names))

        names = list(self.feature_names[:len(importance)])
        if len(names) < len(importance):
            # Trained from a bare array (no column names): synthesise labels
            # so the two columns have equal length.
            names.extend(f"feature_{i}" for i in range(len(names), len(importance)))

        return pd.DataFrame({
            "feature": names,
            "importance": importance,
        }).sort_values("importance", ascending=False).reset_index(drop=True)

    def get_predictions_df(self, df_original: pd.DataFrame) -> pd.DataFrame:
        """Returns original df with predictions appended.

        The frame is one-hot encoded and aligned to the columns seen during
        training (``feature_names``): dummy columns that are absent are
        filled with 0 and columns unknown to the model, such as the target,
        are dropped.

        Raises:
            ValueError: If the model has not been trained.
        """
        if not self.is_fitted:
            raise ValueError("Model not trained yet")
        result = df_original.copy()
        # Rebuild the training-time feature layout. Selecting feature_names
        # straight from the raw frame would miss every one-hot column.
        encoded = pd.get_dummies(df_original)
        feature_df = encoded.reindex(columns=self.feature_names, fill_value=0)
        preds = self.predict(feature_df)
        result["prediction"] = preds
        return result
288
+
289
+
290
+ # ---------------------------------------------------------------------------
291
+ # XGBoost Pipeline
292
+ # ---------------------------------------------------------------------------
293
+
294
class XGBoostPipeline(MLPipeline):
    """XGBoost-based pipeline reporting the same metrics as ``MLPipeline``.

    The held-out split is passed to XGBoost as an evaluation set; no early
    stopping is configured, so all 200 boosting rounds are trained.
    """

    def __init__(self, task_type: str = "classification"):
        """Create an unfitted XGBoost pipeline for the given task type."""
        super().__init__(task_type=task_type, model_name="XGBoost")

    def _build_xgb_model(self, n_classes: int = 2):
        """Build the XGBoost estimator for the task.

        Args:
            n_classes: Number of target classes; only used for classification
                to pick the objective and evaluation metric.
        """
        if self.task_type == "classification":
            multiclass = n_classes > 2
            objective = "multi:softprob" if multiclass else "binary:logistic"
            return xgb.XGBClassifier(
                n_estimators=200,
                max_depth=6,
                learning_rate=0.05,
                subsample=0.8,
                colsample_bytree=0.8,
                # 'logloss' is the binary metric; a multi-class objective
                # needs 'mlogloss' when an eval_set is evaluated.
                eval_metric='mlogloss' if multiclass else 'logloss',
                random_state=42,
                n_jobs=-1,
                objective=objective,
            )
        else:
            return xgb.XGBRegressor(
                n_estimators=200,
                max_depth=6,
                learning_rate=0.05,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                n_jobs=-1,
            )

    def train(self, X: np.ndarray, y: np.ndarray, test_size: float = 0.2) -> Dict[str, Any]:
        """Train the XGBoost model and return the metrics dict.

        Args:
            X: Preprocessed feature array, or a DataFrame that is
                preprocessed here.
            y: Target values aligned with ``X``.
            test_size: Fraction of rows held out for evaluation.

        Raises:
            ImportError: If the ``xgboost`` package is not installed.
        """
        if not XGB_AVAILABLE:
            raise ImportError("xgboost is not installed. Run: pip install xgboost")

        if isinstance(X, pd.DataFrame):
            # Remember the column names so feature importances can be labelled.
            self.feature_names = X.columns.tolist()
            X = self._preprocess_X(X, fit=True)

        # Stratify only when every class has at least two members.
        stratify = None
        if self.task_type == "classification":
            unique, counts = np.unique(y, return_counts=True)
            if len(unique) >= 2 and all(c >= 2 for c in counts):
                stratify = y

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=stratify
        )
        self.X_test = X_test
        self.y_test = y_test

        n_classes = len(np.unique(y)) if self.task_type == "classification" else 2
        self.model = self._build_xgb_model(n_classes=n_classes)

        self.model.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],
            verbose=False,
        )
        self.is_fitted = True

        y_pred = self.model.predict(X_test)
        self.y_pred = y_pred
        self.metrics = self._compute_metrics(y_test, y_pred, X, y)
        return self.metrics
358
+
359
+
360
+ # ---------------------------------------------------------------------------
361
+ # LightGBM Pipeline
362
+ # ---------------------------------------------------------------------------
363
+
364
class LightGBMPipeline(MLPipeline):
    """Gradient-boosting pipeline backed by LightGBM estimators."""

    def __init__(self, task_type: str = "classification"):
        super().__init__(task_type=task_type, model_name="LightGBM")

    def train(self, X: np.ndarray, y: np.ndarray, test_size: float = 0.2) -> Dict[str, Any]:
        """Hold out a test split, fit a LightGBM model and return its metrics.

        Args:
            X: Feature matrix; a ``pd.DataFrame`` is run through
                ``self._preprocess_X(X, fit=True)`` first.
            y: Target values.
            test_size: Fraction of rows reserved for evaluation.

        Returns:
            Whatever ``self._compute_metrics`` produces for the held-out split.

        Raises:
            ImportError: If lightgbm is not installed.
        """
        if not LGB_AVAILABLE:
            raise ImportError("lightgbm is not installed. Run: pip install lightgbm")

        if isinstance(X, pd.DataFrame):
            X = self._preprocess_X(X, fit=True)

        classifying = self.task_type == "classification"

        # Stratified splitting needs at least two members in every class.
        stratify = None
        if classifying:
            _, class_sizes = np.unique(y, return_counts=True)
            if not any(size < 2 for size in class_sizes):
                stratify = y

        split = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=stratify
        )
        X_train, X_test, y_train, y_test = split
        self.X_test, self.y_test = X_test, y_test

        # Hyper-parameters common to the classifier and the regressor.
        common = {
            "n_estimators": 200,
            "learning_rate": 0.05,
            "num_leaves": 31,
            "random_state": 42,
            "n_jobs": -1,
            "verbose": -1,
        }
        if classifying:
            label_count = len(np.unique(y))
            objective = "binary" if label_count <= 2 else "multiclass"
            self.model = lgb.LGBMClassifier(
                objective=objective,
                class_weight='balanced',
                **common,
            )
        else:
            self.model = lgb.LGBMRegressor(**common)

        self.model.fit(X_train, y_train, eval_set=[(X_test, y_test)])
        self.is_fitted = True

        predictions = self.model.predict(X_test)
        self.y_pred = predictions
        self.metrics = self._compute_metrics(y_test, predictions, X, y)
        return self.metrics
413
+
414
+
415
+ # ---------------------------------------------------------------------------
416
+ # Ensemble / AutoML-style pipeline
417
+ # ---------------------------------------------------------------------------
418
+
419
class EnsemblePipeline(MLPipeline):
    """
    Voting ensemble combining Random Forest and Gradient Boosting, plus an
    XGBoost member when xgboost is available.
    """

    def __init__(self, task_type: str = "classification"):
        super().__init__(task_type=task_type, model_name="Ensemble")

    def train(self, X: np.ndarray, y: np.ndarray, test_size: float = 0.2) -> Dict[str, Any]:
        """Hold out a test split, fit the voting ensemble and return metrics.

        Args:
            X: Feature matrix; a ``pd.DataFrame`` is run through
                ``self._preprocess_X(X, fit=True)`` first.
            y: Target values.
            test_size: Fraction of rows reserved for evaluation.

        Returns:
            Whatever ``self._compute_metrics`` produces for the held-out split.
        """
        if isinstance(X, pd.DataFrame):
            X = self._preprocess_X(X, fit=True)

        classifying = self.task_type == "classification"

        # Stratified splitting needs at least two members in every class.
        stratify = None
        if classifying:
            _, class_sizes = np.unique(y, return_counts=True)
            if not any(size < 2 for size in class_sizes):
                stratify = y

        split = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=stratify
        )
        X_train, X_test, y_train, y_test = split
        self.X_test, self.y_test = X_test, y_test

        if classifying:
            members = [
                ("rf", RandomForestClassifier(
                    n_estimators=150, random_state=42, n_jobs=-1, class_weight='balanced',
                )),
                ("gb", GradientBoostingClassifier(n_estimators=100, random_state=42)),
            ]
            if XGB_AVAILABLE:
                xgb_member = xgb.XGBClassifier(
                    n_estimators=100,
                    eval_metric='logloss', random_state=42, n_jobs=-1,
                )
                members.append(("xgb", xgb_member))
            self.model = VotingClassifier(estimators=members, voting='soft', n_jobs=-1)
        else:
            members = [
                ("rf", RandomForestRegressor(n_estimators=150, random_state=42, n_jobs=-1)),
                ("gb", GradientBoostingRegressor(n_estimators=100, random_state=42)),
            ]
            if XGB_AVAILABLE:
                xgb_member = xgb.XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1)
                members.append(("xgb", xgb_member))
            self.model = VotingRegressor(estimators=members, n_jobs=-1)

        self.model.fit(X_train, y_train)
        self.is_fitted = True

        predictions = self.model.predict(X_test)
        self.y_pred = predictions
        self.metrics = self._compute_metrics(y_test, predictions, X, y)
        return self.metrics

    def get_feature_importance(self) -> pd.DataFrame:
        """Return the mean ``feature_importances_`` of the fitted members.

        Members without a ``feature_importances_`` attribute are ignored.
        When no member exposes one, every feature gets importance 0.0.
        Otherwise the result is sorted by importance, highest first.
        """
        per_member = [
            member.feature_importances_
            for member in self.model.estimators_
            if hasattr(member, 'feature_importances_')
        ]

        if len(per_member) == 0:
            return pd.DataFrame({"feature": self.feature_names, "importance": 0.0})

        mean_importance = np.mean(per_member, axis=0)
        table = pd.DataFrame({
            "feature": self.feature_names[:len(mean_importance)],
            "importance": mean_importance,
        })
        table = table.sort_values("importance", ascending=False)
        return table.reset_index(drop=True)
models/nlp_module.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ nlp_module.py — NLP Module (v2.1 Clean)
3
+ Models:
4
+ - DistilBERT SST-2 → sentiment analysis (~250 MB, downloads on first use)
5
+ - spaCy en_core_web_sm → named entity recognition (~15 MB, auto-downloads)
6
+ - TF-IDF → zero-shot classification (no download)
7
+ - Extractive → summarization (no download)
8
+ - Smart AI (built-in) → chatbot, zero downloads
9
+ """
10
+ import warnings
11
+ warnings.filterwarnings("ignore")
12
+
13
+ import streamlit as st
14
+
15
+
16
+ # ══════════════════════════════════════════════════════════════════════════════
17
+ # Cached pipeline loaders
18
+ # ══════════════════════════════════════════════════════════════════════════════
19
+
20
@st.cache_resource(show_spinner=False)
def load_sentiment_pipeline():
    """Build the DistilBERT SST-2 sentiment pipeline (cached by Streamlit)."""
    from transformers import pipeline as build_pipeline  # type: ignore[import-untyped]

    checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
    sentiment = build_pipeline(  # type: ignore[call-overload]
        "sentiment-analysis",
        model=checkpoint,
    )
    return sentiment
28
+
29
+
30
@st.cache_resource(show_spinner=False)
def load_ner_pipeline():
    """
    Load the NER backend: spaCy en_core_web_sm when usable, regex otherwise.

    Returns a ``(backend, model)`` tuple: ``("spacy", nlp)`` when the model
    loads (after one download attempt if it is missing), or
    ``("regex", None)`` when spaCy is not installed or the model can be
    neither loaded nor downloaded.
    Install: pip install spacy && python -m spacy download en_core_web_sm
    """
    model_name = "en_core_web_sm"

    try:
        import spacy
    except ImportError:
        return ("regex", None)

    try:
        return ("spacy", spacy.load(model_name))
    except ImportError:
        return ("regex", None)
    except OSError:
        pass  # model package not installed; try a one-off download below

    try:
        from spacy.cli.download import download as spacy_download  # type: ignore[import]
        spacy_download(model_name)
        return ("spacy", spacy.load(model_name))
    except (Exception, SystemExit):
        # The download can fail offline or without write access, and spaCy's
        # CLI helpers may end with SystemExit rather than a normal exception.
        # Degrade to the regex backend instead of taking the app down.
        return ("regex", None)
47
+
48
+
49
@st.cache_resource(show_spinner=False)
def load_zero_shot_pipeline():
    """
    Return the marker for the lightweight zero-shot classifier.

    Nothing is loaded or downloaded here; the string only names the
    TF-IDF cosine-similarity approach implemented in run_text_classification.
    """
    backend_marker = "tfidf"  # sentinel value, not a model object
    return backend_marker
57
+
58
+
59
@st.cache_resource(show_spinner=False)
def load_summarization_pipeline():
    """
    Return the marker for the extractive summarizer.

    Nothing is loaded or downloaded here; the string only names the
    word-frequency approach implemented in run_summarization.
    """
    backend_marker = "extractive"  # sentinel value, not a model object
    return backend_marker
66
+
67
+
68
+ # ══════════════════════════════════════════════════════════════════════════════
69
+ # Business logic
70
+ # ══════════════════════════════════════════════════════════════════════════════
71
+
72
def run_sentiment(texts: list) -> list:
    """
    Classify the sentiment of each non-blank string in ``texts``.

    Blank or whitespace-only entries are skipped. Each result is a dict with
    Text (first 80 characters), Sentiment, Confidence and Score.
    """
    classifier = load_sentiment_pipeline()
    rows = []
    for entry in texts:
        if not entry.strip():
            continue
        # Only the first 512 characters are sent to the model.
        prediction = classifier(entry[:512], truncation=True, max_length=512)[0]
        probability = prediction["score"]
        rows.append({
            "Text": entry[:80],
            "Sentiment": prediction["label"],
            "Confidence": f"{probability * 100:.1f}%",
            "Score": round(probability, 4),
        })
    return rows
89
+
90
+
91
def run_ner(text: str) -> list:
    """
    Extract named entities with the spaCy backend, or with regex rules when
    spaCy is unavailable.

    Returns a list of dicts with Entity, Type, Score, Start and End. The
    spaCy path analyses only the first 1000 characters and reports a fixed
    "100.0%" score; the regex path reports "~" and sorts results by Start.
    """
    backend, model = load_ner_pipeline()
    spacy_ready = backend == "spacy" and model is not None

    if spacy_ready:
        entities = []
        for span in model(text[:1000]).ents:
            entities.append({
                "Entity": span.text,
                "Type": span.label_,
                "Score": "100.0%",
                "Start": span.start_char,
                "End": span.end_char,
            })
        return entities

    # Regex fallback: heuristic rules that need no extra installs.
    import re

    org_suffix_rule = (
        r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+'
        r'(?:Inc|Corp|Ltd|LLC|Co|Group|Foundation|Institute|University|'
        r'College|School|Hospital|Bank|Technologies|Solutions|Systems|Services)\.?)\b'
    )
    place_rule = (
        r'\b([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*)\b'
        r'(?=\s+(?:City|State|Country|Street|Avenue|Road|Park|Lake|River|'
        r'Mountain|Valley|Island|Bay|County|District|Province|Region))'
    )
    person_rule = r'\b([A-Z][a-z]{2,}\s+[A-Z][a-z]{2,}(?:\s+[A-Z][a-z]{2,})?)\b'
    acronym_rule = r'\b([A-Z]{2,6})\b'

    rules = (
        (org_suffix_rule, "ORG"),
        (place_rule, "LOC"),
        (person_rule, "PER"),
        (acronym_rule, "ORG"),
    )

    already_emitted = set()
    found = []
    for regex, tag in rules:
        for match in re.finditer(regex, text):
            name = match.group(1).strip()
            identity = (name, tag)
            # Skip one-character hits and repeats of the same (entity, type).
            if len(name) <= 1 or identity in already_emitted:
                continue
            already_emitted.add(identity)
            found.append({
                "Entity": name,
                "Type": tag,
                "Score": "~",
                "Start": match.start(),
                "End": match.end(),
            })

    found.sort(key=lambda item: item["Start"])
    return found
149
+
150
+
151
+ def _tfidf_cosine(text: str, label: str) -> float:
152
+ """Compute TF-IDF cosine similarity between text and a label string."""
153
+ import re
154
+ from collections import Counter
155
+ import math
156
+
157
+ _stop = {"the","a","an","is","are","was","were","be","been","being","have",
158
+ "has","had","do","does","did","will","would","could","should","may",
159
+ "might","can","to","of","in","for","on","with","at","by","from","as",
160
+ "and","but","or","not","it","its","this","that","i","we","you","he",
161
+ "she","they","all","any","more","so","very","also","just","about"}
162
+
163
+ def _tokens(s: str) -> list:
164
+ return [w for w in re.findall(r"[a-z]+", s.lower()) if w not in _stop and len(w) > 1]
165
+
166
+ t_tokens = _tokens(text)
167
+ l_tokens = _tokens(label)
168
+ if not t_tokens or not l_tokens:
169
+ return 0.0
170
+
171
+ # TF of text
172
+ tf_t = Counter(t_tokens)
173
+ tf_l = Counter(l_tokens)
174
+
175
+ # Vocabulary union
176
+ vocab = set(tf_t) | set(tf_l)
177
+
178
+ # Simple IDF weight: log(1 + 1/freq_ratio) β€” single-doc approximation
179
+ def vec(tf: Counter) -> dict:
180
+ total = sum(tf.values()) or 1
181
+ return {w: tf[w] / total for w in vocab}
182
+
183
+ vt = vec(tf_t)
184
+ vl = vec(tf_l)
185
+
186
+ dot = sum(vt[w] * vl[w] for w in vocab)
187
+ norm_t = math.sqrt(sum(v * v for v in vt.values())) or 1e-9
188
+ norm_l = math.sqrt(sum(v * v for v in vl.values())) or 1e-9
189
+ return dot / (norm_t * norm_l)
190
+
191
+
192
def run_text_classification(text: str, labels: list) -> list:
    """
    Rank candidate labels for ``text`` by cosine similarity of word counts.

    Each label's similarity is exponentiated (multiplied by 8 first) and
    divided by the total so the scores sum to 1. Returns a list of dicts
    with Label, Score and Confidence, highest score first; an empty
    ``labels`` yields an empty list.
    """
    if not labels:
        return []

    import math

    # exp(similarity * 8): the factor 8 sharpens the gap between labels.
    weighted = [(label, math.exp(_tfidf_cosine(text, label) * 8)) for label in labels]
    denominator = sum(weight for _, weight in weighted) or 1.0

    ranked = []
    for label, weight in weighted:
        ranked.append({
            "Label": label,
            "Score": round(weight / denominator, 4),
            "Confidence": f"{weight / denominator * 100:.1f}%",
        })
    ranked.sort(key=lambda row: row["Score"], reverse=True)
    return ranked
217
+
218
+
219
def run_summarization(text: str, max_sentences: int = 4) -> str:
    """
    Extractive summarization using word-frequency scoring.

    Args:
        text: The text to summarize.
        max_sentences: Upper bound on the number of sentences kept.

    Returns:
        The selected sentences joined by spaces, in their original order.
        About one third of the candidate sentences are kept, with a minimum
        of one and a maximum of ``max_sentences``. Sentences of four words
        or fewer are never candidates. When two or fewer candidates remain,
        the first 400 characters of the stripped text are returned, followed
        by an ellipsis if the text was cut.
    """
    import re
    from collections import Counter

    text = text.strip()
    sentences = [
        part.strip()
        for part in re.split(r"(?<=[.!?])\s+", text)
        if len(part.split()) > 4
    ]

    if len(sentences) <= 2:
        # Escape sequence keeps the ellipsis safe from encoding mix-ups.
        return text[:400] + ("\u2026" if len(text) > 400 else "")

    # Stop words to ignore when computing importance
    stop = {"the","a","an","is","are","was","were","be","been","being","have",
            "has","had","do","does","did","will","would","could","should","may",
            "might","can","to","of","in","for","on","with","at","by","from",
            "as","into","and","but","or","not","it","its","this","that","i",
            "we","you","he","she","they","all","any","each","more","most","so",
            "very","also","just","about","than","other","such","when","which"}

    words = re.findall(r"[a-z]+", text.lower())
    counts = Counter(w for w in words if w not in stop and len(w) > 2)
    max_f = max(counts.values(), default=1)
    freq = {w: v / max_f for w, v in counts.items()}

    # Score = mean normalised word frequency per whitespace-separated word.
    scores: dict = {}
    for i, sent in enumerate(sentences):
        score = sum(freq.get(w, 0) for w in re.findall(r"[a-z]+", sent.lower()))
        score = score / max(len(sent.split()), 1)
        if i == 0:
            score *= 1.3  # slight boost for the first candidate sentence
        scores[i] = score

    # Pick the top N sentences, then restore original order.
    n = max(1, min(max_sentences, len(sentences) // 3))
    top = sorted(sorted(scores, key=lambda k: scores[k], reverse=True)[:n])
    return " ".join(sentences[i] for i in top)
262
+
263
+
264
def chat_with_model(prompt: str, history: list) -> str:
    """
    Answer a chat prompt with the built-in Smart AI responder.

    Args:
        prompt: The user's message.
        history: Earlier turns as ``(user, bot)`` tuples; only the last four
            are forwarded to the responder.

    Returns:
        The reply from ``generative_ai._smart_respond``. If that import or
        call fails for any reason, a canned keyword-based answer is returned
        instead.
    """
    try:
        import sys
        from pathlib import Path

        # Support both flat and models/ directory layouts. Add each
        # directory only once so sys.path does not grow with every call.
        here = Path(__file__).parent
        for directory in (str(here), str(here.parent)):
            if directory not in sys.path:
                sys.path.insert(0, directory)
        from generative_ai import _smart_respond

        # Convert (user, bot) tuple history to dict format
        hist_dicts = []
        for u, b in history[-4:]:
            hist_dicts.append({"role": "user", "content": u})
            hist_dicts.append({"role": "assistant", "content": b})

        return _smart_respond(prompt, hist_dicts)

    except Exception:
        # Deliberate best-effort: any failure above falls back to the
        # keyword responder so the chat never crashes.
        import re

        p = prompt.lower()
        # Short keywords are matched as whole words: as substrings, "hi"
        # also hits "machine", "this", "which", and "hey" hits "they".
        words = set(re.findall(r"[a-z]+", p))

        if words & {"hello", "hi", "hey"}:
            return "Hello! Ask me anything about ML, data science, or AI. \U0001F60A"
        if "machine learning" in p or "ml" in words:
            return (
                "**Machine Learning** enables systems to learn patterns from data without "
                "explicit programming. Types: Supervised, Unsupervised, Reinforcement. "
                "Libraries: scikit-learn, XGBoost, LightGBM."
            )
        if "deep learning" in p or "neural" in p:
            return (
                "**Deep Learning** uses multi-layer neural networks to learn complex features. "
                "Best for images (CNNs), sequences (Transformers), and unstructured data. "
                "Frameworks: PyTorch, TensorFlow."
            )
        if "xgboost" in p or "gradient boosting" in p:
            return (
                "**XGBoost** builds trees sequentially, each correcting errors of the prior. "
                "Key params: n_estimators, max_depth, learning_rate. Extremely fast and accurate."
            )
        if "overfitting" in p:
            return (
                "**Overfitting** = model memorises training noise, fails on new data. "
                "Fixes: cross-validation, regularisation (L1/L2), dropout, more data, simpler model."
            )
        if "python" in p:
            return (
                "**Python** dominates AI/ML thanks to: NumPy, Pandas, scikit-learn, "
                "PyTorch, TensorFlow, HuggingFace Transformers. "
                "Use virtual environments to manage dependencies."
            )
        if "nlp" in p or "natural language" in p:
            return (
                "**NLP** (Natural Language Processing) enables machines to understand text. "
                "Key tasks: sentiment, NER, classification, summarisation, translation. "
                "Modern approach: HuggingFace Transformers (BERT, GPT, T5)."
            )
        return (
            "I'm your AI assistant. Try asking about: machine learning, neural networks, "
            "XGBoost, overfitting, Python, NLP, or data science topics!"
        )