| """ |
| nlp_module.py — NLP Module (v2.1 Clean) |
| Models: |
| - DistilBERT SST-2 → sentiment analysis (~250 MB, downloads on first use) |
| - spaCy en_core_web_sm → named entity recognition (~15 MB, auto-downloads) |
| - TF-IDF → zero-shot classification (no download) |
| - Extractive → summarization (no download) |
| - Smart AI (built-in) → chatbot, zero downloads |
| """ |
| import warnings |
| warnings.filterwarnings("ignore") |
|
|
| import streamlit as st |
|
|
|
|
| |
| |
| |
|
|
@st.cache_resource(show_spinner=False)
def load_sentiment_pipeline():
    """DistilBERT SST-2 — ~250 MB, fast and accurate."""
    from transformers import pipeline
    return pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english",
    )

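
# Usage sketch (illustrative values, not part of the app): constructing the
# pipeline downloads the weights into the local Hugging Face cache on first use,
# and the cached pipeline object can then be called directly.
#     pipe = load_sentiment_pipeline()
#     pipe("Great movie!")   # → [{"label": "POSITIVE", "score": 0.99}]  (example output)
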
@st.cache_resource(show_spinner=False)
def load_ner_pipeline():
    """
    spaCy en_core_web_sm (~15 MB) for NER.
    Falls back to regex-based NER if spaCy is not installed.
    Install: pip install spacy && python -m spacy download en_core_web_sm
    Returns a (backend, model) tuple: ("spacy", nlp) or ("regex", None).
    """
    try:
        import spacy
        try:
            return ("spacy", spacy.load("en_core_web_sm"))
        except OSError:
            # Model package not installed yet; download it, then retry.
            from spacy.cli.download import download as spacy_download
            spacy_download("en_core_web_sm")
            return ("spacy", spacy.load("en_core_web_sm"))
    except ImportError:
        return ("regex", None)

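
# Usage sketch (illustrative): callers unpack the (backend, model) pair and
# branch on the backend string.
#     backend, nlp = load_ner_pipeline()
#     if backend == "spacy":
#         doc = nlp("Alice works at Acme Corp.")
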
@st.cache_resource(show_spinner=False)
def load_zero_shot_pipeline():
    """
    Lightweight zero-shot classification using TF-IDF cosine similarity.
    No model download and negligible RAM overhead; requires no internet
    access and no large model, so it works on any machine.
    Returns the sentinel string "tfidf"; the actual scoring happens in
    run_text_classification via _tfidf_cosine.
    """
    return "tfidf"

@st.cache_resource(show_spinner=False)
def load_summarization_pipeline():
    """
    Extractive summarizer — word-frequency scoring, zero model download.
    Picks the most informative sentences from the input text.
    Returns the sentinel string "extractive"; the scoring happens in run_summarization.
    """
    return "extractive"


def run_sentiment(texts: list) -> list:
    """
    Sentiment analysis on a list of strings.
    Returns a list of dicts: Text, Sentiment, Confidence, Score.
    """
    pipe = load_sentiment_pipeline()
    results = []
    for text in texts:
        if text.strip():
            # Cap input length: slice to 512 characters and let the tokenizer
            # truncate to the model's 512-token limit.
            r = pipe(text[:512], truncation=True, max_length=512)[0]
            results.append({
                "Text": text[:80],
                "Sentiment": r["label"],
                "Confidence": f"{r['score'] * 100:.1f}%",
                "Score": round(r["score"], 4),
            })
    return results

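
# Usage sketch (labels and scores below are hypothetical examples):
#     run_sentiment(["I love this product!", "Terrible support experience."])
#     # → [{"Text": "I love this product!", "Sentiment": "POSITIVE",
#     #     "Confidence": "99.9%", "Score": 0.9991}, ...]
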
def run_ner(text: str) -> list:
    """
    Named Entity Recognition using spaCy (15 MB) or a regex fallback.
    Returns a list of dicts: Entity, Type, Score, Start, End.
    """
    backend, model = load_ner_pipeline()

    if backend == "spacy" and model is not None:
        # spaCy does not expose per-entity confidence, so Score is a placeholder.
        doc = model(text[:1000])
        return [
            {
                "Entity": ent.text,
                "Type": ent.label_,
                "Score": "100.0%",
                "Start": ent.start_char,
                "End": ent.end_char,
            }
            for ent in doc.ents
        ]

    # Regex fallback: heuristic patterns for organisations, locations,
    # person names, and short all-caps acronyms.
    import re

    patterns = [
        (
            r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+'
            r'(?:Inc|Corp|Ltd|LLC|Co|Group|Foundation|Institute|University|'
            r'College|School|Hospital|Bank|Technologies|Solutions|Systems|Services)\.?)\b',
            "ORG",
        ),
        (
            r'\b([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*)\b'
            r'(?=\s+(?:City|State|Country|Street|Avenue|Road|Park|Lake|River|'
            r'Mountain|Valley|Island|Bay|County|District|Province|Region))',
            "LOC",
        ),
        (
            r'\b([A-Z][a-z]{2,}\s+[A-Z][a-z]{2,}(?:\s+[A-Z][a-z]{2,})?)\b',
            "PER",
        ),
        (r'\b([A-Z]{2,6})\b', "ORG"),
    ]

    seen, results = set(), []
    for pattern, label in patterns:
        for m in re.finditer(pattern, text):
            entity = m.group(1).strip()
            key = (entity, label)
            if key not in seen and len(entity) > 1:
                seen.add(key)
                results.append({
                    "Entity": entity,
                    "Type": label,
                    "Score": "~",   # regex matches carry no confidence estimate
                    "Start": m.start(),
                    "End": m.end(),
                })

    return sorted(results, key=lambda x: x["Start"])

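
# Usage sketch (illustrative; entity types depend on the active backend):
#     run_ner("Sundar Pichai leads Google in Mountain View.")
#     # spaCy backend → [{"Entity": "Sundar Pichai", "Type": "PERSON", ...},
#     #                  {"Entity": "Google", "Type": "ORG", ...}, ...]
#     # regex backend uses PER/ORG/LOC labels and "~" in place of a confidence.
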
def _tfidf_cosine(text: str, label: str) -> float:
    """
    Cosine similarity between the term-frequency vectors of text and label.
    (Despite the helper's name, no IDF weighting is applied; only
    length-normalised term frequencies are compared.)
    """
    import math
    import re
    from collections import Counter

    _stop = {"the", "a", "an", "is", "are", "was", "were", "be", "been", "being", "have",
             "has", "had", "do", "does", "did", "will", "would", "could", "should", "may",
             "might", "can", "to", "of", "in", "for", "on", "with", "at", "by", "from", "as",
             "and", "but", "or", "not", "it", "its", "this", "that", "i", "we", "you", "he",
             "she", "they", "all", "any", "more", "so", "very", "also", "just", "about"}

    def _tokens(s: str) -> list:
        return [w for w in re.findall(r"[a-z]+", s.lower()) if w not in _stop and len(w) > 1]

    t_tokens = _tokens(text)
    l_tokens = _tokens(label)
    if not t_tokens or not l_tokens:
        return 0.0

    # Raw term counts for each side.
    tf_t = Counter(t_tokens)
    tf_l = Counter(l_tokens)

    # Shared vocabulary over both token sets.
    vocab = set(tf_t) | set(tf_l)

    # Length-normalised term-frequency vector over the shared vocabulary.
    def vec(tf: Counter) -> dict:
        total = sum(tf.values()) or 1
        return {w: tf[w] / total for w in vocab}

    vt = vec(tf_t)
    vl = vec(tf_l)

    # Cosine similarity: dot product divided by the product of vector norms.
    dot = sum(vt[w] * vl[w] for w in vocab)
    norm_t = math.sqrt(sum(v * v for v in vt.values())) or 1e-9
    norm_l = math.sqrt(sum(v * v for v in vl.values())) or 1e-9
    return dot / (norm_t * norm_l)

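
# Worked example (illustrative): for text "the team won the football match"
# and label "sports and football", the surviving tokens are
# {team, won, football, match} and {sports, football}; the only overlapping
# term is "football", giving a cosine similarity of about 0.35.
#     _tfidf_cosine("the team won the football match", "sports and football")  # ≈ 0.35
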
def run_text_classification(text: str, labels: list) -> list:
    """
    Zero-shot text classification using TF-IDF cosine similarity.
    No model download required — works instantly on any machine.
    Returns a list of dicts: Label, Score, Confidence — sorted by score descending.
    """
    if not labels:
        return []

    # Raw cosine similarity between the text and each candidate label.
    scores = []
    for label in labels:
        sim = _tfidf_cosine(text, label)
        scores.append((label, sim))

    # Softmax over the similarities (scaled by 8 to sharpen the distribution)
    # so the reported confidences sum to 1.
    import math

    exp_scores = [(lbl, math.exp(s * 8)) for lbl, s in scores]
    total = sum(s for _, s in exp_scores) or 1.0
    normalised = sorted(
        [{"Label": lbl, "Score": round(s / total, 4), "Confidence": f"{s / total * 100:.1f}%"}
         for lbl, s in exp_scores],
        key=lambda x: x["Score"], reverse=True,
    )
    return normalised

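
# Usage sketch (label set and scores are hypothetical):
#     run_text_classification(
#         "The team won the football match in the final minute.",
#         ["sports", "politics", "technology"],
#     )
#     # → [{"Label": "sports", "Score": 0.62, "Confidence": "62.0%"},
#     #    {"Label": "politics", ...}, {"Label": "technology", ...}]
# Labels phrased as short descriptive phrases (e.g. "sports news") tend to
# share more vocabulary with the text and therefore score higher.
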
def run_summarization(text: str) -> str:
    """
    Extractive summarization using word-frequency scoring.
    Zero model download — works on any machine, any RAM size.
    Keeps the highest-scoring sentences (roughly one in three, capped at four),
    returned in their original order.
    """
    import re
    from collections import Counter

    text = text.strip()

    # Split into sentences and drop fragments of four words or fewer.
    sentences = re.split(r"(?<=[.!?])\s+", text)
    sentences = [s.strip() for s in sentences if len(s.split()) > 4]

    if len(sentences) <= 2:
        # Too short to summarise; return the (truncated) text itself.
        return text[:400] + ("…" if len(text) > 400 else "")

    # Normalised word frequencies over the whole text, ignoring stopwords.
    stop = {"the", "a", "an", "is", "are", "was", "were", "be", "been", "being", "have",
            "has", "had", "do", "does", "did", "will", "would", "could", "should", "may",
            "might", "can", "to", "of", "in", "for", "on", "with", "at", "by", "from",
            "as", "into", "and", "but", "or", "not", "it", "its", "this", "that", "i",
            "we", "you", "he", "she", "they", "all", "any", "each", "more", "most", "so",
            "very", "also", "just", "about", "than", "other", "such", "when", "which"}

    words = re.findall(r"[a-z]+", text.lower())
    freq = Counter(w for w in words if w not in stop and len(w) > 2)
    max_f = max(freq.values(), default=1)
    freq = {w: v / max_f for w, v in freq.items()}

    # Score each sentence by its average word importance; boost the lead sentence.
    scores: dict = {}
    for i, sent in enumerate(sentences):
        score = sum(freq.get(w, 0) for w in re.findall(r"[a-z]+", sent.lower()))
        score = score / max(len(sent.split()), 1)
        if i == 0:
            score *= 1.3
        scores[i] = score

    # Take the top-n sentences, then restore document order.
    n = max(1, min(4, len(sentences) // 3))
    top = sorted(sorted(scores, key=lambda k: scores[k], reverse=True)[:n])
    return " ".join(sentences[i] for i in top)

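
# Usage sketch (illustrative): for a multi-paragraph article the function
# returns the highest-scoring sentences joined into a single string.
#     summary = run_summarization(article_text)   # article_text: any long string
# Because the selected indices are re-sorted, the summary preserves the
# original sentence order rather than the ranking order.
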
def chat_with_model(prompt: str, history: list) -> str:
    """
    Instant chatbot using the built-in Smart AI; no model download, negligible memory.
    Falls back to simple keyword responses if the import fails.
    """
    try:
        import sys
        from pathlib import Path

        # Make generative_ai importable whether it lives next to this module
        # or one directory up.
        sys.path.insert(0, str(Path(__file__).parent))
        sys.path.insert(0, str(Path(__file__).parent.parent))
        from generative_ai import _smart_respond

        # Convert the last four (user, bot) pairs into role/content dicts.
        hist_dicts = []
        for u, b in history[-4:]:
            hist_dicts.append({"role": "user", "content": u})
            hist_dicts.append({"role": "assistant", "content": b})

        return _smart_respond(prompt, hist_dicts)

    except Exception:
        # Keyword-based fallback responses. Greetings are matched on word
        # boundaries so that e.g. "machine" does not trigger the "hi" branch.
        import re

        p = prompt.lower()
        if re.search(r"\b(hello|hi|hey)\b", p):
            return "Hello! Ask me anything about ML, data science, or AI. 😊"
        if "machine learning" in p or " ml " in p:
            return (
                "**Machine Learning** enables systems to learn patterns from data without "
                "explicit programming. Types: Supervised, Unsupervised, Reinforcement. "
                "Libraries: scikit-learn, XGBoost, LightGBM."
            )
        if "deep learning" in p or "neural" in p:
            return (
                "**Deep Learning** uses multi-layer neural networks to learn complex features. "
                "Best for images (CNNs), sequences (Transformers), and unstructured data. "
                "Frameworks: PyTorch, TensorFlow."
            )
        if "xgboost" in p or "gradient boosting" in p:
            return (
                "**XGBoost** builds trees sequentially, each correcting errors of the prior. "
                "Key params: n_estimators, max_depth, learning_rate. Extremely fast and accurate."
            )
        if "overfitting" in p:
            return (
                "**Overfitting** = model memorises training noise, fails on new data. "
                "Fixes: cross-validation, regularisation (L1/L2), dropout, more data, simpler model."
            )
        if "python" in p:
            return (
                "**Python** dominates AI/ML thanks to: NumPy, Pandas, scikit-learn, "
                "PyTorch, TensorFlow, HuggingFace Transformers. "
                "Use virtual environments to manage dependencies."
            )
        if "nlp" in p or "natural language" in p:
            return (
                "**NLP** (Natural Language Processing) enables machines to understand text. "
                "Key tasks: sentiment, NER, classification, summarisation, translation. "
                "Modern approach: HuggingFace Transformers (BERT, GPT, T5)."
            )
        return (
            "I'm your AI assistant. Try asking about: machine learning, neural networks, "
            "XGBoost, overfitting, Python, NLP, or data science topics!"
        )
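

# Minimal manual smoke test for the no-download code paths (a sketch; the
# sample strings below are illustrative and not part of the app's test suite).
if __name__ == "__main__":
    demo_text = (
        "The city council approved the new transit plan on Monday. "
        "Officials said the plan will add three bus routes and extend light rail service. "
        "Critics argued that the funding model relies too heavily on fare increases. "
        "Construction on the first route is expected to begin next spring."
    )
    print(run_text_classification(demo_text, ["transportation", "sports", "finance"]))
    print(run_summarization(demo_text))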