isgr9801 commited on
Commit
dcbb6a2
·
verified ·
1 Parent(s): cc2dffd
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Slim Python base image keeps the final image small.
FROM python:3.11-slim

# No .pyc files, unbuffered logs; PORT=7860 — presumably the Hugging Face
# Spaces convention, confirm against the deployment target.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PORT=7860

WORKDIR /app

# Install dependencies before copying sources so the pip layer is cached
# across code-only changes.
COPY backend/requirements.txt /app/backend/requirements.txt
RUN pip install --no-cache-dir -r /app/backend/requirements.txt

COPY . /app

EXPOSE 7860

# Run through `sh -c` so ${PORT} expands at container start (the platform
# may override the build-time default).
CMD ["sh", "-c", "uvicorn backend.main:app --host 0.0.0.0 --port ${PORT:-7860}"]
backend/Procfile ADDED
@@ -0,0 +1 @@
 
 
1
+ web: uvicorn backend.main:app --host 0.0.0.0 --port $PORT
backend/README.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DMJ backend (FastAPI)
2
+ ======================
3
+ ```
4
+ cd backend
5
+ python -m venv .venv
6
+ .venv\Scripts\Activate.ps1
7
+ pip install -r requirements.txt
8
+ setx MONGO_URI "mongouri"
9
+ cd .. && uvicorn backend.main:app --reload --port 8000
10
+ ```
11
+ - Replace the `auth` router with proper Firebase/JWT verification
12
+
13
+ **NLP Processing**
14
+ emotion scoring
15
+ ----------------------------------
16
+ uses Hugging Face Inference API for emotion scoring.
17
+ Set environment vars:
18
+ - `HF_API_TOKEN` = Hugging Face token
19
+
20
+ keyword extraction
21
+ --------------------------------
22
+ uses KeyBERT with a multilingual sentence-transformer.
23
+ - `KEYBERT_MODEL` = sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
24
+ supports mar/eng/hindi
25
+
26
+
27
+ topic categorization
28
+ ------------------------------------
29
+ - `TOPIC_LABELS` (comma-separated candidate labels required)
30
+ supports mar/eng/hindi
31
+
32
+
33
+ entity recognition (NER)
34
+ ----------------------------
35
+ multilingual Hugging Face NER with spaCy fallback.
36
+ - `NER_MODEL` = xx_ent_wiki_sm
37
+ supports mar/eng/hindi
38
+
39
+
40
+ embedding generation
41
+ ----------------------------
42
+ - `EMBEDDING_MODEL` = sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
43
+ supports mar/eng/hindi
backend/auth_deps.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Firebase auth dependency for protecting routes."""
2
+ import os
3
+ from pathlib import Path
4
+ import firebase_admin
5
+ from firebase_admin import credentials, auth
6
+ from dotenv import load_dotenv
7
+ from fastapi import Depends, HTTPException, status
8
+ from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
9
+ import logging
10
+ from typing import Optional
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ # Ensure FIREBASE_* env vars are available regardless of working directory
15
+ BACKEND_ENV_PATH = Path(__file__).resolve().parent / ".env"
16
+ load_dotenv(dotenv_path=BACKEND_ENV_PATH)
17
+ load_dotenv()
18
+
19
+
20
+ def _build_firebase_cert_from_env() -> dict | None:
21
+ project_id = os.getenv("FIREBASE_PROJECT_ID")
22
+ private_key = os.getenv("FIREBASE_PRIVATE_KEY")
23
+ client_email = os.getenv("FIREBASE_CLIENT_EMAIL")
24
+
25
+ if not project_id or not private_key or not client_email:
26
+ return None
27
+
28
+ return {
29
+ "type": "service_account",
30
+ "project_id": project_id,
31
+ "private_key": private_key.replace("\\n", "\n"),
32
+ "client_email": client_email,
33
+ "token_uri": "https://oauth2.googleapis.com/token",
34
+ }
35
+
36
+
37
def _initialize_firebase_admin() -> None:
    """Initialize the Firebase Admin SDK exactly once.

    Preference order: a service-account cert built from FIREBASE_* env vars,
    then Google default application credentials as a fallback.
    """
    # get_app() raises ValueError only when no default app exists yet;
    # if it succeeds, initialization already happened — nothing to do.
    try:
        firebase_admin.get_app()
        return
    except ValueError:
        pass

    cert_data = _build_firebase_cert_from_env()
    if cert_data:
        try:
            cred = credentials.Certificate(cert_data)
            firebase_admin.initialize_app(
                cred,
                {
                    "projectId": cert_data["project_id"],
                },
            )
            # NOTE(review): presumably some Google client libraries read the
            # project id from this env var — confirm which consumer needs it.
            os.environ.setdefault("GOOGLE_CLOUD_PROJECT", cert_data["project_id"])
            logger.info("Firebase Admin initialized from FIREBASE_* env vars (projectId=%s)", cert_data["project_id"])
            return
        except Exception as exc:
            # Bad/partial cert data: log with traceback, then fall through to
            # the default-credentials path instead of failing startup here.
            logger.exception("Failed to initialize Firebase from FIREBASE_* env vars: %s", exc)
            logger.warning("Falling back to default application credentials")

    firebase_admin.initialize_app()
    logger.info("Firebase Admin initialized using default application credentials")
63
+
64
+
65
+ _initialize_firebase_admin()
66
+
67
+
68
+ security = HTTPBearer()
69
+ optional_security = HTTPBearer(auto_error=False)
70
+
71
+
72
async def verify_firebase_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> dict:
    """
    Dependency to verify Firebase ID token from Authorization header.
    Returns the decoded token (user info).

    Raises HTTP 401 when the token is invalid or expired. The response now
    carries a ``WWW-Authenticate: Bearer`` challenge header, which RFC 6750
    requires on 401 responses to bearer-protected resources (the original
    omitted it).
    """
    token = credentials.credentials
    try:
        return auth.verify_id_token(token)
    except Exception as e:
        # Full traceback goes to the server log; the client gets the summary.
        logger.exception("Firebase token verification failed: %s", str(e))
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail=f"Invalid or expired token: {str(e)}",
            headers={"WWW-Authenticate": "Bearer"},
        )
87
+
88
+
89
async def verify_firebase_token_optional(
    credentials: Optional[HTTPAuthorizationCredentials] = Depends(optional_security),
) -> Optional[dict]:
    """
    Optional auth dependency for read-only endpoints.
    Returns decoded token when valid, otherwise returns None.
    """
    # No Authorization header (or an empty one): treat as anonymous.
    if credentials is None or not credentials.credentials:
        return None

    try:
        return auth.verify_id_token(credentials.credentials)
    except Exception as e:
        # Invalid tokens downgrade to anonymous instead of erroring out.
        logger.warning("Optional Firebase token verification failed: %s", str(e))
        return None
106
+
backend/connection.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from typing import Tuple

import pymongo
from pymongo import MongoClient

# Connection settings come from the environment.
# NOTE(review): COLLECTION_NAME is used below as a *database* name, not a
# collection name — the env var name looks misleading; confirm with the
# deployment configuration.
MONGO_URI = os.getenv("MONGO_URI")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

# Lazily created singleton client shared by the whole process.
_client = None


def get_client() -> MongoClient:
    """Return the shared MongoClient, creating it on first use."""
    global _client
    if _client is None:
        _client = MongoClient(MONGO_URI)
    return _client


def get_db(db_name: str = COLLECTION_NAME):
    """Return a database handle.

    Falls back to "dmj" (the same default get_collection uses) when no name
    is configured — previously an unset COLLECTION_NAME made ``client[None]``
    raise a TypeError.
    """
    client = get_client()
    return client[db_name or "dmj"]


def get_collection(name: str, db_name: str = "dmj"):
    """Return collection *name* from database *db_name*."""
    db = get_db(db_name)
    return db[name]


if __name__ == "__main__":
    # Smoke test: connect and count documents in the "memories" collection.
    try:
        col = get_collection("memories")
        print("Connected to collection:", col.name)
        print("Documents count:", col.count_documents({}))
    except Exception as e:
        print("Connection test failed:", e)
backend/crud.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional
2
+ from datetime import datetime
3
+ from bson.objectid import ObjectId
4
+
5
+ from backend.connection import get_collection
6
+
7
+
8
+ def _safe_round(value, digits: int = 3) -> float:
9
+ try:
10
+ if value is None:
11
+ return 0.0
12
+ return round(float(value), digits)
13
+ except (TypeError, ValueError):
14
+ return 0.0
15
+
16
+
17
def create_memory(data: dict) -> str:
    """Insert a new memory document and return its ObjectId as a string.

    Adds created_at/updated_at timestamps and derives is_processed from the
    presence of the NLP output fields. Works on a shallow copy so the
    caller's dict is no longer mutated (the original wrote timestamps — and
    pymongo's ``_id`` — back into the argument).
    """
    col = get_collection("memories")

    doc = dict(data)  # copy: keep the caller's dict untouched
    now = datetime.utcnow()
    doc["created_at"] = now
    doc["updated_at"] = now

    # A memory counts as processed only when both NLP artifacts are present.
    doc["is_processed"] = "embedding_id" in doc and "nlp_insights" in doc

    res = col.insert_one(doc)
    return str(res.inserted_id)
29
+
30
+
31
def list_memories(limit: int = 50, processed_only: bool = False) -> List[dict]:
    """Return up to *limit* newest memories, optionally only processed ones.

    Each document gets a string "id" field and ISO-formatted timestamps so
    the result is JSON-serializable.
    """
    col = get_collection("memories")
    query = {"is_processed": True} if processed_only else {}

    serialized: List[dict] = []
    for doc in col.find(query).sort("created_at", -1).limit(limit):
        doc["id"] = str(doc["_id"])
        # Convert datetimes to ISO strings for JSON responses.
        for field in ("created_at", "updated_at"):
            if field in doc and isinstance(doc[field], datetime):
                doc[field] = doc[field].isoformat()
        serialized.append(doc)
    return serialized
49
+
50
+
51
def get_memory_by_id(memory_id: str) -> Optional[dict]:
    """Get a single memory by ID.

    Returns the document with a string "id" and ISO timestamps, or None
    when the id is malformed or no document matches.
    """
    col = get_collection("memories")
    try:
        doc = col.find_one({"_id": ObjectId(memory_id)})
        if not doc:
            return None
        doc["id"] = str(doc["_id"])
        for field in ("created_at", "updated_at"):
            if field in doc and isinstance(doc[field], datetime):
                doc[field] = doc[field].isoformat()
        return doc
    except Exception:
        # Invalid ObjectId strings (and lookup failures) read as "not found".
        return None
65
+
66
+
67
def update_memory_by_id(memory_id: str, updates: dict) -> bool:
    """Apply editable-field updates to a memory by ID.

    Returns True when a matching document exists — including a no-op edit
    where the new values equal the old ones (the original used
    ``modified_count``, which reports 0 in that case and made a successful
    save look like a failure). Works on a copy so the caller's dict is not
    mutated. Returns False for unknown or malformed ids.
    """
    col = get_collection("memories")
    try:
        payload = dict(updates)  # copy: don't write updated_at into caller's dict
        payload["updated_at"] = datetime.utcnow()
        result = col.update_one(
            {"_id": ObjectId(memory_id)},
            {"$set": payload},
        )
        # matched_count, not modified_count: "found" is success.
        return result.matched_count > 0
    except Exception:
        return False
79
+
80
+
81
def update_memory_with_nlp(memory_id: str, nlp_data: dict) -> bool:
    """Update a memory with NLP extraction results and mark it processed.

    Args:
        memory_id: MongoDB ObjectId as string
        nlp_data: Dict with content_clean, nlp_insights, embedding_id, etc.

    Returns True when a matching document exists. Uses ``matched_count``
    rather than ``modified_count`` so re-processing a memory with identical
    values still counts as success, and copies *nlp_data* so the caller's
    dict is not mutated (the original wrote updated_at/is_processed into it).
    """
    col = get_collection("memories")
    try:
        payload = dict(nlp_data)  # copy: keep the caller's dict untouched
        payload["updated_at"] = datetime.utcnow()
        payload["is_processed"] = True

        result = col.update_one(
            {"_id": ObjectId(memory_id)},
            {"$set": payload},
        )
        return result.matched_count > 0
    except Exception:
        return False
100
+
101
+
102
def get_stats() -> dict:
    """Get aggregated stats from memories including emotion analysis.

    Returns a dict with:
      - total_memories: count of all documents
      - most_common_mood: modal "mood" value, or None when no doc has one
      - top_emotions: per-bucket averages of nlp_insights.emotion_scores,
        or None when no processed docs exist
      - top_topics: five most frequent nlp_insights.topics, or None
    """
    col = get_collection("memories")
    total = col.count_documents({})

    # Most common mood: group by mood, keep the single largest group.
    mood_pipeline = [
        {"$match": {"mood": {"$exists": True}}},
        {"$group": {"_id": "$mood", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 1},
    ]
    mood_agg = list(col.aggregate(mood_pipeline))
    most_common_mood = mood_agg[0]["_id"] if mood_agg else None

    # Average each emotion bucket across every memory that has scores.
    # _id: None collapses everything into one aggregate row.
    emotion_pipeline = [
        {"$match": {"nlp_insights.emotion_scores": {"$exists": True}}},
        {"$group": {
            "_id": None,
            "joy_avg": {"$avg": "$nlp_insights.emotion_scores.joy"},
            "sadness_avg": {"$avg": "$nlp_insights.emotion_scores.sadness"},
            "anger_avg": {"$avg": "$nlp_insights.emotion_scores.anger"},
            "fear_avg": {"$avg": "$nlp_insights.emotion_scores.fear"},
            "surprise_avg": {"$avg": "$nlp_insights.emotion_scores.surprise"},
            "disgust_avg": {"$avg": "$nlp_insights.emotion_scores.disgust"},
        }},
    ]
    emotion_agg = list(col.aggregate(emotion_pipeline))
    top_emotions = {}
    if emotion_agg:
        e = emotion_agg[0]
        # _safe_round guards against None/missing averages from the pipeline.
        top_emotions = {
            "joy": _safe_round(e.get("joy_avg")),
            "sadness": _safe_round(e.get("sadness_avg")),
            "anger": _safe_round(e.get("anger_avg")),
            "fear": _safe_round(e.get("fear_avg")),
            "surprise": _safe_round(e.get("surprise_avg")),
            "disgust": _safe_round(e.get("disgust_avg")),
        }

    # Top topics: unwind the topics array so each topic counts individually.
    topic_pipeline = [
        {"$match": {"nlp_insights.topics": {"$exists": True}}},
        {"$unwind": "$nlp_insights.topics"},
        {"$group": {"_id": "$nlp_insights.topics", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 5},
    ]
    topic_agg = list(col.aggregate(topic_pipeline))
    top_topics = [t["_id"] for t in topic_agg]

    return {
        "total_memories": total,
        "most_common_mood": most_common_mood,
        "top_emotions": top_emotions if top_emotions else None,
        "top_topics": top_topics if top_topics else None,
    }
backend/main.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from dotenv import load_dotenv
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ import logging
6
+ import os
7
+ from pathlib import Path
8
+
9
+ # Load environment variables from .env before importing modules that depend on them
10
+ load_dotenv(dotenv_path=Path(__file__).resolve().parent / ".env")
11
+ load_dotenv()
12
+
13
+ from backend.routers import auth, memories, dashboard
14
+ from backend.nlp_processor import process_unprocessed_memories
15
+
16
+ # Configure logging
17
+ logging.basicConfig(level=logging.INFO)
18
+ logger = logging.getLogger(__name__)
19
+
20
+ app = FastAPI(title="DMJ Backend")
21
+
22
+ # Allow local dev frontend to call the API on any local port.
23
+ # You can override with CORS_ALLOW_ORIGINS="http://localhost:3000,http://127.0.0.1:3001"
24
# Allowed CORS origins, overridable via CORS_ALLOW_ORIGINS (comma-separated).
# Each entry is stripped of a trailing "/": browser Origin headers never end
# with a slash, so "https://dmemoryjar.vercel.app/" (as in the original
# default) could never match and that frontend was silently blocked.
configured_origins = [
    origin.strip().rstrip("/")
    for origin in os.getenv("CORS_ALLOW_ORIGINS","https://v0-djjv2.vercel.app,https://dmemoryjar.vercel.app/").split(",")
    if origin.strip()
]
29
+
30
+ app.add_middleware(
31
+ CORSMiddleware,
32
+ allow_origins=configured_origins,
33
+ allow_origin_regex=r"https?://(localhost|127\.0\.0\.1)(:\d+)?$",
34
+ allow_credentials=True,
35
+ allow_methods=["*"],
36
+ allow_headers=["*"],
37
+ )
38
+
39
+
40
+ @app.get("/healthz")
41
+ def health():
42
+ return {"status": "ok"}
43
+
44
+
45
+ app.include_router(auth.router, prefix="/auth", tags=["auth"])
46
+ app.include_router(memories.router, prefix="/memories", tags=["memories"])
47
+ app.include_router(dashboard.router, prefix="/dashboard", tags=["dashboard"])
48
+
49
+
50
+ # # Background scheduler for processing unprocessed memories
51
+ # scheduler = BackgroundScheduler()
52
+
53
+ # def process_memories_job():
54
+ # """Job that runs every 10 seconds to process unprocessed memories."""
55
+ # try:
56
+ # # logger.info("Running memory processing job...")
57
+ # process_unprocessed_memories()
58
+ # # logger.info("Memory processing job completed.")
59
+ # except Exception as e:
60
+ # logger.error(f"Error in memory processing job: {e}")
61
+
62
+ # scheduler.add_job(process_memories_job, "interval", seconds=10, id="process_memories")
63
+ # scheduler.start()
64
+
65
+
66
+ @app.on_event("startup")
67
+ def startup_event():
68
+ logger.info("*********************************Application startup - scheduler running*********************************")
69
+
70
+
71
+ @app.on_event("shutdown")
72
+ def shutdown_event():
73
+ # scheduler.shutdown()
74
+ logger.info("*********************************Application shutdown - scheduler stopped*********************************")
75
+
76
+
77
+ if __name__ == "__main__":
78
+ import uvicorn
79
+
80
+ uvicorn.run(app, host="127.0.0.1", port=8000, reload=True)
backend/nlp_processor.py ADDED
@@ -0,0 +1,937 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AI NLP Processing Pipeline Template
3
+ 1. Text Preprocessing & Cleaning (text_preprocessor.py)
4
+ 2. Emotion Analysis
5
+ 3. Keyword & Topic Extraction
6
+ 4. Entity Recognition
7
+ 5. Embedding Generation
8
+ 6. Store in MongoDB + FAISS
9
+ """
10
+
11
+ from typing import Dict, List, Optional
12
+ import logging
13
+ import json
14
+ import os
15
+ from urllib import error, request
16
+ from datetime import datetime
17
+ from functools import lru_cache
18
+
19
+ from backend.connection import get_collection
20
+ from backend.crud import update_memory_with_nlp
21
+ from backend.text_preprocessor import TextPreprocessor, preprocess_unprocessed_memories
22
+
23
+ logging.basicConfig(level=logging.INFO)
24
+ logger = logging.getLogger(__name__)
25
+ # ----------------------------------------------------------------------
26
+ # Helper functions for Hugging Face API calls
27
+ # ----------------------------------------------------------------------
28
+ def _get_hf_timeout_seconds() -> int:
29
+ """Get timeout for HF inference requests from env, default 20 seconds."""
30
+ value = os.getenv("HF_INFERENCE_TIMEOUT_SECONDS", "20")
31
+ try:
32
+ return int(value)
33
+ except ValueError:
34
+ return 20
35
+ def _hf_inference_endpoints(model_id: str) -> List[str]:
36
+ """Return ordered Hugging Face inference endpoints to try."""
37
+ explicit_base = os.getenv("HF_INFERENCE_BASE_URL", "").strip().rstrip("/")
38
+ endpoints: List[str] = []
39
+ if explicit_base:
40
+ endpoints.append(f"{explicit_base}/{model_id}")
41
+
42
+ endpoints.extend([
43
+ f"https://router.huggingface.co/hf-inference/models/{model_id}",
44
+ f"https://api-inference.huggingface.co/models/{model_id}",
45
+ ])
46
+ # Remove duplicates
47
+ deduped: List[str] = []
48
+ seen = set()
49
+ for endpoint in endpoints:
50
+ if endpoint not in seen:
51
+ deduped.append(endpoint)
52
+ seen.add(endpoint)
53
+ return deduped
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+ # ----------------------------------------------------------------------
68
+ # Keyword extraction with KeyBERT (cached model)
69
+ # ----------------------------------------------------------------------
70
+
71
+ # Keyword extraction now uses TextPreprocessor (lightweight, no ML models)
72
+
73
+ def _get_keybert_top_n() -> int:
74
+ """Get number of keywords to extract from env, default 8."""
75
+ value = os.getenv("KEYBERT_TOP_N", "8")
76
+ try:
77
+ parsed = int(value)
78
+ return max(1, min(parsed, 20))
79
+ except ValueError:
80
+ return 8
81
+
82
+
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+ # ----------------------------------------------------------------------
94
+ # Zero-shot topic classification
95
+ # ----------------------------------------------------------------------
96
+ # Zero-shot now uses HF API instead of local transformers
97
+
98
+ def _get_topic_candidate_labels() -> List[str]:
99
+ # configured = os.getenv("TOPIC_LABELS", "").strip()
100
+ # if configured:
101
+ # return [label.strip() for label in configured.split(",") if label.strip()]
102
+ return [
103
+ "Work & Productivity",
104
+ "Health & Wellness",
105
+ "Emotions & Mental Health",
106
+ "Relationships & Family",
107
+ "Learning & Growth",
108
+ "Finance",
109
+ "Travel & Leisure",
110
+ "Daily Life",
111
+ ]
112
+
113
+
114
+ def _get_topic_score_threshold() -> float:
115
+ """Minimum confidence score for topic assignment."""
116
+ value = os.getenv("TOPIC_MIN_SCORE", "0.2")
117
+ try:
118
+ parsed = float(value)
119
+ return max(0.0, min(parsed, 1.0))
120
+ except ValueError:
121
+ return 0.2
122
+
123
+
124
+ def _get_topic_max_labels() -> int:
125
+ """Maximum number of topics to assign per memory."""
126
+ value = os.getenv("TOPIC_MAX_LABELS", "2")
127
+ try:
128
+ parsed = int(value)
129
+ return max(1, min(parsed, 5))
130
+ except ValueError:
131
+ return 2
132
+
133
+
134
+
135
+
136
+
137
+
138
+
139
+ # ----------------------------------------------------------------------
140
+ # Named Entity Recognition
141
+ # ----------------------------------------------------------------------
142
+
143
+ # NER disabled for memory optimization on free hosting
144
+
145
+ def _get_ner_score_threshold() -> float:
146
+ """Minimum confidence for NER entities (optional env)."""
147
+ value = os.getenv("NER_MIN_SCORE", "0.35")
148
+ try:
149
+ parsed = float(value)
150
+ return max(0.0, min(parsed, 1.0))
151
+ except ValueError:
152
+ return 0.35
153
+
154
+
155
+
156
+
157
+
158
+
159
+
160
+
161
+
162
+
163
+
164
+
165
+
166
+ # ----------------------------------------------------------------------
167
+ # Embedding generation
168
+ # ----------------------------------------------------------------------
169
+
170
+ # Embedding generation now uses HF API instead of local SentenceTransformers
171
+
172
+
173
+ # ----------------------------------------------------------------------
174
+ # General helpers
175
+
176
+ def _flatten_hf_labels(payload: object) -> List[Dict[str, float]]:
177
+ """Convert Hugging Face API output to list of {label, score} dicts."""
178
+ if not isinstance(payload, list):
179
+ return []
180
+
181
+ if payload and isinstance(payload[0], list):
182
+ candidates = payload[0]
183
+ else:
184
+ candidates = payload
185
+
186
+ parsed: List[Dict[str, float]] = []
187
+ for item in candidates:
188
+ if not isinstance(item, dict):
189
+ continue
190
+ label = str(item.get("label", "")).strip().lower()
191
+ score = item.get("score", 0.0)
192
+ try:
193
+ parsed.append({"label": label, "score": float(score)})
194
+ except (TypeError, ValueError):
195
+ continue
196
+ return parsed
197
+
198
+
199
+ def _dedupe_text_items(items: List[str]) -> List[str]:
200
+ """Remove duplicate strings (case‑insensitive)."""
201
+ cleaned: List[str] = []
202
+ seen = set()
203
+ for item in items:
204
+ value = item.strip()
205
+ if not value:
206
+ continue
207
+ key = value.lower()
208
+ if key in seen:
209
+ continue
210
+ seen.add(key)
211
+ cleaned.append(value)
212
+ return cleaned
213
+
214
+
215
+
216
+
217
+
218
+
219
+
220
+
221
+
222
+
223
+
224
+
225
+
226
+ # ----------------------------------------------------------------------
227
+ # Text cleaning
228
def clean_text(text: str) -> str:
    """
    Normalize, tokenize, lemmatize, remove stopwords
    (delegates to TextPreprocessor and returns its "cleaned" output).
    """
    return TextPreprocessor().preprocess(text)["cleaned"]
235
+
236
+
237
+ # ----------------------------------------------------------------------
238
+ # Emotion scoring (using Hugging Face API)
239
+ # ----------------------------------------------------------------------
240
+
241
+ EMOTION_BUCKET_LABELS = {
242
+ "joy": {"joy", "amusement", "excitement", "optimism", "contentment", "happy", "excited", "content"},
243
+ "sadness": {"sadness", "disappointment", "grief", "remorse", "hurt", "lonely", "disappointed"},
244
+ "anger": {"anger", "annoyance", "rage", "frustration", "frustrated", "annoyed", "furious"},
245
+ "fear": {"fear", "nervousness", "anxiety", "worry", "anxious", "nervous", "worried"},
246
+ "surprise": {"surprise", "realization", "amazed", "amaze", "shocked"},
247
+ "disgust": {"disgust", "disapproval", "embarrassment", "dislike", "uncomfortable"},
248
+ }
249
+
250
+
251
+ def _neutral_emotion_scores() -> Dict[str, float]:
252
+ """Return zero‑initialized emotion score dict."""
253
+ return {
254
+ "joy": 0.0,
255
+ "sadness": 0.0,
256
+ "anger": 0.0,
257
+ "fear": 0.0,
258
+ "surprise": 0.0,
259
+ "disgust": 0.0,
260
+ }
261
+
262
+
263
def _bucketize_emotions(label_scores: List[Dict[str, float]]) -> Dict[str, float]:
    """Fold fine-grained HF emotion labels into the six coarse buckets.

    Each label's score is added to the first bucket whose alias set contains
    it; unknown labels are ignored. When anything matched, the buckets are
    normalized to sum to 1.0 (rounded to 4 places).
    """
    scores = _neutral_emotion_scores()
    for entry in label_scores:
        name = entry["label"]
        value = float(entry["score"])
        bucket = next(
            (b for b, aliases in EMOTION_BUCKET_LABELS.items() if name in aliases),
            None,
        )
        if bucket is not None:
            scores[bucket] += value

    total = sum(scores.values())
    if total > 0:
        return {emotion: round(v / total, 4) for emotion, v in scores.items()}
    return scores
277
+
278
+
279
def extract_emotion_scores(text: str) -> Dict[str, float]:
    """Score *text* against the six emotion buckets via the HF inference API.

    Posts the text to "AnasAlokla/multilingual_go_emotions" on each configured
    endpoint in order; the first usable response is bucketized and normalized
    by _bucketize_emotions. Returns all-zero scores for empty input, a missing
    HF_API_TOKEN, or when every endpoint fails.
    """
    if not text or not text.strip():
        return _neutral_emotion_scores()

    hf_api_token = os.getenv("HF_API_TOKEN")
    hf_timeout_seconds = _get_hf_timeout_seconds()

    if not hf_api_token:
        logger.warning("HF_API_TOKEN missing. Returning default emotion scores.")
        return _neutral_emotion_scores()

    # wait_for_model: block until the model is loaded rather than erroring.
    body = json.dumps({"inputs": text, "options": {"wait_for_model": True}}).encode("utf-8")
    last_error: Optional[str] = None

    for endpoint in _hf_inference_endpoints("AnasAlokla/multilingual_go_emotions"):
        req = request.Request(
            endpoint,
            data=body,
            method="POST",
            headers={
                "Authorization": f"Bearer {hf_api_token}",
                "Content-Type": "application/json",
            },
        )

        try:
            with request.urlopen(req, timeout=hf_timeout_seconds) as res:
                raw_payload = res.read().decode("utf-8")
                payload = json.loads(raw_payload)

            # The API can report failure inside a 200 response body.
            if isinstance(payload, dict) and payload.get("error"):
                last_error = str(payload.get("error"))
                logger.warning("Hugging Face API error from %s: %s", endpoint, last_error)
                continue

            label_scores = _flatten_hf_labels(payload)
            if not label_scores:
                last_error = "No valid label scores from Hugging Face response."
                logger.warning("%s Endpoint: %s", last_error, endpoint)
                continue

            return _bucketize_emotions(label_scores)

        except error.HTTPError as e:
            # Try to capture the response body for the log; it may be unreadable.
            try:
                error_body = e.read().decode("utf-8")
            except Exception:
                error_body = ""
            last_error = f"HTTP {e.code} {e.reason}"
            logger.warning(
                "Hugging Face HTTP error via %s: %s. Body: %s",
                endpoint,
                last_error,
                error_body,
            )
            continue
        except error.URLError as e:
            last_error = f"Network error: {e.reason}"
            logger.warning("Hugging Face network error via %s: %s", endpoint, e.reason)
            continue
        except json.JSONDecodeError:
            last_error = "Failed to decode Hugging Face response JSON."
            logger.warning("%s Endpoint: %s", last_error, endpoint)
            continue
        except Exception as e:
            last_error = f"Unexpected emotion scoring error: {str(e)}"
            logger.warning("%s Endpoint: %s", last_error, endpoint)
            continue

    # Every endpoint was exhausted without a usable response.
    if last_error:
        logger.error("Emotion scoring failed for model. Last error: %s", last_error)

    return _neutral_emotion_scores()
352
+
353
+
354
+
355
+
356
+
357
+
358
+
359
+
360
+
361
+
362
+
363
+
364
+
365
+
366
+
367
+
368
+
369
+
370
+
371
+ # ----------------------------------------------------------------------
372
+ # Keyword extraction (KeyBERT with fallback)
373
+ # ----------------------------------------------------------------------
374
+
375
def extract_keywords(text: str) -> List[str]:
    """Extract keywords using TextPreprocessor (lightweight, no ML models).

    Returns [] for empty input or when extraction raises.
    """
    if not text or not text.strip():
        return []

    try:
        return TextPreprocessor().extract_keywords(text, top_n=_get_keybert_top_n())
    except Exception as exc:
        logger.error("Keyword extraction failed: %s", str(exc))
        return []
388
+
389
+
390
+
391
+
392
+
393
+
394
+
395
+
396
+
397
+
398
+
399
+
400
+
401
+
402
+
403
+
404
+
405
+
406
+
407
+
408
+ # ----------------------------------------------------------------------
409
+ # Topic categorization
410
+ # ----------------------------------------------------------------------
411
+
412
def categorize_topics(text: str, keywords: List[str]) -> List[str]:
    """Assign up to TOPIC_MAX_LABELS topics via HF zero-shot classification.

    Fix: the original body was a broken merge — a leftover call to an
    undefined ``_get_zero_shot_classifier`` and a dangling ``result =
    classifier(`` wrapped around the HF-API implementation, plus a stray
    ``multi_label=True,`` line, which made the module unparseable. This is
    the HF-API version reconstructed cleanly; the keyword-based fallback is
    used when HF_API_TOKEN is missing or every endpoint fails.
    """
    if not text or not text.strip():
        return ["Daily Life"]

    candidate_labels = _get_topic_candidate_labels()
    min_score = _get_topic_score_threshold()
    max_labels = _get_topic_max_labels()

    # Appending keywords gives the classifier extra signal for short entries.
    text_for_classification = text
    if keywords:
        text_for_classification = f"{text}\nKeywords: {', '.join(keywords[:10])}"

    hf_api_token = os.getenv("HF_API_TOKEN")
    if not hf_api_token:
        logger.warning("HF_API_TOKEN missing. Using keyword-based topic classification.")
        return _fallback_topic_classification(text)

    hf_timeout = _get_hf_timeout_seconds()
    model_id = "joeddav/xlm-roberta-large-xnli"

    body = json.dumps({
        "inputs": text_for_classification,
        "parameters": {
            "candidate_labels": candidate_labels,
            "multi_label": True
        },
        "options": {"wait_for_model": True}
    }).encode("utf-8")

    for endpoint in _hf_inference_endpoints(model_id):
        req = request.Request(
            endpoint,
            data=body,
            method="POST",
            headers={
                "Authorization": f"Bearer {hf_api_token}",
                "Content-Type": "application/json",
            },
        )

        try:
            with request.urlopen(req, timeout=hf_timeout) as res:
                raw_payload = res.read().decode("utf-8")
                result = json.loads(raw_payload)

            # Failures can arrive as a 200 with an "error" field.
            if isinstance(result, dict) and result.get("error"):
                logger.warning("HF API error: %s", result.get("error"))
                continue

            labels = result.get("labels", []) if isinstance(result, dict) else []
            scores = result.get("scores", []) if isinstance(result, dict) else []

            # Labels come back sorted by score; keep those above the
            # threshold, capped at max_labels.
            ranked_topics: List[str] = []
            for label, score in zip(labels, scores):
                if float(score) >= min_score:
                    ranked_topics.append(str(label))
                if len(ranked_topics) >= max_labels:
                    break

            if ranked_topics:
                return ranked_topics
            # Nothing cleared the threshold: fall back to the best guess.
            if labels:
                return [str(labels[0])]

        except Exception as e:
            logger.warning("HF API request failed: %s", str(e))
            continue

    return _fallback_topic_classification(text)
490
+
491
+ def _fallback_topic_classification(text: str) -> List[str]:
492
+ """Fallback keyword-based topic classification."""
493
+ topics = []
494
+ work_keywords = ["work", "email", "project", "deliverable", "deadline"]
495
+ health_keywords = ["walk", "exercise", "sleep", "health", "tired"]
496
+ mood_keywords = ["grateful", "happy", "sad", "anxious", "stressed"]
497
+
498
+ text_lower = text.lower()
499
+
500
+ if any(k in text_lower for k in work_keywords):
501
+ topics.append("Work & Productivity")
502
+ if any(k in text_lower for k in health_keywords):
503
+ topics.append("Health & Wellness")
504
+ if any(k in text_lower for k in mood_keywords):
505
+ topics.append("Emotions & Mental Health")
506
+
507
+ return topics or ["Daily Life"]
508
+
509
def extract_entities_OLD_DISABLED(text: str) -> List[str]:
    """OLD VERSION - named-entity extraction: HF NER pipeline first, spaCy fallback.

    Entities below the configured score threshold are discarded; results are
    de-duplicated. Returns an empty list for blank input or when both paths fail.
    """
    if not text or not text.strip():
        return []

    threshold = _get_ner_score_threshold()

    # Primary path: multilingual Hugging Face NER pipeline.
    try:
        pipeline = _get_hf_ner_pipeline()
        found: List[str] = []
        for candidate in pipeline(text):
            if not isinstance(candidate, dict):
                continue
            word = str(candidate.get("word", "")).strip()
            try:
                confidence = float(candidate.get("score", 0.0))
            except (TypeError, ValueError):
                confidence = 0.0
            if word and confidence >= threshold:
                found.append(word)

        found = _dedupe_text_items(found)
        if found:
            return found
    except ImportError as e:
        logger.warning("Transformers dependency missing for NER (%s).", str(e))
    except Exception as e:
        logger.error("Hugging Face NER failed: %s", str(e))

    # Fallback path: spaCy statistical NER.
    try:
        spacy_model = _get_spacy_ner_model()
        found = _dedupe_text_items([ent.text for ent in spacy_model(text).ents])
        if found:
            return found
    except ImportError as e:
        logger.warning("spaCy dependency missing for NER fallback (%s).", str(e))
    except Exception as e:
        logger.error("spaCy NER fallback failed: %s", str(e))

    return []
554
+
555
def extract_entities(text: str) -> List[str]:
    """Named-entity extraction stub - intentionally disabled.

    NER needs heavy transformers models, which exceed the memory budget of
    free hosting tiers, so this always returns an empty list.

    NOTE(review): a second `extract_entities` defined later in this module
    overrides this stub at import time, silently re-enabling NER - confirm
    which definition is intended to win.
    """
    logger.info("NER disabled for memory optimization")
    return []
561
+
562
def generate_embedding_OLD_DISABLED(text: str) -> Dict:
    """OLD VERSION - Generate multilingual sentence embedding using Sentence Transformers.

    Returns:
        Dict with keys:
            vector: list of floats (empty on blank input or failure)
            model: the sentence-transformers model name used
    """
    model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

    # BUG FIX: this blank-input branch previously contained stray lines
    # copied from the topic classifier ("if float(score) >= min_score:")
    # instead of a return statement, which was a syntax error.
    if not text or not text.strip():
        return {
            "vector": [],
            "model": model_name,
        }

    try:
        embedding_model = _get_embedding_model_instance()
        vector = embedding_model.encode(text, convert_to_tensor=False, normalize_embeddings=True)

        # numpy arrays / tensors expose tolist(); other sequences are wrapped.
        if hasattr(vector, "tolist"):
            vector_list = vector.tolist()
        else:
            vector_list = list(vector)

        return {
            "vector": vector_list,
            "model": model_name,
        }
    except ImportError as e:
        logger.warning("sentence-transformers dependency missing for embeddings (%s).", str(e))
    except Exception as e:
        logger.error("Embedding generation failed: %s", str(e))

    # Any failure degrades to an empty vector rather than raising.
    return {
        "vector": [],
        "model": model_name,
    }
594
+
595
def generate_embedding(text: str) -> Dict:
    """Generate multilingual sentence embedding via the Hugging Face Inference API.

    Returns:
        Dict {"vector": [...], "model": model_name}; the vector is empty when
        the input is blank, HF_API_TOKEN is unset, or every endpoint fails.

    NOTE(review): this function is redefined later in the module with a local
    sentence-transformers implementation; the later definition wins at import
    time - confirm which one is intended.
    """
    model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

    if not text or not text.strip():
        return {
            "vector": [],
            "model": model_name,
        }

    hf_api_token = os.getenv("HF_API_TOKEN")
    if not hf_api_token:
        logger.warning("HF_API_TOKEN missing. Skipping embedding generation.")
        return {
            "vector": [],
            "model": model_name,
        }

    hf_timeout = _get_hf_timeout_seconds()

    try:
        body = json.dumps({
            "inputs": text,
            "options": {"wait_for_model": True}
        }).encode("utf-8")

        # Try each candidate endpoint until one yields a usable vector.
        for endpoint in _hf_inference_endpoints(model_name):
            req = request.Request(
                endpoint,
                data=body,
                method="POST",
                headers={
                    "Authorization": f"Bearer {hf_api_token}",
                    "Content-Type": "application/json",
                },
            )

            try:
                with request.urlopen(req, timeout=hf_timeout) as res:
                    raw_payload = res.read().decode("utf-8")
                    payload = json.loads(raw_payload)

                # The API reports model/queue errors inside the JSON body.
                if isinstance(payload, dict) and payload.get("error"):
                    logger.warning("HF API error: %s", payload.get("error"))
                    continue

                # HF Feature Extraction returns either [vector] or the vector itself.
                if isinstance(payload, list) and len(payload) > 0:
                    vector_list = payload[0] if isinstance(payload[0], list) else payload
                    return {
                        "vector": vector_list,
                        "model": model_name,
                    }

            except Exception as e:
                logger.warning("HF API embedding request failed: %s", str(e))
                continue

    except Exception as e:
        logger.error("Embedding generation failed: %s", str(e))

    # BUG FIX: this fallback return previously contained a stray
    # `ranked_topics.append(str(label))` line (copy-paste from the topic
    # classifier) where the "vector" key belonged - a syntax error.
    return {
        "vector": [],
        "model": model_name,
    }
660
+
661
+
662
+
663
+
664
+
665
+
666
+
667
+
668
+
669
+
670
+
671
+
672
+
673
+
674
+
675
+
676
+
677
+
678
+
679
+
680
+ # ----------------------------------------------------------------------
681
+ # Named Entity Recognition (HF + spaCy fallback)
682
+ # ----------------------------------------------------------------------
683
+
684
def extract_entities(text: str) -> List[str]:
    """Extract named entities using multilingual HF NER with spaCy fallback.

    NOTE(review): this definition overrides the earlier memory-optimized
    `extract_entities` stub in the same module, re-enabling heavy NER models -
    confirm which behavior is intended before deploying to low-memory hosts.
    """
    if not text or not text.strip():
        return []

    threshold = _get_ner_score_threshold()

    # Primary path: multilingual Hugging Face NER pipeline.
    try:
        pipeline = _get_hf_ner_pipeline()
        collected: List[str] = []
        for candidate in pipeline(text):
            if not isinstance(candidate, dict):
                continue
            word = str(candidate.get("word", "")).strip()
            try:
                confidence = float(candidate.get("score", 0.0))
            except (TypeError, ValueError):
                confidence = 0.0
            if word and confidence >= threshold:
                collected.append(word)

        collected = _dedupe_text_items(collected)
        if collected:
            return collected
    except ImportError as e:
        logger.warning("Transformers dependency missing for NER (%s).", str(e))
    except Exception as e:
        logger.error("Hugging Face NER failed: %s", str(e))

    # Fallback path: spaCy statistical NER.
    try:
        spacy_model = _get_spacy_ner_model()
        collected = _dedupe_text_items([ent.text for ent in spacy_model(text).ents])
        if collected:
            return collected
    except ImportError as e:
        logger.warning("spaCy dependency missing for NER fallback (%s).", str(e))
    except Exception as e:
        logger.error("spaCy NER fallback failed: %s", str(e))

    return []
729
+
730
+
731
+
732
+
733
+
734
+
735
+
736
+
737
+
738
+
739
+
740
+
741
+
742
+
743
+
744
+
745
+
746
+
747
+
748
+
749
+
750
+
751
+
752
+ # ----------------------------------------------------------------------
753
+ # Embedding generation (Sentence Transformers)
754
+ # ----------------------------------------------------------------------
755
+
756
def generate_embedding(text: str) -> Dict:
    """Generate multilingual sentence embedding using Sentence Transformers.

    Returns {"vector": [...], "model": <model name>}; the vector is empty for
    blank input or on any failure.

    NOTE(review): this is the third definition of `generate_embedding` in the
    module and the one that wins at import time, replacing the HF-API version
    above - confirm the duplication is intentional.
    """
    model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    empty_result = {
        "vector": [],
        "model": model_name,
    }

    if not text or not text.strip():
        return empty_result

    try:
        encoder = _get_embedding_model_instance()
        raw_vector = encoder.encode(text, convert_to_tensor=False, normalize_embeddings=True)

        # numpy arrays / tensors expose tolist(); other sequences are wrapped.
        as_list = raw_vector.tolist() if hasattr(raw_vector, "tolist") else list(raw_vector)

        return {
            "vector": as_list,
            "model": model_name,
        }
    except ImportError as e:
        logger.warning("sentence-transformers dependency missing for embeddings (%s).", str(e))
    except Exception as e:
        logger.error("Embedding generation failed: %s", str(e))

    return empty_result
788
+
789
+
790
+
791
+
792
+
793
+
794
+
795
+
796
+
797
+
798
+
799
+
800
+
801
+
802
+
803
+
804
+
805
+
806
+
807
+
808
+
809
+
810
+ # ----------------------------------------------------------------------
811
+ # FAISS storage (placeholder – to be implemented)
812
+ # ----------------------------------------------------------------------
813
+
814
def store_embedding_in_faiss(vector: List[float], memory_id: str, faiss_index) -> int:
    """
    Store vector embedding in FAISS index and return its ID.

    Args:
        vector: Embedding vector
        memory_id: MongoDB ObjectId
        faiss_index: FAISS IndexFlatL2 instance

    Returns:
        embedding_id: Position in FAISS index

    TODO: Implement actual FAISS integration, roughly:
        index.add(np.array([vector]).astype('float32'))
        embedding_id = index.ntotal - 1
        then persist the embedding_id -> memory_id mapping in a separate collection.
    """
    # Placeholder: always returns a fixed dummy ID until FAISS lands.
    return 4271
835
+
836
+
837
+
838
+
839
+
840
+
841
+
842
+
843
+
844
+ # ----------------------------------------------------------------------
845
+ # Main processing loop (called by scheduler)
846
+ # ----------------------------------------------------------------------
847
+
848
def process_unprocessed_memories(batch_size: int = 50) -> Dict:
    """
    Run the NLP pipeline over memories that have preprocessing results but no
    NLP insights yet, and persist the outcome per document.

    Order of operations:
    1. Text Preprocessing & cleaning spaCy already done in separate step
    2. Emotion Analysis
    3. Keyword & Topic Extraction
    4. Entity Recognition
    5. Embedding Generation
    6. Store in MongoDB

    Args:
        batch_size: maximum number of documents handled per stage in one run.

    Returns:
        Dict with a "preprocessing" summary (from the preprocessing pass) and
        an "nlp_processing" summary: total / processed / failed counts plus
        the collected error messages.
    """
    col = get_collection("memories")

    # Step 1: Preprocess any memories without preprocessing
    preprocessing_result = preprocess_unprocessed_memories(batch_size)

    # Step 2: Process preprocessed memories for emotion/embedding.
    # Selects only documents that finished preprocessing but have no
    # nlp_insights yet, capped at batch_size.
    unprocessed = list(col.find(
        {
            "preprocessing": {"$exists": True},
            "nlp_insights": {"$exists": False}
        }
    ).limit(batch_size))

    processed_count = 0
    failed_count = 0
    errors = []

    for memory in unprocessed:
        try:
            memory_id = str(memory["_id"])
            preprocessed = memory.get("preprocessing", {})
            cleaned_text = preprocessed.get("cleaned", "")
            preprocessing_keywords = preprocessed.get("keywords", [])

            # Nothing to analyze; leave the document for a later pass.
            if not cleaned_text:
                continue

            logger.info(f"Processing memory {memory_id}...")

            # Run NLP pipeline on cleaned text
            emotion_scores = extract_emotion_scores(cleaned_text)
            # Fall back to preprocessing-stage keywords when extraction finds none.
            keywords = extract_keywords(cleaned_text) or preprocessing_keywords
            topics = categorize_topics(cleaned_text, keywords)
            entities = extract_entities(cleaned_text)
            embedding_data = generate_embedding(cleaned_text)
            embedding_id = store_embedding_in_faiss(
                embedding_data["vector"],
                memory_id,
                faiss_index=None  # FAISS index not yet initialized
            )

            # Determine mood from top emotion
            mood = max(emotion_scores, key=emotion_scores.get) if emotion_scores else "neutral"

            # Prepare update data
            nlp_data = {
                "content_clean": cleaned_text,
                "mood": mood,
                "embedding_id": embedding_id,
                "nlp_insights": {
                    "emotion_scores": emotion_scores,
                    "keywords": keywords,
                    "topics": topics,
                    "entities": entities,
                },
            }

            # Update memory in MongoDB
            if update_memory_with_nlp(memory_id, nlp_data):
                processed_count += 1
                logger.info(f"✓ Processed {memory_id}")
            else:
                failed_count += 1
                errors.append(f"Failed to update {memory_id}")

        except Exception as e:
            # Per-document isolation: one bad memory must not abort the batch.
            failed_count += 1
            error_msg = f"Error processing {memory.get('_id')}: {str(e)}"
            errors.append(error_msg)
            logger.error(error_msg)

    return {
        "preprocessing": preprocessing_result,
        "nlp_processing": {
            "total": len(unprocessed),
            "processed": processed_count,
            "failed": failed_count,
            "errors": errors,
        }
    }
backend/requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ APScheduler==3.11.2
2
+ dnspython==2.8.0
3
+ fastapi==0.133.0
4
+ firebase_admin==7.1.0
5
+ pydantic==2.12.5
6
+ pymongo==4.16.0
7
+ python-dotenv==1.2.1
8
+ uvicorn==0.41.0
9
+ wrapt==2.1.1
backend/routers/auth.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Depends
2
+
3
+ from backend.auth_deps import verify_firebase_token
4
+
5
+ router = APIRouter()
6
+
7
+
8
@router.get("/me")
async def me(user: dict = Depends(verify_firebase_token)):
    """Return current user info from Firebase token."""
    claims = ("uid", "email", "email_verified")
    return {claim: user.get(claim) for claim in claims}
16
+
17
+
backend/routers/dashboard.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Depends
2
+
3
+ from backend import crud, schemas
4
+ from backend.auth_deps import verify_firebase_token_optional
5
+
6
+ router = APIRouter()
7
+
8
+
9
@router.get("/stats", response_model=schemas.StatsResponse)
async def stats(user: dict | None = Depends(verify_firebase_token_optional)):
    """Return aggregate dashboard statistics; authentication is optional here."""
    return crud.get_stats()
backend/routers/emotion_analyzer.py ADDED
File without changes
backend/routers/memories.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ import logging
3
+
4
+ from fastapi import APIRouter, Depends, HTTPException
5
+
6
+ from backend import crud, schemas
7
+ from backend.auth_deps import verify_firebase_token, verify_firebase_token_optional
8
+ from backend.nlp_processor import extract_emotion_scores, extract_keywords, categorize_topics
9
+
10
+
11
+ router = APIRouter()
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def _stage_log(stage: str) -> None:
    """Echo an analyze-pipeline stage marker to stdout and the module logger."""
    line = f"[NLP][analyze] {stage}"
    print(line)
    logger.info(line)
19
+
20
+
21
def _save_stage_log(stage: str) -> None:
    """Echo a save-pipeline stage marker to stdout and the module logger."""
    line = f"[MEMORY][save] {stage}"
    print(line)
    logger.info(line)
25
+
26
+
27
+ def _generate_simple_summary(text: str, max_words: int = 28) -> str:
28
+ """Create a lightweight summary from raw memory text."""
29
+ cleaned = " ".join(text.split())
30
+ if not cleaned:
31
+ return ""
32
+
33
+ words = cleaned.split(" ")
34
+ if len(words) <= max_words:
35
+ return cleaned
36
+
37
+ return " ".join(words[:max_words]).rstrip(".,;: ") + "..."
38
+
39
+
40
+ def _top_tags(keywords: List[str], topics: List[str], max_tags: int = 5) -> List[str]:
41
+ """Combine and normalize keywords/topics into compact tag list."""
42
+ tags: List[str] = []
43
+ seen = set()
44
+
45
+ for item in (keywords or []) + (topics or []):
46
+ value = str(item).strip().lower()
47
+ if not value:
48
+ continue
49
+ value = value.replace("&", "and")
50
+ if value in seen:
51
+ continue
52
+ seen.add(value)
53
+ tags.append(value)
54
+ if len(tags) >= max_tags:
55
+ break
56
+
57
+ return tags
58
+
59
+
60
@router.get("/", response_model=List[schemas.MemoryDB])
async def list_memories(user: dict | None = Depends(verify_firebase_token_optional)):
    """List all stored memories; authentication is optional for reads."""
    return crud.list_memories()
64
+
65
+
66
@router.post("/", status_code=201)
async def create_memory(
    payload: schemas.MemoryCreate,
    user: dict = Depends(verify_firebase_token),
):
    """Persist a new memory for the authenticated user; NLP runs asynchronously later."""
    _save_stage_log("1/3 received request")
    document = payload.dict(exclude_none=True)
    document["uid"] = user.get("uid")
    _save_stage_log(f"2/3 persisting for uid={document['uid']}")
    new_id = crud.create_memory(document)
    _save_stage_log(f"3/3 completed id={new_id}")
    return {"id": new_id, "status": "created", "message": "Memory stored. Will be processed by AI."}
78
+
79
+
80
@router.post("/analyze", response_model=schemas.MemoryAnalyzeResponse)
async def analyze_memory(payload: schemas.MemoryAnalyzeRequest, user: dict = Depends(verify_firebase_token)):
    """Run the synchronous NLP pipeline over raw memory text and return the result.

    Stages: emotion scoring -> keyword extraction -> topic categorization ->
    summary/tag generation. Nothing is persisted; entity extraction is skipped
    in this path (entities is always empty).

    Raises:
        HTTPException 400: when the submitted content is blank.
    """
    _stage_log("1/6 received request")
    text = payload.content.strip()
    if not text:
        _stage_log("validation failed: empty content")
        raise HTTPException(status_code=400, detail="Memory content is required")

    _stage_log("2/6 emotion scoring")
    emotion_scores = extract_emotion_scores(text)
    # Dominant emotion becomes the mood; "neutral" when scoring yields nothing.
    mood = max(emotion_scores, key=emotion_scores.get) if emotion_scores else "neutral"
    _stage_log(f"emotion scoring done, mood={mood}")

    _stage_log("3/6 keyword extraction")
    keywords = extract_keywords(text)
    _stage_log(f"keyword extraction done, count={len(keywords)}")

    _stage_log("4/6 topic categorization")
    topics = categorize_topics(text, keywords)
    _stage_log(f"topic categorization done, count={len(topics)}")

    _stage_log("5/6 summary + tags")
    ai_summary = _generate_simple_summary(text)
    tags = _top_tags(keywords, topics)
    _stage_log(f"summary + tags done, tags={len(tags)}")

    _stage_log("6/6 response ready")

    return {
        "ai_summary": ai_summary,
        "mood": mood,
        "tags": tags,
        "nlp_insights": {
            "emotion_scores": emotion_scores,
            "keywords": keywords,
            "topics": topics,
            # NER intentionally not run in this synchronous endpoint.
            "entities": [],
        },
    }
119
+
120
+
121
@router.get("/{memory_id}", response_model=schemas.MemoryDB)
async def get_memory(memory_id: str, user: dict | None = Depends(verify_firebase_token_optional)):
    """Get a specific memory by ID."""
    doc = crud.get_memory_by_id(memory_id)
    if not doc:
        raise HTTPException(status_code=404, detail="Memory not found")
    return doc
128
+
129
+
130
@router.put("/{memory_id}", response_model=schemas.MemoryDB)
async def update_memory(memory_id: str, payload: schemas.MemoryUpdate, user: dict = Depends(verify_firebase_token)):
    """Apply partial updates to a memory and return the refreshed document."""
    changes = payload.dict(exclude_none=True)
    if not changes:
        raise HTTPException(status_code=400, detail="No updates provided")

    if not crud.update_memory_by_id(memory_id, changes):
        raise HTTPException(status_code=404, detail="Memory not found or unchanged")

    refreshed = crud.get_memory_by_id(memory_id)
    if not refreshed:
        raise HTTPException(status_code=404, detail="Memory not found")

    return refreshed
backend/schemas.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, List, Dict, Any
2
+ from datetime import datetime
3
+
4
+ from pydantic import BaseModel, Field
5
+
6
+
7
class EmotionScores(BaseModel):
    """Per-emotion probabilities (0-1) produced by the emotion classifier; all optional."""
    joy: Optional[float] = Field(None, description="Joy score 0-1")
    sadness: Optional[float] = Field(None, description="Sadness score 0-1")
    anger: Optional[float] = Field(None, description="Anger score 0-1")
    fear: Optional[float] = Field(None, description="Fear score 0-1")
    surprise: Optional[float] = Field(None, description="Surprise score 0-1")
    disgust: Optional[float] = Field(None, description="Disgust score 0-1")
15
+
16
+
17
+
18
class NLPInsights(BaseModel):
    """Bundle of all NLP pipeline outputs attached to a memory document."""
    emotion_scores: Optional[EmotionScores] = Field(None, description="Emotion sentiment analysis")
    keywords: Optional[List[str]] = Field(default_factory=list, description="Extracted keywords/phrases")
    topics: Optional[List[str]] = Field(default_factory=list, description="Identified topics (e.g., Work, Health, Relationships)")
    entities: Optional[List[str]] = Field(default_factory=list, description="Named entities (people, places)")
23
+
24
+
25
class MemoryCreate(BaseModel):
    """Request body for creating a memory; only `content` is required, the rest is filled by the NLP pipeline."""
    content: str = Field(..., description="Raw text content of the memory")
    content_clean: Optional[str] = Field(None, description="Cleaned/normalized version of content")
    mood: Optional[str] = Field(None, description="Detected mood (e.g., happy, sad, reflective)")
    ai_summary: Optional[str] = Field(None, description="AI-generated summary of the memory")
    tags: Optional[List[str]] = Field(default_factory=list, description="Associated tags")
    recorded_by: Optional[str] = Field(None, description="Input method: text, voice, etc.")
    nlp_insights: Optional[NLPInsights] = Field(None, description="NLP extraction results")
    embedding_id: Optional[int] = Field(None, description="Reference to FAISS index ID for vector search")
34
+
35
+
36
class MemoryUpdate(BaseModel):
    """Partial-update payload; None fields are dropped by the router before persisting."""
    content: Optional[str] = Field(None, description="Raw text content of the memory")
    mood: Optional[str] = Field(None, description="Updated mood")
    ai_summary: Optional[str] = Field(None, description="Updated AI summary")
    tags: Optional[List[str]] = Field(None, description="Updated tags")
41
+
42
+
43
class MemoryAnalyzeRequest(BaseModel):
    """Request body for the synchronous /analyze endpoint."""
    content: str = Field(..., description="Raw text content to analyze")
45
+
46
+
47
class MemoryAnalyzeResponse(BaseModel):
    """Response of the synchronous /analyze endpoint (nothing is persisted)."""
    ai_summary: str = Field(..., description="Generated concise summary")
    mood: str = Field(..., description="Detected mood label")
    tags: List[str] = Field(default_factory=list, description="Detected keyword/topic tags")
    nlp_insights: Optional[NLPInsights] = Field(None, description="Detailed NLP extraction results")
52
+
53
+
54
class MemoryDB(MemoryCreate):
    """Stored memory document: creation fields plus server-side bookkeeping."""
    id: str = Field(..., description="MongoDB ObjectId as string")
    uid: Optional[str] = Field(None, description="Firebase user ID")
    # NOTE(review): datetime.utcnow produces a naive timestamp and is deprecated
    # in Python 3.12 - consider datetime.now(timezone.utc); confirm what the DB
    # layer and existing documents expect before changing.
    created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation timestamp")
    updated_at: Optional[datetime] = Field(None, description="Last update timestamp")
    is_processed: bool = Field(False, description="Whether NLP extraction/embedding has been completed")
60
+
61
+
62
class StatsResponse(BaseModel):
    """Aggregate dashboard statistics returned by GET /stats."""
    total_memories: int
    avg_mood_score: Optional[float] = None
    most_common_mood: Optional[str] = None
    top_emotions: Optional[Dict[str, float]] = None
    top_topics: Optional[List[str]] = None
backend/text_preprocessor.py ADDED
@@ -0,0 +1,410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ TEXT PREPROCESSING & CLEANING MODULE
3
+ 1. Load NLP Pipeline (spaCy model)
4
+ 2. Normalize Text (lowercase, remove special chars, URLs)
5
+ 3. Tokenize & Analyze (break into words, POS tags)
6
+ 4. Lemmatize & Clean (reduce to base forms, remove stopwords)
7
+ 5. Store cleaned text & metadata in MongoDB
8
+ 6. Feed to downstream AI models
9
+
10
+ flow: User Input → Normalize → Tokenize → Lemmatize → Store → AI Models
11
+ """
12
+
13
+ import re
14
+ import string
15
+ from typing import Dict, List, Optional, Tuple
16
+ from datetime import datetime
17
+ import logging
18
+
19
+ try:
20
+ import spacy
21
+ from spacy.language import Language
22
+ SPACY_AVAILABLE = True
23
+ except ImportError:
24
+ SPACY_AVAILABLE = False
25
+ logging.info("spaCy not installed - using lightweight regex-based preprocessing")
26
+ Language = None
27
+
28
+ from backend.connection import get_collection
29
+
30
+ # Configure logging
31
+ logging.basicConfig(level=logging.INFO)
32
+ logger = logging.getLogger(__name__)
33
+
34
+ # Global cache for spaCy model (load once, reuse)
35
+ _nlp_model: Optional[Language] = None
36
+
37
+
38
+ def load_nlp_pipeline() -> Language:
39
+ """
40
+ Load and cache spaCy NLP pipeline.
41
+
42
+ Downloads en_core_web_sm on first run.
43
+ Uses cache on subsequent calls for performance.
44
+
45
+ Returns:
46
+ spacy Language model instance
47
+ """
48
+ global _nlp_model
49
+
50
+ if _nlp_model is not None:
51
+ return _nlp_model
52
+
53
+ if not SPACY_AVAILABLE:
54
+ raise RuntimeError("spaCy not installed.")
55
+
56
+ try:
57
+ # Try to load the model
58
+ _nlp_model = spacy.load("en_core_web_sm")
59
+ logger.info("Loaded spaCy model: en_core_web_sm")
60
+ return _nlp_model
61
+ except OSError:
62
+ # Model not found, try to download
63
+ logger.info("Downloading en_core_web_sm model...")
64
+ ################################################################################3
65
+ # import subprocess
66
+ # subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
67
+ import sys, subprocess
68
+ subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
69
+
70
+ _nlp_model = spacy.load("en_core_web_sm")
71
+ logger.info("ownloaded and loaded en_core_web_sm")
72
+ return _nlp_model
73
+
74
+
75
+ class TextPreprocessor:
76
+ # Complete text preprocessing pipeline
77
+
78
+ def __init__(self):
79
+ """Initialize preprocessor with spaCy pipeline if available, else use lightweight mode."""
80
+ if SPACY_AVAILABLE:
81
+ try:
82
+ self.nlp = load_nlp_pipeline()
83
+ self.stop_words = self.nlp.Defaults.stop_words
84
+ self.use_spacy = True
85
+ logger.info("TextPreprocessor initialized with spaCy")
86
+ except Exception as e:
87
+ logger.warning(f"Failed to load spaCy: {e}. Using lightweight mode.")
88
+ self.nlp = None
89
+ self.stop_words = self._get_basic_stopwords()
90
+ self.use_spacy = False
91
+ else:
92
+ self.nlp = None
93
+ self.stop_words = self._get_basic_stopwords()
94
+ self.use_spacy = False
95
+ logger.info("TextPreprocessor initialized without spaCy (lightweight mode)")
96
+
97
+ def _get_basic_stopwords(self) -> set:
98
+ """Basic English stopwords for lightweight mode."""
99
+ return {
100
+ 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
101
+ 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
102
+ 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
103
+ 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
104
+ 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
105
+ 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
106
+ 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
107
+ 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
108
+ 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once'
109
+ }
110
+
111
+ def normalize_text(self, text: str) -> str:
112
+ """
113
+ - Convert to lowercase
114
+ - Remove URLs (https://..., http://...)
115
+ - Remove email addresses
116
+ - Remove special characters except apostrophes
117
+ - Remove extra whitespace
118
+ """
119
+ if not text:
120
+ return ""
121
+
122
+ # Remove URLs
123
+ text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
124
+ # Remove email addresses
125
+ text = re.sub(r'\S+@\S+', '', text)
126
+ # Remove mentions (@user) and hashtags (#hashtag)
127
+ text = re.sub(r'@\w+|#\w+', '', text)
128
+ # Convert to lowercase
129
+ text = text.lower()
130
+ # Remove special characters but keep spaces and apostrophes
131
+ text = re.sub(r"[^\w\s']", '', text)
132
+ # Remove extra whitespace and tabs
133
+ text = ' '.join(text.split())
134
+ return text
135
+
136
+
137
+
138
+
139
+
140
+ def tokenize_and_analyze(self, text: str) -> Tuple[List[str], List[Tuple[str, str]]]:
141
+ if not text:
142
+ return [], []
143
+
144
+ if not self.use_spacy:
145
+ # Lightweight tokenization without spaCy
146
+ tokens = re.findall(r'\b\w+\b', text.lower())
147
+ pos_tags = [(token, "NOUN") for token in tokens] # Simplified POS
148
+ return tokens, pos_tags
149
+
150
+ doc = self.nlp(text)
151
+ tokens = [token.text for token in doc]
152
+ pos_tags = [(token.text, token.pos_) for token in doc]
153
+ return tokens, pos_tags
154
+
155
+
156
+
157
+
158
+
159
+
160
+ def lemmatize_and_clean(self, text: str, remove_stopwords: bool = True,remove_punctuation: bool = True) -> Tuple[str, Dict]:
161
+ if not text:
162
+ return "", {}
163
+
164
+ if not self.use_spacy:
165
+ # Lightweight lemmatization without spaCy
166
+ tokens = re.findall(r'\b\w+\b', text.lower())
167
+ lemmas = []
168
+ removed_stopwords = 0
169
+
170
+ for token in tokens:
171
+ if remove_stopwords and token in self.stop_words:
172
+ removed_stopwords += 1
173
+ continue
174
+ if len(token) >= 2:
175
+ lemmas.append(token)
176
+
177
+ cleaned_text = ' '.join(lemmas)
178
+ metadata = {
179
+ "original_token_count": len(tokens),
180
+ "cleaned_token_count": len(lemmas),
181
+ "removed_stopwords": removed_stopwords,
182
+ "pos_distribution": {},
183
+ "compression_ratio": round(len(lemmas) / len(tokens), 2) if tokens else 0,
184
+ }
185
+ return cleaned_text, metadata
186
+
187
+ doc = self.nlp(text)
188
+
189
+ lemmas = []
190
+ pos_distribution = {}
191
+ removed_stopwords = 0
192
+ original_count = 0
193
+
194
+ for token in doc:
195
+ original_count += 1
196
+ # Count pos tags
197
+ pos = token.pos_
198
+ pos_distribution[pos] = pos_distribution.get(pos, 0) + 1
199
+ # Skip stopwords
200
+ if remove_stopwords and token.is_stop:
201
+ removed_stopwords += 1
202
+ continue
203
+ # Skip punctuation
204
+ if remove_punctuation and token.is_punct:
205
+ continue
206
+ # Get lemma (base form)
207
+ lemma = token.lemma_.lower()
208
+ # Skip single characters (unless important)
209
+ if len(lemma) < 2 and token.pos_ not in ["NOUN", "VERB", "ADJ", "ADV"]:
210
+ continue
211
+ lemmas.append(lemma)
212
+ cleaned_text = ' '.join(lemmas)
213
+ metadata = {
214
+ "original_token_count": original_count,
215
+ "cleaned_token_count": len(lemmas),
216
+ "removed_stopwords": removed_stopwords,
217
+ "pos_distribution": pos_distribution,
218
+ "compression_ratio": round(len(lemmas) / original_count, 2) if original_count > 0 else 0,
219
+ }
220
+ return cleaned_text, metadata
221
+
222
+
223
+
224
+
225
+
226
+
227
+
228
+
229
def extract_keywords(self, text: str, top_n: int = 10) -> List[str]:
    """
    Extract the top-N keywords from *text*.

    spaCy path:
    - Extract noun phrases (noun chunks)
    - Collect high-value POS tokens (NOUN, VERB, ADJ, ADV), minus stopwords
    - Rank candidates by whole-word frequency in the text
    - Return the top N

    Fallback path (``self.use_spacy`` is False): frequency-ranked tokens of
    3+ word characters with stopwords removed.

    Args:
        text: Input text (already normalized by the caller, typically).
        top_n: Maximum number of keywords to return.

    Returns:
        List of lowercase keyword strings, most frequent first.
    """
    if not text:
        return []

    if not self.use_spacy:
        # Lightweight keyword extraction without spaCy
        tokens = re.findall(r'\b\w{3,}\b', text.lower())
        # Filter stopwords
        keywords = [t for t in tokens if t not in self.stop_words]
        # Count frequency
        from collections import Counter
        keyword_freq = Counter(keywords)
        return [kw for kw, _ in keyword_freq.most_common(top_n)]

    doc = self.nlp(text)

    # Extract noun chunks
    noun_chunks = [chunk.text.lower() for chunk in doc.noun_chunks]

    # Extract high-value POS (nouns, verbs, adjectives, adverbs)
    important_tokens = [
        token.text.lower()
        for token in doc
        if token.pos_ in ["NOUN", "VERB", "ADJ", "ADV"]
        and not token.is_stop
        and len(token.text) > 2
    ]

    # Combine and deduplicate
    all_keywords = list(set(noun_chunks + important_tokens))

    # Rank by whole-word frequency. The previous str.count() approach
    # matched arbitrary substrings (e.g. "art" inside "party"), which
    # inflated counts for short keywords.
    lowered = text.lower()
    keyword_freq = {}
    for keyword in all_keywords:
        keyword_freq[keyword] = len(
            re.findall(r'\b' + re.escape(keyword) + r'\b', lowered)
        )

    sorted_keywords = sorted(
        keyword_freq.items(),
        key=lambda x: x[1],
        reverse=True
    )

    return [kw for kw, _ in sorted_keywords[:top_n]]
278
+
279
def preprocess(self, text: str) -> Dict:
    """
    Run the full preprocessing pipeline on *text*.

    Pipeline: normalize -> tokenize/POS-tag -> lemmatize & clean ->
    keyword extraction.

    Returns:
        Dict with keys: original, normalized, tokens, pos_tags,
        cleaned, keywords, metadata. For falsy input every field is
        empty but the shape is preserved.
    """
    if not text:
        # Keep the output shape stable for empty/None input.
        return {
            "original": "",
            "normalized": "",
            "tokens": [],
            "pos_tags": [],
            "cleaned": "",
            "keywords": [],
            "metadata": {},
        }

    # Step 1: normalize raw text
    norm = self.normalize_text(text)
    # Step 2: tokenize and POS-tag the normalized text
    toks, tags = self.tokenize_and_analyze(norm)
    # Step 3: lemmatize and strip stopwords/punctuation
    cleaned_text, meta = self.lemmatize_and_clean(norm)

    return {
        "original": text,
        "normalized": norm,
        "tokens": toks,
        "pos_tags": tags,
        "cleaned": cleaned_text,
        "keywords": self.extract_keywords(norm),
        "metadata": meta,
    }
307
+
308
+
309
+
310
+
311
+
312
def store_preprocessing_results(memory_id: str, preprocessing_results: Dict) -> bool:
    """
    Persist preprocessing output for one memory document in MongoDB.

    Args:
        memory_id: String form of the memory's Mongo ObjectId.
        preprocessing_results: Dict produced by TextPreprocessor.preprocess().

    Returns:
        True if the document was modified; False on any error, when no
        document matched, or when the stored data was already identical
        (update_one reports modified_count == 0 in that case).
    """
    # Proper local import replaces the original __import__("bson") hack.
    from bson import ObjectId

    col = get_collection("memories")
    try:
        update_data = {
            "preprocessing": {
                "normalized": preprocessing_results.get("normalized"),
                "cleaned": preprocessing_results.get("cleaned"),
                "tokens": preprocessing_results.get("tokens"),
                "keywords": preprocessing_results.get("keywords"),
                "metadata": preprocessing_results.get("metadata"),
            },
            # NOTE(review): naive UTC timestamp, consistent with the rest of
            # this module — confirm before switching to tz-aware datetimes.
            "updated_at": datetime.utcnow(),
        }

        result = col.update_one(
            {"_id": ObjectId(memory_id)},
            {"$set": update_data}
        )

        return result.modified_count > 0
    except Exception as e:
        # Broad catch is deliberate: an invalid ObjectId or a Mongo failure
        # should degrade to a False return, not crash the batch runner.
        logger.error(f"Failed to store preprocessing results: {e}")
        return False
336
+
337
+
338
+
339
+
340
+
341
+
342
def preprocess_unprocessed_memories(batch_size: int = 50) -> Dict:
    """
    Preprocess up to *batch_size* memories lacking a "preprocessing" field.

    Step 1 in the full NLP workflow.
    Subsequent steps (emotion analysis, embeddings) use cleaned text.

    Returns:
        Summary dict with keys: total, processed, failed, errors.
    """
    col = get_collection("memories")
    pre = TextPreprocessor()

    # NOTE(review): documents with empty content are skipped but never
    # marked, so they are re-fetched on every batch — confirm intended.
    batch = list(
        col.find({"preprocessing": {"$exists": False}}).limit(batch_size)
    )

    ok = 0
    bad = 0
    errors = []

    for memory in batch:
        try:
            memory_id = str(memory["_id"])
            content = memory.get("content", "")
            if not content:
                continue

            logger.info(f"Preprocessing memory {memory_id}...")
            results = pre.preprocess(content)

            if store_preprocessing_results(memory_id, results):
                ok += 1
                logger.info(f"✓ Preprocessed {memory_id}")
            else:
                bad += 1
                errors.append(f"Failed to store preprocessing for {memory_id}")
        except Exception as e:
            bad += 1
            error_msg = f"Error preprocessing {memory.get('_id')}: {str(e)}"
            errors.append(error_msg)
            logger.error(error_msg)

    return {
        "total": len(batch),
        "processed": ok,
        "failed": bad,
        "errors": errors,
    }
384
+
385
+
386
+
387
+
388
+ # # Test the preprocessor
389
+ # if __name__ == "__main__":
390
+ # preprocessor = TextPreprocessor()
391
+
392
+ # sample_text = """
393
+ # Today was a mix of productivity and much-needed relaxation!
394
+ # I checked https://example.com for work, then took a 10-minute walk to clear my head. ## 3 434
395
+ # Feeling grateful and peaceful. Contact me at test@example.com if you need anything!
396
+ # """
397
+
398
+ # result = preprocessor.preprocess(sample_text)
399
+
400
+ # print("\n" + "="*60)
401
+ # print("TEXT PREPROCESSING PIPELINE OUTPUT")
402
+ # print("="*60)
403
+ # print(f"\nOriginal:\n{result['original']}")
404
+ # print(f"\nNormalized:\n{result['normalized']}")
405
+ # print(f"\nTokens: {result['tokens']}")
406
+ # print(f"\nPOS Tags: {result['pos_tags']}")
407
+ # print(f"\nCleaned:\n{result['cleaned']}")
408
+ # print(f"\nKeywords: {result['keywords']}")
409
+ # print(f"\nMetadata: {result['metadata']}")
410
+ # print("\n" + "="*60)
components.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "$schema": "https://ui.shadcn.com/schema.json",
3
+ "style": "new-york",
4
+ "rsc": true,
5
+ "tsx": true,
6
+ "tailwind": {
7
+ "config": "",
8
+ "css": "app/globals.css",
9
+ "baseColor": "neutral",
10
+ "cssVariables": true,
11
+ "prefix": ""
12
+ },
13
+ "aliases": {
14
+ "components": "@/components",
15
+ "utils": "@/lib/utils",
16
+ "ui": "@/components/ui",
17
+ "lib": "@/lib",
18
+ "hooks": "@/hooks"
19
+ },
20
+ "iconLibrary": "lucide"
21
+ }