isgr9801 commited on
Commit
dcbb6a2
·
verified ·
1 Parent(s): cc2dffd
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Slim Python base image keeps the final image small.
FROM python:3.11-slim

# No .pyc files, unbuffered logs; PORT=7860 — presumably the Hugging Face
# Spaces convention, confirm against the deployment target.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PORT=7860

WORKDIR /app

# Install dependencies before copying sources so the pip layer is cached
# across code-only changes.
COPY backend/requirements.txt /app/backend/requirements.txt
RUN pip install --no-cache-dir -r /app/backend/requirements.txt

COPY . /app

EXPOSE 7860

# Run through `sh -c` so ${PORT} expands at container start (the platform
# may override the build-time default).
CMD ["sh", "-c", "uvicorn backend.main:app --host 0.0.0.0 --port ${PORT:-7860}"]
backend/Procfile ADDED
@@ -0,0 +1 @@
 
 
1
+ web: uvicorn backend.main:app --host 0.0.0.0 --port $PORT
backend/README.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DMJ backend (FastAPI)
2
+ ======================
3
+ ```
4
+ cd backend
5
+ python -m venv .venv
6
+ .venv\Scripts\Activate.ps1
7
+ pip install -r requirements.txt
8
+ setx MONGO_URI "mongouri"
9
+ cd .. && uvicorn backend.main:app --reload --port 8000
10
+ ```
11
+ - Replace the `auth` router with proper Firebase/JWT verification
12
+
13
+ **NLP Processing**
14
+ emotion scoring
15
+ ----------------------------------
16
+ uses Hugging Face Inference API for emotion scoring.
17
+ Set environment vars:
18
+ - `HF_API_TOKEN` = Hugging Face token
19
+
20
+ keyword extraction
21
+ --------------------------------
22
+ uses KeyBERT with a multilingual sentence-transformer.
23
+ - `KEYBERT_MODEL` = sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
24
+ supports mar/eng/hindi
25
+
26
+
27
+ topic categorization
28
+ ------------------------------------
29
+ - `TOPIC_LABELS` (comma-separated candidate labels required)
30
+ supports mar/eng/hindi
31
+
32
+
33
+ entity recognition (NER)
34
+ ----------------------------
35
+ multilingual Hugging Face NER with spaCy fallback.
36
+ - `NER_MODEL` = xx_ent_wiki_sm
37
+ supports mar/eng/hindi
38
+
39
+
40
+ embedding generation
41
+ ----------------------------
42
+ - `EMBEDDING_MODEL` = sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
43
+ supports mar/eng/hindi
backend/auth_deps.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Firebase auth dependency for protecting routes."""
2
+ import os
3
+ from pathlib import Path
4
+ import firebase_admin
5
+ from firebase_admin import credentials, auth
6
+ from dotenv import load_dotenv
7
+ from fastapi import Depends, HTTPException, status
8
+ from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
9
+ import logging
10
+ from typing import Optional
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ # Ensure FIREBASE_* env vars are available regardless of working directory
15
+ BACKEND_ENV_PATH = Path(__file__).resolve().parent / ".env"
16
+ load_dotenv(dotenv_path=BACKEND_ENV_PATH)
17
+ load_dotenv()
18
+
19
+
20
+ def _build_firebase_cert_from_env() -> dict | None:
21
+ project_id = os.getenv("FIREBASE_PROJECT_ID")
22
+ private_key = os.getenv("FIREBASE_PRIVATE_KEY")
23
+ client_email = os.getenv("FIREBASE_CLIENT_EMAIL")
24
+
25
+ if not project_id or not private_key or not client_email:
26
+ return None
27
+
28
+ return {
29
+ "type": "service_account",
30
+ "project_id": project_id,
31
+ "private_key": private_key.replace("\\n", "\n"),
32
+ "client_email": client_email,
33
+ "token_uri": "https://oauth2.googleapis.com/token",
34
+ }
35
+
36
+
37
def _initialize_firebase_admin() -> None:
    """Initialize the Firebase Admin SDK exactly once.

    Preference order: a service-account cert built from FIREBASE_* env vars,
    then Google default application credentials as a fallback.
    """
    # get_app() raises ValueError only when no default app exists yet;
    # if it succeeds, initialization already happened — nothing to do.
    try:
        firebase_admin.get_app()
        return
    except ValueError:
        pass

    cert_data = _build_firebase_cert_from_env()
    if cert_data:
        try:
            cred = credentials.Certificate(cert_data)
            firebase_admin.initialize_app(
                cred,
                {
                    "projectId": cert_data["project_id"],
                },
            )
            # NOTE(review): presumably some Google client libraries read the
            # project id from this env var — confirm which consumer needs it.
            os.environ.setdefault("GOOGLE_CLOUD_PROJECT", cert_data["project_id"])
            logger.info("Firebase Admin initialized from FIREBASE_* env vars (projectId=%s)", cert_data["project_id"])
            return
        except Exception as exc:
            # Bad/partial cert data: log with traceback, then fall through to
            # the default-credentials path instead of failing startup here.
            logger.exception("Failed to initialize Firebase from FIREBASE_* env vars: %s", exc)
            logger.warning("Falling back to default application credentials")

    firebase_admin.initialize_app()
    logger.info("Firebase Admin initialized using default application credentials")
63
+
64
+
65
+ _initialize_firebase_admin()
66
+
67
+
68
+ security = HTTPBearer()
69
+ optional_security = HTTPBearer(auto_error=False)
70
+
71
+
72
async def verify_firebase_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> dict:
    """
    Dependency to verify Firebase ID token from Authorization header.
    Returns the decoded token (user info).

    Raises HTTP 401 when the token is invalid or expired. The response now
    carries a ``WWW-Authenticate: Bearer`` challenge header, which RFC 6750
    requires on 401 responses to bearer-protected resources (the original
    omitted it).
    """
    token = credentials.credentials
    try:
        return auth.verify_id_token(token)
    except Exception as e:
        # Full traceback goes to the server log; the client gets the summary.
        logger.exception("Firebase token verification failed: %s", str(e))
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail=f"Invalid or expired token: {str(e)}",
            headers={"WWW-Authenticate": "Bearer"},
        )
87
+
88
+
89
async def verify_firebase_token_optional(
    credentials: Optional[HTTPAuthorizationCredentials] = Depends(optional_security),
) -> Optional[dict]:
    """
    Optional auth dependency for read-only endpoints.
    Returns decoded token when valid, otherwise returns None.
    """
    # No Authorization header (or an empty one): treat as anonymous.
    if credentials is None or not credentials.credentials:
        return None

    try:
        return auth.verify_id_token(credentials.credentials)
    except Exception as e:
        # Invalid tokens downgrade to anonymous instead of erroring out.
        logger.warning("Optional Firebase token verification failed: %s", str(e))
        return None
106
+
backend/connection.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from typing import Tuple

import pymongo
from pymongo import MongoClient

# Connection settings come from the environment.
# NOTE(review): COLLECTION_NAME is used below as a *database* name, not a
# collection name — the env var name looks misleading; confirm with the
# deployment configuration.
MONGO_URI = os.getenv("MONGO_URI")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

# Lazily created singleton client shared by the whole process.
_client = None


def get_client() -> MongoClient:
    """Return the shared MongoClient, creating it on first use."""
    global _client
    if _client is None:
        _client = MongoClient(MONGO_URI)
    return _client


def get_db(db_name: str = COLLECTION_NAME):
    """Return a database handle.

    Falls back to "dmj" (the same default get_collection uses) when no name
    is configured — previously an unset COLLECTION_NAME made ``client[None]``
    raise a TypeError.
    """
    client = get_client()
    return client[db_name or "dmj"]


def get_collection(name: str, db_name: str = "dmj"):
    """Return collection *name* from database *db_name*."""
    db = get_db(db_name)
    return db[name]


if __name__ == "__main__":
    # Smoke test: connect and count documents in the "memories" collection.
    try:
        col = get_collection("memories")
        print("Connected to collection:", col.name)
        print("Documents count:", col.count_documents({}))
    except Exception as e:
        print("Connection test failed:", e)
backend/crud.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional
2
+ from datetime import datetime
3
+ from bson.objectid import ObjectId
4
+
5
+ from backend.connection import get_collection
6
+
7
+
8
+ def _safe_round(value, digits: int = 3) -> float:
9
+ try:
10
+ if value is None:
11
+ return 0.0
12
+ return round(float(value), digits)
13
+ except (TypeError, ValueError):
14
+ return 0.0
15
+
16
+
17
def create_memory(data: dict) -> str:
    """Insert a new memory document and return its ObjectId as a string.

    Adds created_at/updated_at timestamps and derives is_processed from the
    presence of the NLP output fields. Works on a shallow copy so the
    caller's dict is no longer mutated (the original wrote timestamps — and
    pymongo's ``_id`` — back into the argument).
    """
    col = get_collection("memories")

    doc = dict(data)  # copy: keep the caller's dict untouched
    now = datetime.utcnow()
    doc["created_at"] = now
    doc["updated_at"] = now

    # A memory counts as processed only when both NLP artifacts are present.
    doc["is_processed"] = "embedding_id" in doc and "nlp_insights" in doc

    res = col.insert_one(doc)
    return str(res.inserted_id)
29
+
30
+
31
def list_memories(limit: int = 50, processed_only: bool = False) -> List[dict]:
    """Return up to *limit* newest memories, optionally only processed ones.

    Each document gets a string "id" field and ISO-formatted timestamps so
    the result is JSON-serializable.
    """
    col = get_collection("memories")
    query = {"is_processed": True} if processed_only else {}

    serialized: List[dict] = []
    for doc in col.find(query).sort("created_at", -1).limit(limit):
        doc["id"] = str(doc["_id"])
        # Convert datetimes to ISO strings for JSON responses.
        for field in ("created_at", "updated_at"):
            if field in doc and isinstance(doc[field], datetime):
                doc[field] = doc[field].isoformat()
        serialized.append(doc)
    return serialized
49
+
50
+
51
def get_memory_by_id(memory_id: str) -> Optional[dict]:
    """Get a single memory by ID.

    Returns the document with a string "id" and ISO timestamps, or None
    when the id is malformed or no document matches.
    """
    col = get_collection("memories")
    try:
        doc = col.find_one({"_id": ObjectId(memory_id)})
        if not doc:
            return None
        doc["id"] = str(doc["_id"])
        for field in ("created_at", "updated_at"):
            if field in doc and isinstance(doc[field], datetime):
                doc[field] = doc[field].isoformat()
        return doc
    except Exception:
        # Invalid ObjectId strings (and lookup failures) read as "not found".
        return None
65
+
66
+
67
def update_memory_by_id(memory_id: str, updates: dict) -> bool:
    """Apply editable-field updates to a memory by ID.

    Returns True when a matching document exists — including a no-op edit
    where the new values equal the old ones (the original used
    ``modified_count``, which reports 0 in that case and made a successful
    save look like a failure). Works on a copy so the caller's dict is not
    mutated. Returns False for unknown or malformed ids.
    """
    col = get_collection("memories")
    try:
        payload = dict(updates)  # copy: don't write updated_at into caller's dict
        payload["updated_at"] = datetime.utcnow()
        result = col.update_one(
            {"_id": ObjectId(memory_id)},
            {"$set": payload},
        )
        # matched_count, not modified_count: "found" is success.
        return result.matched_count > 0
    except Exception:
        return False
79
+
80
+
81
def update_memory_with_nlp(memory_id: str, nlp_data: dict) -> bool:
    """Update a memory with NLP extraction results and mark it processed.

    Args:
        memory_id: MongoDB ObjectId as string
        nlp_data: Dict with content_clean, nlp_insights, embedding_id, etc.

    Returns True when a matching document exists. Uses ``matched_count``
    rather than ``modified_count`` so re-processing a memory with identical
    values still counts as success, and copies *nlp_data* so the caller's
    dict is not mutated (the original wrote updated_at/is_processed into it).
    """
    col = get_collection("memories")
    try:
        payload = dict(nlp_data)  # copy: keep the caller's dict untouched
        payload["updated_at"] = datetime.utcnow()
        payload["is_processed"] = True

        result = col.update_one(
            {"_id": ObjectId(memory_id)},
            {"$set": payload},
        )
        return result.matched_count > 0
    except Exception:
        return False
100
+
101
+
102
def get_stats() -> dict:
    """Get aggregated stats from memories including emotion analysis.

    Returns a dict with:
      - total_memories: count of all documents
      - most_common_mood: modal "mood" value, or None when no doc has one
      - top_emotions: per-bucket averages of nlp_insights.emotion_scores,
        or None when no processed docs exist
      - top_topics: five most frequent nlp_insights.topics, or None
    """
    col = get_collection("memories")
    total = col.count_documents({})

    # Most common mood: group by mood, keep the single largest group.
    mood_pipeline = [
        {"$match": {"mood": {"$exists": True}}},
        {"$group": {"_id": "$mood", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 1},
    ]
    mood_agg = list(col.aggregate(mood_pipeline))
    most_common_mood = mood_agg[0]["_id"] if mood_agg else None

    # Average each emotion bucket across every memory that has scores.
    # _id: None collapses everything into one aggregate row.
    emotion_pipeline = [
        {"$match": {"nlp_insights.emotion_scores": {"$exists": True}}},
        {"$group": {
            "_id": None,
            "joy_avg": {"$avg": "$nlp_insights.emotion_scores.joy"},
            "sadness_avg": {"$avg": "$nlp_insights.emotion_scores.sadness"},
            "anger_avg": {"$avg": "$nlp_insights.emotion_scores.anger"},
            "fear_avg": {"$avg": "$nlp_insights.emotion_scores.fear"},
            "surprise_avg": {"$avg": "$nlp_insights.emotion_scores.surprise"},
            "disgust_avg": {"$avg": "$nlp_insights.emotion_scores.disgust"},
        }},
    ]
    emotion_agg = list(col.aggregate(emotion_pipeline))
    top_emotions = {}
    if emotion_agg:
        e = emotion_agg[0]
        # _safe_round guards against None/missing averages from the pipeline.
        top_emotions = {
            "joy": _safe_round(e.get("joy_avg")),
            "sadness": _safe_round(e.get("sadness_avg")),
            "anger": _safe_round(e.get("anger_avg")),
            "fear": _safe_round(e.get("fear_avg")),
            "surprise": _safe_round(e.get("surprise_avg")),
            "disgust": _safe_round(e.get("disgust_avg")),
        }

    # Top topics: unwind the topics array so each topic counts individually.
    topic_pipeline = [
        {"$match": {"nlp_insights.topics": {"$exists": True}}},
        {"$unwind": "$nlp_insights.topics"},
        {"$group": {"_id": "$nlp_insights.topics", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 5},
    ]
    topic_agg = list(col.aggregate(topic_pipeline))
    top_topics = [t["_id"] for t in topic_agg]

    return {
        "total_memories": total,
        "most_common_mood": most_common_mood,
        "top_emotions": top_emotions if top_emotions else None,
        "top_topics": top_topics if top_topics else None,
    }
backend/main.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from dotenv import load_dotenv
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ import logging
6
+ import os
7
+ from pathlib import Path
8
+
9
+ # Load environment variables from .env before importing modules that depend on them
10
+ load_dotenv(dotenv_path=Path(__file__).resolve().parent / ".env")
11
+ load_dotenv()
12
+
13
+ from backend.routers import auth, memories, dashboard
14
+ from backend.nlp_processor import process_unprocessed_memories
15
+
16
+ # Configure logging
17
+ logging.basicConfig(level=logging.INFO)
18
+ logger = logging.getLogger(__name__)
19
+
20
+ app = FastAPI(title="DMJ Backend")
21
+
22
+ # Allow local dev frontend to call the API on any local port.
23
+ # You can override with CORS_ALLOW_ORIGINS="http://localhost:3000,http://127.0.0.1:3001"
24
# Allowed CORS origins, overridable via CORS_ALLOW_ORIGINS (comma-separated).
# Each entry is stripped of a trailing "/": browser Origin headers never end
# with a slash, so "https://dmemoryjar.vercel.app/" (as in the original
# default) could never match and that frontend was silently blocked.
configured_origins = [
    origin.strip().rstrip("/")
    for origin in os.getenv("CORS_ALLOW_ORIGINS","https://v0-djjv2.vercel.app,https://dmemoryjar.vercel.app/").split(",")
    if origin.strip()
]
29
+
30
+ app.add_middleware(
31
+ CORSMiddleware,
32
+ allow_origins=configured_origins,
33
+ allow_origin_regex=r"https?://(localhost|127\.0\.0\.1)(:\d+)?$",
34
+ allow_credentials=True,
35
+ allow_methods=["*"],
36
+ allow_headers=["*"],
37
+ )
38
+
39
+
40
+ @app.get("/healthz")
41
+ def health():
42
+ return {"status": "ok"}
43
+
44
+
45
+ app.include_router(auth.router, prefix="/auth", tags=["auth"])
46
+ app.include_router(memories.router, prefix="/memories", tags=["memories"])
47
+ app.include_router(dashboard.router, prefix="/dashboard", tags=["dashboard"])
48
+
49
+
50
+ # # Background scheduler for processing unprocessed memories
51
+ # scheduler = BackgroundScheduler()
52
+
53
+ # def process_memories_job():
54
+ # """Job that runs every 10 seconds to process unprocessed memories."""
55
+ # try:
56
+ # # logger.info("Running memory processing job...")
57
+ # process_unprocessed_memories()
58
+ # # logger.info("Memory processing job completed.")
59
+ # except Exception as e:
60
+ # logger.error(f"Error in memory processing job: {e}")
61
+
62
+ # scheduler.add_job(process_memories_job, "interval", seconds=10, id="process_memories")
63
+ # scheduler.start()
64
+
65
+
66
+ @app.on_event("startup")
67
+ def startup_event():
68
+ logger.info("*********************************Application startup - scheduler running*********************************")
69
+
70
+
71
+ @app.on_event("shutdown")
72
+ def shutdown_event():
73
+ # scheduler.shutdown()
74
+ logger.info("*********************************Application shutdown - scheduler stopped*********************************")
75
+
76
+
77
+ if __name__ == "__main__":
78
+ import uvicorn
79
+
80
+ uvicorn.run(app, host="127.0.0.1", port=8000, reload=True)
backend/nlp_processor.py ADDED
@@ -0,0 +1,937 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AI NLP Processing Pipeline Template
3
+ 1. Text Preprocessing & Cleaning (text_preprocessor.py)
4
+ 2. Emotion Analysis
5
+ 3. Keyword & Topic Extraction
6
+ 4. Entity Recognition
7
+ 5. Embedding Generation
8
+ 6. Store in MongoDB + FAISS
9
+ """
10
+
11
+ from typing import Dict, List, Optional
12
+ import logging
13
+ import json
14
+ import os
15
+ from urllib import error, request
16
+ from datetime import datetime
17
+ from functools import lru_cache
18
+
19
+ from backend.connection import get_collection
20
+ from backend.crud import update_memory_with_nlp
21
+ from backend.text_preprocessor import TextPreprocessor, preprocess_unprocessed_memories
22
+
23
+ logging.basicConfig(level=logging.INFO)
24
+ logger = logging.getLogger(__name__)
25
+ # ----------------------------------------------------------------------
26
+ # Helper functions for Hugging Face API calls
27
+ # ----------------------------------------------------------------------
28
+ def _get_hf_timeout_seconds() -> int:
29
+ """Get timeout for HF inference requests from env, default 20 seconds."""
30
+ value = os.getenv("HF_INFERENCE_TIMEOUT_SECONDS", "20")
31
+ try:
32
+ return int(value)
33
+ except ValueError:
34
+ return 20
35
+ def _hf_inference_endpoints(model_id: str) -> List[str]:
36
+ """Return ordered Hugging Face inference endpoints to try."""
37
+ explicit_base = os.getenv("HF_INFERENCE_BASE_URL", "").strip().rstrip("/")
38
+ endpoints: List[str] = []
39
+ if explicit_base:
40
+ endpoints.append(f"{explicit_base}/{model_id}")
41
+
42
+ endpoints.extend([
43
+ f"https://router.huggingface.co/hf-inference/models/{model_id}",
44
+ f"https://api-inference.huggingface.co/models/{model_id}",
45
+ ])
46
+ # Remove duplicates
47
+ deduped: List[str] = []
48
+ seen = set()
49
+ for endpoint in endpoints:
50
+ if endpoint not in seen:
51
+ deduped.append(endpoint)
52
+ seen.add(endpoint)
53
+ return deduped
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+ # ----------------------------------------------------------------------
68
+ # Keyword extraction with KeyBERT (cached model)
69
+ # ----------------------------------------------------------------------
70
+
71
+ # Keyword extraction now uses TextPreprocessor (lightweight, no ML models)
72
+
73
+ def _get_keybert_top_n() -> int:
74
+ """Get number of keywords to extract from env, default 8."""
75
+ value = os.getenv("KEYBERT_TOP_N", "8")
76
+ try:
77
+ parsed = int(value)
78
+ return max(1, min(parsed, 20))
79
+ except ValueError:
80
+ return 8
81
+
82
+
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+ # ----------------------------------------------------------------------
94
+ # Zero-shot topic classification
95
+ # ----------------------------------------------------------------------
96
+ # Zero-shot now uses HF API instead of local transformers
97
+
98
+ def _get_topic_candidate_labels() -> List[str]:
99
+ # configured = os.getenv("TOPIC_LABELS", "").strip()
100
+ # if configured:
101
+ # return [label.strip() for label in configured.split(",") if label.strip()]
102
+ return [
103
+ "Work & Productivity",
104
+ "Health & Wellness",
105
+ "Emotions & Mental Health",
106
+ "Relationships & Family",
107
+ "Learning & Growth",
108
+ "Finance",
109
+ "Travel & Leisure",
110
+ "Daily Life",
111
+ ]
112
+
113
+
114
+ def _get_topic_score_threshold() -> float:
115
+ """Minimum confidence score for topic assignment."""
116
+ value = os.getenv("TOPIC_MIN_SCORE", "0.2")
117
+ try:
118
+ parsed = float(value)
119
+ return max(0.0, min(parsed, 1.0))
120
+ except ValueError:
121
+ return 0.2
122
+
123
+
124
+ def _get_topic_max_labels() -> int:
125
+ """Maximum number of topics to assign per memory."""
126
+ value = os.getenv("TOPIC_MAX_LABELS", "2")
127
+ try:
128
+ parsed = int(value)
129
+ return max(1, min(parsed, 5))
130
+ except ValueError:
131
+ return 2
132
+
133
+
134
+
135
+
136
+
137
+
138
+
139
+ # ----------------------------------------------------------------------
140
+ # Named Entity Recognition
141
+ # ----------------------------------------------------------------------
142
+
143
+ # NER disabled for memory optimization on free hosting
144
+
145
+ def _get_ner_score_threshold() -> float:
146
+ """Minimum confidence for NER entities (optional env)."""
147
+ value = os.getenv("NER_MIN_SCORE", "0.35")
148
+ try:
149
+ parsed = float(value)
150
+ return max(0.0, min(parsed, 1.0))
151
+ except ValueError:
152
+ return 0.35
153
+
154
+
155
+
156
+
157
+
158
+
159
+
160
+
161
+
162
+
163
+
164
+
165
+
166
+ # ----------------------------------------------------------------------
167
+ # Embedding generation
168
+ # ----------------------------------------------------------------------
169
+
170
+ # Embedding generation now uses HF API instead of local SentenceTransformers
171
+
172
+
173
+ # ----------------------------------------------------------------------
174
+ # General helpers
175
+
176
+ def _flatten_hf_labels(payload: object) -> List[Dict[str, float]]:
177
+ """Convert Hugging Face API output to list of {label, score} dicts."""
178
+ if not isinstance(payload, list):
179
+ return []
180
+
181
+ if payload and isinstance(payload[0], list):
182
+ candidates = payload[0]
183
+ else:
184
+ candidates = payload
185
+
186
+ parsed: List[Dict[str, float]] = []
187
+ for item in candidates:
188
+ if not isinstance(item, dict):
189
+ continue
190
+ label = str(item.get("label", "")).strip().lower()
191
+ score = item.get("score", 0.0)
192
+ try:
193
+ parsed.append({"label": label, "score": float(score)})
194
+ except (TypeError, ValueError):
195
+ continue
196
+ return parsed
197
+
198
+
199
+ def _dedupe_text_items(items: List[str]) -> List[str]:
200
+ """Remove duplicate strings (case‑insensitive)."""
201
+ cleaned: List[str] = []
202
+ seen = set()
203
+ for item in items:
204
+ value = item.strip()
205
+ if not value:
206
+ continue
207
+ key = value.lower()
208
+ if key in seen:
209
+ continue
210
+ seen.add(key)
211
+ cleaned.append(value)
212
+ return cleaned
213
+
214
+
215
+
216
+
217
+
218
+
219
+
220
+
221
+
222
+
223
+
224
+
225
+
226
+ # ----------------------------------------------------------------------
227
+ # Text cleaning
228
def clean_text(text: str) -> str:
    """
    Normalize, tokenize, lemmatize, remove stopwords
    (delegates to TextPreprocessor and returns its "cleaned" output).
    """
    return TextPreprocessor().preprocess(text)["cleaned"]
235
+
236
+
237
+ # ----------------------------------------------------------------------
238
+ # Emotion scoring (using Hugging Face API)
239
+ # ----------------------------------------------------------------------
240
+
241
+ EMOTION_BUCKET_LABELS = {
242
+ "joy": {"joy", "amusement", "excitement", "optimism", "contentment", "happy", "excited", "content"},
243
+ "sadness": {"sadness", "disappointment", "grief", "remorse", "hurt", "lonely", "disappointed"},
244
+ "anger": {"anger", "annoyance", "rage", "frustration", "frustrated", "annoyed", "furious"},
245
+ "fear": {"fear", "nervousness", "anxiety", "worry", "anxious", "nervous", "worried"},
246
+ "surprise": {"surprise", "realization", "amazed", "amaze", "shocked"},
247
+ "disgust": {"disgust", "disapproval", "embarrassment", "dislike", "uncomfortable"},
248
+ }
249
+
250
+
251
+ def _neutral_emotion_scores() -> Dict[str, float]:
252
+ """Return zero‑initialized emotion score dict."""
253
+ return {
254
+ "joy": 0.0,
255
+ "sadness": 0.0,
256
+ "anger": 0.0,
257
+ "fear": 0.0,
258
+ "surprise": 0.0,
259
+ "disgust": 0.0,
260
+ }
261
+
262
+
263
def _bucketize_emotions(label_scores: List[Dict[str, float]]) -> Dict[str, float]:
    """Fold fine-grained HF emotion labels into the six coarse buckets.

    Each label's score is added to the first bucket whose alias set contains
    it; unknown labels are ignored. When anything matched, the buckets are
    normalized to sum to 1.0 (rounded to 4 places).
    """
    scores = _neutral_emotion_scores()
    for entry in label_scores:
        name = entry["label"]
        value = float(entry["score"])
        bucket = next(
            (b for b, aliases in EMOTION_BUCKET_LABELS.items() if name in aliases),
            None,
        )
        if bucket is not None:
            scores[bucket] += value

    total = sum(scores.values())
    if total > 0:
        return {emotion: round(v / total, 4) for emotion, v in scores.items()}
    return scores
277
+
278
+
279
def extract_emotion_scores(text: str) -> Dict[str, float]:
    """Score *text* against the six emotion buckets via the HF inference API.

    Posts the text to "AnasAlokla/multilingual_go_emotions" on each configured
    endpoint in order; the first usable response is bucketized and normalized
    by _bucketize_emotions. Returns all-zero scores for empty input, a missing
    HF_API_TOKEN, or when every endpoint fails.
    """
    if not text or not text.strip():
        return _neutral_emotion_scores()

    hf_api_token = os.getenv("HF_API_TOKEN")
    hf_timeout_seconds = _get_hf_timeout_seconds()

    if not hf_api_token:
        logger.warning("HF_API_TOKEN missing. Returning default emotion scores.")
        return _neutral_emotion_scores()

    # wait_for_model: block until the model is loaded rather than erroring.
    body = json.dumps({"inputs": text, "options": {"wait_for_model": True}}).encode("utf-8")
    last_error: Optional[str] = None

    for endpoint in _hf_inference_endpoints("AnasAlokla/multilingual_go_emotions"):
        req = request.Request(
            endpoint,
            data=body,
            method="POST",
            headers={
                "Authorization": f"Bearer {hf_api_token}",
                "Content-Type": "application/json",
            },
        )

        try:
            with request.urlopen(req, timeout=hf_timeout_seconds) as res:
                raw_payload = res.read().decode("utf-8")
                payload = json.loads(raw_payload)

            # The API can report failure inside a 200 response body.
            if isinstance(payload, dict) and payload.get("error"):
                last_error = str(payload.get("error"))
                logger.warning("Hugging Face API error from %s: %s", endpoint, last_error)
                continue

            label_scores = _flatten_hf_labels(payload)
            if not label_scores:
                last_error = "No valid label scores from Hugging Face response."
                logger.warning("%s Endpoint: %s", last_error, endpoint)
                continue

            return _bucketize_emotions(label_scores)

        except error.HTTPError as e:
            # Try to capture the response body for the log; it may be unreadable.
            try:
                error_body = e.read().decode("utf-8")
            except Exception:
                error_body = ""
            last_error = f"HTTP {e.code} {e.reason}"
            logger.warning(
                "Hugging Face HTTP error via %s: %s. Body: %s",
                endpoint,
                last_error,
                error_body,
            )
            continue
        except error.URLError as e:
            last_error = f"Network error: {e.reason}"
            logger.warning("Hugging Face network error via %s: %s", endpoint, e.reason)
            continue
        except json.JSONDecodeError:
            last_error = "Failed to decode Hugging Face response JSON."
            logger.warning("%s Endpoint: %s", last_error, endpoint)
            continue
        except Exception as e:
            last_error = f"Unexpected emotion scoring error: {str(e)}"
            logger.warning("%s Endpoint: %s", last_error, endpoint)
            continue

    # Every endpoint was exhausted without a usable response.
    if last_error:
        logger.error("Emotion scoring failed for model. Last error: %s", last_error)

    return _neutral_emotion_scores()
352
+
353
+
354
+
355
+
356
+
357
+
358
+
359
+
360
+
361
+
362
+
363
+
364
+
365
+
366
+
367
+
368
+
369
+
370
+
371
+ # ----------------------------------------------------------------------
372
+ # Keyword extraction (KeyBERT with fallback)
373
+ # ----------------------------------------------------------------------
374
+
375
def extract_keywords(text: str) -> List[str]:
    """Extract keywords using TextPreprocessor (lightweight, no ML models).

    Returns [] for empty input or when extraction raises.
    """
    if not text or not text.strip():
        return []

    try:
        return TextPreprocessor().extract_keywords(text, top_n=_get_keybert_top_n())
    except Exception as exc:
        logger.error("Keyword extraction failed: %s", str(exc))
        return []
388
+
389
+
390
+
391
+
392
+
393
+
394
+
395
+
396
+
397
+
398
+
399
+
400
+
401
+
402
+
403
+
404
+
405
+
406
+
407
+
408
+ # ----------------------------------------------------------------------
409
+ # Topic categorization
410
+ # ----------------------------------------------------------------------
411
+
412
def categorize_topics(text: str, keywords: List[str]) -> List[str]:
    """Assign up to TOPIC_MAX_LABELS topics via HF zero-shot classification.

    Fix: the original body was a broken merge — a leftover call to an
    undefined ``_get_zero_shot_classifier`` and a dangling ``result =
    classifier(`` wrapped around the HF-API implementation, plus a stray
    ``multi_label=True,`` line, which made the module unparseable. This is
    the HF-API version reconstructed cleanly; the keyword-based fallback is
    used when HF_API_TOKEN is missing or every endpoint fails.
    """
    if not text or not text.strip():
        return ["Daily Life"]

    candidate_labels = _get_topic_candidate_labels()
    min_score = _get_topic_score_threshold()
    max_labels = _get_topic_max_labels()

    # Appending keywords gives the classifier extra signal for short entries.
    text_for_classification = text
    if keywords:
        text_for_classification = f"{text}\nKeywords: {', '.join(keywords[:10])}"

    hf_api_token = os.getenv("HF_API_TOKEN")
    if not hf_api_token:
        logger.warning("HF_API_TOKEN missing. Using keyword-based topic classification.")
        return _fallback_topic_classification(text)

    hf_timeout = _get_hf_timeout_seconds()
    model_id = "joeddav/xlm-roberta-large-xnli"

    body = json.dumps({
        "inputs": text_for_classification,
        "parameters": {
            "candidate_labels": candidate_labels,
            "multi_label": True
        },
        "options": {"wait_for_model": True}
    }).encode("utf-8")

    for endpoint in _hf_inference_endpoints(model_id):
        req = request.Request(
            endpoint,
            data=body,
            method="POST",
            headers={
                "Authorization": f"Bearer {hf_api_token}",
                "Content-Type": "application/json",
            },
        )

        try:
            with request.urlopen(req, timeout=hf_timeout) as res:
                raw_payload = res.read().decode("utf-8")
                result = json.loads(raw_payload)

            # Failures can arrive as a 200 with an "error" field.
            if isinstance(result, dict) and result.get("error"):
                logger.warning("HF API error: %s", result.get("error"))
                continue

            labels = result.get("labels", []) if isinstance(result, dict) else []
            scores = result.get("scores", []) if isinstance(result, dict) else []

            # Labels come back sorted by score; keep those above the
            # threshold, capped at max_labels.
            ranked_topics: List[str] = []
            for label, score in zip(labels, scores):
                if float(score) >= min_score:
                    ranked_topics.append(str(label))
                if len(ranked_topics) >= max_labels:
                    break

            if ranked_topics:
                return ranked_topics
            # Nothing cleared the threshold: fall back to the best guess.
            if labels:
                return [str(labels[0])]

        except Exception as e:
            logger.warning("HF API request failed: %s", str(e))
            continue

    return _fallback_topic_classification(text)
490
+
491
+ def _fallback_topic_classification(text: str) -> List[str]:
492
+ """Fallback keyword-based topic classification."""
493
+ topics = []
494
+ work_keywords = ["work", "email", "project", "deliverable", "deadline"]
495
+ health_keywords = ["walk", "exercise", "sleep", "health", "tired"]
496
+ mood_keywords = ["grateful", "happy", "sad", "anxious", "stressed"]
497
+
498
+ text_lower = text.lower()
499
+
500
+ if any(k in text_lower for k in work_keywords):
501
+ topics.append("Work & Productivity")
502
+ if any(k in text_lower for k in health_keywords):
503
+ topics.append("Health & Wellness")
504
+ if any(k in text_lower for k in mood_keywords):
505
+ topics.append("Emotions & Mental Health")
506
+
507
+ return topics or ["Daily Life"]
508
+
509
def extract_entities_OLD_DISABLED(text: str) -> List[str]:
    """OLD VERSION - named-entity extraction: HF NER pipeline first, spaCy fallback.

    Entities below the configured score threshold are discarded; results are
    de-duplicated. Returns an empty list for blank input or when both paths fail.
    """
    if not text or not text.strip():
        return []

    threshold = _get_ner_score_threshold()

    # Primary path: multilingual Hugging Face NER pipeline.
    try:
        pipeline = _get_hf_ner_pipeline()
        found: List[str] = []
        for candidate in pipeline(text):
            if not isinstance(candidate, dict):
                continue
            word = str(candidate.get("word", "")).strip()
            try:
                confidence = float(candidate.get("score", 0.0))
            except (TypeError, ValueError):
                confidence = 0.0
            if word and confidence >= threshold:
                found.append(word)

        found = _dedupe_text_items(found)
        if found:
            return found
    except ImportError as e:
        logger.warning("Transformers dependency missing for NER (%s).", str(e))
    except Exception as e:
        logger.error("Hugging Face NER failed: %s", str(e))

    # Fallback path: spaCy statistical NER.
    try:
        spacy_model = _get_spacy_ner_model()
        found = _dedupe_text_items([ent.text for ent in spacy_model(text).ents])
        if found:
            return found
    except ImportError as e:
        logger.warning("spaCy dependency missing for NER fallback (%s).", str(e))
    except Exception as e:
        logger.error("spaCy NER fallback failed: %s", str(e))

    return []
554
+
555
def extract_entities(text: str) -> List[str]:
    """Named-entity extraction stub - intentionally disabled.

    NER needs heavy transformers models, which exceed the memory budget of
    free hosting tiers, so this always returns an empty list.

    NOTE(review): a second `extract_entities` defined later in this module
    overrides this stub at import time, silently re-enabling NER - confirm
    which definition is intended to win.
    """
    logger.info("NER disabled for memory optimization")
    return []
561
+
562
def generate_embedding_OLD_DISABLED(text: str) -> Dict:
    """OLD VERSION - Generate multilingual sentence embedding using Sentence Transformers.

    Returns:
        Dict with keys:
            vector: list of floats (empty on blank input or failure)
            model: the sentence-transformers model name used
    """
    model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

    # BUG FIX: this blank-input branch previously contained stray lines
    # copied from the topic classifier ("if float(score) >= min_score:")
    # instead of a return statement, which was a syntax error.
    if not text or not text.strip():
        return {
            "vector": [],
            "model": model_name,
        }

    try:
        embedding_model = _get_embedding_model_instance()
        vector = embedding_model.encode(text, convert_to_tensor=False, normalize_embeddings=True)

        # numpy arrays / tensors expose tolist(); other sequences are wrapped.
        if hasattr(vector, "tolist"):
            vector_list = vector.tolist()
        else:
            vector_list = list(vector)

        return {
            "vector": vector_list,
            "model": model_name,
        }
    except ImportError as e:
        logger.warning("sentence-transformers dependency missing for embeddings (%s).", str(e))
    except Exception as e:
        logger.error("Embedding generation failed: %s", str(e))

    # Any failure degrades to an empty vector rather than raising.
    return {
        "vector": [],
        "model": model_name,
    }
594
+
595
def generate_embedding(text: str) -> Dict:
    """Generate multilingual sentence embedding via the Hugging Face Inference API.

    Returns:
        Dict {"vector": [...], "model": model_name}; the vector is empty when
        the input is blank, HF_API_TOKEN is unset, or every endpoint fails.

    NOTE(review): this function is redefined later in the module with a local
    sentence-transformers implementation; the later definition wins at import
    time - confirm which one is intended.
    """
    model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

    if not text or not text.strip():
        return {
            "vector": [],
            "model": model_name,
        }

    hf_api_token = os.getenv("HF_API_TOKEN")
    if not hf_api_token:
        logger.warning("HF_API_TOKEN missing. Skipping embedding generation.")
        return {
            "vector": [],
            "model": model_name,
        }

    hf_timeout = _get_hf_timeout_seconds()

    try:
        body = json.dumps({
            "inputs": text,
            "options": {"wait_for_model": True}
        }).encode("utf-8")

        # Try each candidate endpoint until one yields a usable vector.
        for endpoint in _hf_inference_endpoints(model_name):
            req = request.Request(
                endpoint,
                data=body,
                method="POST",
                headers={
                    "Authorization": f"Bearer {hf_api_token}",
                    "Content-Type": "application/json",
                },
            )

            try:
                with request.urlopen(req, timeout=hf_timeout) as res:
                    raw_payload = res.read().decode("utf-8")
                    payload = json.loads(raw_payload)

                # The API reports model/queue errors inside the JSON body.
                if isinstance(payload, dict) and payload.get("error"):
                    logger.warning("HF API error: %s", payload.get("error"))
                    continue

                # HF Feature Extraction returns either [vector] or the vector itself.
                if isinstance(payload, list) and len(payload) > 0:
                    vector_list = payload[0] if isinstance(payload[0], list) else payload
                    return {
                        "vector": vector_list,
                        "model": model_name,
                    }

            except Exception as e:
                logger.warning("HF API embedding request failed: %s", str(e))
                continue

    except Exception as e:
        logger.error("Embedding generation failed: %s", str(e))

    # BUG FIX: this fallback return previously contained a stray
    # `ranked_topics.append(str(label))` line (copy-paste from the topic
    # classifier) where the "vector" key belonged - a syntax error.
    return {
        "vector": [],
        "model": model_name,
    }
660
+
661
+
662
+
663
+
664
+
665
+
666
+
667
+
668
+
669
+
670
+
671
+
672
+
673
+
674
+
675
+
676
+
677
+
678
+
679
+
680
+ # ----------------------------------------------------------------------
681
+ # Named Entity Recognition (HF + spaCy fallback)
682
+ # ----------------------------------------------------------------------
683
+
684
def extract_entities(text: str) -> List[str]:
    """Extract named entities using multilingual HF NER with spaCy fallback.

    NOTE(review): this definition overrides the earlier memory-optimized
    `extract_entities` stub in the same module, re-enabling heavy NER models -
    confirm which behavior is intended before deploying to low-memory hosts.
    """
    if not text or not text.strip():
        return []

    threshold = _get_ner_score_threshold()

    # Primary path: multilingual Hugging Face NER pipeline.
    try:
        pipeline = _get_hf_ner_pipeline()
        collected: List[str] = []
        for candidate in pipeline(text):
            if not isinstance(candidate, dict):
                continue
            word = str(candidate.get("word", "")).strip()
            try:
                confidence = float(candidate.get("score", 0.0))
            except (TypeError, ValueError):
                confidence = 0.0
            if word and confidence >= threshold:
                collected.append(word)

        collected = _dedupe_text_items(collected)
        if collected:
            return collected
    except ImportError as e:
        logger.warning("Transformers dependency missing for NER (%s).", str(e))
    except Exception as e:
        logger.error("Hugging Face NER failed: %s", str(e))

    # Fallback path: spaCy statistical NER.
    try:
        spacy_model = _get_spacy_ner_model()
        collected = _dedupe_text_items([ent.text for ent in spacy_model(text).ents])
        if collected:
            return collected
    except ImportError as e:
        logger.warning("spaCy dependency missing for NER fallback (%s).", str(e))
    except Exception as e:
        logger.error("spaCy NER fallback failed: %s", str(e))

    return []
729
+
730
+
731
+
732
+
733
+
734
+
735
+
736
+
737
+
738
+
739
+
740
+
741
+
742
+
743
+
744
+
745
+
746
+
747
+
748
+
749
+
750
+
751
+
752
+ # ----------------------------------------------------------------------
753
+ # Embedding generation (Sentence Transformers)
754
+ # ----------------------------------------------------------------------
755
+
756
def generate_embedding(text: str) -> Dict:
    """Generate multilingual sentence embedding using Sentence Transformers.

    Returns {"vector": [...], "model": <model name>}; the vector is empty for
    blank input or on any failure.

    NOTE(review): this is the third definition of `generate_embedding` in the
    module and the one that wins at import time, replacing the HF-API version
    above - confirm the duplication is intentional.
    """
    model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    empty_result = {
        "vector": [],
        "model": model_name,
    }

    if not text or not text.strip():
        return empty_result

    try:
        encoder = _get_embedding_model_instance()
        raw_vector = encoder.encode(text, convert_to_tensor=False, normalize_embeddings=True)

        # numpy arrays / tensors expose tolist(); other sequences are wrapped.
        as_list = raw_vector.tolist() if hasattr(raw_vector, "tolist") else list(raw_vector)

        return {
            "vector": as_list,
            "model": model_name,
        }
    except ImportError as e:
        logger.warning("sentence-transformers dependency missing for embeddings (%s).", str(e))
    except Exception as e:
        logger.error("Embedding generation failed: %s", str(e))

    return empty_result
788
+
789
+
790
+
791
+
792
+
793
+
794
+
795
+
796
+
797
+
798
+
799
+
800
+
801
+
802
+
803
+
804
+
805
+
806
+
807
+
808
+
809
+
810
+ # ----------------------------------------------------------------------
811
+ # FAISS storage (placeholder – to be implemented)
812
+ # ----------------------------------------------------------------------
813
+
814
def store_embedding_in_faiss(vector: List[float], memory_id: str, faiss_index) -> int:
    """
    Store vector embedding in FAISS index and return its ID.

    Args:
        vector: Embedding vector
        memory_id: MongoDB ObjectId
        faiss_index: FAISS IndexFlatL2 instance

    Returns:
        embedding_id: Position in FAISS index

    TODO: Implement actual FAISS integration, roughly:
        index.add(np.array([vector]).astype('float32'))
        embedding_id = index.ntotal - 1
        then persist the embedding_id -> memory_id mapping in a separate collection.
    """
    # Placeholder: always returns a fixed dummy ID until FAISS lands.
    return 4271
835
+
836
+
837
+
838
+
839
+
840
+
841
+
842
+
843
+
844
+ # ----------------------------------------------------------------------
845
+ # Main processing loop (called by scheduler)
846
+ # ----------------------------------------------------------------------
847
+
848
def process_unprocessed_memories(batch_size: int = 50) -> Dict:
    """
    Run the NLP pipeline over memories that have preprocessing results but no
    NLP insights yet, and persist the outcome per document.

    Order of operations:
    1. Text Preprocessing & cleaning spaCy already done in separate step
    2. Emotion Analysis
    3. Keyword & Topic Extraction
    4. Entity Recognition
    5. Embedding Generation
    6. Store in MongoDB

    Args:
        batch_size: maximum number of documents handled per stage in one run.

    Returns:
        Dict with a "preprocessing" summary (from the preprocessing pass) and
        an "nlp_processing" summary: total / processed / failed counts plus
        the collected error messages.
    """
    col = get_collection("memories")

    # Step 1: Preprocess any memories without preprocessing
    preprocessing_result = preprocess_unprocessed_memories(batch_size)

    # Step 2: Process preprocessed memories for emotion/embedding.
    # Selects only documents that finished preprocessing but have no
    # nlp_insights yet, capped at batch_size.
    unprocessed = list(col.find(
        {
            "preprocessing": {"$exists": True},
            "nlp_insights": {"$exists": False}
        }
    ).limit(batch_size))

    processed_count = 0
    failed_count = 0
    errors = []

    for memory in unprocessed:
        try:
            memory_id = str(memory["_id"])
            preprocessed = memory.get("preprocessing", {})
            cleaned_text = preprocessed.get("cleaned", "")
            preprocessing_keywords = preprocessed.get("keywords", [])

            # Nothing to analyze; leave the document for a later pass.
            if not cleaned_text:
                continue

            logger.info(f"Processing memory {memory_id}...")

            # Run NLP pipeline on cleaned text
            emotion_scores = extract_emotion_scores(cleaned_text)
            # Fall back to preprocessing-stage keywords when extraction finds none.
            keywords = extract_keywords(cleaned_text) or preprocessing_keywords
            topics = categorize_topics(cleaned_text, keywords)
            entities = extract_entities(cleaned_text)
            embedding_data = generate_embedding(cleaned_text)
            embedding_id = store_embedding_in_faiss(
                embedding_data["vector"],
                memory_id,
                faiss_index=None  # FAISS index not yet initialized
            )

            # Determine mood from top emotion
            mood = max(emotion_scores, key=emotion_scores.get) if emotion_scores else "neutral"

            # Prepare update data
            nlp_data = {
                "content_clean": cleaned_text,
                "mood": mood,
                "embedding_id": embedding_id,
                "nlp_insights": {
                    "emotion_scores": emotion_scores,
                    "keywords": keywords,
                    "topics": topics,
                    "entities": entities,
                },
            }

            # Update memory in MongoDB
            if update_memory_with_nlp(memory_id, nlp_data):
                processed_count += 1
                logger.info(f"✓ Processed {memory_id}")
            else:
                failed_count += 1
                errors.append(f"Failed to update {memory_id}")

        except Exception as e:
            # Per-document isolation: one bad memory must not abort the batch.
            failed_count += 1
            error_msg = f"Error processing {memory.get('_id')}: {str(e)}"
            errors.append(error_msg)
            logger.error(error_msg)

    return {
        "preprocessing": preprocessing_result,
        "nlp_processing": {
            "total": len(unprocessed),
            "processed": processed_count,
            "failed": failed_count,
            "errors": errors,
        }
    }
backend/requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ APScheduler==3.11.2
2
+ dnspython==2.8.0
3
+ fastapi==0.133.0
4
+ firebase_admin==7.1.0
5
+ pydantic==2.12.5
6
+ pymongo==4.16.0
7
+ python-dotenv==1.2.1
8
+ uvicorn==0.41.0
9
+ wrapt==2.1.1
backend/routers/auth.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Depends
2
+
3
+ from backend.auth_deps import verify_firebase_token
4
+
5
+ router = APIRouter()
6
+
7
+
8
@router.get("/me")
async def me(user: dict = Depends(verify_firebase_token)):
    """Return current user info from Firebase token."""
    claims = ("uid", "email", "email_verified")
    return {claim: user.get(claim) for claim in claims}
16
+
17
+
backend/routers/dashboard.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Depends
2
+
3
+ from backend import crud, schemas
4
+ from backend.auth_deps import verify_firebase_token_optional
5
+
6
+ router = APIRouter()
7
+
8
+
9
@router.get("/stats", response_model=schemas.StatsResponse)
async def stats(user: dict | None = Depends(verify_firebase_token_optional)):
    """Return aggregate dashboard statistics; authentication is optional here."""
    return crud.get_stats()
backend/routers/emotion_analyzer.py ADDED
File without changes
backend/routers/memories.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ import logging
3
+
4
+ from fastapi import APIRouter, Depends, HTTPException
5
+
6
+ from backend import crud, schemas
7
+ from backend.auth_deps import verify_firebase_token, verify_firebase_token_optional
8
+ from backend.nlp_processor import extract_emotion_scores, extract_keywords, categorize_topics
9
+
10
+
11
+ router = APIRouter()
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def _stage_log(stage: str) -> None:
    """Echo an analyze-pipeline stage marker to stdout and the module logger."""
    line = f"[NLP][analyze] {stage}"
    print(line)
    logger.info(line)
19
+
20
+
21
def _save_stage_log(stage: str) -> None:
    """Echo a save-pipeline stage marker to stdout and the module logger."""
    line = f"[MEMORY][save] {stage}"
    print(line)
    logger.info(line)
25
+
26
+
27
+ def _generate_simple_summary(text: str, max_words: int = 28) -> str:
28
+ """Create a lightweight summary from raw memory text."""
29
+ cleaned = " ".join(text.split())
30
+ if not cleaned:
31
+ return ""
32
+
33
+ words = cleaned.split(" ")
34
+ if len(words) <= max_words:
35
+ return cleaned
36
+
37
+ return " ".join(words[:max_words]).rstrip(".,;: ") + "..."
38
+
39
+
40
+ def _top_tags(keywords: List[str], topics: List[str], max_tags: int = 5) -> List[str]:
41
+ """Combine and normalize keywords/topics into compact tag list."""
42
+ tags: List[str] = []
43
+ seen = set()
44
+
45
+ for item in (keywords or []) + (topics or []):
46
+ value = str(item).strip().lower()
47
+ if not value:
48
+ continue
49
+ value = value.replace("&", "and")
50
+ if value in seen:
51
+ continue
52
+ seen.add(value)
53
+ tags.append(value)
54
+ if len(tags) >= max_tags:
55
+ break
56
+
57
+ return tags
58
+
59
+
60
@router.get("/", response_model=List[schemas.MemoryDB])
async def list_memories(user: dict | None = Depends(verify_firebase_token_optional)):
    """List all stored memories; authentication is optional for reads."""
    return crud.list_memories()
64
+
65
+
66
@router.post("/", status_code=201)
async def create_memory(
    payload: schemas.MemoryCreate,
    user: dict = Depends(verify_firebase_token),
):
    """Persist a new memory for the authenticated user; NLP runs asynchronously later."""
    _save_stage_log("1/3 received request")
    document = payload.dict(exclude_none=True)
    document["uid"] = user.get("uid")
    _save_stage_log(f"2/3 persisting for uid={document['uid']}")
    new_id = crud.create_memory(document)
    _save_stage_log(f"3/3 completed id={new_id}")
    return {"id": new_id, "status": "created", "message": "Memory stored. Will be processed by AI."}
78
+
79
+
80
@router.post("/analyze", response_model=schemas.MemoryAnalyzeResponse)
async def analyze_memory(payload: schemas.MemoryAnalyzeRequest, user: dict = Depends(verify_firebase_token)):
    """Run the synchronous NLP pipeline over raw memory text and return the result.

    Stages: emotion scoring -> keyword extraction -> topic categorization ->
    summary/tag generation. Nothing is persisted; entity extraction is skipped
    in this path (entities is always empty).

    Raises:
        HTTPException 400: when the submitted content is blank.
    """
    _stage_log("1/6 received request")
    text = payload.content.strip()
    if not text:
        _stage_log("validation failed: empty content")
        raise HTTPException(status_code=400, detail="Memory content is required")

    _stage_log("2/6 emotion scoring")
    emotion_scores = extract_emotion_scores(text)
    # Dominant emotion becomes the mood; "neutral" when scoring yields nothing.
    mood = max(emotion_scores, key=emotion_scores.get) if emotion_scores else "neutral"
    _stage_log(f"emotion scoring done, mood={mood}")

    _stage_log("3/6 keyword extraction")
    keywords = extract_keywords(text)
    _stage_log(f"keyword extraction done, count={len(keywords)}")

    _stage_log("4/6 topic categorization")
    topics = categorize_topics(text, keywords)
    _stage_log(f"topic categorization done, count={len(topics)}")

    _stage_log("5/6 summary + tags")
    ai_summary = _generate_simple_summary(text)
    tags = _top_tags(keywords, topics)
    _stage_log(f"summary + tags done, tags={len(tags)}")

    _stage_log("6/6 response ready")

    return {
        "ai_summary": ai_summary,
        "mood": mood,
        "tags": tags,
        "nlp_insights": {
            "emotion_scores": emotion_scores,
            "keywords": keywords,
            "topics": topics,
            # NER intentionally not run in this synchronous endpoint.
            "entities": [],
        },
    }
119
+
120
+
121
@router.get("/{memory_id}", response_model=schemas.MemoryDB)
async def get_memory(memory_id: str, user: dict | None = Depends(verify_firebase_token_optional)):
    """Get a specific memory by ID."""
    doc = crud.get_memory_by_id(memory_id)
    if not doc:
        raise HTTPException(status_code=404, detail="Memory not found")
    return doc
128
+
129
+
130
@router.put("/{memory_id}", response_model=schemas.MemoryDB)
async def update_memory(memory_id: str, payload: schemas.MemoryUpdate, user: dict = Depends(verify_firebase_token)):
    """Apply partial updates to a memory and return the refreshed document."""
    changes = payload.dict(exclude_none=True)
    if not changes:
        raise HTTPException(status_code=400, detail="No updates provided")

    if not crud.update_memory_by_id(memory_id, changes):
        raise HTTPException(status_code=404, detail="Memory not found or unchanged")

    refreshed = crud.get_memory_by_id(memory_id)
    if not refreshed:
        raise HTTPException(status_code=404, detail="Memory not found")

    return refreshed
backend/schemas.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, List, Dict, Any
2
+ from datetime import datetime
3
+
4
+ from pydantic import BaseModel, Field
5
+
6
+
7
class EmotionScores(BaseModel):
    """Per-emotion probabilities (0-1) produced by the emotion classifier; all optional."""
    joy: Optional[float] = Field(None, description="Joy score 0-1")
    sadness: Optional[float] = Field(None, description="Sadness score 0-1")
    anger: Optional[float] = Field(None, description="Anger score 0-1")
    fear: Optional[float] = Field(None, description="Fear score 0-1")
    surprise: Optional[float] = Field(None, description="Surprise score 0-1")
    disgust: Optional[float] = Field(None, description="Disgust score 0-1")
15
+
16
+
17
+
18
class NLPInsights(BaseModel):
    """Bundle of all NLP pipeline outputs attached to a memory document."""
    emotion_scores: Optional[EmotionScores] = Field(None, description="Emotion sentiment analysis")
    keywords: Optional[List[str]] = Field(default_factory=list, description="Extracted keywords/phrases")
    topics: Optional[List[str]] = Field(default_factory=list, description="Identified topics (e.g., Work, Health, Relationships)")
    entities: Optional[List[str]] = Field(default_factory=list, description="Named entities (people, places)")
23
+
24
+
25
class MemoryCreate(BaseModel):
    """Request body for creating a memory; only `content` is required, the rest is filled by the NLP pipeline."""
    content: str = Field(..., description="Raw text content of the memory")
    content_clean: Optional[str] = Field(None, description="Cleaned/normalized version of content")
    mood: Optional[str] = Field(None, description="Detected mood (e.g., happy, sad, reflective)")
    ai_summary: Optional[str] = Field(None, description="AI-generated summary of the memory")
    tags: Optional[List[str]] = Field(default_factory=list, description="Associated tags")
    recorded_by: Optional[str] = Field(None, description="Input method: text, voice, etc.")
    nlp_insights: Optional[NLPInsights] = Field(None, description="NLP extraction results")
    embedding_id: Optional[int] = Field(None, description="Reference to FAISS index ID for vector search")
34
+
35
+
36
class MemoryUpdate(BaseModel):
    """Partial-update payload; None fields are dropped by the router before persisting."""
    content: Optional[str] = Field(None, description="Raw text content of the memory")
    mood: Optional[str] = Field(None, description="Updated mood")
    ai_summary: Optional[str] = Field(None, description="Updated AI summary")
    tags: Optional[List[str]] = Field(None, description="Updated tags")
41
+
42
+
43
class MemoryAnalyzeRequest(BaseModel):
    """Request body for the synchronous /analyze endpoint."""
    content: str = Field(..., description="Raw text content to analyze")
45
+
46
+
47
class MemoryAnalyzeResponse(BaseModel):
    """Response of the synchronous /analyze endpoint (nothing is persisted)."""
    ai_summary: str = Field(..., description="Generated concise summary")
    mood: str = Field(..., description="Detected mood label")
    tags: List[str] = Field(default_factory=list, description="Detected keyword/topic tags")
    nlp_insights: Optional[NLPInsights] = Field(None, description="Detailed NLP extraction results")
52
+
53
+
54
class MemoryDB(MemoryCreate):
    """Stored memory document: creation fields plus server-side bookkeeping."""
    id: str = Field(..., description="MongoDB ObjectId as string")
    uid: Optional[str] = Field(None, description="Firebase user ID")
    # NOTE(review): datetime.utcnow produces a naive timestamp and is deprecated
    # in Python 3.12 - consider datetime.now(timezone.utc); confirm what the DB
    # layer and existing documents expect before changing.
    created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation timestamp")
    updated_at: Optional[datetime] = Field(None, description="Last update timestamp")
    is_processed: bool = Field(False, description="Whether NLP extraction/embedding has been completed")
60
+
61
+
62
class StatsResponse(BaseModel):
    """Aggregate dashboard statistics returned by GET /stats."""
    total_memories: int
    avg_mood_score: Optional[float] = None
    most_common_mood: Optional[str] = None
    top_emotions: Optional[Dict[str, float]] = None
    top_topics: Optional[List[str]] = None
backend/text_preprocessor.py ADDED
@@ -0,0 +1,410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ TEXT PREPROCESSING & CLEANING MODULE
3
+ 1. Load NLP Pipeline (spaCy model)
4
+ 2. Normalize Text (lowercase, remove special chars, URLs)
5
+ 3. Tokenize & Analyze (break into words, POS tags)
6
+ 4. Lemmatize & Clean (reduce to base forms, remove stopwords)
7
+ 5. Store cleaned text & metadata in MongoDB
8
+ 6. Feed to downstream AI models
9
+
10
+ flow: User Input → Normalize → Tokenize → Lemmatize → Store → AI Models
11
+ """
12
+
13
+ import re
14
+ import string
15
+ from typing import Dict, List, Optional, Tuple
16
+ from datetime import datetime
17
+ import logging
18
+
19
+ try:
20
+ import spacy
21
+ from spacy.language import Language
22
+ SPACY_AVAILABLE = True
23
+ except ImportError:
24
+ SPACY_AVAILABLE = False
25
+ logging.info("spaCy not installed - using lightweight regex-based preprocessing")
26
+ Language = None
27
+
28
+ from backend.connection import get_collection
29
+
30
+ # Configure logging
31
+ logging.basicConfig(level=logging.INFO)
32
+ logger = logging.getLogger(__name__)
33
+
34
+ # Global cache for spaCy model (load once, reuse)
35
+ _nlp_model: Optional[Language] = None
36
+
37
+
38
+ def load_nlp_pipeline() -> Language:
39
+ """
40
+ Load and cache spaCy NLP pipeline.
41
+
42
+ Downloads en_core_web_sm on first run.
43
+ Uses cache on subsequent calls for performance.
44
+
45
+ Returns:
46
+ spacy Language model instance
47
+ """
48
+ global _nlp_model
49
+
50
+ if _nlp_model is not None:
51
+ return _nlp_model
52
+
53
+ if not SPACY_AVAILABLE:
54
+ raise RuntimeError("spaCy not installed.")
55
+
56
+ try:
57
+ # Try to load the model
58
+ _nlp_model = spacy.load("en_core_web_sm")
59
+ logger.info("Loaded spaCy model: en_core_web_sm")
60
+ return _nlp_model
61
+ except OSError:
62
+ # Model not found, try to download
63
+ logger.info("Downloading en_core_web_sm model...")
64
+ ################################################################################3
65
+ # import subprocess
66
+ # subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
67
+ import sys, subprocess
68
+ subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
69
+
70
+ _nlp_model = spacy.load("en_core_web_sm")
71
+ logger.info("ownloaded and loaded en_core_web_sm")
72
+ return _nlp_model
73
+
74
+
75
+ class TextPreprocessor:
76
+ # Complete text preprocessing pipeline
77
+
78
+ def __init__(self):
79
+ """Initialize preprocessor with spaCy pipeline if available, else use lightweight mode."""
80
+ if SPACY_AVAILABLE:
81
+ try:
82
+ self.nlp = load_nlp_pipeline()
83
+ self.stop_words = self.nlp.Defaults.stop_words
84
+ self.use_spacy = True
85
+ logger.info("TextPreprocessor initialized with spaCy")
86
+ except Exception as e:
87
+ logger.warning(f"Failed to load spaCy: {e}. Using lightweight mode.")
88
+ self.nlp = None
89
+ self.stop_words = self._get_basic_stopwords()
90
+ self.use_spacy = False
91
+ else:
92
+ self.nlp = None
93
+ self.stop_words = self._get_basic_stopwords()
94
+ self.use_spacy = False
95
+ logger.info("TextPreprocessor initialized without spaCy (lightweight mode)")
96
+
97
+ def _get_basic_stopwords(self) -> set:
98
+ """Basic English stopwords for lightweight mode."""
99
+ return {
100
+ 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
101
+ 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
102
+ 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
103
+ 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
104
+ 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
105
+ 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
106
+ 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
107
+ 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
108
+ 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once'
109
+ }
110
+
111
+ def normalize_text(self, text: str) -> str:
112
+ """
113
+ - Convert to lowercase
114
+ - Remove URLs (https://..., http://...)
115
+ - Remove email addresses
116
+ - Remove special characters except apostrophes
117
+ - Remove extra whitespace
118
+ """
119
+ if not text:
120
+ return ""
121
+
122
+ # Remove URLs
123
+ text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
124
+ # Remove email addresses
125
+ text = re.sub(r'\S+@\S+', '', text)
126
+ # Remove mentions (@user) and hashtags (#hashtag)
127
+ text = re.sub(r'@\w+|#\w+', '', text)
128
+ # Convert to lowercase
129
+ text = text.lower()
130
+ # Remove special characters but keep spaces and apostrophes
131
+ text = re.sub(r"[^\w\s']", '', text)
132
+ # Remove extra whitespace and tabs
133
+ text = ' '.join(text.split())
134
+ return text
135
+
136
+
137
+
138
+
139
+
140
+ def tokenize_and_analyze(self, text: str) -> Tuple[List[str], List[Tuple[str, str]]]:
141
+ if not text:
142
+ return [], []
143
+
144
+ if not self.use_spacy:
145
+ # Lightweight tokenization without spaCy
146
+ tokens = re.findall(r'\b\w+\b', text.lower())
147
+ pos_tags = [(token, "NOUN") for token in tokens] # Simplified POS
148
+ return tokens, pos_tags
149
+
150
+ doc = self.nlp(text)
151
+ tokens = [token.text for token in doc]
152
+ pos_tags = [(token.text, token.pos_) for token in doc]
153
+ return tokens, pos_tags
154
+
155
+
156
+
157
+
158
+
159
+
160
+ def lemmatize_and_clean(self, text: str, remove_stopwords: bool = True,remove_punctuation: bool = True) -> Tuple[str, Dict]:
161
+ if not text:
162
+ return "", {}
163
+
164
+ if not self.use_spacy:
165
+ # Lightweight lemmatization without spaCy
166
+ tokens = re.findall(r'\b\w+\b', text.lower())
167
+ lemmas = []
168
+ removed_stopwords = 0
169
+
170
+ for token in tokens:
171
+ if remove_stopwords and token in self.stop_words:
172
+ removed_stopwords += 1
173
+ continue
174
+ if len(token) >= 2:
175
+ lemmas.append(token)
176
+
177
+ cleaned_text = ' '.join(lemmas)
178
+ metadata = {
179
+ "original_token_count": len(tokens),
180
+ "cleaned_token_count": len(lemmas),
181
+ "removed_stopwords": removed_stopwords,
182
+ "pos_distribution": {},
183
+ "compression_ratio": round(len(lemmas) / len(tokens), 2) if tokens else 0,
184
+ }
185
+ return cleaned_text, metadata
186
+
187
+ doc = self.nlp(text)
188
+
189
+ lemmas = []
190
+ pos_distribution = {}
191
+ removed_stopwords = 0
192
+ original_count = 0
193
+
194
+ for token in doc:
195
+ original_count += 1
196
+ # Count pos tags
197
+ pos = token.pos_
198
+ pos_distribution[pos] = pos_distribution.get(pos, 0) + 1
199
+ # Skip stopwords
200
+ if remove_stopwords and token.is_stop:
201
+ removed_stopwords += 1
202
+ continue
203
+ # Skip punctuation
204
+ if remove_punctuation and token.is_punct:
205
+ continue
206
+ # Get lemma (base form)
207
+ lemma = token.lemma_.lower()
208
+ # Skip single characters (unless important)
209
+ if len(lemma) < 2 and token.pos_ not in ["NOUN", "VERB", "ADJ", "ADV"]:
210
+ continue
211
+ lemmas.append(lemma)
212
+ cleaned_text = ' '.join(lemmas)
213
+ metadata = {
214
+ "original_token_count": original_count,
215
+ "cleaned_token_count": len(lemmas),
216
+ "removed_stopwords": removed_stopwords,
217
+ "pos_distribution": pos_distribution,
218
+ "compression_ratio": round(len(lemmas) / original_count, 2) if original_count > 0 else 0,
219
+ }
220
+ return cleaned_text, metadata
221
+
222
+
223
+
224
+
225
+
226
+
227
+
228
+
229
def extract_keywords(self, text: str, top_n: int = 10) -> List[str]:
    """
    Extract the top-N keywords from *text*.

    spaCy path:
    - Extract noun phrases (noun chunks)
    - Collect high-value POS tokens (NOUN, VERB, ADJ, ADV), minus stopwords
    - Rank candidates by whole-word frequency in the text
    - Return the top N

    Fallback path (``self.use_spacy`` is False): frequency-ranked tokens of
    3+ word characters with stopwords removed.

    Args:
        text: Input text (already normalized by the caller, typically).
        top_n: Maximum number of keywords to return.

    Returns:
        List of lowercase keyword strings, most frequent first.
    """
    if not text:
        return []

    if not self.use_spacy:
        # Lightweight keyword extraction without spaCy
        tokens = re.findall(r'\b\w{3,}\b', text.lower())
        # Filter stopwords
        keywords = [t for t in tokens if t not in self.stop_words]
        # Count frequency
        from collections import Counter
        keyword_freq = Counter(keywords)
        return [kw for kw, _ in keyword_freq.most_common(top_n)]

    doc = self.nlp(text)

    # Extract noun chunks
    noun_chunks = [chunk.text.lower() for chunk in doc.noun_chunks]

    # Extract high-value POS (nouns, verbs, adjectives, adverbs)
    important_tokens = [
        token.text.lower()
        for token in doc
        if token.pos_ in ["NOUN", "VERB", "ADJ", "ADV"]
        and not token.is_stop
        and len(token.text) > 2
    ]

    # Combine and deduplicate
    all_keywords = list(set(noun_chunks + important_tokens))

    # Rank by whole-word frequency. The previous str.count() approach
    # matched arbitrary substrings (e.g. "art" inside "party"), which
    # inflated counts for short keywords.
    lowered = text.lower()
    keyword_freq = {}
    for keyword in all_keywords:
        keyword_freq[keyword] = len(
            re.findall(r'\b' + re.escape(keyword) + r'\b', lowered)
        )

    sorted_keywords = sorted(
        keyword_freq.items(),
        key=lambda x: x[1],
        reverse=True
    )

    return [kw for kw, _ in sorted_keywords[:top_n]]
278
+
279
def preprocess(self, text: str) -> Dict:
    """
    Run the full preprocessing pipeline on *text*.

    Pipeline: normalize -> tokenize/POS-tag -> lemmatize & clean ->
    keyword extraction.

    Returns:
        Dict with keys: original, normalized, tokens, pos_tags,
        cleaned, keywords, metadata. For falsy input every field is
        empty but the shape is preserved.
    """
    if not text:
        # Keep the output shape stable for empty/None input.
        return {
            "original": "",
            "normalized": "",
            "tokens": [],
            "pos_tags": [],
            "cleaned": "",
            "keywords": [],
            "metadata": {},
        }

    # Step 1: normalize raw text
    norm = self.normalize_text(text)
    # Step 2: tokenize and POS-tag the normalized text
    toks, tags = self.tokenize_and_analyze(norm)
    # Step 3: lemmatize and strip stopwords/punctuation
    cleaned_text, meta = self.lemmatize_and_clean(norm)

    return {
        "original": text,
        "normalized": norm,
        "tokens": toks,
        "pos_tags": tags,
        "cleaned": cleaned_text,
        "keywords": self.extract_keywords(norm),
        "metadata": meta,
    }
307
+
308
+
309
+
310
+
311
+
312
def store_preprocessing_results(memory_id: str, preprocessing_results: Dict) -> bool:
    """
    Persist preprocessing output for one memory document in MongoDB.

    Args:
        memory_id: String form of the memory's Mongo ObjectId.
        preprocessing_results: Dict produced by TextPreprocessor.preprocess().

    Returns:
        True if the document was modified; False on any error, when no
        document matched, or when the stored data was already identical
        (update_one reports modified_count == 0 in that case).
    """
    # Proper local import replaces the original __import__("bson") hack.
    from bson import ObjectId

    col = get_collection("memories")
    try:
        update_data = {
            "preprocessing": {
                "normalized": preprocessing_results.get("normalized"),
                "cleaned": preprocessing_results.get("cleaned"),
                "tokens": preprocessing_results.get("tokens"),
                "keywords": preprocessing_results.get("keywords"),
                "metadata": preprocessing_results.get("metadata"),
            },
            # NOTE(review): naive UTC timestamp, consistent with the rest of
            # this module — confirm before switching to tz-aware datetimes.
            "updated_at": datetime.utcnow(),
        }

        result = col.update_one(
            {"_id": ObjectId(memory_id)},
            {"$set": update_data}
        )

        return result.modified_count > 0
    except Exception as e:
        # Broad catch is deliberate: an invalid ObjectId or a Mongo failure
        # should degrade to a False return, not crash the batch runner.
        logger.error(f"Failed to store preprocessing results: {e}")
        return False
336
+
337
+
338
+
339
+
340
+
341
+
342
def preprocess_unprocessed_memories(batch_size: int = 50) -> Dict:
    """
    Preprocess up to *batch_size* memories lacking a "preprocessing" field.

    Step 1 in the full NLP workflow.
    Subsequent steps (emotion analysis, embeddings) use cleaned text.

    Returns:
        Summary dict with keys: total, processed, failed, errors.
    """
    col = get_collection("memories")
    pre = TextPreprocessor()

    # NOTE(review): documents with empty content are skipped but never
    # marked, so they are re-fetched on every batch — confirm intended.
    batch = list(
        col.find({"preprocessing": {"$exists": False}}).limit(batch_size)
    )

    ok = 0
    bad = 0
    errors = []

    for memory in batch:
        try:
            memory_id = str(memory["_id"])
            content = memory.get("content", "")
            if not content:
                continue

            logger.info(f"Preprocessing memory {memory_id}...")
            results = pre.preprocess(content)

            if store_preprocessing_results(memory_id, results):
                ok += 1
                logger.info(f"✓ Preprocessed {memory_id}")
            else:
                bad += 1
                errors.append(f"Failed to store preprocessing for {memory_id}")
        except Exception as e:
            bad += 1
            error_msg = f"Error preprocessing {memory.get('_id')}: {str(e)}"
            errors.append(error_msg)
            logger.error(error_msg)

    return {
        "total": len(batch),
        "processed": ok,
        "failed": bad,
        "errors": errors,
    }
384
+
385
+
386
+
387
+
388
+ # # Test the preprocessor
389
+ # if __name__ == "__main__":
390
+ # preprocessor = TextPreprocessor()
391
+
392
+ # sample_text = """
393
+ # Today was a mix of productivity and much-needed relaxation!
394
+ # I checked https://example.com for work, then took a 10-minute walk to clear my head. ## 3 434
395
+ # Feeling grateful and peaceful. Contact me at test@example.com if you need anything!
396
+ # """
397
+
398
+ # result = preprocessor.preprocess(sample_text)
399
+
400
+ # print("\n" + "="*60)
401
+ # print("TEXT PREPROCESSING PIPELINE OUTPUT")
402
+ # print("="*60)
403
+ # print(f"\nOriginal:\n{result['original']}")
404
+ # print(f"\nNormalized:\n{result['normalized']}")
405
+ # print(f"\nTokens: {result['tokens']}")
406
+ # print(f"\nPOS Tags: {result['pos_tags']}")
407
+ # print(f"\nCleaned:\n{result['cleaned']}")
408
+ # print(f"\nKeywords: {result['keywords']}")
409
+ # print(f"\nMetadata: {result['metadata']}")
410
+ # print("\n" + "="*60)
components.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "$schema": "https://ui.shadcn.com/schema.json",
3
+ "style": "new-york",
4
+ "rsc": true,
5
+ "tsx": true,
6
+ "tailwind": {
7
+ "config": "",
8
+ "css": "app/globals.css",
9
+ "baseColor": "neutral",
10
+ "cssVariables": true,
11
+ "prefix": ""
12
+ },
13
+ "aliases": {
14
+ "components": "@/components",
15
+ "utils": "@/lib/utils",
16
+ "ui": "@/components/ui",
17
+ "lib": "@/lib",
18
+ "hooks": "@/hooks"
19
+ },
20
+ "iconLibrary": "lucide"
21
+ }