Spaces:

InnoTrack
/

Graduation_Project-v1.2

Sleeping

App Files Community

bat-6 committed 28 days ago

Commit

5ec2fc9

1 Parent(s): 599f4e7

update .gitignore and add scheduler and sync_projects services

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.env +0 -2
.gitattributes +0 -33
.gitignore +5 -0
Data/database/__pycache__/sql_connector.cpython-311.pyc +0 -0
Data/database/sql_connector.py +54 -11
Data_gemini/projects_clean_gemini.csv +0 -0
Data_gemini/projects_clean_gemini.parquet +3 -0
Dockerfile +1 -17
Notebooks/TEST.ipynb +0 -0
Notebooks/test2.ipynb +2653 -0
README.md +376 -8
api/__pycache__/__init__.cpython-311.pyc +0 -0
api/__pycache__/main.cpython-311.pyc +0 -0
api/__pycache__/schemas.cpython-311.pyc +0 -0
api/__pycache__/services.cpython-311.pyc +0 -0
models/faiss_index.bin +2 -2
models/metadata.parquet +2 -2
models/project_embeddings.npy +0 -0
requirements.txt +1 -1
src/recommendation_engine/__pycache__/__init__.cpython-311.pyc +0 -0
src/recommendation_engine/__pycache__/chatbot_engine.cpython-311.pyc +0 -0
src/recommendation_engine/__pycache__/command_handler.cpython-311.pyc +0 -0
src/recommendation_engine/__pycache__/config.cpython-311.pyc +0 -0
src/recommendation_engine/__pycache__/context_builder.cpython-311.pyc +0 -0
src/recommendation_engine/__pycache__/feature_generator.cpython-311.pyc +0 -0
src/recommendation_engine/__pycache__/full_project_generator.cpython-311.pyc +0 -0
src/recommendation_engine/__pycache__/idea_generator.cpython-311.pyc +0 -0
src/recommendation_engine/__pycache__/llm_client.cpython-311.pyc +0 -0
src/recommendation_engine/__pycache__/llm_router.cpython-311.pyc +0 -0
src/recommendation_engine/__pycache__/memory_store.cpython-311.pyc +0 -0
src/recommendation_engine/__pycache__/novelty_checker.cpython-311.pyc +0 -0
src/recommendation_engine/__pycache__/prompt_builder.cpython-311.pyc +0 -0
src/recommendation_engine/__pycache__/response_formatter.cpython-311.pyc +0 -0
src/recommendation_engine/__pycache__/state_manager.cpython-311.pyc +0 -0
src/recommendation_engine/__pycache__/test.cpython-311.pyc +0 -0
src/recommendation_engine/__pycache__/validator.cpython-311.pyc +0 -0
src/recommendation_engine/llm_client.py +4 -2
src/services/scheduler.py +19 -0
src/services/sync_projects.py +172 -0
src/similarity_model/__pycache__/__init__.cpython-311.pyc +0 -0
src/similarity_model/__pycache__/embedding_engine.cpython-311.pyc +0 -0
src/similarity_model/__pycache__/feature_similarity.cpython-311.pyc +0 -0
src/similarity_model/__pycache__/hybrid_ranker.cpython-311.pyc +0 -0
src/similarity_model/__pycache__/preprocessing.cpython-311.pyc +0 -0
src/similarity_model/__pycache__/semantic_search.cpython-311.pyc +0 -0
src/similarity_model/__pycache__/similarity_engine.cpython-311.pyc +0 -0
src/similarity_model/embedding_engine.py +18 -3
src/similarity_model/feature_similarity.py +8 -5
src/similarity_model/hybrid_ranker.py +57 -26
src/similarity_model/llm_feature_extractor.py +223 -0

.env DELETED Viewed

	@@ -1,2 +0,0 @@
1	- GEMINI_API_KEY=AIzaSyAkFsaN3BKoSQmRW4FzTahhZXbq-ldsDZ4
2	- GEMINI_MODEL_NAME=gemini-2.5-flash

.gitattributes CHANGED Viewed

@@ -1,35 +1,2 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text




1	*.bin filter=lfs diff=lfs merge=lfs -text














2	*.parquet filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+venv/
+.venv/
+__pycache__/
+*.pyc
+.env

Data/database/__pycache__/sql_connector.cpython-311.pyc CHANGED Viewed

Binary files a/Data/database/__pycache__/sql_connector.cpython-311.pyc and b/Data/database/__pycache__/sql_connector.cpython-311.pyc differ

Data/database/sql_connector.py CHANGED Viewed

@@ -1,13 +1,12 @@
-import os
-import json
-import urllib
-import pandas as pd
 from sqlalchemy import create_engine
-SERVER = os.getenv("AZURE_SQL_SERVER")
-DATABASE = os.getenv("AZURE_SQL_DATABASE")
-USERNAME = os.getenv("AZURE_SQL_USERNAME")
-PASSWORD = os.getenv("AZURE_SQL_PASSWORD")
 params = urllib.parse.quote_plus(
     f"DRIVER={{ODBC Driver 18 for SQL Server}};"
@@ -20,12 +19,56 @@ params = urllib.parse.quote_plus(
     "Connection Timeout=30;"
 )
-engine = create_engine(f"mssql+pyodbc:///?odbc_connect={params}")
 def load_preprocessed_projects():
-    df = pd.read_sql("SELECT * FROM PreProcessed_Projects", engine)
     if "features" in df.columns:
-        df["features"] = df["features"].apply(json.loads)
     return df

 from sqlalchemy import create_engine
+import pandas as pd
+import urllib
+import json
+SERVER = "innotrack-sql-server.database.windows.net"
+DATABASE = "InnoTrackDB"
+USERNAME = "innotrackadmin"
+PASSWORD = "Innotrack@admin233"
 params = urllib.parse.quote_plus(
     f"DRIVER={{ODBC Driver 18 for SQL Server}};"
     "Connection Timeout=30;"
 )
+connection_string = (
+    f"mssql+pyodbc:///?odbc_connect={params}"
+)
+engine = create_engine(connection_string)
+try:
+    with engine.connect() as conn:
+        print("SQL Connected Successfully")
+except Exception as e:
+    print("Connection Failed")
+    print(e)
 def load_preprocessed_projects():
+    query = """
+    SELECT *
+    FROM PreProcessed_Projects
+    """
+    df = pd.read_sql(
+        query,
+        engine
+    )
     if "features" in df.columns:
+        def parse_features(x):
+            if not isinstance(x, str):
+                return x
+            try:
+                x = json.loads(x)
+                if isinstance(x, str):
+                    x = json.loads(x)
+                return x
+            except Exception:
+                return []
+        df["features"] = df["features"].apply(parse_features)
     return df

Data_gemini/projects_clean_gemini.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

Data_gemini/projects_clean_gemini.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:789b1063cc584c694924b03c26846a6c3c1e41ea0a1ac2df97ed42907acbea8e
+size 772640

Dockerfile CHANGED Viewed

@@ -1,20 +1,4 @@
-FROM python:3.11-slim-bookworm
-USER root
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    curl \
-    gnupg \
-    ca-certificates \
-    unixodbc \
-    unixodbc-dev \
-    && curl -sSL -O https://packages.microsoft.com/config/debian/12/packages-microsoft-prod.deb \
-    && dpkg -i packages-microsoft-prod.deb \
-    && rm packages-microsoft-prod.deb \
-    && apt-get update \
-    && ACCEPT_EULA=Y apt-get install -y --no-install-recommends msodbcsql18 \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/*
 RUN useradd -m -u 1000 user


1	+ FROM python:3.11-slim
















2
3	RUN useradd -m -u 1000 user
4

Notebooks/TEST.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

Notebooks/test2.ipynb CHANGED Viewed

	@@ -0,0 +1,2653 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "49c6b17c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'e:\\\\gradution project'"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\n",
+    "import os\n",
+    "os.getcwd()\n",
+    "os.chdir(\"/gradution project\")\n",
+    "os.getcwd()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "509448bd",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      " CONFIG LOADED:\n",
+      "ENV: development\n",
+      "DEBUG_MODE: True\n",
+      "MODELS: ['gemini-3.1-flash-lite-preview', 'gemini-2.5-flash-lite', 'gemini-2.5-flash', 'gemini-2.5-pro']\n",
+      "MAX_RETRIES: 3\n",
+      "IDEA_TEMP: 0.9\n",
+      "=================================\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2026-06-04 00:29:43,014 | INFO | Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n",
+      "e:\\gradution project\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:949: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n",
+      "2026-06-04 00:29:46,381 | INFO | Use pytorch device_name: cpu\n",
+      "2026-06-04 00:29:46,388 | INFO | Loading faiss with AVX2 support.\n",
+      "2026-06-04 00:29:46,418 | INFO | Successfully loaded faiss with AVX2 support.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "SQL Connected Successfully\n",
+      "All modules imported successfully\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from tqdm.notebook import tqdm\n",
+    "\n",
+    "from src.similarity_model import preprocess_dataset\n",
+    "from src.similarity_model import train_embedding_engine\n",
+    "from src.similarity_model import search_by_text\n",
+    "from src.similarity_model import find_similar_projects\n",
+    "from src.similarity_model import extract_features\n",
+    "\n",
+    "from src.similarity_model import normalize_text\n",
+    "from src.similarity_model import compute_feature_similarity\n",
+    "from Data.database.sql_connector import (\n",
+    "    load_preprocessed_projects,\n",
+    "    engine\n",
+    ")\n",
+    "\n",
+    "print(\"All modules imported successfully\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "0bf93b8e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Engine created\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sqlalchemy import create_engine\n",
+    "import urllib\n",
+    "\n",
+    "SERVER = \"innotrack-sql-server.database.windows.net\"\n",
+    "DATABASE = \"InnoTrackDB\"\n",
+    "USERNAME = \"innotrackadmin\"\n",
+    "PASSWORD = \"Innotrack@admin233\"\n",
+    "\n",
+    "params = urllib.parse.quote_plus(\n",
+    "    f\"DRIVER={{ODBC Driver 18 for SQL Server}};\"\n",
+    "    f\"SERVER={SERVER};\"\n",
+    "    f\"DATABASE={DATABASE};\"\n",
+    "    f\"UID={USERNAME};\"\n",
+    "    f\"PWD={PASSWORD};\"\n",
+    "    \"Encrypt=yes;\"\n",
+    "    \"TrustServerCertificate=no;\"\n",
+    "    \"Connection Timeout=30;\"\n",
+    ")\n",
+    "\n",
+    "engine = create_engine(\n",
+    "    f\"mssql+pyodbc:///?odbc_connect={params}\"\n",
+    ")\n",
+    "\n",
+    "print(\"Engine created\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "11f40d1d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>TABLE_NAME</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Teams</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>ChatRooms</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>ChatMessageHiddens</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>JoinRequests</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>ChatMessageReactions</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Projects</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>TeamMembers</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>ProjectTechnologies_Backup</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>ChatMessages</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>Feedbacks</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>MissingProjectTechsSplit</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>PreProcessed_Projects</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>OriginalityReports</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>ProjectAttachments</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>ProjectTechnologies</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>VectorEmbeddings</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>ChatMessageAttachments</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>SimilarProjects</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>AuditLogs</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>AcademicYears</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20</th>\n",
+       "      <td>Schema</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21</th>\n",
+       "      <td>Job</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22</th>\n",
+       "      <td>State</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>23</th>\n",
+       "      <td>JobParameter</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24</th>\n",
+       "      <td>JobQueue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25</th>\n",
+       "      <td>database_firewall_rules</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>26</th>\n",
+       "      <td>Server</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>27</th>\n",
+       "      <td>List</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28</th>\n",
+       "      <td>Set</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>Counter</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>30</th>\n",
+       "      <td>Hash</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>31</th>\n",
+       "      <td>AggregatedCounter</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>32</th>\n",
+       "      <td>__EFMigrationsHistory</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>33</th>\n",
+       "      <td>Departments</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>34</th>\n",
+       "      <td>Skills_Backup</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>35</th>\n",
+       "      <td>Projects_Backup</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>36</th>\n",
+       "      <td>Domains</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>37</th>\n",
+       "      <td>Skills</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>38</th>\n",
+       "      <td>Technologies</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>39</th>\n",
+       "      <td>Users</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>40</th>\n",
+       "      <td>ProjectDrafts</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>41</th>\n",
+       "      <td>Notifications</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>42</th>\n",
+       "      <td>ProjectDraftTechnologies</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>43</th>\n",
+       "      <td>StudentSkills</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                    TABLE_NAME\n",
+       "0                        Teams\n",
+       "1                    ChatRooms\n",
+       "2           ChatMessageHiddens\n",
+       "3                 JoinRequests\n",
+       "4         ChatMessageReactions\n",
+       "5                     Projects\n",
+       "6                  TeamMembers\n",
+       "7   ProjectTechnologies_Backup\n",
+       "8                 ChatMessages\n",
+       "9                    Feedbacks\n",
+       "10    MissingProjectTechsSplit\n",
+       "11       PreProcessed_Projects\n",
+       "12          OriginalityReports\n",
+       "13          ProjectAttachments\n",
+       "14         ProjectTechnologies\n",
+       "15            VectorEmbeddings\n",
+       "16      ChatMessageAttachments\n",
+       "17             SimilarProjects\n",
+       "18                   AuditLogs\n",
+       "19               AcademicYears\n",
+       "20                      Schema\n",
+       "21                         Job\n",
+       "22                       State\n",
+       "23                JobParameter\n",
+       "24                    JobQueue\n",
+       "25     database_firewall_rules\n",
+       "26                      Server\n",
+       "27                        List\n",
+       "28                         Set\n",
+       "29                     Counter\n",
+       "30                        Hash\n",
+       "31           AggregatedCounter\n",
+       "32       __EFMigrationsHistory\n",
+       "33                 Departments\n",
+       "34               Skills_Backup\n",
+       "35             Projects_Backup\n",
+       "36                     Domains\n",
+       "37                      Skills\n",
+       "38                Technologies\n",
+       "39                       Users\n",
+       "40               ProjectDrafts\n",
+       "41               Notifications\n",
+       "42    ProjectDraftTechnologies\n",
+       "43               StudentSkills"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "with engine.connect() as conn:\n",
+    "\n",
+    "    tables = pd.read_sql(\n",
+    "        \"\"\"\n",
+    "        SELECT TABLE_NAME\n",
+    "        FROM INFORMATION_SCHEMA.TABLES\n",
+    "        \"\"\",\n",
+    "        conn\n",
+    "    )\n",
+    "\n",
+    "tables"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "5d1125cb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>submitted_at</th>\n",
+       "      <th>project_title</th>\n",
+       "      <th>student_names</th>\n",
+       "      <th>year</th>\n",
+       "      <th>abstract</th>\n",
+       "      <th>description</th>\n",
+       "      <th>problem_statement</th>\n",
+       "      <th>proposed_solution</th>\n",
+       "      <th>objectives</th>\n",
+       "      <th>full_content</th>\n",
+       "      <th>clean_text</th>\n",
+       "      <th>word_count</th>\n",
+       "      <th>features</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>3D hand game for neuromuscular patients</td>\n",
+       "      <td>Ahmed Mansour Mohamed Saber, Ahmed Mohamed Moh...</td>\n",
+       "      <td>2017</td>\n",
+       "      <td>In this project we have designed and implement...</td>\n",
+       "      <td>A virtual rehabilitation system that uses a Le...</td>\n",
+       "      <td>Neuromuscular patients suffer from nerve atrop...</td>\n",
+       "      <td>The development of a 3D interactive game integ...</td>\n",
+       "      <td>1. Develop a scalable and maintainable solutio...</td>\n",
+       "      <td>3D hand game for neuromuscular patients. 3D ha...</td>\n",
+       "      <td>3d hand game for neuromuscular patients. 3d ha...</td>\n",
+       "      <td>172</td>\n",
+       "      <td>\"\\\"[\\\\\\\"Leap Motion controller sensor\\\\\\\", \\\\\\...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>3D Laser Scanning</td>\n",
+       "      <td>Aya Essam Hegazi, Asmaa Abd EL-Aziz, Ebtehal E...</td>\n",
+       "      <td>2024</td>\n",
+       "      <td>3D scanning is used in many applications such ...</td>\n",
+       "      <td>This project implements a low-cost 3D laser sc...</td>\n",
+       "      <td>Existing 3D scanning devices are often extreme...</td>\n",
+       "      <td>A low-cost 3D laser scanning system that utili...</td>\n",
+       "      <td>1. Improve overall productivity and workflow o...</td>\n",
+       "      <td>3D Laser Scanning. 3D Laser Scanning. 3D scann...</td>\n",
+       "      <td>3d laser scanning. 3d laser scanning. 3d scann...</td>\n",
+       "      <td>185</td>\n",
+       "      <td>\"\\\"[\\\\\\\"3d laser scanning\\\\\\\", \\\\\\\"Hand-held l...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>A Smart Automatic System for Criminal Identifi...</td>\n",
+       "      <td>Yousef Yacoub Mohammed, Ahmed Mohamed Hassan,\\...</td>\n",
+       "      <td>2020</td>\n",
+       "      <td>The increasing use of biometric technologies i...</td>\n",
+       "      <td>This project develops an automated criminal id...</td>\n",
+       "      <td>Traditional identification methods, such as ph...</td>\n",
+       "      <td>A real-time facial recognition system develope...</td>\n",
+       "      <td>1. Support future scalability and feature expa...</td>\n",
+       "      <td>A Smart Automatic System for Criminal Identifi...</td>\n",
+       "      <td>a smart automatic system for criminal identifi...</td>\n",
+       "      <td>138</td>\n",
+       "      <td>\"\\\"[\\\\\\\"real-time face recognition system\\\\\\\",...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Advanced Educational Platform “ABSTHALK”</td>\n",
+       "      <td>Mohamed Nasser Maher, Karim Ashraf Salah Eldie...</td>\n",
+       "      <td>2025</td>\n",
+       "      <td>The Educational Platform for Students and Teac...</td>\n",
+       "      <td>ABSTHALK is a comprehensive, role-based e-lear...</td>\n",
+       "      <td>Traditional learning methods often lack access...</td>\n",
+       "      <td>The project proposes a structured, role-based,...</td>\n",
+       "      <td>1. Provide interactive educational tools and r...</td>\n",
+       "      <td>Advanced Educational Platform “ABSTHALK”. Adva...</td>\n",
+       "      <td>advanced educational platform absthalk . advan...</td>\n",
+       "      <td>192</td>\n",
+       "      <td>\"\\\"[\\\\\\\"Role-based management system\\\\\\\", \\\\\\\"...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Agricultural Information and Management System</td>\n",
+       "      <td>Ahmed Mohamed, Omar Hassan, Mahmoud Ali ,Mazen...</td>\n",
+       "      <td>2020</td>\n",
+       "      <td>It is a permanent link between the decision-ma...</td>\n",
+       "      <td>This project is an integrated information syst...</td>\n",
+       "      <td>The competent authorities of the Ministry of A...</td>\n",
+       "      <td>The development of an integrated information s...</td>\n",
+       "      <td>1. Reduce operational complexity and improve e...</td>\n",
+       "      <td>Agricultural Information and Management System...</td>\n",
+       "      <td>agricultural information and management system...</td>\n",
+       "      <td>109</td>\n",
+       "      <td>\"\\\"[\\\\\\\"centralized database\\\\\\\", \\\\\\\"track la...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   id submitted_at                                      project_title  \\\n",
+       "0   1          NaT            3D hand game for neuromuscular patients   \n",
+       "1   2          NaT                                  3D Laser Scanning   \n",
+       "2   3          NaT  A Smart Automatic System for Criminal Identifi...   \n",
+       "3   4          NaT           Advanced Educational Platform “ABSTHALK”   \n",
+       "4   5          NaT     Agricultural Information and Management System   \n",
+       "\n",
+       "                                       student_names  year  \\\n",
+       "0  Ahmed Mansour Mohamed Saber, Ahmed Mohamed Moh...  2017   \n",
+       "1  Aya Essam Hegazi, Asmaa Abd EL-Aziz, Ebtehal E...  2024   \n",
+       "2  Yousef Yacoub Mohammed, Ahmed Mohamed Hassan,\\...  2020   \n",
+       "3  Mohamed Nasser Maher, Karim Ashraf Salah Eldie...  2025   \n",
+       "4  Ahmed Mohamed, Omar Hassan, Mahmoud Ali ,Mazen...  2020   \n",
+       "\n",
+       "                                            abstract  \\\n",
+       "0  In this project we have designed and implement...   \n",
+       "1  3D scanning is used in many applications such ...   \n",
+       "2  The increasing use of biometric technologies i...   \n",
+       "3  The Educational Platform for Students and Teac...   \n",
+       "4  It is a permanent link between the decision-ma...   \n",
+       "\n",
+       "                                         description  \\\n",
+       "0  A virtual rehabilitation system that uses a Le...   \n",
+       "1  This project implements a low-cost 3D laser sc...   \n",
+       "2  This project develops an automated criminal id...   \n",
+       "3  ABSTHALK is a comprehensive, role-based e-lear...   \n",
+       "4  This project is an integrated information syst...   \n",
+       "\n",
+       "                                   problem_statement  \\\n",
+       "0  Neuromuscular patients suffer from nerve atrop...   \n",
+       "1  Existing 3D scanning devices are often extreme...   \n",
+       "2  Traditional identification methods, such as ph...   \n",
+       "3  Traditional learning methods often lack access...   \n",
+       "4  The competent authorities of the Ministry of A...   \n",
+       "\n",
+       "                                   proposed_solution  \\\n",
+       "0  The development of a 3D interactive game integ...   \n",
+       "1  A low-cost 3D laser scanning system that utili...   \n",
+       "2  A real-time facial recognition system develope...   \n",
+       "3  The project proposes a structured, role-based,...   \n",
+       "4  The development of an integrated information s...   \n",
+       "\n",
+       "                                          objectives  \\\n",
+       "0  1. Develop a scalable and maintainable solutio...   \n",
+       "1  1. Improve overall productivity and workflow o...   \n",
+       "2  1. Support future scalability and feature expa...   \n",
+       "3  1. Provide interactive educational tools and r...   \n",
+       "4  1. Reduce operational complexity and improve e...   \n",
+       "\n",
+       "                                        full_content  \\\n",
+       "0  3D hand game for neuromuscular patients. 3D ha...   \n",
+       "1  3D Laser Scanning. 3D Laser Scanning. 3D scann...   \n",
+       "2  A Smart Automatic System for Criminal Identifi...   \n",
+       "3  Advanced Educational Platform “ABSTHALK”. Adva...   \n",
+       "4  Agricultural Information and Management System...   \n",
+       "\n",
+       "                                          clean_text  word_count  \\\n",
+       "0  3d hand game for neuromuscular patients. 3d ha...         172   \n",
+       "1  3d laser scanning. 3d laser scanning. 3d scann...         185   \n",
+       "2  a smart automatic system for criminal identifi...         138   \n",
+       "3  advanced educational platform absthalk . advan...         192   \n",
+       "4  agricultural information and management system...         109   \n",
+       "\n",
+       "                                            features  \n",
+       "0  \"\\\"[\\\\\\\"Leap Motion controller sensor\\\\\\\", \\\\\\...  \n",
+       "1  \"\\\"[\\\\\\\"3d laser scanning\\\\\\\", \\\\\\\"Hand-held l...  \n",
+       "2  \"\\\"[\\\\\\\"real-time face recognition system\\\\\\\",...  \n",
+       "3  \"\\\"[\\\\\\\"Role-based management system\\\\\\\", \\\\\\\"...  \n",
+       "4  \"\\\"[\\\\\\\"centralized database\\\\\\\", \\\\\\\"track la...  "
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "query = \"\"\"\n",
+    "SELECT *\n",
+    "FROM PreProcessed_Projects\n",
+    "\"\"\"\n",
+    "\n",
+    "df = pd.read_sql(query, engine)\n",
+    "\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "4429717d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['id', 'submitted_at', 'project_title', 'student_names', 'year', 'abstract', 'description', 'problem_statement', 'proposed_solution', 'objectives', 'full_content', 'clean_text', 'word_count', 'features']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(df.columns.tolist())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "9925da4c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = df.rename(columns={\n",
+    "    \"Title\": \"project_title\",\n",
+    "    \"Description\": \"description\",\n",
+    "    \"Abstract\": \"abstract\"\n",
+    "})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "fc62d4f3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>submitted_at</th>\n",
+       "      <th>project_title</th>\n",
+       "      <th>student_names</th>\n",
+       "      <th>year</th>\n",
+       "      <th>abstract</th>\n",
+       "      <th>description</th>\n",
+       "      <th>problem_statement</th>\n",
+       "      <th>proposed_solution</th>\n",
+       "      <th>objectives</th>\n",
+       "      <th>full_content</th>\n",
+       "      <th>clean_text</th>\n",
+       "      <th>word_count</th>\n",
+       "      <th>features</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>3D hand game for neuromuscular patients</td>\n",
+       "      <td>Ahmed Mansour Mohamed Saber, Ahmed Mohamed Moh...</td>\n",
+       "      <td>2017</td>\n",
+       "      <td>In this project we have designed and implement...</td>\n",
+       "      <td>A virtual rehabilitation system that uses a Le...</td>\n",
+       "      <td>Neuromuscular patients suffer from nerve atrop...</td>\n",
+       "      <td>The development of a 3D interactive game integ...</td>\n",
+       "      <td>1. Develop a scalable and maintainable solutio...</td>\n",
+       "      <td>3D hand game for neuromuscular patients. 3D ha...</td>\n",
+       "      <td>3d hand game for neuromuscular patients. 3d ha...</td>\n",
+       "      <td>172</td>\n",
+       "      <td>\"\\\"[\\\\\\\"Leap Motion controller sensor\\\\\\\", \\\\\\...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>3D Laser Scanning</td>\n",
+       "      <td>Aya Essam Hegazi, Asmaa Abd EL-Aziz, Ebtehal E...</td>\n",
+       "      <td>2024</td>\n",
+       "      <td>3D scanning is used in many applications such ...</td>\n",
+       "      <td>This project implements a low-cost 3D laser sc...</td>\n",
+       "      <td>Existing 3D scanning devices are often extreme...</td>\n",
+       "      <td>A low-cost 3D laser scanning system that utili...</td>\n",
+       "      <td>1. Improve overall productivity and workflow o...</td>\n",
+       "      <td>3D Laser Scanning. 3D Laser Scanning. 3D scann...</td>\n",
+       "      <td>3d laser scanning. 3d laser scanning. 3d scann...</td>\n",
+       "      <td>185</td>\n",
+       "      <td>\"\\\"[\\\\\\\"3d laser scanning\\\\\\\", \\\\\\\"Hand-held l...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>A Smart Automatic System for Criminal Identifi...</td>\n",
+       "      <td>Yousef Yacoub Mohammed, Ahmed Mohamed Hassan,\\...</td>\n",
+       "      <td>2020</td>\n",
+       "      <td>The increasing use of biometric technologies i...</td>\n",
+       "      <td>This project develops an automated criminal id...</td>\n",
+       "      <td>Traditional identification methods, such as ph...</td>\n",
+       "      <td>A real-time facial recognition system develope...</td>\n",
+       "      <td>1. Support future scalability and feature expa...</td>\n",
+       "      <td>A Smart Automatic System for Criminal Identifi...</td>\n",
+       "      <td>a smart automatic system for criminal identifi...</td>\n",
+       "      <td>138</td>\n",
+       "      <td>\"\\\"[\\\\\\\"real-time face recognition system\\\\\\\",...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Advanced Educational Platform “ABSTHALK”</td>\n",
+       "      <td>Mohamed Nasser Maher, Karim Ashraf Salah Eldie...</td>\n",
+       "      <td>2025</td>\n",
+       "      <td>The Educational Platform for Students and Teac...</td>\n",
+       "      <td>ABSTHALK is a comprehensive, role-based e-lear...</td>\n",
+       "      <td>Traditional learning methods often lack access...</td>\n",
+       "      <td>The project proposes a structured, role-based,...</td>\n",
+       "      <td>1. Provide interactive educational tools and r...</td>\n",
+       "      <td>Advanced Educational Platform “ABSTHALK”. Adva...</td>\n",
+       "      <td>advanced educational platform absthalk . advan...</td>\n",
+       "      <td>192</td>\n",
+       "      <td>\"\\\"[\\\\\\\"Role-based management system\\\\\\\", \\\\\\\"...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Agricultural Information and Management System</td>\n",
+       "      <td>Ahmed Mohamed, Omar Hassan, Mahmoud Ali ,Mazen...</td>\n",
+       "      <td>2020</td>\n",
+       "      <td>It is a permanent link between the decision-ma...</td>\n",
+       "      <td>This project is an integrated information syst...</td>\n",
+       "      <td>The competent authorities of the Ministry of A...</td>\n",
+       "      <td>The development of an integrated information s...</td>\n",
+       "      <td>1. Reduce operational complexity and improve e...</td>\n",
+       "      <td>Agricultural Information and Management System...</td>\n",
+       "      <td>agricultural information and management system...</td>\n",
+       "      <td>109</td>\n",
+       "      <td>\"\\\"[\\\\\\\"centralized database\\\\\\\", \\\\\\\"track la...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   id submitted_at                                      project_title  \\\n",
+       "0   1          NaT            3D hand game for neuromuscular patients   \n",
+       "1   2          NaT                                  3D Laser Scanning   \n",
+       "2   3          NaT  A Smart Automatic System for Criminal Identifi...   \n",
+       "3   4          NaT           Advanced Educational Platform “ABSTHALK”   \n",
+       "4   5          NaT     Agricultural Information and Management System   \n",
+       "\n",
+       "                                       student_names  year  \\\n",
+       "0  Ahmed Mansour Mohamed Saber, Ahmed Mohamed Moh...  2017   \n",
+       "1  Aya Essam Hegazi, Asmaa Abd EL-Aziz, Ebtehal E...  2024   \n",
+       "2  Yousef Yacoub Mohammed, Ahmed Mohamed Hassan,\\...  2020   \n",
+       "3  Mohamed Nasser Maher, Karim Ashraf Salah Eldie...  2025   \n",
+       "4  Ahmed Mohamed, Omar Hassan, Mahmoud Ali ,Mazen...  2020   \n",
+       "\n",
+       "                                            abstract  \\\n",
+       "0  In this project we have designed and implement...   \n",
+       "1  3D scanning is used in many applications such ...   \n",
+       "2  The increasing use of biometric technologies i...   \n",
+       "3  The Educational Platform for Students and Teac...   \n",
+       "4  It is a permanent link between the decision-ma...   \n",
+       "\n",
+       "                                         description  \\\n",
+       "0  A virtual rehabilitation system that uses a Le...   \n",
+       "1  This project implements a low-cost 3D laser sc...   \n",
+       "2  This project develops an automated criminal id...   \n",
+       "3  ABSTHALK is a comprehensive, role-based e-lear...   \n",
+       "4  This project is an integrated information syst...   \n",
+       "\n",
+       "                                   problem_statement  \\\n",
+       "0  Neuromuscular patients suffer from nerve atrop...   \n",
+       "1  Existing 3D scanning devices are often extreme...   \n",
+       "2  Traditional identification methods, such as ph...   \n",
+       "3  Traditional learning methods often lack access...   \n",
+       "4  The competent authorities of the Ministry of A...   \n",
+       "\n",
+       "                                   proposed_solution  \\\n",
+       "0  The development of a 3D interactive game integ...   \n",
+       "1  A low-cost 3D laser scanning system that utili...   \n",
+       "2  A real-time facial recognition system develope...   \n",
+       "3  The project proposes a structured, role-based,...   \n",
+       "4  The development of an integrated information s...   \n",
+       "\n",
+       "                                          objectives  \\\n",
+       "0  1. Develop a scalable and maintainable solutio...   \n",
+       "1  1. Improve overall productivity and workflow o...   \n",
+       "2  1. Support future scalability and feature expa...   \n",
+       "3  1. Provide interactive educational tools and r...   \n",
+       "4  1. Reduce operational complexity and improve e...   \n",
+       "\n",
+       "                                        full_content  \\\n",
+       "0  3D hand game for neuromuscular patients. 3D ha...   \n",
+       "1  3D Laser Scanning. 3D Laser Scanning. 3D scann...   \n",
+       "2  A Smart Automatic System for Criminal Identifi...   \n",
+       "3  Advanced Educational Platform “ABSTHALK”. Adva...   \n",
+       "4  Agricultural Information and Management System...   \n",
+       "\n",
+       "                                          clean_text  word_count  \\\n",
+       "0  3d hand game for neuromuscular patients. 3d ha...         172   \n",
+       "1  3d laser scanning. 3d laser scanning. 3d scann...         185   \n",
+       "2  a smart automatic system for criminal identifi...         138   \n",
+       "3  advanced educational platform absthalk . advan...         192   \n",
+       "4  agricultural information and management system...         109   \n",
+       "\n",
+       "                                            features  \n",
+       "0  \"\\\"[\\\\\\\"Leap Motion controller sensor\\\\\\\", \\\\\\...  \n",
+       "1  \"\\\"[\\\\\\\"3d laser scanning\\\\\\\", \\\\\\\"Hand-held l...  \n",
+       "2  \"\\\"[\\\\\\\"real-time face recognition system\\\\\\\",...  \n",
+       "3  \"\\\"[\\\\\\\"Role-based management system\\\\\\\", \\\\\\\"...  \n",
+       "4  \"\\\"[\\\\\\\"centralized database\\\\\\\", \\\\\\\"track la...  "
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "query = \"\"\"\n",
+    "SELECT *\n",
+    "FROM PreProcessed_Projects\n",
+    "\"\"\"\n",
+    "\n",
+    "clean_df = pd.read_sql(query, engine)\n",
+    "\n",
+    "clean_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "e5af88d4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(255, 14)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(clean_df.shape)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "bb80639a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "count    255.000000\n",
+       "mean     236.031373\n",
+       "std       87.747619\n",
+       "min       24.000000\n",
+       "25%      173.500000\n",
+       "50%      225.000000\n",
+       "75%      287.000000\n",
+       "max      719.000000\n",
+       "Name: features, dtype: float64"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "clean_df[\"features\"].apply(len).describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "633cfec4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Saved cleaned dataset\n"
+     ]
+    }
+   ],
+   "source": [
+    "clean_df.to_parquet(\"Data_gemini/projects_clean_gemini.parquet\", index=False)\n",
+    "clean_df.to_csv(\"Data_gemini/projects_clean_gemini.csv\", index=False)\n",
+    "\n",
+    "print(\"Saved cleaned dataset\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "36f84432",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(255, 14)\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_df = pd.read_parquet(\n",
+    "    \"Data_gemini/projects_clean_gemini.parquet\"\n",
+    ")\n",
+    "\n",
+    "print(test_df.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "0dd86aec",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['id', 'submitted_at', 'project_title', 'student_names', 'year', 'abstract', 'description', 'problem_statement', 'proposed_solution', 'objectives', 'full_content', 'clean_text', 'word_count', 'features']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(clean_df.columns.tolist())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "e3e96549",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>submitted_at</th>\n",
+       "      <th>project_title</th>\n",
+       "      <th>student_names</th>\n",
+       "      <th>year</th>\n",
+       "      <th>abstract</th>\n",
+       "      <th>description</th>\n",
+       "      <th>problem_statement</th>\n",
+       "      <th>proposed_solution</th>\n",
+       "      <th>objectives</th>\n",
+       "      <th>full_content</th>\n",
+       "      <th>clean_text</th>\n",
+       "      <th>word_count</th>\n",
+       "      <th>features</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>None</td>\n",
+       "      <td>3D hand game for neuromuscular patients</td>\n",
+       "      <td>Ahmed Mansour Mohamed Saber, Ahmed Mohamed Moh...</td>\n",
+       "      <td>2017</td>\n",
+       "      <td>In this project we have designed and implement...</td>\n",
+       "      <td>A virtual rehabilitation system that uses a Le...</td>\n",
+       "      <td>Neuromuscular patients suffer from nerve atrop...</td>\n",
+       "      <td>The development of a 3D interactive game integ...</td>\n",
+       "      <td>1. Develop a scalable and maintainable solutio...</td>\n",
+       "      <td>3D hand game for neuromuscular patients. 3D ha...</td>\n",
+       "      <td>3d hand game for neuromuscular patients. 3d ha...</td>\n",
+       "      <td>172</td>\n",
+       "      <td>\"\\\"[\\\\\\\"Leap Motion controller sensor\\\\\\\", \\\\\\...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>None</td>\n",
+       "      <td>3D Laser Scanning</td>\n",
+       "      <td>Aya Essam Hegazi, Asmaa Abd EL-Aziz, Ebtehal E...</td>\n",
+       "      <td>2024</td>\n",
+       "      <td>3D scanning is used in many applications such ...</td>\n",
+       "      <td>This project implements a low-cost 3D laser sc...</td>\n",
+       "      <td>Existing 3D scanning devices are often extreme...</td>\n",
+       "      <td>A low-cost 3D laser scanning system that utili...</td>\n",
+       "      <td>1. Improve overall productivity and workflow o...</td>\n",
+       "      <td>3D Laser Scanning. 3D Laser Scanning. 3D scann...</td>\n",
+       "      <td>3d laser scanning. 3d laser scanning. 3d scann...</td>\n",
+       "      <td>185</td>\n",
+       "      <td>\"\\\"[\\\\\\\"3d laser scanning\\\\\\\", \\\\\\\"Hand-held l...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>None</td>\n",
+       "      <td>A Smart Automatic System for Criminal Identifi...</td>\n",
+       "      <td>Yousef Yacoub Mohammed, Ahmed Mohamed Hassan,\\...</td>\n",
+       "      <td>2020</td>\n",
+       "      <td>The increasing use of biometric technologies i...</td>\n",
+       "      <td>This project develops an automated criminal id...</td>\n",
+       "      <td>Traditional identification methods, such as ph...</td>\n",
+       "      <td>A real-time facial recognition system develope...</td>\n",
+       "      <td>1. Support future scalability and feature expa...</td>\n",
+       "      <td>A Smart Automatic System for Criminal Identifi...</td>\n",
+       "      <td>a smart automatic system for criminal identifi...</td>\n",
+       "      <td>138</td>\n",
+       "      <td>\"\\\"[\\\\\\\"real-time face recognition system\\\\\\\",...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Advanced Educational Platform “ABSTHALK”</td>\n",
+       "      <td>Mohamed Nasser Maher, Karim Ashraf Salah Eldie...</td>\n",
+       "      <td>2025</td>\n",
+       "      <td>The Educational Platform for Students and Teac...</td>\n",
+       "      <td>ABSTHALK is a comprehensive, role-based e-lear...</td>\n",
+       "      <td>Traditional learning methods often lack access...</td>\n",
+       "      <td>The project proposes a structured, role-based,...</td>\n",
+       "      <td>1. Provide interactive educational tools and r...</td>\n",
+       "      <td>Advanced Educational Platform “ABSTHALK”. Adva...</td>\n",
+       "      <td>advanced educational platform absthalk . advan...</td>\n",
+       "      <td>192</td>\n",
+       "      <td>\"\\\"[\\\\\\\"Role-based management system\\\\\\\", \\\\\\\"...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Agricultural Information and Management System</td>\n",
+       "      <td>Ahmed Mohamed, Omar Hassan, Mahmoud Ali ,Mazen...</td>\n",
+       "      <td>2020</td>\n",
+       "      <td>It is a permanent link between the decision-ma...</td>\n",
+       "      <td>This project is an integrated information syst...</td>\n",
+       "      <td>The competent authorities of the Ministry of A...</td>\n",
+       "      <td>The development of an integrated information s...</td>\n",
+       "      <td>1. Reduce operational complexity and improve e...</td>\n",
+       "      <td>Agricultural Information and Management System...</td>\n",
+       "      <td>agricultural information and management system...</td>\n",
+       "      <td>109</td>\n",
+       "      <td>\"\\\"[\\\\\\\"centralized database\\\\\\\", \\\\\\\"track la...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   id submitted_at                                      project_title  \\\n",
+       "0   1         None            3D hand game for neuromuscular patients   \n",
+       "1   2         None                                  3D Laser Scanning   \n",
+       "2   3         None  A Smart Automatic System for Criminal Identifi...   \n",
+       "3   4         None           Advanced Educational Platform “ABSTHALK”   \n",
+       "4   5         None     Agricultural Information and Management System   \n",
+       "\n",
+       "                                       student_names  year  \\\n",
+       "0  Ahmed Mansour Mohamed Saber, Ahmed Mohamed Moh...  2017   \n",
+       "1  Aya Essam Hegazi, Asmaa Abd EL-Aziz, Ebtehal E...  2024   \n",
+       "2  Yousef Yacoub Mohammed, Ahmed Mohamed Hassan,\\...  2020   \n",
+       "3  Mohamed Nasser Maher, Karim Ashraf Salah Eldie...  2025   \n",
+       "4  Ahmed Mohamed, Omar Hassan, Mahmoud Ali ,Mazen...  2020   \n",
+       "\n",
+       "                                            abstract  \\\n",
+       "0  In this project we have designed and implement...   \n",
+       "1  3D scanning is used in many applications such ...   \n",
+       "2  The increasing use of biometric technologies i...   \n",
+       "3  The Educational Platform for Students and Teac...   \n",
+       "4  It is a permanent link between the decision-ma...   \n",
+       "\n",
+       "                                         description  \\\n",
+       "0  A virtual rehabilitation system that uses a Le...   \n",
+       "1  This project implements a low-cost 3D laser sc...   \n",
+       "2  This project develops an automated criminal id...   \n",
+       "3  ABSTHALK is a comprehensive, role-based e-lear...   \n",
+       "4  This project is an integrated information syst...   \n",
+       "\n",
+       "                                   problem_statement  \\\n",
+       "0  Neuromuscular patients suffer from nerve atrop...   \n",
+       "1  Existing 3D scanning devices are often extreme...   \n",
+       "2  Traditional identification methods, such as ph...   \n",
+       "3  Traditional learning methods often lack access...   \n",
+       "4  The competent authorities of the Ministry of A...   \n",
+       "\n",
+       "                                   proposed_solution  \\\n",
+       "0  The development of a 3D interactive game integ...   \n",
+       "1  A low-cost 3D laser scanning system that utili...   \n",
+       "2  A real-time facial recognition system develope...   \n",
+       "3  The project proposes a structured, role-based,...   \n",
+       "4  The development of an integrated information s...   \n",
+       "\n",
+       "                                          objectives  \\\n",
+       "0  1. Develop a scalable and maintainable solutio...   \n",
+       "1  1. Improve overall productivity and workflow o...   \n",
+       "2  1. Support future scalability and feature expa...   \n",
+       "3  1. Provide interactive educational tools and r...   \n",
+       "4  1. Reduce operational complexity and improve e...   \n",
+       "\n",
+       "                                        full_content  \\\n",
+       "0  3D hand game for neuromuscular patients. 3D ha...   \n",
+       "1  3D Laser Scanning. 3D Laser Scanning. 3D scann...   \n",
+       "2  A Smart Automatic System for Criminal Identifi...   \n",
+       "3  Advanced Educational Platform “ABSTHALK”. Adva...   \n",
+       "4  Agricultural Information and Management System...   \n",
+       "\n",
+       "                                          clean_text  word_count  \\\n",
+       "0  3d hand game for neuromuscular patients. 3d ha...         172   \n",
+       "1  3d laser scanning. 3d laser scanning. 3d scann...         185   \n",
+       "2  a smart automatic system for criminal identifi...         138   \n",
+       "3  advanced educational platform absthalk . advan...         192   \n",
+       "4  agricultural information and management system...         109   \n",
+       "\n",
+       "                                            features  \n",
+       "0  \"\\\"[\\\\\\\"Leap Motion controller sensor\\\\\\\", \\\\\\...  \n",
+       "1  \"\\\"[\\\\\\\"3d laser scanning\\\\\\\", \\\\\\\"Hand-held l...  \n",
+       "2  \"\\\"[\\\\\\\"real-time face recognition system\\\\\\\",...  \n",
+       "3  \"\\\"[\\\\\\\"Role-based management system\\\\\\\", \\\\\\\"...  \n",
+       "4  \"\\\"[\\\\\\\"centralized database\\\\\\\", \\\\\\\"track la...  "
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test_df = pd.read_sql(\n",
+    "    \"SELECT TOP 5 * FROM PreProcessed_Projects\",\n",
+    "    engine\n",
+    ")\n",
+    "\n",
+    "test_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "078d4b8c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "================================================================================\n",
+      "Hospital Test\n",
+      "================================================================================\n",
+      "USING GEMINI FEATURE EXTRACTOR\n",
+      "CALLING GEMINI\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2026-06-04 00:30:08,804 | INFO | [LLM] model=gemini-3.1-flash-lite-preview | task=feature | attempt=1\n",
+      "2026-06-04 00:30:08,805 | INFO | AFC is enabled with max remote calls: 10.\n",
+      "2026-06-04 00:30:09,875 | INFO | HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-lite-preview:generateContent \"HTTP/1.1 200 OK\"\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PARSED FEATURES:\n",
+      "['appointment booking', 'patient records management', 'medical records storage', 'doctor dashboard', 'physician dashboard', 'ai chatbot']\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a32846683c0e41e48b4b5cac27cbb769",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Feature Count: 5\n",
+      "\n",
+      "- appointment booking\n",
+      "- patient records management\n",
+      "- medical records storage\n",
+      "- doctor dashboard\n",
+      "- ai chatbot\n",
+      "\n",
+      "Duplicate Check:\n",
+      "patient records management <-> medical records storage (shared=1)\n",
+      "\n",
+      "\n",
+      "================================================================================\n",
+      "Machine Learning Test\n",
+      "================================================================================\n",
+      "USING GEMINI FEATURE EXTRACTOR\n",
+      "CALLING GEMINI\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2026-06-04 00:30:16,521 | INFO | [LLM] model=gemini-3.1-flash-lite-preview | task=feature | attempt=1\n",
+      "2026-06-04 00:30:16,522 | INFO | AFC is enabled with max remote calls: 10.\n",
+      "2026-06-04 00:30:17,431 | INFO | HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-lite-preview:generateContent \"HTTP/1.1 200 OK\"\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PARSED FEATURES:\n",
+      "['prediction', 'analysis']\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "560f448ba2794e0e9e1940be1b66697d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Feature Count: 2\n",
+      "\n",
+      "- prediction\n",
+      "- analysis\n",
+      "\n",
+      "Duplicate Check:\n",
+      "No duplicate overlaps found\n",
+      "\n",
+      "\n",
+      "================================================================================\n",
+      "Face Recognition Test\n",
+      "================================================================================\n",
+      "USING GEMINI FEATURE EXTRACTOR\n",
+      "CALLING GEMINI\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2026-06-04 00:30:21,508 | INFO | [LLM] model=gemini-3.1-flash-lite-preview | task=feature | attempt=1\n",
+      "2026-06-04 00:30:21,509 | INFO | AFC is enabled with max remote calls: 10.\n",
+      "2026-06-04 00:30:22,145 | INFO | HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-lite-preview:generateContent \"HTTP/1.1 200 OK\"\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PARSED FEATURES:\n",
+      "['face recognition', 'real-time face detection', 'student attendance management', 'mobile application']\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4ce3cd5b56544cb4864d5f0779063227",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Feature Count: 4\n",
+      "\n",
+      "- face recognition\n",
+      "- real-time face detection\n",
+      "- student attendance management\n",
+      "- mobile application\n",
+      "\n",
+      "Duplicate Check:\n",
+      "face recognition <-> real-time face detection (shared=1)\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from src.similarity_model.preprocessing import (\n",
+    "    extract_features,\n",
+    "    normalize_text\n",
+    ")\n",
+    "\n",
+    "def check_duplicates(features):\n",
+    "\n",
+    "    found = False\n",
+    "\n",
+    "    for i in range(len(features)):\n",
+    "        for j in range(i + 1, len(features)):\n",
+    "\n",
+    "            a = set(features[i].split())\n",
+    "            b = set(features[j].split())\n",
+    "\n",
+    "            overlap = len(a & b)\n",
+    "\n",
+    "            if overlap > 0:\n",
+    "                found = True\n",
+    "                print(\n",
+    "                    f\"{features[i]} <-> {features[j]} \"\n",
+    "                    f\"(shared={overlap})\"\n",
+    "                )\n",
+    "\n",
+    "    if not found:\n",
+    "        print(\"No duplicate overlaps found\")\n",
+    "\n",
+    "\n",
+    "tests = {\n",
+    "    \"Hospital Test\": \"\"\"\n",
+    "        Hospital management system with\n",
+    "        appointment booking,\n",
+    "        online appointment booking,\n",
+    "        patient records,\n",
+    "        medical records,\n",
+    "        doctor dashboard,\n",
+    "        physician dashboard,\n",
+    "        AI chatbot,\n",
+    "        intelligent chatbot\n",
+    "    \"\"\",\n",
+    "\n",
+    "    \"Machine Learning Test\": \"\"\"\n",
+    "        Machine learning system using machine learning\n",
+    "        for machine learning prediction and machine learning analysis.\n",
+    "    \"\"\",\n",
+    "\n",
+    "    \"Face Recognition Test\": \"\"\"\n",
+    "        Face recognition attendance system using deep learning,\n",
+    "        computer vision,\n",
+    "        real-time face detection,\n",
+    "        student attendance management and mobile application.\n",
+    "    \"\"\"\n",
+    "}\n",
+    "\n",
+    "for name, query in tests.items():\n",
+    "\n",
+    "    print(\"=\" * 80)\n",
+    "    print(name)\n",
+    "    print(\"=\" * 80)\n",
+    "\n",
+    "    features = extract_features(\n",
+    "        normalize_text(query)\n",
+    "    )\n",
+    "\n",
+    "    print(f\"Feature Count: {len(features)}\")\n",
+    "    print()\n",
+    "\n",
+    "    for f in features:\n",
+    "        print(\"-\", f)\n",
+    "\n",
+    "    print(\"\\nDuplicate Check:\")\n",
+    "    check_duplicates(features)\n",
+    "\n",
+    "    print(\"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "edc0890d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from Data.database.sql_connector import engine\n",
+    "\n",
+    "engine.dispose()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "0a231154",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2026-06-04 00:30:22,479 | INFO | Loading models and artifacts...\n",
+      "2026-06-04 00:30:22,481 | INFO | Loading model: all-MiniLM-L6-v2\n",
+      "2026-06-04 00:30:22,481 | INFO | Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n",
+      "e:\\gradution project\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:949: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n",
+      "2026-06-04 00:30:24,618 | INFO | Use pytorch device_name: cpu\n",
+      "2026-06-04 00:30:24,624 | INFO | Loading FAISS index...\n",
+      "2026-06-04 00:30:24,627 | INFO | Loading feature model: all-MiniLM-L6-v2\n",
+      "2026-06-04 00:30:24,628 | INFO | Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n",
+      "2026-06-04 00:30:26,763 | INFO | Use pytorch device_name: cpu\n",
+      "2026-06-04 00:30:26,767 | INFO | Loading metadata from Azure SQL...\n",
+      "2026-06-04 00:30:32,815 | INFO | Preparing query...\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "USING GEMINI FEATURE EXTRACTOR\n",
+      "CALLING GEMINI\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2026-06-04 00:30:36,816 | INFO | [LLM] model=gemini-3.1-flash-lite-preview | task=feature | attempt=1\n",
+      "2026-06-04 00:30:36,817 | INFO | AFC is enabled with max remote calls: 10.\n",
+      "2026-06-04 00:30:37,822 | INFO | HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-lite-preview:generateContent \"HTTP/1.1 200 OK\"\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PARSED FEATURES:\n",
+      "['appointment booking', 'patient records', 'doctor dashboard', 'ai chatbot']\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "eff76001187242a6a509b00507dae4ee",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2026-06-04 00:30:37,890 | INFO | Running semantic retrieval...\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a03cd362fbff43c2b60ee37fa346b9b3",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2026-06-04 00:30:37,995 | INFO | Running hybrid ranking...\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b9258e51c54f445a87adba34482d1627",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c9bba1f165354e5486b8c88ccaeef00e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "fc4577ed377747f3b87e810d02179ce9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "507fec89dc7643bb87467da4e0a3d874",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "235c224eec464cf796972ffbb4764179",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d64ac601101a43e59bfdcba31ca440de",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "dbc0825ced57497a96d822eb5f69d133",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8e1a98329d0948b39732408daa3d3d0f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "cacbca885c544c5dbd5ee851924c5e35",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c1624ce6d6a143eea18a16bcf2b6d598",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "48789bbc44a84be9b2574aae502457f6",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b12e1bc1c5f54918b7220c4d548c272c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d25ff7d6bcf04b88ad3f278fc1074ec0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3669c573bc0740d099cdea8534da1929",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a9b77ba859c247afb63cd9e13f6ec58f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "be0e6c66976c4ef88b04cddf583f5b75",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7beccc50a11349d4aa7eb3b83b33f9b7",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "952a0e024ec347b7ace0f2e33ec63fab",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "dcefd6ab863d484cb4edbf99dbf9bfce",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b33e2e9264a6485aa8f7558ceb1b72e3",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "da264b88b6304434af2e12621422ef53",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "549650cf75964ccfbe521e28eca314a9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "06a80f5a77f645e783a6601570b9bd38",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "db35a10c1302487ab1122c3a6a0d37c9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a10c675d23a245138b950e0203c37f05",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d3c9324e3f984f17acec7f24d552ec10",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c711d80cc018412a9437dacda5e046c4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "382264332377473682355df0537c205f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6c68bb5e2f914bc181f621974e099338",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "121a21c56f9f48849601032b46927682",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c942dadc11284abb8caa847d765222d5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ac1b3107c77b4e6c9515dd8925d388ce",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "533faf5886374abc9f127b15ae388739",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1bc595ace2a14c02909f3f0f8b09148a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4e61ceaef6384da5a0b761bd5ee69165",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3e27cc1a6a514cedb1b295198ee2c3af",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "433c7f69da8c4bc199455170aa52abf8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "314054bcb00d4cd7bc4c4ca86409d751",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5838180e1b4849e997ef24b8ca304a6c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ba66fd00bbe442bd99b47a4eaba434b4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>project_title</th>\n",
+       "      <th>semantic_score</th>\n",
+       "      <th>feature_score</th>\n",
+       "      <th>coverage</th>\n",
+       "      <th>hybrid_score</th>\n",
+       "      <th>duplicate_risk</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Detecting Diseases Using Chatbot and Booking C...</td>\n",
+       "      <td>0.7480</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.05</td>\n",
+       "      <td>Very Low</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Clinical Information System</td>\n",
+       "      <td>0.6479</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.05</td>\n",
+       "      <td>Very Low</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Doctor 4 U</td>\n",
+       "      <td>0.6437</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.05</td>\n",
+       "      <td>Very Low</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Health Care Management System</td>\n",
+       "      <td>0.6402</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.05</td>\n",
+       "      <td>Very Low</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Hospital Management System</td>\n",
+       "      <td>0.6397</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.05</td>\n",
+       "      <td>Very Low</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                       project_title  semantic_score  \\\n",
+       "0  Detecting Diseases Using Chatbot and Booking C...          0.7480   \n",
+       "1                        Clinical Information System          0.6479   \n",
+       "2                                         Doctor 4 U          0.6437   \n",
+       "3                      Health Care Management System          0.6402   \n",
+       "4                         Hospital Management System          0.6397   \n",
+       "\n",
+       "   feature_score  coverage  hybrid_score duplicate_risk  \n",
+       "0            0.0       0.0          0.05       Very Low  \n",
+       "1            0.0       0.0          0.05       Very Low  \n",
+       "2            0.0       0.0          0.05       Very Low  \n",
+       "3            0.0       0.0          0.05       Very Low  \n",
+       "4            0.0       0.0          0.05       Very Low  "
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "results = find_similar_projects(\n",
+    "    title=\"AI Clinic Management System\",\n",
+    "    description=\"\"\"\n",
+    "    Smart clinic management platform with\n",
+    "    appointment booking,\n",
+    "    patient records,\n",
+    "    doctor dashboard,\n",
+    "    AI chatbot.\n",
+    "    \"\"\",\n",
+    "    top_k=5\n",
+    ")\n",
+    "\n",
+    "results[[\n",
+    "    \"project_title\",\n",
+    "    \"semantic_score\",\n",
+    "    \"feature_score\",\n",
+    "    \"coverage\",\n",
+    "    \"hybrid_score\",\n",
+    "    \"duplicate_risk\"\n",
+    "]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "5ab1315b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e3c94f184d4f485c871ada26ed9f5abc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "13e2f339bdd544949ec9a26f472a95ef",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'score': 0.8726, 'coverage': 0.8, 'shared_count': 4, 'matches': [{'feature_a': 'appointment booking', 'feature_b': 'booking doctor appointments', 'score': 0.821}, {'feature_a': 'patient records', 'feature_b': 'medical records', 'score': 0.895}, {'feature_a': 'doctor dashboard', 'feature_b': 'doctor dashboard', 'score': 1.0}, {'feature_a': 'ai chatbot', 'feature_b': 'intelligent chatbot', 'score': 0.899}], 'unique_a': ['clinic management'], 'unique_b': ['hospital management']}\n"
+     ]
+    }
+   ],
+   "source": [
+    "project_a = [\n",
+    "    \"appointment booking\",\n",
+    "    \"patient records\",\n",
+    "    \"doctor dashboard\",\n",
+    "    \"ai chatbot\",\n",
+    "    \"clinic management\"\n",
+    "]\n",
+    "\n",
+    "project_b = [\n",
+    "    \"booking doctor appointments\",\n",
+    "    \"medical records\",\n",
+    "    \"doctor dashboard\",\n",
+    "    \"intelligent chatbot\",\n",
+    "    \"hospital management\"\n",
+    "]\n",
+    "\n",
+    "result = compute_feature_similarity(\n",
+    "    project_a,\n",
+    "    project_b\n",
+    ")\n",
+    "\n",
+    "print(result)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "9f571cb2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "82.25\n"
+     ]
+    }
+   ],
+   "source": [
+    "from src.similarity_model import compute_originality\n",
+    "\n",
+    "print(\n",
+    "    compute_originality(\n",
+    "        hybrid_score=0.30,\n",
+    "        unique_query_features=7,\n",
+    "        total_query_features=8\n",
+    "    )\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "53eeed12",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2026-06-04 00:30:41,636 | INFO | Loading processed dataset from Azure SQL...\n",
+      "2026-06-04 00:30:46,601 | INFO | Loading embedding model: all-MiniLM-L6-v2\n",
+      "2026-06-04 00:30:46,602 | INFO | Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n",
+      "e:\\gradution project\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:949: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n",
+      "2026-06-04 00:30:49,233 | INFO | Use pytorch device_name: cpu\n",
+      "2026-06-04 00:30:49,243 | INFO | Generating embeddings for 255 projects...\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f40167a736a840a6bd04e2b85b18c92d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/4 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2026-06-04 00:31:05,278 | INFO | FAISS index built successfully with 255 vectors.\n",
+      "2026-06-04 00:31:05,299 | INFO | Artifacts saved to models\n",
+      "2026-06-04 00:31:05,301 | INFO | Embedding engine completed successfully.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Completed\n"
+     ]
+    }
+   ],
+   "source": [
+    "from src.similarity_model.embedding_engine import (\n",
+    "    train_embedding_engine\n",
+    ")\n",
+    "\n",
+    "engine = train_embedding_engine()\n",
+    "\n",
+    "print(\"Training Completed\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "94ebeacc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2026-06-04 00:31:05,325 | INFO | Loading embedding model: all-MiniLM-L6-v2\n",
+      "2026-06-04 00:31:05,327 | INFO | Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n",
+      "e:\\gradution project\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:949: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n",
+      "2026-06-04 00:31:07,549 | INFO | Use pytorch device_name: cpu\n",
+      "2026-06-04 00:31:07,583 | INFO | Artifacts loaded successfully.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4c7332342b3d4027b4960c9256eea984",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   project_id                                          title technologies  \\\n",
+      "0         105                     Hospital Management System                \n",
+      "1          47                    Clinical Information System                \n",
+      "2         110                  Health Care Management System                \n",
+      "3          62                                     Doctor 4 U                \n",
+      "4         112  health services & medical outcomes monitoring                \n",
+      "\n",
+      "   similarity_score  \n",
+      "0            0.8216  \n",
+      "1            0.6907  \n",
+      "2            0.6779  \n",
+      "3            0.5829  \n",
+      "4            0.5801  \n"
+     ]
+    }
+   ],
+   "source": [
+    "from src.similarity_model.embedding_engine import ProjectEmbedder\n",
+    "\n",
+    "engine = ProjectEmbedder()\n",
+    "engine.load_artifacts()\n",
+    "\n",
+    "results = engine.search(\n",
+    "    \"hospital management system with appointment booking and patient records\",\n",
+    "    k=5\n",
+    ")\n",
+    "\n",
+    "print(results)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "8e5b3729",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e73c8cda22e6469cb5aa1b9620abe390",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "26ab4a1c4e28402487c2bd7ce8558359",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'score': 0.8866, 'coverage': 1.0, 'shared_count': 1, 'matches': [{'feature_a': 'machine learning system', 'feature_b': 'machine learning platform', 'score': 0.838}], 'unique_a': [], 'unique_b': ['ml analytics']}\n"
+     ]
+    }
+   ],
+   "source": [
+    "result = compute_feature_similarity(\n",
+    "    [\n",
+    "        \"machine learning system\",\n",
+    "        \"machine learning prediction\",\n",
+    "        \"machine learning analysis\"\n",
+    "    ],\n",
+    "    [\n",
+    "        \"machine learning platform\",\n",
+    "        \"predictive machine learning\",\n",
+    "        \"ml analytics\"\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "print(result)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "3f0b789e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.05\n"
+     ]
+    }
+   ],
+   "source": [
+    "from src.similarity_model.hybrid_ranker import (\n",
+    "    compute_hybrid_score\n",
+    ")\n",
+    "\n",
+    "print(\n",
+    "    compute_hybrid_score(\n",
+    "        semantic_score=0.95,\n",
+    "        feature_score=0.0,\n",
+    "        coverage=0.0,\n",
+    "        feature_count=5,\n",
+    "        unique_query_count=5\n",
+    "    )\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "5c2e1ed5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "id:\n",
+      "207\n",
+      "\n",
+      "submitted_at:\n",
+      "NaT\n",
+      "\n",
+      "project_title:\n",
+      "Smart Library\n",
+      "\n",
+      "student_names:\n",
+      "Abdel Hamid Abdel Nasser, Mahmoud Tamer Mahmoud, Amer Saed Mohamed Ali Amer, Tahany Adel Faragallah, Hala Ahmed Saad Salem, Mohamed Khaled Mohamed\n",
+      "\n",
+      "year:\n",
+      "2022\n",
+      "\n",
+      "abstract:\n",
+      "Egypt is striving and our efforts are focused these days towards digital transformation and the nationalization of all its government facilities, including the higher education sector. With more than 4 million university students and up to 644, 000 graduates annually, we need smart digital systems that support the educational process and scientific research. Therefore, we have developed a smart library application that takes care of books, recommendations, and user opinions, and provides the appropriate electronic environment for university students to find and nominate appropriate books through an electronic application based on artificial intelligence. Where, using artificial intelligence algorithms, the application will analyze book data and student data together to choose the most appropriate scientific content, in addition to the chatbot is designed to intelligently simulate human conversations. Finally, the smart library provides books to students faster and easier, and encourages them to read and benefit from their information, and the presence of suggestions for similar books will make them not stop reading and expand their horizons, and also the presence of a chatbot will increase the ease of access to books.\n",
+      "\n",
+      "description:\n",
+      "The Smart Library project is a digital platform designed to modernize university library systems in Egypt. It integrates AI-driven book recommendations, an interactive chatbot for user assistance, social groups for collaborative reading, and a QR-code-based borrowing system to streamline library operations and improve student access to academic resources.\n",
+      "\n",
+      "problem_statement:\n",
+      "University libraries in Egypt face delays in digital transformation, relying on traditional, non-interactive systems. This leads to inefficient resource usage, difficulty for students in finding relevant academic materials, and a lack of engagement, ultimately hindering the educational process.\n",
+      "\n",
+      "proposed_solution:\n",
+      "The project proposes an AI-powered smart library application that features a machine learning recommendation engine, an intelligent chatbot for conversational support, social networking features for students, and a QR-code system for automated book borrowing and management.\n",
+      "\n",
+      "objectives:\n",
+      "1. Provide accurate and reliable functionality.\n",
+      "2. Provide interactive educational tools and resources.\n",
+      "3. Improve decision-making using artificial intelligence techniques.\n",
+      "4. Implement intelligent AI-based functionalities.\n",
+      "5. Improve system performance and reliability.\n",
+      "6. Improve learning experience and educational accessibility.\n",
+      "\n",
+      "full_content:\n",
+      "Smart Library. Smart Library. Egypt is striving and our efforts are focused these days towards digital transformation and the nationalization of all its government facilities, including the higher education sector. With more than 4 million university students and up to 644, 000 graduates annually, we need smart digital systems that support the educational process and scientific research. Therefore, we have developed a smart library application that takes care of books, recommendations, and user opinions, and provides the appropriate electronic environment for university students to find and nominate appropriate books through an electronic application based on artificial intelligence. Where, using artificial intelligence algorithms, the application will analyze book data and student data together to choose the most appropriate scientific content, in addition to the chatbot is designed to intelligently simulate human conversations. Finally, the smart library provides books to students faster and easier, and encourages them to read and benefit from their information, and the presence of suggestions for similar books will make them not stop reading and expand their horizons, and also the presence of a chatbot will increase the ease of access to books.. The Smart Library project is a digital platform designed to modernize university library systems in Egypt. It integrates AI-driven book recommendations, an interactive chatbot for user assistance, social groups for collaborative reading, and a QR-code-based borrowing system to streamline library operations and improve student access to academic resources.\n",
+      "\n",
+      "clean_text:\n",
+      "smart library. smart library. egypt is striving and our efforts are focused these days towards digital transformation and the nationalization of all its government facilities including the higher education sector. with more than 4 million university students and up to 644 000 graduates annually we need smart digital systems that support the educational process and scientific research. therefore we have developed a smart library application that takes care of books recommendations and user opinions and provides the appropriate electronic environment for university students to find and nominate appropriate books through an electronic application based on artificial intelligence. where using artificial intelligence algorithms the application will analyze book data and student data together to choose the most appropriate scientific content in addition to the chatbot is designed to intelligently simulate human conversations. finally the smart library provides books to students faster and easier and encourages them to read and benefit from their information and the presence of suggestions for similar books will make them not stop reading and expand their horizons and also the presence of a chatbot will increase the ease of access to books.. the smart library project is a digital platform designed to modernize university library systems in egypt. it integrates ai-driven book recommendations an interactive chatbot for user assistance social groups for collaborative reading and a qr-code-based borrowing system to streamline library operations and improve student access to academic resources.\n",
+      "\n",
+      "word_count:\n",
+      "233\n",
+      "\n",
+      "features:\n",
+      "\"\\\"[\\\\\\\"Artificial intelligence algorithms\\\\\\\", \\\\\\\"AI-driven book recommendations\\\\\\\", \\\\\\\"Interactive chatbot\\\\\\\", \\\\\\\"Social groups for collaborative reading\\\\\\\", \\\\\\\"QR-code-based borrowing system\\\\\\\"]\\\"\"\n"
+     ]
+    }
+   ],
+   "source": [
+    "row = clean_df[\n",
+    "    clean_df[\"project_title\"] == \"Smart Library\"\n",
+    "].iloc[0]\n",
+    "\n",
+    "for column in clean_df.columns:\n",
+    "    print(f\"\\n{column}:\")\n",
+    "    print(row[column])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7f64358c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

README.md CHANGED Viewed

@@ -1,11 +1,379 @@
 ---
-title: Graduation Project-v1.2
-emoji: 📊
-colorFrom: purple
-colorTo: green
-sdk: docker
-pinned: false
-app_port: 7860
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# 🤖 AI-Powered Graduation Project Recommendation System
+## 📌 Overview
+This project implements an intelligent AI-powered recommendation and semantic similarity platform for graduation projects using:
+* Natural Language Processing (NLP)
+* Semantic Search
+* Vector Embeddings
+* Hybrid Ranking Systems
+* Large Language Models (LLMs)
+The system helps students:
+* discover unique graduation project ideas
+* avoid duplicate projects
+* analyze originality
+* generate intelligent project features
+* receive context-aware recommendations through an AI chatbot
+---
+# ⚙️ System Pipeline
+## 1️⃣ Data Preprocessing
+* Text normalization
+* Duplicate removal
+* Smart content merging
+* Technical keyword extraction
+* Feature engineering
+## 2️⃣ Feature Extraction
+* KeyBERT-based keyword extraction
+* Automatic technical term detection
+* Semantic feature generation
+## 3️⃣ Embedding Generation
+* SentenceTransformer embeddings
+* Normalized vector representations
+* Semantic encoding of projects
+## 4️⃣ Semantic Retrieval
+* FAISS vector indexing
+* Nearest-neighbor semantic search
+* Fast project similarity lookup
+## 5️⃣ Hybrid Ranking
+The final ranking combines:
+* Semantic similarity
+* Feature similarity
+* Coverage ratio
+* Confidence estimation
+* Originality analysis
+## 6️⃣ AI Recommendation Engine
+* Context-aware project generation
+* Feature recommendation
+* Novelty checking
+* Conversational chatbot assistance
+---
+# 🧠 AI & NLP Technologies Used
+## 🔹 Machine Learning & NLP
+* SentenceTransformers
+* KeyBERT
+* Scikit-learn
+* SciPy
+* FAISS
+## 🔹 LLM Integration
+* Google Gemini API
+* Ollama
+* Mistral
+## 🔹 Backend & Infrastructure
+* FastAPI
+* Pandas
+* NumPy
+* Python
 ---
+# 🏗️ Project Architecture
+```text
+User Query
+    ↓
+Intent Classification
+    ↓
+Context Builder
+    ↓
+Feature Extraction
+    ↓
+Embedding Generation
+    ↓
+FAISS Semantic Search
+    ↓
+Hybrid Ranking Engine
+    ↓
+Originality & Duplicate Analysis
+    ↓
+AI Recommendation Response
+```
+---
+# 🔍 Similarity Engine Workflow
+```text
+Raw Dataset
+    ↓
+Preprocessing
+    ↓
+Feature Extraction
+    ↓
+Sentence Embeddings
+    ↓
+FAISS Indexing
+    ↓
+Semantic Retrieval
+    ↓
+Feature Similarity Matching
+    ↓
+Hybrid Re-ranking
+    ↓
+Final Recommendation
+```
+---
+# 🚀 Features
+## ✅ AI Chatbot
+* Context-aware conversations
+* Intent classification
+* Domain-specific recommendations
+* Memory-aware responses
+## ✅ Semantic Similarity Search
+* Embedding-based retrieval
+* Semantic duplicate detection
+* Vector search with FAISS
+## ✅ Hybrid Recommendation System
+* Multi-stage ranking pipeline
+* Feature-level semantic comparison
+* Adaptive scoring strategy
+## ✅ Originality Detection
+* Duplicate risk analysis
+* Originality scoring
+* Similarity confidence estimation
+## ✅ Intelligent Feature Generation
+* AI-generated project features
+* Novelty-aware generation
+* Domain-aware recommendations
+---
+# 📊 Evaluation
+The system includes:
+* Self-retrieval evaluation
+* Real-query testing
+* Hybrid ranking validation
+* Confidence scoring
+### Evaluation Metrics
+* Semantic Similarity Score
+* Hybrid Score
+* Originality Score
+* Confidence Score
+* Duplicate Risk Classification
+---
+# 📁 Project Structure
+```text
+GRADUATION_PROJECT/
+│
+├── api/                         # FastAPI backend
+│
+├── Data/
+│   ├── raw/                    # Original dataset
+│   └── processed/              # Cleaned dataset
+│
+├── models/                     # FAISS index & metadata
+│
+├── Notebooks/
+│   └── TEST.ipynb              # Training & evaluation notebook
+│
+├── src/
+│   ├── recommendation_engine/  # Chatbot & recommendation logic
+│   ├── services/               # Scheduler & project sync jobs
+│   └── similarity_model/       # Semantic search engine
+│
+├── requirements.txt
+├── README.md
+└── .gitignore
+```
 ---
+# 🧩 Recommendation Engine Modules
+## recommendation_engine/
+Contains:
+* Chatbot engine
+* Intent classification
+* Prompt building
+* Idea generation
+* Feature generation
+* Memory management
+* Novelty checking
+* Response formatting
+---
+# 🔬 Similarity Model Modules
+## similarity_model/
+Contains:
+* Semantic search
+* Embedding engine
+* Hybrid ranker
+* Feature similarity engine
+* Preprocessing pipeline
+* Evaluation framework
+---
+# ⚡ Installation
+## 1️⃣ Clone Repository
+```bash
+git clone https://github.com/YOUR_USERNAME/YOUR_REPOSITORY.git
+cd YOUR_REPOSITORY
+```
+---
+## 2️⃣ Create Virtual Environment
+### Windows
+```bash
+python -m venv .venv
+.venv\Scripts\activate
+```
+### Linux / Mac
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+```
+---
+## 3️⃣ Install Dependencies
+```bash
+pip install -r requirements.txt
+```
+---
+# 🔑 Environment Variables
+Create a `.env` file in the project root (keep it out of version control):
+```env
+GEMINI_API_KEY=your_api_key_here
+```
+---
+# ▶️ Running The Project
+## Run FastAPI Server
+```bash
+uvicorn api.main:app --reload
+```
+---
+## Run Notebook
+```bash
+jupyter notebook
+```
+Open:
+```text
+Notebooks/TEST.ipynb
+```
+---
+# 💡 Example Query
+## Input
+```text
+AI-based smart library recommendation platform
+```
+## Output
+* Similar graduation projects
+* Semantic similarity scores
+* Originality analysis
+* Duplicate risk estimation
+* Recommended features
+---
+# 🎯 Future Improvements
+* Full RAG integration
+* Multi-agent orchestration
+* GPU acceleration
+* Advanced evaluation metrics
+* Real-time deployment
+* Database persistence
+* Frontend dashboard
+---
+# 📚 Research Areas Covered
+* Natural Language Processing (NLP)
+* Semantic Search
+* Recommendation Systems
+* Vector Databases
+* Conversational AI
+* Information Retrieval
+* Hybrid Ranking Systems
+* Large Language Models (LLMs)
+---
+# 👨‍💻 Author
+Yossef Assem
+---
+# 📄 License
+This project is for educational and research purposes.

api/__pycache__/__init__.cpython-311.pyc CHANGED Viewed

Binary files a/api/__pycache__/__init__.cpython-311.pyc and b/api/__pycache__/__init__.cpython-311.pyc differ

api/__pycache__/main.cpython-311.pyc CHANGED Viewed

Binary files a/api/__pycache__/main.cpython-311.pyc and b/api/__pycache__/main.cpython-311.pyc differ

api/__pycache__/schemas.cpython-311.pyc CHANGED Viewed

Binary files a/api/__pycache__/schemas.cpython-311.pyc and b/api/__pycache__/schemas.cpython-311.pyc differ

api/__pycache__/services.cpython-311.pyc CHANGED Viewed

Binary files a/api/__pycache__/services.cpython-311.pyc and b/api/__pycache__/services.cpython-311.pyc differ

models/faiss_index.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cfe09be00eb70151711b1603dcd8a2b67c102f4218647b2a5bb405a2a1932863
-size 392266

 version https://git-lfs.github.com/spec/v1
+oid sha256:469af18f0e06e31e389d476e47e643626f8ddfd69b593f299c78932080b5c858
+size 393810

models/metadata.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:40d299cdc20636faf33e0acd77ca5b5322ef3de7e6e539b183a25f4e6bdf96cc
-size 794293

 version https://git-lfs.github.com/spec/v1
+oid sha256:fa65da476454398a5d7b124e338e0e4bc2c8015258b73e09fa28cad448f9a420
+size 773665

models/project_embeddings.npy CHANGED Viewed

Binary files a/models/project_embeddings.npy and b/models/project_embeddings.npy differ

requirements.txt CHANGED Viewed

@@ -14,4 +14,4 @@ google-genai
 requests
 pyarrow
 sqlalchemy
-pyodbc

 requests
 pyarrow
 sqlalchemy
+pyodbc

src/recommendation_engine/__pycache__/__init__.cpython-311.pyc CHANGED Viewed

Binary files a/src/recommendation_engine/__pycache__/__init__.cpython-311.pyc and b/src/recommendation_engine/__pycache__/__init__.cpython-311.pyc differ

src/recommendation_engine/__pycache__/chatbot_engine.cpython-311.pyc CHANGED Viewed

Binary files a/src/recommendation_engine/__pycache__/chatbot_engine.cpython-311.pyc and b/src/recommendation_engine/__pycache__/chatbot_engine.cpython-311.pyc differ

src/recommendation_engine/__pycache__/command_handler.cpython-311.pyc CHANGED Viewed

Binary files a/src/recommendation_engine/__pycache__/command_handler.cpython-311.pyc and b/src/recommendation_engine/__pycache__/command_handler.cpython-311.pyc differ

src/recommendation_engine/__pycache__/config.cpython-311.pyc CHANGED Viewed

Binary files a/src/recommendation_engine/__pycache__/config.cpython-311.pyc and b/src/recommendation_engine/__pycache__/config.cpython-311.pyc differ

src/recommendation_engine/__pycache__/context_builder.cpython-311.pyc CHANGED Viewed

Binary files a/src/recommendation_engine/__pycache__/context_builder.cpython-311.pyc and b/src/recommendation_engine/__pycache__/context_builder.cpython-311.pyc differ

src/recommendation_engine/__pycache__/feature_generator.cpython-311.pyc CHANGED Viewed

Binary files a/src/recommendation_engine/__pycache__/feature_generator.cpython-311.pyc and b/src/recommendation_engine/__pycache__/feature_generator.cpython-311.pyc differ

src/recommendation_engine/__pycache__/full_project_generator.cpython-311.pyc CHANGED Viewed

Binary files a/src/recommendation_engine/__pycache__/full_project_generator.cpython-311.pyc and b/src/recommendation_engine/__pycache__/full_project_generator.cpython-311.pyc differ

src/recommendation_engine/__pycache__/idea_generator.cpython-311.pyc CHANGED Viewed

Binary files a/src/recommendation_engine/__pycache__/idea_generator.cpython-311.pyc and b/src/recommendation_engine/__pycache__/idea_generator.cpython-311.pyc differ

src/recommendation_engine/__pycache__/llm_client.cpython-311.pyc CHANGED Viewed

Binary files a/src/recommendation_engine/__pycache__/llm_client.cpython-311.pyc and b/src/recommendation_engine/__pycache__/llm_client.cpython-311.pyc differ

src/recommendation_engine/__pycache__/llm_router.cpython-311.pyc CHANGED Viewed

Binary files a/src/recommendation_engine/__pycache__/llm_router.cpython-311.pyc and b/src/recommendation_engine/__pycache__/llm_router.cpython-311.pyc differ

src/recommendation_engine/__pycache__/memory_store.cpython-311.pyc CHANGED Viewed

Binary files a/src/recommendation_engine/__pycache__/memory_store.cpython-311.pyc and b/src/recommendation_engine/__pycache__/memory_store.cpython-311.pyc differ

src/recommendation_engine/__pycache__/novelty_checker.cpython-311.pyc CHANGED Viewed

Binary files a/src/recommendation_engine/__pycache__/novelty_checker.cpython-311.pyc and b/src/recommendation_engine/__pycache__/novelty_checker.cpython-311.pyc differ

src/recommendation_engine/__pycache__/prompt_builder.cpython-311.pyc CHANGED Viewed

Binary files a/src/recommendation_engine/__pycache__/prompt_builder.cpython-311.pyc and b/src/recommendation_engine/__pycache__/prompt_builder.cpython-311.pyc differ

src/recommendation_engine/__pycache__/response_formatter.cpython-311.pyc CHANGED Viewed

Binary files a/src/recommendation_engine/__pycache__/response_formatter.cpython-311.pyc and b/src/recommendation_engine/__pycache__/response_formatter.cpython-311.pyc differ

src/recommendation_engine/__pycache__/state_manager.cpython-311.pyc CHANGED Viewed

Binary files a/src/recommendation_engine/__pycache__/state_manager.cpython-311.pyc and b/src/recommendation_engine/__pycache__/state_manager.cpython-311.pyc differ

src/recommendation_engine/__pycache__/test.cpython-311.pyc CHANGED Viewed

Binary files a/src/recommendation_engine/__pycache__/test.cpython-311.pyc and b/src/recommendation_engine/__pycache__/test.cpython-311.pyc differ

src/recommendation_engine/__pycache__/validator.cpython-311.pyc CHANGED Viewed

Binary files a/src/recommendation_engine/__pycache__/validator.cpython-311.pyc and b/src/recommendation_engine/__pycache__/validator.cpython-311.pyc differ

src/recommendation_engine/llm_client.py CHANGED Viewed

@@ -170,12 +170,14 @@ def is_bad_response(text: str) -> bool:
 # =========================================
 def generate_text(
     prompt: str,
-    task: str = "chat"
 ) -> str:
     prompt = safe_prompt(prompt)
-    temperature = get_temperature(task)
     max_tokens = get_max_tokens(task)
     for model_name in MODEL_CANDIDATES:

 # =========================================
 def generate_text(
     prompt: str,
+    task: str = "chat",
+    temperature=None
 ) -> str:
     prompt = safe_prompt(prompt)
+    if temperature is None:
+        temperature = get_temperature(task)
     max_tokens = get_max_tokens(task)
     for model_name in MODEL_CANDIDATES:

src/services/scheduler.py ADDED Viewed

	@@ -0,0 +1,19 @@

+import time
+from src.services.sync_projects import sync_projects
+# Polling loop: call sync_projects(), then sleep 60 seconds, forever.
+# NOTE(review): this loop sits at module top level with no
+# `if __name__ == "__main__":` guard, so importing this module never
+# returns to the importer -- confirm it is only ever run as a script.
+while True:
+    try:
+        print("Checking for new projects...")
+        sync_projects()
+        print("Done")
+    except Exception as e:
+        # Report the failure and fall through to the sleep so the next
+        # cycle still runs instead of the loop terminating.
+        print("Error:", e)
+    time.sleep(60)

src/services/sync_projects.py ADDED Viewed

	@@ -0,0 +1,172 @@

+import json
+import pandas as pd
+from Data.database.sql_connector import engine
+from src.similarity_model.preprocessing import preprocess_dataset
+from src.similarity_model.embedding_engine import (
+    train_embedding_engine
+)
+def sync_projects():
+    projects_query = """
+    SELECT *
+    FROM Projects
+    WHERE Status IN (
+        'Completed',
+        'UnderReview',
+        'In_Progress'
+    )
+    """
+    projects_df = pd.read_sql(
+        projects_query,
+        engine
+    )
+    existing_df = pd.read_sql(
+        """
+        SELECT id
+        FROM PreProcessed_Projects
+        """,
+        engine
+    )
+    allowed_ids = set(
+        projects_df["Id"].tolist()
+    )
+    processed_ids = set(
+        existing_df["id"].tolist()
+    )
+    # ---------------------------------
+    # Remove projects no longer allowed
+    # ---------------------------------
+    ids_to_remove = (
+        processed_ids - allowed_ids
+    )
+    changed = False
+    if ids_to_remove:
+        ids_str = ",".join(
+            map(str, ids_to_remove)
+        )
+        with engine.begin() as conn:
+            conn.exec_driver_sql(
+                f"""
+                DELETE FROM PreProcessed_Projects
+                WHERE id IN ({ids_str})
+                """
+            )
+        print(
+            f"Removed {len(ids_to_remove)} projects"
+        )
+        changed = True
+    # ---------------------------------
+    # Add new projects
+    # ---------------------------------
+    new_projects = projects_df[
+        ~projects_df["Id"].isin(
+            processed_ids
+        )
+    ].copy()
+    if len(new_projects) > 0:
+        print(
+            f"Found {len(new_projects)} new projects"
+        )
+        processed_df = preprocess_dataset(
+            new_projects
+        )
+        processed_df = processed_df[
+            [
+                "id",
+                "submittedat",
+                "project_title",
+                "studentnames",
+                "year",
+                "abstract",
+                "description",
+                "problemstatement",
+                "proposedsolution",
+                "objectives",
+                "full_content",
+                "clean_text",
+                "word_count",
+                "features"
+            ]
+        ]
+        processed_df = processed_df.rename(
+            columns={
+                "submittedat": "submitted_at",
+                "studentnames": "student_names",
+                "problemstatement": "problem_statement",
+                "proposedsolution": "proposed_solution"
+            }
+        )
+        processed_df["features"] = (
+            processed_df["features"]
+            .apply(json.dumps)
+        )
+        processed_df.to_sql(
+            "PreProcessed_Projects",
+            engine,
+            if_exists="append",
+            index=False
+        )
+        print(
+            f"Processed and inserted {len(processed_df)} projects"
+        )
+        changed = True
+    else:
+        print(
+            "No new projects found"
+        )
+    # ---------------------------------
+    # Rebuild FAISS only if changed
+    # ---------------------------------
+    if changed:
+        print(
+            "Updating embeddings..."
+        )
+        train_embedding_engine()
+        print(
+            "Embeddings updated"
+        )
+    else:
+        print(
+            "No changes detected"
+        )
+if __name__ == "__main__":
+    sync_projects()

src/similarity_model/__pycache__/__init__.cpython-311.pyc CHANGED Viewed

Binary files a/src/similarity_model/__pycache__/__init__.cpython-311.pyc and b/src/similarity_model/__pycache__/__init__.cpython-311.pyc differ

src/similarity_model/__pycache__/embedding_engine.cpython-311.pyc CHANGED Viewed

Binary files a/src/similarity_model/__pycache__/embedding_engine.cpython-311.pyc and b/src/similarity_model/__pycache__/embedding_engine.cpython-311.pyc differ

src/similarity_model/__pycache__/feature_similarity.cpython-311.pyc CHANGED Viewed

Binary files a/src/similarity_model/__pycache__/feature_similarity.cpython-311.pyc and b/src/similarity_model/__pycache__/feature_similarity.cpython-311.pyc differ

src/similarity_model/__pycache__/hybrid_ranker.cpython-311.pyc CHANGED Viewed

Binary files a/src/similarity_model/__pycache__/hybrid_ranker.cpython-311.pyc and b/src/similarity_model/__pycache__/hybrid_ranker.cpython-311.pyc differ

src/similarity_model/__pycache__/preprocessing.cpython-311.pyc CHANGED Viewed

Binary files a/src/similarity_model/__pycache__/preprocessing.cpython-311.pyc and b/src/similarity_model/__pycache__/preprocessing.cpython-311.pyc differ

src/similarity_model/__pycache__/semantic_search.cpython-311.pyc CHANGED Viewed

Binary files a/src/similarity_model/__pycache__/semantic_search.cpython-311.pyc and b/src/similarity_model/__pycache__/semantic_search.cpython-311.pyc differ

src/similarity_model/__pycache__/similarity_engine.cpython-311.pyc CHANGED Viewed

Binary files a/src/similarity_model/__pycache__/similarity_engine.cpython-311.pyc and b/src/similarity_model/__pycache__/similarity_engine.cpython-311.pyc differ

src/similarity_model/embedding_engine.py CHANGED Viewed

@@ -119,12 +119,27 @@ class ProjectEmbedder:
         if TECH_COL not in self.metadata.columns:
             self.metadata[TECH_COL] = ""
         # weighted content:
         # title repeated twice
         rich_texts = (
-            self.metadata[TITLE_COL].fillna("").astype(str) + " " +
-            self.metadata[TITLE_COL].fillna("").astype(str) + " " +
-            self.metadata[TEXT_COL].fillna("").astype(str)
         ).tolist()
         embeddings = self.generate_embeddings(rich_texts)

         if TECH_COL not in self.metadata.columns:
             self.metadata[TECH_COL] = ""
+        FEATURE_COL = "features"
+        if FEATURE_COL not in self.metadata.columns:
+            self.metadata[FEATURE_COL] = ""
+        feature_text = (
+            self.metadata[FEATURE_COL]
+            .fillna("")
+            .astype(str)
+        )
         # weighted content:
         # title repeated twice
         rich_texts = (
+            self.metadata[TITLE_COL].fillna("").astype(str)
+            + " "
+            + self.metadata[TITLE_COL].fillna("").astype(str)
+            + " "
+            + self.metadata[TEXT_COL].fillna("").astype(str)
+            + " "
+            + feature_text
         ).tolist()
         embeddings = self.generate_embeddings(rich_texts)

src/similarity_model/feature_similarity.py CHANGED Viewed

@@ -27,7 +27,7 @@ logger = logging.getLogger(__name__)
 # =====================================================
 MODEL_NAME = "all-MiniLM-L6-v2"
-DEFAULT_THRESHOLD = 0.65
 SIMILARITY_WEIGHT = 0.70
 COVERAGE_WEIGHT = 0.30
@@ -110,7 +110,7 @@ def remove_redundant_features(features):
                 feat_words & existing
             ) / max(len(feat_words), 1)
-            if overlap >= 0.90:
                 redundant = True
                 break
@@ -247,11 +247,11 @@ def compute_feature_similarity(
         if shared_scores else 0.0
     )
-    max_len = max(len(fa), len(fb))
     coverage = (
-        len(matches) / max_len
-        if max_len > 0 else 0.0
     )
     final_score = (
@@ -260,6 +260,9 @@ def compute_feature_similarity(
         (COVERAGE_WEIGHT * coverage)
     )
     final_score = min(final_score, 1.0)
     matched_text_a = " ".join(

 # =====================================================
 MODEL_NAME = "all-MiniLM-L6-v2"
+DEFAULT_THRESHOLD = 0.80
 SIMILARITY_WEIGHT = 0.70
 COVERAGE_WEIGHT = 0.30
                 feat_words & existing
             ) / max(len(feat_words), 1)
+            if overlap >= 0.60:
                 redundant = True
                 break
         if shared_scores else 0.0
     )
+    min_len = min(len(fa), len(fb))
     coverage = (
+        len(matches) / min_len
+        if min_len > 0 else 0.0
     )
     final_score = (
         (COVERAGE_WEIGHT * coverage)
     )
+    if len(matches) == 0:
+        final_score = 0.0
     final_score = min(final_score, 1.0)
     matched_text_a = " ".join(

src/similarity_model/hybrid_ranker.py CHANGED Viewed

@@ -36,7 +36,7 @@ HIGH_FEATURE_WEIGHT = 0.45
 LOW_FEATURE_WEIGHT = 0.20
 BONUS_WEIGHT = 0.05
-MIN_HYBRID_SCORE = 0.35
 # =====================================================
 # Helpers
@@ -77,16 +77,16 @@ def get_dynamic_weights(
 # Score Engines
 # =====================================================
 def compute_hybrid_score(
-    semantic_score: float,
-    feature_score: float,
-    coverage: float,
-    feature_count: int
 ) -> float:
     semantic_score = clamp(semantic_score)
     feature_score = clamp(feature_score)
     coverage = clamp(coverage)
     # ==========================================
     # Strong feature overlap case
     # ==========================================
@@ -103,12 +103,30 @@ def compute_hybrid_score(
     # ==========================================
     # Normal scoring
     # ==========================================
     score = (
-        0.25 * semantic_score +
-        0.55 * feature_score +
-        0.20 * coverage
     )
     return round(clamp(score), 4)
@@ -118,22 +136,29 @@ def compute_originality(
     total_query_features: int
 ) -> float:
     """
-    Higher similarity => lower originality
-    More unique features => higher originality
     """
     hybrid_score = clamp(hybrid_score)
-    inverse_similarity = 1.0 - hybrid_score
     uniqueness_ratio = (
         unique_query_features / total_query_features
-        if total_query_features > 0 else 0.0
     )
-    originality = 1 - hybrid_score
-    return round(clamp(originality), 4)
 def compute_confidence(
@@ -159,16 +184,16 @@ def risk_label(score: float) -> str:
     Duplicate risk label.
     """
-    if score >= 0.85:
         return "Very High"
-    if score >= 0.70:
         return "High"
     if score >= 0.55:
         return "Medium"
-    if score >= 0.40:
         return "Low"
     return "Very Low"
@@ -209,25 +234,31 @@ def compare_single_candidate(
         feature_result["unique_a"]
     )
-    hybrid_score = compute_hybrid_score(
         semantic_score=semantic_score,
         feature_score=feature_score,
         coverage=coverage,
-        feature_count=total_query_features
     )
-    originality_score = compute_originality(
-        hybrid_score=hybrid_score,
-        unique_query_features=unique_query_count,
-        total_query_features=total_query_features
     )
     confidence_score = compute_confidence(
         semantic_score=semantic_score,
         feature_score=feature_score,
         coverage=coverage
     )
     return {
         "project_title":
             candidate_row.get(TITLE_COL, ""),

 LOW_FEATURE_WEIGHT = 0.20
 BONUS_WEIGHT = 0.05
+MIN_HYBRID_SCORE = 0.05
 # =====================================================
 # Helpers
 # Score Engines
 # =====================================================
 def compute_hybrid_score(
+    semantic_score,
+    feature_score,
+    coverage,
+    feature_count,
+    unique_query_count=0
 ) -> float:
     semantic_score = clamp(semantic_score)
     feature_score = clamp(feature_score)
     coverage = clamp(coverage)
     # ==========================================
     # Strong feature overlap case
     # ==========================================
     # ==========================================
     # Normal scoring
     # ==========================================
+    shared_ratio = (
+        (feature_count - unique_query_count)
+        / max(feature_count, 1)
+    )
     score = (
+        0.90 * (shared_ratio ** 2.0)
+        + 0.07 * feature_score
+        + 0.03 * semantic_score
+    )
+    # No feature overlap
+    # No feature overlap
+    if feature_score == 0 or coverage == 0:
+        return 0.03
+    shared_count = (
+        feature_count - unique_query_count
     )
+    # Near duplicate
+    if shared_count >= 6 and unique_query_count <= 1:
+        return 0.95
     return round(clamp(score), 4)
     total_query_features: int
 ) -> float:
     """
+    Originality Score (0-100)
+    - More unique features -> higher originality
+    - More similarity -> lower originality
     """
     hybrid_score = clamp(hybrid_score)
     uniqueness_ratio = (
         unique_query_features / total_query_features
+        if total_query_features > 0
+        else 0.0
     )
+    originality = (
+        0.70 * uniqueness_ratio +
+        0.30 * (1.0 - hybrid_score)
+    )
+    return round(
+        max(0.0, min(100.0, originality * 100)),
+        2
+    )
 def compute_confidence(
     Duplicate risk label.
     """
+    if score >= 0.90:
         return "Very High"
+    if score >= 0.75:
         return "High"
     if score >= 0.55:
         return "Medium"
+    if score >= 0.35:
         return "Low"
     return "Very Low"
         feature_result["unique_a"]
     )
+    base_similarity = compute_hybrid_score(
         semantic_score=semantic_score,
         feature_score=feature_score,
         coverage=coverage,
+        feature_count=total_query_features,
+        unique_query_count=unique_query_count
     )
+    originality_score = round(
+        (1.0 - base_similarity) * 100,
+        2
     )
+    hybrid_score = base_similarity
     confidence_score = compute_confidence(
         semantic_score=semantic_score,
         feature_score=feature_score,
         coverage=coverage
     )
+    print("=" * 50)
+    print("BASE SIMILARITY:", base_similarity)
+    print("ORIGINALITY:", originality_score)
+    print("FINAL SIMILARITY:", hybrid_score)
+    print("=" * 50)
     return {
         "project_title":
             candidate_row.get(TITLE_COL, ""),

src/similarity_model/llm_feature_extractor.py ADDED Viewed

	@@ -0,0 +1,223 @@

+import json
+import re
+import time
+from src.recommendation_engine.llm_client import generate_text
+def build_feature_prompt(text: str):
+    """Return the feature-extraction prompt with ``text`` appended as the project.
+
+    The result is the fixed instruction block followed by the project text
+    and a trailing newline. The instruction block asks for a JSON object of
+    the form ``{"features": [...]}``.
+    """
+    # Static part of the prompt. It is a plain (non-f) string, so the JSON
+    # braces in the "Format" example are written literally.
+    instructions = """
+You are an expert graduation project analyst.
+TASK:
+Extract only technical features explicitly mentioned in the project.
+RULES:
+- Do not invent anything.
+- Do not generate new ideas.
+- Extract technologies.
+- Extract algorithms.
+- Extract modules.
+- Extract engineering capabilities.
+- Ignore goals and benefits.
+Return between 2 and 10 features.
+If fewer than 5 technical features exist,
+return only the available features.
+Technologies such as:
+React,
+Angular,
+Vue,
+MySQL,
+PostgreSQL,
+MongoDB,
+Firebase,
+Django,
+Flask,
+Node.js
+must be included if explicitly mentioned.
+IMPORTANT:
+Every feature must be copied or paraphrased from the project text.
+Never infer, assume, or generate new capabilities.
+If a capability is not explicitly mentioned, do not include it.
+IMPORTANT:
+Do not extract:
+- goals
+- benefits
+- outcomes
+- business objectives
+- deployment statements
+Return SHORT canonical features.
+Maximum 4 words per feature.
+Examples:
+computer vision
+deep learning
+internet of things
+route optimization
+waste classification
+Do not include long descriptive phrases.
+IMPORTANT:
+Prefer domain-specific capabilities over generic AI terms.
+BAD:
+- machine learning
+- deep learning
+- neural networks
+GOOD:
+- waste classification
+- route optimization
+- bin fill level monitoring
+- waste generation forecasting
+If a specific capability exists, return it instead of the generic AI technology behind it.
+Maximum 4 words per feature.
+Only extract concrete technical capabilities.
+Return JSON only.
+Format:
+{
+  "features": [
+    "feature 1",
+    "feature 2"
+  ]
+}
+PROJECT:
+"""
+    # Interpolate through an f-string so that non-string input is formatted
+    # the same way as any other interpolated value.
+    return f"{instructions}{text}\n"
+def extract_features_llm(text: str):
+    """Ask the LLM for the features of ``text`` and return the parsed result.
+
+    Builds the prompt with ``build_feature_prompt``, sends it through
+    ``generate_text`` with ``task="feature"`` and ``temperature=0``, and
+    returns whatever ``parse_features_response`` produces from the reply.
+    """
+    print("CALLING GEMINI")
+    feature_prompt = build_feature_prompt(text)
+    # Fixed 4-second pause before every request.
+    # NOTE(review): presumably a crude rate limit for the LLM backend - confirm.
+    time.sleep(4)
+    raw_reply = generate_text(feature_prompt, task="feature", temperature=0)
+    return parse_features_response(raw_reply)
+# Verbose phrasings mapped onto one canonical feature name, so that variants
+# of the same feature collapse into a single entry after de-duplication.
+# Built once at import time instead of on every call.
+_FEATURE_NORMALIZATION_MAP = {
+    "computer vision techniques": "computer vision",
+    "deep learning models": "deep learning",
+    "deep neural networks": "deep learning",
+    "machine learning models": "machine learning",
+    "iot sensor networks": "internet of things",
+    "sensor networks": "internet of things",
+    "predictive trends": "predictive analytics",
+    "operational insights": "analytics",
+}
+def parse_features_response(response: str):
+    """Parse an LLM reply into a clean list of feature strings.
+
+    The reply is expected to contain a JSON object ``{"features": [...]}``,
+    optionally wrapped in a Markdown code fence or surrounded by other text.
+
+    Args:
+        response: Raw text returned by the LLM.
+
+    Returns:
+        Up to 10 lower-cased, normalized, de-duplicated features in their
+        original order. An empty list is returned when the reply is empty,
+        is not a string, is not valid JSON, or does not hold a list under
+        the ``"features"`` key.
+    """
+    if not response or not isinstance(response, str):
+        return []
+    # Remove an optional leading ``` / ```json fence and a trailing ``` fence.
+    payload = response.strip()
+    payload = re.sub(r"^```(?:json)?", "", payload, flags=re.I)
+    payload = re.sub(r"```$", "", payload).strip()
+    # Keep only the outermost {...} span so text around the JSON is ignored.
+    start = payload.find("{")
+    end = payload.rfind("}")
+    if start != -1 and end != -1:
+        payload = payload[start:end + 1]
+    # Only the JSON decoding is guarded: a malformed reply is an expected
+    # best-effort failure, while any other exception indicates a real bug
+    # and should not be swallowed.
+    try:
+        data = json.loads(payload)
+    except (ValueError, RecursionError) as e:
+        print("=" * 50)
+        print("JSON PARSE ERROR")
+        print(e)
+        print("=" * 50)
+        print("RAW RESPONSE:")
+        print(payload)
+        print("=" * 50)
+        return []
+    # The JSON root may legally be a list or a scalar; only an object can
+    # carry the "features" key.
+    if not isinstance(data, dict):
+        return []
+    features = data.get("features", [])
+    if not isinstance(features, list):
+        return []
+    cleaned = []
+    for item in features:
+        # Skip non-string entries (null, numbers, nested objects): coercing
+        # them with str() would create bogus features such as "none".
+        if not isinstance(item, str):
+            continue
+        feat = item.strip().lower()
+        feat = _FEATURE_NORMALIZATION_MAP.get(feat, feat)
+        # Drop empty and single-character entries.
+        if len(feat) < 2:
+            continue
+        cleaned.append(feat)
+    # dict.fromkeys removes duplicates while preserving first-seen order.
+    cleaned = list(dict.fromkeys(cleaned))
+    print("PARSED FEATURES:")
+    print(cleaned)
+    return cleaned[:10]