{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "49c6b17c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'e:\\\\gradution project'" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "\n", "# Directory the notebook works from. Set INNOTRACK_PROJECT_ROOT to run from a\n", "# checkout that lives elsewhere; the fallback is the original hard-coded path.\n", "# NOTE(review): presumably the later `src` / `Data` imports rely on this - confirm.\n", "PROJECT_ROOT = os.environ.get(\"INNOTRACK_PROJECT_ROOT\", \"/gradution project\")\n", "\n", "os.chdir(PROJECT_ROOT)\n", "os.getcwd()" ] }, { "cell_type": "code", "execution_count": 2, "id": "509448bd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " CONFIG LOADED:\n", "ENV: development\n", "DEBUG_MODE: True\n", "MODELS: ['gemini-3.1-flash-lite-preview', 'gemini-2.5-flash-lite', 'gemini-2.5-flash', 'gemini-2.5-pro']\n", "MAX_RETRIES: 3\n", "IDEA_TEMP: 0.9\n", "=================================\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2026-06-04 00:29:43,014 | INFO | Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n", "e:\\gradution project\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:949: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n", "2026-06-04 00:29:46,381 | INFO | Use pytorch device_name: cpu\n", "2026-06-04 00:29:46,388 | INFO | Loading faiss with AVX2 support.\n", "2026-06-04 00:29:46,418 | INFO | Successfully loaded faiss with AVX2 support.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "SQL Connected Successfully\n", "All modules imported successfully\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "from tqdm.notebook import tqdm\n", "\n", "from src.similarity_model import preprocess_dataset\n", "from src.similarity_model import train_embedding_engine\n", "from src.similarity_model import search_by_text\n", "from src.similarity_model import find_similar_projects\n", "from src.similarity_model import extract_features\n", "\n", "from src.similarity_model import normalize_text\n", "from src.similarity_model import compute_feature_similarity\n", "from Data.database.sql_connector import (\n", " load_preprocessed_projects,\n", " engine\n", ")\n", "\n", "print(\"All modules imported successfully\")" ] }, { "cell_type": "code", "execution_count": 33, "id": "0bf93b8e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Engine created\n" ] } ], "source": [ "import os\n", "import urllib.parse\n", "from getpass import getpass\n", "\n", "from sqlalchemy import create_engine\n", "\n", "# Connection settings are read from the environment so that no secret is\n", "# stored in the notebook; the non-secret values fall back to the previous\n", "# defaults. NOTE(review): the password that used to be hard-coded here has\n", "# been exposed in the notebook file and should be rotated.\n", "SERVER = os.environ.get(\n", "    \"INNOTRACK_SQL_SERVER\", \"innotrack-sql-server.database.windows.net\"\n", ")\n", "DATABASE = os.environ.get(\"INNOTRACK_SQL_DATABASE\", \"InnoTrackDB\")\n", "USERNAME = os.environ.get(\"INNOTRACK_SQL_USERNAME\", \"innotrackadmin\")\n", "# Prompt only when the variable is unset, so unattended runs can supply it.\n", "PASSWORD = os.environ.get(\"INNOTRACK_SQL_PASSWORD\") or getpass(\"SQL password: \")\n", "\n", "# `import urllib.parse` (rather than a bare `import urllib`) guarantees that\n", "# the submodule is loaded before quote_plus is looked up.\n", "params = urllib.parse.quote_plus(\n", "    f\"DRIVER={{ODBC Driver 18 for SQL Server}};\"\n", "    f\"SERVER={SERVER};\"\n", "    f\"DATABASE={DATABASE};\"\n", "    f\"UID={USERNAME};\"\n", "    f\"PWD={PASSWORD};\"\n", "    \"Encrypt=yes;\"\n", "    \"TrustServerCertificate=no;\"\n", "    \"Connection Timeout=30;\"\n", ")\n", "\n", "# NOTE(review): this rebinds the `engine` name imported from\n", "# Data.database.sql_connector in the imports cell.\n", "engine = create_engine(\n", "    f\"mssql+pyodbc:///?odbc_connect={params}\"\n", ")\n", "\n", "print(\"Engine created\")" ] }, { "cell_type": 
"code", "execution_count": 4, "id": "11f40d1d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TABLE_NAME
0Teams
1ChatRooms
2ChatMessageHiddens
3JoinRequests
4ChatMessageReactions
5Projects
6TeamMembers
7ProjectTechnologies_Backup
8ChatMessages
9Feedbacks
10MissingProjectTechsSplit
11PreProcessed_Projects
12OriginalityReports
13ProjectAttachments
14ProjectTechnologies
15VectorEmbeddings
16ChatMessageAttachments
17SimilarProjects
18AuditLogs
19AcademicYears
20Schema
21Job
22State
23JobParameter
24JobQueue
25database_firewall_rules
26Server
27List
28Set
29Counter
30Hash
31AggregatedCounter
32__EFMigrationsHistory
33Departments
34Skills_Backup
35Projects_Backup
36Domains
37Skills
38Technologies
39Users
40ProjectDrafts
41Notifications
42ProjectDraftTechnologies
43StudentSkills
\n", "
" ], "text/plain": [ " TABLE_NAME\n", "0 Teams\n", "1 ChatRooms\n", "2 ChatMessageHiddens\n", "3 JoinRequests\n", "4 ChatMessageReactions\n", "5 Projects\n", "6 TeamMembers\n", "7 ProjectTechnologies_Backup\n", "8 ChatMessages\n", "9 Feedbacks\n", "10 MissingProjectTechsSplit\n", "11 PreProcessed_Projects\n", "12 OriginalityReports\n", "13 ProjectAttachments\n", "14 ProjectTechnologies\n", "15 VectorEmbeddings\n", "16 ChatMessageAttachments\n", "17 SimilarProjects\n", "18 AuditLogs\n", "19 AcademicYears\n", "20 Schema\n", "21 Job\n", "22 State\n", "23 JobParameter\n", "24 JobQueue\n", "25 database_firewall_rules\n", "26 Server\n", "27 List\n", "28 Set\n", "29 Counter\n", "30 Hash\n", "31 AggregatedCounter\n", "32 __EFMigrationsHistory\n", "33 Departments\n", "34 Skills_Backup\n", "35 Projects_Backup\n", "36 Domains\n", "37 Skills\n", "38 Technologies\n", "39 Users\n", "40 ProjectDrafts\n", "41 Notifications\n", "42 ProjectDraftTechnologies\n", "43 StudentSkills" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Enumerate the tables exposed through INFORMATION_SCHEMA on this connection.\n", "with engine.connect() as conn:\n", "    tables = pd.read_sql(\n", "        sql=\"\"\"\n", " SELECT TABLE_NAME\n", " FROM INFORMATION_SCHEMA.TABLES\n", " \"\"\",\n", "        con=conn,\n", "    )\n", "\n", "tables" ] }, { "cell_type": "code", "execution_count": 5, "id": "5d1125cb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsubmitted_atproject_titlestudent_namesyearabstractdescriptionproblem_statementproposed_solutionobjectivesfull_contentclean_textword_countfeatures
01NaT3D hand game for neuromuscular patientsAhmed Mansour Mohamed Saber, Ahmed Mohamed Moh...2017In this project we have designed and implement...A virtual rehabilitation system that uses a Le...Neuromuscular patients suffer from nerve atrop...The development of a 3D interactive game integ...1. Develop a scalable and maintainable solutio...3D hand game for neuromuscular patients. 3D ha...3d hand game for neuromuscular patients. 3d ha...172\"\\\"[\\\\\\\"Leap Motion controller sensor\\\\\\\", \\\\\\...
12NaT3D Laser ScanningAya Essam Hegazi, Asmaa Abd EL-Aziz, Ebtehal E...20243D scanning is used in many applications such ...This project implements a low-cost 3D laser sc...Existing 3D scanning devices are often extreme...A low-cost 3D laser scanning system that utili...1. Improve overall productivity and workflow o...3D Laser Scanning. 3D Laser Scanning. 3D scann...3d laser scanning. 3d laser scanning. 3d scann...185\"\\\"[\\\\\\\"3d laser scanning\\\\\\\", \\\\\\\"Hand-held l...
23NaTA Smart Automatic System for Criminal Identifi...Yousef Yacoub Mohammed, Ahmed Mohamed Hassan,\\...2020The increasing use of biometric technologies i...This project develops an automated criminal id...Traditional identification methods, such as ph...A real-time facial recognition system develope...1. Support future scalability and feature expa...A Smart Automatic System for Criminal Identifi...a smart automatic system for criminal identifi...138\"\\\"[\\\\\\\"real-time face recognition system\\\\\\\",...
34NaTAdvanced Educational Platform “ABSTHALK”Mohamed Nasser Maher, Karim Ashraf Salah Eldie...2025The Educational Platform for Students and Teac...ABSTHALK is a comprehensive, role-based e-lear...Traditional learning methods often lack access...The project proposes a structured, role-based,...1. Provide interactive educational tools and r...Advanced Educational Platform “ABSTHALK”. Adva...advanced educational platform absthalk . advan...192\"\\\"[\\\\\\\"Role-based management system\\\\\\\", \\\\\\\"...
45NaTAgricultural Information and Management SystemAhmed Mohamed, Omar Hassan, Mahmoud Ali ,Mazen...2020It is a permanent link between the decision-ma...This project is an integrated information syst...The competent authorities of the Ministry of A...The development of an integrated information s...1. Reduce operational complexity and improve e...Agricultural Information and Management System...agricultural information and management system...109\"\\\"[\\\\\\\"centralized database\\\\\\\", \\\\\\\"track la...
\n", "
" ], "text/plain": [ " id submitted_at project_title \\\n", "0 1 NaT 3D hand game for neuromuscular patients \n", "1 2 NaT 3D Laser Scanning \n", "2 3 NaT A Smart Automatic System for Criminal Identifi... \n", "3 4 NaT Advanced Educational Platform “ABSTHALK” \n", "4 5 NaT Agricultural Information and Management System \n", "\n", " student_names year \\\n", "0 Ahmed Mansour Mohamed Saber, Ahmed Mohamed Moh... 2017 \n", "1 Aya Essam Hegazi, Asmaa Abd EL-Aziz, Ebtehal E... 2024 \n", "2 Yousef Yacoub Mohammed, Ahmed Mohamed Hassan,\\... 2020 \n", "3 Mohamed Nasser Maher, Karim Ashraf Salah Eldie... 2025 \n", "4 Ahmed Mohamed, Omar Hassan, Mahmoud Ali ,Mazen... 2020 \n", "\n", " abstract \\\n", "0 In this project we have designed and implement... \n", "1 3D scanning is used in many applications such ... \n", "2 The increasing use of biometric technologies i... \n", "3 The Educational Platform for Students and Teac... \n", "4 It is a permanent link between the decision-ma... \n", "\n", " description \\\n", "0 A virtual rehabilitation system that uses a Le... \n", "1 This project implements a low-cost 3D laser sc... \n", "2 This project develops an automated criminal id... \n", "3 ABSTHALK is a comprehensive, role-based e-lear... \n", "4 This project is an integrated information syst... \n", "\n", " problem_statement \\\n", "0 Neuromuscular patients suffer from nerve atrop... \n", "1 Existing 3D scanning devices are often extreme... \n", "2 Traditional identification methods, such as ph... \n", "3 Traditional learning methods often lack access... \n", "4 The competent authorities of the Ministry of A... \n", "\n", " proposed_solution \\\n", "0 The development of a 3D interactive game integ... \n", "1 A low-cost 3D laser scanning system that utili... \n", "2 A real-time facial recognition system develope... \n", "3 The project proposes a structured, role-based,... \n", "4 The development of an integrated information s... \n", "\n", " objectives \\\n", "0 1. 
Develop a scalable and maintainable solutio... \n", "1 1. Improve overall productivity and workflow o... \n", "2 1. Support future scalability and feature expa... \n", "3 1. Provide interactive educational tools and r... \n", "4 1. Reduce operational complexity and improve e... \n", "\n", " full_content \\\n", "0 3D hand game for neuromuscular patients. 3D ha... \n", "1 3D Laser Scanning. 3D Laser Scanning. 3D scann... \n", "2 A Smart Automatic System for Criminal Identifi... \n", "3 Advanced Educational Platform “ABSTHALK”. Adva... \n", "4 Agricultural Information and Management System... \n", "\n", " clean_text word_count \\\n", "0 3d hand game for neuromuscular patients. 3d ha... 172 \n", "1 3d laser scanning. 3d laser scanning. 3d scann... 185 \n", "2 a smart automatic system for criminal identifi... 138 \n", "3 advanced educational platform absthalk . advan... 192 \n", "4 agricultural information and management system... 109 \n", "\n", " features \n", "0 \"\\\"[\\\\\\\"Leap Motion controller sensor\\\\\\\", \\\\\\... \n", "1 \"\\\"[\\\\\\\"3d laser scanning\\\\\\\", \\\\\\\"Hand-held l... \n", "2 \"\\\"[\\\\\\\"real-time face recognition system\\\\\\\",... \n", "3 \"\\\"[\\\\\\\"Role-based management system\\\\\\\", \\\\\\\"... \n", "4 \"\\\"[\\\\\\\"centralized database\\\\\\\", \\\\\\\"track la... 
" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Pull the whole PreProcessed_Projects table into a DataFrame.\n", "query = \"\"\"\n", "SELECT *\n", "FROM PreProcessed_Projects\n", "\"\"\"\n", "\n", "df = pd.read_sql(sql=query, con=engine)\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 6, "id": "4429717d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['id', 'submitted_at', 'project_title', 'student_names', 'year', 'abstract', 'description', 'problem_statement', 'proposed_solution', 'objectives', 'full_content', 'clean_text', 'word_count', 'features']\n" ] } ], "source": [ "print(list(df.columns))" ] }, { "cell_type": "code", "execution_count": 7, "id": "9925da4c", "metadata": {}, "outputs": [], "source": [ "# Map capitalised column names onto their snake_case equivalents; rename\n", "# leaves the frame untouched for any name that is not present.\n", "df = df.rename(\n", "    columns=dict(\n", "        Title=\"project_title\",\n", "        Description=\"description\",\n", "        Abstract=\"abstract\",\n", "    )\n", ")" ] }, { "cell_type": "code", "execution_count": 8, "id": "fc62d4f3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsubmitted_atproject_titlestudent_namesyearabstractdescriptionproblem_statementproposed_solutionobjectivesfull_contentclean_textword_countfeatures
01NaT3D hand game for neuromuscular patientsAhmed Mansour Mohamed Saber, Ahmed Mohamed Moh...2017In this project we have designed and implement...A virtual rehabilitation system that uses a Le...Neuromuscular patients suffer from nerve atrop...The development of a 3D interactive game integ...1. Develop a scalable and maintainable solutio...3D hand game for neuromuscular patients. 3D ha...3d hand game for neuromuscular patients. 3d ha...172\"\\\"[\\\\\\\"Leap Motion controller sensor\\\\\\\", \\\\\\...
12NaT3D Laser ScanningAya Essam Hegazi, Asmaa Abd EL-Aziz, Ebtehal E...20243D scanning is used in many applications such ...This project implements a low-cost 3D laser sc...Existing 3D scanning devices are often extreme...A low-cost 3D laser scanning system that utili...1. Improve overall productivity and workflow o...3D Laser Scanning. 3D Laser Scanning. 3D scann...3d laser scanning. 3d laser scanning. 3d scann...185\"\\\"[\\\\\\\"3d laser scanning\\\\\\\", \\\\\\\"Hand-held l...
23NaTA Smart Automatic System for Criminal Identifi...Yousef Yacoub Mohammed, Ahmed Mohamed Hassan,\\...2020The increasing use of biometric technologies i...This project develops an automated criminal id...Traditional identification methods, such as ph...A real-time facial recognition system develope...1. Support future scalability and feature expa...A Smart Automatic System for Criminal Identifi...a smart automatic system for criminal identifi...138\"\\\"[\\\\\\\"real-time face recognition system\\\\\\\",...
34NaTAdvanced Educational Platform “ABSTHALK”Mohamed Nasser Maher, Karim Ashraf Salah Eldie...2025The Educational Platform for Students and Teac...ABSTHALK is a comprehensive, role-based e-lear...Traditional learning methods often lack access...The project proposes a structured, role-based,...1. Provide interactive educational tools and r...Advanced Educational Platform “ABSTHALK”. Adva...advanced educational platform absthalk . advan...192\"\\\"[\\\\\\\"Role-based management system\\\\\\\", \\\\\\\"...
45NaTAgricultural Information and Management SystemAhmed Mohamed, Omar Hassan, Mahmoud Ali ,Mazen...2020It is a permanent link between the decision-ma...This project is an integrated information syst...The competent authorities of the Ministry of A...The development of an integrated information s...1. Reduce operational complexity and improve e...Agricultural Information and Management System...agricultural information and management system...109\"\\\"[\\\\\\\"centralized database\\\\\\\", \\\\\\\"track la...
\n", "
" ], "text/plain": [ " id submitted_at project_title \\\n", "0 1 NaT 3D hand game for neuromuscular patients \n", "1 2 NaT 3D Laser Scanning \n", "2 3 NaT A Smart Automatic System for Criminal Identifi... \n", "3 4 NaT Advanced Educational Platform “ABSTHALK” \n", "4 5 NaT Agricultural Information and Management System \n", "\n", " student_names year \\\n", "0 Ahmed Mansour Mohamed Saber, Ahmed Mohamed Moh... 2017 \n", "1 Aya Essam Hegazi, Asmaa Abd EL-Aziz, Ebtehal E... 2024 \n", "2 Yousef Yacoub Mohammed, Ahmed Mohamed Hassan,\\... 2020 \n", "3 Mohamed Nasser Maher, Karim Ashraf Salah Eldie... 2025 \n", "4 Ahmed Mohamed, Omar Hassan, Mahmoud Ali ,Mazen... 2020 \n", "\n", " abstract \\\n", "0 In this project we have designed and implement... \n", "1 3D scanning is used in many applications such ... \n", "2 The increasing use of biometric technologies i... \n", "3 The Educational Platform for Students and Teac... \n", "4 It is a permanent link between the decision-ma... \n", "\n", " description \\\n", "0 A virtual rehabilitation system that uses a Le... \n", "1 This project implements a low-cost 3D laser sc... \n", "2 This project develops an automated criminal id... \n", "3 ABSTHALK is a comprehensive, role-based e-lear... \n", "4 This project is an integrated information syst... \n", "\n", " problem_statement \\\n", "0 Neuromuscular patients suffer from nerve atrop... \n", "1 Existing 3D scanning devices are often extreme... \n", "2 Traditional identification methods, such as ph... \n", "3 Traditional learning methods often lack access... \n", "4 The competent authorities of the Ministry of A... \n", "\n", " proposed_solution \\\n", "0 The development of a 3D interactive game integ... \n", "1 A low-cost 3D laser scanning system that utili... \n", "2 A real-time facial recognition system develope... \n", "3 The project proposes a structured, role-based,... \n", "4 The development of an integrated information s... \n", "\n", " objectives \\\n", "0 1. 
Develop a scalable and maintainable solutio... \n", "1 1. Improve overall productivity and workflow o... \n", "2 1. Support future scalability and feature expa... \n", "3 1. Provide interactive educational tools and r... \n", "4 1. Reduce operational complexity and improve e... \n", "\n", " full_content \\\n", "0 3D hand game for neuromuscular patients. 3D ha... \n", "1 3D Laser Scanning. 3D Laser Scanning. 3D scann... \n", "2 A Smart Automatic System for Criminal Identifi... \n", "3 Advanced Educational Platform “ABSTHALK”. Adva... \n", "4 Agricultural Information and Management System... \n", "\n", " clean_text word_count \\\n", "0 3d hand game for neuromuscular patients. 3d ha... 172 \n", "1 3d laser scanning. 3d laser scanning. 3d scann... 185 \n", "2 a smart automatic system for criminal identifi... 138 \n", "3 advanced educational platform absthalk . advan... 192 \n", "4 agricultural information and management system... 109 \n", "\n", " features \n", "0 \"\\\"[\\\\\\\"Leap Motion controller sensor\\\\\\\", \\\\\\... \n", "1 \"\\\"[\\\\\\\"3d laser scanning\\\\\\\", \\\\\\\"Hand-held l... \n", "2 \"\\\"[\\\\\\\"real-time face recognition system\\\\\\\",... \n", "3 \"\\\"[\\\\\\\"Role-based management system\\\\\\\", \\\\\\\"... \n", "4 \"\\\"[\\\\\\\"centralized database\\\\\\\", \\\\\\\"track la... 
" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "query = \"\"\"\n", "SELECT *\n", "FROM PreProcessed_Projects\n", "\"\"\"\n", "\n", "clean_df = pd.read_sql(query, engine)\n", "\n", "clean_df.head()" ] }, { "cell_type": "code", "execution_count": 9, "id": "e5af88d4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(255, 14)\n" ] } ], "source": [ "print(clean_df.shape)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "bb80639a", "metadata": {}, "outputs": [], "source": [ "import json\n", "\n", "\n", "def count_features(raw):\n", "    \"\"\"Return how many features a stored `features` value holds.\n", "\n", "    The column preview shows the list wrapped in several layers of JSON\n", "    string quoting, so the value is decoded until it is no longer a string.\n", "    NOTE(review): assumes the innermost value is a JSON list - confirm\n", "    against whatever writes PreProcessed_Projects.\n", "\n", "    Returns NaN when the value cannot be decoded into a list, so that\n", "    describe() skips it instead of raising.\n", "    \"\"\"\n", "    value = raw\n", "    while isinstance(value, str):\n", "        try:\n", "            value = json.loads(value)\n", "        except json.JSONDecodeError:\n", "            return float(\"nan\")\n", "    return len(value) if isinstance(value, list) else float(\"nan\")\n", "\n", "\n", "# len() on the raw string would count characters, not features.\n", "clean_df[\"features\"].apply(count_features).describe()" ] }, { "cell_type": "code", "execution_count": 11, "id": "633cfec4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Saved cleaned dataset\n" ] } ], "source": [ "# Create the export directory first so a fresh checkout does not fail here.\n", "os.makedirs(\"Data_gemini\", exist_ok=True)\n", "\n", "clean_df.to_parquet(\"Data_gemini/projects_clean_gemini.parquet\", index=False)\n", "clean_df.to_csv(\"Data_gemini/projects_clean_gemini.csv\", index=False)\n", "\n", "print(\"Saved cleaned dataset\")" ] }, { "cell_type": "code", "execution_count": 12, "id": "36f84432", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(255, 14)\n" ] } ], "source": [ "test_df = pd.read_parquet(\n", " \"Data_gemini/projects_clean_gemini.parquet\"\n", ")\n", "\n", "print(test_df.shape)" ] }, { "cell_type": "code", "execution_count": 13, "id": "0dd86aec", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['id', 'submitted_at', 'project_title', 'student_names', 'year', 'abstract', 'description', 'problem_statement', 'proposed_solution', 'objectives', 'full_content', 'clean_text', 'word_count', 'features']\n" ] } ], 
"source": [ "print(clean_df.columns.tolist())" ] }, { "cell_type": "code", "execution_count": 14, "id": "e3e96549", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsubmitted_atproject_titlestudent_namesyearabstractdescriptionproblem_statementproposed_solutionobjectivesfull_contentclean_textword_countfeatures
01None3D hand game for neuromuscular patientsAhmed Mansour Mohamed Saber, Ahmed Mohamed Moh...2017In this project we have designed and implement...A virtual rehabilitation system that uses a Le...Neuromuscular patients suffer from nerve atrop...The development of a 3D interactive game integ...1. Develop a scalable and maintainable solutio...3D hand game for neuromuscular patients. 3D ha...3d hand game for neuromuscular patients. 3d ha...172\"\\\"[\\\\\\\"Leap Motion controller sensor\\\\\\\", \\\\\\...
12None3D Laser ScanningAya Essam Hegazi, Asmaa Abd EL-Aziz, Ebtehal E...20243D scanning is used in many applications such ...This project implements a low-cost 3D laser sc...Existing 3D scanning devices are often extreme...A low-cost 3D laser scanning system that utili...1. Improve overall productivity and workflow o...3D Laser Scanning. 3D Laser Scanning. 3D scann...3d laser scanning. 3d laser scanning. 3d scann...185\"\\\"[\\\\\\\"3d laser scanning\\\\\\\", \\\\\\\"Hand-held l...
23NoneA Smart Automatic System for Criminal Identifi...Yousef Yacoub Mohammed, Ahmed Mohamed Hassan,\\...2020The increasing use of biometric technologies i...This project develops an automated criminal id...Traditional identification methods, such as ph...A real-time facial recognition system develope...1. Support future scalability and feature expa...A Smart Automatic System for Criminal Identifi...a smart automatic system for criminal identifi...138\"\\\"[\\\\\\\"real-time face recognition system\\\\\\\",...
34NoneAdvanced Educational Platform “ABSTHALK”Mohamed Nasser Maher, Karim Ashraf Salah Eldie...2025The Educational Platform for Students and Teac...ABSTHALK is a comprehensive, role-based e-lear...Traditional learning methods often lack access...The project proposes a structured, role-based,...1. Provide interactive educational tools and r...Advanced Educational Platform “ABSTHALK”. Adva...advanced educational platform absthalk . advan...192\"\\\"[\\\\\\\"Role-based management system\\\\\\\", \\\\\\\"...
45NoneAgricultural Information and Management SystemAhmed Mohamed, Omar Hassan, Mahmoud Ali ,Mazen...2020It is a permanent link between the decision-ma...This project is an integrated information syst...The competent authorities of the Ministry of A...The development of an integrated information s...1. Reduce operational complexity and improve e...Agricultural Information and Management System...agricultural information and management system...109\"\\\"[\\\\\\\"centralized database\\\\\\\", \\\\\\\"track la...
\n", "
" ], "text/plain": [ " id submitted_at project_title \\\n", "0 1 None 3D hand game for neuromuscular patients \n", "1 2 None 3D Laser Scanning \n", "2 3 None A Smart Automatic System for Criminal Identifi... \n", "3 4 None Advanced Educational Platform “ABSTHALK” \n", "4 5 None Agricultural Information and Management System \n", "\n", " student_names year \\\n", "0 Ahmed Mansour Mohamed Saber, Ahmed Mohamed Moh... 2017 \n", "1 Aya Essam Hegazi, Asmaa Abd EL-Aziz, Ebtehal E... 2024 \n", "2 Yousef Yacoub Mohammed, Ahmed Mohamed Hassan,\\... 2020 \n", "3 Mohamed Nasser Maher, Karim Ashraf Salah Eldie... 2025 \n", "4 Ahmed Mohamed, Omar Hassan, Mahmoud Ali ,Mazen... 2020 \n", "\n", " abstract \\\n", "0 In this project we have designed and implement... \n", "1 3D scanning is used in many applications such ... \n", "2 The increasing use of biometric technologies i... \n", "3 The Educational Platform for Students and Teac... \n", "4 It is a permanent link between the decision-ma... \n", "\n", " description \\\n", "0 A virtual rehabilitation system that uses a Le... \n", "1 This project implements a low-cost 3D laser sc... \n", "2 This project develops an automated criminal id... \n", "3 ABSTHALK is a comprehensive, role-based e-lear... \n", "4 This project is an integrated information syst... \n", "\n", " problem_statement \\\n", "0 Neuromuscular patients suffer from nerve atrop... \n", "1 Existing 3D scanning devices are often extreme... \n", "2 Traditional identification methods, such as ph... \n", "3 Traditional learning methods often lack access... \n", "4 The competent authorities of the Ministry of A... \n", "\n", " proposed_solution \\\n", "0 The development of a 3D interactive game integ... \n", "1 A low-cost 3D laser scanning system that utili... \n", "2 A real-time facial recognition system develope... \n", "3 The project proposes a structured, role-based,... \n", "4 The development of an integrated information s... \n", "\n", " objectives \\\n", "0 1. 
Develop a scalable and maintainable solutio... \n", "1 1. Improve overall productivity and workflow o... \n", "2 1. Support future scalability and feature expa... \n", "3 1. Provide interactive educational tools and r... \n", "4 1. Reduce operational complexity and improve e... \n", "\n", " full_content \\\n", "0 3D hand game for neuromuscular patients. 3D ha... \n", "1 3D Laser Scanning. 3D Laser Scanning. 3D scann... \n", "2 A Smart Automatic System for Criminal Identifi... \n", "3 Advanced Educational Platform “ABSTHALK”. Adva... \n", "4 Agricultural Information and Management System... \n", "\n", " clean_text word_count \\\n", "0 3d hand game for neuromuscular patients. 3d ha... 172 \n", "1 3d laser scanning. 3d laser scanning. 3d scann... 185 \n", "2 a smart automatic system for criminal identifi... 138 \n", "3 advanced educational platform absthalk . advan... 192 \n", "4 agricultural information and management system... 109 \n", "\n", " features \n", "0 \"\\\"[\\\\\\\"Leap Motion controller sensor\\\\\\\", \\\\\\... \n", "1 \"\\\"[\\\\\\\"3d laser scanning\\\\\\\", \\\\\\\"Hand-held l... \n", "2 \"\\\"[\\\\\\\"real-time face recognition system\\\\\\\",... \n", "3 \"\\\"[\\\\\\\"Role-based management system\\\\\\\", \\\\\\\"... \n", "4 \"\\\"[\\\\\\\"centralized database\\\\\\\", \\\\\\\"track la... 
" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Query a five-row sample of PreProcessed_Projects through the engine.\n", "test_df = pd.read_sql(\n", "    sql=\"SELECT TOP 5 * FROM PreProcessed_Projects\",\n", "    con=engine,\n", ")\n", "\n", "test_df.head()" ] }, { "cell_type": "code", "execution_count": 15, "id": "078d4b8c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "================================================================================\n", "Hospital Test\n", "================================================================================\n", "USING GEMINI FEATURE EXTRACTOR\n", "CALLING GEMINI\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2026-06-04 00:30:08,804 | INFO | [LLM] model=gemini-3.1-flash-lite-preview | task=feature | attempt=1\n", "2026-06-04 00:30:08,805 | INFO | AFC is enabled with max remote calls: 10.\n", "2026-06-04 00:30:09,875 | INFO | HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-lite-preview:generateContent \"HTTP/1.1 200 OK\"\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "PARSED FEATURES:\n", "['appointment booking', 'patient records management', 'medical records storage', 'doctor dashboard', 'physician dashboard', 'ai chatbot']\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a32846683c0e41e48b4b5cac27cbb769", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Batches: 0%| | 0/1 [00:00 medical records storage (shared=1)\n", "\n", "\n", "================================================================================\n", "Machine Learning Test\n", "================================================================================\n", "USING GEMINI FEATURE EXTRACTOR\n", "CALLING GEMINI\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2026-06-04 00:30:16,521 | INFO | [LLM] model=gemini-3.1-flash-lite-preview | task=feature | attempt=1\n", "2026-06-04 00:30:16,522 | INFO | AFC is enabled with max remote 
def check_duplicates(features, min_shared=1):
    """Report pairs of features that share whitespace-separated tokens.

    Every unordered pair of entries is compared once. A pair is reported
    (printed and returned) when the two entries have at least ``min_shared``
    tokens in common. Tokens are compared case-insensitively so that
    "AI Chatbot" and "ai chatbot" count as overlapping; the printed text keeps
    the original spelling of each feature.

    Parameters
    ----------
    features : iterable of str or None
        Feature phrases to compare. ``None`` is treated as an empty list.
    min_shared : int, default 1
        Minimum number of shared tokens for a pair to be reported. Values
        below 1 are treated as 1, which matches the original "any overlap"
        behaviour.

    Returns
    -------
    list of tuple
        ``(left, right, shared_count)`` for every reported pair, in the same
        order in which the pairs are printed. Empty when nothing overlaps.
    """
    items = [] if features is None else list(features)
    threshold = max(1, min_shared)

    # Tokenise each feature once up front instead of re-splitting it for
    # every pair it takes part in.
    token_sets = [set(item.casefold().split()) for item in items]

    overlaps = []

    for i, left in enumerate(items):
        for j in range(i + 1, len(items)):

            shared = len(token_sets[i] & token_sets[j])

            if shared >= threshold:
                overlaps.append((left, items[j], shared))
                print(
                    f"{left} <-> {items[j]} "
                    f"(shared={shared})"
                )

    if not overlaps:
        print("No duplicate overlaps found")

    return overlaps
# Re-import the shared SQLAlchemy engine from the project's connector module.
# NOTE: this rebinds the notebook-level name `engine`; an earlier cell also
# assigns `engine` from a locally built create_engine(...) call, so which
# object later cells see depends on execution order.
from Data.database.sql_connector import engine

# Dispose of the engine's connection pool so pooled connections are closed.
# NOTE(review): presumably done so the next similarity search opens fresh
# connections to Azure SQL — confirm this is still required.
engine.dispose()
If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n", "2026-06-04 00:30:24,618 | INFO | Use pytorch device_name: cpu\n", "2026-06-04 00:30:24,624 | INFO | Loading FAISS index...\n", "2026-06-04 00:30:24,627 | INFO | Loading feature model: all-MiniLM-L6-v2\n", "2026-06-04 00:30:24,628 | INFO | Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n", "2026-06-04 00:30:26,763 | INFO | Use pytorch device_name: cpu\n", "2026-06-04 00:30:26,767 | INFO | Loading metadata from Azure SQL...\n", "2026-06-04 00:30:32,815 | INFO | Preparing query...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "USING GEMINI FEATURE EXTRACTOR\n", "CALLING GEMINI\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2026-06-04 00:30:36,816 | INFO | [LLM] model=gemini-3.1-flash-lite-preview | task=feature | attempt=1\n", "2026-06-04 00:30:36,817 | INFO | AFC is enabled with max remote calls: 10.\n", "2026-06-04 00:30:37,822 | INFO | HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-lite-preview:generateContent \"HTTP/1.1 200 OK\"\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "PARSED FEATURES:\n", "['appointment booking', 'patient records', 'doctor dashboard', 'ai chatbot']\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "eff76001187242a6a509b00507dae4ee", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Batches: 0%| | 0/1 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
project_titlesemantic_scorefeature_scorecoveragehybrid_scoreduplicate_risk
0Detecting Diseases Using Chatbot and Booking C...0.74800.00.00.05Very Low
1Clinical Information System0.64790.00.00.05Very Low
2Doctor 4 U0.64370.00.00.05Very Low
3Health Care Management System0.64020.00.00.05Very Low
4Hospital Management System0.63970.00.00.05Very Low
# Similarity smoke test: look up the stored projects closest to a sample
# clinic-management proposal and show only the scoring columns.
# NOTE(review): the recorded output for this cell reports feature_score and
# coverage of 0.0 (hybrid_score 0.05) for every row even though semantic_score
# is 0.64-0.75 — verify that feature matching is contributing as intended.
sample_project = {
    "title": "AI Clinic Management System",
    "description": """
    Smart clinic management platform with
    appointment booking,
    patient records,
    doctor dashboard,
    AI chatbot.
    """,
    "top_k": 5,
}

score_columns = [
    "project_title",
    "semantic_score",
    "feature_score",
    "coverage",
    "hybrid_score",
    "duplicate_risk",
]

results = find_similar_projects(**sample_project)

# Last expression of the cell: rendered as the cell's rich output.
results[score_columns]