bat-6 committed on
Commit
5ec2fc9
·
1 Parent(s): 599f4e7

update .gitignore and add scheduler and sync_projects services

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .env +0 -2
  2. .gitattributes +0 -33
  3. .gitignore +5 -0
  4. Data/database/__pycache__/sql_connector.cpython-311.pyc +0 -0
  5. Data/database/sql_connector.py +54 -11
  6. Data_gemini/projects_clean_gemini.csv +0 -0
  7. Data_gemini/projects_clean_gemini.parquet +3 -0
  8. Dockerfile +1 -17
  9. Notebooks/TEST.ipynb +0 -0
  10. Notebooks/test2.ipynb +2653 -0
  11. README.md +376 -8
  12. api/__pycache__/__init__.cpython-311.pyc +0 -0
  13. api/__pycache__/main.cpython-311.pyc +0 -0
  14. api/__pycache__/schemas.cpython-311.pyc +0 -0
  15. api/__pycache__/services.cpython-311.pyc +0 -0
  16. models/faiss_index.bin +2 -2
  17. models/metadata.parquet +2 -2
  18. models/project_embeddings.npy +0 -0
  19. requirements.txt +1 -1
  20. src/recommendation_engine/__pycache__/__init__.cpython-311.pyc +0 -0
  21. src/recommendation_engine/__pycache__/chatbot_engine.cpython-311.pyc +0 -0
  22. src/recommendation_engine/__pycache__/command_handler.cpython-311.pyc +0 -0
  23. src/recommendation_engine/__pycache__/config.cpython-311.pyc +0 -0
  24. src/recommendation_engine/__pycache__/context_builder.cpython-311.pyc +0 -0
  25. src/recommendation_engine/__pycache__/feature_generator.cpython-311.pyc +0 -0
  26. src/recommendation_engine/__pycache__/full_project_generator.cpython-311.pyc +0 -0
  27. src/recommendation_engine/__pycache__/idea_generator.cpython-311.pyc +0 -0
  28. src/recommendation_engine/__pycache__/llm_client.cpython-311.pyc +0 -0
  29. src/recommendation_engine/__pycache__/llm_router.cpython-311.pyc +0 -0
  30. src/recommendation_engine/__pycache__/memory_store.cpython-311.pyc +0 -0
  31. src/recommendation_engine/__pycache__/novelty_checker.cpython-311.pyc +0 -0
  32. src/recommendation_engine/__pycache__/prompt_builder.cpython-311.pyc +0 -0
  33. src/recommendation_engine/__pycache__/response_formatter.cpython-311.pyc +0 -0
  34. src/recommendation_engine/__pycache__/state_manager.cpython-311.pyc +0 -0
  35. src/recommendation_engine/__pycache__/test.cpython-311.pyc +0 -0
  36. src/recommendation_engine/__pycache__/validator.cpython-311.pyc +0 -0
  37. src/recommendation_engine/llm_client.py +4 -2
  38. src/services/scheduler.py +19 -0
  39. src/services/sync_projects.py +172 -0
  40. src/similarity_model/__pycache__/__init__.cpython-311.pyc +0 -0
  41. src/similarity_model/__pycache__/embedding_engine.cpython-311.pyc +0 -0
  42. src/similarity_model/__pycache__/feature_similarity.cpython-311.pyc +0 -0
  43. src/similarity_model/__pycache__/hybrid_ranker.cpython-311.pyc +0 -0
  44. src/similarity_model/__pycache__/preprocessing.cpython-311.pyc +0 -0
  45. src/similarity_model/__pycache__/semantic_search.cpython-311.pyc +0 -0
  46. src/similarity_model/__pycache__/similarity_engine.cpython-311.pyc +0 -0
  47. src/similarity_model/embedding_engine.py +18 -3
  48. src/similarity_model/feature_similarity.py +8 -5
  49. src/similarity_model/hybrid_ranker.py +57 -26
  50. src/similarity_model/llm_feature_extractor.py +223 -0
.env DELETED
@@ -1,2 +0,0 @@
1
- GEMINI_API_KEY=AIzaSyAkFsaN3BKoSQmRW4FzTahhZXbq-ldsDZ4
2
- GEMINI_MODEL_NAME=gemini-2.5-flash
 
 
 
.gitattributes CHANGED
@@ -1,35 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
  *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
1
  *.bin filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  *.parquet filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ venv/
2
+ .venv/
3
+ __pycache__/
4
+ *.pyc
5
+ .env
Data/database/__pycache__/sql_connector.cpython-311.pyc CHANGED
Binary files a/Data/database/__pycache__/sql_connector.cpython-311.pyc and b/Data/database/__pycache__/sql_connector.cpython-311.pyc differ
 
Data/database/sql_connector.py CHANGED
@@ -1,13 +1,12 @@
1
- import os
2
- import json
3
- import urllib
4
- import pandas as pd
5
  from sqlalchemy import create_engine
 
 
 
6
 
7
- SERVER = os.getenv("AZURE_SQL_SERVER")
8
- DATABASE = os.getenv("AZURE_SQL_DATABASE")
9
- USERNAME = os.getenv("AZURE_SQL_USERNAME")
10
- PASSWORD = os.getenv("AZURE_SQL_PASSWORD")
11
 
12
  params = urllib.parse.quote_plus(
13
  f"DRIVER={{ODBC Driver 18 for SQL Server}};"
@@ -20,12 +19,56 @@ params = urllib.parse.quote_plus(
20
  "Connection Timeout=30;"
21
  )
22
 
23
- engine = create_engine(f"mssql+pyodbc:///?odbc_connect={params}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  def load_preprocessed_projects():
26
- df = pd.read_sql("SELECT * FROM PreProcessed_Projects", engine)
 
 
 
 
 
 
 
 
 
27
 
28
  if "features" in df.columns:
29
- df["features"] = df["features"].apply(json.loads)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  return df
 
 
 
 
 
1
  from sqlalchemy import create_engine
2
+ import pandas as pd
3
+ import urllib
4
+ import json
5
 
6
+ SERVER = "innotrack-sql-server.database.windows.net"
7
+ DATABASE = "InnoTrackDB"
8
+ USERNAME = "innotrackadmin"
9
+ PASSWORD = "Innotrack@admin233"
10
 
11
  params = urllib.parse.quote_plus(
12
  f"DRIVER={{ODBC Driver 18 for SQL Server}};"
 
19
  "Connection Timeout=30;"
20
  )
21
 
22
+ connection_string = (
23
+ f"mssql+pyodbc:///?odbc_connect={params}"
24
+ )
25
+
26
+ engine = create_engine(connection_string)
27
+
28
+ try:
29
+
30
+ with engine.connect() as conn:
31
+ print("SQL Connected Successfully")
32
+
33
+ except Exception as e:
34
+ print("Connection Failed")
35
+ print(e)
36
+
37
+
38
+
39
+
40
+
41
 
42
  def load_preprocessed_projects():
43
+
44
+ query = """
45
+ SELECT *
46
+ FROM PreProcessed_Projects
47
+ """
48
+
49
+ df = pd.read_sql(
50
+ query,
51
+ engine
52
+ )
53
 
54
  if "features" in df.columns:
55
+
56
+ def parse_features(x):
57
+
58
+ if not isinstance(x, str):
59
+ return x
60
+
61
+ try:
62
+ x = json.loads(x)
63
+
64
+ if isinstance(x, str):
65
+ x = json.loads(x)
66
+
67
+ return x
68
+
69
+ except Exception:
70
+ return []
71
+
72
+ df["features"] = df["features"].apply(parse_features)
73
 
74
  return df
Data_gemini/projects_clean_gemini.csv ADDED
The diff for this file is too large to render. See raw diff
 
Data_gemini/projects_clean_gemini.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:789b1063cc584c694924b03c26846a6c3c1e41ea0a1ac2df97ed42907acbea8e
3
+ size 772640
Dockerfile CHANGED
@@ -1,20 +1,4 @@
1
- FROM python:3.11-slim-bookworm
2
-
3
- USER root
4
-
5
- RUN apt-get update && apt-get install -y --no-install-recommends \
6
- curl \
7
- gnupg \
8
- ca-certificates \
9
- unixodbc \
10
- unixodbc-dev \
11
- && curl -sSL -O https://packages.microsoft.com/config/debian/12/packages-microsoft-prod.deb \
12
- && dpkg -i packages-microsoft-prod.deb \
13
- && rm packages-microsoft-prod.deb \
14
- && apt-get update \
15
- && ACCEPT_EULA=Y apt-get install -y --no-install-recommends msodbcsql18 \
16
- && apt-get clean \
17
- && rm -rf /var/lib/apt/lists/*
18
 
19
  RUN useradd -m -u 1000 user
20
 
 
1
+ FROM python:3.11-slim
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  RUN useradd -m -u 1000 user
4
 
Notebooks/TEST.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
Notebooks/test2.ipynb CHANGED
@@ -0,0 +1,2653 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "49c6b17c",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "data": {
11
+ "text/plain": [
12
+ "'e:\\\\gradution project'"
13
+ ]
14
+ },
15
+ "execution_count": 1,
16
+ "metadata": {},
17
+ "output_type": "execute_result"
18
+ }
19
+ ],
20
+ "source": [
21
+ "\n",
22
+ "import os\n",
23
+ "os.getcwd()\n",
24
+ "os.chdir(\"/gradution project\")\n",
25
+ "os.getcwd()"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": 2,
31
+ "id": "509448bd",
32
+ "metadata": {},
33
+ "outputs": [
34
+ {
35
+ "name": "stdout",
36
+ "output_type": "stream",
37
+ "text": [
38
+ "\n",
39
+ " CONFIG LOADED:\n",
40
+ "ENV: development\n",
41
+ "DEBUG_MODE: True\n",
42
+ "MODELS: ['gemini-3.1-flash-lite-preview', 'gemini-2.5-flash-lite', 'gemini-2.5-flash', 'gemini-2.5-pro']\n",
43
+ "MAX_RETRIES: 3\n",
44
+ "IDEA_TEMP: 0.9\n",
45
+ "=================================\n",
46
+ "\n"
47
+ ]
48
+ },
49
+ {
50
+ "name": "stderr",
51
+ "output_type": "stream",
52
+ "text": [
53
+ "2026-06-04 00:29:43,014 | INFO | Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n",
54
+ "e:\\gradution project\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:949: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
55
+ " warnings.warn(\n",
56
+ "2026-06-04 00:29:46,381 | INFO | Use pytorch device_name: cpu\n",
57
+ "2026-06-04 00:29:46,388 | INFO | Loading faiss with AVX2 support.\n",
58
+ "2026-06-04 00:29:46,418 | INFO | Successfully loaded faiss with AVX2 support.\n"
59
+ ]
60
+ },
61
+ {
62
+ "name": "stdout",
63
+ "output_type": "stream",
64
+ "text": [
65
+ "SQL Connected Successfully\n",
66
+ "All modules imported successfully\n"
67
+ ]
68
+ }
69
+ ],
70
+ "source": [
71
+ "import pandas as pd\n",
72
+ "import numpy as np\n",
73
+ "from tqdm.notebook import tqdm\n",
74
+ "\n",
75
+ "from src.similarity_model import preprocess_dataset\n",
76
+ "from src.similarity_model import train_embedding_engine\n",
77
+ "from src.similarity_model import search_by_text\n",
78
+ "from src.similarity_model import find_similar_projects\n",
79
+ "from src.similarity_model import extract_features\n",
80
+ "\n",
81
+ "from src.similarity_model import normalize_text\n",
82
+ "from src.similarity_model import compute_feature_similarity\n",
83
+ "from Data.database.sql_connector import (\n",
84
+ " load_preprocessed_projects,\n",
85
+ " engine\n",
86
+ ")\n",
87
+ "\n",
88
+ "print(\"All modules imported successfully\")"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": 33,
94
+ "id": "0bf93b8e",
95
+ "metadata": {},
96
+ "outputs": [
97
+ {
98
+ "name": "stdout",
99
+ "output_type": "stream",
100
+ "text": [
101
+ "Engine created\n"
102
+ ]
103
+ }
104
+ ],
105
+ "source": [
106
+ "from sqlalchemy import create_engine\n",
107
+ "import urllib\n",
108
+ "\n",
109
+ "SERVER = \"innotrack-sql-server.database.windows.net\"\n",
110
+ "DATABASE = \"InnoTrackDB\"\n",
111
+ "USERNAME = \"innotrackadmin\"\n",
112
+ "PASSWORD = \"Innotrack@admin233\"\n",
113
+ "\n",
114
+ "params = urllib.parse.quote_plus(\n",
115
+ " f\"DRIVER={{ODBC Driver 18 for SQL Server}};\"\n",
116
+ " f\"SERVER={SERVER};\"\n",
117
+ " f\"DATABASE={DATABASE};\"\n",
118
+ " f\"UID={USERNAME};\"\n",
119
+ " f\"PWD={PASSWORD};\"\n",
120
+ " \"Encrypt=yes;\"\n",
121
+ " \"TrustServerCertificate=no;\"\n",
122
+ " \"Connection Timeout=30;\"\n",
123
+ ")\n",
124
+ "\n",
125
+ "engine = create_engine(\n",
126
+ " f\"mssql+pyodbc:///?odbc_connect={params}\"\n",
127
+ ")\n",
128
+ "\n",
129
+ "print(\"Engine created\")"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": 4,
135
+ "id": "11f40d1d",
136
+ "metadata": {},
137
+ "outputs": [
138
+ {
139
+ "data": {
140
+ "text/html": [
141
+ "<div>\n",
142
+ "<style scoped>\n",
143
+ " .dataframe tbody tr th:only-of-type {\n",
144
+ " vertical-align: middle;\n",
145
+ " }\n",
146
+ "\n",
147
+ " .dataframe tbody tr th {\n",
148
+ " vertical-align: top;\n",
149
+ " }\n",
150
+ "\n",
151
+ " .dataframe thead th {\n",
152
+ " text-align: right;\n",
153
+ " }\n",
154
+ "</style>\n",
155
+ "<table border=\"1\" class=\"dataframe\">\n",
156
+ " <thead>\n",
157
+ " <tr style=\"text-align: right;\">\n",
158
+ " <th></th>\n",
159
+ " <th>TABLE_NAME</th>\n",
160
+ " </tr>\n",
161
+ " </thead>\n",
162
+ " <tbody>\n",
163
+ " <tr>\n",
164
+ " <th>0</th>\n",
165
+ " <td>Teams</td>\n",
166
+ " </tr>\n",
167
+ " <tr>\n",
168
+ " <th>1</th>\n",
169
+ " <td>ChatRooms</td>\n",
170
+ " </tr>\n",
171
+ " <tr>\n",
172
+ " <th>2</th>\n",
173
+ " <td>ChatMessageHiddens</td>\n",
174
+ " </tr>\n",
175
+ " <tr>\n",
176
+ " <th>3</th>\n",
177
+ " <td>JoinRequests</td>\n",
178
+ " </tr>\n",
179
+ " <tr>\n",
180
+ " <th>4</th>\n",
181
+ " <td>ChatMessageReactions</td>\n",
182
+ " </tr>\n",
183
+ " <tr>\n",
184
+ " <th>5</th>\n",
185
+ " <td>Projects</td>\n",
186
+ " </tr>\n",
187
+ " <tr>\n",
188
+ " <th>6</th>\n",
189
+ " <td>TeamMembers</td>\n",
190
+ " </tr>\n",
191
+ " <tr>\n",
192
+ " <th>7</th>\n",
193
+ " <td>ProjectTechnologies_Backup</td>\n",
194
+ " </tr>\n",
195
+ " <tr>\n",
196
+ " <th>8</th>\n",
197
+ " <td>ChatMessages</td>\n",
198
+ " </tr>\n",
199
+ " <tr>\n",
200
+ " <th>9</th>\n",
201
+ " <td>Feedbacks</td>\n",
202
+ " </tr>\n",
203
+ " <tr>\n",
204
+ " <th>10</th>\n",
205
+ " <td>MissingProjectTechsSplit</td>\n",
206
+ " </tr>\n",
207
+ " <tr>\n",
208
+ " <th>11</th>\n",
209
+ " <td>PreProcessed_Projects</td>\n",
210
+ " </tr>\n",
211
+ " <tr>\n",
212
+ " <th>12</th>\n",
213
+ " <td>OriginalityReports</td>\n",
214
+ " </tr>\n",
215
+ " <tr>\n",
216
+ " <th>13</th>\n",
217
+ " <td>ProjectAttachments</td>\n",
218
+ " </tr>\n",
219
+ " <tr>\n",
220
+ " <th>14</th>\n",
221
+ " <td>ProjectTechnologies</td>\n",
222
+ " </tr>\n",
223
+ " <tr>\n",
224
+ " <th>15</th>\n",
225
+ " <td>VectorEmbeddings</td>\n",
226
+ " </tr>\n",
227
+ " <tr>\n",
228
+ " <th>16</th>\n",
229
+ " <td>ChatMessageAttachments</td>\n",
230
+ " </tr>\n",
231
+ " <tr>\n",
232
+ " <th>17</th>\n",
233
+ " <td>SimilarProjects</td>\n",
234
+ " </tr>\n",
235
+ " <tr>\n",
236
+ " <th>18</th>\n",
237
+ " <td>AuditLogs</td>\n",
238
+ " </tr>\n",
239
+ " <tr>\n",
240
+ " <th>19</th>\n",
241
+ " <td>AcademicYears</td>\n",
242
+ " </tr>\n",
243
+ " <tr>\n",
244
+ " <th>20</th>\n",
245
+ " <td>Schema</td>\n",
246
+ " </tr>\n",
247
+ " <tr>\n",
248
+ " <th>21</th>\n",
249
+ " <td>Job</td>\n",
250
+ " </tr>\n",
251
+ " <tr>\n",
252
+ " <th>22</th>\n",
253
+ " <td>State</td>\n",
254
+ " </tr>\n",
255
+ " <tr>\n",
256
+ " <th>23</th>\n",
257
+ " <td>JobParameter</td>\n",
258
+ " </tr>\n",
259
+ " <tr>\n",
260
+ " <th>24</th>\n",
261
+ " <td>JobQueue</td>\n",
262
+ " </tr>\n",
263
+ " <tr>\n",
264
+ " <th>25</th>\n",
265
+ " <td>database_firewall_rules</td>\n",
266
+ " </tr>\n",
267
+ " <tr>\n",
268
+ " <th>26</th>\n",
269
+ " <td>Server</td>\n",
270
+ " </tr>\n",
271
+ " <tr>\n",
272
+ " <th>27</th>\n",
273
+ " <td>List</td>\n",
274
+ " </tr>\n",
275
+ " <tr>\n",
276
+ " <th>28</th>\n",
277
+ " <td>Set</td>\n",
278
+ " </tr>\n",
279
+ " <tr>\n",
280
+ " <th>29</th>\n",
281
+ " <td>Counter</td>\n",
282
+ " </tr>\n",
283
+ " <tr>\n",
284
+ " <th>30</th>\n",
285
+ " <td>Hash</td>\n",
286
+ " </tr>\n",
287
+ " <tr>\n",
288
+ " <th>31</th>\n",
289
+ " <td>AggregatedCounter</td>\n",
290
+ " </tr>\n",
291
+ " <tr>\n",
292
+ " <th>32</th>\n",
293
+ " <td>__EFMigrationsHistory</td>\n",
294
+ " </tr>\n",
295
+ " <tr>\n",
296
+ " <th>33</th>\n",
297
+ " <td>Departments</td>\n",
298
+ " </tr>\n",
299
+ " <tr>\n",
300
+ " <th>34</th>\n",
301
+ " <td>Skills_Backup</td>\n",
302
+ " </tr>\n",
303
+ " <tr>\n",
304
+ " <th>35</th>\n",
305
+ " <td>Projects_Backup</td>\n",
306
+ " </tr>\n",
307
+ " <tr>\n",
308
+ " <th>36</th>\n",
309
+ " <td>Domains</td>\n",
310
+ " </tr>\n",
311
+ " <tr>\n",
312
+ " <th>37</th>\n",
313
+ " <td>Skills</td>\n",
314
+ " </tr>\n",
315
+ " <tr>\n",
316
+ " <th>38</th>\n",
317
+ " <td>Technologies</td>\n",
318
+ " </tr>\n",
319
+ " <tr>\n",
320
+ " <th>39</th>\n",
321
+ " <td>Users</td>\n",
322
+ " </tr>\n",
323
+ " <tr>\n",
324
+ " <th>40</th>\n",
325
+ " <td>ProjectDrafts</td>\n",
326
+ " </tr>\n",
327
+ " <tr>\n",
328
+ " <th>41</th>\n",
329
+ " <td>Notifications</td>\n",
330
+ " </tr>\n",
331
+ " <tr>\n",
332
+ " <th>42</th>\n",
333
+ " <td>ProjectDraftTechnologies</td>\n",
334
+ " </tr>\n",
335
+ " <tr>\n",
336
+ " <th>43</th>\n",
337
+ " <td>StudentSkills</td>\n",
338
+ " </tr>\n",
339
+ " </tbody>\n",
340
+ "</table>\n",
341
+ "</div>"
342
+ ],
343
+ "text/plain": [
344
+ " TABLE_NAME\n",
345
+ "0 Teams\n",
346
+ "1 ChatRooms\n",
347
+ "2 ChatMessageHiddens\n",
348
+ "3 JoinRequests\n",
349
+ "4 ChatMessageReactions\n",
350
+ "5 Projects\n",
351
+ "6 TeamMembers\n",
352
+ "7 ProjectTechnologies_Backup\n",
353
+ "8 ChatMessages\n",
354
+ "9 Feedbacks\n",
355
+ "10 MissingProjectTechsSplit\n",
356
+ "11 PreProcessed_Projects\n",
357
+ "12 OriginalityReports\n",
358
+ "13 ProjectAttachments\n",
359
+ "14 ProjectTechnologies\n",
360
+ "15 VectorEmbeddings\n",
361
+ "16 ChatMessageAttachments\n",
362
+ "17 SimilarProjects\n",
363
+ "18 AuditLogs\n",
364
+ "19 AcademicYears\n",
365
+ "20 Schema\n",
366
+ "21 Job\n",
367
+ "22 State\n",
368
+ "23 JobParameter\n",
369
+ "24 JobQueue\n",
370
+ "25 database_firewall_rules\n",
371
+ "26 Server\n",
372
+ "27 List\n",
373
+ "28 Set\n",
374
+ "29 Counter\n",
375
+ "30 Hash\n",
376
+ "31 AggregatedCounter\n",
377
+ "32 __EFMigrationsHistory\n",
378
+ "33 Departments\n",
379
+ "34 Skills_Backup\n",
380
+ "35 Projects_Backup\n",
381
+ "36 Domains\n",
382
+ "37 Skills\n",
383
+ "38 Technologies\n",
384
+ "39 Users\n",
385
+ "40 ProjectDrafts\n",
386
+ "41 Notifications\n",
387
+ "42 ProjectDraftTechnologies\n",
388
+ "43 StudentSkills"
389
+ ]
390
+ },
391
+ "execution_count": 4,
392
+ "metadata": {},
393
+ "output_type": "execute_result"
394
+ }
395
+ ],
396
+ "source": [
397
+ "with engine.connect() as conn:\n",
398
+ "\n",
399
+ " tables = pd.read_sql(\n",
400
+ " \"\"\"\n",
401
+ " SELECT TABLE_NAME\n",
402
+ " FROM INFORMATION_SCHEMA.TABLES\n",
403
+ " \"\"\",\n",
404
+ " conn\n",
405
+ " )\n",
406
+ "\n",
407
+ "tables"
408
+ ]
409
+ },
410
+ {
411
+ "cell_type": "code",
412
+ "execution_count": 5,
413
+ "id": "5d1125cb",
414
+ "metadata": {},
415
+ "outputs": [
416
+ {
417
+ "data": {
418
+ "text/html": [
419
+ "<div>\n",
420
+ "<style scoped>\n",
421
+ " .dataframe tbody tr th:only-of-type {\n",
422
+ " vertical-align: middle;\n",
423
+ " }\n",
424
+ "\n",
425
+ " .dataframe tbody tr th {\n",
426
+ " vertical-align: top;\n",
427
+ " }\n",
428
+ "\n",
429
+ " .dataframe thead th {\n",
430
+ " text-align: right;\n",
431
+ " }\n",
432
+ "</style>\n",
433
+ "<table border=\"1\" class=\"dataframe\">\n",
434
+ " <thead>\n",
435
+ " <tr style=\"text-align: right;\">\n",
436
+ " <th></th>\n",
437
+ " <th>id</th>\n",
438
+ " <th>submitted_at</th>\n",
439
+ " <th>project_title</th>\n",
440
+ " <th>student_names</th>\n",
441
+ " <th>year</th>\n",
442
+ " <th>abstract</th>\n",
443
+ " <th>description</th>\n",
444
+ " <th>problem_statement</th>\n",
445
+ " <th>proposed_solution</th>\n",
446
+ " <th>objectives</th>\n",
447
+ " <th>full_content</th>\n",
448
+ " <th>clean_text</th>\n",
449
+ " <th>word_count</th>\n",
450
+ " <th>features</th>\n",
451
+ " </tr>\n",
452
+ " </thead>\n",
453
+ " <tbody>\n",
454
+ " <tr>\n",
455
+ " <th>0</th>\n",
456
+ " <td>1</td>\n",
457
+ " <td>NaT</td>\n",
458
+ " <td>3D hand game for neuromuscular patients</td>\n",
459
+ " <td>Ahmed Mansour Mohamed Saber, Ahmed Mohamed Moh...</td>\n",
460
+ " <td>2017</td>\n",
461
+ " <td>In this project we have designed and implement...</td>\n",
462
+ " <td>A virtual rehabilitation system that uses a Le...</td>\n",
463
+ " <td>Neuromuscular patients suffer from nerve atrop...</td>\n",
464
+ " <td>The development of a 3D interactive game integ...</td>\n",
465
+ " <td>1. Develop a scalable and maintainable solutio...</td>\n",
466
+ " <td>3D hand game for neuromuscular patients. 3D ha...</td>\n",
467
+ " <td>3d hand game for neuromuscular patients. 3d ha...</td>\n",
468
+ " <td>172</td>\n",
469
+ " <td>\"\\\"[\\\\\\\"Leap Motion controller sensor\\\\\\\", \\\\\\...</td>\n",
470
+ " </tr>\n",
471
+ " <tr>\n",
472
+ " <th>1</th>\n",
473
+ " <td>2</td>\n",
474
+ " <td>NaT</td>\n",
475
+ " <td>3D Laser Scanning</td>\n",
476
+ " <td>Aya Essam Hegazi, Asmaa Abd EL-Aziz, Ebtehal E...</td>\n",
477
+ " <td>2024</td>\n",
478
+ " <td>3D scanning is used in many applications such ...</td>\n",
479
+ " <td>This project implements a low-cost 3D laser sc...</td>\n",
480
+ " <td>Existing 3D scanning devices are often extreme...</td>\n",
481
+ " <td>A low-cost 3D laser scanning system that utili...</td>\n",
482
+ " <td>1. Improve overall productivity and workflow o...</td>\n",
483
+ " <td>3D Laser Scanning. 3D Laser Scanning. 3D scann...</td>\n",
484
+ " <td>3d laser scanning. 3d laser scanning. 3d scann...</td>\n",
485
+ " <td>185</td>\n",
486
+ " <td>\"\\\"[\\\\\\\"3d laser scanning\\\\\\\", \\\\\\\"Hand-held l...</td>\n",
487
+ " </tr>\n",
488
+ " <tr>\n",
489
+ " <th>2</th>\n",
490
+ " <td>3</td>\n",
491
+ " <td>NaT</td>\n",
492
+ " <td>A Smart Automatic System for Criminal Identifi...</td>\n",
493
+ " <td>Yousef Yacoub Mohammed, Ahmed Mohamed Hassan,\\...</td>\n",
494
+ " <td>2020</td>\n",
495
+ " <td>The increasing use of biometric technologies i...</td>\n",
496
+ " <td>This project develops an automated criminal id...</td>\n",
497
+ " <td>Traditional identification methods, such as ph...</td>\n",
498
+ " <td>A real-time facial recognition system develope...</td>\n",
499
+ " <td>1. Support future scalability and feature expa...</td>\n",
500
+ " <td>A Smart Automatic System for Criminal Identifi...</td>\n",
501
+ " <td>a smart automatic system for criminal identifi...</td>\n",
502
+ " <td>138</td>\n",
503
+ " <td>\"\\\"[\\\\\\\"real-time face recognition system\\\\\\\",...</td>\n",
504
+ " </tr>\n",
505
+ " <tr>\n",
506
+ " <th>3</th>\n",
507
+ " <td>4</td>\n",
508
+ " <td>NaT</td>\n",
509
+ " <td>Advanced Educational Platform “ABSTHALK”</td>\n",
510
+ " <td>Mohamed Nasser Maher, Karim Ashraf Salah Eldie...</td>\n",
511
+ " <td>2025</td>\n",
512
+ " <td>The Educational Platform for Students and Teac...</td>\n",
513
+ " <td>ABSTHALK is a comprehensive, role-based e-lear...</td>\n",
514
+ " <td>Traditional learning methods often lack access...</td>\n",
515
+ " <td>The project proposes a structured, role-based,...</td>\n",
516
+ " <td>1. Provide interactive educational tools and r...</td>\n",
517
+ " <td>Advanced Educational Platform “ABSTHALK”. Adva...</td>\n",
518
+ " <td>advanced educational platform absthalk . advan...</td>\n",
519
+ " <td>192</td>\n",
520
+ " <td>\"\\\"[\\\\\\\"Role-based management system\\\\\\\", \\\\\\\"...</td>\n",
521
+ " </tr>\n",
522
+ " <tr>\n",
523
+ " <th>4</th>\n",
524
+ " <td>5</td>\n",
525
+ " <td>NaT</td>\n",
526
+ " <td>Agricultural Information and Management System</td>\n",
527
+ " <td>Ahmed Mohamed, Omar Hassan, Mahmoud Ali ,Mazen...</td>\n",
528
+ " <td>2020</td>\n",
529
+ " <td>It is a permanent link between the decision-ma...</td>\n",
530
+ " <td>This project is an integrated information syst...</td>\n",
531
+ " <td>The competent authorities of the Ministry of A...</td>\n",
532
+ " <td>The development of an integrated information s...</td>\n",
533
+ " <td>1. Reduce operational complexity and improve e...</td>\n",
534
+ " <td>Agricultural Information and Management System...</td>\n",
535
+ " <td>agricultural information and management system...</td>\n",
536
+ " <td>109</td>\n",
537
+ " <td>\"\\\"[\\\\\\\"centralized database\\\\\\\", \\\\\\\"track la...</td>\n",
538
+ " </tr>\n",
539
+ " </tbody>\n",
540
+ "</table>\n",
541
+ "</div>"
542
+ ],
543
+ "text/plain": [
544
+ " id submitted_at project_title \\\n",
545
+ "0 1 NaT 3D hand game for neuromuscular patients \n",
546
+ "1 2 NaT 3D Laser Scanning \n",
547
+ "2 3 NaT A Smart Automatic System for Criminal Identifi... \n",
548
+ "3 4 NaT Advanced Educational Platform “ABSTHALK” \n",
549
+ "4 5 NaT Agricultural Information and Management System \n",
550
+ "\n",
551
+ " student_names year \\\n",
552
+ "0 Ahmed Mansour Mohamed Saber, Ahmed Mohamed Moh... 2017 \n",
553
+ "1 Aya Essam Hegazi, Asmaa Abd EL-Aziz, Ebtehal E... 2024 \n",
554
+ "2 Yousef Yacoub Mohammed, Ahmed Mohamed Hassan,\\... 2020 \n",
555
+ "3 Mohamed Nasser Maher, Karim Ashraf Salah Eldie... 2025 \n",
556
+ "4 Ahmed Mohamed, Omar Hassan, Mahmoud Ali ,Mazen... 2020 \n",
557
+ "\n",
558
+ " abstract \\\n",
559
+ "0 In this project we have designed and implement... \n",
560
+ "1 3D scanning is used in many applications such ... \n",
561
+ "2 The increasing use of biometric technologies i... \n",
562
+ "3 The Educational Platform for Students and Teac... \n",
563
+ "4 It is a permanent link between the decision-ma... \n",
564
+ "\n",
565
+ " description \\\n",
566
+ "0 A virtual rehabilitation system that uses a Le... \n",
567
+ "1 This project implements a low-cost 3D laser sc... \n",
568
+ "2 This project develops an automated criminal id... \n",
569
+ "3 ABSTHALK is a comprehensive, role-based e-lear... \n",
570
+ "4 This project is an integrated information syst... \n",
571
+ "\n",
572
+ " problem_statement \\\n",
573
+ "0 Neuromuscular patients suffer from nerve atrop... \n",
574
+ "1 Existing 3D scanning devices are often extreme... \n",
575
+ "2 Traditional identification methods, such as ph... \n",
576
+ "3 Traditional learning methods often lack access... \n",
577
+ "4 The competent authorities of the Ministry of A... \n",
578
+ "\n",
579
+ " proposed_solution \\\n",
580
+ "0 The development of a 3D interactive game integ... \n",
581
+ "1 A low-cost 3D laser scanning system that utili... \n",
582
+ "2 A real-time facial recognition system develope... \n",
583
+ "3 The project proposes a structured, role-based,... \n",
584
+ "4 The development of an integrated information s... \n",
585
+ "\n",
586
+ " objectives \\\n",
587
+ "0 1. Develop a scalable and maintainable solutio... \n",
588
+ "1 1. Improve overall productivity and workflow o... \n",
589
+ "2 1. Support future scalability and feature expa... \n",
590
+ "3 1. Provide interactive educational tools and r... \n",
591
+ "4 1. Reduce operational complexity and improve e... \n",
592
+ "\n",
593
+ " full_content \\\n",
594
+ "0 3D hand game for neuromuscular patients. 3D ha... \n",
595
+ "1 3D Laser Scanning. 3D Laser Scanning. 3D scann... \n",
596
+ "2 A Smart Automatic System for Criminal Identifi... \n",
597
+ "3 Advanced Educational Platform “ABSTHALK”. Adva... \n",
598
+ "4 Agricultural Information and Management System... \n",
599
+ "\n",
600
+ " clean_text word_count \\\n",
601
+ "0 3d hand game for neuromuscular patients. 3d ha... 172 \n",
602
+ "1 3d laser scanning. 3d laser scanning. 3d scann... 185 \n",
603
+ "2 a smart automatic system for criminal identifi... 138 \n",
604
+ "3 advanced educational platform absthalk . advan... 192 \n",
605
+ "4 agricultural information and management system... 109 \n",
606
+ "\n",
607
+ " features \n",
608
+ "0 \"\\\"[\\\\\\\"Leap Motion controller sensor\\\\\\\", \\\\\\... \n",
609
+ "1 \"\\\"[\\\\\\\"3d laser scanning\\\\\\\", \\\\\\\"Hand-held l... \n",
610
+ "2 \"\\\"[\\\\\\\"real-time face recognition system\\\\\\\",... \n",
611
+ "3 \"\\\"[\\\\\\\"Role-based management system\\\\\\\", \\\\\\\"... \n",
612
+ "4 \"\\\"[\\\\\\\"centralized database\\\\\\\", \\\\\\\"track la... "
613
+ ]
614
+ },
615
+ "execution_count": 5,
616
+ "metadata": {},
617
+ "output_type": "execute_result"
618
+ }
619
+ ],
620
+ "source": [
621
+ "query = \"\"\"\n",
622
+ "SELECT *\n",
623
+ "FROM PreProcessed_Projects\n",
624
+ "\"\"\"\n",
625
+ "\n",
626
+ "df = pd.read_sql(query, engine)\n",
627
+ "\n",
628
+ "df.head()"
629
+ ]
630
+ },
631
+ {
632
+ "cell_type": "code",
633
+ "execution_count": 6,
634
+ "id": "4429717d",
635
+ "metadata": {},
636
+ "outputs": [
637
+ {
638
+ "name": "stdout",
639
+ "output_type": "stream",
640
+ "text": [
641
+ "['id', 'submitted_at', 'project_title', 'student_names', 'year', 'abstract', 'description', 'problem_statement', 'proposed_solution', 'objectives', 'full_content', 'clean_text', 'word_count', 'features']\n"
642
+ ]
643
+ }
644
+ ],
645
+ "source": [
646
+ "print(df.columns.tolist())"
647
+ ]
648
+ },
649
+ {
650
+ "cell_type": "code",
651
+ "execution_count": 7,
652
+ "id": "9925da4c",
653
+ "metadata": {},
654
+ "outputs": [],
655
+ "source": [
656
+ "df = df.rename(columns={\n",
657
+ " \"Title\": \"project_title\",\n",
658
+ " \"Description\": \"description\",\n",
659
+ " \"Abstract\": \"abstract\"\n",
660
+ "})"
661
+ ]
662
+ },
663
+ {
664
+ "cell_type": "code",
665
+ "execution_count": 8,
666
+ "id": "fc62d4f3",
667
+ "metadata": {},
668
+ "outputs": [
669
+ {
670
+ "data": {
671
+ "text/html": [
672
+ "<div>\n",
673
+ "<style scoped>\n",
674
+ " .dataframe tbody tr th:only-of-type {\n",
675
+ " vertical-align: middle;\n",
676
+ " }\n",
677
+ "\n",
678
+ " .dataframe tbody tr th {\n",
679
+ " vertical-align: top;\n",
680
+ " }\n",
681
+ "\n",
682
+ " .dataframe thead th {\n",
683
+ " text-align: right;\n",
684
+ " }\n",
685
+ "</style>\n",
686
+ "<table border=\"1\" class=\"dataframe\">\n",
687
+ " <thead>\n",
688
+ " <tr style=\"text-align: right;\">\n",
689
+ " <th></th>\n",
690
+ " <th>id</th>\n",
691
+ " <th>submitted_at</th>\n",
692
+ " <th>project_title</th>\n",
693
+ " <th>student_names</th>\n",
694
+ " <th>year</th>\n",
695
+ " <th>abstract</th>\n",
696
+ " <th>description</th>\n",
697
+ " <th>problem_statement</th>\n",
698
+ " <th>proposed_solution</th>\n",
699
+ " <th>objectives</th>\n",
700
+ " <th>full_content</th>\n",
701
+ " <th>clean_text</th>\n",
702
+ " <th>word_count</th>\n",
703
+ " <th>features</th>\n",
704
+ " </tr>\n",
705
+ " </thead>\n",
706
+ " <tbody>\n",
707
+ " <tr>\n",
708
+ " <th>0</th>\n",
709
+ " <td>1</td>\n",
710
+ " <td>NaT</td>\n",
711
+ " <td>3D hand game for neuromuscular patients</td>\n",
712
+ " <td>Ahmed Mansour Mohamed Saber, Ahmed Mohamed Moh...</td>\n",
713
+ " <td>2017</td>\n",
714
+ " <td>In this project we have designed and implement...</td>\n",
715
+ " <td>A virtual rehabilitation system that uses a Le...</td>\n",
716
+ " <td>Neuromuscular patients suffer from nerve atrop...</td>\n",
717
+ " <td>The development of a 3D interactive game integ...</td>\n",
718
+ " <td>1. Develop a scalable and maintainable solutio...</td>\n",
719
+ " <td>3D hand game for neuromuscular patients. 3D ha...</td>\n",
720
+ " <td>3d hand game for neuromuscular patients. 3d ha...</td>\n",
721
+ " <td>172</td>\n",
722
+ " <td>\"\\\"[\\\\\\\"Leap Motion controller sensor\\\\\\\", \\\\\\...</td>\n",
723
+ " </tr>\n",
724
+ " <tr>\n",
725
+ " <th>1</th>\n",
726
+ " <td>2</td>\n",
727
+ " <td>NaT</td>\n",
728
+ " <td>3D Laser Scanning</td>\n",
729
+ " <td>Aya Essam Hegazi, Asmaa Abd EL-Aziz, Ebtehal E...</td>\n",
730
+ " <td>2024</td>\n",
731
+ " <td>3D scanning is used in many applications such ...</td>\n",
732
+ " <td>This project implements a low-cost 3D laser sc...</td>\n",
733
+ " <td>Existing 3D scanning devices are often extreme...</td>\n",
734
+ " <td>A low-cost 3D laser scanning system that utili...</td>\n",
735
+ " <td>1. Improve overall productivity and workflow o...</td>\n",
736
+ " <td>3D Laser Scanning. 3D Laser Scanning. 3D scann...</td>\n",
737
+ " <td>3d laser scanning. 3d laser scanning. 3d scann...</td>\n",
738
+ " <td>185</td>\n",
739
+ " <td>\"\\\"[\\\\\\\"3d laser scanning\\\\\\\", \\\\\\\"Hand-held l...</td>\n",
740
+ " </tr>\n",
741
+ " <tr>\n",
742
+ " <th>2</th>\n",
743
+ " <td>3</td>\n",
744
+ " <td>NaT</td>\n",
745
+ " <td>A Smart Automatic System for Criminal Identifi...</td>\n",
746
+ " <td>Yousef Yacoub Mohammed, Ahmed Mohamed Hassan,\\...</td>\n",
747
+ " <td>2020</td>\n",
748
+ " <td>The increasing use of biometric technologies i...</td>\n",
749
+ " <td>This project develops an automated criminal id...</td>\n",
750
+ " <td>Traditional identification methods, such as ph...</td>\n",
751
+ " <td>A real-time facial recognition system develope...</td>\n",
752
+ " <td>1. Support future scalability and feature expa...</td>\n",
753
+ " <td>A Smart Automatic System for Criminal Identifi...</td>\n",
754
+ " <td>a smart automatic system for criminal identifi...</td>\n",
755
+ " <td>138</td>\n",
756
+ " <td>\"\\\"[\\\\\\\"real-time face recognition system\\\\\\\",...</td>\n",
757
+ " </tr>\n",
758
+ " <tr>\n",
759
+ " <th>3</th>\n",
760
+ " <td>4</td>\n",
761
+ " <td>NaT</td>\n",
762
+ " <td>Advanced Educational Platform “ABSTHALK”</td>\n",
763
+ " <td>Mohamed Nasser Maher, Karim Ashraf Salah Eldie...</td>\n",
764
+ " <td>2025</td>\n",
765
+ " <td>The Educational Platform for Students and Teac...</td>\n",
766
+ " <td>ABSTHALK is a comprehensive, role-based e-lear...</td>\n",
767
+ " <td>Traditional learning methods often lack access...</td>\n",
768
+ " <td>The project proposes a structured, role-based,...</td>\n",
769
+ " <td>1. Provide interactive educational tools and r...</td>\n",
770
+ " <td>Advanced Educational Platform “ABSTHALK”. Adva...</td>\n",
771
+ " <td>advanced educational platform absthalk . advan...</td>\n",
772
+ " <td>192</td>\n",
773
+ " <td>\"\\\"[\\\\\\\"Role-based management system\\\\\\\", \\\\\\\"...</td>\n",
774
+ " </tr>\n",
775
+ " <tr>\n",
776
+ " <th>4</th>\n",
777
+ " <td>5</td>\n",
778
+ " <td>NaT</td>\n",
779
+ " <td>Agricultural Information and Management System</td>\n",
780
+ " <td>Ahmed Mohamed, Omar Hassan, Mahmoud Ali ,Mazen...</td>\n",
781
+ " <td>2020</td>\n",
782
+ " <td>It is a permanent link between the decision-ma...</td>\n",
783
+ " <td>This project is an integrated information syst...</td>\n",
784
+ " <td>The competent authorities of the Ministry of A...</td>\n",
785
+ " <td>The development of an integrated information s...</td>\n",
786
+ " <td>1. Reduce operational complexity and improve e...</td>\n",
787
+ " <td>Agricultural Information and Management System...</td>\n",
788
+ " <td>agricultural information and management system...</td>\n",
789
+ " <td>109</td>\n",
790
+ " <td>\"\\\"[\\\\\\\"centralized database\\\\\\\", \\\\\\\"track la...</td>\n",
791
+ " </tr>\n",
792
+ " </tbody>\n",
793
+ "</table>\n",
794
+ "</div>"
795
+ ],
796
+ "text/plain": [
797
+ " id submitted_at project_title \\\n",
798
+ "0 1 NaT 3D hand game for neuromuscular patients \n",
799
+ "1 2 NaT 3D Laser Scanning \n",
800
+ "2 3 NaT A Smart Automatic System for Criminal Identifi... \n",
801
+ "3 4 NaT Advanced Educational Platform “ABSTHALK” \n",
802
+ "4 5 NaT Agricultural Information and Management System \n",
803
+ "\n",
804
+ " student_names year \\\n",
805
+ "0 Ahmed Mansour Mohamed Saber, Ahmed Mohamed Moh... 2017 \n",
806
+ "1 Aya Essam Hegazi, Asmaa Abd EL-Aziz, Ebtehal E... 2024 \n",
807
+ "2 Yousef Yacoub Mohammed, Ahmed Mohamed Hassan,\\... 2020 \n",
808
+ "3 Mohamed Nasser Maher, Karim Ashraf Salah Eldie... 2025 \n",
809
+ "4 Ahmed Mohamed, Omar Hassan, Mahmoud Ali ,Mazen... 2020 \n",
810
+ "\n",
811
+ " abstract \\\n",
812
+ "0 In this project we have designed and implement... \n",
813
+ "1 3D scanning is used in many applications such ... \n",
814
+ "2 The increasing use of biometric technologies i... \n",
815
+ "3 The Educational Platform for Students and Teac... \n",
816
+ "4 It is a permanent link between the decision-ma... \n",
817
+ "\n",
818
+ " description \\\n",
819
+ "0 A virtual rehabilitation system that uses a Le... \n",
820
+ "1 This project implements a low-cost 3D laser sc... \n",
821
+ "2 This project develops an automated criminal id... \n",
822
+ "3 ABSTHALK is a comprehensive, role-based e-lear... \n",
823
+ "4 This project is an integrated information syst... \n",
824
+ "\n",
825
+ " problem_statement \\\n",
826
+ "0 Neuromuscular patients suffer from nerve atrop... \n",
827
+ "1 Existing 3D scanning devices are often extreme... \n",
828
+ "2 Traditional identification methods, such as ph... \n",
829
+ "3 Traditional learning methods often lack access... \n",
830
+ "4 The competent authorities of the Ministry of A... \n",
831
+ "\n",
832
+ " proposed_solution \\\n",
833
+ "0 The development of a 3D interactive game integ... \n",
834
+ "1 A low-cost 3D laser scanning system that utili... \n",
835
+ "2 A real-time facial recognition system develope... \n",
836
+ "3 The project proposes a structured, role-based,... \n",
837
+ "4 The development of an integrated information s... \n",
838
+ "\n",
839
+ " objectives \\\n",
840
+ "0 1. Develop a scalable and maintainable solutio... \n",
841
+ "1 1. Improve overall productivity and workflow o... \n",
842
+ "2 1. Support future scalability and feature expa... \n",
843
+ "3 1. Provide interactive educational tools and r... \n",
844
+ "4 1. Reduce operational complexity and improve e... \n",
845
+ "\n",
846
+ " full_content \\\n",
847
+ "0 3D hand game for neuromuscular patients. 3D ha... \n",
848
+ "1 3D Laser Scanning. 3D Laser Scanning. 3D scann... \n",
849
+ "2 A Smart Automatic System for Criminal Identifi... \n",
850
+ "3 Advanced Educational Platform “ABSTHALK”. Adva... \n",
851
+ "4 Agricultural Information and Management System... \n",
852
+ "\n",
853
+ " clean_text word_count \\\n",
854
+ "0 3d hand game for neuromuscular patients. 3d ha... 172 \n",
855
+ "1 3d laser scanning. 3d laser scanning. 3d scann... 185 \n",
856
+ "2 a smart automatic system for criminal identifi... 138 \n",
857
+ "3 advanced educational platform absthalk . advan... 192 \n",
858
+ "4 agricultural information and management system... 109 \n",
859
+ "\n",
860
+ " features \n",
861
+ "0 \"\\\"[\\\\\\\"Leap Motion controller sensor\\\\\\\", \\\\\\... \n",
862
+ "1 \"\\\"[\\\\\\\"3d laser scanning\\\\\\\", \\\\\\\"Hand-held l... \n",
863
+ "2 \"\\\"[\\\\\\\"real-time face recognition system\\\\\\\",... \n",
864
+ "3 \"\\\"[\\\\\\\"Role-based management system\\\\\\\", \\\\\\\"... \n",
865
+ "4 \"\\\"[\\\\\\\"centralized database\\\\\\\", \\\\\\\"track la... "
866
+ ]
867
+ },
868
+ "execution_count": 8,
869
+ "metadata": {},
870
+ "output_type": "execute_result"
871
+ }
872
+ ],
873
+ "source": [
874
+ "query = \"\"\"\n",
875
+ "SELECT *\n",
876
+ "FROM PreProcessed_Projects\n",
877
+ "\"\"\"\n",
878
+ "\n",
879
+ "clean_df = pd.read_sql(query, engine)\n",
880
+ "\n",
881
+ "clean_df.head()"
882
+ ]
883
+ },
884
+ {
885
+ "cell_type": "code",
886
+ "execution_count": 9,
887
+ "id": "e5af88d4",
888
+ "metadata": {},
889
+ "outputs": [
890
+ {
891
+ "name": "stdout",
892
+ "output_type": "stream",
893
+ "text": [
894
+ "(255, 14)\n"
895
+ ]
896
+ }
897
+ ],
898
+ "source": [
899
+ "print(clean_df.shape)\n"
900
+ ]
901
+ },
902
+ {
903
+ "cell_type": "code",
904
+ "execution_count": 10,
905
+ "id": "bb80639a",
906
+ "metadata": {},
907
+ "outputs": [
908
+ {
909
+ "data": {
910
+ "text/plain": [
911
+ "count 255.000000\n",
912
+ "mean 236.031373\n",
913
+ "std 87.747619\n",
914
+ "min 24.000000\n",
915
+ "25% 173.500000\n",
916
+ "50% 225.000000\n",
917
+ "75% 287.000000\n",
918
+ "max 719.000000\n",
919
+ "Name: features, dtype: float64"
920
+ ]
921
+ },
922
+ "execution_count": 10,
923
+ "metadata": {},
924
+ "output_type": "execute_result"
925
+ }
926
+ ],
927
+ "source": [
928
+ "clean_df[\"features\"].apply(len).describe()"
929
+ ]
930
+ },
931
+ {
932
+ "cell_type": "code",
933
+ "execution_count": 11,
934
+ "id": "633cfec4",
935
+ "metadata": {},
936
+ "outputs": [
937
+ {
938
+ "name": "stdout",
939
+ "output_type": "stream",
940
+ "text": [
941
+ "Saved cleaned dataset\n"
942
+ ]
943
+ }
944
+ ],
945
+ "source": [
946
+ "clean_df.to_parquet(\"Data_gemini/projects_clean_gemini.parquet\", index=False)\n",
947
+ "clean_df.to_csv(\"Data_gemini/projects_clean_gemini.csv\", index=False)\n",
948
+ "\n",
949
+ "print(\"Saved cleaned dataset\")"
950
+ ]
951
+ },
952
+ {
953
+ "cell_type": "code",
954
+ "execution_count": 12,
955
+ "id": "36f84432",
956
+ "metadata": {},
957
+ "outputs": [
958
+ {
959
+ "name": "stdout",
960
+ "output_type": "stream",
961
+ "text": [
962
+ "(255, 14)\n"
963
+ ]
964
+ }
965
+ ],
966
+ "source": [
967
+ "test_df = pd.read_parquet(\n",
968
+ " \"Data_gemini/projects_clean_gemini.parquet\"\n",
969
+ ")\n",
970
+ "\n",
971
+ "print(test_df.shape)"
972
+ ]
973
+ },
974
+ {
975
+ "cell_type": "code",
976
+ "execution_count": 13,
977
+ "id": "0dd86aec",
978
+ "metadata": {},
979
+ "outputs": [
980
+ {
981
+ "name": "stdout",
982
+ "output_type": "stream",
983
+ "text": [
984
+ "['id', 'submitted_at', 'project_title', 'student_names', 'year', 'abstract', 'description', 'problem_statement', 'proposed_solution', 'objectives', 'full_content', 'clean_text', 'word_count', 'features']\n"
985
+ ]
986
+ }
987
+ ],
988
+ "source": [
989
+ "print(clean_df.columns.tolist())"
990
+ ]
991
+ },
992
+ {
993
+ "cell_type": "code",
994
+ "execution_count": 14,
995
+ "id": "e3e96549",
996
+ "metadata": {},
997
+ "outputs": [
998
+ {
999
+ "data": {
1000
+ "text/html": [
1001
+ "<div>\n",
1002
+ "<style scoped>\n",
1003
+ " .dataframe tbody tr th:only-of-type {\n",
1004
+ " vertical-align: middle;\n",
1005
+ " }\n",
1006
+ "\n",
1007
+ " .dataframe tbody tr th {\n",
1008
+ " vertical-align: top;\n",
1009
+ " }\n",
1010
+ "\n",
1011
+ " .dataframe thead th {\n",
1012
+ " text-align: right;\n",
1013
+ " }\n",
1014
+ "</style>\n",
1015
+ "<table border=\"1\" class=\"dataframe\">\n",
1016
+ " <thead>\n",
1017
+ " <tr style=\"text-align: right;\">\n",
1018
+ " <th></th>\n",
1019
+ " <th>id</th>\n",
1020
+ " <th>submitted_at</th>\n",
1021
+ " <th>project_title</th>\n",
1022
+ " <th>student_names</th>\n",
1023
+ " <th>year</th>\n",
1024
+ " <th>abstract</th>\n",
1025
+ " <th>description</th>\n",
1026
+ " <th>problem_statement</th>\n",
1027
+ " <th>proposed_solution</th>\n",
1028
+ " <th>objectives</th>\n",
1029
+ " <th>full_content</th>\n",
1030
+ " <th>clean_text</th>\n",
1031
+ " <th>word_count</th>\n",
1032
+ " <th>features</th>\n",
1033
+ " </tr>\n",
1034
+ " </thead>\n",
1035
+ " <tbody>\n",
1036
+ " <tr>\n",
1037
+ " <th>0</th>\n",
1038
+ " <td>1</td>\n",
1039
+ " <td>None</td>\n",
1040
+ " <td>3D hand game for neuromuscular patients</td>\n",
1041
+ " <td>Ahmed Mansour Mohamed Saber, Ahmed Mohamed Moh...</td>\n",
1042
+ " <td>2017</td>\n",
1043
+ " <td>In this project we have designed and implement...</td>\n",
1044
+ " <td>A virtual rehabilitation system that uses a Le...</td>\n",
1045
+ " <td>Neuromuscular patients suffer from nerve atrop...</td>\n",
1046
+ " <td>The development of a 3D interactive game integ...</td>\n",
1047
+ " <td>1. Develop a scalable and maintainable solutio...</td>\n",
1048
+ " <td>3D hand game for neuromuscular patients. 3D ha...</td>\n",
1049
+ " <td>3d hand game for neuromuscular patients. 3d ha...</td>\n",
1050
+ " <td>172</td>\n",
1051
+ " <td>\"\\\"[\\\\\\\"Leap Motion controller sensor\\\\\\\", \\\\\\...</td>\n",
1052
+ " </tr>\n",
1053
+ " <tr>\n",
1054
+ " <th>1</th>\n",
1055
+ " <td>2</td>\n",
1056
+ " <td>None</td>\n",
1057
+ " <td>3D Laser Scanning</td>\n",
1058
+ " <td>Aya Essam Hegazi, Asmaa Abd EL-Aziz, Ebtehal E...</td>\n",
1059
+ " <td>2024</td>\n",
1060
+ " <td>3D scanning is used in many applications such ...</td>\n",
1061
+ " <td>This project implements a low-cost 3D laser sc...</td>\n",
1062
+ " <td>Existing 3D scanning devices are often extreme...</td>\n",
1063
+ " <td>A low-cost 3D laser scanning system that utili...</td>\n",
1064
+ " <td>1. Improve overall productivity and workflow o...</td>\n",
1065
+ " <td>3D Laser Scanning. 3D Laser Scanning. 3D scann...</td>\n",
1066
+ " <td>3d laser scanning. 3d laser scanning. 3d scann...</td>\n",
1067
+ " <td>185</td>\n",
1068
+ " <td>\"\\\"[\\\\\\\"3d laser scanning\\\\\\\", \\\\\\\"Hand-held l...</td>\n",
1069
+ " </tr>\n",
1070
+ " <tr>\n",
1071
+ " <th>2</th>\n",
1072
+ " <td>3</td>\n",
1073
+ " <td>None</td>\n",
1074
+ " <td>A Smart Automatic System for Criminal Identifi...</td>\n",
1075
+ " <td>Yousef Yacoub Mohammed, Ahmed Mohamed Hassan,\\...</td>\n",
1076
+ " <td>2020</td>\n",
1077
+ " <td>The increasing use of biometric technologies i...</td>\n",
1078
+ " <td>This project develops an automated criminal id...</td>\n",
1079
+ " <td>Traditional identification methods, such as ph...</td>\n",
1080
+ " <td>A real-time facial recognition system develope...</td>\n",
1081
+ " <td>1. Support future scalability and feature expa...</td>\n",
1082
+ " <td>A Smart Automatic System for Criminal Identifi...</td>\n",
1083
+ " <td>a smart automatic system for criminal identifi...</td>\n",
1084
+ " <td>138</td>\n",
1085
+ " <td>\"\\\"[\\\\\\\"real-time face recognition system\\\\\\\",...</td>\n",
1086
+ " </tr>\n",
1087
+ " <tr>\n",
1088
+ " <th>3</th>\n",
1089
+ " <td>4</td>\n",
1090
+ " <td>None</td>\n",
1091
+ " <td>Advanced Educational Platform “ABSTHALK”</td>\n",
1092
+ " <td>Mohamed Nasser Maher, Karim Ashraf Salah Eldie...</td>\n",
1093
+ " <td>2025</td>\n",
1094
+ " <td>The Educational Platform for Students and Teac...</td>\n",
1095
+ " <td>ABSTHALK is a comprehensive, role-based e-lear...</td>\n",
1096
+ " <td>Traditional learning methods often lack access...</td>\n",
1097
+ " <td>The project proposes a structured, role-based,...</td>\n",
1098
+ " <td>1. Provide interactive educational tools and r...</td>\n",
1099
+ " <td>Advanced Educational Platform “ABSTHALK”. Adva...</td>\n",
1100
+ " <td>advanced educational platform absthalk . advan...</td>\n",
1101
+ " <td>192</td>\n",
1102
+ " <td>\"\\\"[\\\\\\\"Role-based management system\\\\\\\", \\\\\\\"...</td>\n",
1103
+ " </tr>\n",
1104
+ " <tr>\n",
1105
+ " <th>4</th>\n",
1106
+ " <td>5</td>\n",
1107
+ " <td>None</td>\n",
1108
+ " <td>Agricultural Information and Management System</td>\n",
1109
+ " <td>Ahmed Mohamed, Omar Hassan, Mahmoud Ali ,Mazen...</td>\n",
1110
+ " <td>2020</td>\n",
1111
+ " <td>It is a permanent link between the decision-ma...</td>\n",
1112
+ " <td>This project is an integrated information syst...</td>\n",
1113
+ " <td>The competent authorities of the Ministry of A...</td>\n",
1114
+ " <td>The development of an integrated information s...</td>\n",
1115
+ " <td>1. Reduce operational complexity and improve e...</td>\n",
1116
+ " <td>Agricultural Information and Management System...</td>\n",
1117
+ " <td>agricultural information and management system...</td>\n",
1118
+ " <td>109</td>\n",
1119
+ " <td>\"\\\"[\\\\\\\"centralized database\\\\\\\", \\\\\\\"track la...</td>\n",
1120
+ " </tr>\n",
1121
+ " </tbody>\n",
1122
+ "</table>\n",
1123
+ "</div>"
1124
+ ],
1125
+ "text/plain": [
1126
+ " id submitted_at project_title \\\n",
1127
+ "0 1 None 3D hand game for neuromuscular patients \n",
1128
+ "1 2 None 3D Laser Scanning \n",
1129
+ "2 3 None A Smart Automatic System for Criminal Identifi... \n",
1130
+ "3 4 None Advanced Educational Platform “ABSTHALK” \n",
1131
+ "4 5 None Agricultural Information and Management System \n",
1132
+ "\n",
1133
+ " student_names year \\\n",
1134
+ "0 Ahmed Mansour Mohamed Saber, Ahmed Mohamed Moh... 2017 \n",
1135
+ "1 Aya Essam Hegazi, Asmaa Abd EL-Aziz, Ebtehal E... 2024 \n",
1136
+ "2 Yousef Yacoub Mohammed, Ahmed Mohamed Hassan,\\... 2020 \n",
1137
+ "3 Mohamed Nasser Maher, Karim Ashraf Salah Eldie... 2025 \n",
1138
+ "4 Ahmed Mohamed, Omar Hassan, Mahmoud Ali ,Mazen... 2020 \n",
1139
+ "\n",
1140
+ " abstract \\\n",
1141
+ "0 In this project we have designed and implement... \n",
1142
+ "1 3D scanning is used in many applications such ... \n",
1143
+ "2 The increasing use of biometric technologies i... \n",
1144
+ "3 The Educational Platform for Students and Teac... \n",
1145
+ "4 It is a permanent link between the decision-ma... \n",
1146
+ "\n",
1147
+ " description \\\n",
1148
+ "0 A virtual rehabilitation system that uses a Le... \n",
1149
+ "1 This project implements a low-cost 3D laser sc... \n",
1150
+ "2 This project develops an automated criminal id... \n",
1151
+ "3 ABSTHALK is a comprehensive, role-based e-lear... \n",
1152
+ "4 This project is an integrated information syst... \n",
1153
+ "\n",
1154
+ " problem_statement \\\n",
1155
+ "0 Neuromuscular patients suffer from nerve atrop... \n",
1156
+ "1 Existing 3D scanning devices are often extreme... \n",
1157
+ "2 Traditional identification methods, such as ph... \n",
1158
+ "3 Traditional learning methods often lack access... \n",
1159
+ "4 The competent authorities of the Ministry of A... \n",
1160
+ "\n",
1161
+ " proposed_solution \\\n",
1162
+ "0 The development of a 3D interactive game integ... \n",
1163
+ "1 A low-cost 3D laser scanning system that utili... \n",
1164
+ "2 A real-time facial recognition system develope... \n",
1165
+ "3 The project proposes a structured, role-based,... \n",
1166
+ "4 The development of an integrated information s... \n",
1167
+ "\n",
1168
+ " objectives \\\n",
1169
+ "0 1. Develop a scalable and maintainable solutio... \n",
1170
+ "1 1. Improve overall productivity and workflow o... \n",
1171
+ "2 1. Support future scalability and feature expa... \n",
1172
+ "3 1. Provide interactive educational tools and r... \n",
1173
+ "4 1. Reduce operational complexity and improve e... \n",
1174
+ "\n",
1175
+ " full_content \\\n",
1176
+ "0 3D hand game for neuromuscular patients. 3D ha... \n",
1177
+ "1 3D Laser Scanning. 3D Laser Scanning. 3D scann... \n",
1178
+ "2 A Smart Automatic System for Criminal Identifi... \n",
1179
+ "3 Advanced Educational Platform “ABSTHALK”. Adva... \n",
1180
+ "4 Agricultural Information and Management System... \n",
1181
+ "\n",
1182
+ " clean_text word_count \\\n",
1183
+ "0 3d hand game for neuromuscular patients. 3d ha... 172 \n",
1184
+ "1 3d laser scanning. 3d laser scanning. 3d scann... 185 \n",
1185
+ "2 a smart automatic system for criminal identifi... 138 \n",
1186
+ "3 advanced educational platform absthalk . advan... 192 \n",
1187
+ "4 agricultural information and management system... 109 \n",
1188
+ "\n",
1189
+ " features \n",
1190
+ "0 \"\\\"[\\\\\\\"Leap Motion controller sensor\\\\\\\", \\\\\\... \n",
1191
+ "1 \"\\\"[\\\\\\\"3d laser scanning\\\\\\\", \\\\\\\"Hand-held l... \n",
1192
+ "2 \"\\\"[\\\\\\\"real-time face recognition system\\\\\\\",... \n",
1193
+ "3 \"\\\"[\\\\\\\"Role-based management system\\\\\\\", \\\\\\\"... \n",
1194
+ "4 \"\\\"[\\\\\\\"centralized database\\\\\\\", \\\\\\\"track la... "
1195
+ ]
1196
+ },
1197
+ "execution_count": 14,
1198
+ "metadata": {},
1199
+ "output_type": "execute_result"
1200
+ }
1201
+ ],
1202
+ "source": [
1203
+ "test_df = pd.read_sql(\n",
1204
+ " \"SELECT TOP 5 * FROM PreProcessed_Projects\",\n",
1205
+ " engine\n",
1206
+ ")\n",
1207
+ "\n",
1208
+ "test_df.head()"
1209
+ ]
1210
+ },
1211
+ {
1212
+ "cell_type": "code",
1213
+ "execution_count": 15,
1214
+ "id": "078d4b8c",
1215
+ "metadata": {},
1216
+ "outputs": [
1217
+ {
1218
+ "name": "stdout",
1219
+ "output_type": "stream",
1220
+ "text": [
1221
+ "================================================================================\n",
1222
+ "Hospital Test\n",
1223
+ "================================================================================\n",
1224
+ "USING GEMINI FEATURE EXTRACTOR\n",
1225
+ "CALLING GEMINI\n"
1226
+ ]
1227
+ },
1228
+ {
1229
+ "name": "stderr",
1230
+ "output_type": "stream",
1231
+ "text": [
1232
+ "2026-06-04 00:30:08,804 | INFO | [LLM] model=gemini-3.1-flash-lite-preview | task=feature | attempt=1\n",
1233
+ "2026-06-04 00:30:08,805 | INFO | AFC is enabled with max remote calls: 10.\n",
1234
+ "2026-06-04 00:30:09,875 | INFO | HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-lite-preview:generateContent \"HTTP/1.1 200 OK\"\n"
1235
+ ]
1236
+ },
1237
+ {
1238
+ "name": "stdout",
1239
+ "output_type": "stream",
1240
+ "text": [
1241
+ "PARSED FEATURES:\n",
1242
+ "['appointment booking', 'patient records management', 'medical records storage', 'doctor dashboard', 'physician dashboard', 'ai chatbot']\n"
1243
+ ]
1244
+ },
1245
+ {
1246
+ "data": {
1247
+ "application/vnd.jupyter.widget-view+json": {
1248
+ "model_id": "a32846683c0e41e48b4b5cac27cbb769",
1249
+ "version_major": 2,
1250
+ "version_minor": 0
1251
+ },
1252
+ "text/plain": [
1253
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1254
+ ]
1255
+ },
1256
+ "metadata": {},
1257
+ "output_type": "display_data"
1258
+ },
1259
+ {
1260
+ "name": "stdout",
1261
+ "output_type": "stream",
1262
+ "text": [
1263
+ "Feature Count: 5\n",
1264
+ "\n",
1265
+ "- appointment booking\n",
1266
+ "- patient records management\n",
1267
+ "- medical records storage\n",
1268
+ "- doctor dashboard\n",
1269
+ "- ai chatbot\n",
1270
+ "\n",
1271
+ "Duplicate Check:\n",
1272
+ "patient records management <-> medical records storage (shared=1)\n",
1273
+ "\n",
1274
+ "\n",
1275
+ "================================================================================\n",
1276
+ "Machine Learning Test\n",
1277
+ "================================================================================\n",
1278
+ "USING GEMINI FEATURE EXTRACTOR\n",
1279
+ "CALLING GEMINI\n"
1280
+ ]
1281
+ },
1282
+ {
1283
+ "name": "stderr",
1284
+ "output_type": "stream",
1285
+ "text": [
1286
+ "2026-06-04 00:30:16,521 | INFO | [LLM] model=gemini-3.1-flash-lite-preview | task=feature | attempt=1\n",
1287
+ "2026-06-04 00:30:16,522 | INFO | AFC is enabled with max remote calls: 10.\n",
1288
+ "2026-06-04 00:30:17,431 | INFO | HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-lite-preview:generateContent \"HTTP/1.1 200 OK\"\n"
1289
+ ]
1290
+ },
1291
+ {
1292
+ "name": "stdout",
1293
+ "output_type": "stream",
1294
+ "text": [
1295
+ "PARSED FEATURES:\n",
1296
+ "['prediction', 'analysis']\n"
1297
+ ]
1298
+ },
1299
+ {
1300
+ "data": {
1301
+ "application/vnd.jupyter.widget-view+json": {
1302
+ "model_id": "560f448ba2794e0e9e1940be1b66697d",
1303
+ "version_major": 2,
1304
+ "version_minor": 0
1305
+ },
1306
+ "text/plain": [
1307
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1308
+ ]
1309
+ },
1310
+ "metadata": {},
1311
+ "output_type": "display_data"
1312
+ },
1313
+ {
1314
+ "name": "stdout",
1315
+ "output_type": "stream",
1316
+ "text": [
1317
+ "Feature Count: 2\n",
1318
+ "\n",
1319
+ "- prediction\n",
1320
+ "- analysis\n",
1321
+ "\n",
1322
+ "Duplicate Check:\n",
1323
+ "No duplicate overlaps found\n",
1324
+ "\n",
1325
+ "\n",
1326
+ "================================================================================\n",
1327
+ "Face Recognition Test\n",
1328
+ "================================================================================\n",
1329
+ "USING GEMINI FEATURE EXTRACTOR\n",
1330
+ "CALLING GEMINI\n"
1331
+ ]
1332
+ },
1333
+ {
1334
+ "name": "stderr",
1335
+ "output_type": "stream",
1336
+ "text": [
1337
+ "2026-06-04 00:30:21,508 | INFO | [LLM] model=gemini-3.1-flash-lite-preview | task=feature | attempt=1\n",
1338
+ "2026-06-04 00:30:21,509 | INFO | AFC is enabled with max remote calls: 10.\n",
1339
+ "2026-06-04 00:30:22,145 | INFO | HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-lite-preview:generateContent \"HTTP/1.1 200 OK\"\n"
1340
+ ]
1341
+ },
1342
+ {
1343
+ "name": "stdout",
1344
+ "output_type": "stream",
1345
+ "text": [
1346
+ "PARSED FEATURES:\n",
1347
+ "['face recognition', 'real-time face detection', 'student attendance management', 'mobile application']\n"
1348
+ ]
1349
+ },
1350
+ {
1351
+ "data": {
1352
+ "application/vnd.jupyter.widget-view+json": {
1353
+ "model_id": "4ce3cd5b56544cb4864d5f0779063227",
1354
+ "version_major": 2,
1355
+ "version_minor": 0
1356
+ },
1357
+ "text/plain": [
1358
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1359
+ ]
1360
+ },
1361
+ "metadata": {},
1362
+ "output_type": "display_data"
1363
+ },
1364
+ {
1365
+ "name": "stdout",
1366
+ "output_type": "stream",
1367
+ "text": [
1368
+ "Feature Count: 4\n",
1369
+ "\n",
1370
+ "- face recognition\n",
1371
+ "- real-time face detection\n",
1372
+ "- student attendance management\n",
1373
+ "- mobile application\n",
1374
+ "\n",
1375
+ "Duplicate Check:\n",
1376
+ "face recognition <-> real-time face detection (shared=1)\n",
1377
+ "\n",
1378
+ "\n"
1379
+ ]
1380
+ }
1381
+ ],
1382
+ "source": [
1383
+ "from src.similarity_model.preprocessing import (\n",
1384
+ " extract_features,\n",
1385
+ " normalize_text\n",
1386
+ ")\n",
1387
+ "\n",
1388
+ "def check_duplicates(features):\n",
1389
+ "\n",
1390
+ " found = False\n",
1391
+ "\n",
1392
+ " for i in range(len(features)):\n",
1393
+ " for j in range(i + 1, len(features)):\n",
1394
+ "\n",
1395
+ " a = set(features[i].split())\n",
1396
+ " b = set(features[j].split())\n",
1397
+ "\n",
1398
+ " overlap = len(a & b)\n",
1399
+ "\n",
1400
+ " if overlap > 0:\n",
1401
+ " found = True\n",
1402
+ " print(\n",
1403
+ " f\"{features[i]} <-> {features[j]} \"\n",
1404
+ " f\"(shared={overlap})\"\n",
1405
+ " )\n",
1406
+ "\n",
1407
+ " if not found:\n",
1408
+ " print(\"No duplicate overlaps found\")\n",
1409
+ "\n",
1410
+ "\n",
1411
+ "tests = {\n",
1412
+ " \"Hospital Test\": \"\"\"\n",
1413
+ " Hospital management system with\n",
1414
+ " appointment booking,\n",
1415
+ " online appointment booking,\n",
1416
+ " patient records,\n",
1417
+ " medical records,\n",
1418
+ " doctor dashboard,\n",
1419
+ " physician dashboard,\n",
1420
+ " AI chatbot,\n",
1421
+ " intelligent chatbot\n",
1422
+ " \"\"\",\n",
1423
+ "\n",
1424
+ " \"Machine Learning Test\": \"\"\"\n",
1425
+ " Machine learning system using machine learning\n",
1426
+ " for machine learning prediction and machine learning analysis.\n",
1427
+ " \"\"\",\n",
1428
+ "\n",
1429
+ " \"Face Recognition Test\": \"\"\"\n",
1430
+ " Face recognition attendance system using deep learning,\n",
1431
+ " computer vision,\n",
1432
+ " real-time face detection,\n",
1433
+ " student attendance management and mobile application.\n",
1434
+ " \"\"\"\n",
1435
+ "}\n",
1436
+ "\n",
1437
+ "for name, query in tests.items():\n",
1438
+ "\n",
1439
+ " print(\"=\" * 80)\n",
1440
+ " print(name)\n",
1441
+ " print(\"=\" * 80)\n",
1442
+ "\n",
1443
+ " features = extract_features(\n",
1444
+ " normalize_text(query)\n",
1445
+ " )\n",
1446
+ "\n",
1447
+ " print(f\"Feature Count: {len(features)}\")\n",
1448
+ " print()\n",
1449
+ "\n",
1450
+ " for f in features:\n",
1451
+ " print(\"-\", f)\n",
1452
+ "\n",
1453
+ " print(\"\\nDuplicate Check:\")\n",
1454
+ " check_duplicates(features)\n",
1455
+ "\n",
1456
+ " print(\"\\n\")"
1457
+ ]
1458
+ },
1459
+ {
1460
+ "cell_type": "code",
1461
+ "execution_count": 16,
1462
+ "id": "edc0890d",
1463
+ "metadata": {},
1464
+ "outputs": [],
1465
+ "source": [
1466
+ "from Data.database.sql_connector import engine\n",
1467
+ "\n",
1468
+ "engine.dispose()"
1469
+ ]
1470
+ },
1471
+ {
1472
+ "cell_type": "code",
1473
+ "execution_count": 17,
1474
+ "id": "0a231154",
1475
+ "metadata": {},
1476
+ "outputs": [
1477
+ {
1478
+ "name": "stderr",
1479
+ "output_type": "stream",
1480
+ "text": [
1481
+ "2026-06-04 00:30:22,479 | INFO | Loading models and artifacts...\n",
1482
+ "2026-06-04 00:30:22,481 | INFO | Loading model: all-MiniLM-L6-v2\n",
1483
+ "2026-06-04 00:30:22,481 | INFO | Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n",
1484
+ "e:\\gradution project\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:949: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
1485
+ " warnings.warn(\n",
1486
+ "2026-06-04 00:30:24,618 | INFO | Use pytorch device_name: cpu\n",
1487
+ "2026-06-04 00:30:24,624 | INFO | Loading FAISS index...\n",
1488
+ "2026-06-04 00:30:24,627 | INFO | Loading feature model: all-MiniLM-L6-v2\n",
1489
+ "2026-06-04 00:30:24,628 | INFO | Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n",
1490
+ "2026-06-04 00:30:26,763 | INFO | Use pytorch device_name: cpu\n",
1491
+ "2026-06-04 00:30:26,767 | INFO | Loading metadata from Azure SQL...\n",
1492
+ "2026-06-04 00:30:32,815 | INFO | Preparing query...\n"
1493
+ ]
1494
+ },
1495
+ {
1496
+ "name": "stdout",
1497
+ "output_type": "stream",
1498
+ "text": [
1499
+ "USING GEMINI FEATURE EXTRACTOR\n",
1500
+ "CALLING GEMINI\n"
1501
+ ]
1502
+ },
1503
+ {
1504
+ "name": "stderr",
1505
+ "output_type": "stream",
1506
+ "text": [
1507
+ "2026-06-04 00:30:36,816 | INFO | [LLM] model=gemini-3.1-flash-lite-preview | task=feature | attempt=1\n",
1508
+ "2026-06-04 00:30:36,817 | INFO | AFC is enabled with max remote calls: 10.\n",
1509
+ "2026-06-04 00:30:37,822 | INFO | HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-lite-preview:generateContent \"HTTP/1.1 200 OK\"\n"
1510
+ ]
1511
+ },
1512
+ {
1513
+ "name": "stdout",
1514
+ "output_type": "stream",
1515
+ "text": [
1516
+ "PARSED FEATURES:\n",
1517
+ "['appointment booking', 'patient records', 'doctor dashboard', 'ai chatbot']\n"
1518
+ ]
1519
+ },
1520
+ {
1521
+ "data": {
1522
+ "application/vnd.jupyter.widget-view+json": {
1523
+ "model_id": "eff76001187242a6a509b00507dae4ee",
1524
+ "version_major": 2,
1525
+ "version_minor": 0
1526
+ },
1527
+ "text/plain": [
1528
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1529
+ ]
1530
+ },
1531
+ "metadata": {},
1532
+ "output_type": "display_data"
1533
+ },
1534
+ {
1535
+ "name": "stderr",
1536
+ "output_type": "stream",
1537
+ "text": [
1538
+ "2026-06-04 00:30:37,890 | INFO | Running semantic retrieval...\n"
1539
+ ]
1540
+ },
1541
+ {
1542
+ "data": {
1543
+ "application/vnd.jupyter.widget-view+json": {
1544
+ "model_id": "a03cd362fbff43c2b60ee37fa346b9b3",
1545
+ "version_major": 2,
1546
+ "version_minor": 0
1547
+ },
1548
+ "text/plain": [
1549
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1550
+ ]
1551
+ },
1552
+ "metadata": {},
1553
+ "output_type": "display_data"
1554
+ },
1555
+ {
1556
+ "name": "stderr",
1557
+ "output_type": "stream",
1558
+ "text": [
1559
+ "2026-06-04 00:30:37,995 | INFO | Running hybrid ranking...\n"
1560
+ ]
1561
+ },
1562
+ {
1563
+ "data": {
1564
+ "application/vnd.jupyter.widget-view+json": {
1565
+ "model_id": "b9258e51c54f445a87adba34482d1627",
1566
+ "version_major": 2,
1567
+ "version_minor": 0
1568
+ },
1569
+ "text/plain": [
1570
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1571
+ ]
1572
+ },
1573
+ "metadata": {},
1574
+ "output_type": "display_data"
1575
+ },
1576
+ {
1577
+ "data": {
1578
+ "application/vnd.jupyter.widget-view+json": {
1579
+ "model_id": "c9bba1f165354e5486b8c88ccaeef00e",
1580
+ "version_major": 2,
1581
+ "version_minor": 0
1582
+ },
1583
+ "text/plain": [
1584
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1585
+ ]
1586
+ },
1587
+ "metadata": {},
1588
+ "output_type": "display_data"
1589
+ },
1590
+ {
1591
+ "data": {
1592
+ "application/vnd.jupyter.widget-view+json": {
1593
+ "model_id": "fc4577ed377747f3b87e810d02179ce9",
1594
+ "version_major": 2,
1595
+ "version_minor": 0
1596
+ },
1597
+ "text/plain": [
1598
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1599
+ ]
1600
+ },
1601
+ "metadata": {},
1602
+ "output_type": "display_data"
1603
+ },
1604
+ {
1605
+ "data": {
1606
+ "application/vnd.jupyter.widget-view+json": {
1607
+ "model_id": "507fec89dc7643bb87467da4e0a3d874",
1608
+ "version_major": 2,
1609
+ "version_minor": 0
1610
+ },
1611
+ "text/plain": [
1612
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1613
+ ]
1614
+ },
1615
+ "metadata": {},
1616
+ "output_type": "display_data"
1617
+ },
1618
+ {
1619
+ "data": {
1620
+ "application/vnd.jupyter.widget-view+json": {
1621
+ "model_id": "235c224eec464cf796972ffbb4764179",
1622
+ "version_major": 2,
1623
+ "version_minor": 0
1624
+ },
1625
+ "text/plain": [
1626
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1627
+ ]
1628
+ },
1629
+ "metadata": {},
1630
+ "output_type": "display_data"
1631
+ },
1632
+ {
1633
+ "data": {
1634
+ "application/vnd.jupyter.widget-view+json": {
1635
+ "model_id": "d64ac601101a43e59bfdcba31ca440de",
1636
+ "version_major": 2,
1637
+ "version_minor": 0
1638
+ },
1639
+ "text/plain": [
1640
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1641
+ ]
1642
+ },
1643
+ "metadata": {},
1644
+ "output_type": "display_data"
1645
+ },
1646
+ {
1647
+ "data": {
1648
+ "application/vnd.jupyter.widget-view+json": {
1649
+ "model_id": "dbc0825ced57497a96d822eb5f69d133",
1650
+ "version_major": 2,
1651
+ "version_minor": 0
1652
+ },
1653
+ "text/plain": [
1654
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1655
+ ]
1656
+ },
1657
+ "metadata": {},
1658
+ "output_type": "display_data"
1659
+ },
1660
+ {
1661
+ "data": {
1662
+ "application/vnd.jupyter.widget-view+json": {
1663
+ "model_id": "8e1a98329d0948b39732408daa3d3d0f",
1664
+ "version_major": 2,
1665
+ "version_minor": 0
1666
+ },
1667
+ "text/plain": [
1668
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1669
+ ]
1670
+ },
1671
+ "metadata": {},
1672
+ "output_type": "display_data"
1673
+ },
1674
+ {
1675
+ "data": {
1676
+ "application/vnd.jupyter.widget-view+json": {
1677
+ "model_id": "cacbca885c544c5dbd5ee851924c5e35",
1678
+ "version_major": 2,
1679
+ "version_minor": 0
1680
+ },
1681
+ "text/plain": [
1682
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1683
+ ]
1684
+ },
1685
+ "metadata": {},
1686
+ "output_type": "display_data"
1687
+ },
1688
+ {
1689
+ "data": {
1690
+ "application/vnd.jupyter.widget-view+json": {
1691
+ "model_id": "c1624ce6d6a143eea18a16bcf2b6d598",
1692
+ "version_major": 2,
1693
+ "version_minor": 0
1694
+ },
1695
+ "text/plain": [
1696
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1697
+ ]
1698
+ },
1699
+ "metadata": {},
1700
+ "output_type": "display_data"
1701
+ },
1702
+ {
1703
+ "data": {
1704
+ "application/vnd.jupyter.widget-view+json": {
1705
+ "model_id": "48789bbc44a84be9b2574aae502457f6",
1706
+ "version_major": 2,
1707
+ "version_minor": 0
1708
+ },
1709
+ "text/plain": [
1710
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1711
+ ]
1712
+ },
1713
+ "metadata": {},
1714
+ "output_type": "display_data"
1715
+ },
1716
+ {
1717
+ "data": {
1718
+ "application/vnd.jupyter.widget-view+json": {
1719
+ "model_id": "b12e1bc1c5f54918b7220c4d548c272c",
1720
+ "version_major": 2,
1721
+ "version_minor": 0
1722
+ },
1723
+ "text/plain": [
1724
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1725
+ ]
1726
+ },
1727
+ "metadata": {},
1728
+ "output_type": "display_data"
1729
+ },
1730
+ {
1731
+ "data": {
1732
+ "application/vnd.jupyter.widget-view+json": {
1733
+ "model_id": "d25ff7d6bcf04b88ad3f278fc1074ec0",
1734
+ "version_major": 2,
1735
+ "version_minor": 0
1736
+ },
1737
+ "text/plain": [
1738
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1739
+ ]
1740
+ },
1741
+ "metadata": {},
1742
+ "output_type": "display_data"
1743
+ },
1744
+ {
1745
+ "data": {
1746
+ "application/vnd.jupyter.widget-view+json": {
1747
+ "model_id": "3669c573bc0740d099cdea8534da1929",
1748
+ "version_major": 2,
1749
+ "version_minor": 0
1750
+ },
1751
+ "text/plain": [
1752
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1753
+ ]
1754
+ },
1755
+ "metadata": {},
1756
+ "output_type": "display_data"
1757
+ },
1758
+ {
1759
+ "data": {
1760
+ "application/vnd.jupyter.widget-view+json": {
1761
+ "model_id": "a9b77ba859c247afb63cd9e13f6ec58f",
1762
+ "version_major": 2,
1763
+ "version_minor": 0
1764
+ },
1765
+ "text/plain": [
1766
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1767
+ ]
1768
+ },
1769
+ "metadata": {},
1770
+ "output_type": "display_data"
1771
+ },
1772
+ {
1773
+ "data": {
1774
+ "application/vnd.jupyter.widget-view+json": {
1775
+ "model_id": "be0e6c66976c4ef88b04cddf583f5b75",
1776
+ "version_major": 2,
1777
+ "version_minor": 0
1778
+ },
1779
+ "text/plain": [
1780
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1781
+ ]
1782
+ },
1783
+ "metadata": {},
1784
+ "output_type": "display_data"
1785
+ },
1786
+ {
1787
+ "data": {
1788
+ "application/vnd.jupyter.widget-view+json": {
1789
+ "model_id": "7beccc50a11349d4aa7eb3b83b33f9b7",
1790
+ "version_major": 2,
1791
+ "version_minor": 0
1792
+ },
1793
+ "text/plain": [
1794
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1795
+ ]
1796
+ },
1797
+ "metadata": {},
1798
+ "output_type": "display_data"
1799
+ },
1800
+ {
1801
+ "data": {
1802
+ "application/vnd.jupyter.widget-view+json": {
1803
+ "model_id": "952a0e024ec347b7ace0f2e33ec63fab",
1804
+ "version_major": 2,
1805
+ "version_minor": 0
1806
+ },
1807
+ "text/plain": [
1808
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1809
+ ]
1810
+ },
1811
+ "metadata": {},
1812
+ "output_type": "display_data"
1813
+ },
1814
+ {
1815
+ "data": {
1816
+ "application/vnd.jupyter.widget-view+json": {
1817
+ "model_id": "dcefd6ab863d484cb4edbf99dbf9bfce",
1818
+ "version_major": 2,
1819
+ "version_minor": 0
1820
+ },
1821
+ "text/plain": [
1822
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1823
+ ]
1824
+ },
1825
+ "metadata": {},
1826
+ "output_type": "display_data"
1827
+ },
1828
+ {
1829
+ "data": {
1830
+ "application/vnd.jupyter.widget-view+json": {
1831
+ "model_id": "b33e2e9264a6485aa8f7558ceb1b72e3",
1832
+ "version_major": 2,
1833
+ "version_minor": 0
1834
+ },
1835
+ "text/plain": [
1836
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1837
+ ]
1838
+ },
1839
+ "metadata": {},
1840
+ "output_type": "display_data"
1841
+ },
1842
+ {
1843
+ "data": {
1844
+ "application/vnd.jupyter.widget-view+json": {
1845
+ "model_id": "da264b88b6304434af2e12621422ef53",
1846
+ "version_major": 2,
1847
+ "version_minor": 0
1848
+ },
1849
+ "text/plain": [
1850
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1851
+ ]
1852
+ },
1853
+ "metadata": {},
1854
+ "output_type": "display_data"
1855
+ },
1856
+ {
1857
+ "data": {
1858
+ "application/vnd.jupyter.widget-view+json": {
1859
+ "model_id": "549650cf75964ccfbe521e28eca314a9",
1860
+ "version_major": 2,
1861
+ "version_minor": 0
1862
+ },
1863
+ "text/plain": [
1864
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1865
+ ]
1866
+ },
1867
+ "metadata": {},
1868
+ "output_type": "display_data"
1869
+ },
1870
+ {
1871
+ "data": {
1872
+ "application/vnd.jupyter.widget-view+json": {
1873
+ "model_id": "06a80f5a77f645e783a6601570b9bd38",
1874
+ "version_major": 2,
1875
+ "version_minor": 0
1876
+ },
1877
+ "text/plain": [
1878
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1879
+ ]
1880
+ },
1881
+ "metadata": {},
1882
+ "output_type": "display_data"
1883
+ },
1884
+ {
1885
+ "data": {
1886
+ "application/vnd.jupyter.widget-view+json": {
1887
+ "model_id": "db35a10c1302487ab1122c3a6a0d37c9",
1888
+ "version_major": 2,
1889
+ "version_minor": 0
1890
+ },
1891
+ "text/plain": [
1892
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1893
+ ]
1894
+ },
1895
+ "metadata": {},
1896
+ "output_type": "display_data"
1897
+ },
1898
+ {
1899
+ "data": {
1900
+ "application/vnd.jupyter.widget-view+json": {
1901
+ "model_id": "a10c675d23a245138b950e0203c37f05",
1902
+ "version_major": 2,
1903
+ "version_minor": 0
1904
+ },
1905
+ "text/plain": [
1906
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1907
+ ]
1908
+ },
1909
+ "metadata": {},
1910
+ "output_type": "display_data"
1911
+ },
1912
+ {
1913
+ "data": {
1914
+ "application/vnd.jupyter.widget-view+json": {
1915
+ "model_id": "d3c9324e3f984f17acec7f24d552ec10",
1916
+ "version_major": 2,
1917
+ "version_minor": 0
1918
+ },
1919
+ "text/plain": [
1920
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1921
+ ]
1922
+ },
1923
+ "metadata": {},
1924
+ "output_type": "display_data"
1925
+ },
1926
+ {
1927
+ "data": {
1928
+ "application/vnd.jupyter.widget-view+json": {
1929
+ "model_id": "c711d80cc018412a9437dacda5e046c4",
1930
+ "version_major": 2,
1931
+ "version_minor": 0
1932
+ },
1933
+ "text/plain": [
1934
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1935
+ ]
1936
+ },
1937
+ "metadata": {},
1938
+ "output_type": "display_data"
1939
+ },
1940
+ {
1941
+ "data": {
1942
+ "application/vnd.jupyter.widget-view+json": {
1943
+ "model_id": "382264332377473682355df0537c205f",
1944
+ "version_major": 2,
1945
+ "version_minor": 0
1946
+ },
1947
+ "text/plain": [
1948
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1949
+ ]
1950
+ },
1951
+ "metadata": {},
1952
+ "output_type": "display_data"
1953
+ },
1954
+ {
1955
+ "data": {
1956
+ "application/vnd.jupyter.widget-view+json": {
1957
+ "model_id": "6c68bb5e2f914bc181f621974e099338",
1958
+ "version_major": 2,
1959
+ "version_minor": 0
1960
+ },
1961
+ "text/plain": [
1962
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1963
+ ]
1964
+ },
1965
+ "metadata": {},
1966
+ "output_type": "display_data"
1967
+ },
1968
+ {
1969
+ "data": {
1970
+ "application/vnd.jupyter.widget-view+json": {
1971
+ "model_id": "121a21c56f9f48849601032b46927682",
1972
+ "version_major": 2,
1973
+ "version_minor": 0
1974
+ },
1975
+ "text/plain": [
1976
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1977
+ ]
1978
+ },
1979
+ "metadata": {},
1980
+ "output_type": "display_data"
1981
+ },
1982
+ {
1983
+ "data": {
1984
+ "application/vnd.jupyter.widget-view+json": {
1985
+ "model_id": "c942dadc11284abb8caa847d765222d5",
1986
+ "version_major": 2,
1987
+ "version_minor": 0
1988
+ },
1989
+ "text/plain": [
1990
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
1991
+ ]
1992
+ },
1993
+ "metadata": {},
1994
+ "output_type": "display_data"
1995
+ },
1996
+ {
1997
+ "data": {
1998
+ "application/vnd.jupyter.widget-view+json": {
1999
+ "model_id": "ac1b3107c77b4e6c9515dd8925d388ce",
2000
+ "version_major": 2,
2001
+ "version_minor": 0
2002
+ },
2003
+ "text/plain": [
2004
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
2005
+ ]
2006
+ },
2007
+ "metadata": {},
2008
+ "output_type": "display_data"
2009
+ },
2010
+ {
2011
+ "data": {
2012
+ "application/vnd.jupyter.widget-view+json": {
2013
+ "model_id": "533faf5886374abc9f127b15ae388739",
2014
+ "version_major": 2,
2015
+ "version_minor": 0
2016
+ },
2017
+ "text/plain": [
2018
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
2019
+ ]
2020
+ },
2021
+ "metadata": {},
2022
+ "output_type": "display_data"
2023
+ },
2024
+ {
2025
+ "data": {
2026
+ "application/vnd.jupyter.widget-view+json": {
2027
+ "model_id": "1bc595ace2a14c02909f3f0f8b09148a",
2028
+ "version_major": 2,
2029
+ "version_minor": 0
2030
+ },
2031
+ "text/plain": [
2032
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
2033
+ ]
2034
+ },
2035
+ "metadata": {},
2036
+ "output_type": "display_data"
2037
+ },
2038
+ {
2039
+ "data": {
2040
+ "application/vnd.jupyter.widget-view+json": {
2041
+ "model_id": "4e61ceaef6384da5a0b761bd5ee69165",
2042
+ "version_major": 2,
2043
+ "version_minor": 0
2044
+ },
2045
+ "text/plain": [
2046
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
2047
+ ]
2048
+ },
2049
+ "metadata": {},
2050
+ "output_type": "display_data"
2051
+ },
2052
+ {
2053
+ "data": {
2054
+ "application/vnd.jupyter.widget-view+json": {
2055
+ "model_id": "3e27cc1a6a514cedb1b295198ee2c3af",
2056
+ "version_major": 2,
2057
+ "version_minor": 0
2058
+ },
2059
+ "text/plain": [
2060
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
2061
+ ]
2062
+ },
2063
+ "metadata": {},
2064
+ "output_type": "display_data"
2065
+ },
2066
+ {
2067
+ "data": {
2068
+ "application/vnd.jupyter.widget-view+json": {
2069
+ "model_id": "433c7f69da8c4bc199455170aa52abf8",
2070
+ "version_major": 2,
2071
+ "version_minor": 0
2072
+ },
2073
+ "text/plain": [
2074
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
2075
+ ]
2076
+ },
2077
+ "metadata": {},
2078
+ "output_type": "display_data"
2079
+ },
2080
+ {
2081
+ "data": {
2082
+ "application/vnd.jupyter.widget-view+json": {
2083
+ "model_id": "314054bcb00d4cd7bc4c4ca86409d751",
2084
+ "version_major": 2,
2085
+ "version_minor": 0
2086
+ },
2087
+ "text/plain": [
2088
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
2089
+ ]
2090
+ },
2091
+ "metadata": {},
2092
+ "output_type": "display_data"
2093
+ },
2094
+ {
2095
+ "data": {
2096
+ "application/vnd.jupyter.widget-view+json": {
2097
+ "model_id": "5838180e1b4849e997ef24b8ca304a6c",
2098
+ "version_major": 2,
2099
+ "version_minor": 0
2100
+ },
2101
+ "text/plain": [
2102
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
2103
+ ]
2104
+ },
2105
+ "metadata": {},
2106
+ "output_type": "display_data"
2107
+ },
2108
+ {
2109
+ "data": {
2110
+ "application/vnd.jupyter.widget-view+json": {
2111
+ "model_id": "ba66fd00bbe442bd99b47a4eaba434b4",
2112
+ "version_major": 2,
2113
+ "version_minor": 0
2114
+ },
2115
+ "text/plain": [
2116
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
2117
+ ]
2118
+ },
2119
+ "metadata": {},
2120
+ "output_type": "display_data"
2121
+ },
2122
+ {
2123
+ "data": {
2124
+ "text/html": [
2125
+ "<div>\n",
2126
+ "<style scoped>\n",
2127
+ " .dataframe tbody tr th:only-of-type {\n",
2128
+ " vertical-align: middle;\n",
2129
+ " }\n",
2130
+ "\n",
2131
+ " .dataframe tbody tr th {\n",
2132
+ " vertical-align: top;\n",
2133
+ " }\n",
2134
+ "\n",
2135
+ " .dataframe thead th {\n",
2136
+ " text-align: right;\n",
2137
+ " }\n",
2138
+ "</style>\n",
2139
+ "<table border=\"1\" class=\"dataframe\">\n",
2140
+ " <thead>\n",
2141
+ " <tr style=\"text-align: right;\">\n",
2142
+ " <th></th>\n",
2143
+ " <th>project_title</th>\n",
2144
+ " <th>semantic_score</th>\n",
2145
+ " <th>feature_score</th>\n",
2146
+ " <th>coverage</th>\n",
2147
+ " <th>hybrid_score</th>\n",
2148
+ " <th>duplicate_risk</th>\n",
2149
+ " </tr>\n",
2150
+ " </thead>\n",
2151
+ " <tbody>\n",
2152
+ " <tr>\n",
2153
+ " <th>0</th>\n",
2154
+ " <td>Detecting Diseases Using Chatbot and Booking C...</td>\n",
2155
+ " <td>0.7480</td>\n",
2156
+ " <td>0.0</td>\n",
2157
+ " <td>0.0</td>\n",
2158
+ " <td>0.05</td>\n",
2159
+ " <td>Very Low</td>\n",
2160
+ " </tr>\n",
2161
+ " <tr>\n",
2162
+ " <th>1</th>\n",
2163
+ " <td>Clinical Information System</td>\n",
2164
+ " <td>0.6479</td>\n",
2165
+ " <td>0.0</td>\n",
2166
+ " <td>0.0</td>\n",
2167
+ " <td>0.05</td>\n",
2168
+ " <td>Very Low</td>\n",
2169
+ " </tr>\n",
2170
+ " <tr>\n",
2171
+ " <th>2</th>\n",
2172
+ " <td>Doctor 4 U</td>\n",
2173
+ " <td>0.6437</td>\n",
2174
+ " <td>0.0</td>\n",
2175
+ " <td>0.0</td>\n",
2176
+ " <td>0.05</td>\n",
2177
+ " <td>Very Low</td>\n",
2178
+ " </tr>\n",
2179
+ " <tr>\n",
2180
+ " <th>3</th>\n",
2181
+ " <td>Health Care Management System</td>\n",
2182
+ " <td>0.6402</td>\n",
2183
+ " <td>0.0</td>\n",
2184
+ " <td>0.0</td>\n",
2185
+ " <td>0.05</td>\n",
2186
+ " <td>Very Low</td>\n",
2187
+ " </tr>\n",
2188
+ " <tr>\n",
2189
+ " <th>4</th>\n",
2190
+ " <td>Hospital Management System</td>\n",
2191
+ " <td>0.6397</td>\n",
2192
+ " <td>0.0</td>\n",
2193
+ " <td>0.0</td>\n",
2194
+ " <td>0.05</td>\n",
2195
+ " <td>Very Low</td>\n",
2196
+ " </tr>\n",
2197
+ " </tbody>\n",
2198
+ "</table>\n",
2199
+ "</div>"
2200
+ ],
2201
+ "text/plain": [
2202
+ " project_title semantic_score \\\n",
2203
+ "0 Detecting Diseases Using Chatbot and Booking C... 0.7480 \n",
2204
+ "1 Clinical Information System 0.6479 \n",
2205
+ "2 Doctor 4 U 0.6437 \n",
2206
+ "3 Health Care Management System 0.6402 \n",
2207
+ "4 Hospital Management System 0.6397 \n",
2208
+ "\n",
2209
+ " feature_score coverage hybrid_score duplicate_risk \n",
2210
+ "0 0.0 0.0 0.05 Very Low \n",
2211
+ "1 0.0 0.0 0.05 Very Low \n",
2212
+ "2 0.0 0.0 0.05 Very Low \n",
2213
+ "3 0.0 0.0 0.05 Very Low \n",
2214
+ "4 0.0 0.0 0.05 Very Low "
2215
+ ]
2216
+ },
2217
+ "execution_count": 17,
2218
+ "metadata": {},
2219
+ "output_type": "execute_result"
2220
+ }
2221
+ ],
2222
+ "source": [
2223
+ "results = find_similar_projects(\n",
2224
+ " title=\"AI Clinic Management System\",\n",
2225
+ " description=\"\"\"\n",
2226
+ " Smart clinic management platform with\n",
2227
+ " appointment booking,\n",
2228
+ " patient records,\n",
2229
+ " doctor dashboard,\n",
2230
+ " AI chatbot.\n",
2231
+ " \"\"\",\n",
2232
+ " top_k=5\n",
2233
+ ")\n",
2234
+ "\n",
2235
+ "results[[\n",
2236
+ " \"project_title\",\n",
2237
+ " \"semantic_score\",\n",
2238
+ " \"feature_score\",\n",
2239
+ " \"coverage\",\n",
2240
+ " \"hybrid_score\",\n",
2241
+ " \"duplicate_risk\"\n",
2242
+ "]]"
2243
+ ]
2244
+ },
2245
+ {
2246
+ "cell_type": "code",
2247
+ "execution_count": 18,
2248
+ "id": "5ab1315b",
2249
+ "metadata": {},
2250
+ "outputs": [
2251
+ {
2252
+ "data": {
2253
+ "application/vnd.jupyter.widget-view+json": {
2254
+ "model_id": "e3c94f184d4f485c871ada26ed9f5abc",
2255
+ "version_major": 2,
2256
+ "version_minor": 0
2257
+ },
2258
+ "text/plain": [
2259
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
2260
+ ]
2261
+ },
2262
+ "metadata": {},
2263
+ "output_type": "display_data"
2264
+ },
2265
+ {
2266
+ "data": {
2267
+ "application/vnd.jupyter.widget-view+json": {
2268
+ "model_id": "13e2f339bdd544949ec9a26f472a95ef",
2269
+ "version_major": 2,
2270
+ "version_minor": 0
2271
+ },
2272
+ "text/plain": [
2273
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
2274
+ ]
2275
+ },
2276
+ "metadata": {},
2277
+ "output_type": "display_data"
2278
+ },
2279
+ {
2280
+ "name": "stdout",
2281
+ "output_type": "stream",
2282
+ "text": [
2283
+ "{'score': 0.8726, 'coverage': 0.8, 'shared_count': 4, 'matches': [{'feature_a': 'appointment booking', 'feature_b': 'booking doctor appointments', 'score': 0.821}, {'feature_a': 'patient records', 'feature_b': 'medical records', 'score': 0.895}, {'feature_a': 'doctor dashboard', 'feature_b': 'doctor dashboard', 'score': 1.0}, {'feature_a': 'ai chatbot', 'feature_b': 'intelligent chatbot', 'score': 0.899}], 'unique_a': ['clinic management'], 'unique_b': ['hospital management']}\n"
2284
+ ]
2285
+ }
2286
+ ],
2287
+ "source": [
2288
+ "project_a = [\n",
2289
+ " \"appointment booking\",\n",
2290
+ " \"patient records\",\n",
2291
+ " \"doctor dashboard\",\n",
2292
+ " \"ai chatbot\",\n",
2293
+ " \"clinic management\"\n",
2294
+ "]\n",
2295
+ "\n",
2296
+ "project_b = [\n",
2297
+ " \"booking doctor appointments\",\n",
2298
+ " \"medical records\",\n",
2299
+ " \"doctor dashboard\",\n",
2300
+ " \"intelligent chatbot\",\n",
2301
+ " \"hospital management\"\n",
2302
+ "]\n",
2303
+ "\n",
2304
+ "result = compute_feature_similarity(\n",
2305
+ " project_a,\n",
2306
+ " project_b\n",
2307
+ ")\n",
2308
+ "\n",
2309
+ "print(result)"
2310
+ ]
2311
+ },
2312
+ {
2313
+ "cell_type": "code",
2314
+ "execution_count": 19,
2315
+ "id": "9f571cb2",
2316
+ "metadata": {},
2317
+ "outputs": [
2318
+ {
2319
+ "name": "stdout",
2320
+ "output_type": "stream",
2321
+ "text": [
2322
+ "82.25\n"
2323
+ ]
2324
+ }
2325
+ ],
2326
+ "source": [
2327
+ "from src.similarity_model import compute_originality\n",
2328
+ "\n",
2329
+ "print(\n",
2330
+ " compute_originality(\n",
2331
+ " hybrid_score=0.30,\n",
2332
+ " unique_query_features=7,\n",
2333
+ " total_query_features=8\n",
2334
+ " )\n",
2335
+ ")"
2336
+ ]
2337
+ },
2338
+ {
2339
+ "cell_type": "code",
2340
+ "execution_count": 20,
2341
+ "id": "53eeed12",
2342
+ "metadata": {},
2343
+ "outputs": [
2344
+ {
2345
+ "name": "stderr",
2346
+ "output_type": "stream",
2347
+ "text": [
2348
+ "2026-06-04 00:30:41,636 | INFO | Loading processed dataset from Azure SQL...\n",
2349
+ "2026-06-04 00:30:46,601 | INFO | Loading embedding model: all-MiniLM-L6-v2\n",
2350
+ "2026-06-04 00:30:46,602 | INFO | Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n",
2351
+ "e:\\gradution project\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:949: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
2352
+ " warnings.warn(\n",
2353
+ "2026-06-04 00:30:49,233 | INFO | Use pytorch device_name: cpu\n",
2354
+ "2026-06-04 00:30:49,243 | INFO | Generating embeddings for 255 projects...\n"
2355
+ ]
2356
+ },
2357
+ {
2358
+ "data": {
2359
+ "application/vnd.jupyter.widget-view+json": {
2360
+ "model_id": "f40167a736a840a6bd04e2b85b18c92d",
2361
+ "version_major": 2,
2362
+ "version_minor": 0
2363
+ },
2364
+ "text/plain": [
2365
+ "Batches: 0%| | 0/4 [00:00<?, ?it/s]"
2366
+ ]
2367
+ },
2368
+ "metadata": {},
2369
+ "output_type": "display_data"
2370
+ },
2371
+ {
2372
+ "name": "stderr",
2373
+ "output_type": "stream",
2374
+ "text": [
2375
+ "2026-06-04 00:31:05,278 | INFO | FAISS index built successfully with 255 vectors.\n",
2376
+ "2026-06-04 00:31:05,299 | INFO | Artifacts saved to models\n",
2377
+ "2026-06-04 00:31:05,301 | INFO | Embedding engine completed successfully.\n"
2378
+ ]
2379
+ },
2380
+ {
2381
+ "name": "stdout",
2382
+ "output_type": "stream",
2383
+ "text": [
2384
+ "Training Completed\n"
2385
+ ]
2386
+ }
2387
+ ],
2388
+ "source": [
2389
+ "from src.similarity_model.embedding_engine import (\n",
2390
+ " train_embedding_engine\n",
2391
+ ")\n",
2392
+ "\n",
2393
+ "engine = train_embedding_engine()\n",
2394
+ "\n",
2395
+ "print(\"Training Completed\")"
2396
+ ]
2397
+ },
2398
+ {
2399
+ "cell_type": "code",
2400
+ "execution_count": 21,
2401
+ "id": "94ebeacc",
2402
+ "metadata": {},
2403
+ "outputs": [
2404
+ {
2405
+ "name": "stderr",
2406
+ "output_type": "stream",
2407
+ "text": [
2408
+ "2026-06-04 00:31:05,325 | INFO | Loading embedding model: all-MiniLM-L6-v2\n",
2409
+ "2026-06-04 00:31:05,327 | INFO | Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n",
2410
+ "e:\\gradution project\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:949: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
2411
+ " warnings.warn(\n",
2412
+ "2026-06-04 00:31:07,549 | INFO | Use pytorch device_name: cpu\n",
2413
+ "2026-06-04 00:31:07,583 | INFO | Artifacts loaded successfully.\n"
2414
+ ]
2415
+ },
2416
+ {
2417
+ "data": {
2418
+ "application/vnd.jupyter.widget-view+json": {
2419
+ "model_id": "4c7332342b3d4027b4960c9256eea984",
2420
+ "version_major": 2,
2421
+ "version_minor": 0
2422
+ },
2423
+ "text/plain": [
2424
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
2425
+ ]
2426
+ },
2427
+ "metadata": {},
2428
+ "output_type": "display_data"
2429
+ },
2430
+ {
2431
+ "name": "stdout",
2432
+ "output_type": "stream",
2433
+ "text": [
2434
+ " project_id title technologies \\\n",
2435
+ "0 105 Hospital Management System \n",
2436
+ "1 47 Clinical Information System \n",
2437
+ "2 110 Health Care Management System \n",
2438
+ "3 62 Doctor 4 U \n",
2439
+ "4 112 health services & medical outcomes monitoring \n",
2440
+ "\n",
2441
+ " similarity_score \n",
2442
+ "0 0.8216 \n",
2443
+ "1 0.6907 \n",
2444
+ "2 0.6779 \n",
2445
+ "3 0.5829 \n",
2446
+ "4 0.5801 \n"
2447
+ ]
2448
+ }
2449
+ ],
2450
+ "source": [
2451
+ "from src.similarity_model.embedding_engine import ProjectEmbedder\n",
2452
+ "\n",
2453
+ "engine = ProjectEmbedder()\n",
2454
+ "engine.load_artifacts()\n",
2455
+ "\n",
2456
+ "results = engine.search(\n",
2457
+ " \"hospital management system with appointment booking and patient records\",\n",
2458
+ " k=5\n",
2459
+ ")\n",
2460
+ "\n",
2461
+ "print(results)"
2462
+ ]
2463
+ },
2464
+ {
2465
+ "cell_type": "code",
2466
+ "execution_count": 22,
2467
+ "id": "8e5b3729",
2468
+ "metadata": {},
2469
+ "outputs": [
2470
+ {
2471
+ "data": {
2472
+ "application/vnd.jupyter.widget-view+json": {
2473
+ "model_id": "e73c8cda22e6469cb5aa1b9620abe390",
2474
+ "version_major": 2,
2475
+ "version_minor": 0
2476
+ },
2477
+ "text/plain": [
2478
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
2479
+ ]
2480
+ },
2481
+ "metadata": {},
2482
+ "output_type": "display_data"
2483
+ },
2484
+ {
2485
+ "data": {
2486
+ "application/vnd.jupyter.widget-view+json": {
2487
+ "model_id": "26ab4a1c4e28402487c2bd7ce8558359",
2488
+ "version_major": 2,
2489
+ "version_minor": 0
2490
+ },
2491
+ "text/plain": [
2492
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
2493
+ ]
2494
+ },
2495
+ "metadata": {},
2496
+ "output_type": "display_data"
2497
+ },
2498
+ {
2499
+ "name": "stdout",
2500
+ "output_type": "stream",
2501
+ "text": [
2502
+ "{'score': 0.8866, 'coverage': 1.0, 'shared_count': 1, 'matches': [{'feature_a': 'machine learning system', 'feature_b': 'machine learning platform', 'score': 0.838}], 'unique_a': [], 'unique_b': ['ml analytics']}\n"
2503
+ ]
2504
+ }
2505
+ ],
2506
+ "source": [
2507
+ "result = compute_feature_similarity(\n",
2508
+ " [\n",
2509
+ " \"machine learning system\",\n",
2510
+ " \"machine learning prediction\",\n",
2511
+ " \"machine learning analysis\"\n",
2512
+ " ],\n",
2513
+ " [\n",
2514
+ " \"machine learning platform\",\n",
2515
+ " \"predictive machine learning\",\n",
2516
+ " \"ml analytics\"\n",
2517
+ " ]\n",
2518
+ ")\n",
2519
+ "\n",
2520
+ "print(result)"
2521
+ ]
2522
+ },
2523
+ {
2524
+ "cell_type": "code",
2525
+ "execution_count": 23,
2526
+ "id": "3f0b789e",
2527
+ "metadata": {},
2528
+ "outputs": [
2529
+ {
2530
+ "name": "stdout",
2531
+ "output_type": "stream",
2532
+ "text": [
2533
+ "0.05\n"
2534
+ ]
2535
+ }
2536
+ ],
2537
+ "source": [
2538
+ "from src.similarity_model.hybrid_ranker import (\n",
2539
+ " compute_hybrid_score\n",
2540
+ ")\n",
2541
+ "\n",
2542
+ "print(\n",
2543
+ " compute_hybrid_score(\n",
2544
+ " semantic_score=0.95,\n",
2545
+ " feature_score=0.0,\n",
2546
+ " coverage=0.0,\n",
2547
+ " feature_count=5,\n",
2548
+ " unique_query_count=5\n",
2549
+ " )\n",
2550
+ ")"
2551
+ ]
2552
+ },
2553
+ {
2554
+ "cell_type": "code",
2555
+ "execution_count": 24,
2556
+ "id": "5c2e1ed5",
2557
+ "metadata": {},
2558
+ "outputs": [
2559
+ {
2560
+ "name": "stdout",
2561
+ "output_type": "stream",
2562
+ "text": [
2563
+ "\n",
2564
+ "id:\n",
2565
+ "207\n",
2566
+ "\n",
2567
+ "submitted_at:\n",
2568
+ "NaT\n",
2569
+ "\n",
2570
+ "project_title:\n",
2571
+ "Smart Library\n",
2572
+ "\n",
2573
+ "student_names:\n",
2574
+ "Abdel Hamid Abdel Nasser, Mahmoud Tamer Mahmoud, Amer Saed Mohamed Ali Amer, Tahany Adel Faragallah, Hala Ahmed Saad Salem, Mohamed Khaled Mohamed\n",
2575
+ "\n",
2576
+ "year:\n",
2577
+ "2022\n",
2578
+ "\n",
2579
+ "abstract:\n",
2580
+ "Egypt is striving and our efforts are focused these days towards digital transformation and the nationalization of all its government facilities, including the higher education sector. With more than 4 million university students and up to 644, 000 graduates annually, we need smart digital systems that support the educational process and scientific research. Therefore, we have developed a smart library application that takes care of books, recommendations, and user opinions, and provides the appropriate electronic environment for university students to find and nominate appropriate books through an electronic application based on artificial intelligence. Where, using artificial intelligence algorithms, the application will analyze book data and student data together to choose the most appropriate scientific content, in addition to the chatbot is designed to intelligently simulate human conversations. Finally, the smart library provides books to students faster and easier, and encourages them to read and benefit from their information, and the presence of suggestions for similar books will make them not stop reading and expand their horizons, and also the presence of a chatbot will increase the ease of access to books.\n",
2581
+ "\n",
2582
+ "description:\n",
2583
+ "The Smart Library project is a digital platform designed to modernize university library systems in Egypt. It integrates AI-driven book recommendations, an interactive chatbot for user assistance, social groups for collaborative reading, and a QR-code-based borrowing system to streamline library operations and improve student access to academic resources.\n",
2584
+ "\n",
2585
+ "problem_statement:\n",
2586
+ "University libraries in Egypt face delays in digital transformation, relying on traditional, non-interactive systems. This leads to inefficient resource usage, difficulty for students in finding relevant academic materials, and a lack of engagement, ultimately hindering the educational process.\n",
2587
+ "\n",
2588
+ "proposed_solution:\n",
2589
+ "The project proposes an AI-powered smart library application that features a machine learning recommendation engine, an intelligent chatbot for conversational support, social networking features for students, and a QR-code system for automated book borrowing and management.\n",
2590
+ "\n",
2591
+ "objectives:\n",
2592
+ "1. Provide accurate and reliable functionality.\n",
2593
+ "2. Provide interactive educational tools and resources.\n",
2594
+ "3. Improve decision-making using artificial intelligence techniques.\n",
2595
+ "4. Implement intelligent AI-based functionalities.\n",
2596
+ "5. Improve system performance and reliability.\n",
2597
+ "6. Improve learning experience and educational accessibility.\n",
2598
+ "\n",
2599
+ "full_content:\n",
2600
+ "Smart Library. Smart Library. Egypt is striving and our efforts are focused these days towards digital transformation and the nationalization of all its government facilities, including the higher education sector. With more than 4 million university students and up to 644, 000 graduates annually, we need smart digital systems that support the educational process and scientific research. Therefore, we have developed a smart library application that takes care of books, recommendations, and user opinions, and provides the appropriate electronic environment for university students to find and nominate appropriate books through an electronic application based on artificial intelligence. Where, using artificial intelligence algorithms, the application will analyze book data and student data together to choose the most appropriate scientific content, in addition to the chatbot is designed to intelligently simulate human conversations. Finally, the smart library provides books to students faster and easier, and encourages them to read and benefit from their information, and the presence of suggestions for similar books will make them not stop reading and expand their horizons, and also the presence of a chatbot will increase the ease of access to books.. The Smart Library project is a digital platform designed to modernize university library systems in Egypt. It integrates AI-driven book recommendations, an interactive chatbot for user assistance, social groups for collaborative reading, and a QR-code-based borrowing system to streamline library operations and improve student access to academic resources.\n",
2601
+ "\n",
2602
+ "clean_text:\n",
2603
+ "smart library. smart library. egypt is striving and our efforts are focused these days towards digital transformation and the nationalization of all its government facilities including the higher education sector. with more than 4 million university students and up to 644 000 graduates annually we need smart digital systems that support the educational process and scientific research. therefore we have developed a smart library application that takes care of books recommendations and user opinions and provides the appropriate electronic environment for university students to find and nominate appropriate books through an electronic application based on artificial intelligence. where using artificial intelligence algorithms the application will analyze book data and student data together to choose the most appropriate scientific content in addition to the chatbot is designed to intelligently simulate human conversations. finally the smart library provides books to students faster and easier and encourages them to read and benefit from their information and the presence of suggestions for similar books will make them not stop reading and expand their horizons and also the presence of a chatbot will increase the ease of access to books.. the smart library project is a digital platform designed to modernize university library systems in egypt. it integrates ai-driven book recommendations an interactive chatbot for user assistance social groups for collaborative reading and a qr-code-based borrowing system to streamline library operations and improve student access to academic resources.\n",
2604
+ "\n",
2605
+ "word_count:\n",
2606
+ "233\n",
2607
+ "\n",
2608
+ "features:\n",
2609
+ "\"\\\"[\\\\\\\"Artificial intelligence algorithms\\\\\\\", \\\\\\\"AI-driven book recommendations\\\\\\\", \\\\\\\"Interactive chatbot\\\\\\\", \\\\\\\"Social groups for collaborative reading\\\\\\\", \\\\\\\"QR-code-based borrowing system\\\\\\\"]\\\"\"\n"
2610
+ ]
2611
+ }
2612
+ ],
2613
+ "source": [
2614
+ "row = clean_df[\n",
2615
+ " clean_df[\"project_title\"] == \"Smart Library\"\n",
2616
+ "].iloc[0]\n",
2617
+ "\n",
2618
+ "for column in clean_df.columns:\n",
2619
+ " print(f\"\\n{column}:\")\n",
2620
+ " print(row[column])"
2621
+ ]
2622
+ },
2623
+ {
2624
+ "cell_type": "code",
2625
+ "execution_count": null,
2626
+ "id": "7f64358c",
2627
+ "metadata": {},
2628
+ "outputs": [],
2629
+ "source": []
2630
+ }
2631
+ ],
2632
+ "metadata": {
2633
+ "kernelspec": {
2634
+ "display_name": ".venv",
2635
+ "language": "python",
2636
+ "name": "python3"
2637
+ },
2638
+ "language_info": {
2639
+ "codemirror_mode": {
2640
+ "name": "ipython",
2641
+ "version": 3
2642
+ },
2643
+ "file_extension": ".py",
2644
+ "mimetype": "text/x-python",
2645
+ "name": "python",
2646
+ "nbconvert_exporter": "python",
2647
+ "pygments_lexer": "ipython3",
2648
+ "version": "3.11.9"
2649
+ }
2650
+ },
2651
+ "nbformat": 4,
2652
+ "nbformat_minor": 5
2653
+ }
README.md CHANGED
@@ -1,11 +1,379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
- title: Graduation Project-v1.2
3
- emoji: 📊
4
- colorFrom: purple
5
- colorTo: green
6
- sdk: docker
7
- pinned: false
8
- app_port: 7860
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # 🤖 AI-Powered Graduation Project Recommendation System
3
+
4
+ ## 📌 Overview
5
+
6
+ This project implements an intelligent AI-powered recommendation and semantic similarity platform for graduation projects using:
7
+
8
+ * Natural Language Processing (NLP)
9
+ * Semantic Search
10
+ * Vector Embeddings
11
+ * Hybrid Ranking Systems
12
+ * Large Language Models (LLMs)
13
+
14
+ The system helps students:
15
+
16
+ * discover unique graduation project ideas
17
+ * avoid duplicate projects
18
+ * analyze originality
19
+ * generate intelligent project features
20
+ * receive context-aware recommendations through an AI chatbot
21
+
22
+ ---
23
+
24
+ # ⚙️ System Pipeline
25
+
26
+ ## 1️⃣ Data Preprocessing
27
+
28
+ * Text normalization
29
+ * Duplicate removal
30
+ * Smart content merging
31
+ * Technical keyword extraction
32
+ * Feature engineering
33
+
34
+ ## 2️⃣ Feature Extraction
35
+
36
+ * KeyBERT-based keyword extraction
37
+ * Automatic technical term detection
38
+ * Semantic feature generation
39
+
40
+ ## 3️⃣ Embedding Generation
41
+
42
+ * SentenceTransformer embeddings
43
+ * Normalized vector representations
44
+ * Semantic encoding of projects
45
+
46
+ ## 4️⃣ Semantic Retrieval
47
+
48
+ * FAISS vector indexing
49
+ * Nearest-neighbor semantic search
50
+ * Fast project similarity lookup
51
+
52
+ ## 5️⃣ Hybrid Ranking
53
+
54
+ The final ranking combines:
55
+
56
+ * Semantic similarity
57
+ * Feature similarity
58
+ * Coverage ratio
59
+ * Confidence estimation
60
+ * Originality analysis
61
+
62
+ ## 6️⃣ AI Recommendation Engine
63
+
64
+ * Context-aware project generation
65
+ * Feature recommendation
66
+ * Novelty checking
67
+ * Conversational chatbot assistance
68
+
69
+ ---
70
+
71
+ # 🧠 AI & NLP Technologies Used
72
+
73
+ ## 🔹 Machine Learning & NLP
74
+
75
+ * SentenceTransformers
76
+ * KeyBERT
77
+ * Scikit-learn
78
+ * SciPy
79
+ * FAISS
80
+
81
+ ## 🔹 LLM Integration
82
+
83
+ * Google Gemini API
84
+ * Ollama
85
+ * Mistral
86
+
87
+ ## 🔹 Backend & Infrastructure
88
+
89
+ * FastAPI
90
+ * Pandas
91
+ * NumPy
92
+ * Python
93
+
94
  ---
95
+
96
+ # 🏗️ Project Architecture
97
+
98
+ ```text
99
+ User Query
100
+
101
+ Intent Classification
102
+
103
+ Context Builder
104
+
105
+ Feature Extraction
106
+
107
+ Embedding Generation
108
+
109
+ FAISS Semantic Search
110
+
111
+ Hybrid Ranking Engine
112
+
113
+ Originality & Duplicate Analysis
114
+
115
+ AI Recommendation Response
116
+ ```
117
+
118
+ ---
119
+
120
+ # 🔍 Similarity Engine Workflow
121
+
122
+ ```text
123
+ Raw Dataset
124
+
125
+ Preprocessing
126
+
127
+ Feature Extraction
128
+
129
+ Sentence Embeddings
130
+
131
+ FAISS Indexing
132
+
133
+ Semantic Retrieval
134
+
135
+ Feature Similarity Matching
136
+
137
+ Hybrid Re-ranking
138
+
139
+ Final Recommendation
140
+ ```
141
+
142
+ ---
143
+
144
+ # 🚀 Features
145
+
146
+ ## ✅ AI Chatbot
147
+
148
+ * Context-aware conversations
149
+ * Intent classification
150
+ * Domain-specific recommendations
151
+ * Memory-aware responses
152
+
153
+ ## ✅ Semantic Similarity Search
154
+
155
+ * Embedding-based retrieval
156
+ * Semantic duplicate detection
157
+ * Vector search with FAISS
158
+
159
+ ## ✅ Hybrid Recommendation System
160
+
161
+ * Multi-stage ranking pipeline
162
+ * Feature-level semantic comparison
163
+ * Adaptive scoring strategy
164
+
165
+ ## ✅ Originality Detection
166
+
167
+ * Duplicate risk analysis
168
+ * Originality scoring
169
+ * Similarity confidence estimation
170
+
171
+ ## ✅ Intelligent Feature Generation
172
+
173
+ * AI-generated project features
174
+ * Novelty-aware generation
175
+ * Domain-aware recommendations
176
+
177
+ ---
178
+
179
+ # 📊 Evaluation
180
+
181
+ The system includes:
182
+
183
+ * Self-retrieval evaluation
184
+ * Real-query testing
185
+ * Hybrid ranking validation
186
+ * Confidence scoring
187
+
188
+ ### Evaluation Metrics
189
+
190
+ * Semantic Similarity Score
191
+ * Hybrid Score
192
+ * Originality Score
193
+ * Confidence Score
194
+ * Duplicate Risk Classification
195
+
196
+ ---
197
+
198
+ # 📁 Project Structure
199
+
200
+ ```text
201
+ GRADUATION_PROJECT/
202
+
203
+ ├── api/ # FastAPI backend
204
+
205
+ ├── Data/
206
+ │ ├── raw/ # Original dataset
207
+ │ └── processed/ # Cleaned dataset
208
+
209
+ ├── models/ # FAISS index & metadata
210
+
211
+ ├── Notebooks/
212
+ │ └── TEST.ipynb # Training & evaluation notebook
213
+
214
+ ├── src/
215
+ │ ├── recommendation_engine/ # Chatbot & recommendation logic
216
+ │ └── similarity_model/ # Semantic search engine
217
+
218
+ ├── requirements.txt
219
+ ├── README.md
220
+ └── .gitignore
221
+ ```
222
+
223
  ---
224
 
225
+ # 🧩 Recommendation Engine Modules
226
+
227
+ ## recommendation_engine/
228
+
229
+ Contains:
230
+
231
+ * Chatbot engine
232
+ * Intent classification
233
+ * Prompt building
234
+ * Idea generation
235
+ * Feature generation
236
+ * Memory management
237
+ * Novelty checking
238
+ * Response formatting
239
+
240
+ ---
241
+
242
+ # 🔬 Similarity Model Modules
243
+
244
+ ## similarity_model/
245
+
246
+ Contains:
247
+
248
+ * Semantic search
249
+ * Embedding engine
250
+ * Hybrid ranker
251
+ * Feature similarity engine
252
+ * Preprocessing pipeline
253
+ * Evaluation framework
254
+
255
+ ---
256
+
257
+ # ⚡ Installation
258
+
259
+ ## 1️⃣ Clone Repository
260
+
261
+ ```bash
262
+ git clone https://github.com/YOUR_USERNAME/YOUR_REPOSITORY.git
263
+ cd YOUR_REPOSITORY
264
+ ```
265
+
266
+ ---
267
+
268
+ ## 2️⃣ Create Virtual Environment
269
+
270
+ ### Windows
271
+
272
+ ```bash
273
+ python -m venv .venv
274
+ .venv\Scripts\activate
275
+ ```
276
+
277
+ ### Linux / Mac
278
+
279
+ ```bash
280
+ python3 -m venv .venv
281
+ source .venv/bin/activate
282
+ ```
283
+
284
+ ---
285
+
286
+ ## 3️⃣ Install Dependencies
287
+
288
+ ```bash
289
+ pip install -r requirements.txt
290
+ ```
291
+
292
+ ---
293
+
294
+ # 🔑 Environment Variables
295
+
296
+ Create a `.env` file:
297
+
298
+ ```env
299
+ GEMINI_API_KEY=your_api_key_here
300
+ ```
301
+
302
+ ---
303
+
304
+ # ▶️ Running The Project
305
+
306
+ ## Run FastAPI Server
307
+
308
+ ```bash
309
+ uvicorn api.main:app --reload
310
+ ```
311
+
312
+ ---
313
+
314
+ ## Run Notebook
315
+
316
+ ```bash
317
+ jupyter notebook
318
+ ```
319
+
320
+ Open:
321
+
322
+ ```text
323
+ Notebooks/TEST.ipynb
324
+ ```
325
+
326
+ ---
327
+
328
+ # 💡 Example Query
329
+
330
+ ## Input
331
+
332
+ ```text
333
+ AI-based smart library recommendation platform
334
+ ```
335
+
336
+ ## Output
337
+
338
+ * Similar graduation projects
339
+ * Semantic similarity scores
340
+ * Originality analysis
341
+ * Duplicate risk estimation
342
+ * Recommended features
343
+
344
+ ---
345
+
346
+ # 🎯 Future Improvements
347
+
348
+ * Full RAG integration
349
+ * Multi-agent orchestration
350
+ * GPU acceleration
351
+ * Advanced evaluation metrics
352
+ * Real-time deployment
353
+ * Database persistence
354
+ * Frontend dashboard
355
+
356
+ ---
357
+
358
+ # 📚 Research Areas Covered
359
+
360
+ * Natural Language Processing (NLP)
361
+ * Semantic Search
362
+ * Recommendation Systems
363
+ * Vector Databases
364
+ * Conversational AI
365
+ * Information Retrieval
366
+ * Hybrid Ranking Systems
367
+ * Large Language Models (LLMs)
368
+
369
+ ---
370
+
371
+ # 👨‍💻 Author
372
+
373
+ Yossef Assem
374
+
375
+ ---
376
+
377
+ # 📄 License
378
+
379
+ This project is for educational and research purposes.
api/__pycache__/__init__.cpython-311.pyc CHANGED
Binary files a/api/__pycache__/__init__.cpython-311.pyc and b/api/__pycache__/__init__.cpython-311.pyc differ
 
api/__pycache__/main.cpython-311.pyc CHANGED
Binary files a/api/__pycache__/main.cpython-311.pyc and b/api/__pycache__/main.cpython-311.pyc differ
 
api/__pycache__/schemas.cpython-311.pyc CHANGED
Binary files a/api/__pycache__/schemas.cpython-311.pyc and b/api/__pycache__/schemas.cpython-311.pyc differ
 
api/__pycache__/services.cpython-311.pyc CHANGED
Binary files a/api/__pycache__/services.cpython-311.pyc and b/api/__pycache__/services.cpython-311.pyc differ
 
models/faiss_index.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cfe09be00eb70151711b1603dcd8a2b67c102f4218647b2a5bb405a2a1932863
3
- size 392266
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:469af18f0e06e31e389d476e47e643626f8ddfd69b593f299c78932080b5c858
3
+ size 393810
models/metadata.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:40d299cdc20636faf33e0acd77ca5b5322ef3de7e6e539b183a25f4e6bdf96cc
3
- size 794293
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa65da476454398a5d7b124e338e0e4bc2c8015258b73e09fa28cad448f9a420
3
+ size 773665
models/project_embeddings.npy CHANGED
Binary files a/models/project_embeddings.npy and b/models/project_embeddings.npy differ
 
requirements.txt CHANGED
@@ -14,4 +14,4 @@ google-genai
14
  requests
15
  pyarrow
16
  sqlalchemy
17
- pyodbc
 
14
  requests
15
  pyarrow
16
  sqlalchemy
17
+ pyodbc
src/recommendation_engine/__pycache__/__init__.cpython-311.pyc CHANGED
Binary files a/src/recommendation_engine/__pycache__/__init__.cpython-311.pyc and b/src/recommendation_engine/__pycache__/__init__.cpython-311.pyc differ
 
src/recommendation_engine/__pycache__/chatbot_engine.cpython-311.pyc CHANGED
Binary files a/src/recommendation_engine/__pycache__/chatbot_engine.cpython-311.pyc and b/src/recommendation_engine/__pycache__/chatbot_engine.cpython-311.pyc differ
 
src/recommendation_engine/__pycache__/command_handler.cpython-311.pyc CHANGED
Binary files a/src/recommendation_engine/__pycache__/command_handler.cpython-311.pyc and b/src/recommendation_engine/__pycache__/command_handler.cpython-311.pyc differ
 
src/recommendation_engine/__pycache__/config.cpython-311.pyc CHANGED
Binary files a/src/recommendation_engine/__pycache__/config.cpython-311.pyc and b/src/recommendation_engine/__pycache__/config.cpython-311.pyc differ
 
src/recommendation_engine/__pycache__/context_builder.cpython-311.pyc CHANGED
Binary files a/src/recommendation_engine/__pycache__/context_builder.cpython-311.pyc and b/src/recommendation_engine/__pycache__/context_builder.cpython-311.pyc differ
 
src/recommendation_engine/__pycache__/feature_generator.cpython-311.pyc CHANGED
Binary files a/src/recommendation_engine/__pycache__/feature_generator.cpython-311.pyc and b/src/recommendation_engine/__pycache__/feature_generator.cpython-311.pyc differ
 
src/recommendation_engine/__pycache__/full_project_generator.cpython-311.pyc CHANGED
Binary files a/src/recommendation_engine/__pycache__/full_project_generator.cpython-311.pyc and b/src/recommendation_engine/__pycache__/full_project_generator.cpython-311.pyc differ
 
src/recommendation_engine/__pycache__/idea_generator.cpython-311.pyc CHANGED
Binary files a/src/recommendation_engine/__pycache__/idea_generator.cpython-311.pyc and b/src/recommendation_engine/__pycache__/idea_generator.cpython-311.pyc differ
 
src/recommendation_engine/__pycache__/llm_client.cpython-311.pyc CHANGED
Binary files a/src/recommendation_engine/__pycache__/llm_client.cpython-311.pyc and b/src/recommendation_engine/__pycache__/llm_client.cpython-311.pyc differ
 
src/recommendation_engine/__pycache__/llm_router.cpython-311.pyc CHANGED
Binary files a/src/recommendation_engine/__pycache__/llm_router.cpython-311.pyc and b/src/recommendation_engine/__pycache__/llm_router.cpython-311.pyc differ
 
src/recommendation_engine/__pycache__/memory_store.cpython-311.pyc CHANGED
Binary files a/src/recommendation_engine/__pycache__/memory_store.cpython-311.pyc and b/src/recommendation_engine/__pycache__/memory_store.cpython-311.pyc differ
 
src/recommendation_engine/__pycache__/novelty_checker.cpython-311.pyc CHANGED
Binary files a/src/recommendation_engine/__pycache__/novelty_checker.cpython-311.pyc and b/src/recommendation_engine/__pycache__/novelty_checker.cpython-311.pyc differ
 
src/recommendation_engine/__pycache__/prompt_builder.cpython-311.pyc CHANGED
Binary files a/src/recommendation_engine/__pycache__/prompt_builder.cpython-311.pyc and b/src/recommendation_engine/__pycache__/prompt_builder.cpython-311.pyc differ
 
src/recommendation_engine/__pycache__/response_formatter.cpython-311.pyc CHANGED
Binary files a/src/recommendation_engine/__pycache__/response_formatter.cpython-311.pyc and b/src/recommendation_engine/__pycache__/response_formatter.cpython-311.pyc differ
 
src/recommendation_engine/__pycache__/state_manager.cpython-311.pyc CHANGED
Binary files a/src/recommendation_engine/__pycache__/state_manager.cpython-311.pyc and b/src/recommendation_engine/__pycache__/state_manager.cpython-311.pyc differ
 
src/recommendation_engine/__pycache__/test.cpython-311.pyc CHANGED
Binary files a/src/recommendation_engine/__pycache__/test.cpython-311.pyc and b/src/recommendation_engine/__pycache__/test.cpython-311.pyc differ
 
src/recommendation_engine/__pycache__/validator.cpython-311.pyc CHANGED
Binary files a/src/recommendation_engine/__pycache__/validator.cpython-311.pyc and b/src/recommendation_engine/__pycache__/validator.cpython-311.pyc differ
 
src/recommendation_engine/llm_client.py CHANGED
@@ -170,12 +170,14 @@ def is_bad_response(text: str) -> bool:
170
  # =========================================
171
  def generate_text(
172
  prompt: str,
173
- task: str = "chat"
 
174
  ) -> str:
175
 
176
  prompt = safe_prompt(prompt)
177
 
178
- temperature = get_temperature(task)
 
179
  max_tokens = get_max_tokens(task)
180
 
181
  for model_name in MODEL_CANDIDATES:
 
170
  # =========================================
171
  def generate_text(
172
  prompt: str,
173
+ task: str = "chat",
174
+ temperature=None
175
  ) -> str:
176
 
177
  prompt = safe_prompt(prompt)
178
 
179
+ if temperature is None:
180
+ temperature = get_temperature(task)
181
  max_tokens = get_max_tokens(task)
182
 
183
  for model_name in MODEL_CANDIDATES:
src/services/scheduler.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Polling scheduler: periodically mirrors eligible projects into the
# pre-processed table by calling sync_projects().
import time
import traceback

from src.services.sync_projects import sync_projects

# Seconds to wait between two consecutive sync attempts.
POLL_INTERVAL_SECONDS = 60


def run_scheduler(interval_seconds=POLL_INTERVAL_SECONDS):
    """Run ``sync_projects`` forever, pausing between attempts.

    Args:
        interval_seconds: Delay, in seconds, between the end of one sync
            attempt and the start of the next one.

    This function never returns normally; stop it with Ctrl+C / SIGTERM.
    ``KeyboardInterrupt`` is not an ``Exception`` subclass, so it still
    terminates the loop.
    """
    while True:
        try:
            print("Checking for new projects...")
            sync_projects()
            print("Done")
        except Exception as e:
            # Top-level boundary: one failed cycle must not kill the
            # scheduler, but the full traceback is kept so the failure can
            # actually be diagnosed from the logs.
            print("Error:", e)
            traceback.print_exc()

        time.sleep(interval_seconds)


# Guarded so that importing this module does not block the importer in an
# endless loop; running it as a script or with ``python -m`` behaves as before.
if __name__ == "__main__":
    run_scheduler()
src/services/sync_projects.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pandas as pd
3
+
4
+ from Data.database.sql_connector import engine
5
+ from src.similarity_model.preprocessing import preprocess_dataset
6
+ from src.similarity_model.embedding_engine import (
7
+ train_embedding_engine
8
+ )
9
+
10
+
11
# Columns selected from the output of preprocess_dataset(), in insert order.
_PROCESSED_COLUMNS = [
    "id",
    "submittedat",
    "project_title",
    "studentnames",
    "year",
    "abstract",
    "description",
    "problemstatement",
    "proposedsolution",
    "objectives",
    "full_content",
    "clean_text",
    "word_count",
    "features",
]

# Pre-processing column name -> PreProcessed_Projects column name.
_COLUMN_RENAMES = {
    "submittedat": "submitted_at",
    "studentnames": "student_names",
    "problemstatement": "problem_statement",
    "proposedsolution": "proposed_solution",
}


def _serialize_features(value):
    """Return *value* as a JSON string without re-encoding existing strings."""
    # NOTE(review): preprocess_dataset() may already hand back a serialized
    # string; dumping it again nests the quoting one level deeper on every
    # pass. Confirm against the preprocessing module.
    if isinstance(value, str):
        return value

    # numpy arrays / scalars are not JSON serializable as-is.
    if hasattr(value, "tolist"):
        value = value.tolist()

    return json.dumps(value)


def _remove_stale_projects(stale_ids):
    """Delete rows whose source project is no longer eligible.

    Returns True when at least one delete statement was issued.
    """
    if not stale_ids:
        return False

    # Coerce to int so only numeric literals can reach the SQL text.
    ids_str = ",".join(
        str(int(project_id)) for project_id in stale_ids
    )

    with engine.begin() as conn:
        conn.exec_driver_sql(
            f"""
            DELETE FROM PreProcessed_Projects
            WHERE id IN ({ids_str})
            """
        )

    print(f"Removed {len(stale_ids)} projects")

    return True


def _insert_new_projects(new_projects):
    """Pre-process *new_projects* and append them to PreProcessed_Projects.

    Returns True only when at least one row was actually inserted.
    """
    if new_projects.empty:
        print("No new projects found")
        return False

    print(f"Found {len(new_projects)} new projects")

    processed_df = preprocess_dataset(new_projects)

    processed_df = (
        processed_df[_PROCESSED_COLUMNS]
        .rename(columns=_COLUMN_RENAMES)
    )

    processed_df["features"] = (
        processed_df["features"].apply(_serialize_features)
    )

    processed_df.to_sql(
        "PreProcessed_Projects",
        engine,
        if_exists="append",
        index=False
    )

    print(f"Processed and inserted {len(processed_df)} projects")

    # Pre-processing may drop every candidate row; in that case nothing
    # changed and the embeddings must not be retrained on each cycle.
    return len(processed_df) > 0


def _rebuild_embeddings():
    """Retrain the embedding engine after the processed table changed."""
    print("Updating embeddings...")

    train_embedding_engine()

    print("Embeddings updated")


def sync_projects():
    """Mirror eligible rows of ``Projects`` into ``PreProcessed_Projects``.

    Steps:
        1. Read projects whose Status is Completed, UnderReview or
           In_Progress, and the ids already present in the processed table.
        2. Delete processed rows whose project is no longer in that set.
        3. Pre-process and append projects that are not processed yet.
        4. Retrain the embedding engine if step 2 or 3 changed anything.

    Raises:
        Exception: whatever the database layer, ``preprocess_dataset`` or
            ``train_embedding_engine`` raises is propagated to the caller.
    """
    projects_query = """
        SELECT *
        FROM Projects
        WHERE Status IN (
            'Completed',
            'UnderReview',
            'In_Progress'
        )
    """

    projects_df = pd.read_sql(projects_query, engine)

    existing_df = pd.read_sql(
        """
        SELECT id
        FROM PreProcessed_Projects
        """,
        engine
    )

    allowed_ids = set(projects_df["Id"].tolist())
    processed_ids = set(existing_df["id"].tolist())

    # ---------------------------------
    # Remove projects no longer allowed
    # ---------------------------------
    removed = _remove_stale_projects(processed_ids - allowed_ids)

    # ---------------------------------
    # Add new projects
    # ---------------------------------
    new_projects = projects_df[
        ~projects_df["Id"].isin(processed_ids)
    ].copy()

    try:
        added = _insert_new_projects(new_projects)
    except Exception:
        # The delete above is already committed. Without this rebuild the
        # next run sees nothing left to remove, so the removed projects
        # would stay in the index for as long as the insert keeps failing.
        if removed:
            _rebuild_embeddings()
        raise

    # ---------------------------------
    # Rebuild FAISS only if changed
    # ---------------------------------
    if removed or added:
        _rebuild_embeddings()
    else:
        print("No changes detected")


if __name__ == "__main__":
    sync_projects()
src/similarity_model/__pycache__/__init__.cpython-311.pyc CHANGED
Binary files a/src/similarity_model/__pycache__/__init__.cpython-311.pyc and b/src/similarity_model/__pycache__/__init__.cpython-311.pyc differ
 
src/similarity_model/__pycache__/embedding_engine.cpython-311.pyc CHANGED
Binary files a/src/similarity_model/__pycache__/embedding_engine.cpython-311.pyc and b/src/similarity_model/__pycache__/embedding_engine.cpython-311.pyc differ
 
src/similarity_model/__pycache__/feature_similarity.cpython-311.pyc CHANGED
Binary files a/src/similarity_model/__pycache__/feature_similarity.cpython-311.pyc and b/src/similarity_model/__pycache__/feature_similarity.cpython-311.pyc differ
 
src/similarity_model/__pycache__/hybrid_ranker.cpython-311.pyc CHANGED
Binary files a/src/similarity_model/__pycache__/hybrid_ranker.cpython-311.pyc and b/src/similarity_model/__pycache__/hybrid_ranker.cpython-311.pyc differ
 
src/similarity_model/__pycache__/preprocessing.cpython-311.pyc CHANGED
Binary files a/src/similarity_model/__pycache__/preprocessing.cpython-311.pyc and b/src/similarity_model/__pycache__/preprocessing.cpython-311.pyc differ
 
src/similarity_model/__pycache__/semantic_search.cpython-311.pyc CHANGED
Binary files a/src/similarity_model/__pycache__/semantic_search.cpython-311.pyc and b/src/similarity_model/__pycache__/semantic_search.cpython-311.pyc differ
 
src/similarity_model/__pycache__/similarity_engine.cpython-311.pyc CHANGED
Binary files a/src/similarity_model/__pycache__/similarity_engine.cpython-311.pyc and b/src/similarity_model/__pycache__/similarity_engine.cpython-311.pyc differ
 
src/similarity_model/embedding_engine.py CHANGED
@@ -119,12 +119,27 @@ class ProjectEmbedder:
119
  if TECH_COL not in self.metadata.columns:
120
  self.metadata[TECH_COL] = ""
121
 
 
 
 
 
 
 
 
 
 
 
 
122
  # weighted content:
123
  # title repeated twice
124
  rich_texts = (
125
- self.metadata[TITLE_COL].fillna("").astype(str) + " " +
126
- self.metadata[TITLE_COL].fillna("").astype(str) + " " +
127
- self.metadata[TEXT_COL].fillna("").astype(str)
 
 
 
 
128
  ).tolist()
129
 
130
  embeddings = self.generate_embeddings(rich_texts)
 
119
  if TECH_COL not in self.metadata.columns:
120
  self.metadata[TECH_COL] = ""
121
 
122
+ FEATURE_COL = "features"
123
+
124
+ if FEATURE_COL not in self.metadata.columns:
125
+ self.metadata[FEATURE_COL] = ""
126
+
127
+
128
+ feature_text = (
129
+ self.metadata[FEATURE_COL]
130
+ .fillna("")
131
+ .astype(str)
132
+ )
133
  # weighted content:
134
  # title repeated twice
135
  rich_texts = (
136
+ self.metadata[TITLE_COL].fillna("").astype(str)
137
+ + " "
138
+ + self.metadata[TITLE_COL].fillna("").astype(str)
139
+ + " "
140
+ + self.metadata[TEXT_COL].fillna("").astype(str)
141
+ + " "
142
+ + feature_text
143
  ).tolist()
144
 
145
  embeddings = self.generate_embeddings(rich_texts)
src/similarity_model/feature_similarity.py CHANGED
@@ -27,7 +27,7 @@ logger = logging.getLogger(__name__)
27
  # =====================================================
28
  MODEL_NAME = "all-MiniLM-L6-v2"
29
 
30
- DEFAULT_THRESHOLD = 0.65
31
 
32
  SIMILARITY_WEIGHT = 0.70
33
  COVERAGE_WEIGHT = 0.30
@@ -110,7 +110,7 @@ def remove_redundant_features(features):
110
  feat_words & existing
111
  ) / max(len(feat_words), 1)
112
 
113
- if overlap >= 0.90:
114
  redundant = True
115
  break
116
 
@@ -247,11 +247,11 @@ def compute_feature_similarity(
247
  if shared_scores else 0.0
248
  )
249
 
250
- max_len = max(len(fa), len(fb))
251
 
252
  coverage = (
253
- len(matches) / max_len
254
- if max_len > 0 else 0.0
255
  )
256
 
257
  final_score = (
@@ -260,6 +260,9 @@ def compute_feature_similarity(
260
  (COVERAGE_WEIGHT * coverage)
261
  )
262
 
 
 
 
263
  final_score = min(final_score, 1.0)
264
 
265
  matched_text_a = " ".join(
 
27
  # =====================================================
28
  MODEL_NAME = "all-MiniLM-L6-v2"
29
 
30
+ DEFAULT_THRESHOLD = 0.80
31
 
32
  SIMILARITY_WEIGHT = 0.70
33
  COVERAGE_WEIGHT = 0.30
 
110
  feat_words & existing
111
  ) / max(len(feat_words), 1)
112
 
113
+ if overlap >= 0.60:
114
  redundant = True
115
  break
116
 
 
247
  if shared_scores else 0.0
248
  )
249
 
250
+ min_len = min(len(fa), len(fb))
251
 
252
  coverage = (
253
+ len(matches) / min_len
254
+ if min_len > 0 else 0.0
255
  )
256
 
257
  final_score = (
 
260
  (COVERAGE_WEIGHT * coverage)
261
  )
262
 
263
+ if len(matches) == 0:
264
+ final_score = 0.0
265
+
266
  final_score = min(final_score, 1.0)
267
 
268
  matched_text_a = " ".join(
src/similarity_model/hybrid_ranker.py CHANGED
@@ -36,7 +36,7 @@ HIGH_FEATURE_WEIGHT = 0.45
36
  LOW_FEATURE_WEIGHT = 0.20
37
 
38
  BONUS_WEIGHT = 0.05
39
- MIN_HYBRID_SCORE = 0.35
40
 
41
  # =====================================================
42
  # Helpers
@@ -77,16 +77,16 @@ def get_dynamic_weights(
77
  # Score Engines
78
  # =====================================================
79
  def compute_hybrid_score(
80
- semantic_score: float,
81
- feature_score: float,
82
- coverage: float,
83
- feature_count: int
 
84
  ) -> float:
85
 
86
  semantic_score = clamp(semantic_score)
87
  feature_score = clamp(feature_score)
88
  coverage = clamp(coverage)
89
-
90
  # ==========================================
91
  # Strong feature overlap case
92
  # ==========================================
@@ -103,12 +103,30 @@ def compute_hybrid_score(
103
  # ==========================================
104
  # Normal scoring
105
  # ==========================================
 
 
 
 
 
106
  score = (
107
- 0.25 * semantic_score +
108
- 0.55 * feature_score +
109
- 0.20 * coverage
 
 
 
 
 
 
 
 
 
110
  )
111
 
 
 
 
 
112
  return round(clamp(score), 4)
113
 
114
 
@@ -118,22 +136,29 @@ def compute_originality(
118
  total_query_features: int
119
  ) -> float:
120
  """
121
- Higher similarity => lower originality
122
- More unique features => higher originality
 
 
123
  """
124
 
125
  hybrid_score = clamp(hybrid_score)
126
 
127
- inverse_similarity = 1.0 - hybrid_score
128
-
129
  uniqueness_ratio = (
130
  unique_query_features / total_query_features
131
- if total_query_features > 0 else 0.0
 
132
  )
133
 
134
- originality = 1 - hybrid_score
 
 
 
135
 
136
- return round(clamp(originality), 4)
 
 
 
137
 
138
 
139
  def compute_confidence(
@@ -159,16 +184,16 @@ def risk_label(score: float) -> str:
159
  Duplicate risk label.
160
  """
161
 
162
- if score >= 0.85:
163
  return "Very High"
164
 
165
- if score >= 0.70:
166
  return "High"
167
 
168
  if score >= 0.55:
169
  return "Medium"
170
 
171
- if score >= 0.40:
172
  return "Low"
173
 
174
  return "Very Low"
@@ -209,25 +234,31 @@ def compare_single_candidate(
209
  feature_result["unique_a"]
210
  )
211
 
212
- hybrid_score = compute_hybrid_score(
213
  semantic_score=semantic_score,
214
  feature_score=feature_score,
215
  coverage=coverage,
216
- feature_count=total_query_features
 
217
  )
218
 
219
- originality_score = compute_originality(
220
- hybrid_score=hybrid_score,
221
- unique_query_features=unique_query_count,
222
- total_query_features=total_query_features
223
  )
224
 
 
 
225
  confidence_score = compute_confidence(
226
  semantic_score=semantic_score,
227
  feature_score=feature_score,
228
  coverage=coverage
229
  )
230
-
 
 
 
 
231
  return {
232
  "project_title":
233
  candidate_row.get(TITLE_COL, ""),
 
36
  LOW_FEATURE_WEIGHT = 0.20
37
 
38
  BONUS_WEIGHT = 0.05
39
+ MIN_HYBRID_SCORE = 0.05
40
 
41
  # =====================================================
42
  # Helpers
 
77
  # Score Engines
78
  # =====================================================
79
  def compute_hybrid_score(
80
+ semantic_score,
81
+ feature_score,
82
+ coverage,
83
+ feature_count,
84
+ unique_query_count=0
85
  ) -> float:
86
 
87
  semantic_score = clamp(semantic_score)
88
  feature_score = clamp(feature_score)
89
  coverage = clamp(coverage)
 
90
  # ==========================================
91
  # Strong feature overlap case
92
  # ==========================================
 
103
  # ==========================================
104
  # Normal scoring
105
  # ==========================================
106
+ shared_ratio = (
107
+ (feature_count - unique_query_count)
108
+ / max(feature_count, 1)
109
+ )
110
+
111
  score = (
112
+ 0.90 * (shared_ratio ** 2.0)
113
+ + 0.07 * feature_score
114
+ + 0.03 * semantic_score
115
+ )
116
+
117
+ # No feature overlap
118
+ # No feature overlap
119
+ if feature_score == 0 or coverage == 0:
120
+ return 0.03
121
+
122
+ shared_count = (
123
+ feature_count - unique_query_count
124
  )
125
 
126
+ # Near duplicate
127
+ if shared_count >= 6 and unique_query_count <= 1:
128
+ return 0.95
129
+
130
  return round(clamp(score), 4)
131
 
132
 
 
136
  total_query_features: int
137
  ) -> float:
138
  """
139
+ Originality Score (0-100)
140
+
141
+ - More unique features -> higher originality
142
+ - More similarity -> lower originality
143
  """
144
 
145
  hybrid_score = clamp(hybrid_score)
146
 
 
 
147
  uniqueness_ratio = (
148
  unique_query_features / total_query_features
149
+ if total_query_features > 0
150
+ else 0.0
151
  )
152
 
153
+ originality = (
154
+ 0.70 * uniqueness_ratio +
155
+ 0.30 * (1.0 - hybrid_score)
156
+ )
157
 
158
+ return round(
159
+ max(0.0, min(100.0, originality * 100)),
160
+ 2
161
+ )
162
 
163
 
164
  def compute_confidence(
 
184
  Duplicate risk label.
185
  """
186
 
187
+ if score >= 0.90:
188
  return "Very High"
189
 
190
+ if score >= 0.75:
191
  return "High"
192
 
193
  if score >= 0.55:
194
  return "Medium"
195
 
196
+ if score >= 0.35:
197
  return "Low"
198
 
199
  return "Very Low"
 
234
  feature_result["unique_a"]
235
  )
236
 
237
+ base_similarity = compute_hybrid_score(
238
  semantic_score=semantic_score,
239
  feature_score=feature_score,
240
  coverage=coverage,
241
+ feature_count=total_query_features,
242
+ unique_query_count=unique_query_count
243
  )
244
 
245
+ originality_score = round(
246
+ (1.0 - base_similarity) * 100,
247
+ 2
 
248
  )
249
 
250
+ hybrid_score = base_similarity
251
+
252
  confidence_score = compute_confidence(
253
  semantic_score=semantic_score,
254
  feature_score=feature_score,
255
  coverage=coverage
256
  )
257
+ print("=" * 50)
258
+ print("BASE SIMILARITY:", base_similarity)
259
+ print("ORIGINALITY:", originality_score)
260
+ print("FINAL SIMILARITY:", hybrid_score)
261
+ print("=" * 50)
262
  return {
263
  "project_title":
264
  candidate_row.get(TITLE_COL, ""),
src/similarity_model/llm_feature_extractor.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+ import time
4
+
5
+ from src.recommendation_engine.llm_client import generate_text
6
+
7
+
8
def build_feature_prompt(text: str) -> str:
    """
    Build the prompt that asks the LLM to extract technical features.

    The returned string contains the extraction rules, a JSON output
    format of the shape {"features": [...]}, and finally the project
    text, which is interpolated verbatim after the "PROJECT:" marker.

    Args:
        text: Project description to analyse. It is inserted as-is; no
            escaping or truncation is applied here.

    Returns:
        The complete prompt string.
    """
    # NOTE(review): the prompt wording below is internally inconsistent and
    # should be reconciled after checking extraction quality:
    #   - it asks for "between 2 and 10 features" but then says "If fewer
    #     than 5 technical features exist";
    #   - "deep learning" is listed under "Examples:" and again under "BAD:";
    #   - "Maximum 4 words per feature." appears twice.
    # The doubled braces ({{ and }}) are f-string escapes that render as
    # literal { and } in the JSON format example.
    return f"""
You are an expert graduation project analyst.

TASK:
Extract only technical features explicitly mentioned in the project.

RULES:
- Do not invent anything.
- Do not generate new ideas.
- Extract technologies.
- Extract algorithms.
- Extract modules.
- Extract engineering capabilities.
- Ignore goals and benefits.
Return between 2 and 10 features.

If fewer than 5 technical features exist,
return only the available features.



Technologies such as:
React,
Angular,
Vue,
MySQL,
PostgreSQL,
MongoDB,
Firebase,
Django,
Flask,
Node.js

must be included if explicitly mentioned.

IMPORTANT:
Every feature must be copied or paraphrased from the project text.
Never infer, assume, or generate new capabilities.
If a capability is not explicitly mentioned, do not include it.

IMPORTANT:

Do not extract:
- goals
- benefits
- outcomes
- business objectives
- deployment statements

Return SHORT canonical features.

Maximum 4 words per feature.

Examples:
computer vision
deep learning
internet of things
route optimization
waste classification

Do not include long descriptive phrases.


IMPORTANT:

Prefer domain-specific capabilities over generic AI terms.

BAD:
- machine learning
- deep learning
- neural networks

GOOD:
- waste classification
- route optimization
- bin fill level monitoring
- waste generation forecasting

If a specific capability exists, return it instead of the generic AI technology behind it.

Maximum 4 words per feature.

Only extract concrete technical capabilities.

Return JSON only.

Format:

{{
"features": [
"feature 1",
"feature 2"
]
}}

PROJECT:

{text}
"""
109
+
110
+
111
def extract_features_llm(text: str, delay_seconds: float = 4.0):
    """
    Extract a list of technical features from project text via the LLM.

    Builds the extraction prompt, pauses for ``delay_seconds``, sends the
    prompt through ``generate_text`` with ``task="feature"`` and
    ``temperature=0``, and parses the reply with
    ``parse_features_response``.

    Args:
        text: Project description to analyse.
        delay_seconds: Pause applied before the LLM call. Defaults to the
            previously hard-coded 4 seconds (presumably a rate-limit
            guard -- confirm against the provider quota). Values <= 0
            skip the pause, which is useful for tests and batch callers
            that throttle elsewhere.

    Returns:
        A list of feature strings; empty when ``text`` is empty or only
        whitespace, or when the reply yields no features.
    """
    # Nothing to analyse: skip the pause and the LLM round-trip instead of
    # prompting the model with an empty project.
    if not text or not str(text).strip():
        return []

    print("CALLING GEMINI")

    prompt = build_feature_prompt(text)

    # time.sleep rejects negative values, so only pause for positive delays.
    if delay_seconds > 0:
        time.sleep(delay_seconds)

    response = generate_text(
        prompt,
        task="feature",
        temperature=0,
    )

    return parse_features_response(response)
128
+
129
+
130
# Maps verbose or variant phrasings onto a single canonical feature label.
# Built once at import time instead of on every parse call.
_FEATURE_NORMALIZATION_MAP = {
    "computer vision techniques": "computer vision",
    "deep learning models": "deep learning",
    "deep neural networks": "deep learning",
    "machine learning models": "machine learning",
    "iot sensor networks": "internet of things",
    "sensor networks": "internet of things",
    "predictive trends": "predictive analytics",
    "operational insights": "analytics",
}

# Upper bound on the number of features returned to the caller.
_MAX_FEATURES = 10


def parse_features_response(response: str):
    """
    Parse an LLM reply into a cleaned, de-duplicated list of features.

    The reply is expected to contain a JSON object of the form
    ``{"features": ["...", ...]}``, optionally wrapped in a Markdown code
    fence and/or surrounded by extra text. This function never raises;
    any reply that cannot be interpreted yields an empty list.

    Args:
        response: Raw text returned by the LLM.

    Returns:
        Up to 10 lowercase feature strings in first-seen order. Entries
        that are not strings, or that are shorter than two characters
        after trimming, are dropped.
    """
    if not response or not isinstance(response, str):
        return []

    # Strip a leading/trailing Markdown code fence (``` or ```json).
    payload = response.strip()
    payload = re.sub(r"^```(?:json)?", "", payload, flags=re.I)
    payload = re.sub(r"```$", "", payload).strip()

    # Tolerate chatter around the JSON by keeping only the outermost object.
    start = payload.find("{")
    end = payload.rfind("}")
    if start != -1 and end != -1:
        payload = payload[start:end + 1]

    try:
        data = json.loads(payload)
    except (ValueError, RecursionError) as e:
        # json.JSONDecodeError subclasses ValueError; RecursionError covers
        # pathologically nested input. Both are reported and treated as
        # "no features" so callers never see an exception.
        print("=" * 50)
        print("JSON PARSE ERROR")
        print(e)
        print("=" * 50)

        print("RAW RESPONSE:")
        print(payload)

        print("=" * 50)

        return []

    # Valid JSON that is not an object (bare list, string, number) carries
    # no "features" key.
    if not isinstance(data, dict):
        return []

    features = data.get("features", [])

    if not isinstance(features, list):
        return []

    cleaned = []

    for item in features:

        # Skip null / numeric / nested entries rather than stringifying them
        # into bogus features such as "none" or "42".
        if not isinstance(item, str):
            continue

        # Collapse internal whitespace so variants differing only in spacing
        # normalize and de-duplicate identically.
        feat = " ".join(item.split()).lower()

        feat = _FEATURE_NORMALIZATION_MAP.get(feat, feat)

        if len(feat) < 2:
            continue

        cleaned.append(feat)

    # dict.fromkeys keeps first-seen order while removing duplicates.
    cleaned = list(dict.fromkeys(cleaned))

    print("PARSED FEATURES:")
    print(cleaned)

    return cleaned[:_MAX_FEATURES]