# Residue from the hosting site's file viewer (not part of the program):
# bat-6's picture
# feat: implement similarity engine with embedding generation, hybrid ranking, and explainability modules
# 5e95de5
# Raw
# History Blame Contribute Delete
# 4.04 kB
import logging
import pandas as pd
from src.similarity_model import find_similar_projects
from src.similarity_model import load_metadata
# Number of leading metadata rows replayed by the self-retrieval test.
SELF_TEST_SAMPLES = 20
# NOTE(review): TOP_K is not referenced in the visible part of this file;
# every visible query passes top_k=1 explicitly.
TOP_K = 5

# Root logging setup: timestamp | level | message, at INFO and above.
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
def run_self_test():
    """Measure top-1 self-retrieval accuracy on the first metadata rows.

    Each of the first ``SELF_TEST_SAMPLES`` rows returned by
    ``load_metadata()`` is fed back through ``find_similar_projects`` with
    ``top_k=1``. A row counts as a hit when the ``project_id`` of the top
    result equals the row label ``i``.

    Returns:
        float: hits divided by rows tested, or ``0.0`` when no rows were
        available to test.
    """
    df = load_metadata()
    total = min(len(df), SELF_TEST_SAMPLES)
    success = 0

    for i in range(total):
        # NOTE(review): label-based lookup; presumably the frame carries the
        # default 0..n-1 index -- confirm against load_metadata().
        row = df.loc[i]
        results = find_similar_projects(
            title=row.get("project_title", ""),
            abstract=row.get("abstract", ""),
            description=row.get("description", ""),
            features=row.get("features", []),
            top_k=1,
        )

        # An empty frame has no first row to inspect; count it as a miss
        # instead of letting iloc[0] raise IndexError.
        if results.empty or "project_id" not in results.columns:
            continue

        if int(results.iloc[0]["project_id"]) == i:
            success += 1

    # Guard the ratio so an empty metadata table reports 0% instead of
    # raising ZeroDivisionError.
    score = success / total if total else 0.0

    print("\n==============================")
    print("SELF RETRIEVAL TEST")
    print("==============================")
    print(f"Projects Tested : {total}")
    print(f"Top1 Accuracy : {score:.2%}")
    print("==============================")

    return score
def run_real_queries():
    """Run a fixed set of sample queries and report the top match for each.

    Every query is sent to ``find_similar_projects`` with ``top_k=1``. For
    each query that yields a scored row, the top match's title, hybrid score
    and duplicate risk are printed and the score is added to a running mean.

    Returns:
        The mean ``hybrid_score`` over the queries that produced a scored
        match, or ``0`` when none did.
    """
    # Leading and trailing newlines are part of each description text.
    queries = [
        {
            "title": "AI Clinic Management System",
            "description": (
                "\n"
                "Smart clinic with booking,\n"
                "chatbot, patient records,\n"
                "doctor dashboard.\n"
            ),
        },
        {
            "title": "Smart Library Assistant",
            "description": (
                "\n"
                "Library app with chatbot,\n"
                "recommendation system,\n"
                "qr code borrowing.\n"
            ),
        },
        {
            "title": "Attendance Face Recognition",
            "description": (
                "\n"
                "Attendance system using\n"
                "face recognition and reports.\n"
            ),
        },
        {
            "title": "E-commerce Recommendation Platform",
            "description": (
                "\n"
                "Online shopping website with\n"
                "recommendation engine,\n"
                "payments and dashboard.\n"
            ),
        },
    ]

    print("\n==============================")
    print("REAL QUERY TEST")
    print("==============================")

    total_score = 0
    count = 0

    for q in queries:
        results = find_similar_projects(
            title=q["title"],
            description=q["description"],
            top_k=1,
        )

        # A frame with the right columns but zero rows would make iloc[0]
        # raise IndexError; treat it like a missing score column and skip.
        if results.empty or "hybrid_score" not in results.columns:
            logger.warning(
                "No scored match for query %r; skipping", q["title"]
            )
            continue

        top = results.iloc[0]
        score = float(top["hybrid_score"])
        risk = str(top["duplicate_risk"])
        top_title = str(top["project_title"])

        total_score += score
        count += 1

        print()
        print("Query:", q["title"])
        print("Top Match:", top_title)
        print("Score:", round(score, 4))
        print("Risk:", risk)

    # Only queries that produced a score take part in the mean.
    avg = total_score / count if count else 0

    print("\n==============================")
    print(f"Average Query Score: {avg:.4f}")
    print("==============================")

    return avg
def final_status(
    self_score,
    query_score
):
    """Print the overall verdict and return the combined score.

    The combined score weights the self-retrieval accuracy at 0.60 and the
    average query score at 0.40, then maps it to a label:
    ``>= 0.90`` EXCELLENT, ``>= 0.75`` VERY GOOD, ``>= 0.60`` GOOD,
    otherwise NEEDS IMPROVEMENT.

    Args:
        self_score: Score returned by the self-retrieval test.
        query_score: Average score returned by the sample-query test.

    Returns:
        float: The weighted score (unrounded), so callers can act on it the
        same way they can on the two runner functions' return values.
    """
    print("\n==============================")
    print("FINAL MODEL STATUS")
    print("==============================")

    final_score = (
        0.60 * self_score +
        0.40 * query_score
    )

    if final_score >= 0.90:
        print("EXCELLENT [OK]")
    elif final_score >= 0.75:
        print("VERY GOOD [OK]")
    elif final_score >= 0.60:
        print("GOOD [WARN]")
    else:
        print("NEEDS IMPROVEMENT [FAIL]")

    print("Overall Score:", round(final_score, 4))
    print("==============================\n")

    return final_score
if __name__ == "__main__":
    # Arguments evaluate left to right: the self-retrieval test runs first,
    # then the sample queries, and both scores feed the final verdict.
    final_status(run_self_test(), run_real_queries())